diff --git a/conf/default.yaml b/conf/default.yaml index a59c931..0a2ac58 100644 --- a/conf/default.yaml +++ b/conf/default.yaml @@ -36,8 +36,7 @@ recognizer_registry: load_predefined_recognizers: true recognizers: - # --- On déclare le détecteur NLP de base --- - + # --- détecteur de dates flexibles --- - name: FlexibleDateRecognizer supported_language: fr supported_entity: FLEXIBLE_DATE @@ -50,6 +49,7 @@ recognizer_registry: score: 1.0 context: ["date", "né le", "signé le", "incident du"] + # --- adresse belge complète --- - name: BelgianAddressRecognizer supported_language: fr supported_entity: BE_ADDRESS @@ -59,6 +59,7 @@ recognizer_registry: score: 1.0 context: ["demeurant", "adresse", "siège social", "bureaux situés"] + # --- numéro téléphone belge --- - name: BelgianPhoneRecognizer supported_language: fr supported_entity: BE_PHONE_NUMBER @@ -68,6 +69,7 @@ recognizer_registry: score: 0.95 context: ["Tel", "Tél", "téléphone", "gsm", "mobile"] + # --- organisation avec forme légale --- - name: SmartOrganizationRecognizer supported_language: fr supported_entity: ORGANIZATION @@ -80,6 +82,7 @@ recognizer_registry: score: 0.9 context: ["société", "entreprise", "gérant de la"] + # --- numéro professionnel IEC --- - name: ProfessionalIdRecognizer supported_language: fr supported_entity: BE_PRO_ID @@ -89,6 +92,7 @@ recognizer_registry: score: 1.0 context: ["expert-comptable"] + # --- numéro BCE/TVA belge --- - name: BelgianEnterpriseRecognizer supported_language: fr supported_entity: BE_ENTERPRISE_NUMBER @@ -97,22 +101,38 @@ recognizer_registry: regex: "\\b(BE)?\\s?0?\\d{3}[\\.\\s]?\\d{3}[\\.\\s]?\\d{3}\\b" score: 1.0 context: ["BCE", "TVA", "intracommunautaire"] - + + # --- Email --- - name: EmailRecognizer supported_language: fr supported_entity: EMAIL_ADDRESS - patterns: [{name: Email Pattern, regex: "\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Z|a-z]{2,}\\b", score: 1.0}] + patterns: + - name: Email Pattern + regex: "\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Z|a-z]{2,}\\b" + score: 1.0 context: ["email", "courriel", "mail"] + + # --- IBAN --- - name: IbanRecognizer supported_language: fr supported_entity: IBAN - patterns: [{name: IBAN Pattern, regex: "\\b[A-Z]{2}[0-9]{2}\\s?(?:[A-Z0-9]{4}\\s?){2,7}[A-Z0-9]{1,4}\\b", score: 0.95}] + patterns: + - name: IBAN Pattern + regex: "\\b[A-Z]{2}[0-9]{2}\\s?(?:[A-Z0-9]{4}\\s?){2,7}[A-Z0-9]{1,4}\\b" + score: 0.95 context: ["iban", "compte"] + + # --- Numéro registre national belge --- - name: BelgianNRNRecognizer supported_language: fr supported_entity: BE_NATIONAL_REGISTER_NUMBER - patterns: [{name: NRN Pattern, regex: "\\b[0-9]{2}\\.[0-9]{2}\\.[0-9]{2}-[0-9]{3}\\.[0-9]{2}\\b", score: 1.0}] + patterns: + - name: NRN Pattern + regex: "\\b[0-9]{2}\\.[0-9]{2}\\.[0-9]{2}-[0-9]{3}\\.[0-9]{2}\\b" + score: 1.0 context: ["registre national"] + + # --- Numéro sécurité sociale France (INSEE) --- - name: FrenchINSEERecognizer supported_language: fr supported_entity: FR_SOCIAL_SECURITY_NUMBER @@ -122,6 +142,19 @@ recognizer_registry: score: 0.95 context: ["sécurité sociale", "insee", "nir"] + # --- Adresse IP (IPv4 et IPv6) --- + - name: IpAddressRecognizer + supported_language: fr + supported_entity: IP_ADDRESS + patterns: + - name: IPv4 + regex: "\\b(?:(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9]?[0-9])\\.){3}(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9]?[0-9])\\b" + score: 1.0 + - name: IPv6 + regex: "\\b([0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}\\b" + score: 0.9 + context: ["adresse ip", "ip", "serveur", "exposé"] + # 3. LISTE D'EXCLUSION # ===================================================================== allow_list: @@ -169,9 +202,12 @@ allow_list: - Prix - Coordonnées - Témoins - - "Coordonnées bancaires" - - "Témoins clés" - + - Coordonnées bancaires + - Témoins clés + - montrent + - montrent des + - montrent des irrégularités + - bénéficiaire # 4. CONFIGURATION DES TRANSFORMATIONS D'ANONYMISATION # ===================================================================== @@ -208,4 +244,4 @@ anonymizer_config: FLEXIBLE_DATE: "" BE_ADDRESS: "" BE_PRO_ID: "" - IP_ADDRESS: "" \ No newline at end of file + IP_ADDRESS: ""