Actualiser conf/default.yaml

This commit is contained in:
2025-07-28 18:09:43 +00:00
parent a37ebe148e
commit 0aee3cbec1

View File

@@ -36,8 +36,7 @@ recognizer_registry:
load_predefined_recognizers: true
recognizers:
# --- On déclare le détecteur NLP de base ---
# --- détecteur de dates flexibles ---
- name: FlexibleDateRecognizer
supported_language: fr
supported_entity: FLEXIBLE_DATE
@@ -50,6 +49,7 @@ recognizer_registry:
score: 1.0
context: ["date", "né le", "signé le", "incident du"]
# --- adresse belge complète ---
- name: BelgianAddressRecognizer
supported_language: fr
supported_entity: BE_ADDRESS
@@ -59,6 +59,7 @@ recognizer_registry:
score: 1.0
context: ["demeurant", "adresse", "siège social", "bureaux situés"]
# --- numéro téléphone belge ---
- name: BelgianPhoneRecognizer
supported_language: fr
supported_entity: BE_PHONE_NUMBER
@@ -68,6 +69,7 @@ recognizer_registry:
score: 0.95
context: ["Tel", "Tél", "téléphone", "gsm", "mobile"]
# --- organisation avec forme légale ---
- name: SmartOrganizationRecognizer
supported_language: fr
supported_entity: ORGANIZATION
@@ -80,6 +82,7 @@ recognizer_registry:
score: 0.9
context: ["société", "entreprise", "gérant de la"]
# --- numéro professionnel IEC ---
- name: ProfessionalIdRecognizer
supported_language: fr
supported_entity: BE_PRO_ID
@@ -89,6 +92,7 @@ recognizer_registry:
score: 1.0
context: ["expert-comptable"]
# --- numéro BCE/TVA belge ---
- name: BelgianEnterpriseRecognizer
supported_language: fr
supported_entity: BE_ENTERPRISE_NUMBER
@@ -97,22 +101,38 @@ recognizer_registry:
regex: "\\b(BE)?\\s?0?\\d{3}[\\.\\s]?\\d{3}[\\.\\s]?\\d{3}\\b"
score: 1.0
context: ["BCE", "TVA", "intracommunautaire"]
# --- Email ---
# Regex-based recognizer emitting EMAIL_ADDRESS for French-language input.
- name: EmailRecognizer
  supported_language: fr
  supported_entity: EMAIL_ADDRESS
  # Single `patterns` key in block form (a repeated key is invalid YAML and
  # most parsers silently keep only the last one).
  patterns:
    - name: Email Pattern
      # local-part "@" domain "." final label of 2+ ASCII letters.
      # The final class is [A-Za-z]: inside [...] a `|` is a literal pipe,
      # not an alternation, so it must not appear there.
      regex: "\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}\\b"
      score: 1.0
  # Context words listed for this recognizer.
  context: ["email", "courriel", "mail"]
# --- IBAN ---
# Regex-based recognizer emitting IBAN for French-language input.
- name: IbanRecognizer
  supported_language: fr
  supported_entity: IBAN
  # Single `patterns` key in block form (a repeated key is invalid YAML and
  # most parsers silently keep only the last one).
  patterns:
    - name: IBAN Pattern
      # Two uppercase letters, two digits, then 2-7 groups of four
      # uppercase alphanumerics (each optionally followed by whitespace)
      # and a closing group of 1-4 uppercase alphanumerics.
      regex: "\\b[A-Z]{2}[0-9]{2}\\s?(?:[A-Z0-9]{4}\\s?){2,7}[A-Z0-9]{1,4}\\b"
      score: 0.95
  # Context words listed for this recognizer.
  context: ["iban", "compte"]
# --- Belgian national register number ---
# Regex-based recognizer emitting BE_NATIONAL_REGISTER_NUMBER for
# French-language input.
- name: BelgianNRNRecognizer
  supported_language: fr
  supported_entity: BE_NATIONAL_REGISTER_NUMBER
  # Single `patterns` key in block form (a repeated key is invalid YAML and
  # most parsers silently keep only the last one).
  patterns:
    - name: NRN Pattern
      # Digits grouped as NN.NN.NN-NNN.NN with literal dots and hyphen.
      # NOTE(review): unpunctuated 11-digit numbers are not matched by this
      # pattern - confirm that is intended.
      regex: "\\b[0-9]{2}\\.[0-9]{2}\\.[0-9]{2}-[0-9]{3}\\.[0-9]{2}\\b"
      score: 1.0
  # Context words listed for this recognizer.
  context: ["registre national"]
# --- Numéro sécurité sociale France (INSEE) ---
- name: FrenchINSEERecognizer
supported_language: fr
supported_entity: FR_SOCIAL_SECURITY_NUMBER
@@ -122,6 +142,19 @@ recognizer_registry:
score: 0.95
context: ["sécurité sociale", "insee", "nir"]
# --- IP address (IPv4 and IPv6) ---
# Regex-based recognizer emitting IP_ADDRESS for French-language input.
- name: IpAddressRecognizer
  supported_language: fr
  supported_entity: IP_ADDRESS
  patterns:
    # Dotted quad; each octet is constrained to 0-255 by the alternation.
    - name: IPv4
      regex: "\\b(?:(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9]?[0-9])\\.){3}(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9]?[0-9])\\b"
      score: 1.0
    # Eight colon-separated groups of 1-4 hex digits; the "::" shorthand
    # is not covered by this expression.
    - name: IPv6
      regex: "\\b([0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}\\b"
      score: 0.9
  # Context words listed for this recognizer.
  context:
    - "adresse ip"
    - "ip"
    - "serveur"
    - "exposé"
# 3. LISTE D'EXCLUSION
# =====================================================================
allow_list:
@@ -169,9 +202,12 @@ allow_list:
- Prix
- Coordonnées
- Témoins
- "Coordonnées bancaires"
- "Témoins clés"
- Coordonnées bancaires
- Témoins clés
- montrent
- montrent des
- montrent des irrégularités
- bénéficiaire
# 4. CONFIGURATION DES TRANSFORMATIONS D'ANONYMISATION
# =====================================================================
@@ -208,4 +244,4 @@ anonymizer_config:
FLEXIBLE_DATE: "<DATE>"
BE_ADDRESS: "<ADRESSE_BELGE>"
BE_PRO_ID: "<ID_PROFESSIONNEL>"
IP_ADDRESS: "<ADRESSE_IP>"
IP_ADDRESS: "<ADRESSE_IP>"