# =====================================================================
# CONFIGURATION PRESIDIO - v27
# =====================================================================
# Language codes declared as supported by this configuration.
supported_languages:
  - en
  - fr

# 1. CONFIGURATION DU MOTEUR NLP (INCHANGÉ)
# =====================================================================
# NLP engine section: engine name, one model per language code, and the
# NER label filter / confidence settings.
nlp_configuration:
  nlp_engine_name: spacy

  # One entry per language code, naming the spaCy model to load.
  models:
    - lang_code: en
      model_name: en_core_web_lg
    - lang_code: fr
      model_name: fr_core_news_lg

  ner_model_configuration:
    # NER labels listed under labels_to_ignore.
    labels_to_ignore:
      - LOCATION
      - MISC
      - CARDINAL
      - EVENT
      - LANGUAGE
      - LAW
      - ORDINAL
      - PERCENT
      - PRODUCT
      - QUANTITY
      - WORK_OF_ART

    # NOTE(review): nesting was reconstructed from a flattened copy of this
    # file; confirm that confidence_thresholds belongs under
    # ner_model_configuration (and not directly under nlp_configuration).
    confidence_thresholds:
      DEFAULT_CONFIDENCE: 0.85
      PERSON: 0.85
      ORGANIZATION: 0.55

# 2. CONFIGURATION DU REGISTRE DES DÉTECTEURS
# =====================================================================
# Recognizer registry: predefined recognizers stay enabled and the custom
# regex-based recognizers below are declared for French text.
#
# Each recognizer declares: name, supported_language, supported_entity, a list
# of patterns (name / regex / score) and, optionally, context words.
#
# NOTE(review): indentation was reconstructed from a flattened copy of this
# file. `context` is placed at recognizer level (sibling of `patterns`), as in
# Presidio's YAML recognizer schema - confirm against the loader.
#
# Regexes are double-quoted YAML strings, so every backslash is written `\\`.
recognizer_registry:
  load_predefined_recognizers: true

  recognizers:
    # Dates written as "12 mars 2024" or "12/03/2024" (also "-" or "." as
    # separator), restricted to years 19xx and 20xx.
    - name: FlexibleDateRecognizer
      supported_language: fr
      supported_entity: FLEXIBLE_DATE
      patterns:
        - name: Date format JJ mois AAAA
          regex: "\\b(0?[1-9]|[12][0-9]|3[01])\\s+(janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre)\\s+(19|20)\\d{2}\\b"
          score: 1.0
        - name: Date format JJ/MM/AAAA
          regex: "\\b(0[1-9]|[12][0-9]|3[01])[-/.](0[1-9]|1[012])[-/.](19|20)\\d{2}\\b"
          score: 1.0
      context: ["date", "né le", "signé le", "incident du"]

    # Street type + capitalized street name, optional house number before or
    # after, then a mandatory ", <4-digit postal code> <city>".
    - name: BelgianAddressRecognizer
      supported_language: fr
      supported_entity: BE_ADDRESS
      patterns:
        - name: Adresse Belge complète
          regex: "\\b(?:\\d{1,4}[A-Za-z]?(?:\\s*,)?\\s+)?(?:Avenue|Rue|Boulevard|Chaussée|Place|Quai|Impasse|Drève)(?:\\s+(?:de|la|le|d'|des))?(?:\\s+[A-Z][a-zà-ÿ'-]+)+,?(?:\\s+\\d{1,4}[A-Za-z]?)?,\\s*\\d{4}\\s+[A-Za-zà-ÿ'-]+"
          score: 1.0
      context: ["demeurant", "adresse", "siège social", "bureaux situés"]

    # "0" + non-zero digit, followed by three groups of 2-3 digits optionally
    # separated by ".", "/" or whitespace.
    - name: BelgianPhoneRecognizer
      supported_language: fr
      supported_entity: BE_PHONE_NUMBER
      patterns:
        - name: Numéro téléphone Belge (fixe ou mobile)
          regex: "\\b0[1-9](?:[./\\s]?\\d{2,3}){3}\\b"
          score: 0.95
      context: ["Tel", "Tél", "téléphone", "gsm", "mobile"]

    # Capitalized name adjacent to a legal-form abbreviation, in either order.
    - name: SmartOrganizationRecognizer
      supported_language: fr
      supported_entity: ORGANIZATION
      patterns:
        - name: Nom + Forme légale (DigitalConsult SPRL)
          regex: "\\b([A-Z][a-zà-ÿ]+(?:\\s[A-Z][a-zà-ÿ]+)*)\\s+(SPRL|SRL|SA|SCS|SNC)\\b"
          score: 0.9
        - name: Forme légale + Nom (SPRL DigitalConsult)
          regex: "\\b(SPRL|SRL|SA|SCS|SNC)\\s+([A-Z][a-zà-ÿ]+(?:\\s[A-Z][a-zà-ÿ]+)*)\\b"
          score: 0.9
      context: ["société", "entreprise", "gérant de la"]

    # "IEC" (optionally prefixed by "n°" and/or followed by ":") + 6 digits.
    - name: ProfessionalIdRecognizer
      supported_language: fr
      supported_entity: BE_PRO_ID
      patterns:
        - name: Numéro IEC
          regex: "(n°\\sIEC:?|IEC:?)\\s*\\d{6}"
          score: 1.0
      context: ["expert-comptable"]

    # 9 or 10 digits in 3-digit groups (optional leading 0, optional "BE"
    # prefix), groups optionally separated by "." or whitespace.
    - name: BelgianEnterpriseRecognizer
      supported_language: fr
      supported_entity: BE_ENTERPRISE_NUMBER
      patterns:
        - name: Numéro BCE/TVA Belge (avec ou sans BE)
          regex: "\\b(BE)?\\s?0?\\d{3}[\\.\\s]?\\d{3}[\\.\\s]?\\d{3}\\b"
          score: 1.0
      context: ["BCE", "TVA", "intracommunautaire"]

    - name: EmailRecognizer
      supported_language: fr
      supported_entity: EMAIL_ADDRESS
      patterns:
        - name: Email Pattern
          # The TLD class is [A-Za-z]: a "|" inside a character class is a
          # literal pipe, not an alternation, so it must not appear here.
          regex: "\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}\\b"
          score: 1.0
      context: ["email", "courriel", "mail"]

    - name: IbanRecognizer
      supported_language: fr
      supported_entity: IBAN
      patterns:
        - name: IBAN Pattern
          # The whitespace between groups is mandatory (\s, not \s?) to avoid
          # overly large overlapping matches.
          # At least 3 groups must follow the country code + check digits so
          # that 16-character Belgian IBANs ("BE68 5390 0754 7034") match.
          # NOTE(review): a trailing group shorter than 4 characters (e.g. the
          # last 3 characters of a French IBAN) is not covered by this regex.
          regex: "\\b[A-Z]{2}[0-9]{2}(?:\\s[A-Z0-9]{4}){3,7}\\b"
          score: 0.95
      context: ["iban", "compte"]

    # Fixed layout NN.NN.NN-NNN.NN.
    - name: BelgianNRNRecognizer
      supported_language: fr
      supported_entity: BE_NATIONAL_REGISTER_NUMBER
      patterns:
        - name: NRN Pattern
          regex: "\\b[0-9]{2}\\.[0-9]{2}\\.[0-9]{2}-[0-9]{3}\\.[0-9]{2}\\b"
          score: 1.0
      context: ["registre national"]

    # Leading 1 or 2, two digits, month 01-12, two digits or 2A/2B, then
    # 3 + 3 + 2 digits, with optional whitespace between the fields.
    - name: FrenchINSEERecognizer
      supported_language: fr
      supported_entity: FR_SOCIAL_SECURITY_NUMBER
      patterns:
        - name: INSEE Pattern with flexible spaces
          regex: "\\b[12]\\s*[0-9]{2}\\s*(?:0[1-9]|1[0-2])\\s*(?:2[ABab]|[0-9]{2})\\s*[0-9]{3}\\s*[0-9]{3}[\\s]?[0-9]{2}\\b"
          score: 0.95
      context: ["sécurité sociale", "insee", "nir"]

    # IPv4 dotted quad (each octet 0-255) and full, uncompressed IPv6
    # (eight colon-separated hex groups).
    - name: IpAddressRecognizer
      supported_language: fr
      supported_entity: IP_ADDRESS
      patterns:
        - name: IPv4
          regex: "\\b(?:(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9]?[0-9])\\.){3}(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9]?[0-9])\\b"
          score: 1.0
        - name: IPv6
          regex: "\\b([0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}\\b"
          score: 0.9
      # Context words were removed for more flexibility.
      # context: ["adresse ip", "ip", "serveur", "exposé"]

# 3. LISTE D'EXCLUSION
# =====================================================================
# Exclusion list: literal terms, mostly generic contract / legal vocabulary.
# NOTE(review): presumably these terms are never reported as entities -
# confirm how the loader applies allow_list (exact match, case sensitivity).
allow_list:
  - 'Adresse'
  - 'ADRESSE'
  - 'Contrat'
  - 'Document'
  - 'Société'
  - 'Investisseur'
  - 'Montant'
  - 'Prêt'
  - 'Intérêt'
  - 'Partie'
  - 'Parties'
  - 'Annexe'
  - 'Remboursement'
  - 'Conversion'
  - 'Financement'
  - 'Sortie'
  - 'Juste Valeur Marchande'
  - 'Échéance'
  - 'Clause'
  - 'Clauses'
  - 'Principe'
  - 'Coûts'
  - 'Notifications'
  - 'Article'
  - 'Paragraphe'
  - 'Directeur'
  - 'Gérant'
  - 'Président'
  - 'DocuSign'
  - 'SPRL'
  - 'SA'
  - 'Loi'
  - 'Code'
  - 'Règlement'
  - 'Décret'
  - 'Arrêté'
  - 'Euro'
  - 'EUR'
  - 'Euros'
  - 'Taux'
  - 'Valeur'
  - 'Prix'
  - 'Coordonnées'
  - 'Témoins'
  - 'Coordonnées bancaires'
  - 'Témoins clés'
  - 'montrent'
  - 'montrent des'
  - 'montrent des irrégularités'
  - 'bénéficiaire'

# 4. CONFIGURATION DES TRANSFORMATIONS D'ANONYMISATION
# =====================================================================
# Anonymization settings: the operator used per entity type, and the
# placeholder text associated with each entity type.
# NOTE(review): nesting was reconstructed from a flattened copy of this file;
# confirm that `replacements` is a child of anonymizer_config.
anonymizer_config:
  # Entity type -> operator name (every entity uses "replace").
  default_anonymizers:
    PERSON: replace
    LOCATION: replace
    ORGANIZATION: replace
    DATE_TIME: replace
    MONEY: replace
    EMAIL_ADDRESS: replace
    IBAN: replace
    BE_ENTERPRISE_NUMBER: replace
    BE_NATIONAL_REGISTER_NUMBER: replace
    FR_SOCIAL_SECURITY_NUMBER: replace
    BE_PHONE_NUMBER: replace
    FLEXIBLE_DATE: replace
    BE_ADDRESS: replace
    BE_PRO_ID: replace
    IP_ADDRESS: replace

  # Entity type -> placeholder string. DATE_TIME and FLEXIBLE_DATE share the
  # same placeholder.
  replacements:
    PERSON: '<PERSONNE>'
    LOCATION: '<LIEU>'
    ORGANIZATION: '<ORGANISATION>'
    DATE_TIME: '<DATE>'
    MONEY: '<MONTANT>'
    EMAIL_ADDRESS: '<EMAIL>'
    IBAN: '<IBAN>'
    BE_ENTERPRISE_NUMBER: '<NUM_ENTREPRISE_BE>'
    BE_NATIONAL_REGISTER_NUMBER: '<NRN_BELGE>'
    FR_SOCIAL_SECURITY_NUMBER: '<NUM_SECU_FR>'
    BE_PHONE_NUMBER: '<TELEPHONE_BE>'
    FLEXIBLE_DATE: '<DATE>'
    BE_ADDRESS: '<ADRESSE_BELGE>'
    BE_PRO_ID: '<ID_PROFESSIONNEL>'
    IP_ADDRESS: '<ADRESSE_IP>'