Presidio/default.yaml

# =====================================================================
# CONFIGURATION PRESIDIO POUR DOCUMENTS FRANÇAIS/BELGES
# =====================================================================

# NLP engine: spaCy, with one model per supported language.
nlp_engine_name: spacy
supported_languages:
  - en
  - fr
# NOTE(review): English uses the large model while French uses the small one;
# confirm this asymmetry is intended for French/Belgian documents.
models:
  - lang_code: en
    model_name: en_core_web_lg
  - lang_code: fr
    model_name: fr_core_news_sm

# Behaviour of the NLP engine's named-entity recognition.
ner_model_configuration:
  # ORG is no longer ignored: ORG false positives are handled through the
  # allow_list instead, which is more precise.
  labels_to_ignore: [MISC]

  # Per-entity confidence thresholds, raised to cut down on false positives.
  confidence_threshold:
    # Stricter than the 0.35 default.
    default: 0.6
    EMAIL_ADDRESS: 0.8
    PHONE_NUMBER: 0.8
    # Very strict for person names.
    PERSON: 0.85
    # Fewer false positives for places.
    LOCATION: 0.75
    # New entity type for monetary amounts.
    MONEY: 0.85

# Custom regex-based recognizers.
# The 'ad_hoc_recognizers' definitions that used to live in route.ts were
# moved here.
recognizers:
  - name: BelgianNRNRecognizer
    entity_name: BE_NATIONAL_REGISTER_NUMBER
    supported_language: fr
    patterns:
      - name: NRN_Pattern
        # Belgian national register number: YYMMDD, a 3-digit group and a
        # 2-digit suffix. Every separator is optional so that both the compact
        # form (85073003328) and the dotted form (85.07.30-033.28) match.
        regex: '\b[0-9]{2}\.?(?:0[1-9]|1[0-2])\.?(?:0[1-9]|[12][0-9]|3[01])-?\d{3}\.?\d{2}\b'
        # Very high score because the pattern is specific.
        score: 1.0

  - name: BelgianEnterpriseRecognizer
    supported_language: fr
    entity_name: BE_ENTERPRISE_NUMBER
    patterns:
      # Belgian enterprise numbers written as "BE 0XXX.XXX.XXX"; the dots may
      # be replaced by whitespace or omitted.
      - name: BTW_Pattern
        score: 0.95
        regex: '\bBE\s?0\d{3}[\.\s]?\d{3}[\.\s]?\d{3}\b'

  - name: IBANRecognizer
    supported_language: fr
    entity_name: IBAN
    patterns:
      # Generic IBAN shape (two letters, two digits, then groups of up to four
      # alphanumerics); not Belgium-specific, but Belgian IBANs match as well.
      - name: IBAN_Pattern
        score: 0.95
        regex: '\b[A-Z]{2}\d{2}\s?(?:[A-Z0-9]{4}\s?){2,7}[A-Z0-9]{1,4}\b'

  - name: PhoneRecognizer
    entity_name: PHONE_NUMBER
    supported_language: fr
    patterns:
      - name: Phone_Pattern
        # Belgian / French / Luxembourg phone numbers: optional "+" or "00"
        # followed by 32, 33 or 352, or a national leading 0.
        # The pattern opens with (?<!\w) rather than \b: a word boundary never
        # matches between whitespace (or start of text) and "+", which left
        # the "+" of international numbers outside the match and therefore
        # un-anonymized.
        regex: '(?<!\w)(?:(?:\+|00)?(?:32|33|352)|0)\s?[1-9](?:[\s.-]?\d{2}){3,4}\b'
        score: 0.8

  - name: EmailRecognizer
    entity_name: EMAIL_ADDRESS
    supported_language: fr
    patterns:
      - name: Email_Pattern
        # E-mail addresses. The top-level-domain class is [A-Za-z]; writing it
        # as [A-Z|a-z] would also accept a literal "|" character.
        regex: '\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
        score: 1.0

  # Recognizer for monetary amounts (newly added entity).
  - name: MoneyRecognizer
    supported_language: fr
    entity_name: MONEY
    patterns:
      # Amounts with the currency marker before or after the number, e.g.
      # "EUR 250.000", "250.000 EUR", "250 000€".
      - name: Money_Pattern
        score: 0.85
        regex: '(?:EUR|€)\s*\d{1,3}(?:[.,\s]\d{3})*(?:[.,]\d{2})?|\d{1,3}(?:[.,\s]\d{3})*(?:[.,]\d{2})?\s*(?:EUR|€)'


# Recognizer registry: the list of recognizers that are enabled.
# Entries are the names of the recognizers declared under 'recognizers'.
# Migration note: the former entry
# 'presidio_config.custom_recognizers.custom_recognizers' was removed in favour
# of the individual custom recognizer names listed here.
recognizer_registry:
  # Always keep 'default' so Presidio's built-in recognizers stay active.
  - default
  - BelgianNRNRecognizer
  - BelgianEnterpriseRecognizer
  - IBANRecognizer
  - PhoneRecognizer
  - EmailRecognizer
  # Enables the MONEY recognizer.
  - MoneyRecognizer

# Words to ignore (common false positives).
# Each entry gives the literal text and the entity type that the word is
# frequently mislabelled as; every entry must carry both keys.
allow_list:
  # Generic contractual / legal / financial terms that are often mislabelled
  - text: Contrat
    type: LOCATION  # 'Contrat' is often seen as a LOCATION
  - text: contrat
    type: LOCATION
  - text: Contrats
    type: LOCATION
  - text: Document
    type: LOCATION
  - text: document
    type: LOCATION
  - text: Société
    type: PERSON  # 'Société' is often seen as a PERSON
  - text: Investisseur
    type: PERSON
  - text: Montant
    type: LOCATION  # often seen as a LOCATION; amounts themselves are now detected as MONEY
  - text: Prêt
    type: LOCATION
  - text: Intérêt
    type: LOCATION
  - text: Intérêts
    type: LOCATION
  - text: Partie
    type: LOCATION
  - text: Parties
    type: PERSON  # 'Parties' is often seen as a PERSON
  - text: Annexe
    type: LOCATION
  - text: Remboursement
    type: LOCATION
  - text: Conversion
    type: LOCATION
  - text: Financement
    type: LOCATION
  - text: Sortie
    type: LOCATION  # important: 'Sortie' was being mislabelled
  - text: "Juste Valeur Marchande"
    type: PERSON  # was being mislabelled
  - text: Échéance
    type: LOCATION
  - text: Clause
    type: LOCATION
  - text: Clauses
    type: LOCATION
  - text: Principe
    type: LOCATION
  - text: Coûts
    type: PERSON  # was being mislabelled
  - text: Notifications
    type: LOCATION  # was being mislabelled
  - text: Article
    type: LOCATION
  - text: Paragraphe
    type: LOCATION
  - text: Directeur
    type: LOCATION  # or PERSON, depending on how it gets mislabelled on its own
  - text: Gérant
    type: LOCATION
  - text: Président
    type: LOCATION
  - text: DocuSign  # added to avoid the PERSON false positive
    type: PERSON
  - text: SPRL  # Belgian company form
    type: ORG  # marks that this is not a person

  # Common financial terms
  - text: Euro
    type: LOCATION  # in case 'Euro' is mislabelled
  - text: EUR
    type: LOCATION
  - text: Euros
    type: LOCATION
  - text: Pourcentage
    type: LOCATION
  - text: Taux
    type: LOCATION
  - text: Valeur
    type: LOCATION
  - text: Prix
    type: LOCATION

  # Legal terms
  - text: Loi
    type: LOCATION
  - text: Code
    type: LOCATION
  - text: Règlement
    type: LOCATION
  - text: Décret
    type: LOCATION
  - text: Arrêté
    type: LOCATION

  # Month names and generic dates (in case they are mislabelled)
  - text: Janvier
    type: LOCATION
  - text: Février
    type: LOCATION
  - text: Mars
    type: LOCATION
  - text: Avril
    type: LOCATION
  - text: Mai
    type: LOCATION
  - text: Juin
    type: LOCATION
  - text: Juillet
    type: LOCATION
  - text: Août
    type: LOCATION
  - text: Septembre
    type: LOCATION
  - text: Octobre
    type: LOCATION
  - text: Novembre
    type: LOCATION
  - text: Décembre
    type: LOCATION

# Anonymization: which operator is applied to each detected entity type, and
# the placeholder text used by the 'replace' operator.
anonymizer_config:
  default_anonymizers:
    PERSON: replace
    EMAIL_ADDRESS: replace
    PHONE_NUMBER: replace
    BE_NATIONAL_REGISTER_NUMBER: replace
    BE_ENTERPRISE_NUMBER: replace
    FR_SOCIAL_SECURITY_NUMBER: replace
    IBAN: replace
    # Only relevant if a BE_BANK_ACCOUNT entity is actually produced.
    BE_BANK_ACCOUNT: replace
    # Anonymizes the MONEY entity.
    MONEY: replace
    # Keep if detected locations should be anonymized.
    LOCATION: replace
    # Keep if detected organizations should be anonymized.
    ORG: replace

  # Custom replacement values, one placeholder per entity type.
  # NOTE(review): the BE_BANK_ACCOUNT placeholder is the only one that is not
  # the entity name in angle brackets; confirm this is intentional.
  replacements:
    PERSON: '<PERSON>'
    EMAIL_ADDRESS: '<EMAIL_ADDRESS>'
    PHONE_NUMBER: '<PHONE_NUMBER>'
    BE_NATIONAL_REGISTER_NUMBER: '<BE_NATIONAL_REGISTER_NUMBER>'
    BE_ENTERPRISE_NUMBER: '<BE_ENTERPRISE_NUMBER>'
    FR_SOCIAL_SECURITY_NUMBER: '<FR_SOCIAL_SECURITY_NUMBER>'
    IBAN: '<IBAN>'
    BE_BANK_ACCOUNT: '<COMPTE_BANCAIRE_BE>'
    MONEY: '<MONEY>'
    LOCATION: '<LOCATION>'
    ORG: '<ORGANIZATION>'