diff --git a/app.py b/app.py index 4ea6fd6..768a1b4 100644 --- a/app.py +++ b/app.py @@ -28,6 +28,7 @@ try: config = config_loader.load_config("main.yaml") logger.info("✅ Configuration modulaire chargée avec succès") + # Normalisation douce de l'allow_list (préserve la structure des mots) allow_list_terms = set(term.lower().strip() for term in config.get('allow_list', [])) logger.info(f"✅ Allow list chargée avec {len(allow_list_terms)} termes") @@ -69,9 +70,114 @@ except Exception as e: def normalize_label(text: str) -> str: + # Règles générales de normalisation pour gérer tous les cas + text = text.strip().lower() + + # 1. Supprimer parenthèses et leur contenu + text = re.sub(r'\([^)]*\)', '', text) + + # 2. Supprimer virgules et points suivis d'un espace + text = re.sub(r'[,.] ', ' ', text) + + # 3. Supprimer points collés (ex: "Dr.Marie" -> "Dr Marie") + text = re.sub(r'\.(\w)', r' \1', text) + + # 4. Supprimer tirets collés aux espaces SEULEMENT (garder les tirets dans les mots composés) + text = re.sub(r'- ', ' ', text) # "expert- comptable" -> "expert comptable" + text = re.sub(r' -', ' ', text) # "expert -comptable" -> "expert comptable" + + # 5. Supprimer deux-points et ce qui suit (ex: "n° IEC: 567890" -> "n° IEC") + text = re.sub(r':.*$', '', text) + + # 6. Normaliser les espaces multiples + text = re.sub(r'\s+', ' ', text) + + # 7. Normalisation finale : garder lettres, chiffres, espaces ET tirets pour mots composés + cleaned = re.sub(r'[^\w\s-]', '', text) + + # 8. Nettoyer les espaces en début/fin + return cleaned.strip() - cleaned = re.sub(r'[^\w\s]', '', text.strip().lower()) - return cleaned + +def filter_by_category(results, mode): + """Filtre les résultats selon la catégorie sélectionnée""" + if mode == "pii_business": + return results # Tout + + # Définir les entités PII (Données personnelles) + pii_entities = { + # Données personnelles de base + 'PERSONNE', 'PERSON', 'DATE', 'DATE_TIME', + 'EMAIL_ADDRESS', 'ADRESSE_EMAIL', 'PHONE_NUMBER', 'TELEPHONE', + 'CREDIT_CARD', 'IBAN', 'ADRESSE_IP', + + # Adresses personnelles + 'ADRESSE', 'ADRESSE_FRANCAISE', 'ADRESSE_BELGE', 'LOCATION', + + # Téléphones personnels + 'TELEPHONE_FRANCAIS', 'TELEPHONE_BELGE', + + # Documents d'identité personnels + 'NUMERO_SECURITE_SOCIALE_FRANCAIS', 'REGISTRE_NATIONAL_BELGE', + 'CARTE_IDENTITE_FRANCAISE', 'CARTE_IDENTITE_BELGE', + 'PASSEPORT_FRANCAIS', 'PASSEPORT_BELGE', + 'PERMIS_CONDUIRE_FRANCAIS', + + # Données financières personnelles + 'COMPTE_BANCAIRE_FRANCAIS', + + # Données sensibles RGPD + 'HEALTH_DATA', 'DONNEES_SANTE', + 'SEXUAL_ORIENTATION', 'ORIENTATION_SEXUELLE', + 'POLITICAL_OPINIONS', 'OPINIONS_POLITIQUES', + 'BIOMETRIC_DATA', 'DONNEES_BIOMETRIQUES', + 'RGPD_FINANCIAL_DATA', 'DONNEES_FINANCIERES_RGPD', + + # Identifiants personnels + 'IDENTIFIANT_PERSONNEL' + } + + # Définir les entités Business (Données d'entreprise) + business_entities = { + # Organisations et sociétés + 'ORGANISATION', 'ORGANIZATION', + 'SOCIETE_FRANCAISE', 'SOCIETE_BELGE', + + # Identifiants fiscaux et d'entreprise + 'TVA_FRANCAISE', 'TVA_BELGE', + 'NUMERO_FISCAL_FRANCAIS', 'SIRET_SIREN_FRANCAIS', + 'NUMERO_ENTREPRISE_BELGE', + + # Identifiants professionnels + 'ID_PROFESSIONNEL_BELGE', + + # Données commerciales + 'MARKET_SHARE', 'SECRET_COMMERCIAL', + 'REFERENCE_CONTRAT', 'MONTANT_FINANCIER', + + # Données techniques d'entreprise + 'CLE_API_SECRETE' + } + + # Définir les entités mixtes (PII + Business) + mixed_entities = { + # Données pouvant être personnelles ou professionnelles + 'TITRE_CIVILITE', 'DONNEES_PROFESSIONNELLES', + 'LOCALISATION_GPS', 'URL_IDENTIFIANT' + } + + if mode == "pii": + # Inclure PII + mixtes + allowed_entities = pii_entities | mixed_entities + return [r for r in results if r.entity_type in allowed_entities] + + elif mode == "business": + # Inclure Business + mixtes + allowed_entities = business_entities | mixed_entities + return [r for r in results if r.entity_type in allowed_entities] + + # Par défaut, retourner tous les résultats + return results # Remplacer ligne 18 @@ -87,6 +193,7 @@ def analyze_text(): data = request.get_json(force=True) text_to_analyze = data.get("text", "") language = data.get("language", "fr") + mode = data.get("mode", "pii_business") # Nouveau paramètre if not text_to_analyze: return jsonify({"error": "text field is missing or empty"}), 400 @@ -94,8 +201,11 @@ def analyze_text(): # Analyse brute raw_results = analyzer.analyze(text=text_to_analyze, language=language) + # Filtrer selon la catégorie + filtered_results = filter_by_category(raw_results, mode) + # Pipeline modulaire complet - final_results = pipeline.process(text_to_analyze, raw_results, allow_list_terms) + final_results = pipeline.process(text_to_analyze, filtered_results, allow_list_terms) response_data = [res.to_dict() for res in final_results] return make_response(jsonify(response_data), 200) @@ -216,12 +326,12 @@ def anonymize_text(): logger.info(f"🔍 Traitement entité: {res.entity_type} = '{ent_text}' (score: {res.score})") logger.info(f"🔍 Allow list terms: {allow_list_terms}") - # Vérification améliorée de la allow list - ent_text_clean = re.sub(r'[^\w]', '', ent_text.strip().lower()) - logger.info(f"🔍 Texte nettoyé: '{ent_text_clean}'") + # Normalisation douce du texte de l'entité (cohérente avec l'allow_list) + ent_text_normalized = ent_text.lower().strip() + logger.info(f"🔍 Texte normalisé: '{ent_text_normalized}'") - # Vérifier si le texte correspond exactement ou commence par un terme de la allow list - is_allowed = any(ent_text_clean == term or ent_text_clean.startswith(term) for term in allow_list_terms) + # Vérifier si l'entité est dans l'allow-list (correspondance exacte) + is_allowed = ent_text_normalized in allow_list_terms if is_allowed: logger.info(f"✅ Entité '{ent_text}' ignorée (dans allow list)") diff --git a/conf/anonymization/allow_list.yaml b/conf/anonymization/allow_list.yaml index b4b9235..c819b54 100644 --- a/conf/anonymization/allow_list.yaml +++ b/conf/anonymization/allow_list.yaml @@ -9,12 +9,8 @@ allow_list: - BCE - TVA - IEC - - expert-comptable - prestataire # Termes financiers - - Euro - - EUR - - Euros - Taux - Valeur - Prix diff --git a/conf/anonymization/replacements.yaml b/conf/anonymization/replacements.yaml index f292e5a..65c5b21 100644 --- a/conf/anonymization/replacements.yaml +++ b/conf/anonymization/replacements.yaml @@ -1,89 +1,85 @@ -# Configuration d'anonymisation complète +# Configuration d'anonymisation anonymizer_config: - default_anonymizers: - # Entités génériques - PERSON: replace - LOCATION: replace - ORGANIZATION: replace - DATE: replace - MONEY: replace - EMAIL_ADDRESS: replace - IBAN: replace - IP_ADDRESS: replace - - # PII Génériques - Données sensibles RGPD - HEALTH_DATA: replace - BIOMETRIC_DATA: replace - SEXUAL_ORIENTATION: replace - POLITICAL_OPINIONS: replace - RGPD_FINANCIAL_DATA: replace - - # PII Belges - BE_ENTERPRISE_NUMBER: replace - BE_NATIONAL_REGISTER_NUMBER: replace - BE_PHONE_NUMBER: replace - BE_ADDRESS: replace - BE_ID_CARD: replace - BE_PASSPORT: replace - - # PII Françaises - FR_SOCIAL_SECURITY_NUMBER: replace - FR_SIRET: replace - FR_ADDRESS: replace - FR_TAX_ID: replace - FR_BANK_ACCOUNT: replace - FR_ID_CARD: replace - FR_PASSPORT: replace - FR_DRIVER_LICENSE: replace - - # Business - BE_PROFESSIONAL_ID: replace - MARKET_SHARE: replace - replacements: - # Entités génériques + # ======================================== + # ENTITÉS PII (Personally Identifiable Information) + # ======================================== + + # Données personnelles de base + DATE: "[DATE]" + DATE_TIME: "[DATE]" + PERSONNE: "[PERSONNE]" PERSON: "[PERSONNE]" LOCATION: "[LIEU]" - ORGANIZATION: "[ORGANISATION]" - DATE: "[DATE]" - MONEY: "[MONTANT]" - EMAIL_ADDRESS: "[EMAIL]" + EMAIL_ADDRESS: "[ADRESSE_EMAIL]" + ADRESSE_EMAIL: "[ADRESSE_EMAIL]" + PHONE_NUMBER: "[TELEPHONE]" + CREDIT_CARD: "[CARTE_CREDIT]" IBAN: "[IBAN]" - IP_ADDRESS: "[ADRESSE_IP]" - - # PII Belges - AJOUTER CES LIGNES - BE_ENTERPRISE_NUMBER: "[ENTREPRISE_BELGE]" - BE_PHONE_NUMBER: "[TELEPHONE_BELGE]" - BE_ADDRESS: "[ADRESSE_BELGE]" - BE_ID_CARD: "[CARTE_ID_BELGE]" - BE_PASSPORT: "[PASSEPORT_BELGE]" + ADRESSE_IP: "[ADRESSE_IP]" - # PII Génériques - Données sensibles RGPD - HEALTH_DATA: "[DONNEES_SANTE]" - BIOMETRIC_DATA: "[DONNEES_BIOMETRIQUES]" - SEXUAL_ORIENTATION: "[ORIENTATION_SEXUELLE]" - POLITICAL_OPINIONS: "[OPINIONS_POLITIQUES]" - RGPD_FINANCIAL_DATA: "[DONNEES_FINANCIERES]" + # Adresses personnelles + ADRESSE: "[ADRESSE]" + ADRESSE_FRANCAISE: "[ADRESSE_FRANCAISE]" + ADRESSE_BELGE: "[ADRESSE_BELGE]" - # PII Belges - BE_ENTERPRISE_NUMBER: "[ENTREPRISE_BELGE]" - BE_NATIONAL_REGISTER_NUMBER: "[NRN_BELGE]" - BE_PHONE_NUMBER: "[TELEPHONE_BE]" - BE_ADDRESS: "[ADRESSE_BELGE]" - BE_ID_CARD: "[CARTE_ID_BE]" - BE_PASSPORT: "[PASSEPORT_BE]" + # Téléphones personnels + TELEPHONE: "[TELEPHONE]" + TELEPHONE_FRANCAIS: "[TELEPHONE_FRANCAIS]" + TELEPHONE_BELGE: "[TELEPHONE_BELGE]" - # PII Françaises - FR_SOCIAL_SECURITY_NUMBER: "[NUM_SECU_FR]" - FR_SIRET: "[SIRET_FR]" - FR_ADDRESS: "[ADRESSE_FR]" - FR_TAX_ID: "[NUM_FISCAL_FR]" - FR_BANK_ACCOUNT: "[COMPTE_BANCAIRE_FR]" - FR_ID_CARD: "[CARTE_ID_FR]" - FR_PASSPORT: "[PASSEPORT_FR]" - FR_DRIVER_LICENSE: "[PERMIS_FR]" + # Documents d'identité personnels + NUMERO_SECURITE_SOCIALE_FRANCAIS: "[NUMERO_SECURITE_SOCIALE]" + REGISTRE_NATIONAL_BELGE: "[REGISTRE_NATIONAL_BELGE]" + CARTE_IDENTITE_FRANCAISE: "[CARTE_IDENTITE_FRANCAISE]" + CARTE_IDENTITE_BELGE: "[CARTE_IDENTITE_BELGE]" + PASSEPORT_FRANCAIS: "[PASSEPORT_FRANCAIS]" + PASSEPORT_BELGE: "[PASSEPORT_BELGE]" + PERMIS_CONDUIRE_FRANCAIS: "[PERMIS_CONDUIRE_FRANCAIS]" - # Business + # Données financières personnelles + COMPTE_BANCAIRE_FRANCAIS: "[COMPTE_BANCAIRE_FRANCAIS]" - BE_PROFESSIONAL_ID: "[ID_PROFESSIONNEL_BE]" + # ======================================== + # ENTITÉS BUSINESS (Données d'entreprise) + # ======================================== + + # Organisations et sociétés + ORGANISATION: "[ORGANISATION]" + ORGANIZATION: "[ORGANISATION]" + SOCIETE_FRANCAISE: "[SOCIETE_FRANCAISE]" + SOCIETE_BELGE: "[SOCIETE_BELGE]" + + # Identifiants fiscaux et d'entreprise + TVA_FRANCAISE: "[TVA_FRANCAISE]" + TVA_BELGE: "[TVA_BELGE]" + NUMERO_FISCAL_FRANCAIS: "[NUMERO_FISCAL_FRANCAIS]" + SIRET_SIREN_FRANCAIS: "[SIRET_SIREN]" + NUMERO_ENTREPRISE_BELGE: "[NUMERO_ENTREPRISE_BELGE]" + + # Identifiants professionnels + ID_PROFESSIONNEL_BELGE: "[ID_PROFESSIONNEL_BELGE]" + + # ======================================== + # ENTITÉS MIXTES (PII + Business) + # ======================================== + + # Données pouvant être personnelles ou professionnelles + TITRE_CIVILITE: "[TITRE_CIVILITE]" + DONNEES_PROFESSIONNELLES: "[DONNEES_PROFESSIONNELLES]" + REFERENCE_CONTRAT: "[REFERENCE_CONTRAT]" + IDENTIFIANT_PERSONNEL: "[IDENTIFIANT_PERSONNEL]" + + # Données techniques et confidentielles + LOCALISATION_GPS: "[LOCALISATION_GPS]" + SECRET_COMMERCIAL: "[SECRET_COMMERCIAL]" + CLE_API_SECRETE: "[CLE_API_SECRETE]" + URL_IDENTIFIANT: "[URL_IDENTIFIANT]" + DONNEES_BIOMETRIQUES: "[DONNEES_BIOMETRIQUES]" + DONNEES_SANTE: "[DONNEES_SANTE]" + ORIENTATION_SEXUELLE: "[ORIENTATION_SEXUELLE]" + OPINIONS_POLITIQUES: "[OPINIONS_POLITIQUES]" + MONTANT_FINANCIER: "[MONTANT_FINANCIER]" + + DONNEES_FINANCIERES_RGPD: "[DONNEES_FINANCIERES_RGPD]" MARKET_SHARE: "[PART_DE_MARCHE]" diff --git a/conf/main.yaml b/conf/main.yaml index 586396c..6b3a5c5 100644 --- a/conf/main.yaml +++ b/conf/main.yaml @@ -11,14 +11,18 @@ includes: # Configuration NLP (spaCy préservée) - nlp/spacy_config.yaml - # Recognizers PII par dossier (garder uniquement les dossiers récents) + # Recognizers PII par dossier (ordre important : spécifiques avant génériques) - recognizers/PII/belgian/* - recognizers/PII/french/* - recognizers/PII/generic/* - # Recognizers Business par dossier + # Recognizers Business par dossier (ordre important : spécifiques avant génériques) - recognizers/Business/belgian/* - recognizers/Business/french/* + - recognizers/Business/generic/* + + # Recognizers génériques communs (en dernier) + - recognizers/generic/* # Configuration d'anonymisation - anonymization/* diff --git a/conf/nlp/spacy_config.yaml b/conf/nlp/spacy_config.yaml index 9f78428..345ac04 100644 --- a/conf/nlp/spacy_config.yaml +++ b/conf/nlp/spacy_config.yaml @@ -9,10 +9,10 @@ nlp_configuration: # Configuration NER globale (sans confidence_thresholds) ner_model_configuration: model_to_presidio_entity_mapping: - PER: PERSON - PERSON: PERSON - ORG: ORGANIZATION - ORGANIZATION: ORGANIZATION + PER: PERSONNE + PERSON: PERSONNE + ORG: ORGANISATION + ORGANIZATION: ORGANISATION LOC: LOCATION LOCATION: LOCATION DATE: DATE diff --git a/conf/recognizers/Business/belgian/company_forms.yaml b/conf/recognizers/Business/belgian/company_forms.yaml new file mode 100644 index 0000000..a4948bf --- /dev/null +++ b/conf/recognizers/Business/belgian/company_forms.yaml @@ -0,0 +1,34 @@ +# Recognizer pour formes juridiques belges +recognizer_registry: + recognizers: + - name: BelgianCompanyFormsRecognizer + supported_language: fr + supported_entity: SOCIETE_BELGE + patterns: + - name: SRL avec nom + regex: "\\b(?:SRL|Srl)\\s+[A-ZÀ-Ÿ][a-zà-ÿ\\s&'-]+\\b" + score: 0.95 + - name: SA avec nom + regex: "\\b(?:SA|Sa)\\s+[A-ZÀ-Ÿ][a-zà-ÿ\\s&'-]+\\b" + score: 0.95 + - name: ASBL avec nom + regex: "\\b(?:ASBL|Asbl)\\s+[A-ZÀ-Ÿ][a-zà-ÿ\\s&'-]+\\b" + score: 0.95 + - name: SC avec nom + regex: "\\b(?:SC|Sc)\\s+[A-ZÀ-Ÿ][a-zà-ÿ\\s&'-]+\\b" + score: 0.9 + - name: SNC avec nom + regex: "\\b(?:SNC|Snc)\\s+[A-ZÀ-Ÿ][a-zà-ÿ\\s&'-]+\\b" + score: 0.9 + - name: SComm avec nom + regex: "\\b(?:SComm|Scomm)\\s+[A-ZÀ-Ÿ][a-zà-ÿ\\s&'-]+\\b" + score: 0.9 + context: + [ + "société", + "entreprise", + "forme juridique", + "statut", + "commercial", + "association", + ] diff --git a/conf/recognizers/Business/belgian/enterprise_numbers.yaml b/conf/recognizers/Business/belgian/enterprise_numbers.yaml index b6da68c..787a275 100644 --- a/conf/recognizers/Business/belgian/enterprise_numbers.yaml +++ b/conf/recognizers/Business/belgian/enterprise_numbers.yaml @@ -3,7 +3,7 @@ recognizer_registry: recognizers: - name: BelgianEnterpriseRecognizer supported_language: fr - supported_entity: BE_ENTERPRISE_NUMBER + supported_entity: NUMERO_ENTREPRISE_BELGE patterns: - name: Numéro BCE avec deux points regex: "(?<=\\bBCE\\s*:\\s*)((BE)?\\s?0\\d{3}[\\.\\s]?\\d{3}[\\.\\s]?\\d{3})\\b" @@ -20,5 +20,4 @@ recognizer_registry: - name: Numéro patronal regex: "\\b(?:numéro\\s+)?patronal\\s*:?\\s*\\d{7}\\b" score: 0.9 - context: - ["TVA", "intracommunautaire", "ONSS", "entreprise", "patronal"] + context: ["TVA", "intracommunautaire", "ONSS", "entreprise", "patronal"] diff --git a/conf/recognizers/Business/belgian/organization_names.yaml b/conf/recognizers/Business/belgian/organization_names.yaml index a907189..294a202 100644 --- a/conf/recognizers/Business/belgian/organization_names.yaml +++ b/conf/recognizers/Business/belgian/organization_names.yaml @@ -3,7 +3,7 @@ recognizer_registry: recognizers: - name: SmartOrganizationRecognizer supported_language: fr - supported_entity: ORGANIZATION + supported_entity: ORGANISATION patterns: # Noms avec suffixes typiques d'entreprise - name: Noms entreprise avec suffixes diff --git a/conf/recognizers/Business/belgian/professional_ids.yaml b/conf/recognizers/Business/belgian/professional_ids.yaml index 74e124e..166d65c 100644 --- a/conf/recognizers/Business/belgian/professional_ids.yaml +++ b/conf/recognizers/Business/belgian/professional_ids.yaml @@ -3,7 +3,7 @@ recognizer_registry: recognizers: - name: BelgianProfessionalIdRecognizer supported_language: fr - supported_entity: BE_PROFESSIONAL_ID + supported_entity: ID_PROFESSIONNEL_BELGE patterns: - name: Numéro IEC avec deux points regex: "(?<=\\bIEC\\s*:\\s*)\\d{6}\\b" @@ -17,4 +17,4 @@ recognizer_registry: - name: Numéro de médecin regex: "\\b(?:Dr\\.|médecin)\\s*n°\\s*\\d{5,7}\\b" score: 0.85 - context: ["expert-comptable", "IEC", "avocat", "médecin", "professionnel"] + context: ["IEC", "avocat", "médecin", "professionnel"] diff --git a/conf/recognizers/Business/french/company_forms.yaml b/conf/recognizers/Business/french/company_forms.yaml new file mode 100644 index 0000000..149d899 --- /dev/null +++ b/conf/recognizers/Business/french/company_forms.yaml @@ -0,0 +1,23 @@ +# Recognizer pour formes juridiques françaises +recognizer_registry: + recognizers: + - name: FrenchCompanyFormsRecognizer + supported_language: fr + supported_entity: SOCIETE_FRANCAISE + patterns: + - name: SARL avec nom + regex: "\\b(?:SARL|Sarl)\\s+[A-ZÀ-Ÿ][a-zà-ÿ\\s&'-]+\\b" + score: 0.95 + - name: SAS avec nom + regex: "\\b(?:SAS|Sas)\\s+[A-ZÀ-Ÿ][a-zà-ÿ\\s&'-]+\\b" + score: 0.95 + - name: SA avec nom + regex: "\\b(?:SA|Sa)\\s+[A-ZÀ-Ÿ][a-zà-ÿ\\s&'-]+\\b" + score: 0.9 + - name: EURL avec nom + regex: "\\b(?:EURL|Eurl)\\s+[A-ZÀ-Ÿ][a-zà-ÿ\\s&'-]+\\b" + score: 0.95 + - name: SCI avec nom + regex: "\\b(?:SCI|Sci)\\s+[A-ZÀ-Ÿ][a-zà-ÿ\\s&'-]+\\b" + score: 0.9 + context: ["société", "entreprise", "forme juridique", "statut", "commercial"] \ No newline at end of file diff --git a/conf/recognizers/Business/french/siret_siren.yaml b/conf/recognizers/Business/french/siret_siren.yaml index 0e2bb5e..18aa248 100644 --- a/conf/recognizers/Business/french/siret_siren.yaml +++ b/conf/recognizers/Business/french/siret_siren.yaml @@ -3,7 +3,7 @@ recognizer_registry: recognizers: - name: FrenchSIRETRecognizer supported_language: fr - supported_entity: FR_SIRET + supported_entity: SIRET_SIREN_FRANCAIS patterns: - name: SIRET complet regex: "\\b[0-9]{3}\\s?[0-9]{3}\\s?[0-9]{3}\\s?[0-9]{5}\\b" diff --git a/conf/recognizers/Business/generic/api_secrets.yaml b/conf/recognizers/Business/generic/api_secrets.yaml new file mode 100644 index 0000000..6685dfc --- /dev/null +++ b/conf/recognizers/Business/generic/api_secrets.yaml @@ -0,0 +1,20 @@ +# Recognizer pour clés API et secrets techniques +recognizer_registry: + recognizers: + - name: APISecretsRecognizer + supported_language: fr + supported_entity: CLE_API_SECRETE + patterns: + - name: Clé API générique + regex: "\\b(?:API[_\\s]?KEY|api[_\\s]?key)\\s*[=:]\\s*[A-Za-z0-9\\-_]{16,64}\\b" + score: 1.0 + - name: Token d'accès + regex: "\\b(?:access[_\\s]?token|token)\\s*[=:]\\s*[A-Za-z0-9\\-_\\.]{20,128}\\b" + score: 0.95 + - name: Secret AWS + regex: "\\b(?:AWS[_\\s]?SECRET|aws[_\\s]?secret)\\s*[=:]\\s*[A-Za-z0-9/+=]{40}\\b" + score: 1.0 + - name: Clé privée + regex: "\\b(?:private[_\\s]?key|secret[_\\s]?key)\\s*[=:]\\s*[A-Za-z0-9\\-_]{16,64}\\b" + score: 0.95 + context: ["API", "clé", "secret", "token", "authentification", "accès"] \ No newline at end of file diff --git a/conf/recognizers/Business/generic/contracts_references.yaml b/conf/recognizers/Business/generic/contracts_references.yaml new file mode 100644 index 0000000..5c41503 --- /dev/null +++ b/conf/recognizers/Business/generic/contracts_references.yaml @@ -0,0 +1,20 @@ +# Recognizer pour contrats et références internes +recognizer_registry: + recognizers: + - name: ContractReferenceRecognizer + supported_language: fr + supported_entity: REFERENCE_CONTRAT + patterns: + - name: Numéro de contrat + regex: "\\b(?:contrat|contract)\\s*n?°?\\s*:?\\s*[A-Z0-9\\-/]{4,15}\\b" + score: 0.95 + - name: Référence interne + regex: "\\b(?:ref|référence|dossier)\\s*:?\\s*[A-Z]{2,4}[\\-/]?[0-9]{4,8}\\b" + score: 0.9 + - name: ID transaction + regex: "\\b(?:transaction|trans)\\s*ID\\s*:?\\s*[A-Z0-9]{6,12}\\b" + score: 0.95 + - name: Numéro de facture + regex: "\\b(?:facture|invoice)\\s*n?°?\\s*:?\\s*[A-Z0-9\\-/]{4,12}\\b" + score: 0.9 + context: ["contrat", "référence", "dossier", "facture", "transaction", "commande"] \ No newline at end of file diff --git a/conf/recognizers/Business/generic/employee_client_ids.yaml b/conf/recognizers/Business/generic/employee_client_ids.yaml new file mode 100644 index 0000000..4672a47 --- /dev/null +++ b/conf/recognizers/Business/generic/employee_client_ids.yaml @@ -0,0 +1,20 @@ +# Recognizer pour identifiants employés et clients +recognizer_registry: + recognizers: + - name: EmployeeClientIDRecognizer + supported_language: fr + supported_entity: ID_PROFESSIONNEL_BELGE + patterns: + - name: Matricule employé + regex: "\\b(?:matricule|employee|emp)\\s*:?\\s*[A-Z0-9]{4,10}\\b" + score: 0.95 + - name: ID client + regex: "\\b(?:client|customer)\\s*ID\\s*:?\\s*[A-Z0-9]{4,12}\\b" + score: 0.95 + - name: Code utilisateur + regex: "\\b(?:user|utilisateur)\\s*:?\\s*[a-z]+\\.[a-z]+\\b" + score: 0.9 + - name: Identifiant RH + regex: "\\b(?:RH|HR)[\\-/]?[0-9]{4,8}\\b" + score: 0.85 + context: ["matricule", "employé", "client", "utilisateur", "ID", "identifiant"] \ No newline at end of file diff --git a/conf/recognizers/Business/generic/financial_amounts.yaml b/conf/recognizers/Business/generic/financial_amounts.yaml new file mode 100644 index 0000000..8841fa1 --- /dev/null +++ b/conf/recognizers/Business/generic/financial_amounts.yaml @@ -0,0 +1,29 @@ +# Recognizer pour montants financiers et devises +recognizer_registry: + recognizers: + - name: FinancialAmountRecognizer + supported_language: fr + supported_entity: MONTANT_FINANCIER + patterns: + - name: Montant avec devise EUR + regex: "\\b(?:[0-9]{1,3}(?:[\\s.,][0-9]{3})*|[0-9]+)(?:[.,][0-9]{1,2})?\\s*€\\b" + score: 0.95 + - name: Montant avec devise USD + regex: "\\b(?:[0-9]{1,3}(?:[\\s.,][0-9]{3})*|[0-9]+)(?:[.,][0-9]{1,2})?\\s*(?:USD|\\$)\\b" + score: 0.95 + - name: Montant abrégé avec K/M + regex: "" + score: 0.9 + - name: Salaire annuel + regex: "" + score: 0.95 + context: + [ + "montant", + "prix", + "coût", + "budget", + "salaire", + "rémunération", + "facture", + ] diff --git a/conf/recognizers/Business/generic/market_share.yaml b/conf/recognizers/Business/generic/market_share.yaml index 84bebc1..5e43e09 100644 --- a/conf/recognizers/Business/generic/market_share.yaml +++ b/conf/recognizers/Business/generic/market_share.yaml @@ -5,30 +5,52 @@ recognizer_registry: supported_language: fr supported_entity: MARKET_SHARE patterns: - # Pourcentages de marché + # Pourcentages simples (nouveau pattern plus permissif) + - name: Simple Percentage + regex: "\\b\\d{1,2}(?:[,.]\\d{1,2})?%\\b" + score: 0.7 + + # Part de marché explicite + - name: Explicit Market Share + regex: "\\b(?:part\\s+de\\s+marché|parts?\\s+de\\s+marché)\\b" + score: 0.9 + + # Pourcentages de marché avec contexte - name: Market Share Percentage regex: "\\b(?:détient|possède|contrôle|représente)?\\s*(?:environ\\s+)?(?:\\d{1,2}(?:[,.]\\d{1,2})?%)\\s*(?:de\\s+(?:part\\s+de\\s+)?marché|du\\s+marché|de\\s+parts?)\\b" score: 0.9 - + # Positions de marché - name: Market Position regex: "\\b(?:leader|numéro\\s+\\d+|\\d+(?:er|ème)\\s+acteur|position\\s+dominante|monopole)\\s+(?:du\\s+)?(?:marché|secteur)\\b" score: 0.85 - + # Parts relatives - name: Relative Market Share regex: "\\b(?:majoritaire|minoritaire|principale|significative)\\s+(?:part\\s+de\\s+)?marché\\b" score: 0.8 - + # Données de concentration - name: Market Concentration regex: "\\b(?:concentration|consolidation|fusion)\\s+(?:du\\s+)?marché\\b" score: 0.75 - + # Chiffres d'affaires relatifs - name: Revenue Share regex: "\\b(?:\\d{1,2}(?:[,.]\\d{1,2})?%)\\s*(?:du\\s+)?(?:chiffre\\s+d'affaires|CA|revenus?)\\s+(?:du\\s+)?(?:marché|secteur)\\b" score: 0.85 - + context: - ["part de marché", "position concurrentielle", "leader", "concurrent", "secteur", "industrie", "chiffre d'affaires", "revenus", "concentration", "monopole", "oligopole"] \ No newline at end of file + [ + "part de marché", + "position concurrentielle", + "leader", + "concurrent", + "secteur", + "industrie", + "chiffre d'affaires", + "revenus", + "concentration", + "monopole", + "oligopole", + ] diff --git a/conf/recognizers/Business/generic/professional_data.yaml b/conf/recognizers/Business/generic/professional_data.yaml new file mode 100644 index 0000000..71b83f6 --- /dev/null +++ b/conf/recognizers/Business/generic/professional_data.yaml @@ -0,0 +1,71 @@ +# Recognizer pour données professionnelles génériques (France/Belgique) +recognizer_registry: + recognizers: + # Recognizer pour titres de civilité + - name: GenericCivilityTitleRecognizer + supported_language: fr + supported_entity: TITRE_CIVILITE + patterns: + - name: Titres de civilité + regex: "\\b(?:M\\.|Mme|Mlle|Dr\\.|Pr\\.|Prof\\.|Docteur|Professeur|Maître|Me\\.)(?=\\s+[A-ZÀ-Ÿ])" + score: 0.9 + - name: Titres honorifiques + regex: "\\b(?:Monsieur|Madame|Mademoiselle)(?=\\s+[A-ZÀ-Ÿ])" + score: 0.85 + context: ["identité", "titre", "civilité"] + + # Recognizer pour données professionnelles générales + - name: GenericProfessionalDataRecognizer + supported_language: fr + supported_entity: DONNEES_PROFESSIONNELLES + patterns: + - name: Titre de poste + regex: "\\b(?:directeur|directrice|manager|responsable|chef|ingénieur|ingénieure|consultant|consultante)\\s+[a-zà-ÿ\\s]+\\b" + score: 0.8 + - name: Département + regex: "\\b(?:département|service|division)\\s+[A-ZÀ-Ÿ][a-zà-ÿ\\s]+\\b" + score: 0.75 + - name: Adresse professionnelle + regex: "\\b(?:siège\\s+social|adresse\\s+professionnelle)\\s*:?\\s*[0-9]{1,4}\\s+[A-ZÀ-Ÿ][a-zà-ÿ\\s'-]+\\b" + score: 0.9 + - name: Email professionnel + regex: "\\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}\\b" + score: 0.85 + - name: Numéro IEC + regex: "\\b(?:n°\\s*IEC|numéro\\s*IEC|IEC)\\s*:?\\s*([0-9]{6,8})\\b" + score: 0.9 + - name: Avocat + regex: "\\b(?:avocat|avocate)\\b" + score: 0.9 + - name: Expert-comptable + regex: "\\b(?:expert-comptable|expert\\s+comptable)\\b" + score: 0.99 + - name: Notaire + regex: "\\b(?:notaire)\\b" + score: 0.95 + - name: Médecin + regex: "\\b(?:médecin|docteur\\s+en\\s+médecine)\\b" + score: 0.95 + # Données spécifiques belges intégrées + - name: Numéro ONSS employeur + regex: "\\b(?:ONSS|onss)\\s*:?\\s*[0-9]{7}\\b" + score: 0.95 + - name: Numéro patronal + regex: "\\b(?:numéro\\s+)?patronal\\s*:?\\s*[0-9]{7}\\b" + score: 0.9 + context: + [ + "professionnel", + "travail", + "bureau", + "entreprise", + "poste", + "fonction", + "réglementé", + "ordre", + "diplôme", + "ONSS", + "patronal", + "employeur", + "siège social" + ] \ No newline at end of file diff --git a/conf/recognizers/Business/generic/trade_secrets.yaml b/conf/recognizers/Business/generic/trade_secrets.yaml new file mode 100644 index 0000000..f8e0bb4 --- /dev/null +++ b/conf/recognizers/Business/generic/trade_secrets.yaml @@ -0,0 +1,20 @@ +# Recognizer pour secrets d'affaires et projets +recognizer_registry: + recognizers: + - name: TradeSecretsRecognizer + supported_language: fr + supported_entity: SECRET_COMMERCIAL + patterns: + - name: Nom de projet interne + regex: "\\b(?:projet|project)\\s+[A-Z][a-zA-Z]{3,15}\\b" + score: 0.85 + - name: Code projet + regex: "\\b(?:projet|project)\\s*:?\\s*[A-Z]{2,4}[\\-/]?[0-9]{2,4}\\b" + score: 0.9 + - name: Plan stratégique + regex: "\\b(?:plan|stratégie)\\s+(?:stratégique|business)\\s+[0-9]{4}\\b" + score: 0.9 + - name: Formule interne + regex: "\\b(?:formule|recette|procédé)\\s+[A-Z][\\-0-9A-Z]{2,10}\\b" + score: 0.85 + context: ["projet", "stratégique", "confidentiel", "interne", "secret", "propriétaire"] \ No newline at end of file diff --git a/conf/recognizers/PII/belgian/addresses.yaml b/conf/recognizers/PII/belgian/addresses.yaml index d17fdee..9757f03 100644 --- a/conf/recognizers/PII/belgian/addresses.yaml +++ b/conf/recognizers/PII/belgian/addresses.yaml @@ -3,7 +3,7 @@ recognizer_registry: recognizers: - name: BelgianAddressRecognizer supported_language: fr - supported_entity: BE_ADDRESS + supported_entity: ADRESSE_BELGE patterns: # Pattern principal : numéro + rue + code postal + ville (SANS contexte) - name: Adresse complète avec numéro devant diff --git a/conf/recognizers/PII/belgian/documents.yaml b/conf/recognizers/PII/belgian/documents.yaml index 1483fd6..d191a5c 100644 --- a/conf/recognizers/PII/belgian/documents.yaml +++ b/conf/recognizers/PII/belgian/documents.yaml @@ -3,7 +3,7 @@ recognizer_registry: recognizers: - name: BelgianIDCardRecognizer supported_language: fr - supported_entity: BE_ID_CARD + supported_entity: CARTE_IDENTITE_BELGE patterns: - name: Carte d'identité belge regex: "\\b[0-9]{3}\\-[0-9]{7}\\-[0-9]{2}\\b" @@ -15,7 +15,7 @@ recognizer_registry: - name: BelgianPassportRecognizer supported_language: fr - supported_entity: BE_PASSPORT + supported_entity: PASSEPORT_BELGE patterns: - name: Passeport belge regex: "\\b[A-Z]{2}[0-9]{6}\\b" diff --git a/conf/recognizers/PII/belgian/national_register.yaml b/conf/recognizers/PII/belgian/national_register.yaml index 34b8f95..8dddf84 100644 --- a/conf/recognizers/PII/belgian/national_register.yaml +++ b/conf/recognizers/PII/belgian/national_register.yaml @@ -3,7 +3,7 @@ recognizer_registry: recognizers: - name: BelgianNRNRecognizer supported_language: fr - supported_entity: BE_NATIONAL_REGISTER_NUMBER + supported_entity: REGISTRE_NATIONAL_BELGE patterns: - name: NRN avec points et tiret regex: "\\b[0-9]{2}\\.[0-9]{2}\\.[0-9]{2}-[0-9]{3}\\.[0-9]{2}\\b" diff --git a/conf/recognizers/PII/belgian/phones.yaml b/conf/recognizers/PII/belgian/phones.yaml index eda9bc2..4a3c768 100644 --- a/conf/recognizers/PII/belgian/phones.yaml +++ b/conf/recognizers/PII/belgian/phones.yaml @@ -3,7 +3,7 @@ recognizer_registry: recognizers: - name: BelgianPhoneRecognizer supported_language: fr - supported_entity: BE_PHONE_NUMBER + supported_entity: TELEPHONE_BELGE patterns: # Patterns avec contexte Tel: et Tél: - name: Téléphone fixe avec contexte Tel @@ -20,7 +20,7 @@ recognizer_registry: score: 0.99 # Patterns généraux (sans contexte spécifique) - name: Téléphone fixe belge - regex: '(? {new_regex}") + def _merge_config(self, new_config: Dict[str, Any]): for key, value in new_config.items(): if key == 'recognizer_registry': diff --git a/pipeline_manager.py b/pipeline_manager.py index 75fdcda..8c30d76 100644 --- a/pipeline_manager.py +++ b/pipeline_manager.py @@ -2,6 +2,7 @@ from typing import List from presidio_analyzer import RecognizerResult from entity_refiners import EntityRefinerManager from post_processors import DeduplicationProcessor, OverlapResolver +from post_processors.cleanup_processor import CleanupProcessor import logging logger = logging.getLogger(__name__) @@ -9,9 +10,10 @@ logger = logging.getLogger(__name__) class AnalysisPipeline: def __init__(self): self.refiner_manager = EntityRefinerManager() + self.cleanup_processor = CleanupProcessor() self.overlap_resolver = OverlapResolver() self.deduplicator = DeduplicationProcessor() - logger.info("🚀 Pipeline d'analyse initialisé") + logger.info("🚀 Pipeline d'analyse initialisé avec nettoyage avancé") def process(self, text: str, results: List[RecognizerResult], allow_list_terms: List[str]) -> List[RecognizerResult]: """Traite les résultats à travers le pipeline complet""" @@ -38,10 +40,13 @@ class AnalysisPipeline: ) refined_results.append(refined_result) - # 3. Résolution des chevauchements - resolved_results = self.overlap_resolver.process(refined_results, text) + # 3. Nettoyage avancé des résultats + cleaned_results = self.cleanup_processor.process(refined_results) - # 4. Déduplication + # 4. Résolution des chevauchements + resolved_results = self.overlap_resolver.process(cleaned_results, text) + + # 5. Déduplication final_results = self.deduplicator.process(resolved_results, text) logger.info(f"🎯 Pipeline complet: {len(results)} -> {len(final_results)} entités") diff --git a/post_processors/cleanup_processor.py b/post_processors/cleanup_processor.py new file mode 100644 index 0000000..8ececca --- /dev/null +++ b/post_processors/cleanup_processor.py @@ -0,0 +1,59 @@ +from typing import List +from presidio_analyzer import RecognizerResult +import re + +class CleanupProcessor: + """Post-processor pour nettoyer les résultats d'anonymisation et éviter les chevauchements.""" + + def __init__(self): + self.name = "CleanupProcessor" + + def process(self, results: List[RecognizerResult]) -> List[RecognizerResult]: + """Nettoie les résultats pour éviter les chevauchements et les détections incorrectes.""" + if not results: + return results + + # Trier par position de début + sorted_results = sorted(results, key=lambda x: x.start) + + # Supprimer les chevauchements en gardant le score le plus élevé + cleaned_results = [] + + for current in sorted_results: + # Vérifier si ce résultat chevauche avec un résultat déjà accepté + overlaps = False + for accepted in cleaned_results: + if self._overlaps(current, accepted): + # Si le score actuel est plus élevé, remplacer + if current.score > accepted.score: + cleaned_results.remove(accepted) + cleaned_results.append(current) + overlaps = True + break + + if not overlaps: + cleaned_results.append(current) + + # Filtrer les résultats trop courts ou suspects + final_results = [] + for result in cleaned_results: + if self._is_valid_result(result): + final_results.append(result) + + return final_results + + def _overlaps(self, result1: RecognizerResult, result2: RecognizerResult) -> bool: + """Vérifie si deux résultats se chevauchent.""" + return not (result1.end <= result2.start or result2.end <= result1.start) + + def _is_valid_result(self, result: RecognizerResult) -> bool: + """Vérifie si un résultat est valide (pas trop court, pas suspect).""" + # Longueur minimale + if result.end - result.start < 2: + return False + + # Éviter les détections sur des caractères isolés + if result.entity_type == "PERSON_NAME" and result.end - result.start < 4: + return False + + return True \ No newline at end of file diff --git a/post_processors/overlap_resolver.py b/post_processors/overlap_resolver.py index b2fe4a2..4480674 100644 --- a/post_processors/overlap_resolver.py +++ b/post_processors/overlap_resolver.py @@ -20,14 +20,26 @@ class OverlapResolver: 'BE_ENTERPRISE_NUMBER': 88, 'PHONE_NUMBER': 85, 'BE_PHONE_NUMBER': 85, + 'TELEPHONE': 84, + 'TELEPHONE_FRANCAIS': 86, 'IP_ADDRESS': 82, + 'ADRESSE_FRANCAISE': 78, # Priorité plus élevée pour adresses françaises spécifiques 'BE_ADDRESS': 75, 'FR_ADDRESS': 75, - 'ORGANIZATION': 65, - 'LOCATION': 60, + 'ADRESSE': 70, # Adresse générique avec priorité plus faible + 'ORGANISATION': 65, + 'LOCATION': 60, # Priorité plus faible que les adresses 'PERSON': 50, + 'PERSON_NAME': 45, 'NRP': 40, - 'URL': 35 + 'BE_PROFESSIONAL_ID': 40, + 'FR_CIVILITY_TITLE': 85, + 'FR_REGULATED_PROFESSION': 80, + 'CARTE_IDENTITE_FRANCAISE': 78, + 'PERMIS_CONDUIRE_FRANCAIS': 76, + 'PASSEPORT_FRANCAIS': 77, + 'URL': 35, + 'MARKET_SHARE': 35 } # Patterns pour identifier les organisations @@ -112,12 +124,12 @@ class OverlapResolver: # Correction 1: PERSON -> ORGANIZATION pour les noms d'entreprise if result.entity_type == 'PERSON' and self._is_organization_name(entity_text): corrected_result = RecognizerResult( - entity_type='ORGANIZATION', + entity_type='ORGANISATION', start=result.start, end=result.end, score=result.score + 0.1 # Bonus de confiance ) - logger.debug(f"🔄 Correction PERSON -> ORGANIZATION: '{entity_text}'") + logger.debug(f"🔄 Correction PERSON -> ORGANISATION: '{entity_text}'") corrected_results.append(corrected_result) # Correction 2: Séparer IP des adresses physiques diff --git a/refiners/iban_refiner.py b/refiners/iban_refiner.py index 87b8290..19a9cb1 100644 --- a/refiners/iban_refiner.py +++ b/refiners/iban_refiner.py @@ -25,7 +25,7 @@ class IBANRefiner(EntityRefiner): def __init__(self): super().__init__("IBAN") - self.iban_regex = re.compile(r"\b[A-Z]{2}[0-9]{2}(?:\s[0-9]{4}){3}\b", re.IGNORECASE) + self.iban_regex = re.compile(r"\b[A-Z]{2}[0-9]{2}\s?(?:[A-Z0-9]{4}\s?){2,7}[A-Z0-9]{1,4}\b", re.IGNORECASE) def refine(self, text: str, start: int, end: int) -> Optional[Tuple[int, int]]: ent_text = text[start:end].strip()