new full
This commit is contained in:
126
app.py
126
app.py
@@ -28,6 +28,7 @@ try:
|
||||
config = config_loader.load_config("main.yaml")
|
||||
logger.info("✅ Configuration modulaire chargée avec succès")
|
||||
|
||||
# Normalisation douce de l'allow_list (préserve la structure des mots)
|
||||
allow_list_terms = set(term.lower().strip() for term in config.get('allow_list', []))
|
||||
logger.info(f"✅ Allow list chargée avec {len(allow_list_terms)} termes")
|
||||
|
||||
@@ -69,9 +70,114 @@ except Exception as e:
|
||||
|
||||
|
||||
def normalize_label(text: str) -> str:
    """Normalize a label so that loosely formatted variants compare equal.

    The label is lower-cased, then parenthesised content, separator
    punctuation, dangling hyphens and everything from the first colon
    onwards are removed. Hyphens inside compound words are kept, and the
    result is single-spaced with no leading or trailing whitespace.

    Args:
        text: Raw label text.

    Returns:
        The normalized label.

    Examples:
        >>> normalize_label("Dr.Marie")
        'dr marie'
        >>> normalize_label("expert- comptable")
        'expert comptable'
        >>> normalize_label("n° IEC: 567890")
        'n iec'
    """
    text = text.strip().lower()

    # 1. Drop parentheses together with their content.
    text = re.sub(r'\([^)]*\)', '', text)

    # 2. Replace a comma or period that is followed by a space.
    text = re.sub(r'[,.] ', ' ', text)

    # 3. Split words glued together by a period ("dr.marie" -> "dr marie").
    text = re.sub(r'\.(\w)', r' \1', text)

    # 4. Drop hyphens that touch a space; hyphens inside compound words
    #    ("expert-comptable") are left alone.
    text = re.sub(r'- ', ' ', text)
    text = re.sub(r' -', ' ', text)

    # 5. Drop the first colon and everything after it.
    text = re.sub(r':.*$', '', text)

    # 6. Keep only word characters, whitespace and hyphens.
    text = re.sub(r'[^\w\s-]', '', text)

    # 7. Collapse whitespace runs. This runs after the character stripping
    #    above because removing a standalone symbol ("a & b") would
    #    otherwise leave two adjacent spaces in the result.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()
|
||||
|
||||
# Entity types classified as personal data (PII).
_PII_ENTITY_TYPES = frozenset({
    # Basic personal data
    'PERSONNE', 'PERSON', 'DATE', 'DATE_TIME',
    'EMAIL_ADDRESS', 'ADRESSE_EMAIL', 'PHONE_NUMBER', 'TELEPHONE',
    # 'IP_ADDRESS' is listed next to 'ADRESSE_IP' like the other
    # English/French pairs, so IP results are kept under either label.
    'CREDIT_CARD', 'IBAN', 'ADRESSE_IP', 'IP_ADDRESS',

    # Personal addresses
    'ADRESSE', 'ADRESSE_FRANCAISE', 'ADRESSE_BELGE', 'LOCATION',

    # Personal phone numbers
    'TELEPHONE_FRANCAIS', 'TELEPHONE_BELGE',

    # Personal identity documents
    'NUMERO_SECURITE_SOCIALE_FRANCAIS', 'REGISTRE_NATIONAL_BELGE',
    'CARTE_IDENTITE_FRANCAISE', 'CARTE_IDENTITE_BELGE',
    'PASSEPORT_FRANCAIS', 'PASSEPORT_BELGE',
    'PERMIS_CONDUIRE_FRANCAIS',

    # Personal financial data
    'COMPTE_BANCAIRE_FRANCAIS',

    # GDPR sensitive data
    'HEALTH_DATA', 'DONNEES_SANTE',
    'SEXUAL_ORIENTATION', 'ORIENTATION_SEXUELLE',
    'POLITICAL_OPINIONS', 'OPINIONS_POLITIQUES',
    'BIOMETRIC_DATA', 'DONNEES_BIOMETRIQUES',
    'RGPD_FINANCIAL_DATA', 'DONNEES_FINANCIERES_RGPD',

    # Personal identifiers
    'IDENTIFIANT_PERSONNEL',
})

# Entity types classified as business data.
_BUSINESS_ENTITY_TYPES = frozenset({
    # Organisations and companies
    'ORGANISATION', 'ORGANIZATION',
    'SOCIETE_FRANCAISE', 'SOCIETE_BELGE',

    # Tax and company identifiers
    'TVA_FRANCAISE', 'TVA_BELGE',
    'NUMERO_FISCAL_FRANCAIS', 'SIRET_SIREN_FRANCAIS',
    'NUMERO_ENTREPRISE_BELGE',

    # Professional identifiers
    'ID_PROFESSIONNEL_BELGE',

    # Commercial data
    'MARKET_SHARE', 'SECRET_COMMERCIAL',
    'REFERENCE_CONTRAT', 'MONTANT_FINANCIER',

    # Technical company data
    'CLE_API_SECRETE',
})

# Entity types that can be either personal or professional; they are kept
# in both the "pii" and the "business" modes.
_MIXED_ENTITY_TYPES = frozenset({
    'TITRE_CIVILITE', 'DONNEES_PROFESSIONNELLES',
    'LOCALISATION_GPS', 'URL_IDENTIFIANT',
})

# Built once at import time instead of on every call.
_PII_MODE_TYPES = _PII_ENTITY_TYPES | _MIXED_ENTITY_TYPES
_BUSINESS_MODE_TYPES = _BUSINESS_ENTITY_TYPES | _MIXED_ENTITY_TYPES


def filter_by_category(results, mode):
    """Filter analysis results according to the selected category.

    Args:
        results: Iterable of result objects exposing an ``entity_type``
            attribute.
        mode: ``"pii"`` keeps PII and mixed entity types, ``"business"``
            keeps business and mixed entity types. Any other value,
            including ``"pii_business"``, disables filtering.

    Returns:
        A new list with the matching results for ``"pii"`` and
        ``"business"``; otherwise ``results`` itself, unchanged.
    """
    # Plain comparisons (rather than a dict lookup) so that an unexpected,
    # possibly unhashable, mode value still falls through to "no filter".
    if mode == "pii":
        allowed_entities = _PII_MODE_TYPES
    elif mode == "business":
        allowed_entities = _BUSINESS_MODE_TYPES
    else:
        return results

    return [r for r in results if r.entity_type in allowed_entities]
|
||||
|
||||
|
||||
# Remplacer ligne 18
|
||||
@@ -87,6 +193,7 @@ def analyze_text():
|
||||
data = request.get_json(force=True)
|
||||
text_to_analyze = data.get("text", "")
|
||||
language = data.get("language", "fr")
|
||||
mode = data.get("mode", "pii_business") # Nouveau paramètre
|
||||
|
||||
if not text_to_analyze:
|
||||
return jsonify({"error": "text field is missing or empty"}), 400
|
||||
@@ -94,8 +201,11 @@ def analyze_text():
|
||||
# Analyse brute
|
||||
raw_results = analyzer.analyze(text=text_to_analyze, language=language)
|
||||
|
||||
# Filtrer selon la catégorie
|
||||
filtered_results = filter_by_category(raw_results, mode)
|
||||
|
||||
# Pipeline modulaire complet
|
||||
final_results = pipeline.process(text_to_analyze, raw_results, allow_list_terms)
|
||||
final_results = pipeline.process(text_to_analyze, filtered_results, allow_list_terms)
|
||||
|
||||
response_data = [res.to_dict() for res in final_results]
|
||||
return make_response(jsonify(response_data), 200)
|
||||
@@ -216,12 +326,12 @@ def anonymize_text():
|
||||
logger.info(f"🔍 Traitement entité: {res.entity_type} = '{ent_text}' (score: {res.score})")
|
||||
logger.info(f"🔍 Allow list terms: {allow_list_terms}")
|
||||
|
||||
# Vérification améliorée de la allow list
|
||||
ent_text_clean = re.sub(r'[^\w]', '', ent_text.strip().lower())
|
||||
logger.info(f"🔍 Texte nettoyé: '{ent_text_clean}'")
|
||||
# Normalisation douce du texte de l'entité (cohérente avec l'allow_list)
|
||||
ent_text_normalized = ent_text.lower().strip()
|
||||
logger.info(f"🔍 Texte normalisé: '{ent_text_normalized}'")
|
||||
|
||||
# Vérifier si le texte correspond exactement ou commence par un terme de la allow list
|
||||
is_allowed = any(ent_text_clean == term or ent_text_clean.startswith(term) for term in allow_list_terms)
|
||||
# Vérifier si l'entité est dans l'allow-list (correspondance exacte)
|
||||
is_allowed = ent_text_normalized in allow_list_terms
|
||||
|
||||
if is_allowed:
|
||||
logger.info(f"✅ Entité '{ent_text}' ignorée (dans allow list)")
|
||||
|
||||
@@ -9,12 +9,8 @@ allow_list:
|
||||
- BCE
|
||||
- TVA
|
||||
- IEC
|
||||
- expert-comptable
|
||||
- prestataire
|
||||
# Termes financiers
|
||||
- Euro
|
||||
- EUR
|
||||
- Euros
|
||||
- Taux
|
||||
- Valeur
|
||||
- Prix
|
||||
|
||||
@@ -1,89 +1,85 @@
|
||||
# Configuration d'anonymisation complète
|
||||
# Configuration d'anonymisation
|
||||
anonymizer_config:
|
||||
default_anonymizers:
|
||||
# Entités génériques
|
||||
PERSON: replace
|
||||
LOCATION: replace
|
||||
ORGANIZATION: replace
|
||||
DATE: replace
|
||||
MONEY: replace
|
||||
EMAIL_ADDRESS: replace
|
||||
IBAN: replace
|
||||
IP_ADDRESS: replace
|
||||
|
||||
# PII Génériques - Données sensibles RGPD
|
||||
HEALTH_DATA: replace
|
||||
BIOMETRIC_DATA: replace
|
||||
SEXUAL_ORIENTATION: replace
|
||||
POLITICAL_OPINIONS: replace
|
||||
RGPD_FINANCIAL_DATA: replace
|
||||
|
||||
# PII Belges
|
||||
BE_ENTERPRISE_NUMBER: replace
|
||||
BE_NATIONAL_REGISTER_NUMBER: replace
|
||||
BE_PHONE_NUMBER: replace
|
||||
BE_ADDRESS: replace
|
||||
BE_ID_CARD: replace
|
||||
BE_PASSPORT: replace
|
||||
|
||||
# PII Françaises
|
||||
FR_SOCIAL_SECURITY_NUMBER: replace
|
||||
FR_SIRET: replace
|
||||
FR_ADDRESS: replace
|
||||
FR_TAX_ID: replace
|
||||
FR_BANK_ACCOUNT: replace
|
||||
FR_ID_CARD: replace
|
||||
FR_PASSPORT: replace
|
||||
FR_DRIVER_LICENSE: replace
|
||||
|
||||
# Business
|
||||
BE_PROFESSIONAL_ID: replace
|
||||
MARKET_SHARE: replace
|
||||
|
||||
replacements:
|
||||
# Entités génériques
|
||||
# ========================================
|
||||
# ENTITÉS PII (Personally Identifiable Information)
|
||||
# ========================================
|
||||
|
||||
# Données personnelles de base
|
||||
DATE: "[DATE]"
|
||||
DATE_TIME: "[DATE]"
|
||||
PERSONNE: "[PERSONNE]"
|
||||
PERSON: "[PERSONNE]"
|
||||
LOCATION: "[LIEU]"
|
||||
ORGANIZATION: "[ORGANISATION]"
|
||||
DATE: "[DATE]"
|
||||
MONEY: "[MONTANT]"
|
||||
EMAIL_ADDRESS: "[EMAIL]"
|
||||
EMAIL_ADDRESS: "[ADRESSE_EMAIL]"
|
||||
ADRESSE_EMAIL: "[ADRESSE_EMAIL]"
|
||||
PHONE_NUMBER: "[TELEPHONE]"
|
||||
CREDIT_CARD: "[CARTE_CREDIT]"
|
||||
IBAN: "[IBAN]"
|
||||
IP_ADDRESS: "[ADRESSE_IP]"
|
||||
|
||||
# PII Belges - AJOUTER CES LIGNES
|
||||
BE_ENTERPRISE_NUMBER: "[ENTREPRISE_BELGE]"
|
||||
BE_PHONE_NUMBER: "[TELEPHONE_BELGE]"
|
||||
BE_ADDRESS: "[ADRESSE_BELGE]"
|
||||
BE_ID_CARD: "[CARTE_ID_BELGE]"
|
||||
BE_PASSPORT: "[PASSEPORT_BELGE]"
|
||||
ADRESSE_IP: "[ADRESSE_IP]"
|
||||
|
||||
# PII Génériques - Données sensibles RGPD
|
||||
HEALTH_DATA: "[DONNEES_SANTE]"
|
||||
BIOMETRIC_DATA: "[DONNEES_BIOMETRIQUES]"
|
||||
SEXUAL_ORIENTATION: "[ORIENTATION_SEXUELLE]"
|
||||
POLITICAL_OPINIONS: "[OPINIONS_POLITIQUES]"
|
||||
RGPD_FINANCIAL_DATA: "[DONNEES_FINANCIERES]"
|
||||
# Adresses personnelles
|
||||
ADRESSE: "[ADRESSE]"
|
||||
ADRESSE_FRANCAISE: "[ADRESSE_FRANCAISE]"
|
||||
ADRESSE_BELGE: "[ADRESSE_BELGE]"
|
||||
|
||||
# PII Belges
|
||||
BE_ENTERPRISE_NUMBER: "[ENTREPRISE_BELGE]"
|
||||
BE_NATIONAL_REGISTER_NUMBER: "[NRN_BELGE]"
|
||||
BE_PHONE_NUMBER: "[TELEPHONE_BE]"
|
||||
BE_ADDRESS: "[ADRESSE_BELGE]"
|
||||
BE_ID_CARD: "[CARTE_ID_BE]"
|
||||
BE_PASSPORT: "[PASSEPORT_BE]"
|
||||
# Téléphones personnels
|
||||
TELEPHONE: "[TELEPHONE]"
|
||||
TELEPHONE_FRANCAIS: "[TELEPHONE_FRANCAIS]"
|
||||
TELEPHONE_BELGE: "[TELEPHONE_BELGE]"
|
||||
|
||||
# PII Françaises
|
||||
FR_SOCIAL_SECURITY_NUMBER: "[NUM_SECU_FR]"
|
||||
FR_SIRET: "[SIRET_FR]"
|
||||
FR_ADDRESS: "[ADRESSE_FR]"
|
||||
FR_TAX_ID: "[NUM_FISCAL_FR]"
|
||||
FR_BANK_ACCOUNT: "[COMPTE_BANCAIRE_FR]"
|
||||
FR_ID_CARD: "[CARTE_ID_FR]"
|
||||
FR_PASSPORT: "[PASSEPORT_FR]"
|
||||
FR_DRIVER_LICENSE: "[PERMIS_FR]"
|
||||
# Documents d'identité personnels
|
||||
NUMERO_SECURITE_SOCIALE_FRANCAIS: "[NUMERO_SECURITE_SOCIALE]"
|
||||
REGISTRE_NATIONAL_BELGE: "[REGISTRE_NATIONAL_BELGE]"
|
||||
CARTE_IDENTITE_FRANCAISE: "[CARTE_IDENTITE_FRANCAISE]"
|
||||
CARTE_IDENTITE_BELGE: "[CARTE_IDENTITE_BELGE]"
|
||||
PASSEPORT_FRANCAIS: "[PASSEPORT_FRANCAIS]"
|
||||
PASSEPORT_BELGE: "[PASSEPORT_BELGE]"
|
||||
PERMIS_CONDUIRE_FRANCAIS: "[PERMIS_CONDUIRE_FRANCAIS]"
|
||||
|
||||
# Business
|
||||
# Données financières personnelles
|
||||
COMPTE_BANCAIRE_FRANCAIS: "[COMPTE_BANCAIRE_FRANCAIS]"
|
||||
|
||||
BE_PROFESSIONAL_ID: "[ID_PROFESSIONNEL_BE]"
|
||||
# ========================================
|
||||
# ENTITÉS BUSINESS (Données d'entreprise)
|
||||
# ========================================
|
||||
|
||||
# Organisations et sociétés
|
||||
ORGANISATION: "[ORGANISATION]"
|
||||
ORGANIZATION: "[ORGANISATION]"
|
||||
SOCIETE_FRANCAISE: "[SOCIETE_FRANCAISE]"
|
||||
SOCIETE_BELGE: "[SOCIETE_BELGE]"
|
||||
|
||||
# Identifiants fiscaux et d'entreprise
|
||||
TVA_FRANCAISE: "[TVA_FRANCAISE]"
|
||||
TVA_BELGE: "[TVA_BELGE]"
|
||||
NUMERO_FISCAL_FRANCAIS: "[NUMERO_FISCAL_FRANCAIS]"
|
||||
SIRET_SIREN_FRANCAIS: "[SIRET_SIREN]"
|
||||
NUMERO_ENTREPRISE_BELGE: "[NUMERO_ENTREPRISE_BELGE]"
|
||||
|
||||
# Identifiants professionnels
|
||||
ID_PROFESSIONNEL_BELGE: "[ID_PROFESSIONNEL_BELGE]"
|
||||
|
||||
# ========================================
|
||||
# ENTITÉS MIXTES (PII + Business)
|
||||
# ========================================
|
||||
|
||||
# Données pouvant être personnelles ou professionnelles
|
||||
TITRE_CIVILITE: "[TITRE_CIVILITE]"
|
||||
DONNEES_PROFESSIONNELLES: "[DONNEES_PROFESSIONNELLES]"
|
||||
REFERENCE_CONTRAT: "[REFERENCE_CONTRAT]"
|
||||
IDENTIFIANT_PERSONNEL: "[IDENTIFIANT_PERSONNEL]"
|
||||
|
||||
# Données techniques et confidentielles
|
||||
LOCALISATION_GPS: "[LOCALISATION_GPS]"
|
||||
SECRET_COMMERCIAL: "[SECRET_COMMERCIAL]"
|
||||
CLE_API_SECRETE: "[CLE_API_SECRETE]"
|
||||
URL_IDENTIFIANT: "[URL_IDENTIFIANT]"
|
||||
DONNEES_BIOMETRIQUES: "[DONNEES_BIOMETRIQUES]"
|
||||
DONNEES_SANTE: "[DONNEES_SANTE]"
|
||||
ORIENTATION_SEXUELLE: "[ORIENTATION_SEXUELLE]"
|
||||
OPINIONS_POLITIQUES: "[OPINIONS_POLITIQUES]"
|
||||
MONTANT_FINANCIER: "[MONTANT_FINANCIER]"
|
||||
|
||||
DONNEES_FINANCIERES_RGPD: "[DONNEES_FINANCIERES_RGPD]"
|
||||
MARKET_SHARE: "[PART_DE_MARCHE]"
|
||||
|
||||
@@ -11,14 +11,18 @@ includes:
|
||||
# Configuration NLP (spaCy préservée)
|
||||
- nlp/spacy_config.yaml
|
||||
|
||||
# Recognizers PII par dossier (garder uniquement les dossiers récents)
|
||||
# Recognizers PII par dossier (ordre important : spécifiques avant génériques)
|
||||
- recognizers/PII/belgian/*
|
||||
- recognizers/PII/french/*
|
||||
- recognizers/PII/generic/*
|
||||
|
||||
# Recognizers Business par dossier
|
||||
# Recognizers Business par dossier (ordre important : spécifiques avant génériques)
|
||||
- recognizers/Business/belgian/*
|
||||
- recognizers/Business/french/*
|
||||
- recognizers/Business/generic/*
|
||||
|
||||
# Recognizers génériques communs (en dernier)
|
||||
- recognizers/generic/*
|
||||
|
||||
# Configuration d'anonymisation
|
||||
- anonymization/*
|
||||
|
||||
@@ -9,10 +9,10 @@ nlp_configuration:
|
||||
# Configuration NER globale (sans confidence_thresholds)
|
||||
ner_model_configuration:
|
||||
model_to_presidio_entity_mapping:
|
||||
PER: PERSON
|
||||
PERSON: PERSON
|
||||
ORG: ORGANIZATION
|
||||
ORGANIZATION: ORGANIZATION
|
||||
PER: PERSONNE
|
||||
PERSON: PERSONNE
|
||||
ORG: ORGANISATION
|
||||
ORGANIZATION: ORGANISATION
|
||||
LOC: LOCATION
|
||||
LOCATION: LOCATION
|
||||
DATE: DATE
|
||||
|
||||
34
conf/recognizers/Business/belgian/company_forms.yaml
Normal file
34
conf/recognizers/Business/belgian/company_forms.yaml
Normal file
@@ -0,0 +1,34 @@
|
||||
# Recognizer pour formes juridiques belges
|
||||
recognizer_registry:
|
||||
recognizers:
|
||||
- name: BelgianCompanyFormsRecognizer
|
||||
supported_language: fr
|
||||
supported_entity: SOCIETE_BELGE
|
||||
patterns:
|
||||
- name: SRL avec nom
|
||||
regex: "\\b(?:SRL|Srl)\\s+[A-ZÀ-Ÿ][a-zà-ÿ\\s&'-]+\\b"
|
||||
score: 0.95
|
||||
- name: SA avec nom
|
||||
regex: "\\b(?:SA|Sa)\\s+[A-ZÀ-Ÿ][a-zà-ÿ\\s&'-]+\\b"
|
||||
score: 0.95
|
||||
- name: ASBL avec nom
|
||||
regex: "\\b(?:ASBL|Asbl)\\s+[A-ZÀ-Ÿ][a-zà-ÿ\\s&'-]+\\b"
|
||||
score: 0.95
|
||||
- name: SC avec nom
|
||||
regex: "\\b(?:SC|Sc)\\s+[A-ZÀ-Ÿ][a-zà-ÿ\\s&'-]+\\b"
|
||||
score: 0.9
|
||||
- name: SNC avec nom
|
||||
regex: "\\b(?:SNC|Snc)\\s+[A-ZÀ-Ÿ][a-zà-ÿ\\s&'-]+\\b"
|
||||
score: 0.9
|
||||
- name: SComm avec nom
|
||||
regex: "\\b(?:SComm|Scomm)\\s+[A-ZÀ-Ÿ][a-zà-ÿ\\s&'-]+\\b"
|
||||
score: 0.9
|
||||
context:
|
||||
[
|
||||
"société",
|
||||
"entreprise",
|
||||
"forme juridique",
|
||||
"statut",
|
||||
"commercial",
|
||||
"association",
|
||||
]
|
||||
@@ -3,7 +3,7 @@ recognizer_registry:
|
||||
recognizers:
|
||||
- name: BelgianEnterpriseRecognizer
|
||||
supported_language: fr
|
||||
supported_entity: BE_ENTERPRISE_NUMBER
|
||||
supported_entity: NUMERO_ENTREPRISE_BELGE
|
||||
patterns:
|
||||
- name: Numéro BCE avec deux points
|
||||
regex: "(?<=\\bBCE\\s*:\\s*)((BE)?\\s?0\\d{3}[\\.\\s]?\\d{3}[\\.\\s]?\\d{3})\\b"
|
||||
@@ -20,5 +20,4 @@ recognizer_registry:
|
||||
- name: Numéro patronal
|
||||
regex: "\\b(?:numéro\\s+)?patronal\\s*:?\\s*\\d{7}\\b"
|
||||
score: 0.9
|
||||
context:
|
||||
["TVA", "intracommunautaire", "ONSS", "entreprise", "patronal"]
|
||||
context: ["TVA", "intracommunautaire", "ONSS", "entreprise", "patronal"]
|
||||
|
||||
@@ -3,7 +3,7 @@ recognizer_registry:
|
||||
recognizers:
|
||||
- name: SmartOrganizationRecognizer
|
||||
supported_language: fr
|
||||
supported_entity: ORGANIZATION
|
||||
supported_entity: ORGANISATION
|
||||
patterns:
|
||||
# Noms avec suffixes typiques d'entreprise
|
||||
- name: Noms entreprise avec suffixes
|
||||
|
||||
@@ -3,7 +3,7 @@ recognizer_registry:
|
||||
recognizers:
|
||||
- name: BelgianProfessionalIdRecognizer
|
||||
supported_language: fr
|
||||
supported_entity: BE_PROFESSIONAL_ID
|
||||
supported_entity: ID_PROFESSIONNEL_BELGE
|
||||
patterns:
|
||||
- name: Numéro IEC avec deux points
|
||||
regex: "(?<=\\bIEC\\s*:\\s*)\\d{6}\\b"
|
||||
@@ -17,4 +17,4 @@ recognizer_registry:
|
||||
- name: Numéro de médecin
|
||||
regex: "\\b(?:Dr\\.|médecin)\\s*n°\\s*\\d{5,7}\\b"
|
||||
score: 0.85
|
||||
context: ["expert-comptable", "IEC", "avocat", "médecin", "professionnel"]
|
||||
context: ["IEC", "avocat", "médecin", "professionnel"]
|
||||
|
||||
23
conf/recognizers/Business/french/company_forms.yaml
Normal file
23
conf/recognizers/Business/french/company_forms.yaml
Normal file
@@ -0,0 +1,23 @@
|
||||
# Recognizer pour formes juridiques françaises
|
||||
recognizer_registry:
|
||||
recognizers:
|
||||
- name: FrenchCompanyFormsRecognizer
|
||||
supported_language: fr
|
||||
supported_entity: SOCIETE_FRANCAISE
|
||||
patterns:
|
||||
- name: SARL avec nom
|
||||
regex: "\\b(?:SARL|Sarl)\\s+[A-ZÀ-Ÿ][a-zà-ÿ\\s&'-]+\\b"
|
||||
score: 0.95
|
||||
- name: SAS avec nom
|
||||
regex: "\\b(?:SAS|Sas)\\s+[A-ZÀ-Ÿ][a-zà-ÿ\\s&'-]+\\b"
|
||||
score: 0.95
|
||||
- name: SA avec nom
|
||||
regex: "\\b(?:SA|Sa)\\s+[A-ZÀ-Ÿ][a-zà-ÿ\\s&'-]+\\b"
|
||||
score: 0.9
|
||||
- name: EURL avec nom
|
||||
regex: "\\b(?:EURL|Eurl)\\s+[A-ZÀ-Ÿ][a-zà-ÿ\\s&'-]+\\b"
|
||||
score: 0.95
|
||||
- name: SCI avec nom
|
||||
regex: "\\b(?:SCI|Sci)\\s+[A-ZÀ-Ÿ][a-zà-ÿ\\s&'-]+\\b"
|
||||
score: 0.9
|
||||
context: ["société", "entreprise", "forme juridique", "statut", "commercial"]
|
||||
@@ -3,7 +3,7 @@ recognizer_registry:
|
||||
recognizers:
|
||||
- name: FrenchSIRETRecognizer
|
||||
supported_language: fr
|
||||
supported_entity: FR_SIRET
|
||||
supported_entity: SIRET_SIREN_FRANCAIS
|
||||
patterns:
|
||||
- name: SIRET complet
|
||||
regex: "\\b[0-9]{3}\\s?[0-9]{3}\\s?[0-9]{3}\\s?[0-9]{5}\\b"
|
||||
|
||||
20
conf/recognizers/Business/generic/api_secrets.yaml
Normal file
20
conf/recognizers/Business/generic/api_secrets.yaml
Normal file
@@ -0,0 +1,20 @@
|
||||
# Recognizer pour clés API et secrets techniques
|
||||
recognizer_registry:
|
||||
recognizers:
|
||||
- name: APISecretsRecognizer
|
||||
supported_language: fr
|
||||
supported_entity: CLE_API_SECRETE
|
||||
patterns:
|
||||
- name: Clé API générique
|
||||
regex: "\\b(?:API[_\\s]?KEY|api[_\\s]?key)\\s*[=:]\\s*[A-Za-z0-9\\-_]{16,64}\\b"
|
||||
score: 1.0
|
||||
- name: Token d'accès
|
||||
regex: "\\b(?:access[_\\s]?token|token)\\s*[=:]\\s*[A-Za-z0-9\\-_\\.]{20,128}\\b"
|
||||
score: 0.95
|
||||
- name: Secret AWS
|
||||
regex: "\\b(?:AWS[_\\s]?SECRET|aws[_\\s]?secret)\\s*[=:]\\s*[A-Za-z0-9/+=]{40}\\b"
|
||||
score: 1.0
|
||||
- name: Clé privée
|
||||
regex: "\\b(?:private[_\\s]?key|secret[_\\s]?key)\\s*[=:]\\s*[A-Za-z0-9\\-_]{16,64}\\b"
|
||||
score: 0.95
|
||||
context: ["API", "clé", "secret", "token", "authentification", "accès"]
|
||||
20
conf/recognizers/Business/generic/contracts_references.yaml
Normal file
20
conf/recognizers/Business/generic/contracts_references.yaml
Normal file
@@ -0,0 +1,20 @@
|
||||
# Recognizer pour contrats et références internes
|
||||
recognizer_registry:
|
||||
recognizers:
|
||||
- name: ContractReferenceRecognizer
|
||||
supported_language: fr
|
||||
supported_entity: REFERENCE_CONTRAT
|
||||
patterns:
|
||||
- name: Numéro de contrat
|
||||
regex: "\\b(?:contrat|contract)\\s*n?°?\\s*:?\\s*[A-Z0-9\\-/]{4,15}\\b"
|
||||
score: 0.95
|
||||
- name: Référence interne
|
||||
regex: "\\b(?:ref|référence|dossier)\\s*:?\\s*[A-Z]{2,4}[\\-/]?[0-9]{4,8}\\b"
|
||||
score: 0.9
|
||||
- name: ID transaction
|
||||
regex: "\\b(?:transaction|trans)\\s*ID\\s*:?\\s*[A-Z0-9]{6,12}\\b"
|
||||
score: 0.95
|
||||
- name: Numéro de facture
|
||||
regex: "\\b(?:facture|invoice)\\s*n?°?\\s*:?\\s*[A-Z0-9\\-/]{4,12}\\b"
|
||||
score: 0.9
|
||||
context: ["contrat", "référence", "dossier", "facture", "transaction", "commande"]
|
||||
20
conf/recognizers/Business/generic/employee_client_ids.yaml
Normal file
20
conf/recognizers/Business/generic/employee_client_ids.yaml
Normal file
@@ -0,0 +1,20 @@
|
||||
# Recognizer pour identifiants employés et clients
|
||||
recognizer_registry:
|
||||
recognizers:
|
||||
- name: EmployeeClientIDRecognizer
|
||||
supported_language: fr
|
||||
supported_entity: ID_PROFESSIONNEL_BELGE
|
||||
patterns:
|
||||
- name: Matricule employé
|
||||
regex: "\\b(?:matricule|employee|emp)\\s*:?\\s*[A-Z0-9]{4,10}\\b"
|
||||
score: 0.95
|
||||
- name: ID client
|
||||
regex: "\\b(?:client|customer)\\s*ID\\s*:?\\s*[A-Z0-9]{4,12}\\b"
|
||||
score: 0.95
|
||||
- name: Code utilisateur
|
||||
regex: "\\b(?:user|utilisateur)\\s*:?\\s*[a-z]+\\.[a-z]+\\b"
|
||||
score: 0.9
|
||||
- name: Identifiant RH
|
||||
regex: "\\b(?:RH|HR)[\\-/]?[0-9]{4,8}\\b"
|
||||
score: 0.85
|
||||
context: ["matricule", "employé", "client", "utilisateur", "ID", "identifiant"]
|
||||
29
conf/recognizers/Business/generic/financial_amounts.yaml
Normal file
29
conf/recognizers/Business/generic/financial_amounts.yaml
Normal file
@@ -0,0 +1,29 @@
|
||||
# Recognizer for financial amounts and currencies
recognizer_registry:
  recognizers:
    - name: FinancialAmountRecognizer
      supported_language: fr
      supported_entity: MONTANT_FINANCIER
      patterns:
        # No trailing \b after a currency symbol: "€" and "$" are non-word
        # characters, so a word boundary after them only exists when a
        # letter or digit follows immediately. With the trailing \b,
        # "1 500 €" followed by a space or the end of the text never matched.
        - name: Montant avec devise EUR
          regex: "\\b(?:[0-9]{1,3}(?:[\\s.,][0-9]{3})*|[0-9]+)(?:[.,][0-9]{1,2})?\\s*€"
          score: 0.95
        - name: Montant avec devise USD
          regex: "\\b(?:[0-9]{1,3}(?:[\\s.,][0-9]{3})*|[0-9]+)(?:[.,][0-9]{1,2})?\\s*(?:USD\\b|\\$)"
          score: 0.95
        # NOTE(review): this regex was an empty string, which can never
        # produce a usable match. Proposed pattern for amounts such as
        # "150k€", "2,5 M€" or "3 M EUR" - confirm the intended formats.
        - name: Montant abrégé avec K/M
          regex: "\\b[0-9]+(?:[.,][0-9]{1,2})?\\s?[kKM]\\s?(?:€|EUR\\b|euros?\\b)"
          score: 0.9
        # NOTE(review): this regex was an empty string as well. Proposed
        # pattern for phrases such as "salaire annuel brut de 45 000 €" or
        # "rémunération annuelle : 52000 EUR" - confirm the intended formats.
        - name: Salaire annuel
          regex: "\\b(?:salaire|rémunération)\\s+annuel(?:le)?(?:\\s+brute?)?\\s*(?:de\\s+|:\\s*)?(?:[0-9]{1,3}(?:[\\s.,][0-9]{3})*|[0-9]+)(?:[.,][0-9]{1,2})?\\s*(?:€|EUR\\b|euros?\\b)"
          score: 0.95
      context:
        [
          "montant",
          "prix",
          "coût",
          "budget",
          "salaire",
          "rémunération",
          "facture",
        ]
|
||||
@@ -5,30 +5,52 @@ recognizer_registry:
|
||||
supported_language: fr
|
||||
supported_entity: MARKET_SHARE
|
||||
patterns:
|
||||
# Pourcentages de marché
|
||||
# Pourcentages simples (nouveau pattern plus permissif)
|
||||
- name: Simple Percentage
  # No trailing \b: "%" is a non-word character, so a word boundary after
  # it only exists when a letter or digit follows immediately. With the
  # trailing \b, "25% du marché" or a percentage at the end of the text
  # never matched.
  regex: "\\b\\d{1,2}(?:[,.]\\d{1,2})?%"
  score: 0.7
|
||||
|
||||
# Part de marché explicite
|
||||
- name: Explicit Market Share
|
||||
regex: "\\b(?:part\\s+de\\s+marché|parts?\\s+de\\s+marché)\\b"
|
||||
score: 0.9
|
||||
|
||||
# Pourcentages de marché avec contexte
|
||||
- name: Market Share Percentage
|
||||
regex: "\\b(?:détient|possède|contrôle|représente)?\\s*(?:environ\\s+)?(?:\\d{1,2}(?:[,.]\\d{1,2})?%)\\s*(?:de\\s+(?:part\\s+de\\s+)?marché|du\\s+marché|de\\s+parts?)\\b"
|
||||
score: 0.9
|
||||
|
||||
|
||||
# Positions de marché
|
||||
- name: Market Position
|
||||
regex: "\\b(?:leader|numéro\\s+\\d+|\\d+(?:er|ème)\\s+acteur|position\\s+dominante|monopole)\\s+(?:du\\s+)?(?:marché|secteur)\\b"
|
||||
score: 0.85
|
||||
|
||||
|
||||
# Parts relatives
|
||||
- name: Relative Market Share
|
||||
regex: "\\b(?:majoritaire|minoritaire|principale|significative)\\s+(?:part\\s+de\\s+)?marché\\b"
|
||||
score: 0.8
|
||||
|
||||
|
||||
# Données de concentration
|
||||
- name: Market Concentration
|
||||
regex: "\\b(?:concentration|consolidation|fusion)\\s+(?:du\\s+)?marché\\b"
|
||||
score: 0.75
|
||||
|
||||
|
||||
# Chiffres d'affaires relatifs
|
||||
- name: Revenue Share
|
||||
regex: "\\b(?:\\d{1,2}(?:[,.]\\d{1,2})?%)\\s*(?:du\\s+)?(?:chiffre\\s+d'affaires|CA|revenus?)\\s+(?:du\\s+)?(?:marché|secteur)\\b"
|
||||
score: 0.85
|
||||
|
||||
|
||||
context:
|
||||
["part de marché", "position concurrentielle", "leader", "concurrent", "secteur", "industrie", "chiffre d'affaires", "revenus", "concentration", "monopole", "oligopole"]
|
||||
[
|
||||
"part de marché",
|
||||
"position concurrentielle",
|
||||
"leader",
|
||||
"concurrent",
|
||||
"secteur",
|
||||
"industrie",
|
||||
"chiffre d'affaires",
|
||||
"revenus",
|
||||
"concentration",
|
||||
"monopole",
|
||||
"oligopole",
|
||||
]
|
||||
|
||||
71
conf/recognizers/Business/generic/professional_data.yaml
Normal file
71
conf/recognizers/Business/generic/professional_data.yaml
Normal file
@@ -0,0 +1,71 @@
|
||||
# Recognizer pour données professionnelles génériques (France/Belgique)
|
||||
recognizer_registry:
|
||||
recognizers:
|
||||
# Recognizer pour titres de civilité
|
||||
- name: GenericCivilityTitleRecognizer
|
||||
supported_language: fr
|
||||
supported_entity: TITRE_CIVILITE
|
||||
patterns:
|
||||
- name: Titres de civilité
|
||||
regex: "\\b(?:M\\.|Mme|Mlle|Dr\\.|Pr\\.|Prof\\.|Docteur|Professeur|Maître|Me\\.)(?=\\s+[A-ZÀ-Ÿ])"
|
||||
score: 0.9
|
||||
- name: Titres honorifiques
|
||||
regex: "\\b(?:Monsieur|Madame|Mademoiselle)(?=\\s+[A-ZÀ-Ÿ])"
|
||||
score: 0.85
|
||||
context: ["identité", "titre", "civilité"]
|
||||
|
||||
# Recognizer pour données professionnelles générales
|
||||
- name: GenericProfessionalDataRecognizer
|
||||
supported_language: fr
|
||||
supported_entity: DONNEES_PROFESSIONNELLES
|
||||
patterns:
|
||||
- name: Titre de poste
|
||||
regex: "\\b(?:directeur|directrice|manager|responsable|chef|ingénieur|ingénieure|consultant|consultante)\\s+[a-zà-ÿ\\s]+\\b"
|
||||
score: 0.8
|
||||
- name: Département
|
||||
regex: "\\b(?:département|service|division)\\s+[A-ZÀ-Ÿ][a-zà-ÿ\\s]+\\b"
|
||||
score: 0.75
|
||||
- name: Adresse professionnelle
|
||||
regex: "\\b(?:siège\\s+social|adresse\\s+professionnelle)\\s*:?\\s*[0-9]{1,4}\\s+[A-ZÀ-Ÿ][a-zà-ÿ\\s'-]+\\b"
|
||||
score: 0.9
|
||||
- name: Email professionnel
|
||||
regex: "\\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}\\b"
|
||||
score: 0.85
|
||||
- name: Numéro IEC
|
||||
regex: "\\b(?:n°\\s*IEC|numéro\\s*IEC|IEC)\\s*:?\\s*([0-9]{6,8})\\b"
|
||||
score: 0.9
|
||||
- name: Avocat
|
||||
regex: "\\b(?:avocat|avocate)\\b"
|
||||
score: 0.9
|
||||
- name: Expert-comptable
|
||||
regex: "\\b(?:expert-comptable|expert\\s+comptable)\\b"
|
||||
score: 0.99
|
||||
- name: Notaire
|
||||
regex: "\\b(?:notaire)\\b"
|
||||
score: 0.95
|
||||
- name: Médecin
|
||||
regex: "\\b(?:médecin|docteur\\s+en\\s+médecine)\\b"
|
||||
score: 0.95
|
||||
# Données spécifiques belges intégrées
|
||||
- name: Numéro ONSS employeur
|
||||
regex: "\\b(?:ONSS|onss)\\s*:?\\s*[0-9]{7}\\b"
|
||||
score: 0.95
|
||||
- name: Numéro patronal
|
||||
regex: "\\b(?:numéro\\s+)?patronal\\s*:?\\s*[0-9]{7}\\b"
|
||||
score: 0.9
|
||||
context:
|
||||
[
|
||||
"professionnel",
|
||||
"travail",
|
||||
"bureau",
|
||||
"entreprise",
|
||||
"poste",
|
||||
"fonction",
|
||||
"réglementé",
|
||||
"ordre",
|
||||
"diplôme",
|
||||
"ONSS",
|
||||
"patronal",
|
||||
"employeur",
|
||||
"siège social"
|
||||
]
|
||||
20
conf/recognizers/Business/generic/trade_secrets.yaml
Normal file
20
conf/recognizers/Business/generic/trade_secrets.yaml
Normal file
@@ -0,0 +1,20 @@
|
||||
# Recognizer pour secrets d'affaires et projets
|
||||
recognizer_registry:
|
||||
recognizers:
|
||||
- name: TradeSecretsRecognizer
|
||||
supported_language: fr
|
||||
supported_entity: SECRET_COMMERCIAL
|
||||
patterns:
|
||||
- name: Nom de projet interne
|
||||
regex: "\\b(?:projet|project)\\s+[A-Z][a-zA-Z]{3,15}\\b"
|
||||
score: 0.85
|
||||
- name: Code projet
|
||||
regex: "\\b(?:projet|project)\\s*:?\\s*[A-Z]{2,4}[\\-/]?[0-9]{2,4}\\b"
|
||||
score: 0.9
|
||||
- name: Plan stratégique
|
||||
regex: "\\b(?:plan|stratégie)\\s+(?:stratégique|business)\\s+[0-9]{4}\\b"
|
||||
score: 0.9
|
||||
- name: Formule interne
|
||||
regex: "\\b(?:formule|recette|procédé)\\s+[A-Z][\\-0-9A-Z]{2,10}\\b"
|
||||
score: 0.85
|
||||
context: ["projet", "stratégique", "confidentiel", "interne", "secret", "propriétaire"]
|
||||
@@ -3,7 +3,7 @@ recognizer_registry:
|
||||
recognizers:
|
||||
- name: BelgianAddressRecognizer
|
||||
supported_language: fr
|
||||
supported_entity: BE_ADDRESS
|
||||
supported_entity: ADRESSE_BELGE
|
||||
patterns:
|
||||
# Pattern principal : numéro + rue + code postal + ville (SANS contexte)
|
||||
- name: Adresse complète avec numéro devant
|
||||
|
||||
@@ -3,7 +3,7 @@ recognizer_registry:
|
||||
recognizers:
|
||||
- name: BelgianIDCardRecognizer
|
||||
supported_language: fr
|
||||
supported_entity: BE_ID_CARD
|
||||
supported_entity: CARTE_IDENTITE_BELGE
|
||||
patterns:
|
||||
- name: Carte d'identité belge
|
||||
regex: "\\b[0-9]{3}\\-[0-9]{7}\\-[0-9]{2}\\b"
|
||||
@@ -15,7 +15,7 @@ recognizer_registry:
|
||||
|
||||
- name: BelgianPassportRecognizer
|
||||
supported_language: fr
|
||||
supported_entity: BE_PASSPORT
|
||||
supported_entity: PASSEPORT_BELGE
|
||||
patterns:
|
||||
- name: Passeport belge
|
||||
regex: "\\b[A-Z]{2}[0-9]{6}\\b"
|
||||
|
||||
@@ -3,7 +3,7 @@ recognizer_registry:
|
||||
recognizers:
|
||||
- name: BelgianNRNRecognizer
|
||||
supported_language: fr
|
||||
supported_entity: BE_NATIONAL_REGISTER_NUMBER
|
||||
supported_entity: REGISTRE_NATIONAL_BELGE
|
||||
patterns:
|
||||
- name: NRN avec points et tiret
|
||||
regex: "\\b[0-9]{2}\\.[0-9]{2}\\.[0-9]{2}-[0-9]{3}\\.[0-9]{2}\\b"
|
||||
|
||||
@@ -3,7 +3,7 @@ recognizer_registry:
|
||||
recognizers:
|
||||
- name: BelgianPhoneRecognizer
|
||||
supported_language: fr
|
||||
supported_entity: BE_PHONE_NUMBER
|
||||
supported_entity: TELEPHONE_BELGE
|
||||
patterns:
|
||||
# Patterns avec contexte Tel: et Tél:
|
||||
- name: Téléphone fixe avec contexte Tel
|
||||
@@ -20,7 +20,7 @@ recognizer_registry:
|
||||
score: 0.99
|
||||
# Patterns généraux (sans contexte spécifique)
|
||||
- name: Téléphone fixe belge
|
||||
regex: '(?<!BCE\s*:?\s*)\b0[1-9](?:[./\s]?\d{2,3}){3}(?=\b|\)|$|[.,;])(?!.*BCE)'
|
||||
regex: '(?<!BCE\s*:?\s*)(?<!FR[0-9]{2}\s[0-9]{4}\s[0-9]{4}\s)\b0[1-9](?:[./\s]?\d{2,3}){3}(?=\b|\)|$|[.,;])(?!.*BCE)(?!\s[0-9A-Z]{4}\s[0-9A-Z]{3})'
|
||||
score: 0.95
|
||||
- name: Mobile belge
|
||||
regex: '\b04[0-9]{2}[./\s]?[0-9]{2}[./\s]?[0-9]{2}[./\s]?[0-9]{2}(?=\b|\)|$|[.,;])'
|
||||
|
||||
20
conf/recognizers/PII/belgian/vat_numbers.yaml
Normal file
20
conf/recognizers/PII/belgian/vat_numbers.yaml
Normal file
@@ -0,0 +1,20 @@
|
||||
# Recognizer pour numéros de TVA belges
|
||||
recognizer_registry:
|
||||
recognizers:
|
||||
- name: BelgianVATRecognizer
|
||||
supported_language: fr
|
||||
supported_entity: TVA_BELGE
|
||||
patterns:
|
||||
- name: TVA belge format standard
|
||||
regex: "\\bBE[0-9]{4}\\.[0-9]{3}\\.[0-9]{3}\\b"
|
||||
score: 1.0
|
||||
- name: TVA belge compact
|
||||
regex: "\\bBE[0-9]{10}\\b"
|
||||
score: 0.95
|
||||
- name: TVA avec contexte
|
||||
regex: "\\b(?:TVA|tva)\\s*:?\\s*BE[0-9]{4}\\.[0-9]{3}\\.[0-9]{3}\\b"
|
||||
score: 1.0
|
||||
- name: Numéro d'entreprise BCE
|
||||
regex: "\\b(?:BCE|bce)\\s*:?\\s*BE[0-9]{4}\\.[0-9]{3}\\.[0-9]{3}\\b"
|
||||
score: 1.0
|
||||
context: ["TVA", "BCE", "numéro d'entreprise", "identification", "intracommunautaire"]
|
||||
@@ -3,12 +3,18 @@ recognizer_registry:
|
||||
recognizers:
|
||||
- name: FrenchAddressRecognizer
|
||||
supported_language: fr
|
||||
supported_entity: FR_ADDRESS
|
||||
supported_entity: ADRESSE_FRANCAISE
|
||||
patterns:
|
||||
- name: Adresse française complète
|
||||
regex: "\\b\\d{1,4}(?:bis|ter|quater)?\\s+(?:rue|avenue|boulevard|place|impasse|allée|chemin|route)\\s+[A-Za-zà-ÿ\\s'-]+,\\s*[0-9]{5}\\s+[A-Za-zà-ÿ\\s'-]+\\b"
|
||||
- name: Adresse française complète avec virgule
|
||||
regex: "\\b\\d{1,4}(?:bis|ter|quater)?\\s+(?:rue|avenue|boulevard|place|impasse|allée|chemin|route|square|villa|cité|passage|quai|cours|esplanade)\\s+[A-Za-zà-ÿ\\s'-]+?,\\s*[0-9]{5}\\s+[A-Za-zà-ÿ'-]+(?=\\s*$|\\s*\\n|\\s*-)"
|
||||
score: 0.95
|
||||
- name: Code postal français
|
||||
regex: "\\b[0-9]{5}\\b"
|
||||
- name: Adresse française sans virgule
|
||||
regex: "\\b\\d{1,4}(?:bis|ter|quater)?\\s+(?:rue|avenue|boulevard|place|impasse|allée|chemin|route|square|villa|cité|passage|quai|cours|esplanade)\\s+[A-Za-zà-ÿ\\s'-]+?\\s+[0-9]{5}\\s+[A-Za-zà-ÿ'-]+(?=\\s*$|\\s*\\n|\\s*-)"
|
||||
score: 0.9
|
||||
- name: Numéro et type de voie
|
||||
regex: "\\b\\d{1,4}(?:bis|ter|quater)?\\s+(?:rue|avenue|boulevard|place|impasse|allée|chemin|route|square|villa|cité|passage|quai|cours|esplanade)\\s+[A-Za-zà-ÿ\\s'-]{3,30}(?=\\s*[-,]|\\s*$|\\s+[0-9]{5}|\\s*\\n)"
|
||||
score: 0.65
|
||||
- name: Code postal français (isolé)
|
||||
regex: "(?<!\\d{1,4}(?:bis|ter|quater)?\\s+(?:rue|avenue|boulevard|place|impasse|allée|chemin|route|square|villa|cité|passage|quai|cours|esplanade)\\s+[A-Za-zà-ÿ\\s'-]+?,\\s*)\\b[0-9]{5}\\b(?!\\s+[A-Za-zà-ÿ'-]+)"
|
||||
score: 0.6
|
||||
context: ["adresse", "domicile", "résidence", "siège social"]
|
||||
|
||||
@@ -3,12 +3,18 @@ recognizer_registry:
|
||||
recognizers:
|
||||
- name: FrenchIDCardRecognizer
|
||||
supported_language: fr
|
||||
supported_entity: FR_ID_CARD
|
||||
supported_entity: CARTE_IDENTITE_FRANCAISE
|
||||
patterns:
|
||||
- name: Numéro CNI nouveau format
|
||||
- name: Numéro CNI nouveau format (9 chiffres)
|
||||
regex: "\\b[0-9]{9}\\b"
|
||||
score: 0.98
|
||||
- name: Numéro CNI ancien format (12 chiffres)
|
||||
regex: "\\b[0-9]{12}\\b"
|
||||
score: 0.85
|
||||
- name: Numéro CNI avec espaces
|
||||
score: 0.95
|
||||
- name: Numéro CNI avec espaces (nouveau)
|
||||
regex: "\\b[0-9]{3}\\s[0-9]{3}\\s[0-9]{3}\\b"
|
||||
score: 0.99
|
||||
- name: Numéro CNI avec espaces (ancien)
|
||||
regex: "\\b[0-9]{4}\\s[0-9]{4}\\s[0-9]{4}\\b"
|
||||
score: 0.9
|
||||
context:
|
||||
@@ -16,7 +22,7 @@ recognizer_registry:
|
||||
|
||||
- name: FrenchPassportRecognizer
|
||||
supported_language: fr
|
||||
supported_entity: FR_PASSPORT
|
||||
supported_entity: PASSEPORT_FRANCAIS
|
||||
patterns:
|
||||
- name: Numéro de passeport français
|
||||
regex: "\\b[0-9]{2}[A-Z]{2}[0-9]{5}\\b"
|
||||
@@ -28,7 +34,7 @@ recognizer_registry:
|
||||
|
||||
- name: FrenchDriverLicenseRecognizer
|
||||
supported_language: fr
|
||||
supported_entity: FR_DRIVER_LICENSE
|
||||
supported_entity: PERMIS_CONDUIRE_FRANCAIS
|
||||
patterns:
|
||||
- name: Permis de conduire français
|
||||
regex: "\\b[0-9]{12}\\b"
|
||||
|
||||
@@ -3,7 +3,7 @@ recognizer_registry:
|
||||
recognizers:
|
||||
- name: FrenchTaxIDRecognizer
|
||||
supported_language: fr
|
||||
supported_entity: FR_TAX_ID
|
||||
supported_entity: NUMERO_FISCAL_FRANCAIS
|
||||
patterns:
|
||||
- name: Numéro fiscal français
|
||||
regex: "\\b[0-9]{13}\\b"
|
||||
@@ -15,7 +15,7 @@ recognizer_registry:
|
||||
|
||||
- name: FrenchBankAccountRecognizer
|
||||
supported_language: fr
|
||||
supported_entity: FR_BANK_ACCOUNT
|
||||
supported_entity: COMPTE_BANCAIRE_FRANCAIS
|
||||
patterns:
|
||||
- name: RIB français
|
||||
regex: "\\b[0-9]{5}\\s[0-9]{5}\\s[0-9]{11}\\s[0-9]{2}\\b"
|
||||
|
||||
20
conf/recognizers/PII/french/phone_numbers.yaml
Normal file
20
conf/recognizers/PII/french/phone_numbers.yaml
Normal file
@@ -0,0 +1,20 @@
|
||||
# Recognizer for French phone numbers.
recognizer_registry:
  recognizers:
    - name: FrenchPhoneRecognizer
      supported_language: fr
      supported_entity: TELEPHONE_FRANCAIS
      patterns:
        # The "+33" alternative deliberately has no leading \b: "+" is a
        # non-word character, so a \b in front of it only holds when a word
        # character precedes the "+". That rejected " +33 6 ..." and any
        # number at the very start of the text. The digit-only prefixes keep
        # their word boundary.
        - name: Téléphone français avec indicatif international
          regex: "(?:\\+33|\\b0033|\\b33)\\s?0?[1-9](?:[\\s\\-\\.]?[0-9]{2}){4}\\b"
          score: 1.0
        # NOTE(review): (?<!\s) rejects every number directly preceded by
        # whitespace, which also makes the street-type lookbehinds that follow
        # it redundant. Left unchanged; confirm this is intended.
        - name: Téléphone français format standard
          regex: "\\b(?<!\\s)(?<!boulevard\\s)(?<!avenue\\s)(?<!rue\\s)(?<!place\\s)0[1-9](?:[\\s\\-\\.]?[0-9]{2}){4}(?![0-9])(?!\\s[A-Za-z])(?!\\s[A-Z][a-z])\\b"
          score: 0.85
        # Same "+33" fix as above: the word boundary is applied only to the
        # alternatives that start with a digit.
        - name: Mobile français
          regex: "(?<!boulevard\\s)(?<!avenue\\s)(?<!rue\\s)(?<!place\\s)(?:(?:\\+33|\\b0033|\\b33)\\s?0?[67]|\\b0[67])(?:[\\s\\-\\.]?[0-9]{2}){4}(?![0-9])(?!\\s[A-Za-z])(?!\\s[A-Z][a-z])\\b"
          score: 0.95
        # No leading \b here either: "(" is a non-word character, so \b before
        # it required a letter or digit glued to the parenthesis.
        - name: Téléphone avec parenthèses
          regex: "\\(0[1-9]\\)(?:[\\s\\-\\.]?[0-9]{2}){4}\\b"
          score: 0.9
      context: ["téléphone", "tél", "mobile", "portable", "fixe", "numéro"]
|
||||
@@ -3,7 +3,7 @@ recognizer_registry:
|
||||
recognizers:
|
||||
- name: FrenchINSEERecognizer
|
||||
supported_language: fr
|
||||
supported_entity: FR_SOCIAL_SECURITY_NUMBER
|
||||
supported_entity: NUMERO_SECURITE_SOCIALE_FRANCAIS
|
||||
patterns:
|
||||
- name: INSEE complet avec espaces
|
||||
regex: "\\b[12]\\s*[0-9]{2}\\s*(?:0[1-9]|1[0-2])\\s*(?:2[ABab]|[0-9]{2})\\s*[0-9]{3}\\s*[0-9]{3}[\\s]?[0-9]{2}\\b"
|
||||
|
||||
17
conf/recognizers/PII/french/vat_numbers.yaml
Normal file
17
conf/recognizers/PII/french/vat_numbers.yaml
Normal file
@@ -0,0 +1,17 @@
|
||||
# Recognizer pour numéros de TVA français
|
||||
recognizer_registry:
|
||||
recognizers:
|
||||
- name: FrenchVATRecognizer
|
||||
supported_language: fr
|
||||
supported_entity: TVA_FRANCAISE
|
||||
patterns:
|
||||
- name: TVA intracommunautaire française
|
||||
regex: "\\bFR[0-9A-Z]{2}[0-9]{9}\\b"
|
||||
score: 1.0
|
||||
- name: TVA avec espaces
|
||||
regex: "\\bFR\\s[0-9A-Z]{2}\\s[0-9]{3}\\s[0-9]{3}\\s[0-9]{3}\\b"
|
||||
score: 0.95
|
||||
- name: Numéro TVA avec contexte
|
||||
regex: "\\b(?:TVA|tva)\\s*:?\\s*FR[0-9A-Z]{2}[0-9]{9}\\b"
|
||||
score: 1.0
|
||||
context: ["TVA", "intracommunautaire", "numéro de TVA", "identification fiscale"]
|
||||
24
conf/recognizers/PII/generic/addresses.yaml
Normal file
24
conf/recognizers/PII/generic/addresses.yaml
Normal file
@@ -0,0 +1,24 @@
|
||||
# Recognizer pour adresses postales génériques
|
||||
recognizer_registry:
|
||||
recognizers:
|
||||
- name: GenericAddressRecognizer
|
||||
supported_language: fr
|
||||
supported_entity: ADRESSE
|
||||
patterns:
|
||||
- name: Adresse avec numéro et rue
|
||||
regex: "\\b\\d{1,4}(?:bis|ter|quater)?[A-Za-z]?\\s+(?:rue|avenue|boulevard|place|impasse|allée|chemin|route|quai|square|passage|villa|cité|cours|esplanade|promenade|sentier|voie|lotissement)\\s+[A-ZÀ-Ÿ][a-zà-ÿ\\s'-]{3,30}(?=\\s*[-,]|\\s*$|\\s+[0-9]{4,5}|\\s*\\n)"
|
||||
score: 0.8
|
||||
- name: Code postal générique
|
||||
regex: "\\b(?<!FR[0-9]{2}\\s)(?<![0-9]\\s)[0-9]{4,5}(?!\\s[0-9]{4})(?!\\s[0-9A-Z]{4})\\b"
|
||||
score: 0.6
|
||||
- name: Adresse complète avec ville et virgule
|
||||
regex: "\\b\\d{1,4}(?:bis|ter|quater)?[A-Za-z]?\\s+(?:rue|avenue|boulevard|place|impasse|allée|chemin|route|quai|square|passage|villa|cité|cours|esplanade|promenade|sentier|voie|lotissement)\\s+[A-ZÀ-Ÿ][a-zà-ÿ\\s'-]+?,\\s*[0-9]{4,5}\\s+[A-ZÀ-Ÿ][a-zà-ÿ'-]+(?=\\s*$|\\s*\\n|\\s*-)"
|
||||
score: 0.95
|
||||
- name: Adresse complète sans virgule
|
||||
regex: "\\b\\d{1,4}(?:bis|ter|quater)?[A-Za-z]?\\s+(?:rue|avenue|boulevard|place|impasse|allée|chemin|route|quai|square|passage|villa|cité|cours|esplanade|promenade|sentier|voie|lotissement)\\s+[A-ZÀ-Ÿ][a-zà-ÿ\\s'-]+?\\s+[0-9]{4,5}\\s+[A-ZÀ-Ÿ][a-zà-ÿ'-]+(?=\\s*$|\\s*\\n|\\s*-)"
|
||||
score: 0.9
|
||||
- name: Adresse simple avec type de voie
|
||||
regex: "\\b(?:rue|avenue|boulevard|place|impasse|allée|chemin|route|quai|square|passage|villa|cité|cours|esplanade|promenade|sentier|voie|lotissement)\\s+[A-ZÀ-Ÿ][a-zà-ÿ\\s'-]{3,30}(?=\\s*[-,]|\\s*$|\\s+[0-9]{4,5}|\\s*\\n)"
|
||||
score: 0.65
|
||||
context:
|
||||
["adresse", "domicile", "résidence", "siège", "demeurant", "résidant"]
|
||||
@@ -11,9 +11,6 @@ recognizer_registry:
|
||||
- name: IBAN compact
|
||||
regex: "\\b[A-Z]{2}[0-9]{2}[0-9A-Z]{12,30}\\b"
|
||||
score: 0.9
|
||||
- name: IBAN belge spécifique
|
||||
regex: "\\bBE[0-9]{2}\\s?[0-9]{4}\\s?[0-9]{4}\\s?[0-9]{4}\\b"
|
||||
score: 0.95
|
||||
- name: IBAN français spécifique
|
||||
regex: "\\bFR[0-9]{2}\\s?[0-9]{4}\\s?[0-9]{4}\\s?[0-9]{4}\\s?[0-9]{4}\\s?[0-9]{3}\\b"
|
||||
score: 0.95
|
||||
|
||||
39
conf/recognizers/PII/generic/credit_cards.yaml
Normal file
39
conf/recognizers/PII/generic/credit_cards.yaml
Normal file
@@ -0,0 +1,39 @@
|
||||
# Recognizer pour numéros de carte de crédit
|
||||
recognizer_registry:
|
||||
recognizers:
|
||||
- name: CreditCardRecognizer
|
||||
supported_language: fr
|
||||
supported_entity: CREDIT_CARD
|
||||
patterns:
|
||||
- name: Visa
|
||||
regex: "\\b4[0-9]{3}\\s?[0-9]{4}\\s?[0-9]{4}\\s?[0-9]{4}\\b"
|
||||
score: 1.0
|
||||
- name: Mastercard
|
||||
regex: "\\b5[1-5][0-9]{2}\\s?[0-9]{4}\\s?[0-9]{4}\\s?[0-9]{4}\\b"
|
||||
score: 1.0
|
||||
- name: American Express
|
||||
regex: "\\b3[47][0-9]{2}\\s?[0-9]{6}\\s?[0-9]{5}\\b"
|
||||
score: 1.0
|
||||
- name: Carte générique 16 chiffres
|
||||
regex: "\\b(?<!FR[0-9]{2}\\s)[0-9]{4}\\s?[0-9]{4}\\s?[0-9]{4}\\s?[0-9]{4}(?!\\s[0-9A-Z]{4}\\s[0-9A-Z]{3})\\b"
|
||||
score: 0.7
|
||||
- name: Carte générique 15 chiffres
|
||||
regex: "\\b[0-9]{4}\\s?[0-9]{6}\\s?[0-9]{5}\\b"
|
||||
score: 0.7
|
||||
- name: Carte avec contexte
|
||||
regex: "\\b(?:carte|card|numéro de carte)\\s*:?\\s*([0-9]{4}\\s?[0-9]{4}\\s?[0-9]{4}\\s?[0-9]{4})\\b"
|
||||
score: 0.9
|
||||
- name: Carte masquée
|
||||
regex: "\\b[0-9]{4}\\s?[*X]{4}\\s?[*X]{4}\\s?[0-9]{4}\\b"
|
||||
score: 0.8
|
||||
context:
|
||||
[
|
||||
"carte",
|
||||
"credit",
|
||||
"crédit",
|
||||
"visa",
|
||||
"mastercard",
|
||||
"amex",
|
||||
"paiement",
|
||||
"CB",
|
||||
]
|
||||
@@ -3,16 +3,22 @@ recognizer_registry:
|
||||
recognizers:
|
||||
- name: EmailRecognizer
|
||||
supported_language: fr
|
||||
supported_entity: EMAIL_ADDRESS
|
||||
supported_entity: ADRESSE_EMAIL
|
||||
patterns:
|
||||
- name: Email standard
|
||||
regex: "\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Z|a-z]{2,}\\b"
|
||||
score: 1.0
|
||||
- name: Email obfusqué
|
||||
- name: Email obfusqué avec crochets
|
||||
regex: "\\b[A-Za-z0-9._%+-]+\\s*\\[at\\]\\s*[A-Za-z0-9.-]+\\s*\\[dot\\]\\s*[A-Z|a-z]{2,}\\b"
|
||||
score: 0.8
|
||||
- name: Email avec (at) et (point)
|
||||
regex: "\\b[A-Za-z0-9._%+-]+\\s*\\(at\\)\\s*[A-Za-z0-9.-]+\\s*\\(point\\)\\s*[A-Z|a-z]{2,}\\b"
|
||||
score: 0.7
|
||||
- name: Email avec arobase écrite
|
||||
regex: "\\b[A-Za-z0-9._%+-]+\\s*(?:arobase|at)\\s*[A-Za-z0-9.-]+\\s*(?:point|dot)\\s*[A-Z|a-z]{2,}\\b"
|
||||
score: 0.75
|
||||
- name: Email avec espaces
|
||||
regex: "\\b[A-Za-z0-9._%+-]+\\s+@\\s+[A-Za-z0-9.-]+\\s+\\.\\s+[A-Z|a-z]{2,}\\b"
|
||||
score: 0.9
|
||||
context:
|
||||
["email", "courriel", "mail", "@", "contact", "adresse électronique"]
|
||||
|
||||
32
conf/recognizers/PII/generic/iban.yaml
Normal file
32
conf/recognizers/PII/generic/iban.yaml
Normal file
@@ -0,0 +1,32 @@
|
||||
# Recognizer pour numéros IBAN
|
||||
recognizer_registry:
|
||||
recognizers:
|
||||
- name: IBANRecognizer
|
||||
supported_language: fr
|
||||
supported_entity: IBAN
|
||||
patterns:
|
||||
- name: IBAN français
|
||||
regex: "\\bFR[0-9]{2}\\s?[0-9]{4}\\s?[0-9]{4}\\s?[0-9]{4}\\s?[0-9]{4}\\s?[0-9]{4}\\s?[0-9]{3}\\b"
|
||||
score: 1.0
|
||||
- name: IBAN belge
|
||||
regex: "\\bBE[0-9]{2}\\s?[0-9]{4}\\s?[0-9]{4}\\s?[0-9]{4}\\b"
|
||||
score: 1.0
|
||||
- name: IBAN générique européen
|
||||
regex: "\\b[A-Z]{2}[0-9]{2}\\s?(?:[A-Z0-9]{4}\\s?){2,7}[A-Z0-9]{1,4}\\b"
|
||||
score: 0.9
|
||||
- name: IBAN compact
|
||||
regex: "\\b[A-Z]{2}[0-9]{2}[A-Z0-9]{12,30}\\b"
|
||||
score: 0.85
|
||||
- name: IBAN avec contexte
|
||||
regex: "\\b(?:IBAN|iban)\\s*:?\\s*([A-Z]{2}[0-9]{2}\\s?(?:[A-Z0-9]{4}\\s?){2,7}[A-Z0-9]{1,4})\\b"
|
||||
score: 0.95
|
||||
context:
|
||||
[
|
||||
"IBAN",
|
||||
"iban",
|
||||
"compte",
|
||||
"bancaire",
|
||||
"virement",
|
||||
"RIB",
|
||||
"coordonnées bancaires",
|
||||
]
|
||||
@@ -3,7 +3,7 @@ recognizer_registry:
|
||||
recognizers:
|
||||
- name: IpAddressRecognizer
|
||||
supported_language: fr
|
||||
supported_entity: IP_ADDRESS
|
||||
supported_entity: ADRESSE_IP
|
||||
patterns:
|
||||
- name: IPv4
|
||||
regex: "\\b(?:(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9]?[0-9])\\.){3}(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9]?[0-9])\\b"
|
||||
|
||||
31
conf/recognizers/PII/generic/locations.yaml
Normal file
31
conf/recognizers/PII/generic/locations.yaml
Normal file
@@ -0,0 +1,31 @@
|
||||
# Recognizer pour lieux géographiques
|
||||
recognizer_registry:
|
||||
recognizers:
|
||||
- name: LocationRecognizer
|
||||
supported_language: fr
|
||||
supported_entity: LOCATION
|
||||
patterns:
|
||||
- name: Ville française (sauf si dans adresse)
|
||||
regex: "(?<!\\d{1,4}(?:bis|ter|quater)?\\s+(?:rue|avenue|boulevard|place|impasse|allée|chemin|route|square|villa|cité|passage|quai|cours|esplanade)\\s+[A-Za-zà-ÿ\\s'-]+?,\\s*[0-9]{5}\\s+)\\b(?:Paris|Marseille|Lyon|Toulouse|Nice|Nantes|Strasbourg|Montpellier|Bordeaux|Lille|Rennes|Reims|Le Havre|Saint-Étienne|Toulon|Grenoble|Dijon|Angers|Nîmes|Villeurbanne|Saint-Denis|Le Mans|Aix-en-Provence|Clermont-Ferrand|Brest|Limoges|Tours|Amiens|Perpignan|Metz|Besançon|Boulogne-Billancourt|Orléans|Mulhouse|Rouen|Caen|Nancy|Saint-Denis|Argenteuil|Montreuil|Roubaix|Tourcoing|Nanterre|Avignon|Créteil|Dunkerque|Poitiers|Asnières-sur-Seine|Courbevoie|Versailles|Colombes|Fort-de-France|Aulnay-sous-Bois|Saint-Pierre|Rueil-Malmaison|Pau|Aubervilliers|Champigny-sur-Marne|Antibes|La Rochelle|Cannes|Calais|Béziers|Colmar|Bourges|Mérignac|Saint-Nazaire|Issy-les-Moulineaux|Noisy-le-Grand|Évry|Cergy|Pessac|Vénissieux|Ivry-sur-Seine|Clichy|Troyes|Lorient|Montauban|Neuilly-sur-Seine|Antony|Sarcelles|Niort|Chambéry|Le Blanc-Mesnil|Beauvais|Maisons-Alfort|Chelles|Meaux|Levallois-Perret|Valence|Quimper|Arras|Villejuif|Hyères|La Seyne-sur-Mer|Fréjus|Albi|Sartrouville|Fontenay-sous-Bois|Clamart|Sevran|Compiègne|Drancy|Le Tampon|Bayonne|Massy|Gennevilliers|Corbeil-Essonnes|Saint-Ouen|Garges-lès-Gonesse|Bagneux|Cagnes-sur-Mer|Grasse|Talence|Castres|Douai|Wattrelos|Cholet|Vannes|Suresnes|Puteaux|Gagny|Belfort|Chartres|Saint-Priest|Vincennes|Montrouge|Meyzieu|Villepinte|Caluire-et-Cuire|Bourg-en-Bresse|Roanne|Concarneau|Saint-Brieuc|Épinay-sur-Seine|Vaulx-en-Velin|Rosny-sous-Bois|Arles|Thonon-les-Bains|Viry-Châtillon|Alfortville|Livry-Gargan|Herblay|Houilles|Schiltigheim|Franconville|Châtillon|Nogent-sur-Marne|Pontoise|L'Haÿ-les-Roses|Malakoff|Châtenay-Malabry|Conflans-Sainte-Honorine|Villemomble|Tremblay-en-France|Montigny-le-Bretonneux|Athis-Mons|Chatou|Villeneuve-Saint-Georges|Les 
Mureaux|Champs-sur-Marne|Yerres|Savigny-sur-Orge|Villetaneuse|Sainte-Geneviève-des-Bois|Marignane|Goussainville|Stains|Poissy|Rillieux-la-Pape|Charleville-Mézières)\\b"
|
||||
score: 0.7
|
||||
- name: Pays
|
||||
regex: "\\b(?:France|Belgique|Suisse|Canada|Allemagne|Italie|Espagne|Portugal|Royaume-Uni|États-Unis|Maroc|Algérie|Tunisie|Sénégal|Côte d'Ivoire|Mali|Burkina Faso|Niger|Tchad|République centrafricaine|Cameroun|Gabon|République démocratique du Congo|Madagascar|Maurice|Seychelles|Comores|Djibouti|Vanuatu|Nouvelle-Calédonie|Polynésie française|Wallis-et-Futuna|Saint-Pierre-et-Miquelon|Mayotte|Guyane|Martinique|Guadeloupe|La Réunion)\\b"
|
||||
score: 0.85
|
||||
- name: Lieu de naissance
|
||||
regex: "\\b(?:né|née|naissance)\\s+(?:à|au|aux|en|dans)\\s+([A-ZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ][a-zàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþß'-]+(?:[\\s-][A-Za-zàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþß'-]+)*)\\b"
|
||||
score: 0.95
|
||||
- name: Région française
|
||||
regex: "\\b(?:Île-de-France|Auvergne-Rhône-Alpes|Hauts-de-France|Nouvelle-Aquitaine|Occitanie|Grand Est|Provence-Alpes-Côte d'Azur|Pays de la Loire|Bretagne|Normandie|Bourgogne-Franche-Comté|Centre-Val de Loire|Corse)\\b"
|
||||
score: 0.8
|
||||
context:
|
||||
[
|
||||
"lieu",
|
||||
"ville",
|
||||
"pays",
|
||||
"région",
|
||||
"naissance",
|
||||
"domicile",
|
||||
"résidence",
|
||||
"né",
|
||||
"née",
|
||||
]
|
||||
37
conf/recognizers/PII/generic/names.yaml
Normal file
37
conf/recognizers/PII/generic/names.yaml
Normal file
@@ -0,0 +1,37 @@
|
||||
# Recognizer pour noms et prénoms génériques
|
||||
recognizer_registry:
|
||||
recognizers:
|
||||
- name: GenericPersonNameRecognizer
|
||||
supported_language: fr
|
||||
supported_entity: PERSONNE
|
||||
patterns:
|
||||
- name: Nom avec titre de civilité complet
|
||||
regex: "(?<!\\w)(?:Monsieur|Madame|Mademoiselle|Docteur|Professeur)\\s+[A-ZÀ-Ÿ][a-zà-ÿ'-]{2,}(?:\\s+[A-ZÀ-Ÿ][a-zà-ÿ'-]{2,})*(?=\\s*\\(|\\s*,|\\s*$|\\s+(?:né|née|demeurant))"
|
||||
score: 0.8
|
||||
- name: Nom avec contexte explicite
|
||||
regex: "(?<=\\b(?:témoin|expert|consultant|gérant|directeur)\\s+)[A-ZÀ-Ÿ][a-zà-ÿ'-]{2,}\\s+[A-ZÀ-Ÿ][a-zà-ÿ'-]{2,}(?=\\s*\\(|\\s*,|\\s*$)"
|
||||
score: 0.7
|
||||
- name: Prénom seul avec contexte
|
||||
regex: "(?<=\\b(?:je suis|mon nom est|je m'appelle|appelé|nommé)\\s+)[A-ZÀ-Ÿ][a-zà-ÿ'-]{2,}(?=\\s|$|\\.|,)"
|
||||
score: 0.6
|
||||
- name: Nom complet simple
|
||||
regex: "(?<=\\b(?:témoin|expert|consultant|gérant|directeur|Monsieur|Madame|appelé|nommé)\\s+)[A-ZÀ-Ÿ][a-zà-ÿ'-]{2,}\\s+[A-ZÀ-Ÿ][a-zà-ÿ'-]{2,}(?=\\s*\\(|\\s*,|\\s*$)"
|
||||
score: 0.5
|
||||
- name: Prénom français courant
|
||||
regex: "(?<!\\w)(?:Nicolas|Pierre|Jean|Marie|Paul|Michel|Philippe|Alain|Bernard|Christian|Daniel|François|Jacques|Laurent|Marc|Olivier|Pascal|Patrick|Stéphane|Thierry|Vincent|Antoine|Bruno|Christophe|David|Eric|Frédéric|Guillaume|Henri|Julien|Luc|Mathieu|Maxime|Sébastien|Thomas|Yves|Alexandre|André|Arnaud|Benoît|Cédric|Didier|Dominique|Emmanuel|Fabrice|Gérard|Hervé|Jérôme|Lionel|Louis|Ludovic|Marcel|Maurice|Michaël|Patrice|Raphaël|Raymond|Rémi|René|Richard|Robert|Roger|Serge|Sylvain|Sylvie|Anne|Catherine|Christine|Françoise|Isabelle|Martine|Monique|Nathalie|Nicole|Pascale|Sandrine|Sophie|Valérie|Véronique|Brigitte|Chantal|Corinne|Dominique|Élisabeth|Hélène|Jacqueline|Jeanne|Joëlle|Karine|Laurence|Michèle|Nadine|Patricia|Sabine|Simone|Suzanne|Sylviane|Thérèse|Viviane|Yvette|Yvonne|Agnès|Amélie|Audrey|Béatrice|Bénédicte|Camille|Carole|Caroline|Céline|Claire|Delphine|Émilie|Estelle|Évelyne|Florence|Geneviève|Gwenaëlle|Ingrid|Julie|Laetitia|Laure|Magali|Mélanie|Muriel|Odile|Pauline|Solange|Stéphanie|Virginie)(?=\\s|$|\\.|,)"
|
||||
score: 0.7
|
||||
context:
|
||||
[
|
||||
"témoin",
|
||||
"expert",
|
||||
"consultant",
|
||||
"gérant",
|
||||
"directeur",
|
||||
"Monsieur",
|
||||
"Madame",
|
||||
"appelé",
|
||||
"nommé",
|
||||
"je suis",
|
||||
"je m'appelle",
|
||||
"mon nom est",
|
||||
]
|
||||
12
conf/recognizers/PII/generic/personal_ids.yaml
Normal file
12
conf/recognizers/PII/generic/personal_ids.yaml
Normal file
@@ -0,0 +1,12 @@
|
||||
# Recognizer pour identifiants personnels génériques
|
||||
recognizer_registry:
|
||||
recognizers:
|
||||
- name: GenericPersonalIDRecognizer
|
||||
supported_language: fr
|
||||
supported_entity: IDENTIFIANT_PERSONNEL
|
||||
patterns:
|
||||
- name: Numéro d'identité générique
|
||||
regex: "\\b(?:ID|identifiant|matricule|numéro)\\s*:?\\s*[A-Z0-9]{6,15}\\b"
|
||||
score: 0.8
|
||||
|
||||
context: ["identifiant", "matricule", "référence", "code", "numéro ID"]
|
||||
26
conf/recognizers/PII/generic/phone_numbers.yaml
Normal file
26
conf/recognizers/PII/generic/phone_numbers.yaml
Normal file
@@ -0,0 +1,26 @@
|
||||
# Recognizer for generic phone numbers.
recognizer_registry:
  recognizers:
    - name: GenericPhoneRecognizer
      supported_language: fr
      supported_entity: TELEPHONE
      patterns:
        # No leading \b: "+" is a non-word character, so \b in front of it
        # only holds when a word character precedes the "+". That rejected
        # numbers written after a space or at the start of the text.
        - name: Téléphone international avec indicatif
          regex: "\\+[1-9]\\d{1,3}[\\s\\-\\.]?\\d{1,4}[\\s\\-\\.]?\\d{1,4}[\\s\\-\\.]?\\d{1,9}\\b"
          score: 0.95
        # NOTE(review): (?<!\s) rejects every number directly preceded by
        # whitespace. Left unchanged; confirm this is intended.
        - name: Téléphone français format standard
          regex: "\\b(?<!\\s)0[1-9](?:[\\s\\-\\.]?[0-9]{2}){4}(?![0-9])(?!\\s[A-Za-z])\\b"
          score: 0.85
        # No leading \b: "(" is a non-word character, same reasoning as for "+".
        - name: Téléphone avec parenthèses
          regex: "\\([0-9]{2,4}\\)[\\s\\-\\.]?[0-9]{2,4}[\\s\\-\\.]?[0-9]{2,4}[\\s\\-\\.]?[0-9]{2,4}\\b"
          score: 0.9
        # NOTE(review): the (?<!\s) lookbehind also applies to the two patterns
        # below and makes their street-type lookbehinds redundant; confirm.
        - name: Téléphone format groupé
          regex: "\\b(?<!FR)(?<![0-9]{5}\\s)(?<!\\s)(?<!boulevard\\s)(?<!avenue\\s)(?<!rue\\s)(?<!place\\s)0[1-9][\\s\\-\\.][0-9]{2}[\\s\\-\\.][0-9]{2}[\\s\\-\\.][0-9]{2}[\\s\\-\\.][0-9]{2}(?![\\s\\-\\.][0-9])(?!\\s[A-Za-z])(?!\\s[A-Z][a-z])"
          score: 0.7
        - name: Téléphone compact 10 chiffres
          regex: "\\b(?<!FR[0-9]{2}\\s)(?<![0-9])(?<!\\s)(?<!boulevard\\s)(?<!avenue\\s)(?<!rue\\s)(?<!place\\s)0[1-9](?:[0-9]{2}){4}(?![0-9])(?!\\s[A-Za-z])(?!\\s[A-Z][a-z])\\b"
          score: 0.5
        # Here the "+33" prefix follows the keyword and optional colon, so the
        # leading \b applies to the keyword and is correct as written.
        - name: Téléphone avec contexte
          regex: "\\b(?:tél|téléphone|mobile|portable)\\s*:?\\s*(?:(?:\\+33|0033|33)\\s?0?[1-9](?:[\\s\\-\\.]?[0-9]{2}){4}|0[1-9](?:[\\s\\-\\.]?[0-9]{2}){4})\\b"
          score: 0.9
      context: ["téléphone", "tél", "mobile", "portable", "contact", "numéro"]
|
||||
20
conf/recognizers/generic/location_gps.yaml
Normal file
20
conf/recognizers/generic/location_gps.yaml
Normal file
@@ -0,0 +1,20 @@
|
||||
# Recognizer pour données de localisation GPS
|
||||
recognizer_registry:
|
||||
recognizers:
|
||||
- name: LocationGPSRecognizer
|
||||
supported_language: fr
|
||||
supported_entity: LOCALISATION_GPS
|
||||
patterns:
|
||||
- name: Coordonnées GPS décimales
|
||||
regex: "\\b(?:lat|latitude)\\s*[=:]?\\s*[-+]?[0-9]{1,2}\\.[0-9]{4,}\\s*,?\\s*(?:lon|lng|longitude)\\s*[=:]?\\s*[-+]?[0-9]{1,3}\\.[0-9]{4,}\\b"
|
||||
score: 0.95
|
||||
- name: Coordonnées simples
|
||||
regex: "\\b[-+]?[0-9]{1,2}\\.[0-9]{4,}\\s*,\\s*[-+]?[0-9]{1,3}\\.[0-9]{4,}\\b"
|
||||
score: 0.85
|
||||
- name: Adresse MAC
|
||||
regex: "\\b[0-9A-Fa-f]{2}:[0-9A-Fa-f]{2}:[0-9A-Fa-f]{2}:[0-9A-Fa-f]{2}:[0-9A-Fa-f]{2}:[0-9A-Fa-f]{2}\\b"
|
||||
score: 0.9
|
||||
- name: Géolocalisation
|
||||
regex: "\\b(?:géolocalisation|GPS|position)\\s*:?\\s*[-+]?[0-9]{1,3}\\.[0-9]+\\s*,\\s*[-+]?[0-9]{1,3}\\.[0-9]+\\b"
|
||||
score: 0.9
|
||||
context: ["GPS", "coordonnées", "latitude", "longitude", "géolocalisation", "position"]
|
||||
19
conf/recognizers/generic/urls_online_ids.yaml
Normal file
19
conf/recognizers/generic/urls_online_ids.yaml
Normal file
@@ -0,0 +1,19 @@
|
||||
# Recognizer pour URLs et identifiants en ligne
|
||||
recognizer_registry:
|
||||
recognizers:
|
||||
- name: URLOnlineIDRecognizer
|
||||
supported_language: fr
|
||||
supported_entity: URL_IDENTIFIANT
|
||||
patterns:
|
||||
- name: URL complète
|
||||
regex: "\\bhttps?://[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}(?:/[^\\s]*)?\\b"
|
||||
score: 0.95
|
||||
|
||||
- name: Identifiant utilisateur
|
||||
regex: "\\b(?:user:|username:)\\s*[a-zA-Z0-9._-]{3,20}\\b"
|
||||
score: 0.85
|
||||
- name: Cookie ID
|
||||
regex: "\\b(?:cookie|session)\\s*[=:]\\s*[A-Za-z0-9+/=]{16,}\\b"
|
||||
score: 0.9
|
||||
context:
|
||||
["URL", "lien", "site", "utilisateur", "connexion", "session"]
|
||||
@@ -1,6 +1,7 @@
|
||||
import os
|
||||
import yaml
|
||||
import glob
|
||||
import re
|
||||
from typing import Dict, Any, List
|
||||
import logging
|
||||
|
||||
@@ -25,6 +26,8 @@ class ConfigLoader:
|
||||
for include_pattern in main_config['includes']:
|
||||
self._load_includes(include_pattern)
|
||||
|
||||
# Préprocesser les patterns du fichier principal aussi
|
||||
self._preprocess_regex_patterns(main_config)
|
||||
self._merge_config(main_config)
|
||||
|
||||
logger.info(f"Configuration chargée avec {len(self.config.get('recognizer_registry', {}).get('recognizers', []))} recognizers")
|
||||
@@ -41,11 +44,29 @@ class ConfigLoader:
|
||||
with open(file_path, 'r', encoding='utf-8') as f:
|
||||
module_config = yaml.safe_load(f)
|
||||
if module_config:
|
||||
# Préprocesser les patterns regex pour gérer la ponctuation
|
||||
self._preprocess_regex_patterns(module_config)
|
||||
self._merge_config(module_config)
|
||||
logger.debug(f"Module chargé: {file_path}")
|
||||
except Exception as e:
|
||||
logger.error(f"Erreur lors du chargement de {file_path}: {e}")
|
||||
|
||||
def _preprocess_regex_patterns(self, config: Dict[str, Any]):
    """Replace a trailing word-boundary anchor in recognizer regexes, in place.

    Every ``regex`` string found under
    ``config['recognizer_registry']['recognizers'][*]['patterns'][*]`` that
    ends with ``\\b`` has that anchor swapped for a lookahead accepting
    whitespace, one of ``,.;:!?()``, or end of input.

    Missing or empty sections (a YAML key with no value loads as ``None``),
    non-mapping entries and non-string ``regex`` values are skipped instead of
    raising.

    Args:
        config: Parsed configuration mapping; mutated in place.
    """
    if not isinstance(config, dict):
        return
    registry = config.get('recognizer_registry')
    if not isinstance(registry, dict):
        return

    for recognizer in registry.get('recognizers') or []:
        if not isinstance(recognizer, dict):
            continue
        for pattern in recognizer.get('patterns') or []:
            if not isinstance(pattern, dict):
                continue
            original_regex = pattern.get('regex')
            if not isinstance(original_regex, str) or not original_regex.endswith('\\b'):
                continue

            stem = original_regex[:-2]
            # If the stem itself ends in an odd run of backslashes, the final
            # backslash of the regex is escaped and the "b" is a literal
            # character, not a word boundary: leave such patterns untouched.
            preceding_backslashes = len(stem) - len(stem.rstrip('\\'))
            if preceding_backslashes % 2 == 1:
                continue

            new_regex = stem + '(?=\\s|[,.;:!?()]|$)'
            pattern['regex'] = new_regex
            logger.debug("Pattern modifié: %s -> %s", original_regex, new_regex)
|
||||
|
||||
def _merge_config(self, new_config: Dict[str, Any]):
|
||||
for key, value in new_config.items():
|
||||
if key == 'recognizer_registry':
|
||||
|
||||
@@ -2,6 +2,7 @@ from typing import List
|
||||
from presidio_analyzer import RecognizerResult
|
||||
from entity_refiners import EntityRefinerManager
|
||||
from post_processors import DeduplicationProcessor, OverlapResolver
|
||||
from post_processors.cleanup_processor import CleanupProcessor
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -9,9 +10,10 @@ logger = logging.getLogger(__name__)
|
||||
class AnalysisPipeline:
|
||||
def __init__(self):
|
||||
self.refiner_manager = EntityRefinerManager()
|
||||
self.cleanup_processor = CleanupProcessor()
|
||||
self.overlap_resolver = OverlapResolver()
|
||||
self.deduplicator = DeduplicationProcessor()
|
||||
logger.info("🚀 Pipeline d'analyse initialisé")
|
||||
logger.info("🚀 Pipeline d'analyse initialisé avec nettoyage avancé")
|
||||
|
||||
def process(self, text: str, results: List[RecognizerResult], allow_list_terms: List[str]) -> List[RecognizerResult]:
|
||||
"""Traite les résultats à travers le pipeline complet"""
|
||||
@@ -38,10 +40,13 @@ class AnalysisPipeline:
|
||||
)
|
||||
refined_results.append(refined_result)
|
||||
|
||||
# 3. Résolution des chevauchements
|
||||
resolved_results = self.overlap_resolver.process(refined_results, text)
|
||||
# 3. Nettoyage avancé des résultats
|
||||
cleaned_results = self.cleanup_processor.process(refined_results)
|
||||
|
||||
# 4. Déduplication
|
||||
# 4. Résolution des chevauchements
|
||||
resolved_results = self.overlap_resolver.process(cleaned_results, text)
|
||||
|
||||
# 5. Déduplication
|
||||
final_results = self.deduplicator.process(resolved_results, text)
|
||||
|
||||
logger.info(f"🎯 Pipeline complet: {len(results)} -> {len(final_results)} entités")
|
||||
|
||||
59
post_processors/cleanup_processor.py
Normal file
59
post_processors/cleanup_processor.py
Normal file
@@ -0,0 +1,59 @@
|
||||
from typing import List
|
||||
from presidio_analyzer import RecognizerResult
|
||||
import re
|
||||
|
||||
class CleanupProcessor:
    """Post-processor that removes overlapping and implausibly short detections."""

    def __init__(self):
        self.name = "CleanupProcessor"

    def process(self, results: List[RecognizerResult]) -> List[RecognizerResult]:
        """Return the results with overlaps resolved and short spans dropped.

        Results are visited in order of start offset. A result that overlaps
        an already kept one replaces it only when its score is strictly
        higher; otherwise it is discarded. The survivors are then filtered
        through ``_is_valid_result``. An empty input is returned as is.
        """
        if not results:
            return results

        kept = []
        for candidate in sorted(results, key=lambda item: item.start):
            # Only the first kept result that overlaps the candidate matters.
            rival = next(
                (entry for entry in kept if self._overlaps(candidate, entry)),
                None,
            )
            if rival is None:
                kept.append(candidate)
            elif candidate.score > rival.score:
                kept.remove(rival)
                kept.append(candidate)

        return [entry for entry in kept if self._is_valid_result(entry)]

    def _overlaps(self, result1: RecognizerResult, result2: RecognizerResult) -> bool:
        """Tell whether the two spans share at least one character position."""
        return result1.end > result2.start and result2.end > result1.start

    def _is_valid_result(self, result: RecognizerResult) -> bool:
        """Tell whether a result is long enough to be kept."""
        span = result.end - result.start
        # Anything shorter than two characters is rejected outright.
        if span < 2:
            return False
        # PERSON_NAME detections need at least four characters.
        return not (result.entity_type == "PERSON_NAME" and span < 4)
|
||||
@@ -20,14 +20,26 @@ class OverlapResolver:
|
||||
'BE_ENTERPRISE_NUMBER': 88,
|
||||
'PHONE_NUMBER': 85,
|
||||
'BE_PHONE_NUMBER': 85,
|
||||
'TELEPHONE': 84,
|
||||
'TELEPHONE_FRANCAIS': 86,
|
||||
'IP_ADDRESS': 82,
|
||||
'ADRESSE_FRANCAISE': 78, # Priorité plus élevée pour adresses françaises spécifiques
|
||||
'BE_ADDRESS': 75,
|
||||
'FR_ADDRESS': 75,
|
||||
'ORGANIZATION': 65,
|
||||
'LOCATION': 60,
|
||||
'ADRESSE': 70, # Adresse générique avec priorité plus faible
|
||||
'ORGANISATION': 65,
|
||||
'LOCATION': 60, # Priorité plus faible que les adresses
|
||||
'PERSON': 50,
|
||||
'PERSON_NAME': 45,
|
||||
'NRP': 40,
|
||||
'URL': 35
|
||||
'BE_PROFESSIONAL_ID': 40,
|
||||
'FR_CIVILITY_TITLE': 85,
|
||||
'FR_REGULATED_PROFESSION': 80,
|
||||
'CARTE_IDENTITE_FRANCAISE': 78,
|
||||
'PERMIS_CONDUIRE_FRANCAIS': 76,
|
||||
'PASSEPORT_FRANCAIS': 77,
|
||||
'URL': 35,
|
||||
'MARKET_SHARE': 35
|
||||
}
|
||||
|
||||
# Patterns pour identifier les organisations
|
||||
@@ -112,12 +124,12 @@ class OverlapResolver:
|
||||
# Correction 1: PERSON -> ORGANIZATION pour les noms d'entreprise
|
||||
if result.entity_type == 'PERSON' and self._is_organization_name(entity_text):
|
||||
corrected_result = RecognizerResult(
|
||||
entity_type='ORGANIZATION',
|
||||
entity_type='ORGANISATION',
|
||||
start=result.start,
|
||||
end=result.end,
|
||||
score=result.score + 0.1 # Bonus de confiance
|
||||
)
|
||||
logger.debug(f"🔄 Correction PERSON -> ORGANIZATION: '{entity_text}'")
|
||||
logger.debug(f"🔄 Correction PERSON -> ORGANISATION: '{entity_text}'")
|
||||
corrected_results.append(corrected_result)
|
||||
|
||||
# Correction 2: Séparer IP des adresses physiques
|
||||
|
||||
@@ -25,7 +25,7 @@ class IBANRefiner(EntityRefiner):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__("IBAN")
|
||||
self.iban_regex = re.compile(r"\b[A-Z]{2}[0-9]{2}(?:\s[0-9]{4}){3}\b", re.IGNORECASE)
|
||||
self.iban_regex = re.compile(r"\b[A-Z]{2}[0-9]{2}\s?(?:[A-Z0-9]{4}\s?){2,7}[A-Z0-9]{1,4}\b", re.IGNORECASE)
|
||||
|
||||
def refine(self, text: str, start: int, end: int) -> Optional[Tuple[int, int]]:
|
||||
ent_text = text[start:end].strip()
|
||||
|
||||
Reference in New Issue
Block a user