presidio modulaire

This commit is contained in:
nBiqoz
2025-09-07 12:29:08 +02:00
parent 85d95d05e5
commit c62e5b92d5
42 changed files with 1802 additions and 324 deletions

View File

@@ -31,7 +31,7 @@ COPY . /app/
# Définir la variable d'environnement pour que Presidio trouve notre fichier de configuration
# Dit à Presidio : "Ton fichier de config est ici"
ENV PRESIDIO_ANALYZER_CONFIG_FILE=/app/conf/default.yaml
ENV PRESIDIO_ANALYZER_CONFIG_FILE=/app/conf/main.yaml
# Exposer le port que Gunicorn va utiliser
EXPOSE 5001

306
app.py
View File

@@ -1,59 +1,83 @@
import os
import re
import logging
import re
import yaml
from flask import Flask, request, jsonify, make_response
from presidio_analyzer import AnalyzerEngineProvider
from config_loader import ConfigLoader
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig
from entity_refiners import EntityRefinerManager
from pipeline_manager import AnalysisPipeline
# Initialisation logger
logging.basicConfig(level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
app = Flask(__name__)
# Chargement du moteur
refiner_manager = EntityRefinerManager()
analyzer = None
allow_list_terms = set()
try:
logger.info("--- Presidio Analyzer Service Starting ---")
logger.info("--- Presidio Analyzer Service Starting (Architecture Modulaire) ---")
config_loader = ConfigLoader()
try:
config = config_loader.load_config("main.yaml")
logger.info("✅ Configuration modulaire chargée avec succès")
allow_list_terms = set(term.lower().strip() for term in config.get('allow_list', []))
logger.info(f"✅ Allow list chargée avec {len(allow_list_terms)} termes")
recognizers_count = len(config.get('recognizer_registry', {}).get('recognizers', []))
logger.info(f"📊 Nombre de recognizers chargés: {recognizers_count}")
import tempfile
# Écriture fichier temporaire config pour Presidio
with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False, encoding='utf-8') as tmp_file:
yaml.dump(config, tmp_file, default_flow_style=False, allow_unicode=True)
temp_config_path = tmp_file.name
with open(temp_config_path, 'r', encoding='utf-8') as f:
temp_content = f.read()
logger.info(f"🔍 Contenu du fichier temporaire COMPLET:\n{temp_content[:1000]}")
if 'nlp_configuration' in config:
logger.info("✅ nlp_configuration trouvée")
else:
logger.warning("❌ nlp_configuration MANQUANTE dans la config finale")
provider = AnalyzerEngineProvider(analyzer_engine_conf_file=temp_config_path)
analyzer = provider.create_engine()
os.unlink(temp_config_path)
except Exception as e:
logger.error(f"❌ Erreur avec la config modulaire: {e}")
logger.warning("🔄 Fallback vers default.yaml")
CONFIG_FILE_PATH = os.environ.get("PRESIDIO_ANALYZER_CONFIG_FILE", "conf/default.yaml")
provider = AnalyzerEngineProvider(analyzer_engine_conf_file=CONFIG_FILE_PATH)
analyzer = provider.create_engine()
logger.info(f"Analyzer ready. Languages: {analyzer.supported_languages}")
except Exception as e:
logger.exception("Error during AnalyzerEngine initialization.")
analyzer = None
# Test Temporaire pour les Regex via du Python directement
IBAN_REGEX = re.compile(r"\b[A-Z]{2}[0-9]{2}(?:\s[0-9]{4}){3}\b", re.IGNORECASE)
IPV4_REGEX = re.compile(
r"\b(?:(?:25[0-5]|2[0-4][0-9]|1\d{2}|[1-9]?\d)\.){3}"
r"(?:25[0-5]|2[0-4][0-9]|1\d{2}|[1-9]?\d)\b"
)
# Liste temporaire en surcouche des labels/phrases à exclure de l'anonymisation
IGNORE_LABELS = {
"témoins",
"témoins clés",
"coordonnées",
"coordonnées bancaires",
"contexte financier",
"données sensibles",
"contexte",
"montrent",
"montrent des",
"montrent des irrégularités",
"bénéficiaire",
}
def normalize_label(text: str) -> str:
    """Normalize an entity label for comparison against the ignore list.

    Lowercases, trims surrounding whitespace, and strips punctuation so
    that e.g. "Témoins clés:" and "témoins clés" compare equal.

    :param text: raw entity text extracted from the analyzed document
    :return: lowercase, trimmed, punctuation-free version of ``text``
    """
    # Bug fix: the previous version contained an early ``return`` that made
    # the punctuation-stripping line unreachable (diff-merge artifact).
    # Keep word characters and whitespace only; \w is Unicode-aware in
    # Python 3, so accented letters (é, à, ...) are preserved.
    return re.sub(r'[^\w\s]', '', text.strip().lower())
# Pipeline d'analyse modulaire, instancié une seule fois au démarrage du service
pipeline = AnalysisPipeline()
# Endpoint /analyze : analyse brute puis filtrage via le pipeline modulaire
@app.route("/analyze", methods=["POST"])
def analyze_text():
if not analyzer:
@@ -67,62 +91,182 @@ def analyze_text():
if not text_to_analyze:
return jsonify({"error": "text field is missing or empty"}), 400
results = analyzer.analyze(text=text_to_analyze, language=language)
# Analyse brute
raw_results = analyzer.analyze(text=text_to_analyze, language=language)
filtered_results = []
for res in results:
ent_text = text_to_analyze[res.start:res.end].strip()
ent_text_norm = normalize_label(ent_text)
# Pipeline modulaire complet
final_results = pipeline.process(text_to_analyze, raw_results, allow_list_terms)
if ent_text_norm in IGNORE_LABELS:
logger.debug(f"Skipping anonymization of label: '{ent_text}'")
continue
# Recadrage IBAN
if res.entity_type == "IBAN":
match = IBAN_REGEX.search(ent_text)
if match:
true_iban = match.group(0)
start_offset = ent_text.find(true_iban)
if start_offset != -1:
old_start, old_end = res.start, res.end
res.start += start_offset
res.end = res.start + len(true_iban)
logger.debug(f"Adjusted IBAN span: {old_start}-{old_end} => {res.start}-{res.end}")
else:
logger.warning(f"IBAN regex match but cannot find substring position: '{ent_text}'")
else:
logger.warning(f"Invalid IBAN detected, skipping: '{ent_text}'")
continue
# Recadrage IP_ADDRESS
if res.entity_type == "IP_ADDRESS":
match = IPV4_REGEX.search(ent_text)
if match:
true_ip = match.group(0)
start_offset = ent_text.find(true_ip)
if start_offset != -1:
old_start, old_end = res.start, res.end
res.start += start_offset
res.end = res.start + len(true_ip)
logger.debug(f"Adjusted IP span: {old_start}-{old_end} => {res.start}-{res.end}")
else:
logger.warning(f"IP regex match but cannot find substring position: '{ent_text}'")
else:
logger.warning(f"Invalid IP detected, skipping: '{ent_text}'")
continue
filtered_results.append(res)
# Résultat nettoyé
response_data = [res.to_dict() for res in filtered_results]
response_data = [res.to_dict() for res in final_results]
return make_response(jsonify(response_data), 200)
except Exception as e:
logger.exception("Error processing analysis")
return jsonify({"error": str(e)}), 500
@app.route("/health", methods=["GET"])
def health_check():
    """Liveness/readiness probe.

    Responds 200 with the supported languages when the analyzer engine
    initialized successfully at module load, 503 otherwise.
    """
    # Guard clause: analyzer is None when initialization failed.
    if not analyzer:
        return jsonify({"status": "unhealthy", "error": "Analyzer not initialized"}), 503
    payload = {
        "status": "healthy",
        "languages": analyzer.supported_languages,
        "version": "2.0.0",
    }
    return jsonify(payload), 200
def load_replacements():
    """Load anonymization replacement operators from the YAML config.

    Reads ``conf/anonymization/replacements.yaml`` and builds one presidio
    ``OperatorConfig("replace", ...)`` per entity type found under
    ``anonymizer_config.replacements``.  Every failure mode (missing or
    empty file, missing section, bad entry) is logged and degrades to an
    empty mapping instead of raising.

    :return: dict mapping entity type -> OperatorConfig (possibly empty)
    """
    path = "conf/anonymization/replacements.yaml"
    try:
        if not os.path.exists(path):
            logger.warning(f"❌ Fichier de configuration non trouvé: {path}")
            return {}
        with open(path, "r", encoding="utf-8") as fh:
            loaded = yaml.safe_load(fh)
        if not loaded:
            logger.warning("❌ Fichier de configuration vide")
            return {}
        mapping = loaded.get("anonymizer_config", {}).get("replacements", {})
        if not mapping:
            logger.warning("❌ Aucun remplacement trouvé dans la configuration")
            return {}
        operators = {}
        for entity_type, new_value in mapping.items():
            # One bad entry must not discard the others.
            try:
                operators[entity_type] = OperatorConfig("replace", {"new_value": new_value})
            except Exception as e:
                logger.error(f"❌ Erreur lors création opérateur {entity_type}: {e}")
        logger.info(f"✅ Loaded {len(operators)} replacement operators from config")
        return operators
    except Exception as e:
        logger.error(f"❌ Failed to load replacements config: {e}")
        return {}
# Initialisation anonymizer et opérateurs
try:
anonymizer = AnonymizerEngine()
logger.info("✅ Anonymizer engine initialized successfully")
replacement_operators = load_replacements()
if replacement_operators:
logger.info(f"✅ Loaded {len(replacement_operators)} custom replacement operators")
else:
logger.warning("⚠️ Aucun opérateur remplacement chargé, fallback par défaut")
replacement_operators = {}
except Exception as e:
logger.error(f"❌ Anonymizer initialization failed: {e}")
anonymizer = None
replacement_operators = {}
@app.route("/anonymize", methods=["POST"])
def anonymize_text():
    """Analyze then anonymize the posted text.

    Expects a JSON body with ``text`` (required), ``language`` (default
    "fr") and ``mode`` (default "pii").  Detected entities are filtered
    against the allow list, passed through the entity refiners, then
    replaced using the configured replacement operators.  Responds with
    the original text, the anonymized text, and the replaced entities.
    """
    logger.error("🚨 ENDPOINT /anonymize APPELÉ")
    global anonymizer, replacement_operators

    # The engine is created at module load; bail out if that failed.
    if anonymizer is None:
        return jsonify({"error": "Anonymizer not initialized"}), 500

    # Lazily (re)load the operators if they were never loaded or were lost.
    if not replacement_operators:
        logger.warning("⚠️ replacement_operators non défini, rechargement...")
        replacement_operators = load_replacements()
    logger.info(f"🔍 Opérateurs disponibles: {list(replacement_operators.keys())}")

    try:
        payload = request.get_json(force=True)
        text_to_anonymize = payload.get("text", "")
        language = payload.get("language", "fr")
        mode = payload.get("mode", "pii")
        if not text_to_anonymize:
            return jsonify({"error": "No text provided"}), 400
        logger.info(f"🔍 Texte à anonymiser: '{text_to_anonymize}'")

        # Optional hook: restrict detection to a mode-specific entity set
        # when a get_entities_by_mode helper exists at module level.
        entities_to_detect = (
            get_entities_by_mode(mode) if 'get_entities_by_mode' in globals() else None
        )
        analyzer_results = analyzer.analyze(
            text=text_to_anonymize,
            language=language,
            entities=entities_to_detect,
        )
        logger.info(f"🔍 Entités détectées: {[(r.entity_type, text_to_anonymize[r.start:r.end], r.score) for r in analyzer_results]}")

        filtered_results = []
        for res in analyzer_results:
            ent_text = text_to_anonymize[res.start:res.end].strip()
            # NOTE(review): computed but never used below — allow-list
            # matching uses ent_text_clean instead; confirm before removing.
            ent_text_norm = normalize_label(ent_text)
            logger.info(f"🔍 Traitement entité: {res.entity_type} = '{ent_text}' (score: {res.score})")
            logger.info(f"🔍 Allow list terms: {allow_list_terms}")

            # Allow-list check on a lowercase, punctuation- and
            # whitespace-free form of the entity text.
            ent_text_clean = re.sub(r'[^\w]', '', ent_text.strip().lower())
            logger.info(f"🔍 Texte nettoyé: '{ent_text_clean}'")
            is_allowed = any(
                ent_text_clean == term or ent_text_clean.startswith(term)
                for term in allow_list_terms
            )
            if is_allowed:
                logger.info(f"✅ Entité '{ent_text}' ignorée (dans allow list)")
                continue

            # Let the refiner tighten the span, or veto the entity (None).
            refined_positions = refiner_manager.refine_entity(
                text_to_anonymize, res.entity_type, res.start, res.end
            )
            if refined_positions is None:
                logger.info(f"❌ Entité {res.entity_type} supprimée par le refiner")
                continue
            res.start, res.end = refined_positions
            filtered_results.append(res)
            logger.info(f"✅ Entité {res.entity_type} conservée après refinement")

        logger.info(f"🔍 Entités finales pour anonymisation: {[(r.entity_type, text_to_anonymize[r.start:r.end]) for r in filtered_results]}")

        operators_to_use = replacement_operators if replacement_operators else {}
        logger.info(f"🔍 Opérateurs utilisés: {list(operators_to_use.keys())}")
        anonymized_result = anonymizer.anonymize(
            text=text_to_anonymize,
            analyzer_results=filtered_results,
            operators=operators_to_use,
        )
        logger.info(f"🔍 Résultat anonymisation: '{anonymized_result.text}'")

        return jsonify({
            "original_text": text_to_anonymize,
            "anonymized_text": anonymized_result.text,
            "entities_found": [
                {
                    "entity_type": result.entity_type,
                    "start": result.start,
                    "end": result.end,
                    "score": result.score,
                }
                for result in filtered_results
            ],
            "mode": mode,
        })
    except Exception as e:
        logger.error(f"Error during anonymization: {e}")
        return jsonify({"error": str(e)}), 500
if __name__ == "__main__":
app.run(host="0.0.0.0", port=5001)

View File

@@ -0,0 +1,19 @@
# Liste blanche - termes à ne pas anonymiser
allow_list:
# Références légales
- Loi
- Code
- Règlement
- Décret
- Arrêté
- BCE
- TVA
- IEC
- expert-comptable
# Termes financiers
- Euro
- EUR
- Euros
- Taux
- Valeur
- Prix

View File

@@ -0,0 +1,82 @@
# Configuration d'anonymisation complète
anonymizer_config:
default_anonymizers:
# Entités génériques
PERSON: replace
LOCATION: replace
ORGANIZATION: replace
DATE_TIME: replace
MONEY: replace
EMAIL_ADDRESS: replace
IBAN: replace
IP_ADDRESS: replace
# PII Génériques - Données sensibles RGPD
HEALTH_DATA: replace
BIOMETRIC_DATA: replace
SEXUAL_ORIENTATION: replace
POLITICAL_OPINIONS: replace
RGPD_FINANCIAL_DATA: replace
# PII Belges
BE_ENTERPRISE_NUMBER: replace
BE_NATIONAL_REGISTER_NUMBER: replace
BE_PHONE_NUMBER: replace
BE_ADDRESS: replace
BE_ID_CARD: replace
BE_PASSPORT: replace
# PII Françaises
FR_SOCIAL_SECURITY_NUMBER: replace
FR_SIRET: replace
FR_ADDRESS: replace
FR_TAX_ID: replace
FR_BANK_ACCOUNT: replace
FR_ID_CARD: replace
FR_PASSPORT: replace
FR_DRIVER_LICENSE: replace
# Business
BE_PROFESSIONAL_ID: replace
MARKET_SHARE: replace
replacements:
# Entités génériques
PERSON: "[PERSONNE]"
LOCATION: "[LIEU]"
ORGANIZATION: "[ORGANISATION]"
DATE_TIME: "[DATE]"
MONEY: "[MONTANT]"
EMAIL_ADDRESS: "[EMAIL]"
IBAN: "[IBAN]"
IP_ADDRESS: "[ADRESSE_IP]"
# PII Génériques - Données sensibles RGPD
HEALTH_DATA: "[DONNEES_SANTE]"
BIOMETRIC_DATA: "[DONNEES_BIOMETRIQUES]"
SEXUAL_ORIENTATION: "[ORIENTATION_SEXUELLE]"
POLITICAL_OPINIONS: "[OPINIONS_POLITIQUES]"
RGPD_FINANCIAL_DATA: "[DONNEES_FINANCIERES]"
# PII Belges
BE_ENTERPRISE_NUMBER: "[ENTREPRISE_BELGE]"
BE_NATIONAL_REGISTER_NUMBER: "[NRN_BELGE]"
BE_PHONE_NUMBER: "[TELEPHONE_BE]"
BE_ADDRESS: "[ADRESSE_BELGE]"
BE_ID_CARD: "[CARTE_ID_BE]"
BE_PASSPORT: "[PASSEPORT_BE]"
# PII Françaises
FR_SOCIAL_SECURITY_NUMBER: "[NUM_SECU_FR]"
FR_SIRET: "[SIRET_FR]"
FR_ADDRESS: "[ADRESSE_FR]"
FR_TAX_ID: "[NUM_FISCAL_FR]"
FR_BANK_ACCOUNT: "[COMPTE_BANCAIRE_FR]"
FR_ID_CARD: "[CARTE_ID_FR]"
FR_PASSPORT: "[PASSEPORT_FR]"
FR_DRIVER_LICENSE: "[PERMIS_FR]"
# Business
BE_PROFESSIONAL_ID: "[ID_PROFESSIONNEL_BE]"
MARKET_SHARE: "[PART_DE_MARCHE]"

View File

@@ -1,227 +0,0 @@
# =======================
# CONFIGURATION PRESIDIO
# =======================
supported_languages: [en, fr]
nlp_configuration:
nlp_engine_name: spacy
models:
- lang_code: en
model_name: en_core_web_lg
- lang_code: fr
model_name: fr_core_news_lg
ner_model_configuration:
labels_to_ignore:
- LOCATION
- MISC
- CARDINAL
- EVENT
- LANGUAGE
- LAW
- ORDINAL
- PERCENT
- PRODUCT
- QUANTITY
- WORK_OF_ART
confidence_thresholds:
DEFAULT_CONFIDENCE: 0.85
PERSON: 0.85
ORGANIZATION: 0.55
recognizer_registry:
load_predefined_recognizers: true
recognizers:
- name: FlexibleDateRecognizer
supported_language: fr
supported_entity: FLEXIBLE_DATE
patterns:
- name: Date format JJ mois AAAA
regex: "\\b(0?[1-9]|[12][0-9]|3[01])\\s+(janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre)\\s+(19|20)\\d{2}\\b"
score: 1.0
- name: Date format JJ/MM/AAAA
regex: "\\b(0[1-9]|[12][0-9]|3[01])[-/.](0[1-9]|1[012])[-/.](19|20)\\d{2}\\b"
score: 1.0
context: ["date", "né le", "signé le", "incident du"]
- name: BelgianAddressRecognizer
supported_language: fr
supported_entity: BE_ADDRESS
patterns:
- name: Adresse Belge complète
regex: "\\b(?:\\d{1,4}[A-Za-z]?(?:\\s*,)?\\s+)?(?:Avenue|Rue|Boulevard|Chaussée|Place|Quai|Impasse|Drève)(?:\\s+(?:de|la|le|d'|des))?(?:\\s+[A-Z][a-zà-ÿ'-]+)+,?(?:\\s+\\d{1,4}[A-Za-z]?)?,\\s*\\d{4}\\s+[A-Za-zà-ÿ'-]+"
score: 1.0
context: ["demeurant", "adresse", "siège social", "bureaux situés"]
- name: BelgianPhoneRecognizer
supported_language: fr
supported_entity: BE_PHONE_NUMBER
patterns:
- name: Numéro téléphone Belge (fixe ou mobile)
regex: "\\b0[1-9](?:[./\\s]?\\d{2,3}){3}\\b"
score: 0.95
context: ["Tel", "Tél", "téléphone", "gsm", "mobile"]
- name: SmartOrganizationRecognizer
supported_language: fr
supported_entity: ORGANIZATION
patterns:
- name: Nom + Forme légale (DigitalConsult SPRL)
regex: "\\b([A-Z][a-zà-ÿ]+(?:\\s[A-Z][a-zà-ÿ]+)*)\\s+(SPRL|SRL|SA|SCS|SNC)\\b"
score: 0.9
- name: Forme légale + Nom (SPRL DigitalConsult)
regex: "\\b(SPRL|SRL|SA|SCS|SNC)\\s+([A-Z][a-zà-ÿ]+(?:\\s[A-Z][a-zà-ÿ]+)*)\\b"
score: 0.9
context: ["société", "entreprise", "gérant de la"]
- name: ProfessionalIdRecognizer
supported_language: fr
supported_entity: BE_PRO_ID
patterns:
- name: Numéro IEC
regex: "(n°\\sIEC:?|IEC:?)\\s*\\d{6}"
score: 1.0
context: ["expert-comptable"]
- name: BelgianEnterpriseRecognizer
supported_language: fr
supported_entity: BE_ENTERPRISE_NUMBER
patterns:
- name: Numéro BCE/TVA Belge (avec ou sans BE)
regex: "\\b(BE)?\\s?0?\\d{3}[\\.\\s]?\\d{3}[\\.\\s]?\\d{3}\\b"
score: 1.0
context: ["BCE", "TVA", "intracommunautaire"]
- name: EmailRecognizer
supported_language: fr
supported_entity: EMAIL_ADDRESS
patterns:
- name: Email Pattern
regex: "\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Z|a-z]{2,}\\b"
score: 1.0
context: ["email", "courriel", "mail"]
- name: IbanRecognizer
supported_language: fr
supported_entity: IBAN
patterns:
- name: IBAN Pattern
regex: "\\b[A-Z]{2}[0-9]{2}(?:\\s[0-9]{4}){3}\\b"
score: 1.0
context: ["iban", "compte"]
- name: BelgianNRNRecognizer
supported_language: fr
supported_entity: BE_NATIONAL_REGISTER_NUMBER
patterns:
- name: NRN Pattern
regex: "\\b[0-9]{2}\\.[0-9]{2}\\.[0-9]{2}-[0-9]{3}\\.[0-9]{2}\\b"
score: 1.0
context: ["registre national"]
- name: FrenchINSEERecognizer
supported_language: fr
supported_entity: FR_SOCIAL_SECURITY_NUMBER
patterns:
- name: INSEE Pattern with flexible spaces
regex: "\\b[12]\\s*[0-9]{2}\\s*(?:0[1-9]|1[0-2])\\s*(?:2[ABab]|[0-9]{2})\\s*[0-9]{3}\\s*[0-9]{3}[\\s]?[0-9]{2}\\b"
score: 0.95
context: ["sécurité sociale", "insee", "nir"]
- name: IpAddressRecognizer
supported_language: fr
supported_entity: IP_ADDRESS
patterns:
- name: IPv4
regex: "\\b(?:(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9]?[0-9])\\.){3}(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9]?[0-9])\\b"
score: 1.0
- name: IPv6
regex: "\\b([0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}\\b"
score: 0.9
allow_list:
- Adresse
- ADRESSE
- Contrat
- Document
- Société
- Investisseur
- Montant
- Prêt
- Intérêt
- Partie
- Parties
- Annexe
- Remboursement
- Conversion
- Financement
- Sortie
- "Juste Valeur Marchande"
- Échéance
- Clause
- Clauses
- Principe
- Coûts
- Notifications
- Article
- Paragraphe
- Directeur
- Gérant
- Président
- DocuSign
- SPRL
- SA
- Loi
- Code
- Règlement
- Décret
- Arrêté
- Euro
- EUR
- Euros
- Taux
- Valeur
- Prix
- Coordonnées
- Témoins
- "Coordonnées bancaires"
- "Témoins clés"
- "montrent"
- "montrent des"
- "montrent des irrégularités"
- "bénéficiaire"
anonymizer_config:
default_anonymizers:
PERSON: replace
LOCATION: replace
ORGANIZATION: replace
DATE_TIME: replace
MONEY: replace
EMAIL_ADDRESS: replace
IBAN: replace
BE_ENTERPRISE_NUMBER: replace
BE_NATIONAL_REGISTER_NUMBER: replace
FR_SOCIAL_SECURITY_NUMBER: replace
BE_PHONE_NUMBER: replace
FLEXIBLE_DATE: replace
BE_ADDRESS: replace
BE_PRO_ID: replace
IP_ADDRESS: replace
replacements:
PERSON: "<PERSONNE>"
LOCATION: "<LIEU>"
ORGANIZATION: "<ORGANISATION>"
DATE_TIME: "<DATE>"
MONEY: "<MONTANT>"
EMAIL_ADDRESS: "<EMAIL>"
IBAN: "<IBAN>"
BE_ENTERPRISE_NUMBER: "<NUM_ENTREPRISE_BE>"
BE_NATIONAL_REGISTER_NUMBER: "<NRN_BELGE>"
FR_SOCIAL_SECURITY_NUMBER: "<NUM_SECU_FR>"
BE_PHONE_NUMBER: "<TELEPHONE_BE>"
FLEXIBLE_DATE: "<DATE>"
BE_ADDRESS: "<ADRESSE_BELGE>"
BE_PRO_ID: "<ID_PROFESSIONNEL>"
IP_ADDRESS: "<ADRESSE_IP>"

30
conf/main.yaml Normal file
View File

@@ -0,0 +1,30 @@
# =======================
# MODULAR PRESIDIO CONFIGURATION
# =======================

# Supported languages
supported_languages: [en, fr]
default_language: fr

# Configuration modules pulled in at load time
includes:
  # NLP configuration (spaCy, preserved as-is)
  - nlp/spacy_config.yaml
  # PII recognizers, one folder per jurisdiction (recent folders only)
  - recognizers/PII/belgian/*
  - recognizers/PII/french/*
  - recognizers/PII/generic/*
  # Business recognizers, per folder
  - recognizers/Business/belgian/*
  - recognizers/Business/french/*
  # Anonymization configuration
  - anonymization/*

# Simplified global settings
# Fix: in the flattened version these three keys had lost their
# indentation, leaving global_settings empty and promoting its
# children to top-level keys.
global_settings:
  version: "2.0.0"
  cache_enabled: true
  timeout_seconds: 30

View File

@@ -0,0 +1,33 @@
# spaCy NLP engine configuration (one large model per supported language).
# Fix: nesting restored — in the flattened version every key sat at
# column 0, which breaks the nlp_configuration mapping structure.
nlp_configuration:
  nlp_engine_name: spacy
  models:
    - lang_code: en
      model_name: en_core_web_lg
    - lang_code: fr
      model_name: fr_core_news_lg
  # Global NER configuration (no confidence_thresholds)
  ner_model_configuration:
    model_to_presidio_entity_mapping:
      PER: PERSON
      PERSON: PERSON
      ORG: ORGANIZATION
      ORGANIZATION: ORGANIZATION
      LOC: LOCATION
      LOCATION: LOCATION
      DATE: DATE_TIME
      TIME: DATE_TIME
      # NOTE(review): MISC -> DATE_TIME looks suspicious, and both MISC
      # and LOCATION are also listed in labels_to_ignore below — confirm
      # whether the mappings or the ignore entries are intended to win.
      MISC: DATE_TIME
    labels_to_ignore:
      - LOCATION
      - MISC
      - CARDINAL
      - EVENT
      - LANGUAGE
      - LAW
      - ORDINAL
      - PERCENT
      - PRODUCT
      - QUANTITY
      - WORK_OF_ART
    low_score_entity_names: []

View File

@@ -0,0 +1,24 @@
# Recognizer pour numéros d'entreprise belges
recognizer_registry:
recognizers:
- name: BelgianEnterpriseRecognizer
supported_language: fr
supported_entity: BE_ENTERPRISE_NUMBER
patterns:
- name: Numéro BCE avec deux points
regex: "(?<=\\bBCE\\s*:\\s*)((BE)?\\s?0\\d{3}[\\.\\s]?\\d{3}[\\.\\s]?\\d{3})\\b"
score: 1.0
- name: Numéro TVA avec deux points
regex: "(?<=\\bTVA\\s*:\\s*)(BE\\d{4}\\.\\d{3}\\.\\d{3})\\b"
score: 1.0
- name: Numéro d'entreprise général
regex: "(?<!(?:BCE|TVA)\\s*:\\s*)\\b(BE)?\\s?0\\d{3}[\\.\\s]?\\d{3}[\\.\\s]?\\d{3}\\b"
score: 0.9
- name: Numéro ONSS
regex: "\\bONSS\\s*:?\\s*\\d{7}\\b"
score: 0.95
- name: Numéro patronal
regex: "\\b(?:numéro\\s+)?patronal\\s*:?\\s*\\d{7}\\b"
score: 0.9
context:
["TVA", "intracommunautaire", "ONSS", "entreprise", "patronal"]

View File

@@ -0,0 +1,28 @@
# Recognizer pour noms d'organisations belges
recognizer_registry:
recognizers:
- name: SmartOrganizationRecognizer
supported_language: fr
supported_entity: ORGANIZATION
patterns:
# Noms avec suffixes typiques d'entreprise
- name: Noms entreprise avec suffixes
regex: "\\b([A-Z][a-zA-Zà-ÿ]+(?:Consult|Tech|Soft|Digital|Solutions|Services|Group|Corp|Company|Systems|Data|Cloud|Web|Net|Info|Cyber|Smart|Pro|Expert|Plus|Max|Global|International|Europe|Belgium|Brussels|Wallonie|Flandre))\\b(?!\\s*\\([^)]*(?:BCE|TVA)[^)]*\\))"
score: 0.9
# Formes légales complètes avec nom d'entreprise
- name: Formes légales complètes
regex: "\\b((?:SPRL|SRL|SA|ASBL|SCS|SNC)\\s+[A-Z][a-zA-Zà-ÿ]+(?:\\s+[A-Z][a-zA-Zà-ÿ]+)*)(?!\\s*\\([^)]*(?:BCE|TVA)[^)]*\\))"
score: 0.95
# Noms d'entreprise avec contexte spécifique (garder pour autres cas)
- name: Noms avec contexte entreprise
regex: "(?<=\\b(?:société|entreprise)\\s+)([A-Z][a-zA-Zà-ÿ]+(?:\\s+[A-Z][a-zA-Zà-ÿ]+)*)(?!\\s*\\([^)]*(?:BCE|TVA)[^)]*\\))"
score: 0.85
# Noms précédés de "gérant de la"
- name: Noms après gérant
regex: "(?<=gérant\\s+de\\s+la\\s+)([A-Z][a-zA-Zà-ÿ]+(?:\\s+[A-Z][a-zA-Zà-ÿ]+)*)(?!\\s*\\([^)]*(?:BCE|TVA)[^)]*\\))"
score: 0.8
context: ["société", "entreprise", "gérant de la", "administrateur"]

View File

@@ -0,0 +1,20 @@
# Recognizer pour identifiants professionnels belges
recognizer_registry:
recognizers:
- name: BelgianProfessionalIdRecognizer
supported_language: fr
supported_entity: BE_PROFESSIONAL_ID
patterns:
- name: Numéro IEC avec deux points
regex: "(?<=\\bIEC\\s*:\\s*)\\d{6}\\b"
score: 1.0
- name: Numéro IEC général
regex: "(?<!IEC\\s*:\\s*)\\b(?:n°\\s*)?IEC\\s*:?\\s*\\d{6}\\b"
score: 0.9
- name: Numéro d'avocat
regex: "\\b(?:avocat\\s+)?n°\\s*\\d{4,6}\\b"
score: 0.8
- name: Numéro de médecin
regex: "\\b(?:Dr\\.|médecin)\\s*n°\\s*\\d{5,7}\\b"
score: 0.85
context: ["expert-comptable", "IEC", "avocat", "médecin", "professionnel"]

View File

@@ -0,0 +1,17 @@
# Recognizer pour SIRET/SIREN français
recognizer_registry:
recognizers:
- name: FrenchSIRETRecognizer
supported_language: fr
supported_entity: FR_SIRET
patterns:
- name: SIRET complet
regex: "\\b[0-9]{3}\\s?[0-9]{3}\\s?[0-9]{3}\\s?[0-9]{5}\\b"
score: 0.9
- name: SIREN
regex: "\\b[0-9]{3}\\s?[0-9]{3}\\s?[0-9]{3}\\b"
score: 0.85
- name: SIRET avec espaces
regex: "\\bSIRET\\s*:?\\s*[0-9]{3}\\s[0-9]{3}\\s[0-9]{3}\\s[0-9]{5}\\b"
score: 0.95
context: ["SIRET", "SIREN", "établissement", "entreprise", "société"]

View File

@@ -0,0 +1,34 @@
# Recognizer pour parts de marché
recognizer_registry:
recognizers:
- name: MarketShareRecognizer
supported_language: fr
supported_entity: MARKET_SHARE
patterns:
# Pourcentages de marché
- name: Market Share Percentage
regex: "\\b(?:détient|possède|contrôle|représente)?\\s*(?:environ\\s+)?(?:\\d{1,2}(?:[,.]\\d{1,2})?%)\\s*(?:de\\s+(?:part\\s+de\\s+)?marché|du\\s+marché|de\\s+parts?)\\b"
score: 0.9
# Positions de marché
- name: Market Position
regex: "\\b(?:leader|numéro\\s+\\d+|\\d+(?:er|ème)\\s+acteur|position\\s+dominante|monopole)\\s+(?:du\\s+)?(?:marché|secteur)\\b"
score: 0.85
# Parts relatives
- name: Relative Market Share
regex: "\\b(?:majoritaire|minoritaire|principale|significative)\\s+(?:part\\s+de\\s+)?marché\\b"
score: 0.8
# Données de concentration
- name: Market Concentration
regex: "\\b(?:concentration|consolidation|fusion)\\s+(?:du\\s+)?marché\\b"
score: 0.75
# Chiffres d'affaires relatifs
- name: Revenue Share
regex: "\\b(?:\\d{1,2}(?:[,.]\\d{1,2})?%)\\s*(?:du\\s+)?(?:chiffre\\s+d'affaires|CA|revenus?)\\s+(?:du\\s+)?(?:marché|secteur)\\b"
score: 0.85
context:
["part de marché", "position concurrentielle", "leader", "concurrent", "secteur", "industrie", "chiffre d'affaires", "revenus", "concentration", "monopole", "oligopole"]

View File

@@ -0,0 +1,44 @@
# Recognizer pour adresses belges complètes
recognizer_registry:
recognizers:
- name: BelgianAddressRecognizer
supported_language: fr
supported_entity: BE_ADDRESS
patterns:
# Pattern principal : numéro + rue + code postal + ville (SANS contexte)
- name: Adresse complète avec numéro devant
regex: "\\b\\d{1,4}[A-Za-z]?\\s+(?:Avenue|Rue|Boulevard|Chaussée|Place|Quai|Impasse|Drève|Clos|Allée)\\s+(?:de\\s+la\\s+|de\\s+|du\\s+|des\\s+|d'|la\\s+|le\\s+)?[A-ZÀ-Ÿ][a-zà-ÿ'-]+(?:\\s+[A-ZÀ-Ÿ][a-zà-ÿ'-]+)*\\s*,\\s*[1-9]\\d{3}\\s+[A-ZÀ-Ÿ][a-zà-ÿ'-]+(?:\\s+[A-ZÀ-Ÿ][a-zà-ÿ'-]+)*"
score: 1.0
# Pattern avec lookbehind positif pour exclure "demeurant" de la capture
- name: Adresse après contexte demeurant
regex: "(?<=\\bdemeurant\\s)\\d{1,4}[A-Za-z]?\\s+(?:Avenue|Rue|Boulevard|Chaussée|Place|Quai|Impasse|Drève|Clos|Allée)\\s+(?:de\\s+la\\s+|de\\s+|du\\s+|des\\s+|d'|la\\s+|le\\s+)?[A-ZÀ-Ÿ][a-zà-ÿ'-]+(?:\\s+[A-ZÀ-Ÿ][a-zà-ÿ'-]+)*\\s*,\\s*[1-9]\\d{3}\\s+[A-ZÀ-Ÿ][a-zà-ÿ'-]+(?:\\s+[A-ZÀ-Ÿ][a-zà-ÿ'-]+)*"
score: 1.0
# Pattern avec lookbehind pour autres contextes
- name: Adresse après contexte résidant
regex: "(?<=\\b(?:résidant|domicilié|habite|situé)\\s)\\d{1,4}[A-Za-z]?\\s+(?:Avenue|Rue|Boulevard|Chaussée|Place|Quai|Impasse|Drève|Clos|Allée)\\s+(?:de\\s+la\\s+|de\\s+|du\\s+|des\\s+|d'|la\\s+|le\\s+)?[A-ZÀ-Ÿ][a-zà-ÿ'-]+(?:\\s+[A-ZÀ-Ÿ][a-zà-ÿ'-]+)*\\s*,\\s*[1-9]\\d{3}\\s+[A-ZÀ-Ÿ][a-zà-ÿ'-]+(?:\\s+[A-ZÀ-Ÿ][a-zà-ÿ'-]+)*"
score: 1.0
# Pattern avec lookbehind pour contexte bureaux
- name: Adresse après contexte bureaux
regex: "(?<=\\b(?:dans les bureaux situés|siège social situé)\\s)\\d{1,4}[A-Za-z]?\\s+(?:Avenue|Rue|Boulevard|Chaussée|Place|Quai|Impasse|Drève|Clos|Allée)\\s+(?:de\\s+la\\s+|de\\s+|du\\s+|des\\s+|d'|la\\s+|le\\s+)?[A-ZÀ-Ÿ][a-zà-ÿ'-]+(?:\\s+[A-ZÀ-Ÿ][a-zà-ÿ'-]+)*\\s*,\\s*[1-9]\\d{3}\\s+[A-ZÀ-Ÿ][a-zà-ÿ'-]+(?:\\s+[A-ZÀ-Ÿ][a-zà-ÿ'-]+)*"
score: 0.9
# Pattern alternatif : rue + numéro + ville (format classique)
- name: Adresse format classique
regex: "\\b(?:Avenue|Rue|Boulevard|Chaussée|Place|Quai|Impasse|Drève|Clos|Allée)\\s+(?:de\\s+la\\s+|de\\s+|du\\s+|des\\s+|d'|la\\s+|le\\s+)?[A-ZÀ-Ÿ][a-zà-ÿ'-]+(?:\\s+[A-ZÀ-Ÿ][a-zà-ÿ'-]+)*(?:\\s*,?\\s*\\d{1,4}[A-Za-z]?)?\\s*,\\s*[1-9]\\d{3}\\s+[A-ZÀ-Ÿ][a-zà-ÿ'-]+(?:\\s+[A-ZÀ-Ÿ][a-zà-ÿ'-]+)*"
score: 0.8
context:
[
"demeurant",
"résidant",
"domicilié",
"habite",
"situé au",
"sis à",
"dans les bureaux situés",
"siège social",
"adresse",
]

View File

@@ -0,0 +1,26 @@
# Recognizer pour documents belges
recognizer_registry:
recognizers:
- name: BelgianIDCardRecognizer
supported_language: fr
supported_entity: BE_ID_CARD
patterns:
- name: Carte d'identité belge
regex: "\\b[0-9]{3}\\-[0-9]{7}\\-[0-9]{2}\\b"
score: 0.95
- name: eID compact
regex: "\\b[0-9]{12}\\b"
score: 0.8
context: ["carte d'identité", "eID", "identiteitskaart", "pièce d'identité"]
- name: BelgianPassportRecognizer
supported_language: fr
supported_entity: BE_PASSPORT
patterns:
- name: Passeport belge
regex: "\\b[A-Z]{2}[0-9]{6}\\b"
score: 0.95
- name: Passeport avec espaces
regex: "\\b[A-Z]{2}\\s[0-9]{6}\\b"
score: 0.9
context: ["passeport", "passport", "paspoort", "document de voyage"]

View File

@@ -0,0 +1,25 @@
# Recognizer pour numéro de registre national belge
recognizer_registry:
recognizers:
- name: BelgianNRNRecognizer
supported_language: fr
supported_entity: BE_NATIONAL_REGISTER_NUMBER
patterns:
- name: NRN avec points et tiret
regex: "\\b[0-9]{2}\\.[0-9]{2}\\.[0-9]{2}-[0-9]{3}\\.[0-9]{2}\\b"
score: 1.0
- name: NRN compact
regex: "\\b[0-9]{11}\\b"
score: 0.7
- name: NRN avec espaces
regex: "\\b[0-9]{2}\\s[0-9]{2}\\s[0-9]{2}\\s[0-9]{3}\\s[0-9]{2}\\b"
score: 0.8
context:
[
"registre national",
"numéro national",
"NN",
"RN",
"identité",
"carte d'identité",
]

View File

@@ -0,0 +1,44 @@
# Recognizer pour numéros de téléphone belges
recognizer_registry:
recognizers:
- name: BelgianPhoneRecognizer
supported_language: fr
supported_entity: BE_PHONE_NUMBER
patterns:
# Patterns avec contexte Tel: et Tél:
- name: Téléphone fixe avec contexte Tel
regex: '(?<=Tel\s*:\s*)0[1-9](?:[./\s]?\d{2,3}){3}(?=\s|\)|$|[.,;])'
score: 0.99
- name: Téléphone fixe avec contexte Tél
regex: '(?<=Tél\s*:\s*)0[1-9](?:[./\s]?\d{2,3}){3}(?=\s|\)|$|[.,;])'
score: 0.99
- name: Mobile avec contexte Tel
regex: '(?<=Tel\s*:\s*)04[0-9]{2}[./\s]?[0-9]{2}[./\s]?[0-9]{2}[./\s]?[0-9]{2}(?=\s|\)|$|[.,;])'
score: 0.99
- name: Mobile avec contexte Tél
regex: '(?<=Tél\s*:\s*)04[0-9]{2}[./\s]?[0-9]{2}[./\s]?[0-9]{2}[./\s]?[0-9]{2}(?=\s|\)|$|[.,;])'
score: 0.99
# Patterns généraux (sans contexte spécifique)
- name: Téléphone fixe belge
regex: '(?<!BCE\s*:?\s*)\b0[1-9](?:[./\s]?\d{2,3}){3}(?=\b|\)|$|[.,;])(?!.*BCE)'
score: 0.95
- name: Mobile belge
regex: '\b04[0-9]{2}[./\s]?[0-9]{2}[./\s]?[0-9]{2}[./\s]?[0-9]{2}(?=\b|\)|$|[.,;])'
score: 0.98
- name: International belge
regex: '\+32\s?[1-9](?:[./\s]?\d{2,3}){3}(?=\b|\)|$|[.,;])'
score: 0.99
- name: Numéro vert belge
regex: '\b0800[./\s]?[0-9]{2}[./\s]?[0-9]{3}(?=\b|\)|$|[.,;])'
score: 0.9
context:
[
"Tel",
"Tél",
"téléphone",
"gsm",
"mobile",
"portable",
"appeler",
"joindre",
]

View File

@@ -0,0 +1,14 @@
# Recognizer pour adresses françaises
recognizer_registry:
recognizers:
- name: FrenchAddressRecognizer
supported_language: fr
supported_entity: FR_ADDRESS
patterns:
- name: Adresse française complète
regex: "\\b\\d{1,4}(?:bis|ter|quater)?\\s+(?:rue|avenue|boulevard|place|impasse|allée|chemin|route)\\s+[A-Za-zà-ÿ\\s'-]+,\\s*[0-9]{5}\\s+[A-Za-zà-ÿ\\s'-]+\\b"
score: 0.95
- name: Code postal français
regex: "\\b[0-9]{5}\\b"
score: 0.6
context: ["adresse", "domicile", "résidence", "siège social"]

View File

@@ -0,0 +1,39 @@
# Recognizer pour documents d'identité français
recognizer_registry:
recognizers:
- name: FrenchIDCardRecognizer
supported_language: fr
supported_entity: FR_ID_CARD
patterns:
- name: Numéro CNI nouveau format
regex: "\\b[0-9]{12}\\b"
score: 0.85
- name: Numéro CNI avec espaces
regex: "\\b[0-9]{4}\\s[0-9]{4}\\s[0-9]{4}\\b"
score: 0.9
context:
["carte d'identité", "CNI", "pièce d'identité", "numéro d'identité"]
- name: FrenchPassportRecognizer
supported_language: fr
supported_entity: FR_PASSPORT
patterns:
- name: Numéro de passeport français
regex: "\\b[0-9]{2}[A-Z]{2}[0-9]{5}\\b"
score: 0.95
- name: Passeport avec espaces
regex: "\\b[0-9]{2}\\s[A-Z]{2}\\s[0-9]{5}\\b"
score: 0.9
context: ["passeport", "passport", "document de voyage"]
- name: FrenchDriverLicenseRecognizer
supported_language: fr
supported_entity: FR_DRIVER_LICENSE
patterns:
- name: Permis de conduire français
regex: "\\b[0-9]{12}\\b"
score: 0.8
- name: Permis avec format
regex: "\\b(?:permis\\s+(?:de\\s+)?conduire\\s*:?\\s*)?[0-9]{4}\\s?[0-9]{4}\\s?[0-9]{4}\\b"
score: 0.9
context: ["permis de conduire", "permis", "conduire", "licence"]

View File

@@ -0,0 +1,26 @@
# Recognizer pour données financières françaises
recognizer_registry:
recognizers:
- name: FrenchTaxIDRecognizer
supported_language: fr
supported_entity: FR_TAX_ID
patterns:
- name: Numéro fiscal français
regex: "\\b[0-9]{13}\\b"
score: 0.85
- name: Référence fiscale
regex: "\\b(?:numéro\\s+fiscal\\s*:?\\s*)?[0-9]{4}\\s?[0-9]{4}\\s?[0-9]{5}\\b"
score: 0.9
context: ["numéro fiscal", "référence fiscale", "impôts", "SIP"]
- name: FrenchBankAccountRecognizer
supported_language: fr
supported_entity: FR_BANK_ACCOUNT
patterns:
- name: RIB français
regex: "\\b[0-9]{5}\\s[0-9]{5}\\s[0-9]{11}\\s[0-9]{2}\\b"
score: 0.95
- name: Numéro de compte
regex: "\\b(?:compte\\s*:?\\s*)?[0-9]{5}[0-9]{5}[0-9]{11}[0-9]{2}\\b"
score: 0.9
context: ["RIB", "compte bancaire", "numéro de compte", "relevé"]

View File

@@ -0,0 +1,18 @@
# Recognizer for French INSEE / NIR social-security numbers.
recognizer_registry:
  recognizers:
    - name: FrenchINSEERecognizer
      supported_language: fr
      supported_entity: FR_SOCIAL_SECURITY_NUMBER
      patterns:
        # Full NIR with optional spaces: sex (1|2), birth year, birth month,
        # département (2A/2B accepted for Corsica), commune, order number,
        # then the 2-digit control key.
        - name: INSEE complet avec espaces
          regex: "\\b[12]\\s*[0-9]{2}\\s*(?:0[1-9]|1[0-2])\\s*(?:2[ABab]|[0-9]{2})\\s*[0-9]{3}\\s*[0-9]{3}[\\s]?[0-9]{2}\\b"
          score: 0.95
        # 15 contiguous digits (13-digit NIR + 2-digit key), digits only.
        - name: NIR compact
          regex: "\\b[12][0-9]{12}[0-9]{2}\\b"
          score: 0.85
        # Dash-separated variant.
        - name: INSEE avec tirets
          regex: "\\b[12]-[0-9]{2}-[0-9]{2}-[0-9]{2}-[0-9]{3}-[0-9]{3}-[0-9]{2}\\b"
          score: 0.9
      context:
        ["sécurité sociale", "insee", "nir", "numéro de sécu", "carte vitale"]

View File

@@ -0,0 +1,20 @@
# Recognizer for IBANs (generic plus Belgian/French-specific layouts).
recognizer_registry:
  recognizers:
    - name: IbanRecognizer
      supported_language: fr
      supported_entity: IBAN
      patterns:
        # Country code + 2 check digits + 3-7 space-separated groups of 4.
        - name: IBAN avec espaces
          regex: "\\b[A-Z]{2}[0-9]{2}(?:\\s[0-9A-Z]{4}){3,7}\\b"
          score: 1.0
        # Compact form: 12-30 alphanumerics after the check digits.
        - name: IBAN compact
          regex: "\\b[A-Z]{2}[0-9]{2}[0-9A-Z]{12,30}\\b"
          score: 0.9
        # Belgian IBAN: BE + 2 check digits + 12 digits (16 chars total).
        - name: IBAN belge spécifique
          regex: "\\bBE[0-9]{2}\\s?[0-9]{4}\\s?[0-9]{4}\\s?[0-9]{4}\\b"
          score: 0.95
        # French IBAN. NOTE(review): this covers only 23 of the 27 characters
        # of a full FR IBAN (FR + 2 + 23) — confirm the tail is intentional.
        - name: IBAN français spécifique
          regex: "\\bFR[0-9]{2}\\s?[0-9]{4}\\s?[0-9]{4}\\s?[0-9]{4}\\s?[0-9]{4}\\s?[0-9]{3}\\b"
          score: 0.95
      context: ["iban", "compte", "bancaire", "virement", "RIB"]

View File

@@ -0,0 +1,14 @@
# Recognizer for biometric-data mentions (GDPR sensitive category).
recognizer_registry:
  recognizers:
    - name: BiometricDataRecognizer
      supported_language: fr
      supported_entity: BIOMETRIC_DATA
      patterns:
        # French keywords naming biometric data (fingerprints, face
        # recognition, iris/retina scan, DNA, genetic profile).
        - name: Données biométriques
          regex: "\\b(?:empreinte(?:s)?\\s+digitale(?:s)?|reconnaissance\\s+faciale|scan\\s+(?:iris|rétine)|biométrie|ADN|profil\\s+génétique)\\b"
          score: 0.95
        # Hex template/hash values explicitly labelled as biometric.
        - name: Identifiants biométriques
          regex: "\\b(?:template|hash)\\s+biométrique\\s*:?\\s*[A-F0-9]{32,}\\b"
          score: 0.9
      context: ["biométrie", "empreinte", "reconnaissance", "scan", "identification"]

View File

@@ -0,0 +1,74 @@
# Recognizer for dates and times (French/Belgian formats).
recognizer_registry:
  recognizers:
    - name: DateTimeRecognizer
      supported_language: fr
      supported_entity: DATE_TIME
      patterns:
        # Standard French formats with different separators.
        # Day/month leading zeros are optional ("1/5/2023" matches).
        - name: Date française DD/MM/YYYY
          regex: "\\b(?:0?[1-9]|[12][0-9]|3[01])/(?:0?[1-9]|1[0-2])/(?:19|20)\\d{2}\\b"
          score: 0.95
        - name: Date française DD-MM-YYYY
          regex: "\\b(?:0?[1-9]|[12][0-9]|3[01])-(?:0?[1-9]|1[0-2])-(?:19|20)\\d{2}\\b"
          score: 0.95
        - name: Date française DD MM YYYY (espaces)
          regex: "\\b(?:0?[1-9]|[12][0-9]|3[01])\\s+(?:0?[1-9]|1[0-2])\\s+(?:19|20)\\d{2}\\b"
          score: 0.9
        # ISO 8601 date (zero-padded).
        - name: Date ISO YYYY-MM-DD
          regex: "\\b(?:19|20)\\d{2}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12][0-9]|3[01])\\b"
          score: 0.98
        # Dates with spelled-out French month names.
        - name: Date avec mois en lettres
          regex: "\\b(?:0?[1-9]|[12][0-9]|3[01])\\s+(?:janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre)\\s+(?:19|20)\\d{2}\\b"
          score: 0.99
        # Belgian dot-separated format DD.MM.YYYY.
        - name: Date belge DD.MM.YYYY
          regex: "\\b(?:0?[1-9]|[12][0-9]|3[01])\\.(?:0?[1-9]|1[0-2])\\.(?:19|20)\\d{2}\\b"
          score: 0.95
        # Times (HH:MM and HH:MM:SS), single-digit hour allowed.
        - name: Heure HH:MM(:SS)?
          regex: "\\b(?:[01]?[0-9]|2[0-3]):[0-5][0-9](?::[0-5][0-9])?\\b"
          score: 0.85
        # Combined date and time (e.g. 12/05/2023 14:30).
        - name: Date et heure combinées
          regex: "\\b(?:0?[1-9]|[12][0-9]|3[01])[/-](?:0?[1-9]|1[0-2])[/-](?:19|20)\\d{2}\\s+(?:[01]?[0-9]|2[0-3]):[0-5][0-9](?::[0-5][0-9])?\\b"
          score: 0.97
        # Bare years in a strong context ("né en 1985"). Note the match
        # includes the context word itself.
        - name: Année avec contexte
          regex: "\\b(?:en|depuis|année|an|né en|décédé en)\\s+(?:19|20)\\d{2}\\b"
          score: 0.8
      context:
        [
          "date",
          "né le",
          "née le",
          "naissance",
          "décès",
          "décédé le",
          "le",
          "du",
          "au",
          "depuis",
          "jusqu'au",
          "entre",
          "avant",
          "après",
          "heure",
          "horaire",
          "rendez-vous",
          "réunion",
          "événement",
          "signature",
          "signé le",
          "établi le",
          "fait le",
        ]

View File

@@ -0,0 +1,18 @@
# Recognizer for email addresses (plain and obfuscated forms).
recognizer_registry:
  recognizers:
    - name: EmailRecognizer
      supported_language: fr
      supported_entity: EMAIL_ADDRESS
      patterns:
        # Standard address. BUGFIX: the TLD class was "[A-Z|a-z]{2,}", which
        # also accepted a literal "|" character inside the TLD; corrected to
        # "[A-Za-z]{2,}" in all three patterns below.
        - name: Email standard
          regex: "\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}\\b"
          score: 1.0
        # Obfuscated with [at] / [dot] markers.
        - name: Email obfusqué
          regex: "\\b[A-Za-z0-9._%+-]+\\s*\\[at\\]\\s*[A-Za-z0-9.-]+\\s*\\[dot\\]\\s*[A-Za-z]{2,}\\b"
          score: 0.8
        # Obfuscated with (at) / (point) markers.
        - name: Email avec (at) et (point)
          regex: "\\b[A-Za-z0-9._%+-]+\\s*\\(at\\)\\s*[A-Za-z0-9.-]+\\s*\\(point\\)\\s*[A-Za-z]{2,}\\b"
          score: 0.7
      context:
        ["email", "courriel", "mail", "@", "contact", "adresse électronique"]

View File

@@ -0,0 +1,17 @@
# Recognizer for GDPR-sensitive financial data.
recognizer_registry:
  recognizers:
    - name: RGPDFinancialDataRecognizer
      supported_language: fr
      supported_entity: RGPD_FINANCIAL_DATA
      patterns:
        # Payment-card numbers: Visa (4…), Mastercard (51-55), Amex (34/37).
        - name: Numéro de carte bancaire
          regex: "\\b(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|3[47][0-9]{13})\\b"
          score: 1.0
        # CVV/CVC security codes with their label.
        - name: Code de sécurité
          regex: "\\b(?:CVV|CVC|cryptogramme)\\s*:?\\s*[0-9]{3,4}\\b"
          score: 0.95
        # Labelled income amounts ("salaire: 2 500 €").
        - name: Revenus
          regex: "\\b(?:salaire|revenu|rémunération)\\s*:?\\s*[0-9]{1,3}(?:[\\s.,][0-9]{3})*\\s*€?\\b"
          score: 0.8
      context: ["financier", "bancaire", "carte", "paiement", "salaire"]

View File

@@ -0,0 +1,17 @@
# Recognizer for health data (GDPR sensitive category).
recognizer_registry:
  recognizers:
    - name: HealthDataRecognizer
      supported_language: fr
      supported_entity: HEALTH_DATA
      patterns:
        # Generic medical vocabulary.
        - name: Informations médicales
          regex: "\\b(?:maladie|pathologie|diagnostic|traitement|médicament|hospitalisation|chirurgie|opération|allergie|antécédent|symptôme)\\b"
          score: 0.85
        # Highly sensitive conditions (higher score).
        - name: Données médicales sensibles
          regex: "\\b(?:VIH|SIDA|cancer|diabète|dépression|schizophrénie|bipolarité|addiction)\\b"
          score: 0.95
        # Health-professional title followed by a capitalised surname.
        # NOTE(review): "[A-Z][a-z]+" will not match accented surnames — confirm.
        - name: Professionnels de santé
          regex: "\\b(?:Dr|Docteur|Médecin|Infirmier|Psychiatre|Psychologue)\\s+[A-Z][a-z]+\\b"
          score: 0.9
      context: ["santé", "médical", "hôpital", "clinique", "patient", "dossier médical"]

View File

@@ -0,0 +1,17 @@
# Recognizer for IP addresses.
recognizer_registry:
  recognizers:
    - name: IpAddressRecognizer
      supported_language: fr
      supported_entity: IP_ADDRESS
      patterns:
        # IPv4 dotted quad, each octet constrained to 0-255.
        - name: IPv4
          regex: "\\b(?:(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9]?[0-9])\\.){3}(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9]?[0-9])\\b"
          score: 1.0
        # Full (uncompressed) IPv6: eight 16-bit groups.
        - name: IPv6
          regex: "\\b([0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}\\b"
          score: 0.9
        # Compressed IPv6. NOTE(review): this is an approximation and does
        # not model "::" elision exactly — verify against real inputs.
        - name: IPv6 compressé
          regex: "\\b([0-9a-fA-F]{1,4}:){1,7}:([0-9a-fA-F]{1,4}:){0,6}[0-9a-fA-F]{1,4}\\b"
          score: 0.85
      context: ["IP", "adresse", "serveur", "réseau", "connexion"]

View File

@@ -0,0 +1,11 @@
# Recognizer for political-opinion mentions (GDPR sensitive category).
recognizer_registry:
  recognizers:
    - name: PoliticalOpinionsRecognizer
      supported_language: fr
      supported_entity: POLITICAL_OPINIONS
      patterns:
        # Vocabulary signalling political affiliation or voting behaviour;
        # low score because these words are common in neutral contexts.
        - name: Opinions politiques
          regex: "\\b(?:vote|électeur|parti\\s+politique|opinion\\s+politique)\\b"
          score: 0.7
      context: ["données sensibles", "RGPD", "politique", "privé"]

View File

@@ -0,0 +1,11 @@
# Recognizer for sexual-orientation mentions (GDPR sensitive category).
recognizer_registry:
  recognizers:
    - name: SexualOrientationRecognizer
      supported_language: fr
      supported_entity: SEXUAL_ORIENTATION
      patterns:
        # Explicit orientation vocabulary.
        - name: Orientation sexuelle
          regex: "\\b(?:homosexuel|hétérosexuel|bisexuel|orientation\\s+sexuelle)\\b"
          score: 0.9
      context: ["données sensibles", "RGPD", "orientation", "privé"]

107
config_loader.py Normal file
View File

@@ -0,0 +1,107 @@
import os
import yaml
import glob
from typing import Dict, Any, List
import logging
logger = logging.getLogger(__name__)
class ConfigLoader:
    """Loads and merges the modular Presidio YAML configuration.

    Reads a main file (``conf/main.yaml``), expands its ``includes`` glob
    patterns, merges every matching module into a single configuration dict,
    and falls back to the legacy ``conf/default.yaml`` when the main file is
    missing.
    """

    def __init__(self, config_dir: str = "conf"):
        # Directory containing main.yaml and the module files.
        self.config_dir = config_dir
        # Accumulated, merged configuration.
        self.config = {}

    def load_config(self, main_config_file: str = "main.yaml") -> Dict[str, Any]:
        """Load the main file plus its includes and return the merged dict.

        Raises:
            FileNotFoundError: when neither the main file nor the legacy
                default.yaml exists.
        """
        main_config_path = os.path.join(self.config_dir, main_config_file)
        if not os.path.exists(main_config_path):
            logger.warning(f"Fichier de configuration principal non trouvé: {main_config_path}")
            return self._load_legacy_config()
        with open(main_config_path, 'r', encoding='utf-8') as f:
            # BUGFIX: guard against an empty YAML file — safe_load returns
            # None, which would crash the "'includes' in main_config" test.
            main_config = yaml.safe_load(f) or {}
        # Merge the included modules first, then the main file itself.
        if 'includes' in main_config:
            for include_pattern in main_config['includes']:
                self._load_includes(include_pattern)
        self._merge_config(main_config)
        logger.info(f"Configuration chargée avec {len(self.config.get('recognizer_registry', {}).get('recognizers', []))} recognizers")
        return self.config

    def _load_includes(self, pattern: str):
        """Merge every YAML module matching *pattern* (relative to config_dir)."""
        pattern = os.path.expandvars(pattern)
        full_pattern = os.path.join(self.config_dir, pattern)
        matching_files = glob.glob(full_pattern, recursive=True)
        # Sorted for a deterministic merge order across filesystems.
        for file_path in sorted(matching_files):
            # BUGFIX: also accept the common ".yml" extension, not only ".yaml".
            if os.path.isfile(file_path) and file_path.endswith(('.yaml', '.yml')):
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        module_config = yaml.safe_load(f)
                    if module_config:
                        self._merge_config(module_config)
                        logger.debug(f"Module chargé: {file_path}")
                except Exception as e:
                    # A broken module must not abort the whole load.
                    logger.error(f"Erreur lors du chargement de {file_path}: {e}")

    def _merge_config(self, new_config: Dict[str, Any]):
        """Merge *new_config* into self.config (list-like sections append)."""
        for key, value in new_config.items():
            if key == 'recognizer_registry':
                if 'recognizer_registry' not in self.config:
                    self.config['recognizer_registry'] = {'recognizers': []}
                # Recognizers accumulate across modules.
                if 'recognizers' in value:
                    self.config['recognizer_registry']['recognizers'].extend(value['recognizers'])
                # Other registry keys (e.g. flags): the last file wins.
                for reg_key, reg_value in value.items():
                    if reg_key != 'recognizers':
                        self.config['recognizer_registry'][reg_key] = reg_value
            elif key == 'allow_list':
                # Allow-list terms accumulate across modules.
                if 'allow_list' not in self.config:
                    self.config['allow_list'] = []
                if isinstance(value, list):
                    self.config['allow_list'].extend(value)
            elif key == 'nlp_configuration':
                logger.info(f"🔧 Fusion de nlp_configuration: {value}")
                if 'nlp_configuration' not in self.config:
                    self.config['nlp_configuration'] = {}
                self._merge_dict(self.config['nlp_configuration'], value)
            elif isinstance(value, dict) and key in self.config and isinstance(self.config[key], dict):
                # Generic nested dicts are merged recursively.
                self._merge_dict(self.config[key], value)
            else:
                # Scalars and lists: the last file wins.
                self.config[key] = value

    def _merge_dict(self, target: Dict[str, Any], source: Dict[str, Any]):
        """Recursively merge *source* into *target* (source wins on scalars)."""
        for key, value in source.items():
            if isinstance(value, dict) and key in target and isinstance(target[key], dict):
                self._merge_dict(target[key], value)
            else:
                target[key] = value

    def _load_legacy_config(self) -> Dict[str, Any]:
        """Fallback: load the monolithic legacy conf/default.yaml."""
        legacy_path = os.path.join(self.config_dir, "default.yaml")
        if os.path.exists(legacy_path):
            logger.info("Utilisation de la configuration legacy: default.yaml")
            with open(legacy_path, 'r', encoding='utf-8') as f:
                return yaml.safe_load(f)
        else:
            raise FileNotFoundError(f"Aucun fichier de configuration trouvé dans {self.config_dir}")

    def get_recognizers(self) -> List[Dict[str, Any]]:
        """Return the merged recognizer list (empty when none loaded)."""
        return self.config.get('recognizer_registry', {}).get('recognizers', [])

    def get_supported_languages(self) -> List[str]:
        """Return the configured languages, defaulting to French."""
        return self.config.get('supported_languages', ['fr'])

    def load_single_file(self, file_path: str) -> Dict[str, Any]:
        """Load one YAML file (absolute, or relative to config_dir)."""
        full_path = os.path.join(self.config_dir, file_path) if not os.path.isabs(file_path) else file_path
        if not os.path.exists(full_path):
            raise FileNotFoundError(f"Fichier de configuration non trouvé: {full_path}")
        with open(full_path, 'r', encoding='utf-8') as f:
            return yaml.safe_load(f)

View File

@@ -1,5 +1,3 @@
version: '3.8'
services:
presidio-analyzer:
build:
@@ -8,11 +6,4 @@ services:
container_name: presidio-analyzer
restart: unless-stopped
ports:
- "5001" # Port corrigé selon la doc Microsoft
presidio-anonymizer:
image: mcr.microsoft.com/presidio-anonymizer:latest
container_name: presidio-anonymizer
restart: unless-stopped
ports:
- "5002" # Port corrigé selon la doc Microsoft
- "5001:5001"

56
entity_refiners.py Normal file
View File

@@ -0,0 +1,56 @@
from abc import ABC, abstractmethod
from typing import Optional, Tuple
import re
import logging
# Imports des raffineurs modulaires
from refiners.iban_refiner import IBANRefiner
from refiners.ip_refiner import IPAddressRefiner
from refiners.date_refiner import DateRefiner
from refiners.location_address_refiner import LocationAddressRefiner
from refiners.word_boundary_refiner import WordBoundaryRefiner
logger = logging.getLogger(__name__)


class EntityRefiner(ABC):
    """Base class for entity span refiners."""

    def __init__(self, entity_type: str):
        # Entity type this refiner is responsible for (e.g. "IBAN").
        self.entity_type = entity_type

    @abstractmethod
    def refine(self, text: str, start: int, end: int) -> Optional[Tuple[int, int]]:
        """Re-frame a detected entity; return (start, end) or None to drop it."""
        pass

    def should_process(self, entity_type: str) -> bool:
        """Tell whether this refiner handles the given entity type."""
        return entity_type == self.entity_type


class EntityRefinerManager:
    """Runs every applicable refiner over a detected entity, in order."""

    def __init__(self):
        # WordBoundaryRefiner first so later refiners see whole words.
        self.refiners = [
            WordBoundaryRefiner(),
            IBANRefiner(),
            IPAddressRefiner(),
            DateRefiner(),
            LocationAddressRefiner()
        ]
        logger.info(f"Initialized {len(self.refiners)} entity refiners")

    def register_refiner(self, refiner):
        """Register an additional refiner (appended, runs last)."""
        self.refiners.append(refiner)

    def refine_entity(self, text: str, entity_type: str, start: int, end: int) -> Optional[Tuple[int, int]]:
        """Chain all applicable refiners over one entity.

        BUGFIX: the previous implementation returned after the FIRST refiner
        that produced a result. Because WordBoundaryRefiner processes every
        entity type and runs first, the specialised refiners (IBAN, IP, date,
        location) were never reached, and a refiner returning None (reject)
        was silently ignored. Refiners are now applied sequentially: each one
        receives the span produced by the previous one, and None from any
        refiner drops the entity (which is what pipeline_manager expects).

        Returns:
            The refined (start, end) span, or None when the entity is rejected.
        """
        coords = (start, end)
        for refiner in self.refiners:
            if not refiner.should_process(entity_type):
                continue
            result = refiner.refine(text, coords[0], coords[1])
            if result is None:
                logger.debug(f"Entity rejected by {refiner.__class__.__name__}: {coords[0]}-{coords[1]}")
                return None
            if result != coords:
                logger.debug(f"Entity refined by {refiner.__class__.__name__}: {coords[0]}-{coords[1]} -> {result[0]}-{result[1]}")
            coords = result
        return coords

68
pipeline_manager.py Normal file
View File

@@ -0,0 +1,68 @@
from typing import List
from presidio_analyzer import RecognizerResult
from entity_refiners import EntityRefinerManager
from post_processors import DeduplicationProcessor, OverlapResolver
import logging
logger = logging.getLogger(__name__)
class AnalysisPipeline:
    """End-to-end post-processing pipeline for analyzer results.

    Order of stages: allow-list filtering -> per-entity span refinement ->
    overlap resolution -> deduplication.
    """

    def __init__(self):
        self.refiner_manager = EntityRefinerManager()
        self.overlap_resolver = OverlapResolver()
        self.deduplicator = DeduplicationProcessor()
        logger.info("🚀 Pipeline d'analyse initialisé")

    def process(self, text: str, results: List[RecognizerResult], allow_list_terms: List[str]) -> List[RecognizerResult]:
        """Run the detected entities through the full pipeline."""
        # 1. Drop entities whose surface text is allow-listed.
        kept = self._filter_allow_list(results, allow_list_terms, text)
        # 2. Refine each remaining entity's span individually; a refiner
        #    returning None drops the entity entirely.
        refined = []
        for entity in kept:
            coords = self.refiner_manager.refine_entity(
                text, entity.entity_type, entity.start, entity.end)
            if coords is None:
                continue
            refined.append(RecognizerResult(
                entity_type=entity.entity_type,
                start=coords[0],
                end=coords[1],
                score=entity.score))
        # 3. Resolve cross-type overlaps, then 4. deduplicate the survivors.
        final_results = self.deduplicator.process(
            self.overlap_resolver.process(refined, text), text)
        logger.info(f"🎯 Pipeline complet: {len(results)} -> {len(final_results)} entités")
        return final_results

    def _filter_allow_list(self, results: List[RecognizerResult], allow_list_terms: List[str], text: str) -> List[RecognizerResult]:
        """Return only the results whose surface text is not allow-listed."""
        if not allow_list_terms:
            return results
        normalized_terms = [term.lower().strip() for term in allow_list_terms]
        kept = []
        for result in results:
            entity_text = text[result.start:result.end].lower().strip()
            if entity_text in normalized_terms:
                logger.debug(f"🚫 Entité filtrée (allow-list): '{entity_text}'")
            else:
                kept.append(result)
        logger.info(f"🔍 Filtrage allow-list: {len(results)} -> {len(kept)} entités")
        return kept

View File

@@ -0,0 +1,4 @@
from .deduplication_processor import DeduplicationProcessor
from .overlap_resolver import OverlapResolver
__all__ = ['DeduplicationProcessor', 'OverlapResolver']

View File

@@ -0,0 +1,66 @@
from typing import List
from presidio_analyzer import RecognizerResult
import logging
logger = logging.getLogger(__name__)
class DeduplicationProcessor:
    """Applies rule-based deduplication to the detected entities."""

    def __init__(self):
        # Rules run in order; each receives the previous rule's output.
        self.rules = [
            LocationAddressRule()
        ]
        logger.info("🔧 DeduplicationProcessor initialisé avec les règles de déduplication")

    def process(self, results: List[RecognizerResult], text: str) -> List[RecognizerResult]:
        """Run every deduplication rule over *results* and return the survivors."""
        current = results.copy()
        for rule in self.rules:
            current = rule.apply(current, text)
        logger.info(f"🔧 DeduplicationProcessor: {len(results)} -> {len(current)} entités")
        return current


class LocationAddressRule:
    """Drops LOCATION entities that merely duplicate an ADDRESS entity."""

    def __init__(self):
        # French stop-words that are never meaningful locations on their own.
        self.insignificant_terms = {'le', 'la', 'les', 'de', 'du', 'des', 'à', 'au', 'aux'}

    def apply(self, results: List[RecognizerResult], text: str) -> List[RecognizerResult]:
        """Return results without LOCATIONs already covered by an ADDRESS.

        Note: the output is regrouped as addresses, then locations, then the
        rest (same ordering as the original implementation).
        """
        addresses = [r for r in results if r.entity_type == 'ADDRESS']
        locations = [r for r in results if r.entity_type == 'LOCATION']
        others = [r for r in results if r.entity_type not in ['LOCATION', 'ADDRESS']]
        kept_locations = []
        for location in locations:
            if self._should_keep_location(location, addresses, text):
                kept_locations.append(location)
            else:
                location_text = text[location.start:location.end]
                logger.debug(f"🗑️ Suppression LOCATION dupliquée: '{location_text}'")
        return addresses + kept_locations + others

    def _should_keep_location(self, location: RecognizerResult, addresses: List[RecognizerResult], text: str) -> bool:
        """A location survives when it is significant and not inside an address."""
        location_text = text[location.start:location.end].strip().lower()
        # Too short, or a bare stop-word: never worth keeping.
        if len(location_text) <= 3 or location_text in self.insignificant_terms:
            return False
        # Already covered (textually) by one of the detected addresses.
        return not any(self._is_overlapping_or_contained(location, address, text)
                       for address in addresses)

    def _is_overlapping_or_contained(self, loc: RecognizerResult, addr: RecognizerResult, text: str) -> bool:
        """True when the location's text is a substring of the address's text."""
        loc_text = text[loc.start:loc.end].strip().lower()
        addr_text = text[addr.start:addr.end].strip().lower()
        return loc_text in addr_text

View File

@@ -0,0 +1,241 @@
from typing import List
from presidio_analyzer import RecognizerResult
import logging
import re
logger = logging.getLogger(__name__)


class OverlapResolver:
    """
    Resolves overlaps between entities of different types.

    Priority: IBAN > EMAIL > PHONE > IP_ADDRESS > ADDRESS > LOCATION >
    ORGANIZATION > PERSON. When several detected entities cover the same
    span, only the one with the highest combined score (type priority +
    confidence + length) is kept.
    """

    def __init__(self):
        # Priority per entity type (higher wins on overlap).
        self.priority_order = {
            'IBAN': 100,
            'CREDIT_CARD': 95,
            'EMAIL_ADDRESS': 90,
            'BE_ENTERPRISE_NUMBER': 88,
            'PHONE_NUMBER': 85,
            'BE_PHONE_NUMBER': 85,
            'IP_ADDRESS': 82,
            'BE_ADDRESS': 75,
            'FR_ADDRESS': 75,
            'DATE_TIME': 70,
            'ORGANIZATION': 65,
            'LOCATION': 60,
            'PERSON': 50,
            'NRP': 40,
            'URL': 35
        }
        # Patterns identifying organization names.
        # BUGFIX: these raw strings previously doubled every backslash
        # (r'\\b...'), so each regex looked for a literal backslash character
        # and never matched real company names. Single backslashes restore
        # the intended \b / \w / \s metacharacters (the other regexes in this
        # module already used the correct form).
        self.organization_patterns = [
            r'\b\w+Consult\b',
            r'\bSPRL\s+\w+\b',  # SPRL + name
            r'\bSRL\s+\w+\b',   # SRL + name
            r'\bSA\s+\w+\b',    # SA + name
            r'\bASBL\s+\w+\b',  # ASBL + name
            r'\bSCS\s+\w+\b',   # SCS + name
            r'\bSNC\s+\w+\b',   # SNC + name
            r'\bSPRL\b',
            r'\bSRL\b',
            r'\bSA\b',
            r'\bASBL\b',
            r'\bSCS\b',
            r'\bSNC\b',
            r'\bLtd\b',
            r'\bInc\b',
            r'\bCorp\b',
            r'\bGmbH\b'
        ]
        logger.info(f"✅ OverlapResolver initialisé avec {len(self.priority_order)} types d'entités")

    def process(self, results: List["RecognizerResult"], text: str = "") -> List["RecognizerResult"]:
        """
        Resolve overlaps, keeping the highest-priority entity of each group.
        """
        if not results:
            return results
        original_count = len(results)
        # Apply targeted corrections before overlap resolution.
        corrected_results = self._apply_specific_corrections(results, text)
        # Sort by position for a sequential sweep.
        sorted_results = sorted(corrected_results, key=lambda x: (x.start, x.end))
        resolved_results = []
        i = 0
        while i < len(sorted_results):
            current = sorted_results[i]
            overlapping_group = [current]
            # Collect every entity overlapping the current one.
            j = i + 1
            while j < len(sorted_results):
                if self._is_overlapping(current, sorted_results[j]):
                    overlapping_group.append(sorted_results[j])
                    j += 1
                elif sorted_results[j].start >= current.end:
                    # The list is position-sorted: no further overlap possible.
                    break
                else:
                    j += 1
            # Keep exactly one winner per overlapping group.
            if len(overlapping_group) > 1:
                winner = self._resolve_overlap_group(overlapping_group, text)
                resolved_results.append(winner)
                # Skip past the group's members.
                i = j
            else:
                resolved_results.append(current)
                i += 1
        logger.info(f"🔧 OverlapResolver: {original_count} -> {len(resolved_results)} entités")
        return resolved_results

    def _apply_specific_corrections(self, results: List["RecognizerResult"], text: str) -> List["RecognizerResult"]:
        """
        Apply targeted corrections before overlap resolution.

        1. Company names wrongly tagged PERSON are retagged ORGANIZATION.
        2. Physical-address entities that swallowed an IPv4 literal are split
           into separate IP_ADDRESS entities plus a trimmed address entity.
        """
        corrected_results = []
        for result in results:
            entity_text = text[result.start:result.end] if text else ""
            # Correction 1: PERSON -> ORGANIZATION for company names.
            if result.entity_type == 'PERSON' and self._is_organization_name(entity_text):
                corrected_result = RecognizerResult(
                    entity_type='ORGANIZATION',
                    start=result.start,
                    end=result.end,
                    score=result.score + 0.1  # confidence bonus for the retag
                )
                logger.debug(f"🔄 Correction PERSON -> ORGANIZATION: '{entity_text}'")
                corrected_results.append(corrected_result)
            # Correction 2: separate IPv4 literals from physical addresses.
            elif result.entity_type in ['BE_ADDRESS', 'FR_ADDRESS'] and self._contains_ip_address(entity_text):
                ip_matches = list(re.finditer(r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b', entity_text))
                if ip_matches:
                    # Each IPv4 literal becomes its own IP_ADDRESS entity.
                    for ip_match in ip_matches:
                        ip_start = result.start + ip_match.start()
                        ip_end = result.start + ip_match.end()
                        ip_result = RecognizerResult(
                            entity_type='IP_ADDRESS',
                            start=ip_start,
                            end=ip_end,
                            score=0.95
                        )
                        corrected_results.append(ip_result)
                        logger.debug(f"🔄 IP extraite de l'adresse: '{ip_match.group()}'")
                    # Re-detect the physical address (street keyword up to a
                    # 4-digit postal code + town) inside the original span.
                    address_pattern = r'\b(?:Avenue|Rue|Boulevard|Chaussée|Place|Quai|Impasse|Drève|Clos|Allée)\b.*?\b[1-9]\d{3}\s+[A-Za-zà-ÿ\'-]+'
                    address_match = re.search(address_pattern, entity_text, re.IGNORECASE)
                    if address_match:
                        address_start = result.start + address_match.start()
                        address_end = result.start + address_match.end()
                        # Keep the trimmed address only if it does not overlap
                        # any of the extracted IP spans.
                        ip_overlaps = any(not (address_end <= ip_start or address_start >= ip_end)
                                          for ip_match in ip_matches
                                          for ip_start, ip_end in [(result.start + ip_match.start(), result.start + ip_match.end())])
                        if not ip_overlaps:
                            address_result = RecognizerResult(
                                entity_type=result.entity_type,
                                start=address_start,
                                end=address_end,
                                score=result.score
                            )
                            corrected_results.append(address_result)
                            logger.debug(f"🔄 Adresse physique séparée: '{address_match.group()}'")
                else:
                    corrected_results.append(result)
            else:
                corrected_results.append(result)
        return corrected_results

    def _is_organization_name(self, text: str) -> bool:
        """True when *text* matches one of the company-name patterns."""
        for pattern in self.organization_patterns:
            if re.search(pattern, text, re.IGNORECASE):
                return True
        return False

    def _contains_ip_address(self, text: str) -> bool:
        """True when *text* contains an IPv4-looking literal."""
        ip_pattern = r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b'
        return bool(re.search(ip_pattern, text))

    def _is_overlapping(self, entity1: "RecognizerResult", entity2: "RecognizerResult") -> bool:
        """True when the two spans share at least one character."""
        return not (entity1.end <= entity2.start or entity1.start >= entity2.end)

    def _resolve_overlap_group(self, overlapping_entities: List["RecognizerResult"], text: str = "") -> "RecognizerResult":
        """
        Pick the winner of a group of overlapping entities.

        Criteria: 1) type priority, 2) confidence score, 3) span length.
        """
        def get_priority_score(entity):
            base_priority = self.priority_order.get(entity.entity_type, 0)
            confidence_bonus = entity.score * 10  # score 0.9 => +9 points
            entity_length = entity.end - entity.start
            length_bonus = entity_length * 0.1
            # Extra weight for IBANs that really start with a country code,
            # so they beat lookalikes such as FR_DRIVER_LICENSE numbers.
            if entity.entity_type == 'IBAN':
                if text:
                    entity_text = text[entity.start:entity.end].replace(' ', '')
                    if re.match(r'^[A-Z]{2}[0-9]{2}', entity_text):
                        base_priority += 20
            return base_priority + confidence_bonus + length_bonus
        sorted_entities = sorted(overlapping_entities,
                                 key=get_priority_score,
                                 reverse=True)
        winner = sorted_entities[0]
        # Log the discarded entities (only when text is available).
        if text:
            for loser in sorted_entities[1:]:
                loser_text = text[loser.start:loser.end]
                logger.debug(f"❌ Écarté: {loser.entity_type} '{loser_text}' (score: {get_priority_score(loser):.1f})")
            winner_text = text[winner.start:winner.end]
            logger.debug(f"✅ Gagnant: {winner.entity_type} '{winner_text}' (score: {get_priority_score(winner):.1f})")
        return winner

    def add_entity_priority(self, entity_type: str, priority: int):
        """Add or change the priority of an entity type."""
        self.priority_order[entity_type] = priority
        logger.info(f"📊 Priorité mise à jour: {entity_type} = {priority}")

1
refiners/__init__.py Normal file
View File

@@ -0,0 +1 @@
# Refiners package

89
refiners/date_refiner.py Normal file
View File

@@ -0,0 +1,89 @@
from abc import ABC, abstractmethod
from typing import Optional, Tuple
import re
import logging

logger = logging.getLogger(__name__)


class EntityRefiner(ABC):
    """Base class for entity span refiners."""

    def __init__(self, entity_type: str):
        self.entity_type = entity_type

    @abstractmethod
    def refine(self, text: str, start: int, end: int) -> Optional[Tuple[int, int]]:
        """Re-frame a detected entity; return (start, end) or None to drop it."""
        pass

    def should_process(self, entity_type: str) -> bool:
        """Tell whether this refiner handles the given entity type."""
        return entity_type == self.entity_type


class DateRefiner(EntityRefiner):
    """Refiner for DATE_TIME entities — rejects common false positives."""

    def __init__(self):
        super().__init__("DATE_TIME")
        # Patterns that confirm a real date/time.
        # BUGFIX: aligned with the YAML DateTimeRecognizer — day/month now
        # accept a single digit ("1/5/2023"), hours accept a single digit
        # ("9:30"), and the Belgian DD.MM.YYYY format is validated too.
        # Previously those recognizer matches were rejected here as
        # "invalid format".
        self.valid_date_patterns = [
            # DD/MM/YYYY (leading zeros optional)
            re.compile(r"\b(?:0?[1-9]|[12][0-9]|3[01])/(?:0?[1-9]|1[0-2])/(?:19|20)\d{2}\b"),
            # DD-MM-YYYY
            re.compile(r"\b(?:0?[1-9]|[12][0-9]|3[01])-(?:0?[1-9]|1[0-2])-(?:19|20)\d{2}\b"),
            # Belgian DD.MM.YYYY
            re.compile(r"\b(?:0?[1-9]|[12][0-9]|3[01])\.(?:0?[1-9]|1[0-2])\.(?:19|20)\d{2}\b"),
            # ISO YYYY-MM-DD (always zero-padded)
            re.compile(r"\b(?:19|20)\d{2}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12][0-9]|3[01])\b"),
            # Day + French month name + year
            re.compile(r"\b(?:0?[1-9]|[12][0-9]|3[01])\s+(?:janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre)\s+(?:19|20)\d{2}\b", re.IGNORECASE),
            # Times HH:MM(:SS), single-digit hour allowed
            re.compile(r"\b(?:[01]?[0-9]|2[0-3]):[0-5][0-9](?::[0-5][0-9])?\b")
        ]
        # Patterns to reject (common false positives).
        self.reject_patterns = [
            # Belgian IBAN prefixes (BE + digits)
            re.compile(r"\bBE\d{2,}\b", re.IGNORECASE),
            # Belgian enterprise numbers
            re.compile(r"\bBE\d{3}\.\d{3}\.\d{3}\b"),
            # Administrative acronyms (VAT, social security, company IDs)
            re.compile(r"\b(?:HTVA|TVA|BCE|ONSS|SIREN|SIRET)\b", re.IGNORECASE),
            # Literal "sensitive data" phrases
            re.compile(r"\b(?:données?\s+sensibles?)\b", re.IGNORECASE),
            # Isolated postal codes
            re.compile(r"^\d{4}$"),
            # Short alphanumeric codes (2 letters + 1-2 digits)
            re.compile(r"^[A-Z]{2}\d{1,2}$")
        ]

    def refine(self, text: str, start: int, end: int) -> Optional[Tuple[int, int]]:
        """Validate that the detected entity is really a date/time.

        Returns:
            (start, end) unchanged when the text is a valid date/time,
            None when it matches a reject pattern or no valid format.
        """
        ent_text = text[start:end].strip()
        # Reject known false-positive shapes first.
        for reject_pattern in self.reject_patterns:
            if reject_pattern.search(ent_text):
                logger.info(f"Date rejetée (faux positif): '{ent_text}'")
                return None
        # Accept as soon as one valid date/time pattern matches.
        for valid_pattern in self.valid_date_patterns:
            if valid_pattern.search(ent_text):
                logger.info(f"Date validée: '{ent_text}'")
                return (start, end)
        # No valid pattern found: reject.
        logger.info(f"Date rejetée (format invalide): '{ent_text}'")
        return None

    def validate_date_logic(self, day: int, month: int, year: int) -> bool:
        """Validate calendar logic (day exists in month, leap years).

        NOTE(review): helper not currently called by refine() — kept for
        future use; confirm whether it should be wired in.
        """
        if month < 1 or month > 12:
            return False
        days_in_month = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
        # Leap-year adjustment for February.
        if year % 4 == 0 and (year % 100 != 0 or year % 400 == 0):
            days_in_month[1] = 29
        return 1 <= day <= days_in_month[month - 1]

49
refiners/iban_refiner.py Normal file
View File

@@ -0,0 +1,49 @@
from abc import ABC, abstractmethod
from typing import Optional, Tuple
import re
import logging
logger = logging.getLogger(__name__)
class EntityRefiner(ABC):
"""Classe de base pour le recadrage d'entités"""
def __init__(self, entity_type: str):
self.entity_type = entity_type
@abstractmethod
def refine(self, text: str, start: int, end: int) -> Optional[Tuple[int, int]]:
"""Recadre une entité détectée"""
pass
def should_process(self, entity_type: str) -> bool:
"""Vérifie si ce raffineur doit traiter ce type d'entité"""
return entity_type == self.entity_type
class IBANRefiner(EntityRefiner):
"""Raffineur pour les IBAN"""
def __init__(self):
super().__init__("IBAN")
self.iban_regex = re.compile(r"\b[A-Z]{2}[0-9]{2}(?:\s[0-9]{4}){3}\b", re.IGNORECASE)
def refine(self, text: str, start: int, end: int) -> Optional[Tuple[int, int]]:
ent_text = text[start:end].strip()
match = self.iban_regex.search(ent_text)
if not match:
logger.warning(f"Invalid IBAN detected, skipping: '{ent_text}'")
return None
true_iban = match.group(0)
start_offset = ent_text.find(true_iban)
if start_offset == -1:
logger.warning(f"IBAN regex match but cannot find substring position: '{ent_text}'")
return None
new_start = start + start_offset
new_end = new_start + len(true_iban)
logger.debug(f"Adjusted IBAN span: {start}-{end} => {new_start}-{new_end}")
return (new_start, new_end)

52
refiners/ip_refiner.py Normal file
View File

@@ -0,0 +1,52 @@
from abc import ABC, abstractmethod
from typing import Optional, Tuple
import re
import logging

logger = logging.getLogger(__name__)


class EntityRefiner(ABC):
    """Base class for entity span refiners."""

    def __init__(self, entity_type: str):
        self.entity_type = entity_type

    @abstractmethod
    def refine(self, text: str, start: int, end: int) -> Optional[Tuple[int, int]]:
        """Re-frame a detected entity; return (start, end) or None to drop it."""
        pass

    def should_process(self, entity_type: str) -> bool:
        """Tell whether this refiner handles the given entity type."""
        return entity_type == self.entity_type


class IPAddressRefiner(EntityRefiner):
    """Refiner for IP_ADDRESS entities: narrows the span to the IPv4 literal."""

    # One IPv4 octet constrained to 0-255.
    _OCTET = r"(?:25[0-5]|2[0-4][0-9]|1\d{2}|[1-9]?\d)"

    def __init__(self):
        super().__init__("IP_ADDRESS")
        # Dotted quad: three "octet." repetitions plus a final octet.
        self.ipv4_regex = re.compile(
            r"\b(?:" + self._OCTET + r"\.){3}" + self._OCTET + r"\b"
        )

    def refine(self, text: str, start: int, end: int) -> Optional[Tuple[int, int]]:
        """Return the exact span of the IPv4 address, or None when absent."""
        ent_text = text[start:end].strip()
        found = self.ipv4_regex.search(ent_text)
        if found is None:
            logger.warning(f"Invalid IP detected, skipping: '{ent_text}'")
            return None
        true_ip = found.group(0)
        offset = ent_text.find(true_ip)
        if offset == -1:
            logger.warning(f"IP regex match but cannot find substring position: '{ent_text}'")
            return None
        new_start = start + offset
        new_end = new_start + len(true_ip)
        logger.debug(f"Adjusted IP span: {start}-{end} => {new_start}-{new_end}")
        return (new_start, new_end)

View File

@@ -0,0 +1,76 @@
from typing import List, Optional, Tuple
from presidio_analyzer import RecognizerResult
from abc import ABC, abstractmethod
import logging
logger = logging.getLogger(__name__)


class EntityRefiner(ABC):
    """Base class for entity span refiners."""

    def __init__(self, entity_type: str):
        self.entity_type = entity_type

    @abstractmethod
    def refine(self, text: str, start: int, end: int) -> Optional[Tuple[int, int]]:
        """
        Re-frame a detected entity.

        Args:
            text: the full document text.
            start: start offset of the detected entity.
            end: end offset of the detected entity.

        Returns:
            (new_start, new_end), or None when the entity must be discarded.
        """
        pass

    def should_process(self, entity_type: str) -> bool:
        """Tell whether this refiner handles the given entity type."""
        return entity_type == self.entity_type


class LocationAddressRefiner(EntityRefiner):
    """
    Filters LOCATION entities that duplicate BE_ADDRESS/FR_ADDRESS spans.

    Positions are never modified; this refiner can only keep or drop a
    LOCATION. Because a refiner sees one entity at a time, cross-entity
    duplicates are ultimately handled by the global post-processing step —
    here we only drop obviously insignificant locations.
    """

    def __init__(self):
        super().__init__("LOCATION")  # only LOCATION entities are handled
        self.address_entities = {'BE_ADDRESS', 'FR_ADDRESS'}
        self.location_entity = 'LOCATION'
        # Cache of addresses seen so far (currently an unused placeholder).
        self._detected_addresses = []

    def refine(self, text: str, start: int, end: int) -> Optional[Tuple[int, int]]:
        """
        Keep the LOCATION unless it is too short or an insignificant token.

        Returns:
            (start, end) when the location is worth keeping, None otherwise.
        """
        snippet = text[start:end].strip().lower()
        if len(snippet) > 3 and snippet not in ('tel', 'fax', 'gsm'):
            logger.debug(f"Keeping location: '{snippet}'")
            return (start, end)
        logger.debug(f"Ignoring short/insignificant location: '{snippet}'")
        return None

    def should_process(self, entity_type: str) -> bool:
        """Process LOCATION entities only."""
        return entity_type == self.location_entity

View File

@@ -0,0 +1,39 @@
import re
from typing import Optional, Tuple
import logging
logger = logging.getLogger(__name__)
class WordBoundaryRefiner:
    """Widens an entity span so it covers complete alphanumeric words.

    Applies to every entity type. NOTE(review): this refiner returns None
    when the span is already word-aligned (meaning "no change"), whereas
    the EntityRefiner subclasses use None to mean "drop the entity" —
    confirm the pipeline distinguishes the two cases.
    """

    def __init__(self):
        # Sentinel type: this refiner applies to all entity types.
        self.entity_type = "ALL"

    def should_process(self, entity_type: str) -> bool:
        """Always process, regardless of entity type."""
        return True

    def refine(self, text: str, start: int, end: int) -> Optional[Tuple[int, int]]:
        """Return the widened (start, end) span, or None when unchanged."""
        try:
            new_start, new_end = start, end
            # Walk left over alphanumeric characters to the word start.
            while new_start and text[new_start - 1].isalnum():
                new_start -= 1
            # Walk right over alphanumeric characters to the word end.
            limit = len(text)
            while new_end < limit and text[new_end].isalnum():
                new_end += 1
            if (new_start, new_end) == (start, end):
                return None
            logger.debug(f"Extended entity boundaries from [{start}:{end}] to [{new_start}:{new_end}]")
            return (new_start, new_end)
        except Exception as e:
            logger.error(f"Error in WordBoundaryRefiner: {e}")
            return None