new full
This commit is contained in:
59
post_processors/cleanup_processor.py
Normal file
59
post_processors/cleanup_processor.py
Normal file
@@ -0,0 +1,59 @@
|
||||
from typing import List
|
||||
from presidio_analyzer import RecognizerResult
|
||||
import re
|
||||
|
||||
class CleanupProcessor:
    """Post-processor that cleans recognizer results and removes overlapping detections."""

    def __init__(self):
        # Human-readable identifier of this post-processor.
        self.name = "CleanupProcessor"

    def process(self, results: List[RecognizerResult]) -> List[RecognizerResult]:
        """Return the valid, mutually non-overlapping results, ordered by start offset.

        Results rejected by ``_is_valid_result`` are discarded first, so that a
        detection which would be dropped anyway can never suppress a valid one.
        The remaining results are then considered from highest to lowest score,
        and a result is kept only if it does not overlap a result that was
        already kept. Processing by score (instead of by position) guarantees
        that a result is only ever discarded because of an overlapping result
        that is actually present in the output.

        Args:
            results: Recognizer results to clean. A falsy value (empty list or
                ``None``) is returned unchanged.

        Returns:
            A new list holding the retained results, sorted by ``start``.
        """
        if not results:
            return results

        candidates = [result for result in results if self._is_valid_result(result)]

        # Highest score first; on equal scores the earliest start wins. The sort
        # is stable, so results with identical keys keep their input order.
        ranked = sorted(candidates, key=lambda result: (-result.score, result.start))

        kept: List[RecognizerResult] = []
        for candidate in ranked:
            if not any(self._overlaps(candidate, other) for other in kept):
                kept.append(candidate)

        kept.sort(key=lambda result: result.start)
        return kept

    def _overlaps(self, result1: RecognizerResult, result2: RecognizerResult) -> bool:
        """Return True when the two spans overlap.

        Spans are compared as half-open ranges: two spans that merely touch
        (one ends exactly where the other starts) do not overlap.
        """
        return not (result1.end <= result2.start or result2.end <= result1.start)

    def _is_valid_result(self, result: RecognizerResult) -> bool:
        """Return True when the result is long enough to be kept.

        Any result shorter than 2 characters is rejected; a ``PERSON_NAME``
        result must span at least 4 characters.
        """
        length = result.end - result.start

        # Minimum length for every entity type.
        if length < 2:
            return False

        # Stricter minimum for person names, to avoid matches on isolated characters.
        if result.entity_type == "PERSON_NAME" and length < 4:
            return False

        return True
|
||||
@@ -20,14 +20,26 @@ class OverlapResolver:
|
||||
'BE_ENTERPRISE_NUMBER': 88,
|
||||
'PHONE_NUMBER': 85,
|
||||
'BE_PHONE_NUMBER': 85,
|
||||
'TELEPHONE': 84,
|
||||
'TELEPHONE_FRANCAIS': 86,
|
||||
'IP_ADDRESS': 82,
|
||||
'ADRESSE_FRANCAISE': 78, # Priorité plus élevée pour adresses françaises spécifiques
|
||||
'BE_ADDRESS': 75,
|
||||
'FR_ADDRESS': 75,
|
||||
'ORGANIZATION': 65,
|
||||
'LOCATION': 60,
|
||||
'ADRESSE': 70, # Adresse générique avec priorité plus faible
|
||||
'ORGANISATION': 65,
|
||||
'LOCATION': 60, # Priorité plus faible que les adresses
|
||||
'PERSON': 50,
|
||||
'PERSON_NAME': 45,
|
||||
'NRP': 40,
|
||||
'URL': 35
|
||||
'BE_PROFESSIONAL_ID': 40,
|
||||
'FR_CIVILITY_TITLE': 85,
|
||||
'FR_REGULATED_PROFESSION': 80,
|
||||
'CARTE_IDENTITE_FRANCAISE': 78,
|
||||
'PERMIS_CONDUIRE_FRANCAIS': 76,
|
||||
'PASSEPORT_FRANCAIS': 77,
|
||||
'URL': 35,
|
||||
'MARKET_SHARE': 35
|
||||
}
|
||||
|
||||
# Patterns pour identifier les organisations
|
||||
@@ -112,12 +124,12 @@ class OverlapResolver:
|
||||
# Correction 1: PERSON -> ORGANIZATION pour les noms d'entreprise
|
||||
if result.entity_type == 'PERSON' and self._is_organization_name(entity_text):
|
||||
corrected_result = RecognizerResult(
|
||||
entity_type='ORGANIZATION',
|
||||
entity_type='ORGANISATION',
|
||||
start=result.start,
|
||||
end=result.end,
|
||||
score=result.score + 0.1 # Bonus de confiance
|
||||
)
|
||||
logger.debug(f"🔄 Correction PERSON -> ORGANIZATION: '{entity_text}'")
|
||||
logger.debug(f"🔄 Correction PERSON -> ORGANISATION: '{entity_text}'")
|
||||
corrected_results.append(corrected_result)
|
||||
|
||||
# Correction 2: Séparer IP des adresses physiques
|
||||
|
||||
Reference in New Issue
Block a user