# File: Presidio/post_processors/deduplication_processor.py
# Last modified: 2025-09-07 12:29:08 +02:00
# 66 lines, 2.7 KiB, Python

from typing import List
from presidio_analyzer import RecognizerResult
import logging
logger = logging.getLogger(__name__)
class DeduplicationProcessor:
    """Run a chain of deduplication rules over recognizer results.

    Each rule is an object exposing ``apply(results, text)`` that returns a
    new list of results. Rules are applied in the order they appear in
    ``self.rules``, each one receiving the output of the previous one.
    """

    def __init__(self):
        # Rules run in list order; append new rule instances here.
        self.rules = [
            LocationAddressRule(),
        ]
        logger.info("🔧 DeduplicationProcessor initialisé avec les règles de déduplication")

    def process(self, results: List[RecognizerResult], text: str) -> List[RecognizerResult]:
        """Apply every deduplication rule to ``results``.

        Args:
            results: Recognizer results to deduplicate. Any iterable is
                accepted; ``None`` is treated as "no results".
            text: The analysed text the result offsets refer to.

        Returns:
            A new list with the results that survived every rule. The
            caller's ``results`` object is never modified by this method
            itself.
        """
        # Materialise into a fresh list: this protects the caller's list from
        # in-place changes and lets tuples/generators/None be passed safely
        # (``results.copy()`` only worked for real lists).
        processed_results = [] if results is None else list(results)
        initial_count = len(processed_results)

        for rule in self.rules:
            processed_results = rule.apply(processed_results, text)

        # Lazy %-style arguments: the message is only formatted when INFO is
        # actually enabled. The emitted text is unchanged.
        logger.info(
            "🔧 DeduplicationProcessor: %d -> %d entités",
            initial_count,
            len(processed_results),
        )
        return processed_results
class LocationAddressRule:
    """Rule that removes LOCATION results duplicating an ADDRESS result.

    A LOCATION is dropped when its text is too short / a bare function word,
    or when its character span intersects the span of any ADDRESS result.
    ADDRESS results and results of every other entity type are always kept.
    """

    def __init__(self):
        # Lower-cased words that are never kept as a LOCATION on their own.
        self.insignificant_terms = {'le', 'la', 'les', 'de', 'du', 'des', 'à', 'au', 'aux'}

    def apply(self, results: List[RecognizerResult], text: str) -> List[RecognizerResult]:
        """Drop the LOCATION results that duplicate an ADDRESS.

        Args:
            results: Recognizer results to filter.
            text: The analysed text the result offsets refer to.

        Returns:
            A new list ordered as: all ADDRESS results, then the surviving
            LOCATION results, then every other result. Relative order inside
            each group follows the input order.
        """
        locations = [r for r in results if r.entity_type == 'LOCATION']
        addresses = [r for r in results if r.entity_type == 'ADDRESS']
        others = [r for r in results if r.entity_type not in ('LOCATION', 'ADDRESS')]

        filtered_locations = []
        for location in locations:
            if self._should_keep_location(location, addresses, text):
                filtered_locations.append(location)
            else:
                # Lazy %-style argument; emitted text is unchanged.
                logger.debug(
                    "🗑️ Suppression LOCATION dupliquée: '%s'",
                    text[location.start:location.end],
                )

        return addresses + filtered_locations + others

    def _should_keep_location(self, location: RecognizerResult, addresses: List[RecognizerResult], text: str) -> bool:
        """Return ``True`` when ``location`` must stay in the results.

        A location is rejected when its stripped, lower-cased text is at most
        three characters long, when that text is one of
        ``self.insignificant_terms``, or when its span intersects any of the
        given ``addresses``.
        """
        location_text = text[location.start:location.end].strip().lower()

        # Ignore non-significant terms (very short strings, function words).
        if len(location_text) <= 3 or location_text in self.insignificant_terms:
            return False

        # Reject as soon as one address covers part of this location.
        return not any(
            self._is_overlapping_or_contained(location, address, text)
            for address in addresses
        )

    def _is_overlapping_or_contained(self, loc: RecognizerResult, addr: RecognizerResult, text: str) -> bool:
        """Return ``True`` when the span of ``loc`` intersects the span of ``addr``.

        The comparison is made on character offsets (``start`` inclusive,
        ``end`` exclusive, as used for slicing ``text``), not on the matched
        text. A text comparison would wrongly flag a location that merely
        shares its wording with an address located elsewhere in the document
        (or is a substring of one of its words), and would miss a location
        that only partially overlaps the address.

        ``text`` is unused and kept only so the signature stays compatible
        with existing callers.
        """
        return loc.start < addr.end and addr.start < loc.end