presidio modulaire

This commit is contained in:
nBiqoz
2025-09-07 12:29:08 +02:00
parent 85d95d05e5
commit c62e5b92d5
42 changed files with 1802 additions and 324 deletions

68
pipeline_manager.py Normal file
View File

@@ -0,0 +1,68 @@
from typing import List
from presidio_analyzer import RecognizerResult
from entity_refiners import EntityRefinerManager
from post_processors import DeduplicationProcessor, OverlapResolver
import logging
logger = logging.getLogger(__name__)
class AnalysisPipeline:
    """Post-process recognizer results through a fixed sequence of stages.

    The stages run in this order: allow-list filtering, per-entity span
    refinement, overlap resolution, and deduplication.
    """

    def __init__(self):
        """Create the collaborators used by the individual pipeline stages."""
        self.refiner_manager = EntityRefinerManager()
        self.overlap_resolver = OverlapResolver()
        self.deduplicator = DeduplicationProcessor()
        logger.info("🚀 Pipeline d'analyse initialisé")

    def process(self, text: str, results: List[RecognizerResult], allow_list_terms: List[str]) -> List[RecognizerResult]:
        """Run ``results`` through the complete pipeline.

        Args:
            text: The analysed text that the result offsets refer to.
            results: Raw recognizer results for ``text``.
            allow_list_terms: Terms whose exact (case-insensitive, stripped)
                matches must not be reported; may be empty or ``None``.

        Returns:
            The results remaining after filtering, refinement, overlap
            resolution and deduplication.
        """
        # 1. Allow-list filtering
        filtered_results = self._filter_allow_list(results, allow_list_terms, text)
        # 2. Individual refinement of each entity span
        refined_results = self._refine_results(text, filtered_results)
        # 3. Overlap resolution
        resolved_results = self.overlap_resolver.process(refined_results, text)
        # 4. Deduplication
        final_results = self.deduplicator.process(resolved_results, text)
        logger.info("🎯 Pipeline complet: %d -> %d entités", len(results), len(final_results))
        return final_results

    def _refine_results(self, text: str, results: List[RecognizerResult]) -> List[RecognizerResult]:
        """Return new results with refined spans, dropping rejected entities."""
        refined_results = []
        for result in results:
            refined_coords = self.refiner_manager.refine_entity(
                text,
                result.entity_type,
                result.start,
                result.end,
            )
            # The refiner signals "discard this entity" by returning None.
            if refined_coords is None:
                continue
            metadata = result.recognition_metadata
            refined_results.append(
                RecognizerResult(
                    entity_type=result.entity_type,
                    start=refined_coords[0],
                    end=refined_coords[1],
                    score=result.score,
                    # Carry over provenance so the refined result still says
                    # which recognizer produced it and why.
                    analysis_explanation=result.analysis_explanation,
                    # Shallow copy: the caller still owns the original result.
                    recognition_metadata=dict(metadata) if metadata is not None else None,
                )
            )
        return refined_results

    def _filter_allow_list(self, results: List[RecognizerResult], allow_list_terms: List[str], text: str) -> List[RecognizerResult]:
        """Drop results whose covered text is an allow-listed term.

        Comparison is done on the lower-cased, whitespace-stripped text.
        Blank allow-list terms are ignored. When there is nothing to filter
        against, ``results`` is returned unchanged (same object).
        """
        if not allow_list_terms:
            return results
        # Build the lookup set once; skip blank terms so they can never match
        # an empty or whitespace-only span.
        allowed = {term.lower().strip() for term in allow_list_terms if term and term.strip()}
        if not allowed:
            return results
        filtered_results = []
        for result in results:
            entity_text = text[result.start:result.end].lower().strip()
            if entity_text in allowed:
                logger.debug("🚫 Entité filtrée (allow-list): '%s'", entity_text)
            else:
                # Keep the entity: it is not in the allow-list.
                filtered_results.append(result)
        logger.info("🔍 Filtrage allow-list: %d -> %d entités", len(results), len(filtered_results))
        return filtered_results