presidio modulaire
This commit is contained in:
316
app.py
316
app.py
@@ -1,59 +1,83 @@
|
||||
import os
|
||||
import re
|
||||
import logging
|
||||
import re
|
||||
import yaml
|
||||
from flask import Flask, request, jsonify, make_response
|
||||
|
||||
from presidio_analyzer import AnalyzerEngineProvider
|
||||
from config_loader import ConfigLoader
|
||||
from presidio_anonymizer import AnonymizerEngine
|
||||
from presidio_anonymizer.entities import OperatorConfig
|
||||
from entity_refiners import EntityRefinerManager
|
||||
from pipeline_manager import AnalysisPipeline
|
||||
|
||||
# Logger initialisation: timestamped, INFO-level root configuration.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Flask application object that the route decorators below attach to.
app = Flask(__name__)
|
||||
|
||||
# Engine loading.
#
# Builds the Presidio analyzer from the modular configuration ("main.yaml"
# resolved by ConfigLoader). If anything in that path fails, falls back to the
# single-file configuration named by PRESIDIO_ANALYZER_CONFIG_FILE (default
# "conf/default.yaml"). If the fallback fails too, `analyzer` stays None and
# the endpoints report the service as not initialized.
import tempfile  # only needed by this bootstrap section

refiner_manager = EntityRefinerManager()
analyzer = None
allow_list_terms = set()

try:
    logger.info("--- Presidio Analyzer Service Starting (Architecture Modulaire) ---")
    config_loader = ConfigLoader()
    try:
        config = config_loader.load_config("main.yaml")
        logger.info("✅ Configuration modulaire chargée avec succès")

        # `or []` / `or {}`: a YAML key present without a value loads as None.
        allow_list_terms = {
            term.lower().strip() for term in (config.get('allow_list') or [])
        }
        logger.info(f"✅ Allow list chargée avec {len(allow_list_terms)} termes")

        registry = config.get('recognizer_registry') or {}
        recognizers_count = len(registry.get('recognizers') or [])
        logger.info(f"📊 Nombre de recognizers chargés: {recognizers_count}")

        # AnalyzerEngineProvider is given a file path, so the merged
        # configuration is written to a temporary YAML file first.
        tmp_file = tempfile.NamedTemporaryFile(
            mode='w', suffix='.yaml', delete=False, encoding='utf-8'
        )
        temp_config_path = tmp_file.name
        try:
            with tmp_file:
                yaml.dump(config, tmp_file, default_flow_style=False, allow_unicode=True)

            # Diagnostic dump of the generated file; only read it back when
            # debug logging is actually enabled.
            if logger.isEnabledFor(logging.DEBUG):
                with open(temp_config_path, 'r', encoding='utf-8') as f:
                    temp_content = f.read()
                logger.debug(f"🔍 Contenu du fichier temporaire COMPLET:\n{temp_content[:1000]}")

            if 'nlp_configuration' in config:
                logger.info("✅ nlp_configuration trouvée")
            else:
                logger.warning("❌ nlp_configuration MANQUANTE dans la config finale")

            provider = AnalyzerEngineProvider(analyzer_engine_conf_file=temp_config_path)
            analyzer = provider.create_engine()
        finally:
            # delete=False means nobody else removes the file: clean it up
            # whether or not the engine could be created.
            os.unlink(temp_config_path)

    except Exception as e:
        logger.error(f"❌ Erreur avec la config modulaire: {e}")
        logger.warning("🔄 Fallback vers default.yaml")
        CONFIG_FILE_PATH = os.environ.get("PRESIDIO_ANALYZER_CONFIG_FILE", "conf/default.yaml")
        provider = AnalyzerEngineProvider(analyzer_engine_conf_file=CONFIG_FILE_PATH)
        analyzer = provider.create_engine()

    logger.info(f"Analyzer ready. Languages: {analyzer.supported_languages}")

except Exception as e:
    logger.exception("Error during AnalyzerEngine initialization.")
    analyzer = None
|
||||
|
||||
|
||||
# Temporary: regex checks done directly in Python.

# Two letters, two digits, then exactly three space-separated groups of four
# digits; case-insensitive.
IBAN_REGEX = re.compile(
    r"\b[A-Z]{2}[0-9]{2}(?:\s[0-9]{4}){3}\b",
    re.IGNORECASE,
)

# Dotted-quad IPv4 address with each octet limited to 0-255.
IPV4_REGEX = re.compile(
    r"\b(?:(?:25[0-5]|2[0-4][0-9]|1\d{2}|[1-9]?\d)\.){3}"
    r"(?:25[0-5]|2[0-4][0-9]|1\d{2}|[1-9]?\d)\b"
)

# Temporary overlay: labels/phrases that must be excluded from anonymization.
IGNORE_LABELS = {
    "témoins", "témoins clés",
    "coordonnées", "coordonnées bancaires",
    "contexte", "contexte financier",
    "données sensibles",
    "montrent", "montrent des", "montrent des irrégularités",
    "bénéficiaire",
}
|
||||
|
||||
def normalize_label(text: str) -> str:
    """Return *text* in the canonical form used for label lookups.

    The text is stripped, lower-cased and every character that is neither a
    word character nor whitespace is removed. The result is stripped again so
    that whitespace left behind by removed punctuation (e.g. ``"contexte :"``)
    does not defeat an exact-match lookup.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text.strip().lower())
    return without_punctuation.strip()
|
||||
|
||||
|
||||
# Module-level analysis pipeline instance, created once at import time.
pipeline = AnalysisPipeline()
|
||||
|
||||
# Analysis endpoint: POST /analyze
|
||||
@app.route("/analyze", methods=["POST"])
|
||||
def analyze_text():
|
||||
if not analyzer:
|
||||
@@ -67,62 +91,182 @@ def analyze_text():
|
||||
if not text_to_analyze:
|
||||
return jsonify({"error": "text field is missing or empty"}), 400
|
||||
|
||||
results = analyzer.analyze(text=text_to_analyze, language=language)
|
||||
|
||||
filtered_results = []
|
||||
for res in results:
|
||||
ent_text = text_to_analyze[res.start:res.end].strip()
|
||||
ent_text_norm = normalize_label(ent_text)
|
||||
|
||||
if ent_text_norm in IGNORE_LABELS:
|
||||
logger.debug(f"Skipping anonymization of label: '{ent_text}'")
|
||||
continue
|
||||
|
||||
# Recadrage IBAN
|
||||
|
||||
if res.entity_type == "IBAN":
|
||||
match = IBAN_REGEX.search(ent_text)
|
||||
if match:
|
||||
true_iban = match.group(0)
|
||||
start_offset = ent_text.find(true_iban)
|
||||
if start_offset != -1:
|
||||
old_start, old_end = res.start, res.end
|
||||
res.start += start_offset
|
||||
res.end = res.start + len(true_iban)
|
||||
logger.debug(f"Adjusted IBAN span: {old_start}-{old_end} => {res.start}-{res.end}")
|
||||
else:
|
||||
logger.warning(f"IBAN regex match but cannot find substring position: '{ent_text}'")
|
||||
else:
|
||||
logger.warning(f"Invalid IBAN detected, skipping: '{ent_text}'")
|
||||
continue
|
||||
|
||||
# Recadrage IP_ADDRESS
|
||||
|
||||
if res.entity_type == "IP_ADDRESS":
|
||||
match = IPV4_REGEX.search(ent_text)
|
||||
if match:
|
||||
true_ip = match.group(0)
|
||||
start_offset = ent_text.find(true_ip)
|
||||
if start_offset != -1:
|
||||
old_start, old_end = res.start, res.end
|
||||
res.start += start_offset
|
||||
res.end = res.start + len(true_ip)
|
||||
logger.debug(f"Adjusted IP span: {old_start}-{old_end} => {res.start}-{res.end}")
|
||||
else:
|
||||
logger.warning(f"IP regex match but cannot find substring position: '{ent_text}'")
|
||||
else:
|
||||
logger.warning(f"Invalid IP detected, skipping: '{ent_text}'")
|
||||
continue
|
||||
|
||||
filtered_results.append(res)
|
||||
|
||||
# Résultat nettoyé
|
||||
response_data = [res.to_dict() for res in filtered_results]
|
||||
# Analyse brute
|
||||
raw_results = analyzer.analyze(text=text_to_analyze, language=language)
|
||||
|
||||
# Pipeline modulaire complet
|
||||
final_results = pipeline.process(text_to_analyze, raw_results, allow_list_terms)
|
||||
|
||||
response_data = [res.to_dict() for res in final_results]
|
||||
return make_response(jsonify(response_data), 200)
|
||||
|
||||
except Exception as e:
|
||||
logger.exception("Error processing analysis")
|
||||
return jsonify({"error": str(e)}), 500
|
||||
|
||||
|
||||
@app.route("/health", methods=["GET"])
def health_check():
    """Report whether the analyzer engine is available.

    Returns:
        A JSON body with HTTP 200 listing the analyzer's supported languages
        and the service version when the analyzer exists, otherwise a JSON
        error body with HTTP 503.
    """
    if not analyzer:
        return jsonify({"status": "unhealthy", "error": "Analyzer not initialized"}), 503

    payload = {
        "status": "healthy",
        "languages": analyzer.supported_languages,
        "version": "2.0.0",
    }
    return jsonify(payload), 200
|
||||
|
||||
|
||||
def load_replacements(config_path="conf/anonymization/replacements.yaml"):
    """Load the anonymization replacement operators from a YAML file.

    Reads the ``anonymizer_config.replacements`` mapping (entity type ->
    replacement text) and builds one ``"replace"`` ``OperatorConfig`` per
    entry.

    Args:
        config_path: Path of the YAML file to read. Defaults to the service's
            standard replacements file.

    Returns:
        A dict mapping entity type to ``OperatorConfig``. An empty dict is
        returned when the file is missing, empty, contains no replacements,
        or cannot be read or parsed; this function never raises.
    """
    try:
        if not os.path.exists(config_path):
            logger.warning(f"❌ Fichier de configuration non trouvé: {config_path}")
            return {}

        with open(config_path, "r", encoding="utf-8") as f:
            config = yaml.safe_load(f)

        if not config:
            logger.warning("❌ Fichier de configuration vide")
            return {}

        # `or {}`: a key that is present but has no value loads as None, which
        # should be reported as "no replacements" rather than as a crash.
        anonymizer_config = config.get("anonymizer_config") or {}
        replacements = anonymizer_config.get("replacements") or {}

        if not replacements:
            logger.warning("❌ Aucun remplacement trouvé dans la configuration")
            return {}

        operators = {}
        for entity_type, replacement_value in replacements.items():
            # One bad entry must not discard the remaining operators.
            try:
                operators[entity_type] = OperatorConfig("replace", {"new_value": replacement_value})
            except Exception as e:
                logger.error(f"❌ Erreur lors création opérateur {entity_type}: {e}")

        logger.info(f"✅ Loaded {len(operators)} replacement operators from config")
        return operators

    except Exception as e:
        # Best-effort loader: log with traceback and fall back to "no operators".
        logger.exception(f"❌ Failed to load replacements config: {e}")
        return {}
|
||||
|
||||
|
||||
# Anonymizer engine and replacement operators initialisation.
# On any failure the engine is left as None and no operators are registered.
try:
    anonymizer = AnonymizerEngine()
    logger.info("✅ Anonymizer engine initialized successfully")

    replacement_operators = load_replacements()
    if not replacement_operators:
        logger.warning("⚠️ Aucun opérateur remplacement chargé, fallback par défaut")
        replacement_operators = {}
    else:
        logger.info(f"✅ Loaded {len(replacement_operators)} custom replacement operators")

except Exception as e:
    logger.error(f"❌ Anonymizer initialization failed: {e}")
    anonymizer = None
    replacement_operators = {}
|
||||
|
||||
|
||||
def _normalize_allow_term(text):
    """Lower-case *text* and drop every non-word character (spaces included)."""
    return re.sub(r'[^\w]', '', text.strip().lower())


def _is_allow_listed(entity_text, normalized_terms):
    """Tell whether the normalized entity text equals or starts with a term.

    *normalized_terms* is a tuple of non-empty terms already passed through
    ``_normalize_allow_term``. ``str.startswith`` covers the equality case
    and returns False for an empty tuple.
    """
    return _normalize_allow_term(entity_text).startswith(normalized_terms)


@app.route("/anonymize", methods=["POST"])
def anonymize_text():
    """Detect entities in the posted text and replace them.

    Expects a JSON object with ``text`` (required), ``language`` (default
    ``"fr"``) and ``mode`` (default ``"pii"``). Detected entities are dropped
    when they match the allow list or when the refiner rejects them; the
    remaining ones are anonymized with the configured replacement operators.

    Returns:
        HTTP 200 with ``original_text``, ``anonymized_text``,
        ``entities_found`` and ``mode``; HTTP 400 when the body is not a JSON
        object or carries no text; HTTP 500 when an engine is not initialized
        or processing fails.
    """
    global replacement_operators

    logger.info("🚨 ENDPOINT /anonymize APPELÉ")

    if anonymizer is None:
        return jsonify({"error": "Anonymizer not initialized"}), 500

    # Without this guard a failed analyzer start-up surfaces as an opaque
    # AttributeError on None further down.
    if analyzer is None:
        return jsonify({"error": "Analyzer not initialized"}), 500

    if not replacement_operators:
        logger.warning("⚠️ replacement_operators non défini, rechargement...")
        replacement_operators = load_replacements()

    logger.info(f"🔍 Opérateurs disponibles: {list(replacement_operators.keys())}")

    try:
        # silent=True: malformed JSON yields None instead of raising, so a
        # client error is answered with 400 rather than 500.
        data = request.get_json(force=True, silent=True)
        if not isinstance(data, dict):
            return jsonify({"error": "Request body must be a JSON object"}), 400

        text_to_anonymize = data.get("text", "")
        language = data.get("language", "fr")
        mode = data.get("mode", "pii")

        if not text_to_anonymize or not isinstance(text_to_anonymize, str):
            return jsonify({"error": "No text provided"}), 400

        # Everything that echoes raw input is logged at DEBUG only: this
        # service exists to keep that data out of downstream systems.
        logger.debug(f"🔍 Texte à anonymiser: '{text_to_anonymize}'")

        # get_entities_by_mode is optional; when the module does not define
        # it, no entity filter is applied.
        entities_to_detect = get_entities_by_mode(mode) if 'get_entities_by_mode' in globals() else None

        analyzer_results = analyzer.analyze(
            text=text_to_anonymize,
            language=language,
            entities=entities_to_detect
        )

        logger.debug(f"🔍 Entités détectées: {[(r.entity_type, text_to_anonymize[r.start:r.end], r.score) for r in analyzer_results]}")

        # Normalize the allow list once, the same way entity text is
        # normalized, and drop empty terms (startswith("") matches anything).
        allowed_prefixes = tuple(
            cleaned
            for cleaned in (_normalize_allow_term(term) for term in allow_list_terms)
            if cleaned
        )
        logger.debug(f"🔍 Allow list terms: {allow_list_terms}")

        filtered_results = []
        for res in analyzer_results:
            ent_text = text_to_anonymize[res.start:res.end].strip()
            logger.debug(f"🔍 Traitement entité: {res.entity_type} = '{ent_text}' (score: {res.score})")

            if _is_allow_listed(ent_text, allowed_prefixes):
                logger.debug(f"✅ Entité '{ent_text}' ignorée (dans allow list)")
                continue

            refined_positions = refiner_manager.refine_entity(text_to_anonymize, res.entity_type, res.start, res.end)
            if refined_positions is None:
                logger.info(f"❌ Entité {res.entity_type} supprimée par le refiner")
                continue

            res.start, res.end = refined_positions
            filtered_results.append(res)
            logger.info(f"✅ Entité {res.entity_type} conservée après refinement")

        logger.debug(f"🔍 Entités finales pour anonymisation: {[(r.entity_type, text_to_anonymize[r.start:r.end]) for r in filtered_results]}")

        operators_to_use = replacement_operators if replacement_operators else {}
        logger.info(f"🔍 Opérateurs utilisés: {list(operators_to_use.keys())}")

        anonymized_result = anonymizer.anonymize(
            text=text_to_anonymize,
            analyzer_results=filtered_results,
            operators=operators_to_use
        )

        logger.debug(f"🔍 Résultat anonymisation: '{anonymized_result.text}'")

        return jsonify({
            "original_text": text_to_anonymize,
            "anonymized_text": anonymized_result.text,
            "entities_found": [
                {
                    "entity_type": result.entity_type,
                    "start": result.start,
                    "end": result.end,
                    "score": result.score
                } for result in filtered_results
            ],
            "mode": mode
        })

    except Exception as e:
        # logger.exception keeps the traceback.
        logger.exception(f"Error during anonymization: {e}")
        return jsonify({"error": str(e)}), 500
|
||||
|
||||
|
||||
# Script entry point: serve the Flask app on all interfaces, port 5001.
if __name__ == "__main__":
    app.run(port=5001, host="0.0.0.0")
|
||||
|
||||
Reference in New Issue
Block a user