presidio modulaire

This commit is contained in:
nBiqoz
2025-09-07 12:29:08 +02:00
parent 85d95d05e5
commit c62e5b92d5
42 changed files with 1802 additions and 324 deletions

app.py (316 changed lines)

@@ -1,59 +1,83 @@
import os
import re
import logging
import tempfile
import yaml
from flask import Flask, request, jsonify, make_response
from presidio_analyzer import AnalyzerEngineProvider
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig
from config_loader import ConfigLoader
from entity_refiners import EntityRefinerManager
from pipeline_manager import AnalysisPipeline
# Logger initialization
logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

app = Flask(__name__)
# Engine loading
refiner_manager = EntityRefinerManager()
analyzer = None
allow_list_terms = set()

try:
    logger.info("--- Presidio Analyzer Service Starting (Modular Architecture) ---")
    config_loader = ConfigLoader()
    try:
        config = config_loader.load_config("main.yaml")
        logger.info("✅ Modular configuration loaded successfully")
        allow_list_terms = set(term.lower().strip() for term in config.get('allow_list', []))
        logger.info(f"✅ Allow list loaded with {len(allow_list_terms)} terms")
        recognizers_count = len(config.get('recognizer_registry', {}).get('recognizers', []))
        logger.info(f"📊 Recognizers loaded: {recognizers_count}")

        # AnalyzerEngineProvider expects a config *file*, so the merged config
        # dict is written to a temporary YAML file before creating the engine.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False, encoding='utf-8') as tmp_file:
            yaml.dump(config, tmp_file, default_flow_style=False, allow_unicode=True)
            temp_config_path = tmp_file.name
        with open(temp_config_path, 'r', encoding='utf-8') as f:
            temp_content = f.read()
        logger.info(f"🔍 Temporary config file content (first 1000 chars):\n{temp_content[:1000]}")
        if 'nlp_configuration' in config:
            logger.info("✅ nlp_configuration found")
        else:
            logger.warning("❌ nlp_configuration MISSING from the final config")
        provider = AnalyzerEngineProvider(analyzer_engine_conf_file=temp_config_path)
        analyzer = provider.create_engine()
        os.unlink(temp_config_path)
    except Exception as e:
        logger.error(f"❌ Error with the modular config: {e}")
        logger.warning("🔄 Falling back to default.yaml")
        CONFIG_FILE_PATH = os.environ.get("PRESIDIO_ANALYZER_CONFIG_FILE", "conf/default.yaml")
        provider = AnalyzerEngineProvider(analyzer_engine_conf_file=CONFIG_FILE_PATH)
        analyzer = provider.create_engine()
    logger.info(f"Analyzer ready. Languages: {analyzer.supported_languages}")
except Exception as e:
    logger.exception("Error during AnalyzerEngine initialization.")
    analyzer = None
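# Expected shape of conf/main.yaml — a sketch inferred from the keys read
# above; the actual schema is defined by ConfigLoader (config_loader.py, not
# shown in this hunk), and the model name below is purely illustrative:
#
#   nlp_configuration:
#     nlp_engine_name: spacy
#     models:
#       - lang_code: fr
#         model_name: fr_core_news_md
#   recognizer_registry:
#     recognizers:
#       - ...
#   allow_list:
#     - "some term"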
# Temporary regex checks, applied directly in Python for now
IBAN_REGEX = re.compile(r"\b[A-Z]{2}[0-9]{2}(?:\s[0-9]{4}){3}\b", re.IGNORECASE)
IPV4_REGEX = re.compile(
    r"\b(?:(?:25[0-5]|2[0-4][0-9]|1\d{2}|[1-9]?\d)\.){3}"
    r"(?:25[0-5]|2[0-4][0-9]|1\d{2}|[1-9]?\d)\b"
)

# Temporary overlay list of labels/phrases to exclude from anonymization
IGNORE_LABELS = {
    "témoins",
    "témoins clés",
    "coordonnées",
    "coordonnées bancaires",
    "contexte financier",
    "données sensibles",
    "contexte",
    "montrent",
    "montrent des",
    "montrent des irrégularités",
    "bénéficiaire",
}
def normalize_label(text: str) -> str:
    # Lowercase, trim, and strip punctuation so labels compare consistently.
    return re.sub(r'[^\w\s]', '', text.strip().lower())

pipeline = AnalysisPipeline()
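# A minimal sketch of the AnalysisPipeline contract assumed here — the real
# implementation lives in pipeline_manager.py, outside this hunk. process()
# takes the original text, the raw analyzer results, and the allow-list
# terms, and returns the filtered/refined results, roughly:
#
#   def process(self, text, raw_results, allow_terms):
#       return [r for r in raw_results
#               if normalize_label(text[r.start:r.end]) not in allow_terms]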
@app.route("/analyze", methods=["POST"])
def analyze_text():
if not analyzer:
@@ -67,62 +91,182 @@ def analyze_text():
        if not text_to_analyze:
            return jsonify({"error": "text field is missing or empty"}), 400

        # Raw analysis
        raw_results = analyzer.analyze(text=text_to_analyze, language=language)
        # Full modular pipeline: allow-list filtering, span refinement, cleanup
        final_results = pipeline.process(text_to_analyze, raw_results, allow_list_terms)
        response_data = [res.to_dict() for res in final_results]
        return make_response(jsonify(response_data), 200)
    except Exception as e:
        logger.exception("Error processing analysis")
        return jsonify({"error": str(e)}), 500
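# Example exchange (illustrative — detected entities depend on the active
# recognizers):
#
#   POST /analyze
#   {"text": "Contact: jean@example.com", "language": "fr"}
#   =>
#   [{"entity_type": "EMAIL_ADDRESS", "start": 9, "end": 25, "score": 1.0}, ...]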
@app.route("/health", methods=["GET"])
def health_check():
if analyzer:
return jsonify({
"status": "healthy",
"languages": analyzer.supported_languages,
"version": "2.0.0"
}), 200
else:
return jsonify({"status": "unhealthy", "error": "Analyzer not initialized"}), 503
def load_replacements():
    """Loads the anonymization replacement operators from YAML."""
    try:
        config_path = "conf/anonymization/replacements.yaml"
        if not os.path.exists(config_path):
            logger.warning(f"❌ Configuration file not found: {config_path}")
            return {}
        with open(config_path, "r", encoding="utf-8") as f:
            config = yaml.safe_load(f)
        if not config:
            logger.warning("❌ Configuration file is empty")
            return {}
        anonymizer_config = config.get("anonymizer_config", {})
        replacements = anonymizer_config.get("replacements", {})
        if not replacements:
            logger.warning("❌ No replacements found in the configuration")
            return {}
        operators = {}
        for entity_type, replacement_value in replacements.items():
            try:
                operators[entity_type] = OperatorConfig("replace", {"new_value": replacement_value})
            except Exception as e:
                logger.error(f"❌ Failed to create operator for {entity_type}: {e}")
                continue
        logger.info(f"✅ Loaded {len(operators)} replacement operators from config")
        return operators
    except Exception as e:
        logger.error(f"❌ Failed to load replacements config: {e}")
        return {}
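# Expected shape of conf/anonymization/replacements.yaml (only the
# anonymizer_config.replacements keys are read; the entity names below are
# illustrative):
#
#   anonymizer_config:
#     replacements:
#       PERSON: "<PERSON>"
#       EMAIL_ADDRESS: "<EMAIL>"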
# Anonymizer and operator initialization
try:
    anonymizer = AnonymizerEngine()
    logger.info("✅ Anonymizer engine initialized successfully")
    replacement_operators = load_replacements()
    if replacement_operators:
        logger.info(f"✅ Loaded {len(replacement_operators)} custom replacement operators")
    else:
        logger.warning("⚠️ No replacement operators loaded, falling back to defaults")
        replacement_operators = {}
except Exception as e:
    logger.error(f"❌ Anonymizer initialization failed: {e}")
    anonymizer = None
    replacement_operators = {}
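# Note: when the operators mapping is empty, AnonymizerEngine falls back to
# its default "replace" operator, which substitutes each detected span with
# its entity-type placeholder (e.g. "<PERSON>").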
@app.route("/anonymize", methods=["POST"])
def anonymize_text():
logger.error("🚨 ENDPOINT /anonymize APPELÉ")
global anonymizer, replacement_operators
if anonymizer is None:
return jsonify({"error": "Anonymizer not initialized"}), 500
if not replacement_operators:
logger.warning("⚠️ replacement_operators non défini, rechargement...")
replacement_operators = load_replacements()
logger.info(f"🔍 Opérateurs disponibles: {list(replacement_operators.keys())}")
try:
data = request.get_json(force=True)
text_to_anonymize = data.get("text", "")
language = data.get("language", "fr")
mode = data.get("mode", "pii")
if not text_to_anonymize:
return jsonify({"error": "No text provided"}), 400
logger.info(f"🔍 Texte à anonymiser: '{text_to_anonymize}'")
entities_to_detect = get_entities_by_mode(mode) if 'get_entities_by_mode' in globals() else None
analyzer_results = analyzer.analyze(
text=text_to_anonymize,
language=language,
entities=entities_to_detect
)
logger.info(f"🔍 Entités détectées: {[(r.entity_type, text_to_anonymize[r.start:r.end], r.score) for r in analyzer_results]}")
filtered_results = []
for res in analyzer_results:
ent_text = text_to_anonymize[res.start:res.end].strip()
ent_text_norm = normalize_label(ent_text)
logger.info(f"🔍 Traitement entité: {res.entity_type} = '{ent_text}' (score: {res.score})")
logger.info(f"🔍 Allow list terms: {allow_list_terms}")
# Vérification améliorée de la allow list
ent_text_clean = re.sub(r'[^\w]', '', ent_text.strip().lower())
logger.info(f"🔍 Texte nettoyé: '{ent_text_clean}'")
# Vérifier si le texte correspond exactement ou commence par un terme de la allow list
is_allowed = any(ent_text_clean == term or ent_text_clean.startswith(term) for term in allow_list_terms)
if is_allowed:
logger.info(f"✅ Entité '{ent_text}' ignorée (dans allow list)")
continue
refined_positions = refiner_manager.refine_entity(text_to_anonymize, res.entity_type, res.start, res.end)
if refined_positions is None:
logger.info(f"❌ Entité {res.entity_type} supprimée par le refiner")
continue
res.start, res.end = refined_positions
filtered_results.append(res)
logger.info(f"✅ Entité {res.entity_type} conservée après refinement")
logger.info(f"🔍 Entités finales pour anonymisation: {[(r.entity_type, text_to_anonymize[r.start:r.end]) for r in filtered_results]}")
operators_to_use = replacement_operators if replacement_operators else {}
logger.info(f"🔍 Opérateurs utilisés: {list(operators_to_use.keys())}")
anonymized_result = anonymizer.anonymize(
text=text_to_anonymize,
analyzer_results=filtered_results,
operators=operators_to_use
)
logger.info(f"🔍 Résultat anonymisation: '{anonymized_result.text}'")
return jsonify({
"original_text": text_to_anonymize,
"anonymized_text": anonymized_result.text,
"entities_found": [
{
"entity_type": result.entity_type,
"start": result.start,
"end": result.end,
"score": result.score
} for result in filtered_results
],
"mode": mode
})
except Exception as e:
logger.error(f"Error during anonymization: {e}")
return jsonify({"error": str(e)}), 500
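# Example exchange (illustrative — replacement tokens come from
# replacements.yaml):
#
#   POST /anonymize
#   {"text": "Jean Dupont habite à Paris", "language": "fr", "mode": "pii"}
#   =>
#   {"anonymized_text": "<PERSON> habite à <LOCATION>", "mode": "pii", ...}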
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5001)