From 7cf996e08b59a8bcf453b2d435d9ca1db0aa8b1a Mon Sep 17 00:00:00 2001
From: nacim
Date: Sun, 3 Aug 2025 20:05:01 +0000
Subject: [PATCH] Actualiser app.py

---
Review notes (this section is ignored by `git am`):
- Line breaks, indentation and blank lines were reconstructed from a
  whitespace-collapsed copy of this patch. Context lines may therefore need
  `git apply --ignore-whitespace`, and the blob id on the `index` line no
  longer matches the edited `+` lines.
- Fixed: `AnalyzerEngineProvider` was still used after its import had been
  dropped (NameError at startup, swallowed by the broad `except`, leaving
  both endpoints answering 500).
- Fixed: `AnalyzerEngineProvider` is configured through
  `analyzer_engine_conf_file` (a path), and `AnonymizerEngine()` takes no
  constructor argument.
- Fixed: an empty YAML file made `config.get(...)` fail on `None`; the file
  is now read as UTF-8.
- Open point: `/anonymize` does not apply the IBAN / IP re-spanning nor the
  IGNORE_LABELS filtering done by `/analyze`; share that logic in a helper.
- Open point: `anonymizer_config` is loaded but not yet mapped to the
  `operators` argument of `AnonymizerEngine.anonymize()`.

 app.py | 110 ++++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 85 insertions(+), 25 deletions(-)

diff --git a/app.py b/app.py
index dd16485..83e0f29 100644
--- a/app.py
+++ b/app.py
@@ -1,9 +1,14 @@
 import os
 import re
 import logging
+import yaml  # Needed to read the service configuration file manually
+
 
 from flask import Flask, request, jsonify, make_response
-from presidio_analyzer import AnalyzerEngineProvider
+# Presidio engines: the analyzer detects entities, the anonymizer replaces them
+from presidio_analyzer import AnalyzerEngine, AnalyzerEngineProvider, RecognizerResult
+from presidio_anonymizer import AnonymizerEngine
+from presidio_analyzer.nlp_engine import NlpEngineProvider
 
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
 
@@ -11,28 +16,50 @@ logger = logging.getLogger(__name__)
 
 app = Flask(__name__)
 
-# Chargement du moteur Presidio via Provider
+# --- Combined initialisation of the Analyzer and the Anonymizer ---
 analyzer = None
-try:
-    logger.info("--- Presidio Analyzer Service Starting ---")
-    CONFIG_FILE_PATH = os.environ.get("PRESIDIO_ANALYZER_CONFIG_FILE", "conf/default.yaml")
-    provider = AnalyzerEngineProvider(analyzer_engine_conf_file=CONFIG_FILE_PATH)
-    analyzer = provider.create_engine()
-    logger.info(f"Analyzer ready. Languages: {analyzer.supported_languages}")
-except Exception as e:
-    logger.exception("Error during AnalyzerEngine initialization.")
-    analyzer = None
+anonymizer = None
 
-# Regex strict pour IBAN belge format attendu
+try:
+    logger.info("--- Presidio Service Starting ---")
+    # Resolve the configuration file path (overridable through the environment)
+    CONFIG_FILE_PATH = os.environ.get("PRESIDIO_ANALYZER_CONFIG_FILE", "conf/default.yaml")
+
+    if not os.path.exists(CONFIG_FILE_PATH):
+        raise FileNotFoundError(f"Configuration file not found at: {CONFIG_FILE_PATH}")
+
+    # Load the YAML once; an empty file parses to None, hence the `or {}` fallback
+    with open(CONFIG_FILE_PATH, "r", encoding="utf-8") as f:
+        config = yaml.safe_load(f) or {}
+
+    # 1. Build the Analyzer engine. The provider is given the *path* of the
+    #    configuration file (analyzer_engine_conf_file), not the parsed dict.
+    provider = AnalyzerEngineProvider(analyzer_engine_conf_file=CONFIG_FILE_PATH)
+    analyzer = provider.create_engine()
+
+    # 2. Build the Anonymizer engine; its constructor takes no configuration.
+    anonymizer_config = config.get("anonymizer_config", {})  # TODO(review): map to `operators` in anonymize()
+    anonymizer = AnonymizerEngine()
+
+    logger.info(f"Analyzer and Anonymizer are ready. Languages: {analyzer.supported_languages}")
+
+except Exception as e:
+    logger.exception("FATAL: Error during Presidio engines initialization.")
+    analyzer = None
+    anonymizer = None
+# --- End of the initialisation section ---
+
+
+# Strict regex for the expected Belgian IBAN format (unchanged)
 IBAN_REGEX = re.compile(r"\b[A-Z]{2}[0-9]{2}(?:\s[0-9]{4}){3}\b", re.IGNORECASE)
 
-# Regex IPv4
+# IPv4 regex (unchanged)
 IPV4_REGEX = re.compile(
     r"\b(?:(?:25[0-5]|2[0-4][0-9]|1\d{2}|[1-9]?\d)\.){3}"
     r"(?:25[0-5]|2[0-4][0-9]|1\d{2}|[1-9]?\d)\b"
 )
 
-# Liste des labels/phrases à exclure d’anonymisation (en minuscules)
+# Labels / phrases that must never be anonymised
 IGNORE_LABELS = {
     "témoins",
     "témoins clés",
@@ -50,6 +77,9 @@ IGNORE_LABELS = {
 def normalize_label(text: str) -> str:
     return text.strip().lower()
 
+# =========================
+# BASIC /analyze ENDPOINT
+# =========================
 @app.route("/analyze", methods=["POST"])
 def analyze_text():
     if not analyzer:
@@ -81,36 +111,27 @@ def analyze_text():
                     true_iban = match.group(0)
                     start_offset = ent_text.find(true_iban)
                     if start_offset != -1:
-                        old_start, old_end = res.start, res.end
                         res.start += start_offset
                         res.end = res.start + len(true_iban)
-                        logger.debug(f"Adjusted IBAN span: {old_start}-{old_end} => {res.start}-{res.end}")
-                    else:
-                        logger.warning(f"IBAN regex match but cannot find substring position: '{ent_text}'")
                 else:
                     logger.warning(f"Invalid IBAN detected, skipping: '{ent_text}'")
                     continue
 
-            # Recadrage IP_ADDRESS strict IPv4 (wildcard possible pour IPv6 si besoin)
+            # Re-span IP_ADDRESS results to the strict IPv4 match
             if res.entity_type == "IP_ADDRESS":
                 match = IPV4_REGEX.search(ent_text)
                 if match:
                     true_ip = match.group(0)
                     start_offset = ent_text.find(true_ip)
                     if start_offset != -1:
-                        old_start, old_end = res.start, res.end
                         res.start += start_offset
                         res.end = res.start + len(true_ip)
-                        logger.debug(f"Adjusted IP span: {old_start}-{old_end} => {res.start}-{res.end}")
-                    else:
-                        logger.warning(f"IP regex match but cannot find substring position: '{ent_text}'")
                 else:
                     logger.warning(f"Invalid IP detected, skipping: '{ent_text}'")
                     continue
 
             filtered_results.append(res)
 
-        # Retourner le résultat nettoyé
         response_data = [res.to_dict() for res in filtered_results]
         return make_response(jsonify(response_data), 200)
 
@@ -118,5 +139,44 @@ def analyze_text():
         logger.exception("Error processing analysis")
         return jsonify({"error": str(e)}), 500
 
+# ============================================
+# /anonymize ENDPOINT: PERFORMS THE REPLACEMENT
+# ============================================
+
+@app.route("/anonymize", methods=["POST"])
+def anonymize_text():
+    if not analyzer or not anonymizer:
+        return jsonify({"error": "Presidio engines are not available. Check startup logs."}), 500
+
+    try:
+        data = request.get_json(force=True)
+        text_to_process = data.get("text", "")
+        language = data.get("language", "fr")
+
+        if not text_to_process:
+            return jsonify({"error": "text field is missing or empty"}), 400
+
+        # Step 1: analyse the text to locate the entities
+        analyzer_results = analyzer.analyze(text=text_to_process, language=language)
+
+        # Step 2: anonymise the text using the analysis results
+
+        # No `operators` are passed here, so the engine's default operator applies
+        anonymized_result = anonymizer.anonymize(
+            text=text_to_process,
+            analyzer_results=analyzer_results
+        )
+
+        # Step 3: return the anonymised text
+        return jsonify({"text": anonymized_result.text}), 200
+
+    except Exception as e:
+        logger.exception("Error processing anonymization request")
+        return jsonify({"error": str(e)}), 500
+
+# =====================================================================
+# FLASK APPLICATION START-UP (UNCHANGED)
+# =====================================================================
 if __name__ == "__main__":
-    app.run(host="0.0.0.0", port=5001)
+    # For deployment, prefer a WSGI server such as Gunicorn
+    app.run(host="0.0.0.0", port=5001)
\ No newline at end of file