Files
Presidio/app.py
2025-08-03 20:05:01 +00:00

182 lines
6.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import logging
import os
import re

import yaml  # Needed to load the configuration file manually
from flask import Flask, request, jsonify, make_response
# Presidio classes needed for analysis and anonymisation
from presidio_analyzer import AnalyzerEngine, AnalyzerEngineProvider, RecognizerResult
from presidio_analyzer.nlp_engine import NlpEngineProvider
from presidio_anonymizer import AnonymizerEngine
# Configure logging once at import time; every record carries a timestamp,
# the logger name and the level.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Flask application object that the route decorators in this module attach to.
app = Flask(__name__)
# --- Combined initialisation of the Analyzer and the Anonymizer ---
# Both engines stay None when start-up fails; the endpoints test for that and
# answer with an error instead of crashing at import time.
analyzer = None
anonymizer = None
try:
    logger.info("--- Presidio Service Starting ---")
    # Path of the YAML configuration file (overridable through the environment).
    CONFIG_FILE_PATH = os.environ.get("PRESIDIO_ANALYZER_CONFIG_FILE", "conf/default.yaml")
    if not os.path.exists(CONFIG_FILE_PATH):
        raise FileNotFoundError(f"Configuration file not found at: {CONFIG_FILE_PATH}")
    # Load the YAML in memory: the anonymizer section is read from it below.
    # The explicit encoding keeps accented characters independent of the
    # platform default; an empty file parses to None, hence the `or {}`.
    with open(CONFIG_FILE_PATH, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f) or {}
    # 1. Build the Analyzer engine. AnalyzerEngineProvider takes the path of
    #    the configuration file, not an already-parsed dict.
    provider = AnalyzerEngineProvider(analyzer_engine_conf_file=CONFIG_FILE_PATH)
    analyzer = provider.create_engine()
    # 2. Build the Anonymizer engine from its configuration section.
    # NOTE(review): presidio-anonymizer's AnonymizerEngine appears to take no
    # constructor arguments (operators are normally passed to `anonymize()`).
    # Confirm that `anonymizer_config` is accepted by the installed version;
    # if it is not, this line raises and both engines are reset below.
    anonymizer_config = config.get("anonymizer_config", {})
    anonymizer = AnonymizerEngine(anonymizer_config=anonymizer_config)
    logger.info(
        "Analyzer and Anonymizer are ready. Languages: %s",
        analyzer.supported_languages,
    )
except Exception:
    # Top-level boundary: record the traceback and keep the process alive so
    # the endpoints can report that the engines are unavailable.
    logger.exception("FATAL: Error during Presidio engines initialization.")
    analyzer = None
    anonymizer = None
# --- End of the initialisation section ---
# Strict pattern for the expected IBAN layout: two letters, two digits, then
# three whitespace-separated groups of four digits (UNCHANGED).
IBAN_REGEX = re.compile(
    r"\b[A-Z]{2}[0-9]{2}(?:\s[0-9]{4}){3}\b",
    flags=re.IGNORECASE,
)

# IPv4 pattern (UNCHANGED): four dot-separated octets in the range 0-255.
_IPV4_OCTET = r"(?:25[0-5]|2[0-4][0-9]|1\d{2}|[1-9]?\d)"
IPV4_REGEX = re.compile(rf"\b(?:{_IPV4_OCTET}\.){{3}}{_IPV4_OCTET}\b")
# Labels/phrases to exclude from anonymisation. Entries are written
# lower-case and without surrounding whitespace.
IGNORE_LABELS = {
    # Generic section headings
    "contexte",
    "contexte financier",
    "coordonnées",
    "coordonnées bancaires",
    "données sensibles",
    # Roles
    "bénéficiaire",
    "témoins",
    "témoins clés",
    # Sentence fragments
    "montrent",
    "montrent des",
    "montrent des irrégularités",
}
def normalize_label(text: str) -> str:
    """Return *text* without surrounding whitespace and in lower case."""
    trimmed = text.strip()
    return trimmed.lower()
# =========================
# BASIC /analyze ENDPOINT
# =========================
def _recrop_to_pattern(result, span_text, pattern):
    """Shrink *result* in place to the first *pattern* match inside its span.

    Args:
        result: Analyzer result whose ``start``/``end`` offsets index the
            analysed text.
        span_text: The exact, unstripped slice ``text[result.start:result.end]``.
        pattern: Compiled regular expression the entity must contain.

    Returns:
        True when a match was found and the offsets were tightened to it,
        False when the span holds no match (offsets are left untouched).
    """
    match = pattern.search(span_text)
    if match is None:
        return False
    # Match offsets are relative to the raw span, so they can be added
    # directly to the span's original start.
    span_start = result.start
    result.start = span_start + match.start()
    result.end = span_start + match.end()
    return True


@app.route("/analyze", methods=["POST"])
def analyze_text():
    """Detect entities in the posted text and return them as a JSON list.

    The request body must be a JSON object with a non-empty string ``text``
    field and an optional ``language`` field (default ``"fr"``).

    Results whose text is listed in ``IGNORE_LABELS`` are dropped. ``IBAN``
    and ``IP_ADDRESS`` results are tightened to the strict pattern match, or
    dropped when their span contains no such match.

    Returns:
        200 with the list of result dicts, 400 for an invalid request body,
        500 when the analyzer is unavailable or analysis fails unexpectedly.
    """
    if not analyzer:
        return jsonify({"error": "Analyzer engine is not available. Check startup logs."}), 500

    # Entity types that must contain a strict pattern, with the short name
    # used in the warning message.
    # NOTE(review): Presidio's predefined IBAN recognizer reports
    # "IBAN_CODE"; presumably "IBAN" is a custom entity declared in the YAML
    # configuration -- confirm.
    strict_patterns = {
        "IBAN": (IBAN_REGEX, "IBAN"),
        "IP_ADDRESS": (IPV4_REGEX, "IP"),
    }
    try:
        # silent=True turns an unparsable body into None instead of raising,
        # so malformed input is answered with 400 rather than 500.
        data = request.get_json(force=True, silent=True)
        if not isinstance(data, dict):
            return jsonify({"error": "request body must be a JSON object"}), 400
        text_to_analyze = data.get("text", "")
        language = data.get("language", "fr")
        if not text_to_analyze:
            return jsonify({"error": "text field is missing or empty"}), 400
        if not isinstance(text_to_analyze, str):
            return jsonify({"error": "text field must be a string"}), 400

        results = analyzer.analyze(text=text_to_analyze, language=language)
        filtered_results = []
        for res in results:
            # Keep the raw span for offset arithmetic; the stripped form is
            # only used for label comparison and logging.
            span_text = text_to_analyze[res.start:res.end]
            ent_text = span_text.strip()
            if normalize_label(ent_text) in IGNORE_LABELS:
                logger.debug("Skipping anonymization of label: '%s'", ent_text)
                continue

            strict = strict_patterns.get(res.entity_type)
            if strict is not None:
                pattern, kind = strict
                if not _recrop_to_pattern(res, span_text, pattern):
                    # NOTE(review): this writes the raw candidate value to
                    # the log; consider whether that is acceptable here.
                    logger.warning("Invalid %s detected, skipping: '%s'", kind, ent_text)
                    continue
            filtered_results.append(res)

        response_data = [res.to_dict() for res in filtered_results]
        return make_response(jsonify(response_data), 200)
    except Exception as e:
        logger.exception("Error processing analysis")
        return jsonify({"error": str(e)}), 500
# ==============================================
# /anonymize ENDPOINT: PERFORMS THE REPLACEMENT
# ==============================================
@app.route("/anonymize", methods=["POST"])
def anonymize_text():
    """Analyse the posted text and return it with detected entities replaced.

    The request body must be a JSON object with a non-empty string ``text``
    field and an optional ``language`` field (default ``"fr"``).

    Results whose text is listed in ``IGNORE_LABELS`` are left untouched, so
    this endpoint agrees with ``/analyze`` on which labels are not entities.

    Returns:
        200 with ``{"text": <anonymised text>}``, 400 for an invalid request
        body, 500 when an engine is unavailable or processing fails
        unexpectedly.
    """
    if not analyzer or not anonymizer:
        return jsonify({"error": "Presidio engines are not available. Check startup logs."}), 500
    try:
        # silent=True turns an unparsable body into None instead of raising,
        # so malformed input is answered with 400 rather than 500.
        data = request.get_json(force=True, silent=True)
        if not isinstance(data, dict):
            return jsonify({"error": "request body must be a JSON object"}), 400
        text_to_process = data.get("text", "")
        language = data.get("language", "fr")
        if not text_to_process:
            return jsonify({"error": "text field is missing or empty"}), 400
        if not isinstance(text_to_process, str):
            return jsonify({"error": "text field must be a string"}), 400

        # Step 1: analyse the text to find the entities.
        analyzer_results = analyzer.analyze(text=text_to_process, language=language)

        # Step 2: drop results that are only ignored labels.
        # NOTE(review): the strict IBAN / IPv4 re-cropping done by /analyze
        # is not applied here; without it slightly more text may be masked.
        kept_results = [
            res
            for res in analyzer_results
            if normalize_label(text_to_process[res.start:res.end]) not in IGNORE_LABELS
        ]

        # Step 3: anonymise using the remaining results. No operators are
        # passed here, so the engine applies whatever it was set up with.
        anonymized_result = anonymizer.anonymize(
            text=text_to_process,
            analyzer_results=kept_results,
        )

        # Step 4: return the anonymised text.
        return jsonify({"text": anonymized_result.text}), 200
    except Exception as e:
        logger.exception("Error processing anonymization request")
        return jsonify({"error": str(e)}), 500
# =====================================================================
# FLASK APPLICATION START-UP (UNCHANGED)
# =====================================================================
if __name__ == "__main__":
    # Direct execution only; for deployment a WSGI server such as Gunicorn
    # is preferable.
    app.run(port=5001, host="0.0.0.0")