Presidio/app.py

import os
import re
import logging
import yaml
from flask import Flask, request, jsonify, make_response
# These imports are correct for the version that follows.
from presidio_analyzer import AnalyzerEngineProvider
from presidio_anonymizer import AnonymizerEngine

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

app = Flask(__name__)
# --- Initialization ---
analyzer = None
anonymizer = None

try:
    logger.info("--- Presidio Service Starting ---")
    CONFIG_FILE_PATH = os.environ.get("PRESIDIO_ANALYZER_CONFIG_FILE", "conf/default.yaml")
    if not os.path.exists(CONFIG_FILE_PATH):
        raise FileNotFoundError(f"Configuration file not found at: {CONFIG_FILE_PATH}")

    # --- FIX FOR THE INITIALIZATION LOGIC ---
    # 1. Initialize the AnalyzerEngine by passing THE FILE PATH, as in your original code.
    #    The correct argument is 'analyzer_engine_conf_file'.
    provider = AnalyzerEngineProvider(analyzer_engine_conf_file=CONFIG_FILE_PATH)
    analyzer = provider.create_engine()
    logger.info("AnalyzerEngine created successfully.")

    # 2. For the AnonymizerEngine, we load the YAML file ourselves
    #    to extract its configuration section.
    with open(CONFIG_FILE_PATH, "r") as f:
        config = yaml.safe_load(f)
    anonymizer_config = config.get("anonymizer_config", {})
    anonymizer = AnonymizerEngine(anonymizer_config=anonymizer_config)
    logger.info("AnonymizerEngine created successfully.")

    logger.info(f"Analyzer and Anonymizer are ready. Languages: {analyzer.supported_languages}")
except Exception:
    logger.exception("FATAL: Error during Presidio engines initialization.")
    analyzer = None
    anonymizer = None
# --- End of initialization section ---
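
# ---------------------------------------------------------------------
# Illustrative only: a minimal sketch of what conf/default.yaml could
# contain for this service. Everything below is an assumption except
# 'anonymizer_config', which is the key this module reads above; the
# analyzer keys follow the general shape documented for
# AnalyzerEngineProvider and should be checked against the Presidio docs
# for the version in use.
#
#   supported_languages:
#     - fr
#   nlp_configuration:
#     nlp_engine_name: spacy
#     models:
#       - lang_code: fr
#         model_name: fr_core_news_md
#   anonymizer_config: {}
# ---------------------------------------------------------------------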
# Strict regex for the expected Belgian IBAN format (UNCHANGED)
IBAN_REGEX = re.compile(r"\b[A-Z]{2}[0-9]{2}(?:\s[0-9]{4}){3}\b", re.IGNORECASE)

# IPv4 regex (UNCHANGED)
IPV4_REGEX = re.compile(
    r"\b(?:(?:25[0-5]|2[0-4][0-9]|1\d{2}|[1-9]?\d)\.){3}"
    r"(?:25[0-5]|2[0-4][0-9]|1\d{2}|[1-9]?\d)\b"
)
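
# For reference (illustrative examples only): IBAN_REGEX matches the spaced
# Belgian-style form "BE71 0961 2345 6769" but not the compact
# "BE71096123456769"; IPV4_REGEX matches "192.168.0.1" and rejects
# out-of-range octets such as "999.1.1.1".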
# Labels/phrases to exclude from anonymization, in lowercase (UNCHANGED)
IGNORE_LABELS = {
    "témoins",
    "témoins clés",
    "coordonnées",
    "coordonnées bancaires",
    "contexte financier",
    "données sensibles",
    "contexte",
    "montrent",
    "montrent des",
    "montrent des irrégularités",
    "bénéficiaire",
}

def normalize_label(text: str) -> str:
    return text.strip().lower()
# =====================================================================
# YOUR ORIGINAL /analyze ENDPOINT - NO CHANGES AT ALL
# =====================================================================
@app.route("/analyze", methods=["POST"])
def analyze_text():
    if not analyzer:
        return jsonify({"error": "Analyzer engine is not available. Check startup logs."}), 500
    try:
        data = request.get_json(force=True)
        text_to_analyze = data.get("text", "")
        language = data.get("language", "fr")
        if not text_to_analyze:
            return jsonify({"error": "text field is missing or empty"}), 400

        results = analyzer.analyze(text=text_to_analyze, language=language)
        filtered_results = []
        for res in results:
            ent_text = text_to_analyze[res.start:res.end].strip()
            ent_text_norm = normalize_label(ent_text)
            if ent_text_norm in IGNORE_LABELS:
                logger.debug(f"Skipping anonymization of label: '{ent_text}'")
                continue

            # Strict re-cropping of IBAN spans
            if res.entity_type == "IBAN":
                match = IBAN_REGEX.search(ent_text)
                if match:
                    true_iban = match.group(0)
                    start_offset = ent_text.find(true_iban)
                    if start_offset != -1:
                        res.start += start_offset
                        res.end = res.start + len(true_iban)
                else:
                    logger.warning(f"Invalid IBAN detected, skipping: '{ent_text}'")
                    continue

            # Strict re-cropping of IP_ADDRESS spans to IPv4
            if res.entity_type == "IP_ADDRESS":
                match = IPV4_REGEX.search(ent_text)
                if match:
                    true_ip = match.group(0)
                    start_offset = ent_text.find(true_ip)
                    if start_offset != -1:
                        res.start += start_offset
                        res.end = res.start + len(true_ip)
                else:
                    logger.warning(f"Invalid IP detected, skipping: '{ent_text}'")
                    continue

            filtered_results.append(res)

        response_data = [res.to_dict() for res in filtered_results]
        return make_response(jsonify(response_data), 200)
    except Exception as e:
        logger.exception("Error processing analysis")
        return jsonify({"error": str(e)}), 500
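
# Illustrative usage (values shown are examples, not guaranteed output; the
# exact JSON fields come from RecognizerResult.to_dict() and the entity types
# depend on the recognizers configured in conf/default.yaml):
#
#   curl -X POST http://localhost:5001/analyze \
#        -H "Content-Type: application/json" \
#        -d '{"text": "IBAN: BE71 0961 2345 6769", "language": "fr"}'
#
#   -> [{"entity_type": "...", "start": 6, "end": 25, "score": 0.85, ...}]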
# =====================================================================
# NEW /anonymize ENDPOINT THAT PERFORMS THE REPLACEMENT
# =====================================================================
@app.route("/anonymize", methods=["POST"])
def anonymize_text():
    if not analyzer or not anonymizer:
        return jsonify({"error": "Presidio engines are not available. Check startup logs."}), 500
    try:
        data = request.get_json(force=True)
        text_to_process = data.get("text", "")
        language = data.get("language", "fr")
        if not text_to_process:
            return jsonify({"error": "text field is missing or empty"}), 400

        # Step 1: analyze the text to find the entities
        analyzer_results = analyzer.analyze(text=text_to_process, language=language)

        # Step 2: anonymize the text using the analysis results
        anonymized_result = anonymizer.anonymize(
            text=text_to_process,
            analyzer_results=analyzer_results
        )

        # Step 3: return the anonymized text
        return jsonify({"text": anonymized_result.text}), 200
    except Exception as e:
        logger.exception("Error processing anonymization request")
        return jsonify({"error": str(e)}), 500
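
# Illustrative usage (the replacement format shown assumes Presidio's default
# "replace with <ENTITY_TYPE>" operator; the actual output depends on the
# anonymizer configuration loaded at startup):
#
#   curl -X POST http://localhost:5001/anonymize \
#        -H "Content-Type: application/json" \
#        -d '{"text": "Jean Dupont habite à Bruxelles.", "language": "fr"}'
#
#   -> {"text": "<PERSON> habite à <LOCATION>."}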
# =====================================================================
# FLASK APPLICATION STARTUP (UNCHANGED)
# =====================================================================
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5001)