Files
Presidio/app.py
2025-08-03 20:29:50 +00:00

176 lines
6.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import re
import logging
import yaml
from flask import Flask, request, jsonify, make_response
# These imports are sufficient and correct for the current configuration:
from presidio_analyzer import AnalyzerEngine, RecognizerResult, AnalyzerEngineProvider
from presidio_anonymizer import AnonymizerEngine
# Root logging configuration: timestamp, logger name, level, then the message.
_LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Module-level logger and the Flask application that the routes attach to.
logger = logging.getLogger(__name__)
app = Flask(__name__)
# --- Combined initialisation of the Analyzer and the Anonymizer ---
# Both engines stay None when start-up fails; the request handlers check for
# that and answer with an error instead of crashing at import time.
analyzer = None
anonymizer = None
try:
    logger.info("--- Presidio Service Starting ---")
    # The configuration path can be overridden through the environment.
    CONFIG_FILE_PATH = os.environ.get("PRESIDIO_ANALYZER_CONFIG_FILE", "conf/default.yaml")
    if not os.path.exists(CONFIG_FILE_PATH):
        raise FileNotFoundError(f"Configuration file not found at: {CONFIG_FILE_PATH}")
    # Decode explicitly as UTF-8 so accented values in the YAML are read the
    # same way regardless of the locale the service is started under.
    with open(CONFIG_FILE_PATH, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)
    # An empty or non-mapping file would otherwise fail later with an opaque
    # AttributeError on config.get(); fail here with an explicit message.
    if not isinstance(config, dict):
        raise ValueError(
            f"Configuration file must contain a YAML mapping: {CONFIG_FILE_PATH}"
        )
    # 1. Build the Analyzer engine from the loaded configuration.
    # NOTE(review): upstream presidio-analyzer documents file-path parameters
    # (analyzer_engine_conf_file, ...) for AnalyzerEngineProvider; confirm that
    # the installed version accepts `analyzer_engine_conf`.
    provider = AnalyzerEngineProvider(analyzer_engine_conf=config)
    analyzer = provider.create_engine()
    # 2. Build the Anonymizer engine from its own configuration section.
    # NOTE(review): confirm that the installed presidio-anonymizer accepts an
    # `anonymizer_config` keyword; a TypeError here disables both engines.
    anonymizer_config = config.get("anonymizer_config", {})
    anonymizer = AnonymizerEngine(anonymizer_config=anonymizer_config)
    logger.info(
        "Analyzer and Anonymizer are ready. Languages: %s",
        analyzer.supported_languages,
    )
except Exception:
    # Top-level boundary: log the full traceback and leave both engines
    # unset so that neither endpoint runs with a half-initialised service.
    logger.exception("FATAL: Error during Presidio engines initialization.")
    analyzer = None
    anonymizer = None
# --- End of the initialisation section ---
# Strict IBAN layout expected by this service: two letters, two digits, then
# three space-separated groups of four digits (letters match in either case).
IBAN_REGEX = re.compile(r"\b[A-Z]{2}[0-9]{2}(?:\s[0-9]{4}){3}\b", re.I)

# Dotted-quad IPv4 address; every octet is limited to 0-255.
_OCTET = r"(?:25[0-5]|2[0-4][0-9]|1\d{2}|[1-9]?\d)"
IPV4_REGEX = re.compile(r"\b(?:" + _OCTET + r"\.){3}" + _OCTET + r"\b")
# Labels/phrases that must never be anonymised. Entries are lower-case so
# they can be compared against normalised entity text.
IGNORE_LABELS = {
    # headings and section labels
    "témoins", "témoins clés",
    "coordonnées", "coordonnées bancaires",
    "contexte", "contexte financier",
    "données sensibles",
    "bénéficiaire",
    # sentence fragments
    "montrent", "montrent des", "montrent des irrégularités",
}
def normalize_label(text: str) -> str:
    """Return *text* with surrounding whitespace removed, in lower case."""
    trimmed = text.strip()
    return trimmed.lower()
# =====================================================================
# /analyze ENDPOINT: DETECT ENTITIES AND RETURN THEIR SPANS
# =====================================================================
# Entity types whose detected span is tightened to the first match of a strict
# pattern. The short name is only used in the warning logged on rejection.
_STRICT_SPANS = {
    "IBAN": ("IBAN", IBAN_REGEX),
    "IP_ADDRESS": ("IP", IPV4_REGEX),
}


def _read_payload():
    """Validate the JSON body of the current request.

    Returns:
        A ``(text, language, error)`` tuple. ``error`` is ``None`` when the
        payload is usable; otherwise it is a ``(response, status)`` pair that
        the view should return as-is, and ``text``/``language`` are ``None``.
    """
    # silent=True yields None for a malformed body instead of raising, so a
    # client mistake is reported as 400 rather than falling into the 500 path.
    data = request.get_json(force=True, silent=True)
    if not isinstance(data, dict):
        return None, None, (jsonify({"error": "request body must be a JSON object"}), 400)
    text = data.get("text", "")
    language = data.get("language", "fr")
    if not isinstance(text, str):
        return None, None, (jsonify({"error": "text field must be a string"}), 400)
    if not text:
        return None, None, (jsonify({"error": "text field is missing or empty"}), 400)
    if language not in analyzer.supported_languages:
        return None, None, (jsonify({"error": f"unsupported language: {language!r}"}), 400)
    return text, language, None


def _refine_results(text, results):
    """Drop ignored labels and tighten IBAN / IP_ADDRESS spans.

    Args:
        text: The text that was analysed.
        results: Analyzer results carrying ``start``, ``end`` and
            ``entity_type``; kept results are adjusted in place.

    Returns:
        The list of results to keep, in their original order.
    """
    kept = []
    for res in results:
        span_text = text[res.start:res.end]
        label = span_text.strip()
        if normalize_label(label) in IGNORE_LABELS:
            logger.debug("Skipping anonymization of label: '%s'", label)
            continue
        strict = _STRICT_SPANS.get(res.entity_type)
        if strict is not None:
            short_name, pattern = strict
            # Search the unstripped span so that match offsets are relative to
            # res.start; searching the stripped text would shift the result by
            # the amount of leading whitespace that was removed.
            match = pattern.search(span_text)
            if match is None:
                logger.warning("Invalid %s detected, skipping: '%s'", short_name, label)
                continue
            span_start = res.start
            res.start = span_start + match.start()
            res.end = span_start + match.end()
        kept.append(res)
    return kept


@app.route("/analyze", methods=["POST"])
def analyze_text():
    """Detect entities in the posted text and return them as a JSON list.

    Expects a JSON object with ``text`` and an optional ``language`` (default
    ``"fr"``). Responds 400 for an invalid payload and 500 when the analyzer
    is unavailable or the analysis fails.
    """
    if analyzer is None:
        return jsonify({"error": "Analyzer engine is not available. Check startup logs."}), 500
    try:
        text_to_analyze, language, error = _read_payload()
        if error is not None:
            return error
        results = analyzer.analyze(text=text_to_analyze, language=language)
        kept = _refine_results(text_to_analyze, results)
        return jsonify([res.to_dict() for res in kept]), 200
    except Exception as e:
        logger.exception("Error processing analysis")
        return jsonify({"error": str(e)}), 500


# =====================================================================
# /anonymize ENDPOINT: DETECT ENTITIES AND REPLACE THEM IN THE TEXT
# =====================================================================
@app.route("/anonymize", methods=["POST"])
def anonymize_text():
    """Return the posted text with the detected entities anonymised.

    Accepts the same payload as ``/analyze`` and applies the same result
    filtering (ignored labels, strict IBAN / IP spans) before replacement, so
    both endpoints agree on what counts as an entity. Responds 400 for an
    invalid payload and 500 when an engine is unavailable or processing fails.
    """
    if analyzer is None or anonymizer is None:
        return jsonify({"error": "Presidio engines are not available. Check startup logs."}), 500
    try:
        text_to_process, language, error = _read_payload()
        if error is not None:
            return error
        # Step 1: analyse the text, then keep only the results that
        # /analyze would also report.
        analyzer_results = _refine_results(
            text_to_process,
            analyzer.analyze(text=text_to_process, language=language),
        )
        # Step 2: anonymise the text using the retained results.
        anonymized_result = anonymizer.anonymize(
            text=text_to_process,
            analyzer_results=analyzer_results,
        )
        # Step 3: return the anonymised text.
        return jsonify({"text": anonymized_result.text}), 200
    except Exception as e:
        logger.exception("Error processing anonymization request")
        return jsonify({"error": str(e)}), 500
# =====================================================================
# FLASK APPLICATION STARTUP (UNCHANGED)
# =====================================================================
if __name__ == "__main__":
    # Run only when executed as a script: listen on every interface, port 5001.
    app.run(port=5001, host="0.0.0.0")