From 5e6ba39ddb8b65a5e01f476f81001d51d7081b0c Mon Sep 17 00:00:00 2001 From: nacim Date: Mon, 28 Jul 2025 18:48:44 +0000 Subject: [PATCH] Actualiser app.py --- app.py | 44 ++++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/app.py b/app.py index 389dd4b..dd16485 100644 --- a/app.py +++ b/app.py @@ -11,24 +11,28 @@ logger = logging.getLogger(__name__) app = Flask(__name__) -# Initialisation Presidio Analyzer via Provider +# Chargement du moteur Presidio via Provider analyzer = None try: logger.info("--- Presidio Analyzer Service Starting ---") CONFIG_FILE_PATH = os.environ.get("PRESIDIO_ANALYZER_CONFIG_FILE", "conf/default.yaml") provider = AnalyzerEngineProvider(analyzer_engine_conf_file=CONFIG_FILE_PATH) analyzer = provider.create_engine() - logger.info(f"Analyzer ready. Supported languages: {analyzer.supported_languages}") + logger.info(f"Analyzer ready. Languages: {analyzer.supported_languages}") except Exception as e: logger.exception("Error during AnalyzerEngine initialization.") analyzer = None +# Regex strict pour IBAN belge format attendu +IBAN_REGEX = re.compile(r"\b[A-Z]{2}[0-9]{2}(?:\s[0-9]{4}){3}\b", re.IGNORECASE) -# Regex pour recadrage strict -IBAN_REGEX = re.compile(r"\b[A-Z]{2}[0-9]{2}(?:\s[A-Z0-9]{4}){4,7}\b", re.IGNORECASE) -IPV4_REGEX = re.compile(r"\b(?:25[0-5]|2[0-4]\d|1\d{2}|[1-9]?\d)(?:\.(?:25[0-5]|2[0-4]\d|1\d{2}|[1-9]?\d)){3}\b") +# Regex IPv4 +IPV4_REGEX = re.compile( + r"\b(?:(?:25[0-5]|2[0-4][0-9]|1\d{2}|[1-9]?\d)\.){3}" + r"(?:25[0-5]|2[0-4][0-9]|1\d{2}|[1-9]?\d)\b" +) -# Liste des labels/phrases à exclure de l'anonymisation (en minuscules) +# Liste des labels/phrases à exclure d’anonymisation (en minuscules) IGNORE_LABELS = { "témoins", "témoins clés", @@ -66,12 +70,11 @@ def analyze_text(): ent_text = text_to_analyze[res.start:res.end].strip() ent_text_norm = normalize_label(ent_text) - # Skip anonymization for labels to keep if ent_text_norm in IGNORE_LABELS: logger.debug(f"Skipping anonymization of label: '{ent_text}'") continue - # Recadrage stricte IBAN + # Recadrage IBAN strict if res.entity_type == "IBAN": match = IBAN_REGEX.search(ent_text) if match: @@ -81,16 +84,14 @@ def analyze_text(): old_start, old_end = res.start, res.end res.start += start_offset res.end = res.start + len(true_iban) - ent_text = true_iban - logger.debug(f"Adjusted IBAN span from ({old_start}-{old_end}) to ({res.start}-{res.end}): '{ent_text}'") + logger.debug(f"Adjusted IBAN span: {old_start}-{old_end} => {res.start}-{res.end}") else: - logger.warning(f"Cannot find exact IBAN substring inside entity: '{ent_text}'") + logger.warning(f"IBAN regex match but cannot find substring position: '{ent_text}'") else: - # Pas un IBAN valide, ignorer cette entité - logger.warning(f"Entity IBAN does not match strict IBAN regex: '{ent_text}'") + logger.warning(f"Invalid IBAN detected, skipping: '{ent_text}'") continue - # Recadrage stricte IP_ADDRESS + # Recadrage IP_ADDRESS strict IPv4 (wildcard possible pour IPv6 si besoin) if res.entity_type == "IP_ADDRESS": match = IPV4_REGEX.search(ent_text) if match: @@ -100,27 +101,22 @@ def analyze_text(): old_start, old_end = res.start, res.end res.start += start_offset res.end = res.start + len(true_ip) - ent_text = true_ip - logger.debug(f"Adjusted IP_ADDRESS span from ({old_start}-{old_end}) to ({res.start}-{res.end}): '{ent_text}'") + logger.debug(f"Adjusted IP span: {old_start}-{old_end} => {res.start}-{res.end}") else: - logger.warning(f"Cannot find exact IP substring inside entity: '{ent_text}'") + logger.warning(f"IP regex match but cannot find substring position: '{ent_text}'") else: - logger.warning(f"Entity IP_ADDRESS does not match IPv4 regex: '{ent_text}'") + logger.warning(f"Invalid IP detected, skipping: '{ent_text}'") continue filtered_results.append(res) - # Option: filtrer les chevauchements - + # Retourner le résultat nettoyé response_data = [res.to_dict() for res in filtered_results] return make_response(jsonify(response_data), 200) except Exception as e: - logger.exception(f"Error during analysis for language '{language}'.") - if "No matching recognizers" in str(e): - return jsonify({"error": f"No recognizers available for language '{language}'."}), 400 + logger.exception("Error processing analysis") return jsonify({"error": str(e)}), 500 - if __name__ == "__main__": app.run(host="0.0.0.0", port=5001)