diff --git a/conf/anonymization/allow_list.yaml b/conf/anonymization/allow_list.yaml index e6e4572..b4b9235 100644 --- a/conf/anonymization/allow_list.yaml +++ b/conf/anonymization/allow_list.yaml @@ -10,6 +10,7 @@ allow_list: - TVA - IEC - expert-comptable + - prestataire # Termes financiers - Euro - EUR diff --git a/conf/anonymization/replacements.yaml b/conf/anonymization/replacements.yaml index 1583740..f292e5a 100644 --- a/conf/anonymization/replacements.yaml +++ b/conf/anonymization/replacements.yaml @@ -5,7 +5,7 @@ anonymizer_config: PERSON: replace LOCATION: replace ORGANIZATION: replace - DATE_TIME: replace + DATE: replace MONEY: replace EMAIL_ADDRESS: replace IBAN: replace @@ -45,11 +45,18 @@ anonymizer_config: PERSON: "[PERSONNE]" LOCATION: "[LIEU]" ORGANIZATION: "[ORGANISATION]" - DATE_TIME: "[DATE]" + DATE: "[DATE]" MONEY: "[MONTANT]" EMAIL_ADDRESS: "[EMAIL]" IBAN: "[IBAN]" IP_ADDRESS: "[ADRESSE_IP]" + + # PII Belges + BE_ENTERPRISE_NUMBER: "[ENTREPRISE_BELGE]" + BE_PHONE_NUMBER: "[TELEPHONE_BELGE]" + BE_ADDRESS: "[ADRESSE_BELGE]" + BE_ID_CARD: "[CARTE_ID_BELGE]" + BE_PASSPORT: "[PASSEPORT_BELGE]" # PII Génériques - Données sensibles RGPD HEALTH_DATA: "[DONNEES_SANTE]" diff --git a/conf/nlp/spacy_config.yaml b/conf/nlp/spacy_config.yaml index 7def878..9f78428 100644 --- a/conf/nlp/spacy_config.yaml +++ b/conf/nlp/spacy_config.yaml @@ -15,9 +15,9 @@ nlp_configuration: ORGANIZATION: ORGANIZATION LOC: LOCATION LOCATION: LOCATION - DATE: DATE_TIME - TIME: DATE_TIME - MISC: DATE_TIME + DATE: DATE + TIME: TIME + MISC: MISC labels_to_ignore: - LOCATION - MISC @@ -26,7 +26,6 @@ nlp_configuration: - LANGUAGE - LAW - ORDINAL - - PERCENT - PRODUCT - QUANTITY - WORK_OF_ART diff --git a/conf/recognizers/PII/generic/dates.yaml b/conf/recognizers/PII/generic/dates.yaml index 92764ae..fa320e4 100644 --- a/conf/recognizers/PII/generic/dates.yaml +++ b/conf/recognizers/PII/generic/dates.yaml @@ -2,7 +2,7 @@ recognizer_registry: recognizers: - name: 
DateTimeRecognizer supported_language: fr - supported_entity: DATE_TIME + supported_entity: DATE patterns: # Formats français standards avec différents séparateurs - name: Date française DD/MM/YYYY @@ -13,6 +13,23 @@ recognizer_registry: regex: "\\b(?:0?[1-9]|[12][0-9]|3[01])-(?:0?[1-9]|1[0-2])-(?:19|20)\\d{2}\\b" score: 0.95 + # NOUVEAU: Formats courts DD-MM-YY + - name: Date française DD-MM-YY + regex: "\\b(?:0?[1-9]|[12][0-9]|3[01])-(?:0?[1-9]|1[0-2])-\\d{2}\\b" + score: 0.90 + + - name: Date française DD/MM/YY + regex: "\\b(?:0?[1-9]|[12][0-9]|3[01])/(?:0?[1-9]|1[0-2])/\\d{2}\\b" + score: 0.90 + + - name: Date belge DD.MM.YY + regex: "\\b(?:0?[1-9]|[12][0-9]|3[01])\\.(?:0?[1-9]|1[0-2])\\.\\d{2}\\b" + score: 0.90 + + - name: Date courte + regex: "\\b\\d{1,2}[-/.]\\d{1,2}[-/.]\\d{2}\\b" + score: 0.85 + - name: Date française DD MM YYYY (espaces) regex: "\\b(?:0?[1-9]|[12][0-9]|3[01])\\s+(?:0?[1-9]|1[0-2])\\s+(?:19|20)\\d{2}\\b" score: 0.9 diff --git a/post_processors/overlap_resolver.py b/post_processors/overlap_resolver.py index 350faed..b2fe4a2 100644 --- a/post_processors/overlap_resolver.py +++ b/post_processors/overlap_resolver.py @@ -23,7 +23,6 @@ class OverlapResolver: 'IP_ADDRESS': 82, 'BE_ADDRESS': 75, 'FR_ADDRESS': 75, - 'DATE_TIME': 70, 'ORGANIZATION': 65, 'LOCATION': 60, 'PERSON': 50, diff --git a/refiners/date_refiner.py b/refiners/date_refiner.py index bc0c87a..e341cff 100644 --- a/refiners/date_refiner.py +++ b/refiners/date_refiner.py @@ -24,7 +24,7 @@ class DateRefiner(EntityRefiner): """Raffineur pour les dates - élimine les faux positifs""" def __init__(self): - super().__init__("DATE_TIME") + super().__init__("DATE") # Patterns pour valider les vraies dates self.valid_date_patterns = [ # Format DD/MM/YYYY