doublon done

This commit is contained in:
nBiqoz
2025-09-12 16:55:13 +02:00
parent c62e5b92d5
commit 3e70181b58
6 changed files with 32 additions and 9 deletions

View File

@@ -10,6 +10,7 @@ allow_list:
- TVA - TVA
- IEC - IEC
- expert-comptable - expert-comptable
- prestataire
# Termes financiers # Termes financiers
- Euro - Euro
- EUR - EUR

View File

@@ -5,7 +5,7 @@ anonymizer_config:
PERSON: replace PERSON: replace
LOCATION: replace LOCATION: replace
ORGANIZATION: replace ORGANIZATION: replace
DATE_TIME: replace DATE: replace
MONEY: replace MONEY: replace
EMAIL_ADDRESS: replace EMAIL_ADDRESS: replace
IBAN: replace IBAN: replace
@@ -45,11 +45,18 @@ anonymizer_config:
PERSON: "[PERSONNE]" PERSON: "[PERSONNE]"
LOCATION: "[LIEU]" LOCATION: "[LIEU]"
ORGANIZATION: "[ORGANISATION]" ORGANIZATION: "[ORGANISATION]"
DATE_TIME: "[DATE]" DATE: "[DATE]"
MONEY: "[MONTANT]" MONEY: "[MONTANT]"
EMAIL_ADDRESS: "[EMAIL]" EMAIL_ADDRESS: "[EMAIL]"
IBAN: "[IBAN]" IBAN: "[IBAN]"
IP_ADDRESS: "[ADRESSE_IP]" IP_ADDRESS: "[ADRESSE_IP]"
# PII Belges
BE_ENTERPRISE_NUMBER: "[ENTREPRISE_BELGE]"
BE_PHONE_NUMBER: "[TELEPHONE_BELGE]"
BE_ADDRESS: "[ADRESSE_BELGE]"
BE_ID_CARD: "[CARTE_ID_BELGE]"
BE_PASSPORT: "[PASSEPORT_BELGE]"
# PII Génériques - Données sensibles RGPD # PII Génériques - Données sensibles RGPD
HEALTH_DATA: "[DONNEES_SANTE]" HEALTH_DATA: "[DONNEES_SANTE]"

View File

@@ -15,9 +15,9 @@ nlp_configuration:
ORGANIZATION: ORGANIZATION ORGANIZATION: ORGANIZATION
LOC: LOCATION LOC: LOCATION
LOCATION: LOCATION LOCATION: LOCATION
DATE: DATE_TIME DATE: DATE
TIME: DATE_TIME TIME: TIME
MISC: DATE_TIME MISC: MISC
labels_to_ignore: labels_to_ignore:
- LOCATION - LOCATION
- MISC - MISC
@@ -26,7 +26,6 @@ nlp_configuration:
- LANGUAGE - LANGUAGE
- LAW - LAW
- ORDINAL - ORDINAL
- PERCENT
- PRODUCT - PRODUCT
- QUANTITY - QUANTITY
- WORK_OF_ART - WORK_OF_ART

View File

@@ -2,7 +2,7 @@ recognizer_registry:
recognizers: recognizers:
- name: DateTimeRecognizer - name: DateTimeRecognizer
supported_language: fr supported_language: fr
supported_entity: DATE_TIME supported_entity: DATE
patterns: patterns:
# Formats français standards avec différents séparateurs # Formats français standards avec différents séparateurs
- name: Date française DD/MM/YYYY - name: Date française DD/MM/YYYY
@@ -13,6 +13,23 @@ recognizer_registry:
regex: "\\b(?:0?[1-9]|[12][0-9]|3[01])-(?:0?[1-9]|1[0-2])-(?:19|20)\\d{2}\\b" regex: "\\b(?:0?[1-9]|[12][0-9]|3[01])-(?:0?[1-9]|1[0-2])-(?:19|20)\\d{2}\\b"
score: 0.95 score: 0.95
# Formats courts avec année sur 2 chiffres (DD-MM-YY, DD/MM/YY, DD.MM.YY)
- name: Date française DD-MM-YY
regex: "\\b(?:0?[1-9]|[12][0-9]|3[01])-(?:0?[1-9]|1[0-2])-\\d{2}\\b"
score: 0.90
- name: Date française DD/MM/YY
regex: "\\b(?:0?[1-9]|[12][0-9]|3[01])/(?:0?[1-9]|1[0-2])/\\d{2}\\b"
score: 0.90
- name: Date belge DD.MM.YY
regex: "\\b(?:0?[1-9]|[12][0-9]|3[01])\\.(?:0?[1-9]|1[0-2])\\.\\d{2}\\b"
score: 0.90
- name: Date courte
regex: "\\b\\d{1,2}[-/.]\\d{1,2}[-/.]\\d{2}\\b"
score: 0.85
- name: Date française DD MM YYYY (espaces) - name: Date française DD MM YYYY (espaces)
regex: "\\b(?:0?[1-9]|[12][0-9]|3[01])\\s+(?:0?[1-9]|1[0-2])\\s+(?:19|20)\\d{2}\\b" regex: "\\b(?:0?[1-9]|[12][0-9]|3[01])\\s+(?:0?[1-9]|1[0-2])\\s+(?:19|20)\\d{2}\\b"
score: 0.9 score: 0.9

View File

@@ -23,7 +23,6 @@ class OverlapResolver:
'IP_ADDRESS': 82, 'IP_ADDRESS': 82,
'BE_ADDRESS': 75, 'BE_ADDRESS': 75,
'FR_ADDRESS': 75, 'FR_ADDRESS': 75,
'DATE_TIME': 70,
'ORGANIZATION': 65, 'ORGANIZATION': 65,
'LOCATION': 60, 'LOCATION': 60,
'PERSON': 50, 'PERSON': 50,

View File

@@ -24,7 +24,7 @@ class DateRefiner(EntityRefiner):
"""Raffineur pour les dates - élimine les faux positifs""" """Raffineur pour les dates - élimine les faux positifs"""
def __init__(self): def __init__(self):
super().__init__("DATE_TIME") super().__init__("DATE")
# Patterns pour valider les vraies dates # Patterns pour valider les vraies dates
self.valid_date_patterns = [ self.valid_date_patterns = [
# Format DD/MM/YYYY # Format DD/MM/YYYY