doublon done
This commit is contained in:
@@ -10,6 +10,7 @@ allow_list:
|
||||
- TVA
|
||||
- IEC
|
||||
- expert-comptable
|
||||
- prestataire
|
||||
# Termes financiers
|
||||
- Euro
|
||||
- EUR
|
||||
|
||||
@@ -5,7 +5,7 @@ anonymizer_config:
|
||||
PERSON: replace
|
||||
LOCATION: replace
|
||||
ORGANIZATION: replace
|
||||
DATE_TIME: replace
|
||||
DATE: replace
|
||||
MONEY: replace
|
||||
EMAIL_ADDRESS: replace
|
||||
IBAN: replace
|
||||
@@ -45,11 +45,18 @@ anonymizer_config:
|
||||
PERSON: "[PERSONNE]"
|
||||
LOCATION: "[LIEU]"
|
||||
ORGANIZATION: "[ORGANISATION]"
|
||||
DATE_TIME: "[DATE]"
|
||||
DATE: "[DATE]"
|
||||
MONEY: "[MONTANT]"
|
||||
EMAIL_ADDRESS: "[EMAIL]"
|
||||
IBAN: "[IBAN]"
|
||||
IP_ADDRESS: "[ADRESSE_IP]"
|
||||
|
||||
# PII Belges - AJOUTER CES LIGNES
|
||||
BE_ENTERPRISE_NUMBER: "[ENTREPRISE_BELGE]"
|
||||
BE_PHONE_NUMBER: "[TELEPHONE_BELGE]"
|
||||
BE_ADDRESS: "[ADRESSE_BELGE]"
|
||||
BE_ID_CARD: "[CARTE_ID_BELGE]"
|
||||
BE_PASSPORT: "[PASSEPORT_BELGE]"
|
||||
|
||||
# PII Génériques - Données sensibles RGPD
|
||||
HEALTH_DATA: "[DONNEES_SANTE]"
|
||||
|
||||
@@ -15,9 +15,9 @@ nlp_configuration:
|
||||
ORGANIZATION: ORGANIZATION
|
||||
LOC: LOCATION
|
||||
LOCATION: LOCATION
|
||||
DATE: DATE_TIME
|
||||
TIME: DATE_TIME
|
||||
MISC: DATE_TIME
|
||||
DATE: DATE
|
||||
TIME: TIME
|
||||
MISC: MISC
|
||||
labels_to_ignore:
|
||||
- LOCATION
|
||||
- MISC
|
||||
@@ -26,7 +26,6 @@ nlp_configuration:
|
||||
- LANGUAGE
|
||||
- LAW
|
||||
- ORDINAL
|
||||
- PERCENT
|
||||
- PRODUCT
|
||||
- QUANTITY
|
||||
- WORK_OF_ART
|
||||
|
||||
@@ -2,7 +2,7 @@ recognizer_registry:
|
||||
recognizers:
|
||||
- name: DateTimeRecognizer
|
||||
supported_language: fr
|
||||
supported_entity: DATE_TIME
|
||||
supported_entity: DATE
|
||||
patterns:
|
||||
# Formats français standards avec différents séparateurs
|
||||
- name: Date française DD/MM/YYYY
|
||||
@@ -13,6 +13,23 @@ recognizer_registry:
|
||||
regex: "\\b(?:0?[1-9]|[12][0-9]|3[01])-(?:0?[1-9]|1[0-2])-(?:19|20)\\d{2}\\b"
|
||||
score: 0.95
|
||||
|
||||
# NOUVEAU: Formats courts DD-MM-YY
|
||||
- name: Date française DD-MM-YY
|
||||
regex: "\\b(?:0?[1-9]|[12][0-9]|3[01])-(?:0?[1-9]|1[0-2])-\\d{2}\\b"
|
||||
score: 0.90
|
||||
|
||||
- name: Date française DD/MM/YY
|
||||
regex: "\\b(?:0?[1-9]|[12][0-9]|3[01])/(?:0?[1-9]|1[0-2])/\\d{2}\\b"
|
||||
score: 0.90
|
||||
|
||||
- name: Date belge DD.MM.YY
|
||||
regex: "\\b(?:0?[1-9]|[12][0-9]|3[01])\\.(?:0?[1-9]|1[0-2])\\.\\d{2}\\b"
|
||||
score: 0.90
|
||||
|
||||
- name: Date courte
|
||||
regex: "\\b\\d{1,2}[-/.]\\d{1,2}[-/.]\\d{2}\\b"
|
||||
score: 0.85
|
||||
|
||||
- name: Date française DD MM YYYY (espaces)
|
||||
regex: "\\b(?:0?[1-9]|[12][0-9]|3[01])\\s+(?:0?[1-9]|1[0-2])\\s+(?:19|20)\\d{2}\\b"
|
||||
score: 0.9
|
||||
|
||||
@@ -23,7 +23,6 @@ class OverlapResolver:
|
||||
'IP_ADDRESS': 82,
|
||||
'BE_ADDRESS': 75,
|
||||
'FR_ADDRESS': 75,
|
||||
'DATE_TIME': 70,
|
||||
'ORGANIZATION': 65,
|
||||
'LOCATION': 60,
|
||||
'PERSON': 50,
|
||||
|
||||
@@ -24,7 +24,7 @@ class DateRefiner(EntityRefiner):
|
||||
"""Raffineur pour les dates - élimine les faux positifs"""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__("DATE_TIME")
|
||||
super().__init__("DATE")
|
||||
# Patterns pour valider les vraies dates
|
||||
self.valid_date_patterns = [
|
||||
# Format DD/MM/YYYY
|
||||
|
||||
Reference in New Issue
Block a user