doublon done
This commit is contained in:
@@ -10,6 +10,7 @@ allow_list:
|
|||||||
- TVA
|
- TVA
|
||||||
- IEC
|
- IEC
|
||||||
- expert-comptable
|
- expert-comptable
|
||||||
|
- prestataire
|
||||||
# Termes financiers
|
# Termes financiers
|
||||||
- Euro
|
- Euro
|
||||||
- EUR
|
- EUR
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ anonymizer_config:
|
|||||||
PERSON: replace
|
PERSON: replace
|
||||||
LOCATION: replace
|
LOCATION: replace
|
||||||
ORGANIZATION: replace
|
ORGANIZATION: replace
|
||||||
DATE_TIME: replace
|
DATE: replace
|
||||||
MONEY: replace
|
MONEY: replace
|
||||||
EMAIL_ADDRESS: replace
|
EMAIL_ADDRESS: replace
|
||||||
IBAN: replace
|
IBAN: replace
|
||||||
@@ -45,12 +45,19 @@ anonymizer_config:
|
|||||||
PERSON: "[PERSONNE]"
|
PERSON: "[PERSONNE]"
|
||||||
LOCATION: "[LIEU]"
|
LOCATION: "[LIEU]"
|
||||||
ORGANIZATION: "[ORGANISATION]"
|
ORGANIZATION: "[ORGANISATION]"
|
||||||
DATE_TIME: "[DATE]"
|
DATE: "[DATE]"
|
||||||
MONEY: "[MONTANT]"
|
MONEY: "[MONTANT]"
|
||||||
EMAIL_ADDRESS: "[EMAIL]"
|
EMAIL_ADDRESS: "[EMAIL]"
|
||||||
IBAN: "[IBAN]"
|
IBAN: "[IBAN]"
|
||||||
IP_ADDRESS: "[ADRESSE_IP]"
|
IP_ADDRESS: "[ADRESSE_IP]"
|
||||||
|
|
||||||
|
# PII Belges
|
||||||
|
BE_ENTERPRISE_NUMBER: "[ENTREPRISE_BELGE]"
|
||||||
|
BE_PHONE_NUMBER: "[TELEPHONE_BELGE]"
|
||||||
|
BE_ADDRESS: "[ADRESSE_BELGE]"
|
||||||
|
BE_ID_CARD: "[CARTE_ID_BELGE]"
|
||||||
|
BE_PASSPORT: "[PASSEPORT_BELGE]"
|
||||||
|
|
||||||
# PII Génériques - Données sensibles RGPD
|
# PII Génériques - Données sensibles RGPD
|
||||||
HEALTH_DATA: "[DONNEES_SANTE]"
|
HEALTH_DATA: "[DONNEES_SANTE]"
|
||||||
BIOMETRIC_DATA: "[DONNEES_BIOMETRIQUES]"
|
BIOMETRIC_DATA: "[DONNEES_BIOMETRIQUES]"
|
||||||
|
|||||||
@@ -15,9 +15,9 @@ nlp_configuration:
|
|||||||
ORGANIZATION: ORGANIZATION
|
ORGANIZATION: ORGANIZATION
|
||||||
LOC: LOCATION
|
LOC: LOCATION
|
||||||
LOCATION: LOCATION
|
LOCATION: LOCATION
|
||||||
DATE: DATE_TIME
|
DATE: DATE
|
||||||
TIME: DATE_TIME
|
TIME: TIME
|
||||||
MISC: DATE_TIME
|
MISC: MISC
|
||||||
labels_to_ignore:
|
labels_to_ignore:
|
||||||
- LOCATION
|
- LOCATION
|
||||||
- MISC
|
- MISC
|
||||||
@@ -26,7 +26,6 @@ nlp_configuration:
|
|||||||
- LANGUAGE
|
- LANGUAGE
|
||||||
- LAW
|
- LAW
|
||||||
- ORDINAL
|
- ORDINAL
|
||||||
- PERCENT
|
|
||||||
- PRODUCT
|
- PRODUCT
|
||||||
- QUANTITY
|
- QUANTITY
|
||||||
- WORK_OF_ART
|
- WORK_OF_ART
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ recognizer_registry:
|
|||||||
recognizers:
|
recognizers:
|
||||||
- name: DateTimeRecognizer
|
- name: DateTimeRecognizer
|
||||||
supported_language: fr
|
supported_language: fr
|
||||||
supported_entity: DATE_TIME
|
supported_entity: DATE
|
||||||
patterns:
|
patterns:
|
||||||
# Formats français standards avec différents séparateurs
|
# Formats français standards avec différents séparateurs
|
||||||
- name: Date française DD/MM/YYYY
|
- name: Date française DD/MM/YYYY
|
||||||
@@ -13,6 +13,23 @@ recognizer_registry:
|
|||||||
regex: "\\b(?:0?[1-9]|[12][0-9]|3[01])-(?:0?[1-9]|1[0-2])-(?:19|20)\\d{2}\\b"
|
regex: "\\b(?:0?[1-9]|[12][0-9]|3[01])-(?:0?[1-9]|1[0-2])-(?:19|20)\\d{2}\\b"
|
||||||
score: 0.95
|
score: 0.95
|
||||||
|
|
||||||
|
# Formats courts avec année sur deux chiffres (DD-MM-YY, DD/MM/YY, DD.MM.YY)
|
||||||
|
- name: Date française DD-MM-YY
|
||||||
|
regex: "\\b(?:0?[1-9]|[12][0-9]|3[01])-(?:0?[1-9]|1[0-2])-\\d{2}\\b"
|
||||||
|
score: 0.90
|
||||||
|
|
||||||
|
- name: Date française DD/MM/YY
|
||||||
|
regex: "\\b(?:0?[1-9]|[12][0-9]|3[01])/(?:0?[1-9]|1[0-2])/\\d{2}\\b"
|
||||||
|
score: 0.90
|
||||||
|
|
||||||
|
- name: Date belge DD.MM.YY
|
||||||
|
regex: "\\b(?:0?[1-9]|[12][0-9]|3[01])\\.(?:0?[1-9]|1[0-2])\\.\\d{2}\\b"
|
||||||
|
score: 0.90
|
||||||
|
|
||||||
|
- name: Date courte
|
||||||
|
regex: "\\b\\d{1,2}[-/.]\\d{1,2}[-/.]\\d{2}\\b"
|
||||||
|
score: 0.85
|
||||||
|
|
||||||
- name: Date française DD MM YYYY (espaces)
|
- name: Date française DD MM YYYY (espaces)
|
||||||
regex: "\\b(?:0?[1-9]|[12][0-9]|3[01])\\s+(?:0?[1-9]|1[0-2])\\s+(?:19|20)\\d{2}\\b"
|
regex: "\\b(?:0?[1-9]|[12][0-9]|3[01])\\s+(?:0?[1-9]|1[0-2])\\s+(?:19|20)\\d{2}\\b"
|
||||||
score: 0.9
|
score: 0.9
|
||||||
|
|||||||
@@ -23,7 +23,6 @@ class OverlapResolver:
|
|||||||
'IP_ADDRESS': 82,
|
'IP_ADDRESS': 82,
|
||||||
'BE_ADDRESS': 75,
|
'BE_ADDRESS': 75,
|
||||||
'FR_ADDRESS': 75,
|
'FR_ADDRESS': 75,
|
||||||
'DATE_TIME': 70,
|
|
||||||
'ORGANIZATION': 65,
|
'ORGANIZATION': 65,
|
||||||
'LOCATION': 60,
|
'LOCATION': 60,
|
||||||
'PERSON': 50,
|
'PERSON': 50,
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ class DateRefiner(EntityRefiner):
|
|||||||
"""Raffineur pour les dates - élimine les faux positifs"""
|
"""Raffineur pour les dates - élimine les faux positifs"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super().__init__("DATE_TIME")
|
super().__init__("DATE")
|
||||||
# Patterns pour valider les vraies dates
|
# Patterns pour valider les vraies dates
|
||||||
self.valid_date_patterns = [
|
self.valid_date_patterns = [
|
||||||
# Format DD/MM/YYYY
|
# Format DD/MM/YYYY
|
||||||
|
|||||||
Reference in New Issue
Block a user