presidio ok v.1
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
import { NextResponse, type NextRequest } from "next/server";
|
||||
import pdf from "pdf-parse/lib/pdf-parse";
|
||||
import pdf from "pdf-parse"; // ✅ Import correct
|
||||
import mammoth from "mammoth";
|
||||
|
||||
export async function POST(req: NextRequest) {
|
||||
@@ -27,13 +27,20 @@ export async function POST(req: NextRequest) {
|
||||
const data = await pdf(buffer);
|
||||
fileContent = data.text || "";
|
||||
console.log("✅ Extraction PDF réussie, longueur:", fileContent.length);
|
||||
|
||||
// ✅ Vérification supplémentaire
|
||||
if (!fileContent.trim()) {
|
||||
console.log("⚠️ PDF vide ou non lisible");
|
||||
return NextResponse.json(
|
||||
{ error: "Le PDF ne contient pas de texte extractible ou est protégé." },
|
||||
{ status: 400 }
|
||||
);
|
||||
}
|
||||
} catch (pdfError) {
|
||||
console.error("❌ Erreur PDF:", pdfError);
|
||||
console.error("❌ Erreur PDF détaillée:", pdfError);
|
||||
return NextResponse.json(
|
||||
{
|
||||
error: `Erreur traitement PDF: ${
|
||||
pdfError instanceof Error ? pdfError.message : "Erreur inconnue"
|
||||
}`,
|
||||
error: `Erreur traitement PDF: ${pdfError instanceof Error ? pdfError.message : "Erreur inconnue"}. Vérifiez que le PDF n'est pas protégé ou corrompu.`,
|
||||
},
|
||||
{ status: 500 }
|
||||
);
|
||||
@@ -110,10 +117,10 @@ export async function POST(req: NextRequest) {
|
||||
};
|
||||
|
||||
console.log("🔍 Appel à Presidio Analyzer...");
|
||||
|
||||
const presidioAnalyzerUrl =
|
||||
"http://analyzer.151.80.20.211.sslip.io/analyze";
|
||||
|
||||
|
||||
// ✅ Définir l'URL AVANT de l'utiliser
|
||||
const presidioAnalyzerUrl = "http://analyzer.151.80.20.211.sslip.io/analyze";
|
||||
|
||||
try {
|
||||
const analyzeResponse = await fetch(presidioAnalyzerUrl, {
|
||||
method: "POST",
|
||||
@@ -123,11 +130,15 @@ export async function POST(req: NextRequest) {
|
||||
},
|
||||
body: JSON.stringify(analyzerConfig),
|
||||
});
|
||||
|
||||
|
||||
console.log("📊 Statut Analyzer:", analyzeResponse.status);
|
||||
console.log("📊 Headers Analyzer:", analyzeResponse.headers);
|
||||
|
||||
if (!analyzeResponse.ok) {
|
||||
const errorBody = await analyzeResponse.text();
|
||||
console.error("❌ Erreur Analyzer:", errorBody);
|
||||
console.error("❌ URL utilisée:", presidioAnalyzerUrl);
|
||||
console.error("❌ Config envoyée:", analyzerConfig);
|
||||
// Fallback: retourner juste le texte si Presidio n'est pas disponible
|
||||
return NextResponse.json({ text: fileContent }, { status: 200 });
|
||||
}
|
||||
@@ -172,6 +183,7 @@ export async function POST(req: NextRequest) {
|
||||
text: fileContent,
|
||||
anonymizedText: anonymizerResult.text,
|
||||
piiCount: analyzerResults.length,
|
||||
analyzerResults: analyzerResults,
|
||||
};
|
||||
|
||||
return NextResponse.json(result, { status: 200 });
|
||||
|
||||
@@ -17,34 +17,75 @@ export const AnonymizationInterface = ({
|
||||
|
||||
const anonymizedTypes = new Set<string>();
|
||||
|
||||
// Détecter les patterns d'anonymisation dans le texte de sortie
|
||||
// ✅ NOUVEAUX PATTERNS PRESIDIO
|
||||
|
||||
// Noms (PERSON)
|
||||
if (outputText.includes("<PERSON>")) {
|
||||
anonymizedTypes.add("Prénoms");
|
||||
anonymizedTypes.add("Noms de famille");
|
||||
anonymizedTypes.add("Noms complets");
|
||||
}
|
||||
|
||||
// Noms (Prénoms, Noms de famille, Noms complets)
|
||||
// Emails (EMAIL_ADDRESS)
|
||||
if (outputText.includes("<EMAIL_ADDRESS>")) {
|
||||
anonymizedTypes.add("Adresses e-mail");
|
||||
}
|
||||
|
||||
// Téléphones (PHONE_NUMBER)
|
||||
if (outputText.includes("<PHONE_NUMBER>")) {
|
||||
anonymizedTypes.add("Numéros de téléphone");
|
||||
}
|
||||
|
||||
// Adresses (LOCATION)
|
||||
if (outputText.includes("<LOCATION>")) {
|
||||
anonymizedTypes.add("Adresses");
|
||||
}
|
||||
|
||||
// IBAN (IBAN)
|
||||
if (outputText.includes("<IBAN>")) {
|
||||
anonymizedTypes.add("Numéros d'ID"); // Ou créer une nouvelle catégorie "IBAN"
|
||||
}
|
||||
|
||||
// Organisations (ORGANIZATION)
|
||||
if (outputText.includes("<ORGANIZATION>")) {
|
||||
anonymizedTypes.add("Noms de domaine"); // Ou adapter selon vos besoins
|
||||
}
|
||||
|
||||
// Dates personnalisées (CUSTOM_DATE)
|
||||
if (outputText.includes("<CUSTOM_DATE>")) {
|
||||
anonymizedTypes.add("Dates");
|
||||
}
|
||||
|
||||
// Numéros d'entreprise belges (BE_ENTERPRISE_NUMBER)
|
||||
if (outputText.includes("<BE_ENTERPRISE_NUMBER>")) {
|
||||
anonymizedTypes.add("Numéros d'ID");
|
||||
}
|
||||
|
||||
// ✅ ANCIENS PATTERNS (pour compatibilité)
|
||||
|
||||
// Noms (anciens patterns [Nom1], [Nom2]...)
|
||||
if (outputText.includes("[Nom1]") || outputText.includes("[Nom")) {
|
||||
anonymizedTypes.add("Prénoms");
|
||||
anonymizedTypes.add("Noms de famille");
|
||||
anonymizedTypes.add("Noms complets");
|
||||
}
|
||||
|
||||
// Emails
|
||||
// Emails (anciens patterns)
|
||||
if (outputText.includes("[Email1]") || outputText.includes("[Email")) {
|
||||
anonymizedTypes.add("Adresses e-mail");
|
||||
}
|
||||
|
||||
// Téléphones
|
||||
if (
|
||||
outputText.includes("[Téléphone1]") ||
|
||||
outputText.includes("[Téléphone")
|
||||
) {
|
||||
// Téléphones (anciens patterns)
|
||||
if (outputText.includes("[Téléphone1]") || outputText.includes("[Téléphone")) {
|
||||
anonymizedTypes.add("Numéros de téléphone");
|
||||
}
|
||||
|
||||
// Adresses
|
||||
// Adresses (anciens patterns)
|
||||
if (outputText.includes("[Adresse1]") || outputText.includes("[Adresse")) {
|
||||
anonymizedTypes.add("Adresses");
|
||||
}
|
||||
|
||||
// Numéros d'ID / Sécurité sociale
|
||||
// Numéros d'ID / Sécurité sociale (anciens patterns)
|
||||
if (
|
||||
outputText.includes("[NuméroSS1]") ||
|
||||
outputText.includes("[NuméroSS") ||
|
||||
@@ -53,14 +94,6 @@ export const AnonymizationInterface = ({
|
||||
anonymizedTypes.add("Numéros d'ID");
|
||||
}
|
||||
|
||||
// Dates
|
||||
if (
|
||||
outputText.includes("[Date") ||
|
||||
/\[\d{2}\/\d{2}\/\d{4}\]/.test(outputText)
|
||||
) {
|
||||
anonymizedTypes.add("Dates");
|
||||
}
|
||||
|
||||
// Valeurs monétaires
|
||||
if (outputText.includes("[Montant") || /\[\d+[€$]\]/.test(outputText)) {
|
||||
anonymizedTypes.add("Valeurs monétaires");
|
||||
|
||||
@@ -8,6 +8,29 @@ interface EntityMapping {
|
||||
endIndex: number;
|
||||
}
|
||||
|
||||
// One entity detection as returned by the Presidio Analyzer.
// The hook maps each of these to an EntityMapping after a successful
// /api/process-document call.
|
||||
interface PresidioAnalyzerResult {
|
||||
// Entity label reported by the analyzer; copied to EntityMapping.entityType
// and used to build the mapping's placeholder text.
entity_type: string;
|
||||
// Start bound of the match; used as the first argument to
// `text.substring(start, end)` on the response's `text`.
start: number;
|
||||
// End bound of the match; used as the second argument to that same
// substring call.
end: number;
|
||||
// NOTE(review): presumably the detection confidence — not read anywhere in
// the visible code; confirm against the Presidio Analyzer response schema.
score: number;
|
||||
// Optional details about how the match was made; not read in the visible code.
// NOTE(review): field meanings below are assumed from their names — confirm.
analysis_explanation?: {
|
||||
recognizer: string;
|
||||
pattern_name?: string;
|
||||
pattern?: string;
|
||||
validation_result?: boolean;
|
||||
};
|
||||
}
|
||||
|
||||
// JSON body returned by POST /api/process-document.
// Every field is optional because the route answers with different shapes:
// a full result, a text-only fallback, or an error payload.
|
||||
interface ProcessDocumentResponse {
|
||||
// Text extracted from the uploaded file. When the analyzer call fails, the
// route returns this field alone and the hook shows it with a warning.
text?: string;
|
||||
// Anonymized text taken from the anonymizer result; its presence is what
// the hook treats as a successful anonymization.
anonymizedText?: string;
|
||||
// Number of analyzer results (set to analyzerResults.length by the route).
piiCount?: number;
|
||||
// Raw analyzer detections; combined with `text` to build entity mappings.
analyzerResults?: PresidioAnalyzerResult[];
|
||||
// Error message set by the route on failure; the hook rethrows it as an Error.
error?: string;
|
||||
}
|
||||
|
||||
interface AnonymizationLogicProps {
|
||||
sourceText: string;
|
||||
fileContent: string;
|
||||
@@ -31,9 +54,7 @@ export const useAnonymization = ({
|
||||
const textToProcess = sourceText || fileContent || "";
|
||||
|
||||
if (!textToProcess.trim()) {
|
||||
setError(
|
||||
"Veuillez saisir du texte à anonymiser ou télécharger un fichier"
|
||||
);
|
||||
setError("Veuillez saisir du texte à anonymiser ou télécharger un fichier");
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -43,131 +64,65 @@ export const useAnonymization = ({
|
||||
setEntityMappings([]);
|
||||
|
||||
try {
|
||||
if (
|
||||
uploadedFile &&
|
||||
uploadedFile.type === "application/pdf" &&
|
||||
!fileContent
|
||||
) {
|
||||
const formData = new FormData();
|
||||
formData.append("file", uploadedFile);
|
||||
|
||||
const response = await fetch("/api/process-document", {
|
||||
method: "POST",
|
||||
body: formData,
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error("Erreur lors du traitement du PDF");
|
||||
}
|
||||
|
||||
const data = await response.json();
|
||||
|
||||
if (data.error) {
|
||||
throw new Error(data.error);
|
||||
}
|
||||
|
||||
if (data.anonymizedText) {
|
||||
setOutputText(data.anonymizedText);
|
||||
// TODO: Extraire les mappings depuis les résultats Presidio
|
||||
setIsProcessing(false);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
await new Promise((resolve) => setTimeout(resolve, 1500));
|
||||
|
||||
// Simulation des mappings pour le fallback
|
||||
const mappings: EntityMapping[] = [];
|
||||
let anonymized = textToProcess;
|
||||
console.log("🚀 Début anonymisation avec Presidio");
|
||||
|
||||
// Noms
|
||||
const nameMatches = textToProcess.matchAll(/\b[A-Z][a-z]+ [A-Z][a-z]+\b/g);
|
||||
let nameCounter = 1;
|
||||
for (const match of nameMatches) {
|
||||
const replacement = `[Nom${nameCounter}]`;
|
||||
mappings.push({
|
||||
originalValue: match[0],
|
||||
anonymizedValue: replacement,
|
||||
entityType: "PERSON",
|
||||
startIndex: match.index!,
|
||||
endIndex: match.index! + match[0].length
|
||||
});
|
||||
anonymized = anonymized.replace(match[0], replacement);
|
||||
nameCounter++;
|
||||
const formData = new FormData();
|
||||
|
||||
if (uploadedFile) {
|
||||
console.log("📁 Traitement fichier:", uploadedFile.name);
|
||||
formData.append("file", uploadedFile);
|
||||
} else {
|
||||
console.log("📝 Traitement texte saisi");
|
||||
const textBlob = new Blob([textToProcess], { type: "text/plain" });
|
||||
const textFile = new File([textBlob], "input.txt", { type: "text/plain" });
|
||||
formData.append("file", textFile);
|
||||
}
|
||||
|
||||
// Téléphones
|
||||
const phoneMatches = textToProcess.matchAll(/\b0[1-9](?:[\s.-]?\d{2}){4}\b/g);
|
||||
let phoneCounter = 1;
|
||||
for (const match of phoneMatches) {
|
||||
const replacement = `[Téléphone${phoneCounter}]`;
|
||||
mappings.push({
|
||||
originalValue: match[0],
|
||||
anonymizedValue: replacement,
|
||||
entityType: "PHONE_NUMBER",
|
||||
startIndex: match.index!,
|
||||
endIndex: match.index! + match[0].length
|
||||
});
|
||||
anonymized = anonymized.replace(match[0], replacement);
|
||||
phoneCounter++;
|
||||
console.log("🔍 Appel à /api/process-document avec Presidio...");
|
||||
const response = await fetch("/api/process-document", {
|
||||
method: "POST",
|
||||
body: formData,
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error(`Erreur API: ${response.status}`);
|
||||
}
|
||||
|
||||
// Emails
|
||||
const emailMatches = textToProcess.matchAll(/\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b/g);
|
||||
let emailCounter = 1;
|
||||
for (const match of emailMatches) {
|
||||
const replacement = `[Email${emailCounter}]`;
|
||||
mappings.push({
|
||||
originalValue: match[0],
|
||||
anonymizedValue: replacement,
|
||||
entityType: "EMAIL_ADDRESS",
|
||||
startIndex: match.index!,
|
||||
endIndex: match.index! + match[0].length
|
||||
});
|
||||
anonymized = anonymized.replace(match[0], replacement);
|
||||
emailCounter++;
|
||||
const data: ProcessDocumentResponse = await response.json();
|
||||
console.log("📊 Réponse API:", data);
|
||||
|
||||
if (data.error) {
|
||||
throw new Error(data.error);
|
||||
}
|
||||
|
||||
// Adresses
|
||||
const addressMatches = textToProcess.matchAll(/\b\d{1,3}\s+[a-zA-Z\s]+,\s*\d{5}\s+[a-zA-Z\s]+\b/g);
|
||||
let addressCounter = 1;
|
||||
for (const match of addressMatches) {
|
||||
const replacement = `[Adresse${addressCounter}]`;
|
||||
mappings.push({
|
||||
originalValue: match[0],
|
||||
anonymizedValue: replacement,
|
||||
entityType: "LOCATION",
|
||||
startIndex: match.index!,
|
||||
endIndex: match.index! + match[0].length
|
||||
});
|
||||
anonymized = anonymized.replace(match[0], replacement);
|
||||
addressCounter++;
|
||||
if (data.anonymizedText) {
|
||||
console.log("✅ Anonymisation réussie avec Presidio");
|
||||
setOutputText(data.anonymizedText);
|
||||
|
||||
// Extraire les mappings depuis les résultats Presidio (plus d'erreur 'any')
|
||||
if (data.analyzerResults && data.text) {
|
||||
const mappings: EntityMapping[] = data.analyzerResults.map((entity: PresidioAnalyzerResult, index: number) => ({
|
||||
originalValue: data.text!.substring(entity.start, entity.end),
|
||||
anonymizedValue: `[${entity.entity_type}${index + 1}]`,
|
||||
entityType: entity.entity_type,
|
||||
startIndex: entity.start,
|
||||
endIndex: entity.end
|
||||
}));
|
||||
setEntityMappings(mappings);
|
||||
console.log("📋 Entités détectées:", mappings.length);
|
||||
console.log("🔍 Détails des entités:", mappings);
|
||||
}
|
||||
} else if (data.text) {
|
||||
console.log("⚠️ Fallback: Presidio non disponible, texte original retourné");
|
||||
setOutputText(data.text);
|
||||
setError("Presidio temporairement indisponible. Texte non anonymisé.");
|
||||
}
|
||||
|
||||
// Numéros de sécurité sociale
|
||||
const ssnMatches = textToProcess.matchAll(/\b\d\s\d{2}\s\d{2}\s\d{2}\s\d{3}\s\d{3}\s\d{2}\b/g);
|
||||
let ssnCounter = 1;
|
||||
for (const match of ssnMatches) {
|
||||
const replacement = `[NuméroSS${ssnCounter}]`;
|
||||
mappings.push({
|
||||
originalValue: match[0],
|
||||
anonymizedValue: replacement,
|
||||
entityType: "FR_NIR",
|
||||
startIndex: match.index!,
|
||||
endIndex: match.index! + match[0].length
|
||||
});
|
||||
anonymized = anonymized.replace(match[0], replacement);
|
||||
ssnCounter++;
|
||||
}
|
||||
|
||||
setOutputText(anonymized);
|
||||
setEntityMappings(mappings);
|
||||
} catch (error) {
|
||||
console.error("Erreur anonymisation:", error);
|
||||
console.error("❌ Erreur anonymisation:", error);
|
||||
setError(
|
||||
error instanceof Error
|
||||
? error.message
|
||||
: "Erreur lors de l'anonymisation"
|
||||
? `Erreur Presidio: ${error.message}`
|
||||
: "Erreur lors de l'anonymisation avec Presidio"
|
||||
);
|
||||
} finally {
|
||||
setIsProcessing(false);
|
||||
|
||||
@@ -1,31 +1,89 @@
|
||||
export const highlightEntities = (text: string) => {
|
||||
import { ReactNode } from 'react';
|
||||
|
||||
export const highlightEntities = (text: string): ReactNode => {
|
||||
if (!text) return text;
|
||||
|
||||
const entityPattern = /\[([^\]]+)\]/g;
|
||||
const parts = [];
|
||||
let lastIndex = 0;
|
||||
let match;
|
||||
// Patterns pour les différents types d'entités Presidio
|
||||
const patterns = [
|
||||
// ✅ Patterns Presidio existants
|
||||
{ regex: /<PERSON>/g, className: "bg-blue-200 text-blue-800", label: "Personne" },
|
||||
{ regex: /<EMAIL_ADDRESS>/g, className: "bg-green-200 text-green-800", label: "Email" },
|
||||
{ regex: /<PHONE_NUMBER>/g, className: "bg-purple-200 text-purple-800", label: "Téléphone" },
|
||||
{ regex: /<LOCATION>/g, className: "bg-red-200 text-red-800", label: "Lieu" },
|
||||
{ regex: /<IBAN>/g, className: "bg-yellow-200 text-yellow-800", label: "IBAN" },
|
||||
{ regex: /<ORGANIZATION>/g, className: "bg-indigo-200 text-indigo-800", label: "Organisation" },
|
||||
|
||||
// 🆕 Patterns spécifiques détectés dans votre texte
|
||||
{ regex: /<FLEXIBLE_DATE>/g, className: "bg-pink-200 text-pink-800", label: "Date" },
|
||||
{ regex: /<BE_ADDRESS>/g, className: "bg-cyan-200 text-cyan-800", label: "Adresse BE" },
|
||||
{ regex: /<BE_PHONE_NUMBER>/g, className: "bg-violet-200 text-violet-800", label: "Tél. BE" },
|
||||
{ regex: /<BE_ENTERPRISE_NUMBER>/g, className: "bg-orange-200 text-orange-800", label: "N° Entreprise BE" },
|
||||
{ regex: /<BE_PRO_ID>/g, className: "bg-emerald-200 text-emerald-800", label: "ID Professionnel BE" },
|
||||
{ regex: /<IP_ADDRESS>/g, className: "bg-slate-200 text-slate-800", label: "Adresse IP" },
|
||||
|
||||
// Anciens patterns (pour compatibilité)
|
||||
{ regex: /\[([^\]]+)\]/g, className: "bg-[#f7ab6e] text-[#092727]", label: "Anonymisé" },
|
||||
];
|
||||
|
||||
while ((match = entityPattern.exec(text)) !== null) {
|
||||
if (match.index > lastIndex) {
|
||||
parts.push(text.slice(lastIndex, match.index));
|
||||
const replacements: Array<{ start: number; end: number; element: ReactNode }> = [];
|
||||
|
||||
// Trouver toutes les correspondances
|
||||
patterns.forEach((pattern, patternIndex) => {
|
||||
const regex = new RegExp(pattern.regex.source, pattern.regex.flags);
|
||||
let match;
|
||||
|
||||
while ((match = regex.exec(text)) !== null) {
|
||||
const start = match.index;
|
||||
const end = match.index + match[0].length;
|
||||
|
||||
// Vérifier qu'il n'y a pas de chevauchement avec des remplacements existants
|
||||
const hasOverlap = replacements.some(r =>
|
||||
(start >= r.start && start < r.end) || (end > r.start && end <= r.end)
|
||||
);
|
||||
|
||||
if (!hasOverlap) {
|
||||
const element = (
|
||||
<span
|
||||
key={`${patternIndex}-${start}`}
|
||||
className={`${pattern.className} px-2 py-1 rounded-md font-medium text-xs inline-block mx-0.5 shadow-sm border`}
|
||||
title={`${pattern.label} anonymisé`}
|
||||
>
|
||||
{match[0]}
|
||||
</span>
|
||||
);
|
||||
|
||||
replacements.push({ start, end, element });
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
parts.push(
|
||||
<span
|
||||
key={match.index}
|
||||
className="bg-[#f7ab6e] text-[#092727] px-1 py-0.5 rounded font-medium"
|
||||
>
|
||||
{match[0]}
|
||||
</span>
|
||||
);
|
||||
// Trier les remplacements par position
|
||||
replacements.sort((a, b) => a.start - b.start);
|
||||
|
||||
lastIndex = match.index + match[0].length;
|
||||
// Construire le résultat final
|
||||
if (replacements.length === 0) {
|
||||
return text;
|
||||
}
|
||||
|
||||
const parts: ReactNode[] = [];
|
||||
let lastIndex = 0;
|
||||
|
||||
replacements.forEach((replacement) => {
|
||||
// Ajouter le texte avant le remplacement
|
||||
if (replacement.start > lastIndex) {
|
||||
parts.push(text.slice(lastIndex, replacement.start));
|
||||
}
|
||||
|
||||
// Ajouter l'élément de remplacement
|
||||
parts.push(replacement.element);
|
||||
|
||||
lastIndex = replacement.end;
|
||||
});
|
||||
|
||||
// Ajouter le texte restant
|
||||
if (lastIndex < text.length) {
|
||||
parts.push(text.slice(lastIndex));
|
||||
}
|
||||
|
||||
return parts.length > 0 ? parts : text;
|
||||
return parts;
|
||||
};
|
||||
Reference in New Issue
Block a user