presidio ok v.1

2025-07-28 20:55:11 +02:00
parent 6d12017561
commit dc734e08f0
4 changed files with 221 additions and 163 deletions
--- a/app/api/process-document/route.ts
+++ b/app/api/process-document/route.ts
@@ -1,5 +1,5 @@
 import { NextResponse, type NextRequest } from "next/server";
-import pdf from "pdf-parse/lib/pdf-parse";
+import pdf from "pdf-parse"; // ✅ Import correct
 import mammoth from "mammoth";

 export async function POST(req: NextRequest) {
@@ -27,13 +27,20 @@ export async function POST(req: NextRequest) {
        const data = await pdf(buffer);
        fileContent = data.text || "";
        console.log("✅ Extraction PDF réussie, longueur:", fileContent.length);
+        
+        // ✅ Vérification supplémentaire
+        if (!fileContent.trim()) {
+          console.log("⚠️ PDF vide ou non lisible");
+          return NextResponse.json(
+            { error: "Le PDF ne contient pas de texte extractible ou est protégé." },
+            { status: 400 }
+          );
+        }
      } catch (pdfError) {
-        console.error("❌ Erreur PDF:", pdfError);
+        console.error("❌ Erreur PDF détaillée:", pdfError);
        return NextResponse.json(
          {
-            error: `Erreur traitement PDF: ${
-              pdfError instanceof Error ? pdfError.message : "Erreur inconnue"
-            }`,
+            error: `Erreur traitement PDF: ${pdfError instanceof Error ? pdfError.message : "Erreur inconnue"}. Vérifiez que le PDF n'est pas protégé ou corrompu.`,
          },
          { status: 500 }
        );
@@ -110,10 +117,10 @@ export async function POST(req: NextRequest) {
    };

    console.log("🔍 Appel à Presidio Analyzer...");
-
-    const presidioAnalyzerUrl =
-      "http://analyzer.151.80.20.211.sslip.io/analyze";
-
+    
+    // ✅ Définir l'URL AVANT de l'utiliser
+    const presidioAnalyzerUrl = "http://analyzer.151.80.20.211.sslip.io/analyze";
+    
    try {
      const analyzeResponse = await fetch(presidioAnalyzerUrl, {
        method: "POST",
@@ -123,11 +130,15 @@ export async function POST(req: NextRequest) {
        },
        body: JSON.stringify(analyzerConfig),
      });
-
+      
      console.log("📊 Statut Analyzer:", analyzeResponse.status);
+      console.log("📊 Headers Analyzer:", analyzeResponse.headers);
+      
      if (!analyzeResponse.ok) {
        const errorBody = await analyzeResponse.text();
        console.error("❌ Erreur Analyzer:", errorBody);
+        console.error("❌ URL utilisée:", presidioAnalyzerUrl);
+        console.error("❌ Config envoyée:", analyzerConfig);
        // Fallback: retourner juste le texte si Presidio n'est pas disponible
        return NextResponse.json({ text: fileContent }, { status: 200 });
      }
@@ -172,6 +183,7 @@ export async function POST(req: NextRequest) {
        text: fileContent,
        anonymizedText: anonymizerResult.text,
        piiCount: analyzerResults.length,
+        analyzerResults: analyzerResults,
      };

      return NextResponse.json(result, { status: 200 });
--- a/app/components/AnonymizationInterface.tsx
+++ b/app/components/AnonymizationInterface.tsx
@@ -17,34 +17,75 @@ export const AnonymizationInterface = ({

    const anonymizedTypes = new Set<string>();

-    // Détecter les patterns d'anonymisation dans le texte de sortie
+    // ✅ NOUVEAUX PATTERNS PRESIDIO
+    
+    // Noms (PERSON)
+    if (outputText.includes("<PERSON>")) {
+      anonymizedTypes.add("Prénoms");
+      anonymizedTypes.add("Noms de famille");
+      anonymizedTypes.add("Noms complets");
+    }

-    // Noms (Prénoms, Noms de famille, Noms complets)
+    // Emails (EMAIL_ADDRESS)
+    if (outputText.includes("<EMAIL_ADDRESS>")) {
+      anonymizedTypes.add("Adresses e-mail");
+    }
+
+    // Téléphones (PHONE_NUMBER)
+    if (outputText.includes("<PHONE_NUMBER>")) {
+      anonymizedTypes.add("Numéros de téléphone");
+    }
+
+    // Adresses (LOCATION)
+    if (outputText.includes("<LOCATION>")) {
+      anonymizedTypes.add("Adresses");
+    }
+
+    // IBAN (IBAN)
+    if (outputText.includes("<IBAN>")) {
+      anonymizedTypes.add("Numéros d'ID"); // Ou créer une nouvelle catégorie "IBAN"
+    }
+
+    // Organisations (ORGANIZATION)
+    if (outputText.includes("<ORGANIZATION>")) {
+      anonymizedTypes.add("Noms de domaine"); // Ou adapter selon vos besoins
+    }
+
+    // Dates personnalisées (CUSTOM_DATE)
+    if (outputText.includes("<CUSTOM_DATE>")) {
+      anonymizedTypes.add("Dates");
+    }
+
+    // Numéros d'entreprise belges (BE_ENTERPRISE_NUMBER)
+    if (outputText.includes("<BE_ENTERPRISE_NUMBER>")) {
+      anonymizedTypes.add("Numéros d'ID");
+    }
+
+    // ✅ ANCIENS PATTERNS (pour compatibilité)
+    
+    // Noms (anciens patterns [Nom1], [Nom2]...)
    if (outputText.includes("[Nom1]") || outputText.includes("[Nom")) {
      anonymizedTypes.add("Prénoms");
      anonymizedTypes.add("Noms de famille");
      anonymizedTypes.add("Noms complets");
    }

-    // Emails
+    // Emails (anciens patterns)
    if (outputText.includes("[Email1]") || outputText.includes("[Email")) {
      anonymizedTypes.add("Adresses e-mail");
    }

-    // Téléphones
-    if (
-      outputText.includes("[Téléphone1]") ||
-      outputText.includes("[Téléphone")
-    ) {
+    // Téléphones (anciens patterns)
+    if (outputText.includes("[Téléphone1]") || outputText.includes("[Téléphone")) {
      anonymizedTypes.add("Numéros de téléphone");
    }

-    // Adresses
+    // Adresses (anciens patterns)
    if (outputText.includes("[Adresse1]") || outputText.includes("[Adresse")) {
      anonymizedTypes.add("Adresses");
    }

-    // Numéros d'ID / Sécurité sociale
+    // Numéros d'ID / Sécurité sociale (anciens patterns)
    if (
      outputText.includes("[NuméroSS1]") ||
      outputText.includes("[NuméroSS") ||
@@ -53,14 +94,6 @@ export const AnonymizationInterface = ({
      anonymizedTypes.add("Numéros d'ID");
    }

-    // Dates
-    if (
-      outputText.includes("[Date") ||
-      /\[\d{2}\/\d{2}\/\d{4}\]/.test(outputText)
-    ) {
-      anonymizedTypes.add("Dates");
-    }
-
    // Valeurs monétaires
    if (outputText.includes("[Montant") || /\[\d+[€$]\]/.test(outputText)) {
      anonymizedTypes.add("Valeurs monétaires");
--- a/app/components/AnonymizationLogic.tsx
+++ b/app/components/AnonymizationLogic.tsx
@@ -8,6 +8,29 @@ interface EntityMapping {
  endIndex: number;
 }

+// One entity detection as returned in the `analyzerResults` array of the /api/process-document response (Presidio Analyzer output)
+interface PresidioAnalyzerResult {
+  entity_type: string; // entity label, copied as-is into EntityMapping.entityType (presumably Presidio names such as "PERSON" — confirm)
+  start: number; // start offset into the response `text`, used as the first argument of String.substring
+  end: number; // end offset into the response `text`, used as the second (exclusive) argument of String.substring
+  score: number; // NOTE(review): presumably the detection confidence — not read by the visible code, confirm against Presidio docs
+  analysis_explanation?: { // optional details about the detection; not read by the visible code
+    recognizer: string;
+    pattern_name?: string;
+    pattern?: string;
+    validation_result?: boolean;
+  };
+}
+
+// JSON body returned by POST /api/process-document; every field is optional because success, fallback and error responses differ
+interface ProcessDocumentResponse {
+  text?: string; // text extracted from the uploaded document (the only field sent when the Analyzer call fails)
+  anonymizedText?: string; // anonymized text; when absent the caller falls back to `text`
+  piiCount?: number; // number of analyzer results (analyzerResults.length on the server side)
+  analyzerResults?: PresidioAnalyzerResult[]; // raw analyzer detections, used to build the entity mappings
+  error?: string; // human-readable error message; when present the caller throws it
+}
+
 interface AnonymizationLogicProps {
  sourceText: string;
  fileContent: string;
@@ -31,9 +54,7 @@ export const useAnonymization = ({
    const textToProcess = sourceText || fileContent || "";

    if (!textToProcess.trim()) {
-      setError(
-        "Veuillez saisir du texte à anonymiser ou télécharger un fichier"
-      );
+      setError("Veuillez saisir du texte à anonymiser ou télécharger un fichier");
      return;
    }

@@ -43,131 +64,65 @@ export const useAnonymization = ({
    setEntityMappings([]);

    try {
-      if (
-        uploadedFile &&
-        uploadedFile.type === "application/pdf" &&
-        !fileContent
-      ) {
-        const formData = new FormData();
-        formData.append("file", uploadedFile);
-
-        const response = await fetch("/api/process-document", {
-          method: "POST",
-          body: formData,
-        });
-
-        if (!response.ok) {
-          throw new Error("Erreur lors du traitement du PDF");
-        }
-
-        const data = await response.json();
-
-        if (data.error) {
-          throw new Error(data.error);
-        }
-        
-        if (data.anonymizedText) {
-          setOutputText(data.anonymizedText);
-          // TODO: Extraire les mappings depuis les résultats Presidio
-          setIsProcessing(false);
-          return;
-        }
-      }
-
-      await new Promise((resolve) => setTimeout(resolve, 1500));
-
-      // Simulation des mappings pour le fallback
-      const mappings: EntityMapping[] = [];
-      let anonymized = textToProcess;
+      console.log("🚀 Début anonymisation avec Presidio");
      
-      // Noms
-      const nameMatches = textToProcess.matchAll(/\b[A-Z][a-z]+ [A-Z][a-z]+\b/g);
-      let nameCounter = 1;
-      for (const match of nameMatches) {
-        const replacement = `[Nom${nameCounter}]`;
-        mappings.push({
-          originalValue: match[0],
-          anonymizedValue: replacement,
-          entityType: "PERSON",
-          startIndex: match.index!,
-          endIndex: match.index! + match[0].length
-        });
-        anonymized = anonymized.replace(match[0], replacement);
-        nameCounter++;
+      const formData = new FormData();
+      
+      if (uploadedFile) {
+        console.log("📁 Traitement fichier:", uploadedFile.name);
+        formData.append("file", uploadedFile);
+      } else {
+        console.log("📝 Traitement texte saisi");
+        const textBlob = new Blob([textToProcess], { type: "text/plain" });
+        const textFile = new File([textBlob], "input.txt", { type: "text/plain" });
+        formData.append("file", textFile);
      }

-      // Téléphones
-      const phoneMatches = textToProcess.matchAll(/\b0[1-9](?:[\s.-]?\d{2}){4}\b/g);
-      let phoneCounter = 1;
-      for (const match of phoneMatches) {
-        const replacement = `[Téléphone${phoneCounter}]`;
-        mappings.push({
-          originalValue: match[0],
-          anonymizedValue: replacement,
-          entityType: "PHONE_NUMBER",
-          startIndex: match.index!,
-          endIndex: match.index! + match[0].length
-        });
-        anonymized = anonymized.replace(match[0], replacement);
-        phoneCounter++;
+      console.log("🔍 Appel à /api/process-document avec Presidio...");
+      const response = await fetch("/api/process-document", {
+        method: "POST",
+        body: formData,
+      });
+
+      if (!response.ok) {
+        throw new Error(`Erreur API: ${response.status}`);
      }

-      // Emails
-      const emailMatches = textToProcess.matchAll(/\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b/g);
-      let emailCounter = 1;
-      for (const match of emailMatches) {
-        const replacement = `[Email${emailCounter}]`;
-        mappings.push({
-          originalValue: match[0],
-          anonymizedValue: replacement,
-          entityType: "EMAIL_ADDRESS",
-          startIndex: match.index!,
-          endIndex: match.index! + match[0].length
-        });
-        anonymized = anonymized.replace(match[0], replacement);
-        emailCounter++;
+      const data: ProcessDocumentResponse = await response.json();
+      console.log("📊 Réponse API:", data);
+
+      if (data.error) {
+        throw new Error(data.error);
      }

-      // Adresses
-      const addressMatches = textToProcess.matchAll(/\b\d{1,3}\s+[a-zA-Z\s]+,\s*\d{5}\s+[a-zA-Z\s]+\b/g);
-      let addressCounter = 1;
-      for (const match of addressMatches) {
-        const replacement = `[Adresse${addressCounter}]`;
-        mappings.push({
-          originalValue: match[0],
-          anonymizedValue: replacement,
-          entityType: "LOCATION",
-          startIndex: match.index!,
-          endIndex: match.index! + match[0].length
-        });
-        anonymized = anonymized.replace(match[0], replacement);
-        addressCounter++;
+      if (data.anonymizedText) {
+        console.log("✅ Anonymisation réussie avec Presidio");
+        setOutputText(data.anonymizedText);
+        
+        // Extraire les mappings depuis les résultats Presidio (plus d'erreur 'any')
+        if (data.analyzerResults && data.text) {
+          const mappings: EntityMapping[] = data.analyzerResults.map((entity: PresidioAnalyzerResult, index: number) => ({
+            originalValue: data.text!.substring(entity.start, entity.end),
+            anonymizedValue: `[${entity.entity_type}${index + 1}]`,
+            entityType: entity.entity_type,
+            startIndex: entity.start,
+            endIndex: entity.end
+          }));
+          setEntityMappings(mappings);
+          console.log("📋 Entités détectées:", mappings.length);
+          console.log("🔍 Détails des entités:", mappings);
+        }
+      } else if (data.text) {
+        console.log("⚠️ Fallback: Presidio non disponible, texte original retourné");
+        setOutputText(data.text);
+        setError("Presidio temporairement indisponible. Texte non anonymisé.");
      }
-
-      // Numéros de sécurité sociale
-      const ssnMatches = textToProcess.matchAll(/\b\d\s\d{2}\s\d{2}\s\d{2}\s\d{3}\s\d{3}\s\d{2}\b/g);
-      let ssnCounter = 1;
-      for (const match of ssnMatches) {
-        const replacement = `[NuméroSS${ssnCounter}]`;
-        mappings.push({
-          originalValue: match[0],
-          anonymizedValue: replacement,
-          entityType: "FR_NIR",
-          startIndex: match.index!,
-          endIndex: match.index! + match[0].length
-        });
-        anonymized = anonymized.replace(match[0], replacement);
-        ssnCounter++;
-      }
-
-      setOutputText(anonymized);
-      setEntityMappings(mappings);
    } catch (error) {
-      console.error("Erreur anonymisation:", error);
+      console.error("❌ Erreur anonymisation:", error);
      setError(
        error instanceof Error
-          ? error.message
-          : "Erreur lors de l'anonymisation"
+          ? `Erreur Presidio: ${error.message}`
+          : "Erreur lors de l'anonymisation avec Presidio"
      );
    } finally {
      setIsProcessing(false);
--- a/app/utils/highlightEntities.tsx
+++ b/app/utils/highlightEntities.tsx
@@ -1,31 +1,89 @@
-export const highlightEntities = (text: string) => {
+import { ReactNode } from 'react';
+
+/** Wraps each anonymization placeholder found in `text` in a styled span; returns `text` unchanged when it is empty or contains no placeholder. */
+export const highlightEntities = (text: string): ReactNode => {
  if (!text) return text;

-  const entityPattern = /\[([^\]]+)\]/g;
-  const parts = [];
-  let lastIndex = 0;
-  let match;
+  // Placeholder patterns, in priority order: an earlier pattern wins when two matches intersect
+  const patterns = [
+    // Standard Presidio placeholders
+    { regex: /<PERSON>/g, className: "bg-blue-200 text-blue-800", label: "Personne" },
+    { regex: /<EMAIL_ADDRESS>/g, className: "bg-green-200 text-green-800", label: "Email" },
+    { regex: /<PHONE_NUMBER>/g, className: "bg-purple-200 text-purple-800", label: "Téléphone" },
+    { regex: /<LOCATION>/g, className: "bg-red-200 text-red-800", label: "Lieu" },
+    { regex: /<IBAN>/g, className: "bg-yellow-200 text-yellow-800", label: "IBAN" },
+    { regex: /<ORGANIZATION>/g, className: "bg-indigo-200 text-indigo-800", label: "Organisation" },
+    
+    // Custom placeholders (dates and Belgian identifiers); CUSTOM_DATE is the date placeholder AnonymizationInterface checks for
+    { regex: /<FLEXIBLE_DATE>/g, className: "bg-pink-200 text-pink-800", label: "Date" },
+    { regex: /<CUSTOM_DATE>/g, className: "bg-pink-200 text-pink-800", label: "Date" },
+    { regex: /<BE_ADDRESS>/g, className: "bg-cyan-200 text-cyan-800", label: "Adresse BE" },
+    { regex: /<BE_PHONE_NUMBER>/g, className: "bg-violet-200 text-violet-800", label: "Tél. BE" },
+    { regex: /<BE_ENTERPRISE_NUMBER>/g, className: "bg-orange-200 text-orange-800", label: "N° Entreprise BE" },
+    { regex: /<BE_PRO_ID>/g, className: "bg-emerald-200 text-emerald-800", label: "ID Professionnel BE" },
+    { regex: /<IP_ADDRESS>/g, className: "bg-slate-200 text-slate-800", label: "Adresse IP" },
+    
+    // Legacy bracket placeholders such as [Nom1] (kept for backward compatibility)
+    { regex: /\[([^\]]+)\]/g, className: "bg-[#f7ab6e] text-[#092727]", label: "Anonymisé" },
+  ];

-  while ((match = entityPattern.exec(text)) !== null) {
-    if (match.index > lastIndex) {
-      parts.push(text.slice(lastIndex, match.index));
+  const replacements: Array<{ start: number; end: number; element: ReactNode }> = [];
+
+  // Collect every match of every pattern
+  patterns.forEach((pattern, patternIndex) => {
+    const regex = new RegExp(pattern.regex.source, pattern.regex.flags);
+    let match;
+    
+    while ((match = regex.exec(text)) !== null) {
+      const start = match.index;
+      const end = match.index + match[0].length;
+      
+      // Reject a match that intersects an accepted range; the half-open test also covers a match that fully encloses an earlier one
+      const hasOverlap = replacements.some(r => start < r.end && end > r.start);
+      
+      if (!hasOverlap) {
+        const element = (
+          <span
+            key={`${patternIndex}-${start}`}
+            className={`${pattern.className} px-2 py-1 rounded-md font-medium text-xs inline-block mx-0.5 shadow-sm border`}
+            title={`${pattern.label} anonymisé`}
+          >
+            {match[0]}
+          </span>
+        );
+        
+        replacements.push({ start, end, element });
+      }
    }
+  });

-    parts.push(
-      <span
-        key={match.index}
-        className="bg-[#f7ab6e] text-[#092727] px-1 py-0.5 rounded font-medium"
-      >
-        {match[0]}
-      </span>
-    );
+  // Order the accepted ranges by position in the text
+  replacements.sort((a, b) => a.start - b.start);

-    lastIndex = match.index + match[0].length;
+  // Nothing to highlight: return the plain string
+  if (replacements.length === 0) {
+    return text;
  }

+  const parts: ReactNode[] = [];
+  let lastIndex = 0;
+
+  replacements.forEach((replacement) => {
+    // Plain text between the previous placeholder and this one
+    if (replacement.start > lastIndex) {
+      parts.push(text.slice(lastIndex, replacement.start));
+    }
+    
+    // The highlighted placeholder itself
+    parts.push(replacement.element);
+    
+    lastIndex = replacement.end;
+  });
+
+  // Plain text after the last placeholder
  if (lastIndex < text.length) {
    parts.push(text.slice(lastIndex));
  }

-  return parts.length > 0 ? parts : text;
+  return parts;
 };