presidio ok v.1

2025-07-28 20:55:11 +02:00
parent 6d12017561
commit dc734e08f0
4 changed files with 221 additions and 163 deletions
--- a/app/api/process-document/route.ts
+++ b/app/api/process-document/route.ts
@@ -1,5 +1,5 @@
 import { NextResponse, type NextRequest } from "next/server";
-import pdf from "pdf-parse/lib/pdf-parse";
+import pdf from "pdf-parse"; // ✅ Import correct
 import mammoth from "mammoth";
 export async function POST(req: NextRequest) {
@@ -27,13 +27,20 @@ export async function POST(req: NextRequest) {
        const data = await pdf(buffer);
        fileContent = data.text || "";
        console.log("✅ Extraction PDF réussie, longueur:", fileContent.length);
        // ✅ Vérification supplémentaire
        if (!fileContent.trim()) {
          console.log("⚠️ PDF vide ou non lisible");
          return NextResponse.json(
            { error: "Le PDF ne contient pas de texte extractible ou est protégé." },
            { status: 400 }
          );
        }
      } catch (pdfError) {
-        console.error("❌ Erreur PDF:", pdfError);
+        console.error("❌ Erreur PDF détaillée:", pdfError);
        return NextResponse.json(
          {
-            error: `Erreur traitement PDF: ${
+            error: `Erreur traitement PDF: ${pdfError instanceof Error ? pdfError.message : "Erreur inconnue"}. Vérifiez que le PDF n'est pas protégé ou corrompu.`,
              pdfError instanceof Error ? pdfError.message : "Erreur inconnue"
            }`,
          },
          { status: 500 }
        );
@@ -111,8 +118,8 @@ export async function POST(req: NextRequest) {
    console.log("🔍 Appel à Presidio Analyzer...");
-    const presidioAnalyzerUrl =
+    // ✅ Définir l'URL AVANT de l'utiliser
-      "http://analyzer.151.80.20.211.sslip.io/analyze";
+    const presidioAnalyzerUrl = "http://analyzer.151.80.20.211.sslip.io/analyze";
    try {
      const analyzeResponse = await fetch(presidioAnalyzerUrl, {
@@ -125,9 +132,13 @@ export async function POST(req: NextRequest) {
      });
      console.log("📊 Statut Analyzer:", analyzeResponse.status);
      console.log("📊 Headers Analyzer:", analyzeResponse.headers);
      if (!analyzeResponse.ok) {
        const errorBody = await analyzeResponse.text();
        console.error("❌ Erreur Analyzer:", errorBody);
        console.error("❌ URL utilisée:", presidioAnalyzerUrl);
        console.error("❌ Config envoyée:", analyzerConfig);
        // Fallback: retourner juste le texte si Presidio n'est pas disponible
        return NextResponse.json({ text: fileContent }, { status: 200 });
      }
@@ -172,6 +183,7 @@ export async function POST(req: NextRequest) {
        text: fileContent,
        anonymizedText: anonymizerResult.text,
        piiCount: analyzerResults.length,
        analyzerResults: analyzerResults,
      };
      return NextResponse.json(result, { status: 200 });
--- a/app/components/AnonymizationInterface.tsx
+++ b/app/components/AnonymizationInterface.tsx
@@ -17,34 +17,75 @@ export const AnonymizationInterface = ({
    const anonymizedTypes = new Set<string>();
-    // Détecter les patterns d'anonymisation dans le texte de sortie
+    // ✅ NOUVEAUX PATTERNS PRESIDIO
-    // Noms (Prénoms, Noms de famille, Noms complets)
+    // Noms (PERSON)
    if (outputText.includes("<PERSON>")) {
      anonymizedTypes.add("Prénoms");
      anonymizedTypes.add("Noms de famille");
      anonymizedTypes.add("Noms complets");
    }
    // Emails (EMAIL_ADDRESS)
    if (outputText.includes("<EMAIL_ADDRESS>")) {
      anonymizedTypes.add("Adresses e-mail");
    }
    // Téléphones (PHONE_NUMBER)
    if (outputText.includes("<PHONE_NUMBER>")) {
      anonymizedTypes.add("Numéros de téléphone");
    }
    // Adresses (LOCATION)
    if (outputText.includes("<LOCATION>")) {
      anonymizedTypes.add("Adresses");
    }
    // IBAN (IBAN)
    if (outputText.includes("<IBAN>")) {
      anonymizedTypes.add("Numéros d'ID"); // Ou créer une nouvelle catégorie "IBAN"
    }
    // Organisations (ORGANIZATION)
    if (outputText.includes("<ORGANIZATION>")) {
      anonymizedTypes.add("Noms de domaine"); // Ou adapter selon vos besoins
    }
    // Dates personnalisées (CUSTOM_DATE)
    if (outputText.includes("<CUSTOM_DATE>")) {
      anonymizedTypes.add("Dates");
    }
    // Numéros d'entreprise belges (BE_ENTERPRISE_NUMBER)
    if (outputText.includes("<BE_ENTERPRISE_NUMBER>")) {
      anonymizedTypes.add("Numéros d'ID");
    }
    // ✅ ANCIENS PATTERNS (pour compatibilité)
    // Noms (anciens patterns [Nom1], [Nom2]...)
    if (outputText.includes("[Nom1]") || outputText.includes("[Nom")) {
      anonymizedTypes.add("Prénoms");
      anonymizedTypes.add("Noms de famille");
      anonymizedTypes.add("Noms complets");
    }
-    // Emails
+    // Emails (anciens patterns)
    if (outputText.includes("[Email1]") || outputText.includes("[Email")) {
      anonymizedTypes.add("Adresses e-mail");
    }
-    // Téléphones
+    // Téléphones (anciens patterns)
-    if (
+    if (outputText.includes("[Téléphone1]") || outputText.includes("[Téléphone")) {
      outputText.includes("[Téléphone1]") ||
      outputText.includes("[Téléphone")
    ) {
      anonymizedTypes.add("Numéros de téléphone");
    }
-    // Adresses
+    // Adresses (anciens patterns)
    if (outputText.includes("[Adresse1]") || outputText.includes("[Adresse")) {
      anonymizedTypes.add("Adresses");
    }
-    // Numéros d'ID / Sécurité sociale
+    // Numéros d'ID / Sécurité sociale (anciens patterns)
    if (
      outputText.includes("[NuméroSS1]") ||
      outputText.includes("[NuméroSS") ||
@@ -53,14 +94,6 @@ export const AnonymizationInterface = ({
      anonymizedTypes.add("Numéros d'ID");
    }
    // Dates
    if (
      outputText.includes("[Date") ||
      /\[\d{2}\/\d{2}\/\d{4}\]/.test(outputText)
    ) {
      anonymizedTypes.add("Dates");
    }
    // Valeurs monétaires
    if (outputText.includes("[Montant") || /\[\d+[€$]\]/.test(outputText)) {
      anonymizedTypes.add("Valeurs monétaires");
--- a/app/components/AnonymizationLogic.tsx
+++ b/app/components/AnonymizationLogic.tsx
@@ -8,6 +8,29 @@ interface EntityMapping {
  endIndex: number;
 }
 /**
  * One entity reported by Presidio Analyzer, as carried in the
  * `analyzerResults` array of the /api/process-document response.
  */
 interface PresidioAnalyzerResult {
  // Entity label; reused as `entityType` and inside the `[TYPE<n>]` placeholder of the derived mapping.
  entity_type: string;
  // Start offset into the response `text`; first argument of `substring` when recovering the original value.
  start: number;
  // End offset into the response `text`; second (exclusive) argument of `substring`.
  end: number;
  // presumably the detection confidence; not read in the visible code, confirm against the Presidio API.
  score: number;
  // Optional details about how the entity was recognized; not read in the visible code.
  analysis_explanation?: {
    recognizer: string;
    pattern_name?: string;
    pattern?: string;
    validation_result?: boolean;
  };
 }
 /**
  * JSON body returned by POST /api/process-document, as consumed by
  * `useAnonymization`. Every field is optional because the route answers
  * with different shapes (full result, text-only fallback, or error).
  */
 interface ProcessDocumentResponse {
  // Extracted source text; the only field present when the analyzer call fails and the route falls back.
  text?: string;
  // Anonymized text produced by the anonymizer step; its presence is what the hook treats as success.
  anonymizedText?: string;
  // Number of analyzer results (the route sends `analyzerResults.length`).
  piiCount?: number;
  // Raw analyzer results, used together with `text` to build the entity mappings.
  analyzerResults?: PresidioAnalyzerResult[];
  // Error message on failure responses; the hook throws when it is set.
  error?: string;
 }
 interface AnonymizationLogicProps {
  sourceText: string;
  fileContent: string;
@@ -31,9 +54,7 @@ export const useAnonymization = ({
    const textToProcess = sourceText || fileContent || "";
    if (!textToProcess.trim()) {
-      setError(
+      setError("Veuillez saisir du texte à anonymiser ou télécharger un fichier");
        "Veuillez saisir du texte à anonymiser ou télécharger un fichier"
      );
      return;
    }
@@ -43,131 +64,65 @@ export const useAnonymization = ({
    setEntityMappings([]);
    try {
-      if (
+      console.log("🚀 Début anonymisation avec Presidio");
-        uploadedFile &&
+      
-        uploadedFile.type === "application/pdf" &&
+      const formData = new FormData();
-        !fileContent
+      
-      ) {
+      if (uploadedFile) {
-        const formData = new FormData();
+        console.log("📁 Traitement fichier:", uploadedFile.name);
        formData.append("file", uploadedFile);
      } else {
        console.log("📝 Traitement texte saisi");
        const textBlob = new Blob([textToProcess], { type: "text/plain" });
        const textFile = new File([textBlob], "input.txt", { type: "text/plain" });
        formData.append("file", textFile);
      }
-        const response = await fetch("/api/process-document", {
+      console.log("🔍 Appel à /api/process-document avec Presidio...");
-          method: "POST",
+      const response = await fetch("/api/process-document", {
-          body: formData,
+        method: "POST",
-        });
+        body: formData,
      });
-        if (!response.ok) {
+      if (!response.ok) {
-          throw new Error("Erreur lors du traitement du PDF");
+        throw new Error(`Erreur API: ${response.status}`);
-        }
+      }
-
+
-        const data = await response.json();
+      const data: ProcessDocumentResponse = await response.json();
-
+      console.log("📊 Réponse API:", data);
-        if (data.error) {
+
-          throw new Error(data.error);
+      if (data.error) {
-        }
+        throw new Error(data.error);
-        
+      }
-        if (data.anonymizedText) {
+
-          setOutputText(data.anonymizedText);
+      if (data.anonymizedText) {
-          // TODO: Extraire les mappings depuis les résultats Presidio
+        console.log("✅ Anonymisation réussie avec Presidio");
-          setIsProcessing(false);
+        setOutputText(data.anonymizedText);
-          return;
+        
        // Extraire les mappings depuis les résultats Presidio (plus d'erreur 'any')
        if (data.analyzerResults && data.text) {
          const mappings: EntityMapping[] = data.analyzerResults.map((entity: PresidioAnalyzerResult, index: number) => ({
            originalValue: data.text!.substring(entity.start, entity.end),
            anonymizedValue: `[${entity.entity_type}${index + 1}]`,
            entityType: entity.entity_type,
            startIndex: entity.start,
            endIndex: entity.end
          }));
          setEntityMappings(mappings);
          console.log("📋 Entités détectées:", mappings.length);
          console.log("🔍 Détails des entités:", mappings);
        }
      } else if (data.text) {
        console.log("⚠️ Fallback: Presidio non disponible, texte original retourné");
        setOutputText(data.text);
        setError("Presidio temporairement indisponible. Texte non anonymisé.");
      }
      await new Promise((resolve) => setTimeout(resolve, 1500));
      // Simulation des mappings pour le fallback
      const mappings: EntityMapping[] = [];
      let anonymized = textToProcess;
      // Noms
      const nameMatches = textToProcess.matchAll(/\b[A-Z][a-z]+ [A-Z][a-z]+\b/g);
      let nameCounter = 1;
      for (const match of nameMatches) {
        const replacement = `[Nom${nameCounter}]`;
        mappings.push({
          originalValue: match[0],
          anonymizedValue: replacement,
          entityType: "PERSON",
          startIndex: match.index!,
          endIndex: match.index! + match[0].length
        });
        anonymized = anonymized.replace(match[0], replacement);
        nameCounter++;
      }
      // Téléphones
      const phoneMatches = textToProcess.matchAll(/\b0[1-9](?:[\s.-]?\d{2}){4}\b/g);
      let phoneCounter = 1;
      for (const match of phoneMatches) {
        const replacement = `[Téléphone${phoneCounter}]`;
        mappings.push({
          originalValue: match[0],
          anonymizedValue: replacement,
          entityType: "PHONE_NUMBER",
          startIndex: match.index!,
          endIndex: match.index! + match[0].length
        });
        anonymized = anonymized.replace(match[0], replacement);
        phoneCounter++;
      }
      // Emails
      const emailMatches = textToProcess.matchAll(/\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b/g);
      let emailCounter = 1;
      for (const match of emailMatches) {
        const replacement = `[Email${emailCounter}]`;
        mappings.push({
          originalValue: match[0],
          anonymizedValue: replacement,
          entityType: "EMAIL_ADDRESS",
          startIndex: match.index!,
          endIndex: match.index! + match[0].length
        });
        anonymized = anonymized.replace(match[0], replacement);
        emailCounter++;
      }
      // Adresses
      const addressMatches = textToProcess.matchAll(/\b\d{1,3}\s+[a-zA-Z\s]+,\s*\d{5}\s+[a-zA-Z\s]+\b/g);
      let addressCounter = 1;
      for (const match of addressMatches) {
        const replacement = `[Adresse${addressCounter}]`;
        mappings.push({
          originalValue: match[0],
          anonymizedValue: replacement,
          entityType: "LOCATION",
          startIndex: match.index!,
          endIndex: match.index! + match[0].length
        });
        anonymized = anonymized.replace(match[0], replacement);
        addressCounter++;
      }
      // Numéros de sécurité sociale
      const ssnMatches = textToProcess.matchAll(/\b\d\s\d{2}\s\d{2}\s\d{2}\s\d{3}\s\d{3}\s\d{2}\b/g);
      let ssnCounter = 1;
      for (const match of ssnMatches) {
        const replacement = `[NuméroSS${ssnCounter}]`;
        mappings.push({
          originalValue: match[0],
          anonymizedValue: replacement,
          entityType: "FR_NIR",
          startIndex: match.index!,
          endIndex: match.index! + match[0].length
        });
        anonymized = anonymized.replace(match[0], replacement);
        ssnCounter++;
      }
      setOutputText(anonymized);
      setEntityMappings(mappings);
    } catch (error) {
-      console.error("Erreur anonymisation:", error);
+      console.error("❌ Erreur anonymisation:", error);
      setError(
        error instanceof Error
-          ? error.message
+          ? `Erreur Presidio: ${error.message}`
-          : "Erreur lors de l'anonymisation"
+          : "Erreur lors de l'anonymisation avec Presidio"
      );
    } finally {
      setIsProcessing(false);
--- a/app/utils/highlightEntities.tsx
+++ b/app/utils/highlightEntities.tsx
@@ -1,31 +1,89 @@
-export const highlightEntities = (text: string) => {
+import { ReactNode } from 'react';
 /**
  * Wraps anonymization placeholders found in `text` in styled `<span>` elements.
  *
  * Each entry of `patterns` is scanned over the whole text; every match that does
  * not overlap an already recorded one becomes a span carrying the pattern's
  * colour classes and a tooltip built from its label. Patterns are scanned in
  * array order, so an earlier pattern wins over a later one on the same range.
  *
  * @param text - Text to decorate.
  * @returns `text` itself when it is empty or contains no placeholder; otherwise
  *   an array mixing plain string slices and span elements in document order.
  */
 export const highlightEntities = (text: string): ReactNode => {
  if (!text) return text;
-  const entityPattern = /\[([^\]]+)\]/g;
+  // Patterns for the different Presidio entity types
-  const parts = [];
+  const patterns = [
-  let lastIndex = 0;
+    // ✅ Existing Presidio patterns
-  let match;
+    { regex: /<PERSON>/g, className: "bg-blue-200 text-blue-800", label: "Personne" },
    { regex: /<EMAIL_ADDRESS>/g, className: "bg-green-200 text-green-800", label: "Email" },
    { regex: /<PHONE_NUMBER>/g, className: "bg-purple-200 text-purple-800", label: "Téléphone" },
    { regex: /<LOCATION>/g, className: "bg-red-200 text-red-800", label: "Lieu" },
    { regex: /<IBAN>/g, className: "bg-yellow-200 text-yellow-800", label: "IBAN" },
    { regex: /<ORGANIZATION>/g, className: "bg-indigo-200 text-indigo-800", label: "Organisation" },
-  while ((match = entityPattern.exec(text)) !== null) {
+    // 🆕 Specific patterns detected in your text
-    if (match.index > lastIndex) {
+    { regex: /<FLEXIBLE_DATE>/g, className: "bg-pink-200 text-pink-800", label: "Date" },
-      parts.push(text.slice(lastIndex, match.index));
+    { regex: /<BE_ADDRESS>/g, className: "bg-cyan-200 text-cyan-800", label: "Adresse BE" },
    { regex: /<BE_PHONE_NUMBER>/g, className: "bg-violet-200 text-violet-800", label: "Tél. BE" },
    { regex: /<BE_ENTERPRISE_NUMBER>/g, className: "bg-orange-200 text-orange-800", label: "N° Entreprise BE" },
    { regex: /<BE_PRO_ID>/g, className: "bg-emerald-200 text-emerald-800", label: "ID Professionnel BE" },
    { regex: /<IP_ADDRESS>/g, className: "bg-slate-200 text-slate-800", label: "Adresse IP" },
    // Legacy patterns (kept for compatibility): any bracketed token such as [Nom1]
    { regex: /\[([^\]]+)\]/g, className: "bg-[#f7ab6e] text-[#092727]", label: "Anonymisé" },
  ];
  // Matches accepted so far, each with its character range in `text` and its rendered span.
  const replacements: Array<{ start: number; end: number; element: ReactNode }> = [];
  // Find all matches
  patterns.forEach((pattern, patternIndex) => {
    // Scan with a fresh copy of the regex so this `exec` loop owns its `lastIndex`.
    const regex = new RegExp(pattern.regex.source, pattern.regex.flags);
    let match;
    while ((match = regex.exec(text)) !== null) {
      const start = match.index;
      const end = match.index + match[0].length;
      // Check that there is no overlap with existing replacements
      // NOTE(review): this only tests whether the candidate's start or end falls inside an
      // existing replacement; a candidate that strictly encloses an existing one
      // (start < r.start && end > r.end, e.g. "[<PERSON>]" hit by the bracket pattern)
      // is not treated as overlapping, so both would be recorded.
      const hasOverlap = replacements.some(r => 
        (start >= r.start && start < r.end) || (end > r.start && end <= r.end)
      );
      if (!hasOverlap) {
        const element = (
          <span
            key={`${patternIndex}-${start}`}
            className={`${pattern.className} px-2 py-1 rounded-md font-medium text-xs inline-block mx-0.5 shadow-sm border`}
            title={`${pattern.label} anonymisé`}
          >
            {match[0]}
          </span>
        );
        replacements.push({ start, end, element });
      }
    }
  });
-    parts.push(
+  // Sort the replacements by position
-      <span
+  replacements.sort((a, b) => a.start - b.start);
        key={match.index}
        className="bg-[#f7ab6e] text-[#092727] px-1 py-0.5 rounded font-medium"
      >
        {match[0]}
      </span>
    );
-    lastIndex = match.index + match[0].length;
+  // Build the final result
  // Nothing matched: hand back the original string rather than a one-element array.
  if (replacements.length === 0) {
    return text;
  }
  const parts: ReactNode[] = [];
  let lastIndex = 0;
  replacements.forEach((replacement) => {
    // Add the text that precedes the replacement
    if (replacement.start > lastIndex) {
      parts.push(text.slice(lastIndex, replacement.start));
    }
    // Add the replacement element
    parts.push(replacement.element);
    lastIndex = replacement.end;
  });
  // Add the remaining text
  if (lastIndex < text.length) {
    parts.push(text.slice(lastIndex));
  }
-  return parts.length > 0 ? parts : text;
+  return parts;
 };