// File: src/app/api/anonymize/route.ts
import { NextResponse } from "next/server";
import mammoth from "mammoth";
import * as pdfjs from "pdfjs-dist";

// --- Presidio service endpoints ---
const PRESIDIO_ANALYZER_URL =
  "http://ocs00s000ssow8kssossocco.51.68.233.212.sslip.io/analyze";
const PRESIDIO_ANONYMIZER_URL =
  "http://r8gko4kcwwk4sso40cc0gkg8.51.68.233.212.sslip.io/anonymize";

/**
 * Extracts the text of every page of a PDF using pdf.js.
 *
 * Text fragments of a page are joined with a single space so that adjacent
 * fragments are never glued together (e.g. "beSi"); a newline is appended
 * after each page. Redundant whitespace is expected to be collapsed later by
 * the caller.
 *
 * @param pdfData - Raw bytes of the PDF document.
 * @returns The concatenated text of all pages.
 */
const extractTextWithPdfJs = async (pdfData: Uint8Array): Promise<string> => {
  pdfjs.GlobalWorkerOptions.workerSrc = `https://cdnjs.cloudflare.com/ajax/libs/pdf.js/${pdfjs.version}/pdf.worker.min.mjs`;

  const doc = await pdfjs.getDocument(pdfData).promise;
  try {
    let fullText = "";
    for (let i = 1; i <= doc.numPages; i++) {
      const page = await doc.getPage(i);
      const content = await page.getTextContent();
      // Items without a `str` property contribute an empty string.
      const pageText = content.items
        .map((item) => ("str" in item ? item.str : ""))
        .join(" ");
      fullText += pageText + "\n"; // newline between pages
    }
    return fullText;
  } finally {
    // Release the document's resources whether extraction succeeded or not.
    await doc.destroy();
  }
};

/**
 * POST handler: anonymizes the text of an uploaded file.
 *
 * Reads the `file` field of the multipart form, extracts its text (PDF, DOCX
 * or any `text/*` type), collapses whitespace, sends the text to the Presidio
 * analyzer and then to the Presidio anonymizer.
 *
 * Responses:
 * - 200 `{ text, items }` — anonymized text and the raw analyzer results.
 * - 400 — no file received, or no text could be extracted.
 * - 415 — unsupported file type.
 * - upstream status — when the analyzer or anonymizer answers with a non-OK
 *   status, that status and the upstream body are relayed.
 * - 500 — any unexpected exception.
 */
export async function POST(request: Request) {
  try {
    const formData = await request.formData();
    const file = formData.get("file");

    // `FormData.get` returns a string for non-file fields; treat that the
    // same as a missing file instead of failing later on `.arrayBuffer()`.
    if (!file || typeof file === "string") {
      return NextResponse.json(
        { error: "Aucun fichier reçu" },
        { status: 400 }
      );
    }

    // --- Text extraction ---
    let extractedText = "";
    const fileArrayBuffer = await file.arrayBuffer();

    if (file.type === "application/pdf") {
      extractedText = await extractTextWithPdfJs(
        new Uint8Array(fileArrayBuffer)
      );
    } else if (
      file.type ===
      "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    ) {
      const { value } = await mammoth.extractRawText({
        buffer: Buffer.from(fileArrayBuffer),
      });
      extractedText = value;
    } else if (file.type.startsWith("text/")) {
      extractedText = await file.text();
    } else {
      return NextResponse.json(
        { error: `Type de fichier non supporté: ${file.type}` },
        { status: 415 }
      );
    }

    if (!extractedText.trim()) {
      return NextResponse.json(
        { error: "Impossible d'extraire du texte de ce fichier." },
        { status: 400 }
      );
    }

    // --- General cleanup: collapse every whitespace run to one space ---
    // The same cleaned text is sent to both services so that the offsets
    // returned by the analyzer stay valid for the anonymizer.
    const cleanedText = extractedText.replace(/\s+/g, " ").trim();

    // NOTE(review): the `deny_list` entries below are regular expressions and
    // no `supported_language` is set. Presidio documents `deny_list` as a list
    // of literal words and `patterns` as the place for regexes — confirm the
    // deployed analyzer version interprets these as intended.
    const analyzerPayload = {
      text: cleanedText,
      language: "fr",
      ad_hoc_recognizers: [
        {
          name: "Email Recognizer",
          supported_entity: "EMAIL_ADDRESS",
          deny_list: ["[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+"],
        },
        {
          name: "French Phone Recognizer",
          supported_entity: "PHONE_NUMBER",
          deny_list: ["\\b(0|\\+33|0033)[1-9]([-. ]?[0-9]{2}){4}\\b"],
        },
        {
          name: "IBAN Recognizer",
          supported_entity: "IBAN",
          deny_list: ["\\b[A-Z]{2}[0-9]{2}(?:[ ]?[0-9]{4}){4,7}\\b"],
        },
        {
          name: "BIC/SWIFT Recognizer",
          supported_entity: "SWIFT_CODE",
          deny_list: [
            "\\b([A-Z]{6}[A-Z2-9][A-NP-Z0-9])(X{3}|[A-NP-Z0-9]{3})?\\b",
          ],
        },
        {
          name: "Belgian Company Number",
          supported_entity: "BE_COMPANY_NUMBER",
          deny_list: ["\\bBE\\s*0[0-9]{3}[.]?[0-9]{3}[.]?[0-9]{3}\\b"],
        },
        {
          name: "Company Name Recognizer",
          supported_entity: "ORGANIZATION",
          deny_list: [
            "\\b[A-Z][a-zA-ZÀ-ÖØ-öø-ÿ-&' ]+\\s+(SPRL|SA|Partners|Solutions|Capital)\\b",
          ],
        },
        {
          name: "Specific Company Recognizer",
          supported_entity: "ORGANIZATION",
          deny_list: [
            "TechFlow Solutions SPRL",
            "Innovation Capital Partners SA",
          ],
        },
        {
          name: "Date Recognizer (DD/MM/YYYY)",
          supported_entity: "DATE_TIME",
          deny_list: [
            "\\b(0[1-9]|[12][0-9]|3[01])[-/.](0[1-9]|1[012])[-/.](19|20)\\d\\d\\b",
          ],
        },
      ],
    };

    // --- Step 1: detect entities ---
    const analyzeResponse = await fetch(PRESIDIO_ANALYZER_URL, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify(analyzerPayload),
      cache: "no-store",
    });

    if (!analyzeResponse.ok) {
      const errorText = await analyzeResponse.text();
      return NextResponse.json(
        {
          error: `Erreur de l'Analyzer [${analyzeResponse.status}]: ${errorText}`,
        },
        { status: analyzeResponse.status }
      );
    }

    const analysisResults = await analyzeResponse.json();

    // --- Step 2: replace the detected entities ---
    const anonymizeResponse = await fetch(PRESIDIO_ANONYMIZER_URL, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({
        text: cleanedText,
        analyzer_results: analysisResults,
        anonymizers: { DEFAULT: { type: "replace", new_value: "" } },
      }),
      cache: "no-store",
    });

    if (!anonymizeResponse.ok) {
      const errorText = await anonymizeResponse.text();
      return NextResponse.json(
        {
          error: `Erreur de l'Anonymizer [${anonymizeResponse.status}]: ${errorText}`,
        },
        { status: anonymizeResponse.status }
      );
    }

    const anonymizedData = await anonymizeResponse.json();

    const finalResponse = { text: anonymizedData.text, items: analysisResults };
    return NextResponse.json(finalResponse);
  } catch (error) {
    console.error("Erreur critique dans l'API Route:", error);
    const errorMessage =
      error instanceof Error ? error.message : "Erreur inconnue";
    return NextResponse.json(
      { error: `Erreur interne du serveur: ${errorMessage}` },
      { status: 500 }
    );
  }
}