// File: src/app/api/anonymize/route.ts
// (Reconstructed from a diff that deleted app/api/anonymize/route.ts, blob b534628.)

import { NextResponse } from "next/server";
import mammoth from "mammoth";
import * as pdfjs from "pdfjs-dist";

// --- Presidio endpoints ---
// NOTE(review): both endpoints are plain http, so the extracted document text
// travels unencrypted between this route and the Presidio services.
const PRESIDIO_ANALYZER_URL =
  "http://ocs00s000ssow8kssossocco.51.68.233.212.sslip.io/analyze";
const PRESIDIO_ANONYMIZER_URL =
  "http://r8gko4kcwwk4sso40cc0gkg8.51.68.233.212.sslip.io/anonymize";

/** MIME type reported for .docx uploads. */
const DOCX_MIME_TYPE =
  "application/vnd.openxmlformats-officedocument.wordprocessingml.document";

/**
 * Extra recognizers sent to the analyzer with every request.
 *
 * NOTE(review): most `deny_list` entries below are regular expressions rather
 * than literal words. Confirm that the deployed analyzer interprets
 * `deny_list` entries as regex; otherwise these recognizers never match.
 */
const AD_HOC_RECOGNIZERS = [
  {
    name: "Email Recognizer",
    supported_entity: "EMAIL_ADDRESS",
    deny_list: ["[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+"],
  },
  {
    name: "French Phone Recognizer",
    supported_entity: "PHONE_NUMBER",
    deny_list: ["\\b(0|\\+33|0033)[1-9]([-. ]?[0-9]{2}){4}\\b"],
  },
  {
    name: "IBAN Recognizer",
    supported_entity: "IBAN",
    deny_list: ["\\b[A-Z]{2}[0-9]{2}(?:[ ]?[0-9]{4}){4,7}\\b"],
  },
  {
    name: "BIC/SWIFT Recognizer",
    supported_entity: "SWIFT_CODE",
    deny_list: [
      "\\b([A-Z]{6}[A-Z2-9][A-NP-Z0-9])(X{3}|[A-NP-Z0-9]{3})?\\b",
    ],
  },
  {
    name: "Belgian Company Number",
    supported_entity: "BE_COMPANY_NUMBER",
    deny_list: ["\\bBE\\s*0[0-9]{3}[.]?[0-9]{3}[.]?[0-9]{3}\\b"],
  },
  {
    name: "Company Name Recognizer",
    supported_entity: "ORGANIZATION",
    deny_list: [
      "\\b[A-Z][a-zA-ZÀ-ÖØ-öø-ÿ-&' ]+\\s+(SPRL|SA|Partners|Solutions|Capital)\\b",
    ],
  },
  {
    name: "Specific Company Recognizer",
    supported_entity: "ORGANIZATION",
    deny_list: [
      "TechFlow Solutions SPRL",
      "Innovation Capital Partners SA",
    ],
  },
  {
    name: "Date Recognizer (DD/MM/YYYY)",
    supported_entity: "DATE_TIME",
    deny_list: [
      "\\b(0[1-9]|[12][0-9]|3[01])[-/.](0[1-9]|1[012])[-/.](19|20)\\d\\d\\b",
    ],
  },
];

/** Narrows a text-content item to one that carries a `str` string. */
const isTextItem = (item: unknown): item is { str: string } =>
  typeof item === "object" &&
  item !== null &&
  "str" in item &&
  typeof item.str === "string";

/**
 * Extracts the text of a PDF, page by page.
 *
 * @param pdfData Raw bytes of the PDF document.
 * @returns The text of every page, each page terminated by a newline.
 */
const extractTextWithPdfJs = async (pdfData: Uint8Array): Promise<string> => {
  pdfjs.GlobalWorkerOptions.workerSrc = `https://cdnjs.cloudflare.com/ajax/libs/pdf.js/${pdfjs.version}/pdf.worker.min.mjs`;

  const doc = await pdfjs.getDocument(pdfData).promise;
  try {
    let fullText = "";

    for (let pageNumber = 1; pageNumber <= doc.numPages; pageNumber++) {
      const page = await doc.getPage(pageNumber);
      const content = await page.getTextContent();

      // Join the text fragments with a space so adjacent words are not glued
      // together (e.g. "beSi"); the later whitespace cleanup removes the
      // surplus spaces this introduces.
      const pageText = content.items
        .map((item: unknown) => (isTextItem(item) ? item.str : ""))
        .join(" ");
      fullText += pageText + "\n"; // newline between pages
    }

    return fullText;
  } finally {
    // Release the document once extraction is done, whether it succeeded or not.
    await doc.destroy();
  }
};

/**
 * Receives an uploaded file (multipart field `file`), extracts its text,
 * sends it to the analyzer and then to the anonymizer.
 *
 * Responses:
 * - 200 `{ text, items }`: anonymized text plus the raw analyzer results.
 * - 400 when no file was uploaded or no text could be extracted.
 * - 415 when the file type is not PDF, DOCX or `text/*`.
 * - the upstream status when the analyzer or anonymizer answers with an error.
 * - 500 on any unexpected exception.
 */
export async function POST(request: Request) {
  try {
    const formData = await request.formData();
    const file = formData.get("file");

    // `get` yields a string for non-file fields; treat that like a missing file
    // instead of failing later on `.arrayBuffer()`.
    if (!file || typeof file === "string") {
      return NextResponse.json(
        { error: "Aucun fichier reçu" },
        { status: 400 }
      );
    }

    // --- Text extraction ---
    let extractedText = "";
    const fileArrayBuffer = await file.arrayBuffer();
    if (file.type === "application/pdf") {
      extractedText = await extractTextWithPdfJs(
        new Uint8Array(fileArrayBuffer)
      );
    } else if (file.type === DOCX_MIME_TYPE) {
      const { value } = await mammoth.extractRawText({
        buffer: Buffer.from(fileArrayBuffer),
      });
      extractedText = value;
    } else if (file.type.startsWith("text/")) {
      extractedText = await file.text();
    } else {
      return NextResponse.json(
        { error: `Type de fichier non supporté: ${file.type}` },
        { status: 415 }
      );
    }

    if (!extractedText.trim()) {
      return NextResponse.json(
        { error: "Impossible d'extraire du texte de ce fichier." },
        { status: 400 }
      );
    }

    // --- General cleanup: collapse every whitespace run into one space ---
    const cleanedText = extractedText.replace(/\s+/g, " ").trim();

    const analyzerPayload = {
      text: cleanedText,
      language: "fr",
      ad_hoc_recognizers: AD_HOC_RECOGNIZERS,
    };

    // --- Analysis ---
    const analyzeResponse = await fetch(PRESIDIO_ANALYZER_URL, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify(analyzerPayload),
      cache: "no-store",
    });

    if (!analyzeResponse.ok) {
      const errorText = await analyzeResponse.text();
      return NextResponse.json(
        {
          error: `Erreur de l'Analyzer [${analyzeResponse.status}]: ${errorText}`,
        },
        { status: analyzeResponse.status }
      );
    }
    // Forwarded untouched to the anonymizer and to the client, so no shape is
    // assumed here.
    const analysisResults: unknown = await analyzeResponse.json();

    // --- Anonymization ---
    const anonymizeResponse = await fetch(PRESIDIO_ANONYMIZER_URL, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({
        text: cleanedText,
        analyzer_results: analysisResults,
        anonymizers: { DEFAULT: { type: "replace", new_value: "" } },
      }),
      cache: "no-store",
    });

    if (!anonymizeResponse.ok) {
      const errorText = await anonymizeResponse.text();
      return NextResponse.json(
        {
          error: `Erreur de l'Anonymizer [${anonymizeResponse.status}]: ${errorText}`,
        },
        { status: anonymizeResponse.status }
      );
    }

    const anonymizedData: { text?: unknown } = await anonymizeResponse.json();
    const finalResponse = { text: anonymizedData.text, items: analysisResults };
    return NextResponse.json(finalResponse);
  } catch (error) {
    console.error("Erreur critique dans l'API Route:", error);
    const errorMessage =
      error instanceof Error ? error.message : "Erreur inconnue";
    return NextResponse.json(
      { error: `Erreur interne du serveur: ${errorMessage}` },
      { status: 500 }
    );
  }
}