n8n anonyme

This commit is contained in:
nBiqoz
2025-06-12 23:39:40 +02:00
parent f9f3474a92
commit 386950b630

View File

@@ -1,195 +0,0 @@
// Fichier : src/app/api/anonymize/route.ts
import { NextResponse } from "next/server";
import mammoth from "mammoth";
import * as pdfjs from "pdfjs-dist";
// --- Presidio service endpoints ---
// NOTE(review): both endpoints are plain `http://`, so document text sent to
// them travels unencrypted — confirm this is acceptable for the deployment.

/** Endpoint of the Presidio Analyzer service (`/analyze`). */
const PRESIDIO_ANALYZER_URL = "http://ocs00s000ssow8kssossocco.51.68.233.212.sslip.io/analyze";

/** Endpoint of the Presidio Anonymizer service (`/anonymize`). */
const PRESIDIO_ANONYMIZER_URL = "http://r8gko4kcwwk4sso40cc0gkg8.51.68.233.212.sslip.io/anonymize";
/**
 * Extracts the text of every page of a PDF using pdf.js.
 *
 * Text fragments of a page are joined with a single space so that adjacent
 * fragments are never glued together (e.g. "beSi"); surplus whitespace is
 * expected to be cleaned up by the caller. Each page's text is terminated by
 * a newline.
 *
 * @param pdfData Raw bytes of the PDF document.
 * @returns The concatenated text of all pages, one line per page.
 */
const extractTextWithPdfJs = async (pdfData: Uint8Array): Promise<string> => {
  pdfjs.GlobalWorkerOptions.workerSrc = `https://cdnjs.cloudflare.com/ajax/libs/pdf.js/${pdfjs.version}/pdf.worker.min.mjs`;
  const doc = await pdfjs.getDocument(pdfData).promise;
  try {
    const pageTexts: string[] = [];
    for (let pageNumber = 1; pageNumber <= doc.numPages; pageNumber++) {
      const page = await doc.getPage(pageNumber);
      const content = await page.getTextContent();
      // Items without a string `str` property contribute an empty fragment.
      // The explicit narrowing is required because `item` is `unknown`.
      const pageText = content.items
        .map((item: unknown) =>
          typeof item === "object" &&
          item !== null &&
          "str" in item &&
          typeof item.str === "string"
            ? item.str
            : ""
        )
        .join(" ");
      pageTexts.push(pageText + "\n"); // newline separates consecutive pages
    }
    return pageTexts.join("");
  } finally {
    // Release the document's resources even when extraction fails.
    await doc.destroy();
  }
};
/** Language code sent to Presidio and declared on every ad-hoc recognizer. */
const ANALYSIS_LANGUAGE = "fr";

/** Score given to every ad-hoc regex match (1.0 = full confidence). */
const AD_HOC_PATTERN_SCORE = 1.0;

/** MIME type of `.docx` documents. */
const DOCX_MIME_TYPE =
  "application/vnd.openxmlformats-officedocument.wordprocessingml.document";

/**
 * Builds a regex-based ad-hoc recognizer.
 *
 * Regular expressions go in `patterns`; `deny_list` is reserved for literal
 * words. `supported_language` is set to the request language so the
 * recognizer is not declared for a different language than the analysis.
 */
const regexRecognizer = (
  name: string,
  supportedEntity: string,
  regex: string
) => ({
  name,
  supported_entity: supportedEntity,
  supported_language: ANALYSIS_LANGUAGE,
  patterns: [{ name, regex, score: AD_HOC_PATTERN_SCORE }],
});

/** Builds the Presidio Analyzer request body for the given text. */
const buildAnalyzerPayload = (text: string) => ({
  text,
  language: ANALYSIS_LANGUAGE,
  ad_hoc_recognizers: [
    regexRecognizer(
      "Email Recognizer",
      "EMAIL_ADDRESS",
      "[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+"
    ),
    regexRecognizer(
      "French Phone Recognizer",
      "PHONE_NUMBER",
      "\\b(0|\\+33|0033)[1-9]([-. ]?[0-9]{2}){4}\\b"
    ),
    regexRecognizer(
      "IBAN Recognizer",
      "IBAN",
      "\\b[A-Z]{2}[0-9]{2}(?:[ ]?[0-9]{4}){4,7}\\b"
    ),
    regexRecognizer(
      "BIC/SWIFT Recognizer",
      "SWIFT_CODE",
      "\\b([A-Z]{6}[A-Z2-9][A-NP-Z0-9])(X{3}|[A-NP-Z0-9]{3})?\\b"
    ),
    regexRecognizer(
      "Belgian Company Number",
      "BE_COMPANY_NUMBER",
      "\\bBE\\s*0[0-9]{3}[.]?[0-9]{3}[.]?[0-9]{3}\\b"
    ),
    regexRecognizer(
      "Company Name Recognizer",
      "ORGANIZATION",
      "\\b[A-Z][a-zA-ZÀ-ÖØ-öø-ÿ-&' ]+\\s+(SPRL|SA|Partners|Solutions|Capital)\\b"
    ),
    {
      // Literal names, so a deny list is the right tool here.
      name: "Specific Company Recognizer",
      supported_entity: "ORGANIZATION",
      supported_language: ANALYSIS_LANGUAGE,
      deny_list: ["TechFlow Solutions SPRL", "Innovation Capital Partners SA"],
    },
    regexRecognizer(
      "Date Recognizer (DD/MM/YYYY)",
      "DATE_TIME",
      "\\b(0[1-9]|[12][0-9]|3[01])[-/.](0[1-9]|1[012])[-/.](19|20)\\d\\d\\b"
    ),
  ],
});

/**
 * Extracts raw text from an uploaded file based on its MIME type.
 *
 * @returns The extracted text, or `undefined` when the type is unsupported.
 */
const extractTextFromFile = async (file: File): Promise<string | undefined> => {
  if (file.type === "application/pdf") {
    return extractTextWithPdfJs(new Uint8Array(await file.arrayBuffer()));
  }
  if (file.type === DOCX_MIME_TYPE) {
    const { value } = await mammoth.extractRawText({
      buffer: Buffer.from(await file.arrayBuffer()),
    });
    return value;
  }
  if (file.type.startsWith("text/")) {
    return file.text();
  }
  return undefined;
};

/** Sends `body` as an uncached JSON POST request to `url`. */
const postJson = (url: string, body: unknown): Promise<Response> =>
  fetch(url, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(body),
    cache: "no-store",
  });

/**
 * Anonymizes the text of an uploaded document.
 *
 * Expects multipart form data with a `file` entry (PDF, DOCX or `text/*`).
 * The text is extracted, whitespace-normalized, analyzed by the Presidio
 * Analyzer and rewritten by the Presidio Anonymizer.
 *
 * Responses:
 * - 200 `{ text, items }`: anonymized text and the analyzer results.
 * - 400: no file was uploaded, or no text could be extracted.
 * - 415: unsupported file type.
 * - upstream status: the Analyzer or Anonymizer answered with an error.
 * - 500: any unexpected failure.
 */
export async function POST(request: Request) {
  try {
    const formData = await request.formData();
    const file = formData.get("file");
    // A form entry may be a plain string; only a real file upload is accepted.
    if (file === null || typeof file === "string") {
      return NextResponse.json(
        { error: "Aucun fichier reçu" },
        { status: 400 }
      );
    }

    // --- Text extraction ---
    const extractedText = await extractTextFromFile(file);
    if (extractedText === undefined) {
      return NextResponse.json(
        { error: `Type de fichier non supporté: ${file.type}` },
        { status: 415 }
      );
    }
    if (!extractedText.trim()) {
      return NextResponse.json(
        { error: "Impossible d'extraire du texte de ce fichier." },
        { status: 400 }
      );
    }

    // --- General cleanup: collapse every whitespace run into one space ---
    const cleanedText = extractedText.replace(/\s+/g, " ").trim();

    // --- Entity detection ---
    const analyzeResponse = await postJson(
      PRESIDIO_ANALYZER_URL,
      buildAnalyzerPayload(cleanedText)
    );
    if (!analyzeResponse.ok) {
      const errorText = await analyzeResponse.text();
      return NextResponse.json(
        {
          error: `Erreur de l'Analyzer [${analyzeResponse.status}]: ${errorText}`,
        },
        { status: analyzeResponse.status }
      );
    }
    const analysisResults = await analyzeResponse.json();

    // --- Anonymization: every detected entity gets the same placeholder ---
    const anonymizeResponse = await postJson(PRESIDIO_ANONYMIZER_URL, {
      text: cleanedText,
      analyzer_results: analysisResults,
      anonymizers: { DEFAULT: { type: "replace", new_value: "<ANONYMISÉ>" } },
    });
    if (!anonymizeResponse.ok) {
      const errorText = await anonymizeResponse.text();
      return NextResponse.json(
        {
          error: `Erreur de l'Anonymizer [${anonymizeResponse.status}]: ${errorText}`,
        },
        { status: anonymizeResponse.status }
      );
    }
    const anonymizedData = await anonymizeResponse.json();

    return NextResponse.json({
      text: anonymizedData.text,
      items: analysisResults,
    });
  } catch (error) {
    console.error("Erreur critique dans l'API Route:", error);
    const errorMessage =
      error instanceof Error ? error.message : "Erreur inconnue";
    return NextResponse.json(
      { error: `Erreur interne du serveur: ${errorMessage}` },
      { status: 500 }
    );
  }
}