n8n anonyme
This commit is contained in:
@@ -1,195 +0,0 @@
|
|||||||
// Fichier : src/app/api/anonymize/route.ts
|
|
||||||
|
|
||||||
import { NextResponse } from "next/server";
|
|
||||||
import mammoth from "mammoth";
|
|
||||||
import * as pdfjs from "pdfjs-dist";
|
|
||||||
|
|
||||||
// --- Presidio service endpoints ---
// NOTE(review): both endpoints use plain `http://`, and the POST handler in
// this file sends the extracted document text to them — consider HTTPS.

/** URL of the Presidio Analyzer `/analyze` endpoint. */
const PRESIDIO_ANALYZER_URL =
  "http://ocs00s000ssow8kssossocco.51.68.233.212.sslip.io/analyze";

/** URL of the Presidio Anonymizer `/anonymize` endpoint. */
const PRESIDIO_ANONYMIZER_URL =
  "http://r8gko4kcwwk4sso40cc0gkg8.51.68.233.212.sslip.io/anonymize";
|
|
||||||
|
|
||||||
/**
 * Extracts the plain text of every page of a PDF using pdf.js.
 *
 * Within a page, text fragments are joined with a single space so that
 * adjacent fragments are never glued together (e.g. "beSi"); callers are
 * expected to collapse redundant whitespace afterwards. Each page's text is
 * followed by a newline.
 *
 * @param pdfData - Raw bytes of the PDF document.
 * @returns The text of all pages, in page order.
 */
const extractTextWithPdfJs = async (pdfData: Uint8Array): Promise<string> => {
  // NOTE(review): the worker script URL points at a CDN and is rebuilt on
  // every call — confirm it is resolvable from the runtime this route runs in.
  pdfjs.GlobalWorkerOptions.workerSrc = `https://cdnjs.cloudflare.com/ajax/libs/pdf.js/${pdfjs.version}/pdf.worker.min.mjs`;

  // Items without a string `str` property (e.g. marked-content markers)
  // contribute an empty fragment.
  const fragmentText = (item: unknown): string => {
    if (typeof item !== "object" || item === null) return "";
    const { str } = item as { str?: unknown };
    return typeof str === "string" ? str : "";
  };

  const loadingTask = pdfjs.getDocument(pdfData);
  try {
    const doc = await loadingTask.promise;
    const pageTexts: string[] = [];

    // Pages are 1-indexed in pdf.js and are read sequentially to keep order.
    for (let pageNumber = 1; pageNumber <= doc.numPages; pageNumber++) {
      const page = await doc.getPage(pageNumber);
      const content = await page.getTextContent();
      const pageText = content.items
        .map((item: unknown) => fragmentText(item))
        .join(" ");
      pageTexts.push(pageText + "\n"); // newline between pages
    }

    return pageTexts.join("");
  } finally {
    // Release the document (and its worker) even when extraction fails, so a
    // long-lived server process does not accumulate pdf.js resources.
    await loadingTask.destroy();
  }
};
|
|
||||||
|
|
||||||
/** MIME type reported for .docx (Office Open XML) documents. */
const DOCX_MIME_TYPE =
  "application/vnd.openxmlformats-officedocument.wordprocessingml.document";

/**
 * Language code sent to the Presidio analyzer. It is also declared on every
 * ad-hoc recognizer: Presidio recognizers default to "en" and are only used
 * when their language matches the language of the request.
 */
const ANALYSIS_LANGUAGE = "fr";

/** Confidence score (0–1) reported for matches of the regex-based ad-hoc recognizers. */
const AD_HOC_PATTERN_SCORE = 1.0;

/** Builds a Presidio ad-hoc recognizer that matches a single regular expression. */
const regexRecognizer = (
  name: string,
  supportedEntity: string,
  regex: string
) => ({
  name,
  supported_entity: supportedEntity,
  supported_language: ANALYSIS_LANGUAGE,
  // Regular expressions belong in `patterns`; `deny_list` is for literal words.
  patterns: [{ name, regex, score: AD_HOC_PATTERN_SCORE }],
});

/** Builds the request body for the Presidio Analyzer `/analyze` endpoint. */
const buildAnalyzerPayload = (text: string) => ({
  text,
  language: ANALYSIS_LANGUAGE,
  ad_hoc_recognizers: [
    regexRecognizer(
      "Email Recognizer",
      "EMAIL_ADDRESS",
      "[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+"
    ),
    // NOTE(review): the leading \b cannot match right before "+", so the
    // "+33" form is only found when it directly follows a word character.
    regexRecognizer(
      "French Phone Recognizer",
      "PHONE_NUMBER",
      "\\b(0|\\+33|0033)[1-9]([-. ]?[0-9]{2}){4}\\b"
    ),
    regexRecognizer(
      "IBAN Recognizer",
      "IBAN",
      "\\b[A-Z]{2}[0-9]{2}(?:[ ]?[0-9]{4}){4,7}\\b"
    ),
    regexRecognizer(
      "BIC/SWIFT Recognizer",
      "SWIFT_CODE",
      "\\b([A-Z]{6}[A-Z2-9][A-NP-Z0-9])(X{3}|[A-NP-Z0-9]{3})?\\b"
    ),
    regexRecognizer(
      "Belgian Company Number",
      "BE_COMPANY_NUMBER",
      "\\bBE\\s*0[0-9]{3}[.]?[0-9]{3}[.]?[0-9]{3}\\b"
    ),
    regexRecognizer(
      "Company Name Recognizer",
      "ORGANIZATION",
      "\\b[A-Z][a-zA-ZÀ-ÖØ-öø-ÿ-&' ]+\\s+(SPRL|SA|Partners|Solutions|Capital)\\b"
    ),
    {
      // Literal names: this is the intended use of `deny_list`.
      name: "Specific Company Recognizer",
      supported_entity: "ORGANIZATION",
      supported_language: ANALYSIS_LANGUAGE,
      deny_list: ["TechFlow Solutions SPRL", "Innovation Capital Partners SA"],
    },
    regexRecognizer(
      "Date Recognizer (DD/MM/YYYY)",
      "DATE_TIME",
      "\\b(0[1-9]|[12][0-9]|3[01])[-/.](0[1-9]|1[012])[-/.](19|20)\\d\\d\\b"
    ),
  ],
});

/**
 * Extracts the text of an uploaded file according to its MIME type
 * (PDF, .docx, or any `text/*` type).
 *
 * @returns The extracted text, or `null` when the MIME type is not supported.
 */
const extractTextFromUpload = async (file: File): Promise<string | null> => {
  if (file.type === "application/pdf") {
    return extractTextWithPdfJs(new Uint8Array(await file.arrayBuffer()));
  }
  if (file.type === DOCX_MIME_TYPE) {
    const { value } = await mammoth.extractRawText({
      buffer: Buffer.from(await file.arrayBuffer()),
    });
    return value;
  }
  if (file.type.startsWith("text/")) {
    return file.text();
  }
  return null;
};

/**
 * POST handler: anonymizes the text of an uploaded document with Presidio.
 *
 * Expects multipart form data with a `file` field. The text is extracted,
 * whitespace-normalized, analyzed by the Presidio Analyzer and then
 * anonymized by the Presidio Anonymizer.
 *
 * Responses:
 * - 200 `{ text, items }`: anonymized text and the raw analyzer results.
 * - 400: no file uploaded, or no text could be extracted.
 * - 415: unsupported file type.
 * - upstream status: the Analyzer or Anonymizer answered with an error.
 * - 500: any unexpected failure.
 */
export async function POST(request: Request) {
  try {
    const formData = await request.formData();
    const file = formData.get("file");

    // FormData.get() yields a string for plain text fields; only a real
    // upload is acceptable here.
    if (file === null || typeof file === "string") {
      return NextResponse.json(
        { error: "Aucun fichier reçu" },
        { status: 400 }
      );
    }

    // --- Text extraction ---
    const extractedText = await extractTextFromUpload(file);

    if (extractedText === null) {
      return NextResponse.json(
        { error: `Type de fichier non supporté: ${file.type}` },
        { status: 415 }
      );
    }

    if (!extractedText.trim()) {
      return NextResponse.json(
        { error: "Impossible d'extraire du texte de ce fichier." },
        { status: 400 }
      );
    }

    // --- General cleanup: collapse every whitespace run to a single space ---
    const cleanedText = extractedText.replace(/\s+/g, " ").trim();

    // --- Analysis ---
    const analyzeResponse = await fetch(PRESIDIO_ANALYZER_URL, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify(buildAnalyzerPayload(cleanedText)),
      cache: "no-store",
    });

    if (!analyzeResponse.ok) {
      const errorText = await analyzeResponse.text();
      return NextResponse.json(
        {
          error: `Erreur de l'Analyzer [${analyzeResponse.status}]: ${errorText}`,
        },
        { status: analyzeResponse.status }
      );
    }
    const analysisResults = await analyzeResponse.json();

    // --- Anonymization: every detected entity gets the same placeholder ---
    const anonymizeResponse = await fetch(PRESIDIO_ANONYMIZER_URL, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({
        text: cleanedText,
        analyzer_results: analysisResults,
        anonymizers: { DEFAULT: { type: "replace", new_value: "<ANONYMISÉ>" } },
      }),
      cache: "no-store",
    });

    if (!anonymizeResponse.ok) {
      const errorText = await anonymizeResponse.text();
      return NextResponse.json(
        {
          error: `Erreur de l'Anonymizer [${anonymizeResponse.status}]: ${errorText}`,
        },
        { status: anonymizeResponse.status }
      );
    }

    const anonymizedData = await anonymizeResponse.json();
    const finalResponse = { text: anonymizedData.text, items: analysisResults };
    return NextResponse.json(finalResponse);
  } catch (error) {
    console.error("Erreur critique dans l'API Route:", error);
    const errorMessage =
      error instanceof Error ? error.message : "Erreur inconnue";
    return NextResponse.json(
      { error: `Erreur interne du serveur: ${errorMessage}` },
      { status: 500 }
    );
  }
}
|
|
||||||
Reference in New Issue
Block a user