// File: src/app/api/anonymize/route.ts
|
|
|
|
import { NextResponse } from "next/server";
|
|
import mammoth from "mammoth";
|
|
import * as pdfjs from "pdfjs-dist";
|
|
|
|
// --- Presidio service endpoints ---

/**
 * Picks the endpoint to use for a Presidio service.
 *
 * @param override - Optional value, typically read from an environment variable.
 * @param fallback - URL used when the override is missing or blank.
 * @returns The trimmed override when it contains text, otherwise `fallback`.
 */
const resolvePresidioUrl = (
  override: string | undefined,
  fallback: string
): string => {
  const trimmed = override?.trim();
  // `||` rather than `??` on purpose: an empty or blank variable must also
  // fall back to the default instead of producing an unusable URL.
  return trimmed || fallback;
};

// NOTE(review): both defaults use plain http, so the uploaded document text is
// sent unencrypted to these hosts — confirm whether an https endpoint exists.

/** Analyzer endpoint; overridable with the PRESIDIO_ANALYZER_URL env variable. */
const PRESIDIO_ANALYZER_URL = resolvePresidioUrl(
  process.env.PRESIDIO_ANALYZER_URL,
  "http://ocs00s000ssow8kssossocco.51.68.233.212.sslip.io/analyze"
);

/** Anonymizer endpoint; overridable with the PRESIDIO_ANONYMIZER_URL env variable. */
const PRESIDIO_ANONYMIZER_URL = resolvePresidioUrl(
  process.env.PRESIDIO_ANONYMIZER_URL,
  "http://r8gko4kcwwk4sso40cc0gkg8.51.68.233.212.sslip.io/anonymize"
);
|
|
|
|
/**
 * Type guard for page content entries that carry a text fragment in `str`.
 * Entries without a string `str` are rejected.
 */
const hasTextFragment = (item: unknown): item is { str: string } =>
  typeof item === "object" &&
  item !== null &&
  "str" in item &&
  typeof (item as { str?: unknown }).str === "string";

/**
 * Joins the text fragments of one page with single spaces.
 *
 * Fragments are space-separated so that adjacent pieces are not glued together
 * (e.g. "be" + "Si" must not become "beSi"). Entries without text contribute an
 * empty string; the resulting extra spaces are collapsed by the caller's
 * whitespace clean-up.
 */
const joinPageTextItems = (items: readonly unknown[]): string =>
  items.map((item) => (hasTextFragment(item) ? item.str : "")).join(" ");

/**
 * Extracts the text of every page of a PDF using pdf.js.
 *
 * @param pdfData - Raw bytes of the PDF file.
 * @returns The text of all pages in order, each page followed by a newline.
 */
const extractTextWithPdfJs = async (pdfData: Uint8Array): Promise<string> => {
  // Point pdf.js at the worker script published on cdnjs for the installed version.
  pdfjs.GlobalWorkerOptions.workerSrc = `https://cdnjs.cloudflare.com/ajax/libs/pdf.js/${pdfjs.version}/pdf.worker.min.mjs`;

  const doc = await pdfjs.getDocument(pdfData).promise;

  try {
    let fullText = "";

    // pdf.js pages are numbered from 1.
    for (let pageNumber = 1; pageNumber <= doc.numPages; pageNumber++) {
      const page = await doc.getPage(pageNumber);
      const content = await page.getTextContent();

      // A newline separates consecutive pages.
      fullText += joinPageTextItems(content.items) + "\n";
    }

    return fullText;
  } finally {
    // Destroy the document on success and on failure so that a request does
    // not leave the loaded PDF behind.
    await doc.destroy();
  }
};
|
|
|
|
/** One regex rule of an ad-hoc recognizer, as serialized for the analyzer. */
type AdHocPattern = { name: string; regex: string; score: number };

/** Ad-hoc recognizer entry of the analyzer request body. */
type AdHocRecognizer = {
  name: string;
  supported_language: string;
  supported_entity: string;
  patterns?: AdHocPattern[];
  deny_list?: string[];
};

/** Request body sent to the analyzer endpoint. */
type AnalyzerPayload = {
  text: string;
  language: string;
  ad_hoc_recognizers: AdHocRecognizer[];
};

/** Language sent with every analysis request and attached to every ad-hoc recognizer. */
const ANALYSIS_LANGUAGE = "fr";

/** Score attached to every regex rule (1.0 is the top of Presidio's score range). */
const AD_HOC_PATTERN_SCORE = 1.0;

/** MIME type of .docx uploads. */
const DOCX_MIME_TYPE =
  "application/vnd.openxmlformats-officedocument.wordprocessingml.document";

/**
 * Builds an ad-hoc recognizer that matches regular expressions.
 *
 * Regexes are sent as `patterns`: per the Presidio documentation `deny_list` is
 * a list of literal words, so a regex placed there is presumably not evaluated
 * as a regex by current analyzer versions — verify against the deployed one.
 */
const regexRecognizer = (
  name: string,
  entity: string,
  regexes: readonly string[]
): AdHocRecognizer => ({
  name,
  // Presidio recognizers default to "en"; the language is set explicitly so the
  // rule applies to requests analysed in ANALYSIS_LANGUAGE.
  supported_language: ANALYSIS_LANGUAGE,
  supported_entity: entity,
  patterns: regexes.map((regex) => ({
    name,
    regex,
    score: AD_HOC_PATTERN_SCORE,
  })),
});

/** Builds an ad-hoc recognizer that matches a fixed list of literal words. */
const literalRecognizer = (
  name: string,
  entity: string,
  words: readonly string[]
): AdHocRecognizer => ({
  name,
  supported_language: ANALYSIS_LANGUAGE,
  supported_entity: entity,
  deny_list: [...words],
});

/** Builds the analyzer request body for `text`, including all ad-hoc recognizers. */
const buildAnalyzerPayload = (text: string): AnalyzerPayload => ({
  text,
  language: ANALYSIS_LANGUAGE,
  ad_hoc_recognizers: [
    regexRecognizer("Email Recognizer", "EMAIL_ADDRESS", [
      "[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.]+",
    ]),
    regexRecognizer("French Phone Recognizer", "PHONE_NUMBER", [
      "\\b(0|\\+33|0033)[1-9]([-. ]?[0-9]{2}){4}\\b",
    ]),
    regexRecognizer("IBAN Recognizer", "IBAN", [
      "\\b[A-Z]{2}[0-9]{2}(?:[ ]?[0-9]{4}){4,7}\\b",
    ]),
    regexRecognizer("BIC/SWIFT Recognizer", "SWIFT_CODE", [
      "\\b([A-Z]{6}[A-Z2-9][A-NP-Z0-9])(X{3}|[A-NP-Z0-9]{3})?\\b",
    ]),
    regexRecognizer("Belgian Company Number", "BE_COMPANY_NUMBER", [
      "\\bBE\\s*0[0-9]{3}[.]?[0-9]{3}[.]?[0-9]{3}\\b",
    ]),
    regexRecognizer("Company Name Recognizer", "ORGANIZATION", [
      "\\b[A-Z][a-zA-ZÀ-ÖØ-öø-ÿ-&' ]+\\s+(SPRL|SA|Partners|Solutions|Capital)\\b",
    ]),
    literalRecognizer("Specific Company Recognizer", "ORGANIZATION", [
      "TechFlow Solutions SPRL",
      "Innovation Capital Partners SA",
    ]),
    regexRecognizer("Date Recognizer (DD/MM/YYYY)", "DATE_TIME", [
      "\\b(0[1-9]|[12][0-9]|3[01])[-/.](0[1-9]|1[012])[-/.](19|20)\\d\\d\\b",
    ]),
  ],
});

/** POSTs `payload` as JSON to `url`, bypassing the fetch cache. */
const postJson = (url: string, payload: unknown): Promise<Response> =>
  fetch(url, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(payload),
    cache: "no-store",
  });

/**
 * Converts a failed upstream response into the JSON error returned to the
 * client, forwarding the upstream status code and body text.
 */
const upstreamErrorResponse = async (service: string, response: Response) => {
  const errorText = await response.text();
  return NextResponse.json(
    { error: `Erreur de l'${service} [${response.status}]: ${errorText}` },
    { status: response.status }
  );
};

/**
 * Extracts the raw text of an uploaded file based on its MIME type.
 *
 * @returns The extracted text, or `undefined` when the type is not supported.
 */
const extractTextFromUpload = async (
  file: File
): Promise<string | undefined> => {
  if (file.type === "application/pdf") {
    return extractTextWithPdfJs(new Uint8Array(await file.arrayBuffer()));
  }
  if (file.type === DOCX_MIME_TYPE) {
    const { value } = await mammoth.extractRawText({
      buffer: Buffer.from(await file.arrayBuffer()),
    });
    return value;
  }
  if (file.type.startsWith("text/")) {
    return file.text();
  }
  return undefined;
};

/**
 * Anonymizes an uploaded document.
 *
 * Reads the `file` field of the multipart form, extracts its text (PDF, DOCX or
 * any `text/*` type), collapses whitespace, asks the analyzer for entities and
 * then asks the anonymizer to replace them.
 *
 * Responses:
 * - 200 `{ text, items }`: anonymized text and the analyzer findings.
 * - 400: no file in the form, or no text could be extracted.
 * - 415: unsupported file type.
 * - upstream status: the analyzer or anonymizer answered with an error.
 * - 500: any unexpected exception.
 */
export async function POST(request: Request) {
  try {
    const formData = await request.formData();
    const file = formData.get("file");

    // A form field can also be a plain string; only a real file is accepted.
    if (!file || typeof file === "string") {
      return NextResponse.json(
        { error: "Aucun fichier reçu" },
        { status: 400 }
      );
    }

    // --- Text extraction ---
    const extractedText = await extractTextFromUpload(file);

    if (extractedText === undefined) {
      return NextResponse.json(
        { error: `Type de fichier non supporté: ${file.type}` },
        { status: 415 }
      );
    }

    if (!extractedText.trim()) {
      return NextResponse.json(
        { error: "Impossible d'extraire du texte de ce fichier." },
        { status: 400 }
      );
    }

    // --- General clean-up: collapse every whitespace run into one space ---
    const cleanedText = extractedText.replace(/\s+/g, " ").trim();

    // --- Entity detection ---
    const analyzeResponse = await postJson(
      PRESIDIO_ANALYZER_URL,
      buildAnalyzerPayload(cleanedText)
    );
    if (!analyzeResponse.ok) {
      return await upstreamErrorResponse("Analyzer", analyzeResponse);
    }
    const analysisResults: unknown = await analyzeResponse.json();

    // --- Replacement of every detected entity by the same placeholder ---
    const anonymizeResponse = await postJson(PRESIDIO_ANONYMIZER_URL, {
      text: cleanedText,
      analyzer_results: analysisResults,
      anonymizers: { DEFAULT: { type: "replace", new_value: "<ANONYMISÉ>" } },
    });
    if (!anonymizeResponse.ok) {
      return await upstreamErrorResponse("Anonymizer", anonymizeResponse);
    }

    const anonymizedData = (await anonymizeResponse.json()) as {
      text?: unknown;
    };

    return NextResponse.json({
      text: anonymizedData.text,
      items: analysisResults,
    });
  } catch (error) {
    console.error("Erreur critique dans l'API Route:", error);
    const errorMessage =
      error instanceof Error ? error.message : "Erreur inconnue";
    return NextResponse.json(
      { error: `Erreur interne du serveur: ${errorMessage}` },
      { status: 500 }
    );
  }
}
|