/**
 * Regex fragment that lazily consumes any characters (line breaks included)
 * but refuses to step over `-->`, so a match can never leak out of the HTML
 * comment it started in.
 */
const COMMENT_GAP = String.raw`(?:(?!-->)[\s\S])*?`;

/**
 * Matchers for the page-level marker comments removed by the cleaner:
 * `<!-- PageHeader="..." -->`, `<!-- PageFooter="..." -->` and
 * `<!-- PageNumber="..." -->`. Each matcher is confined to a single comment
 * and tolerates values that wrap over several lines.
 */
const PAGE_PATTERNS: readonly RegExp[] = ["PageHeader", "PageFooter", "PageNumber"].map(
    marker => new RegExp(`<!--${COMMENT_GAP}${marker}="${COMMENT_GAP}"${COMMENT_GAP}-->`, "g"),
);

/**
 * A markdown heading line. The gap after the hashes is limited to horizontal
 * whitespace so that an empty heading cannot swallow the following line.
 */
const MARKDOWN_HEADER = /^#{1,6}[^\S\r\n\u2028\u2029]*(.*?)$/gm;

/** An HTML table, from the opening tag to the first closing tag. */
const HTML_TABLE = /<table[\s\S]*?<\/table>/gi;

/**
 * A run of consecutive lines that each start with `|` (optionally indented)
 * and contain at least one more `|`. The line break that follows the run is
 * not part of the match, so the placeholder stays on its own line.
 */
const MARKDOWN_TABLE = /^[ \t]*\|.*\|.*$(?:\r?\n[ \t]*\|.*\|.*$)*/gm;

/** One `<figure>` element; group 1 is everything between the tags. */
const FIGURE = /<figure>([\s\S]*?)<\/figure>/gi;

/** A single-line `<figcaption>`; group 1 is the caption text. */
const FIGCAPTION = /<figcaption>(.*?)<\/figcaption>/i;

/**
 * Strips layout markup from markdown content and replaces bulky structures
 * with short placeholders.
 *
 * Steps, in order:
 * 1. Page header / footer / number marker comments are removed.
 * 2. Markdown headings lose their leading `#`s. A ` - ` separator is appended
 *    unless the heading already ends with a colon; empty headings are dropped.
 * 3. HTML tables and markdown pipe tables become `[TABLE]`.
 * 4. `<figure>` elements become `[FIGURE - caption]` when they contain a
 *    single-line `<figcaption>`, otherwise `[FIGURE]`.
 *
 * @param content - Raw markdown text to clean.
 * @returns The cleaned text with leading and trailing whitespace trimmed.
 */
export const azureDocIntelligenceContentCleaner = (content: string): string => {
    let cleanedContent = content;

    // `replace` with a global regex replaces every match, exactly like
    // `replaceAll`, without requiring the ES2021 library typings.
    for (const pattern of PAGE_PATTERNS) {
        cleanedContent = cleanedContent.replace(pattern, "");
    }

    cleanedContent = cleanedContent.replace(MARKDOWN_HEADER, (_match, text: string) => {
        const title = text.trim();
        if (title === "") {
            // A bare "#" line carries no text; drop it instead of emitting " - ".
            return "";
        }
        return title.endsWith(":") ? text : `${text} - `;
    });

    // HTML tables go first so pipes inside their cells cannot be mistaken for
    // markdown table rows.
    cleanedContent = cleanedContent.replace(HTML_TABLE, "[TABLE]");
    cleanedContent = cleanedContent.replace(MARKDOWN_TABLE, "[TABLE]");

    // Each figure is handled on its own: the caption is searched only inside
    // that figure's body, so an uncaptioned figure can never borrow the
    // caption of a later one (and take the text in between with it).
    cleanedContent = cleanedContent.replace(FIGURE, (_match, body: string) => {
        const caption = FIGCAPTION.exec(body)?.[1];
        return caption === undefined ? "[FIGURE]" : `[FIGURE - ${caption}]`;
    });

    return cleanedContent.trim();
};
