import './ModdedPdfJs';
import { escape } from 'html-escaper';
import { UrlRegExp } from '../../util/UrlRegExp';
import {
    PunctuationMarksFinishPhrasePtBr,
    PunctuationMarksPtBr,
    QuotationMarksPtBr,
} from '../../util/PunctuationMarks';
import { resizeBase64Img } from '../../util/ResizeBase64Img';
import * as math from 'mathjs';
import {
    TdlDomainsSet,
    DocumentTypeEnum,
    isDocumentTypeImageDescription,
} from 'plataforma-braille-common';
import { findMathExpressions } from './FindMathExpressions';
import { createEditorElementImage } from '../../edit-document/editor-mods/modules/core/editor-element/EditorElementImage';
import { normalizeSpaces } from '../../util/TextUtil';

// Point pdf.js at the worker script served with the application assets.
// NOTE(review): `pdfjsLib` is not imported by name in this file; presumably the
// './ModdedPdfJs' side-effect import exposes it as a global — confirm.
pdfjsLib.GlobalWorkerOptions.workerSrc = '/assets/js/pdf.worker.min.js';

// Matches any `<...>` tag-like sequence. Deliberately has no `g` flag, so
// `.test()` is stateless and safe to reuse across calls.
const CONTAINS_TAG_REGEXP = /<[^>]*>/;

export class ImportPdf {
    /**
     * @typedef {object} Font
     * @property {string} name
     * @property {number} size
     * @property {boolean} boldDetected
     * @property {boolean} italicDetected
     * @property {boolean} underlineDetected
     */

    /**
     * @typedef {object} TextExtracted
     * @property {number} x
     * @property {number} y
     * @property {number} width
     * @property {number} height
     * @property {Font} font
     * @property {string} text
     * @property {number | undefined} originalX set when the block is merged with its predecessor
     * @property {number | undefined} originalY set when the block is merged with its predecessor
     * @property {number | undefined} originalWidth set when the block is merged with its predecessor
     * @property {number | undefined} originalHeight set when the block is merged with its predecessor
     * @property {boolean | undefined} hyphenation set when the block continues a hyphenated word
     */

    /**
     * @typedef {object} ImageExtracted
     * @property {number} x
     * @property {number} y
     * @property {number} width
     * @property {number} height
     * @property {string} objId
     * @property {string} data
     */

    /**
     * @typedef {object} PageAttributes
     * @property {number} width
     * @property {number} height
     */

    /**
     * @typedef {object} PageExtracted
     * @property {TextExtracted[]} textExtracted
     * @property {ImageExtracted[]} imageExtracted
     * @property {PageAttributes} pageAttributes
     */

    /**
     * Opens a PDF through pdf.js.
     *
     * @param file {HTMLInputElement} source handed straight to `pdfjsLib.getDocument`
     * @returns {Promise<*>} resolves with the loaded pdf.js document
     */
    async loadDocumentFromFile(file) {
        return await pdfjsLib.getDocument(file).promise;
    }

    /**
     * Renders one page into a detached canvas and resolves with whatever the
     * render task yields.
     *
     * NOTE(review): the canvas is never sized and stock pdf.js resolves the
     * render promise without a value; the `PageExtracted` result presumably
     * comes from the modded pdf.js build — confirm.
     *
     * @param doc {*} document returned by {@link loadDocumentFromFile}
     * @param pageNum {number} page number passed to `doc.getPage`
     * @returns {Promise<PageExtracted>}
     */
    async _extractDataFromPage(doc, pageNum) {
        const page = await doc.getPage(pageNum);
        /**
         * @type {HTMLCanvasElement}
         */
        const canvas = document.createElement('canvas');
        const context = canvas.getContext('2d');
        const viewport = page.getViewport({ scale: 1 });
        return await page.render({
            canvasContext: context,
            viewport,
        }).promise;
    }

    /**
     * Normalizes characters commonly mangled by PDF extraction and HTML-escapes
     * the result.
     *
     * @param txt {string} raw text of a single block
     * @returns {string} escaped text
     */
    _processText(txt) {
        return escape(
            txt
                .replaceAll('¦', '•') // fix the bullet from some pdfs
                .replaceAll('⋅', '×')
                .replaceAll('−', '-') // #47675: hyphen variation
                .replace(/(•)(\s)*(\p{L}*)/gu, '$1 $3') // put space between bullets and letters
                .replace(/(^| +)([-–])( +)/gm, '$1—$3') // replace hyphen surrounded by space to dash (#44940)
                .replace(/ﬁ ?/g, 'fi') // #47673: stranger char conversion
                .replace(/ﬂ ?/g, 'fl'), // #47673: stranger char conversion
        );
    }

    /**
     * Walks the candidates reported by `findMathExpressions` and replaces each
     * one that `math.parse` accepts with the value produced by `callback`.
     * Candidates that fail to parse (or whose callback throws) are copied
     * through unchanged.
     *
     * @param txt {string} already escaped text
     * @param callback {function(match: RegExpMatchArray, value: string)} builds the replacement
     * @returns {Promise<string>} `txt` with the accepted candidates replaced
     */
    async _processMathExpression(txt, callback) {
        let matches;
        try {
            matches = await findMathExpressions(txt);
        } catch (e) {
            console.error(e);
            // Detection is best effort: keep the text untouched instead of
            // returning an empty string, which would drop the page content.
            return txt;
        }

        let output = '';
        let lastIndex = 0;
        for (const { match, lastIndex: end } of matches) {
            const start = match.index;
            output += txt.slice(lastIndex, start);

            let value;
            const str = match[0];
            try {
                // every side of an (in)equality must be parseable on its own
                for (const sentence of str.split(
                    /=|&lt;|&gt;|&lt;=|&gt;=|!=|&lt;&gt;/,
                )) {
                    math.parse(
                        sentence
                            .replaceAll('&lt;', '<')
                            .replaceAll('&gt;', '>')
                            .replaceAll('&nbsp;', ' '),
                    );
                }
                value = callback(match, str.trim());
            } catch (e) {
                // not a valid expression: leave the original text in place
                value = str;
            }

            output += value;
            lastIndex = end;
        }
        output += txt.slice(lastIndex);
        return output;
    }

    /**
     * Wraps every valid math expression of `txt` in an
     * `<editor-element type='math'>` element. Capture groups 1 and 21 of the
     * match are emitted around the element with their spaces turned into
     * `&nbsp;`.
     *
     * @param txt {string}
     * @returns {Promise<string>}
     */
    async detectMathExpressions(txt) {
        return await this._processMathExpression(txt, (match, str) => {
            console.debug(`Valid math expression detected: ${str}`);

            let value = match[1].replaceAll(' ', '&nbsp;');
            value += `<editor-element type='math'>${str}</editor-element>`;
            value += match[21].replaceAll(' ', '&nbsp;');
            return value;
        });
    }

    /**
     * Currently a no-op: returns `txt` unchanged. The previous implementation
     * is kept below as reference for the pending rewrite.
     *
     * @param txt {string}
     * @returns {string}
     */
    _detectComputerRelated(txt) {
        return txt;
        //TODO: needs new implementation
        // return txt.replace(
        //     getRegexpComputerRelatedDetectUrl('&lt;', '&gt;'),
        //     (match, g1, g2, g3, g4, g5, g6, g7) => {
        //         if (CONTAINS_TAG_REGEXP.test(match)) {
        //             return match;
        //         }
        //         let tdlDomain = g4.split('.').filter((value) => !!value);
        //         tdlDomain = tdlDomain[tdlDomain.length - 1]?.toUpperCase();
        //         if (TdlDomainsSet.has(tdlDomain)) {
        //             let removedAtEnd = '';
        //             if (g2.endsWith('.')) {
        //                 g2 = g2.substring(0, g2.length - 1); // removes the dot
        //                 removedAtEnd = '.'; // put the dot outside the context
        //             }
        //             console.debug(`Computer related detected: ${match}`);
        //             const url = (g1 ?? '') + g2.trim() + (g7 ?? '');
        //             return `<editor-element type='computer-related'>${url}</editor-element>${removedAtEnd}`;
        //         } else {
        //             console.debug(
        //                 `Computer related ignored due invalid TDL: ${match}`,
        //             );
        //             return match;
        //         }
        //     },
        // );
    }

    /**
     * Rewrites table-of-contents style lines ("Title ........ 12") as
     * "Title :::: 12". Lines that already contain a tag are left untouched.
     *
     * @param txt {string}
     * @returns {string}
     */
    _detectSummaries(txt) {
        // source: https://sgm.codebit.com.br/manutencao/36444
        txt = txt.replace(
            /^(.*?)(^| |&nbsp;)([^.\n]+)((\.| |&nbsp;)*( |&nbsp;)*\.{5,}(\.| |&nbsp;)*)(<[^\n]*>)?((\d[.\d])*\d+)(.*)$/gm,
            (match, g1, g2, g3, g4, g5, g6, g7, g8, g9) => {
                if (CONTAINS_TAG_REGEXP.test(match)) {
                    return match;
                }

                const description = `${g1}${g2}${g3}`
                    .trim()
                    .replace(/<.*?>/g, '');
                const page = g9.trim().replace(/<.*?>/g, '');

                console.debug(`Summary detected: ${description} - ${page}`);

                return `${description.trimEnd()} :::: ${page.trim()}`;
                // Requested to disable in #48393
                // const element = CustomBehaviors.createEditorElementSummary(null, description, page);
                // return element.outerHTML;
            },
        );
        return txt;
    }

    /**
     * Final clean-up of a page's text: collapses redundant whitespace, runs
     * the math/summary/computer-related detectors, converts line breaks to
     * `<br>`, moves inline bullets to their own line and merges stacked tags.
     *
     * @param txt {string}
     * @param documentType {import('plataforma-braille-common').DocumentTypeEnumValue}
     * @returns {Promise<string>}
     */
    async _postProcessText(txt, documentType) {
        // remove double spaces between words (#51157). The right-hand
        // character is only looked at, not consumed, so consecutive gaps
        // around one-character words ("a  b  c") are all collapsed.
        txt = txt.replace(/(\S) {2,}(?=\S)/g, '$1 ');
        if (documentType !== DocumentTypeEnum.IMAGE_DESCRIPTION_ARC) {
            // remove multiple lines (#51157)
            txt = txt.replaceAll(/\n{2,}/g, '\n\n');
        }

        txt = await this.detectMathExpressions(txt);
        txt = this._detectSummaries(txt);
        txt = this._detectComputerRelated(txt);

        // swap lines to <br>
        txt = txt.replaceAll('\n', '<br>\n');

        // put bullets in new line
        txt = txt.replace(/( *[ \p{L}]+)(•)/gu, (match, g1, g2) => {
            if (g1.trim() !== '') {
                return `${g1}<br>\n${g2}`;
            } else {
                return match;
            }
        });

        txt = removeStackedTags(txt);

        // remove alone numbers for bold text (#52363)
        txt = txt.replace(/<strong>\s*([\d,.) ]*)\s*<\/strong>/g, '$1');

        return removeStackedTags(txt);
    }

    /**
     * Tells whether the vertical extents of two blocks intersect (touching
     * edges do not count).
     *
     * @param obj1 {TextExtracted}
     * @param obj2 {TextExtracted}
     * @returns {boolean}
     */
    _isOverlappedY(obj1, obj2) {
        const y1 = obj1.y;
        const h1 = obj1.height;

        const y2 = obj2.y;
        const h2 = obj2.height;

        return y1 < y2 + h2 && y1 + h1 > y2;
    }

    /**
     * Tells whether the horizontal extents of two blocks intersect (touching
     * edges do not count).
     *
     * @param obj1 {TextExtracted}
     * @param obj2 {TextExtracted}
     * @returns {boolean}
     */
    _isOverlappedX(obj1, obj2) {
        const x1 = obj1.x;
        const w1 = obj1.width;

        const x2 = obj2.x;
        const w2 = obj2.width;

        return x1 < x2 + w2 && x1 + w1 > x2;
    }

    /**
     * Length of the vertical intersection of two blocks. The value is
     * negative when the blocks do not intersect.
     *
     * @param priorBlock {TextExtracted}
     * @param block {TextExtracted}
     * @returns {number}
     */
    _getIntersectionY(priorBlock, block) {
        const y1 = priorBlock.y;
        const h1 = priorBlock.height;

        const y2 = block.y;
        const h2 = block.height;

        const y = Math.max(y1, y2);
        const h = Math.min(y1 + h1, y2 + h2);
        return h - y;
    }

    /**
     * @typedef {object} BlockCompare
     * @property {boolean} sameContext
     * @property {number} xDistance
     * @property {number} avgCharWidth
     * @property {boolean} fontSizeDiff
     * @property {boolean} sameParagraphTolerance
     * @property {number} yDistance
     * @property {boolean} overlapped
     * @property {boolean} sameParagraph
     *
     */

    /**
     * Computes the geometric relations between a block and its predecessor
     * that drive the line/paragraph joining heuristics.
     *
     * @param priorBlock {TextExtracted}
     * @param block {TextExtracted}
     * @returns {BlockCompare}
     */
    _blockCompare(priorBlock, block) {
        // distance between the bottom edges of both blocks
        const yDistance =
            block.y + block.height - (priorBlock.y + priorBlock.height);
        const sameParagraphTolerance = block.height / yDistance >= 0.8;
        const overlappedY = this._isOverlappedY(priorBlock, block);
        const overlappedYArea = this._getIntersectionY(priorBlock, block);
        const sameParagraph =
            (overlappedY && overlappedYArea / block.height >= 0.65) ||
            Math.round(block.y) === Math.round(priorBlock.y);
        const overlapped =
            overlappedY && this._isOverlappedX(priorBlock, block);
        // the prior block may have been merged; in that case its own
        // (pre-merge) geometry must be used
        const priorBlockX = priorBlock.originalX ?? priorBlock.x;
        const priorBlockWidth = priorBlock.originalWidth ?? priorBlock.width;
        const avgCharWidth =
            (priorBlockWidth + block.width) /
            2 /
            (priorBlock.text.length + block.text.length);
        const xDistance =
            block.x >= priorBlockX
                ? block.x - (priorBlockX + priorBlockWidth)
                : block.x - priorBlockX;
        const sameContext =
            (overlapped || sameParagraph || sameParagraphTolerance) &&
            xDistance < avgCharWidth * 5;
        // if font size is too different
        const fontSizeDiff =
            Math.min(priorBlock.font.size, block.font.size) /
                Math.max(priorBlock.font.size, block.font.size) <
            0.98;

        return {
            yDistance,
            overlapped,
            sameParagraphTolerance,
            sameParagraph,
            avgCharWidth,
            xDistance,
            sameContext,
            fontSizeDiff,
        };
    }

    /**
     * Bounding box enclosing both blocks.
     *
     * @param priorBlock {TextExtracted}
     * @param block {TextExtracted}
     * @returns {{x: number, width: number, y: number, height: number}}
     */
    _mergeBlocks(priorBlock, block) {
        const x1 = priorBlock.x;
        const w1 = priorBlock.width;
        const x2 = block.x;
        const w2 = block.width;

        const y1 = priorBlock.y;
        const h1 = priorBlock.height;
        const y2 = block.y;
        const h2 = block.height;

        const x = Math.min(x1, x2);
        const width = Math.max(x1 + w1, x2 + w2) - x;
        const y = Math.min(y1, y2);
        const height = Math.max(y1 + h1, y2 + h2) - y;
        return {
            x,
            y,
            width,
            height,
        };
    }

    /**
     * Decides whether `block` continues the line of `priorBlock`.
     *
     * @param priorBlock {TextExtracted}
     * @param block {TextExtracted}
     * @param compare {BlockCompare} result of {@link _blockCompare} for the pair
     * @returns {boolean}
     */
    _textShouldBeOnSameLine(priorBlock, block, compare) {
        if (compare.sameParagraph) {
            return true;
        }
        if (
            priorBlock.text.trimEnd().endsWith('-') &&
            block.text.trimStart().startsWith('-')
        ) {
            return true;
        }
        if (!compare.sameContext) {
            return false;
        }

        const priorTrimmed = priorBlock.text.trimEnd();
        const trimmed = block.text.trimStart();
        if (compare.sameParagraphTolerance) {
            // starts or finish with punctuation mark
            for (const punctuationMark of PunctuationMarksFinishPhrasePtBr) {
                if (
                    priorTrimmed.endsWith(punctuationMark) ||
                    trimmed.startsWith(punctuationMark)
                ) {
                    return false;
                }
            }
            // if font size is too different
            return !compare.fontSizeDiff;
        }
        return false;
    }

    /**
     * Decides whether a space must be inserted when `block` is appended to
     * `priorBlock` on the same line.
     *
     * @param priorBlock {TextExtracted}
     * @param block {TextExtracted}
     * @param compare {BlockCompare} result of {@link _blockCompare} for the pair
     * @returns {boolean}
     */
    _textHasSpaceBetween(priorBlock, block, compare) {
        if (priorBlock.text.endsWith(' ') || block.text.startsWith(' ')) {
            // already has space
            return false;
        }

        const priorTrimmed = priorBlock.text.trimEnd();
        const trimmed = block.text.trimStart();
        if (compare.sameParagraphTolerance) {
            // starts with punctuation mark
            for (const punctuationMark of PunctuationMarksPtBr.concat(
                QuotationMarksPtBr,
            )) {
                if (trimmed.startsWith(punctuationMark)) {
                    return false;
                }
            }

            // if font size is too different
            if (compare.fontSizeDiff) {
                return false;
            }
        }

        // NOTE(review): `priorTrimmed.startsWith('/')` looks like it may have
        // been meant as `endsWith('/')`; kept as is because the heuristics are
        // tuned against real documents — confirm.
        if (
            priorTrimmed.endsWith('-') ||
            trimmed.startsWith('-') ||
            priorTrimmed.startsWith('/') ||
            trimmed.startsWith('/')
        ) {
            // maybe an url
            return false;
        }

        // no space between words (a word must have a single space from another)
        if (compare.sameParagraph) {
            if (
                !compare.fontSizeDiff &&
                Math.round(compare.xDistance / compare.avgCharWidth) < 1
            ) {
                return false;
            }
        }

        return true;
    }

    /**
     * Returns a new array without blocks that repeat the position, size and
     * text of an earlier block. The first occurrence is kept.
     *
     * @param textExtracted {TextExtracted[]}
     * @returns {TextExtracted[]}
     */
    _removeDuplicatedBlocks(textExtracted) {
        const isSameBlock = (a, b) =>
            a.x === b.x &&
            a.y === b.y &&
            a.width === b.width &&
            a.height === b.height &&
            a.text === b.text;

        const uniqueBlocks = [];
        for (const block of textExtracted) {
            if (!uniqueBlocks.some((kept) => isSameBlock(kept, block))) {
                uniqueBlocks.push(block);
            }
        }
        return uniqueBlocks;
    }

    /**
     * Looks for numeric-only blocks in the top and bottom 10% of the page and,
     * when a page number can be determined, removes them from `textExtracted`
     * (the array is modified in place).
     *
     * With a single candidate its value is the page number. With several
     * candidates only those equal to `lastPage + 1` are accepted.
     *
     * @param textExtracted {TextExtracted[]}
     * @param lastPage {number | null} last page number detected so far
     * @param pageAttributes {PageAttributes}
     * @returns {{textExtracted: TextExtracted[], page: number | null | undefined}}
     */
    _removeAndExtractPageNumbers(textExtracted, lastPage, pageAttributes) {
        // #36452: remove page numbers
        if (textExtracted.length === 0) return { textExtracted }; // no data
        const startRemoteAreaY = 0.1 * pageAttributes.height;
        const endRemoteAreaY =
            pageAttributes.height - 0.1 * pageAttributes.height;

        // "1.234" -> 1234 (dot as thousands separator), "12,5" -> 12
        const parsePageNumber = (raw) =>
            parseInt(raw.replaceAll('.', '').replaceAll(',', '.'), 10);

        let toRemove = [];
        for (const block of textExtracted) {
            if (block.y <= startRemoteAreaY || block.y >= endRemoteAreaY) {
                if (block.text.match(/^ *\d+[,.]?\d* *$/m)) {
                    toRemove.push(block);
                }
            }
        }

        let page = null;
        if (toRemove.length > 1 && lastPage) {
            // ambiguous: keep only the candidates that continue the sequence
            toRemove = toRemove.filter(
                (block) => parsePageNumber(block.text) === lastPage + 1,
            );
            if (toRemove.length > 0) {
                page = lastPage + 1;
            }
        } else if (toRemove.length === 1) {
            page = parsePageNumber(toRemove[0].text);
        }

        if (page) {
            for (const block of toRemove) {
                const index = textExtracted.indexOf(block);
                textExtracted.splice(index, 1);
            }
        }
        return {
            textExtracted,
            page,
        };
    }

    /**
     * Takes the first image whose `y` lies between `yStart` and `yEnd` out of
     * `imagesExtracted` (the array is modified in place) and resizes its data.
     *
     * @param imagesExtracted {ImageExtracted[]}
     * @param yStart {number | null | undefined} lower bound; no lower bound when nullish
     * @param yEnd {number}
     * @returns {Promise<*>} the image, or `null` when none is in the region
     */
    async _getImageFromRegion(imagesExtracted, yStart, yEnd) {
        let image = null;
        for (const img of imagesExtracted) {
            if ((yStart == null || img.y >= yStart) && img.y <= yEnd) {
                image = img;
                break;
            }
        }
        if (image) {
            imagesExtracted.splice(imagesExtracted.indexOf(image), 1);
            image.data = await resizeBase64Img(image.data, 400, 400, 0.8);
        }
        return image;
    }

    /**
     * Builds the markup of an image editor element, surrounded by line breaks.
     *
     * @param image {ImageExtracted}
     * @param page {number | null | undefined} page forwarded to the element factory
     * @returns {string}
     */
    _createEditorElementImage(image, page = null) {
        const element = createEditorElementImage(
            null,
            null,
            null,
            page,
            image.data,
        );
        element.setAttribute('data-id', image.objId);
        return '\n' + element.outerHTML + '\n';
    }

    /**
     * Turns the blocks and images extracted from one page into editor markup.
     *
     * Blocks are visited in order; each one is joined to the previous one
     * (same line, with or without a space) or separated by line breaks
     * according to their geometry. Images are inserted where they fall between
     * blocks and any remaining image is appended at the end. For image
     * description document types the text blocks are skipped.
     *
     * @param extractedData {PageExtracted}
     * @param lastExtractedPage {number | null}
     * @param pageAttributes {PageAttributes}
     * @param documentType {import('plataforma-braille-common').DocumentTypeEnumValue}
     * @returns {Promise<{text: string, page: number | null | undefined}>}
     */
    async _extractTextFromTextData(
        extractedData,
        lastExtractedPage,
        pageAttributes,
        documentType,
    ) {
        let text = '';
        extractedData.textExtracted = this._removeDuplicatedBlocks(
            extractedData.textExtracted,
        );
        const { textExtracted, page } = this._removeAndExtractPageNumbers(
            extractedData.textExtracted,
            lastExtractedPage,
            pageAttributes,
        );

        if (!isDocumentTypeImageDescription(documentType)) {
            for (let i = 0; i < textExtracted.length; i++) {
                const block = textExtracted[i];
                const priorBlock = i > 0 ? textExtracted[i - 1] : null;
                const nextBlock =
                    textExtracted.length > i + 1 ? textExtracted[i + 1] : null;

                let pre = '';
                let post = '';
                if (block.font.boldDetected) {
                    pre = pre + '<strong>';
                    post = '</strong>' + post;
                }
                if (block.font.italicDetected) {
                    pre = pre + '<em>';
                    post = '</em>' + post;
                }
                if (block.font.underlineDetected) {
                    pre = pre + '<span style="text-decoration: underline;">';
                    post = '</span>' + post;
                }

                let separator = '\n';
                if (block.text.trimEnd().endsWith('-') && nextBlock) {
                    block.text = block.text.trimEnd();
                    const words = block.text.split(' ');
                    const lastWord = words[words.length - 1];
                    if (
                        lastWord.match(/^[^- \n0-9]+-+$/gs) &&
                        !lastWord.includes('/')
                    ) {
                        // ends with hyphen and not an url
                        block.text = block.text.substring(
                            0,
                            block.text.length - 1,
                        );
                        nextBlock.hyphenation = true;
                    }
                }

                const image = await this._getImageFromRegion(
                    extractedData.imageExtracted,
                    priorBlock?.y,
                    block.y,
                );
                if (image) {
                    if (priorBlock) {
                        const imageDistance =
                            image.y - (priorBlock.y + priorBlock.height);
                        const imageLinesBetween = Math.abs(
                            Math.round(imageDistance / priorBlock.height),
                        );
                        for (
                            let line = 0;
                            line < imageLinesBetween && line < 5;
                            line++
                        ) {
                            // max 5 lines of space
                            text += '\n';
                        }
                    }
                    text += this._createEditorElementImage(image, page);
                }

                if (priorBlock) {
                    const compare = this._blockCompare(priorBlock, block);
                    const { yDistance, sameParagraph } = compare;

                    if (!block.hyphenation && block.text.startsWith('-')) {
                        const priorWords = priorBlock.text.split(' ');
                        const priorBlockLastWord =
                            priorWords[priorWords.length - 1];
                        const nextBlockFirstWord =
                            nextBlock?.text.split(' ')[0];

                        const startWord = priorBlockLastWord;
                        let connectorBlock;
                        let endWord;
                        if (block.text.trim() === '-') {
                            // a lone hyphen joins the prior and next blocks;
                            // without a next block there is nothing to join
                            // (avoids testing the literal "-undefined")
                            endWord = nextBlock
                                ? `-${nextBlockFirstWord}`
                                : '-';
                            connectorBlock = true;
                        } else {
                            endWord = block.text;
                            connectorBlock = false;
                        }

                        if (
                            !startWord.match(UrlRegExp) &&
                            startWord.match(/^[^- \n0-9]*$/gs) &&
                            endWord.match(/^-+[^- \n0-9]+$/gs)
                        ) {
                            block.text = block.text.substring(1);
                            if (connectorBlock && nextBlock) {
                                nextBlock.hyphenation = true;
                            }
                            block.hyphenation = true;
                        }
                    }

                    // check for superscript (ordinal) or subscript
                    // the prior block may have been merged; in that case its
                    // own (pre-merge) geometry must be used
                    const priorBlockY = priorBlock.originalY ?? priorBlock.y;
                    const priorBlockHeight =
                        priorBlock.originalHeight ?? priorBlock.height;
                    if (
                        sameParagraph &&
                        priorBlock.text.match(/\w/) && // has at least one word character
                        priorBlock.font.size === block.font.size &&
                        priorBlockHeight / block.height >= 1.5
                    ) {
                        const upDistance = Math.abs(priorBlockY - block.y);
                        const downDistance = Math.abs(
                            priorBlockY +
                                priorBlockHeight -
                                (block.y + block.height),
                        );
                        if (upDistance < downDistance) {
                            if (block.text === 'o') {
                                block.text = 'º';
                            } else if (block.text === 'a') {
                                block.text = 'ª';
                            } else {
                                pre = pre + '<sup>';
                                post = '</sup>' + post;
                            }
                        } else {
                            pre = pre + '<sub>';
                            post = '</sub>' + post;
                        }
                    }

                    const hyphenation = block.hyphenation && !sameParagraph;
                    if (
                        hyphenation ||
                        this._textShouldBeOnSameLine(priorBlock, block, compare)
                    ) {
                        if (
                            !hyphenation &&
                            this._textHasSpaceBetween(
                                priorBlock,
                                block,
                                compare,
                            )
                        ) {
                            separator = ' ';
                        } else {
                            separator = '';
                        }

                        // grow the block to the bounding box of the line so
                        // far, remembering its own geometry first
                        const merged = this._mergeBlocks(priorBlock, block);
                        block.originalX = block.x;
                        block.x = merged.x;
                        block.originalY = block.y;
                        block.y = merged.y;
                        block.originalWidth = block.width;
                        block.width = merged.width;
                        block.originalHeight = block.height;
                        block.height = merged.height;
                    } else {
                        const distance = image
                            ? block.y - (image.y + image.height)
                            : yDistance;
                        const refHeight =
                            block.height > priorBlock.height
                                ? priorBlock.height
                                : block.height;
                        let linesBetween;
                        if (image && distance < 0) {
                            linesBetween = 0;
                        } else {
                            linesBetween = Math.abs(
                                Math.round(distance / refHeight),
                            );
                        }
                        for (
                            let line = 1;
                            line < linesBetween && line < 2;
                            line++
                        ) {
                            // max 2 lines of space
                            separator += '\n';
                        }
                    }
                } else {
                    separator = '';
                }

                if (block.text !== ' ') {
                    text +=
                        separator + pre + this._processText(block.text) + post;
                } else {
                    text += separator === '' ? ' ' : separator;
                }
            }
        }

        // images that were not placed between text blocks
        for (const image of extractedData.imageExtracted) {
            if (documentType === DocumentTypeEnum.IMAGE_DESCRIPTION_ARC) {
                text += '\n\n\n';
            }
            text += this._createEditorElementImage(image, page);
            if (documentType === DocumentTypeEnum.IMAGE_DESCRIPTION_ARC) {
                text += '\n\n\n';
            }
        }

        if (
            !isDocumentTypeImageDescription(documentType) ||
            documentType === DocumentTypeEnum.IMAGE_DESCRIPTION_ARC
        ) {
            // I18N
            const pageStr = page ?? 'INSERIR PÁGINA TINTA';
            text = `&lt;${pageStr}&gt;\n` + text;
        }
        return {
            text: await this._postProcessText(
                normalizeSpaces(text),
                documentType,
            ),
            page,
        };
    }

    /**
     * Extracts every page of the document and concatenates the results, each
     * wrapped in an `<editor-page>` element.
     *
     * Extraction stops before the next page when `abortFn` returns `true` or
     * when the abort function handed to `onProgress` has been called.
     *
     * @param document pdf.js document returned by {@link loadDocumentFromFile}
     * @param onProgress {function(currentPage: number, pageCount: number, abort: function()) | null}
     * @param abortFn {function():boolean | null}
     * @param documentType {import('plataforma-braille-common').DocumentTypeEnumValue}
     * @returns {Promise<string | null>} the markup, or `null` when aborted
     */
    async extractText(document, onProgress, abortFn, documentType) {
        let text = '';
        console.debug('Extracting data from PDF.');

        let lastExtractedPage = null;

        let abortedOnProgress = false;
        // The same abort trigger is given to every progress notification,
        // including the initial one (which used to receive the `abortFn`
        // predicate, possibly null, instead of something that aborts).
        const abortFromProgress = () => (abortedOnProgress = true);

        if (onProgress) onProgress(0, document.numPages, abortFromProgress);
        for (let i = 1; i <= document.numPages; i++) {
            if (abortedOnProgress || (abortFn && abortFn() === true)) {
                console.warn('PDF text extraction aborted.');
                return null;
            }
            const extractedData = await this._extractDataFromPage(document, i);
            const { text: extractedText, page } =
                await this._extractTextFromTextData(
                    extractedData,
                    lastExtractedPage,
                    extractedData.pageAttributes,
                    documentType,
                );

            text += '<editor-page>\n' + extractedText + '\n</editor-page>\n\n';
            lastExtractedPage = page ?? lastExtractedPage;
            if (onProgress) onProgress(i, document.numPages, abortFromProgress);
        }
        console.debug('Data successfully extracted.');
        return text;
    }
}

/**
 * Merges adjacent runs of the same inline formatting, i.e. removes a closing
 * tag that is followed (optionally after whitespace) by the same opening tag,
 * keeping the whitespace.
 *
 * Handles `<strong>`, `<em>`, attribute-less `<span>` and the underline span
 * `<span style="text-decoration: underline;">`. The underline span was
 * previously never merged because only a bare `<span>` was matched.
 *
 * @param txt {string}
 * @returns {string}
 */
export function removeStackedTags(txt) {
    txt = txt.replace(/<\/strong>(\s*)<strong>/gm, '$1');
    txt = txt.replace(/<\/em>(\s*)<em>/gm, '$1');
    txt = txt.replace(/<\/span>(\s*)<span>/gm, '$1');

    // Underline spans: the match starts at the opening underline tag and may
    // not cross any other span tag, so the removed `</span>` is guaranteed to
    // belong to an underline span and not to some unrelated span.
    const stackedUnderline =
        /(<span style="text-decoration: underline;">(?:(?!<\/?span\b)[\s\S])*)<\/span>(\s*)<span style="text-decoration: underline;">/g;
    // A match consumes the second opening tag, so a chain of three or more
    // runs needs more than one pass; repeat until nothing changes.
    let previous;
    do {
        previous = txt;
        txt = txt.replace(stackedUnderline, '$1$2');
    } while (txt !== previous);

    return txt;
}
