import * as pdfjsDist from 'pdfjs-dist';

/**
 * Extract the text of every page of a pdf file (via url) and process it.
 *
 * All pages are requested without waiting for one another; their texts are
 * concatenated in page order and the result is passed through processText.
 *
 * @param pdfUrl Location of the pdf; handed unchanged to pdfjsDist.getDocument
 * @returns {Promise<String>} Resolves with the processed text; rejects when the
 *                            document or any of its pages cannot be read
 */
export async function extractTextFromPdf(pdfUrl)
{
    pdfjsDist.GlobalWorkerOptions.workerSrc = `/js/pdf.worker.mjs`;
    const pdf = await pdfjsDist.getDocument(pdfUrl).promise;

    // Kick off every page request before awaiting any of them
    const pagePromises = [];
    for (let i = 1; i <= pdf.numPages; i++)
    {
        pagePromises.push(extractPageText(pdf, i));
    }

    // Promise.all keeps the results in page order
    const texts = await Promise.all(pagePromises);

    return processText(texts.join(' '));
}

/**
 * Load one page and concatenate its (padded) text items.
 * @param pdf Document object resolved from pdfjsDist.getDocument(...).promise
 * @param {number} pageNumber 1-based page index
 * @returns {Promise<String>}
 */
async function extractPageText(pdf, pageNumber)
{
    const page = await pdf.getPage(pageNumber);
    const pageText = await page.getTextContent();

    return pageText.items.map(function(textItem)
    {
        return padTextItem(textItem.str);
    }).join('');
}

/**
 * Add the whitespace needed to keep neighbouring text items apart.
 * @param {string} text Raw string of a single text item
 * @returns {string}
 */
function padTextItem(text)
{
    // Empty items and bare line breaks only act as word separators
    if (text === '' || text === '\n')
    {
        return ' ';
    }

    let padded = text;
    if (padded.endsWith(',') || padded.endsWith('.'))
    {
        padded += ' ';
    }

    // Items that begin with punctuation or a hyphen attach to the previous item;
    // everything else is separated from it by a space.
    const attachesToPrevious = padded.startsWith(',') || padded.startsWith('.') || padded.startsWith('-');
    if (!attachesToPrevious)
    {
        padded = ' ' + padded;
    }

    return padded;
}

/**
 * Clean up raw text extracted from a pdf.
 *
 * Steps, in order: drop non-ASCII characters, strip urls and email addresses,
 * rejoin words whose letters were spaced out, collapse whitespace runs into a
 * single space and rejoin words that were hyphenated across a break.
 *
 * @param {string} text
 * @returns {string}
 */
function processText(text)
{
    // remove non-ASCII characters
    // eslint-disable-next-line no-control-regex
    text = text.replace(/[^\x00-\x7F]/g, '');
    // remove urls
    text = text.replace(/(?:https?|ftp):\/\/[\n\S]+/g, '');
    // remove emails (the TLD class must not contain '|', which is a literal inside [])
    text = text.replace(/\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g, '');
    // fix words with letters that are spaced
    text = fixExplodedWords(text);
    // remove multiple spaces
    text = text.replace(/\s+/g, ' ');
    // fix hyphenated words ("exam- ple" -> "example")
    text = text.replace(/(\w+)-\s(\w+)/g, '$1$2');

    return text;
}

/**
 * Rejoin words whose letters were extracted with a space between each of them,
 * e.g. "h e l l o world" -> "hello world".
 *
 * A run only qualifies when it consists of at least three single word
 * characters, each separated by exactly one whitespace character, and is
 * delimited by word boundaries. Anchoring the end of the run on a boundary
 * keeps the separator in front of the following word intact, so the rebuilt
 * word is not glued onto its neighbour.
 *
 * @param {string} text
 * @returns {string}
 */
function fixExplodedWords(text)
{
    return text.replace(/\b\w(?:\s\w){2,}\b/g, function(match)
    {
        return match.replace(/\s/g, '');
    });
}

/**
 * Render a thumbnail of the first page of a pdf file.
 *
 * The page is drawn at scale 1.5 onto a detached canvas element. The promise
 * resolves with the canvas' data url (base64 string) and the number of pages
 * in the pdf. Any failure while loading the document, fetching the page or
 * rendering it rejects the returned promise.
 *
 * @param url Location of the pdf; handed unchanged to pdfjsDist.getDocument
 * @returns {Promise<[string, number]>}
 */
export async function renderThumbnail(url)
{
    pdfjsDist.GlobalWorkerOptions.workerSrc = `/js/pdf.worker.mjs`;

    const pdf = await pdfjsDist.getDocument(url).promise;
    const page = await pdf.getPage(1);

    const scale = 1.5;
    const viewport = page.getViewport({ scale: scale });

    // Size the canvas to the scaled page so the whole page is drawn
    const canvas = document.createElement('canvas');
    const context = canvas.getContext('2d');
    canvas.height = viewport.height;
    canvas.width = viewport.width;

    await page.render({
        canvasContext: context,
        viewport: viewport
    }).promise;

    return [canvas.toDataURL(), pdf.numPages];
}