Pdf-text-img NPM

pdf-text-img

Extract text and images from a PDF.

Usage

const fs = require('fs');
const pdf = require('pdf-text-img');

// TODO: replace 'pdf.pdf' with the path to your own PDF file.
const yourPDF = 'pdf.pdf';
const dataBuffer = fs.readFileSync(yourPDF);

// Load the PDF, then extract the text and the images of every page.
// The value LoadingPDF resolves with is used as the last page number below.
pdf.LoadingPDF(dataBuffer)
    .then(function (pageCount) {
        /**
         * Extract text
         */
        const textDone = pdf.ExtractText(1, pageCount).then(function (data) {
            console.log(data);
        });

        /**
         * Save each extracted image as pdf_<index>.jpg
         */
        const imagesDone = pdf.ExtractImg(1, pageCount).then(function (data) {
            const writes = [];

            for (let i = 0; i < data.length; i++) {
                // Each entry carries its image as a base64 string. Buffer.from is a
                // static factory, so it is called without `new`.
                const imgBuffer = Buffer.from(data[i]['imgBase64'], 'base64');

                // Collect the write promises so that write errors reject the chain
                // instead of being thrown from inside a callback.
                writes.push(fs.promises.writeFile('pdf_' + i + '.jpg', imgBuffer));
            }

            return Promise.all(writes);
        });

        // Call End() once both extractions (including the file writes) have
        // finished, or as soon as one of them fails, rather than immediately
        // while the work is still in flight.
        return Promise.all([textDone, imagesDone]).finally(function () {
            pdf.End();
        });
    })
    .catch(function (err) {
        // Report the failure instead of leaving an unhandled rejection.
        console.error(err);
        process.exitCode = 1;
    });

Note that the approach above extracts every image in the document in a single call. For a large PDF (for example, 1000 pages with 100 pictures on each page) this is an obvious waste of resources. In that case, extract the pictures in batches of a few pages at a time, as shown in the following "batch extraction of pictures" example:

const fs = require('fs');
const pdf = require('pdf-text-img');

// TODO: replace 'pdf.pdf' with the path to your own PDF file.
const yourPDF = 'pdf.pdf';
// Read the whole PDF into memory; this buffer is what gets passed to pdf.LoadingPDF.
const dataBuffer = fs.readFileSync(yourPDF);

/**
 * Extract and save the pictures of the loaded PDF in batches of two pages.
 *
 * Each batch is requested with pdf.ExtractImg, its pictures are written to
 * the current working directory, and only then is the next batch requested.
 * pdf.End() is called once the last batch has been processed or when an
 * error interrupts the extraction.
 *
 * Files are named pdf_<batchStartPage>_<indexInBatch>.jpg so that pictures
 * from different batches can never overwrite each other.
 *
 * @param {number} [startPage=1] First page of the first batch.
 * @returns {Promise<void>} Resolves after every batch has been written;
 *     rejects if an extraction or a file write fails.
 */
async function Img(startPage) {
    const totalPages = pdf.pdfInfo.numPages;
    let batchStart = startPage == null ? 1 : startPage;

    try {
        while (batchStart <= totalPages) {
            // Never request a page past the end of the document (odd page counts).
            const batchEnd = Math.min(batchStart + 1, totalPages);
            const data = await pdf.ExtractImg(batchStart, batchEnd);

            if (data.length === 0) {
                console.log('No picture on current page, extract next page...');
            } else {
                const writes = [];

                for (let i = 0; i < data.length; i++) {
                    // Each entry carries its picture as a base64 string.
                    const imgBuffer = Buffer.from(data[i]['imgBase64'], 'base64');
                    const fileName = 'pdf_' + batchStart + '_' + i + '.jpg';

                    writes.push(fs.promises.writeFile(fileName, imgBuffer));
                }

                // Finish writing this batch before extracting the next one.
                await Promise.all(writes);
                console.log('Continue extraction...');
            }

            batchStart += 2;
        }

        console.log('Completed!');
    } finally {
        pdf.End();
    }
}

// Load the PDF, print its text, and extract its pictures batch by batch.
// The value LoadingPDF resolves with is used as the last page number for
// the text extraction.
pdf.LoadingPDF(dataBuffer)
    .then(function (pageCount) {
        const textDone = pdf.ExtractText(1, pageCount).then(function (data) {
            console.log(data);
        });

        /**
         * batch extraction of pictures
         */
        const picturesDone = Img();

        // Return both promises so that a failure in either one reaches the
        // catch handler below instead of becoming an unhandled rejection.
        return Promise.all([textDone, picturesDone]);
    })
    .catch(function (err) {
        console.error(err);
        process.exitCode = 1;
    });

Questions and feedback are welcome. Email: yfilemail@163.com

@infinitebrahmanuniverse/nolb-pdf @everything-registry/sub-chunk-2421

6 years ago

6 years ago

6 years ago

6 years ago

6 years ago

6 years ago

6 years ago

6 years ago

6 years ago

6 years ago

6 years ago

6 years ago

6 years ago