1.0.12 • Published 4 years ago
pdf-text-img v1.0.12
pdf-text-img
Extract text and images from PDF files.
Usage
const fs = require('fs');
const pdf = require('pdf-text-img');
//todo
const yourPDF = 'pdf.pdf';
const dataBuffer = fs.readFileSync(yourPDF);

/**
 * Load the PDF, print its text and save every extracted image as
 * `pdf_<index>.jpg` in the current working directory.
 *
 * Text and image extraction are started together, but `pdf.End()` is called
 * only after both have finished and every image file has been written.
 * Calling it while extraction is still pending would race with the work in
 * progress (presumably `End()` releases the loaded document - confirm against
 * the library).
 *
 * @returns {Promise<void>} Settles once all output is written and `pdf.End()`
 *   has been called; rejects with the first extraction or write error.
 */
async function main() {
  const pageCount = await pdf.LoadingPDF(dataBuffer);
  try {
    /**
     * Extract text
     */
    const textTask = pdf.ExtractText(1, pageCount).then(function (data) {
      console.log(data);
    });

    /**
     * Download image
     */
    const imageTask = pdf.ExtractImg(1, pageCount).then(function (data) {
      const writes = [];
      for (let i = 0; i < data.length; i++) {
        // Buffer.from is a static factory, so `new` is not needed.
        const imgBuffer = Buffer.from(data[i]['imgBase64'], 'base64');
        // The promise API lets a failed write reject the chain instead of
        // throwing from inside a callback where nothing can catch it.
        writes.push(fs.promises.writeFile('pdf_' + i + '.jpg', imgBuffer));
      }
      return Promise.all(writes);
    });

    await Promise.all([textTask, imageTask]);
  } finally {
    // Runs on success and on failure, but only after loading succeeded.
    pdf.End();
  }
}

main().catch(function (err) {
  console.error(err);
  process.exitCode = 1;
});
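Note that the example above extracts the images of every page in a single call. For a large PDF (for example, 1000 pages with 100 pictures on each page), all of the image data is returned at once, which is an obvious waste of resources. In that case, extract the pictures in batches of a few pages at a time, as in the following "batch extraction of pictures" example: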
const fs = require('fs');
const pdf = require('pdf-text-img');
//todo
// Path of the PDF to process.
const yourPDF = 'pdf.pdf';
// The whole file is read synchronously into a Buffer before loading.
const dataBuffer = fs.readFileSync(yourPDF);
/**
 * Extract the pictures of the loaded PDF two pages at a time and write each
 * one to the current working directory.
 *
 * Batches are processed strictly one after another: the next pair of pages is
 * requested only after every picture of the current pair has been written.
 * When no pages are left, "Completed!" is logged and `pdf.End()` is called.
 *
 * Files are named `pdf_<firstPageOfBatch>_<indexInBatch>.jpg`. Each batch
 * starts on a different page, so names never collide. The previous random
 * names could repeat and silently overwrite earlier pictures.
 *
 * @param {number} [startPage=1] 1-based page to start from.
 * @returns {Promise<void>} Resolves after the last batch is written and
 *   `pdf.End()` has been called; rejects with the first extraction or write
 *   error.
 */
async function Img(startPage) {
  /**
   * Download pictures
   */
  const firstPage = startPage == null ? 1 : startPage;
  const totalPages = pdf.pdfInfo.numPages;

  for (let page = firstPage; page <= totalPages; page += 2) {
    // Clamp so an odd page count never requests a page past the end.
    // NOTE(review): assumes ExtractImg(start, end) treats `end` as inclusive,
    // as the ExtractImg(1, pageCount) usage suggests - confirm.
    const endPage = Math.min(page + 1, totalPages);
    const data = await pdf.ExtractImg(page, endPage);

    // No picture on current page
    if (data.length === 0) {
      console.log('No picture on current page, extract next page...');
      continue;
    }

    const writes = [];
    for (let i = 0; i < data.length; i++) {
      const imgBuffer = Buffer.from(data[i]['imgBase64'], 'base64');
      writes.push(fs.promises.writeFile(`pdf_${page}_${i}.jpg`, imgBuffer));
    }
    // Finish this batch before asking for the next one.
    await Promise.all(writes);
    console.log('Continue extraction...');
  }

  console.log('Completed!');
  pdf.End();
}
/**
 * Load the PDF, print its text, then extract its pictures in batches.
 *
 * Img() calls pdf.End() as soon as it runs out of pages, so picture
 * extraction is started only after the text has been extracted and printed.
 * Otherwise a short or picture-free PDF could reach pdf.End() while
 * ExtractText is still in flight.
 */
pdf.LoadingPDF(dataBuffer)
  .then(function (pageCount) {
    return pdf.ExtractText(1, pageCount);
  })
  .then(function (data) {
    console.log(data);
    /**
     * batch extraction of pictures
     */
    return Img();
  })
  .catch(function (err) {
    // Report the failure instead of leaving an unhandled rejection.
    console.error(err);
    process.exitCode = 1;
  });
Questions and feedback are welcome by email: yfilemail@163.com