1.1.0 • Published 7 months ago
newspapers-com-scraper v1.1.0
Newspapers.com Scraper
A Node.js scraper for extracting article data from Newspapers.com based on keywords, dates, and locations.
const scraper = new NewspaperScraper();
await scraper.retrieve({
keyword: "elon musk twitter",
limit: 500,
dateRange: [2020, 2024],
location: "us"
});
What it Does
Searches Newspapers.com and extracts:
- Newspaper title
- Page number and URL
- Publication date
- Location
- Number of keyword matches on each page
Requirements
- Node.js 14+
- Google Chrome browser
- Geonode.com account (optional, for proxy support)
Installation
npm install newspapers-com-scraper
Basic Usage
const NewspaperScraper = require('newspapers-com-scraper');
async function main() {
const scraper = new NewspaperScraper();
// Listen for articles
scraper.on('article', (article) => {
console.log(`Found: ${article.title} (${article.date})`);
});
await scraper.retrieve({
keyword: "elon musk twitter", // Required: search term
limit: 500, // Optional: limit total results
dateRange: [2020, 2024], // Optional: [startYear, endYear] range
location: "us" // Optional: location code
});
}

main().catch(console.error);
Events
The scraper emits three types of events:
// 1. Article found
scraper.on('article', (article) => {
console.log(article);
// {
// title: "The Daily News",
// pageNumber: 4,
// date: "2023-05-15",
// location: "New York, NY",
// keywordMatches: 3,
// url: "https://www.newspapers.com/image/12345678/"
// }
});
// 2. Progress update
scraper.on('progress', (progress) => {
console.log(progress);
// {
// current: 5, // Current page
// total: 20, // Total pages
// percentage: 25.0, // Progress percentage
// stats: {
// timeElapsed: 45.2, // Total seconds
// avgPageTime: 9.04 // Avg seconds per page
// }
// }
});
// 3. Scraping complete
scraper.on('complete', (stats) => {
console.log(stats);
// {
// timeElapsed: 180.5, // Total seconds
// pageTimes: [8.2, 9.1] // Time per page
// }
});
Advanced Configuration
Full configuration example:
const scraper = new NewspaperScraper({
// Scraping settings
concurrentPages: 2, // Pages to scrape in parallel
resultsPerPage: 50, // Results per page (max 50)
maxConcurrentRequests: 10, // Max parallel requests
// Browser settings
browser: {
headless: false, // Show browser
userAgent: 'Mozilla/5.0...',
executablePath: '/path/to/chrome' // Optional
},
// Proxy settings (optional)
proxy: {
enabled: false,
host: 'proxy.host',
port: 9008,
username: 'user',
password: 'pass'
},
// Logging
logger: {
level: 'info', // 'error' | 'warn' | 'info' | 'debug' | 'silent'
custom: null // Custom logger
}
});
await scraper.retrieve({
keyword: "elon musk twitter",
limit: 500,
dateRange: [2020, 2024],
location: "us"
});
// If using proxy, set up .env:
// PROXY_HOST=your_geonode_proxy_host
// PROXY_USER=your_geonode_username
// PROXY_PASS=your_geonode_password
See examples/main.js for a complete working example.