elastic-indexer v1.0.0
msa-elastic-indexer
installation
Install dependencies.
npm install elastic-indexer
Run kafka
First start zookeeper.
bin/zookeeper-server-start.sh config/zookeeper.properties
Then run kafka.
bin/kafka-server-start.sh config/server.properties
Parser
Require the module. (Assuming file name is app.js)
const oodebe = require('elastic-indexer');
Create configuration of parser. Refer below for reference.
// Kafka consumer / parser configuration.
// Each key under "source" is a Kafka topic name ("npm" here); its parser
// section maps output field names to CSS-selector extraction rules.
let consumerConfig = {
"source": {
"npm": {
parser: {
// Field name -> { html: CSS selector, method: extraction method }
'description': {
html: 'body > div.container.content > div.content-column > p',
method: 'html'
},
'github': {
html: 'body > div.container.content > div.sidebar > ul:nth-child(3) > li:nth-child(3) > a',
method: 'html'
}
},
// Kafka partition to consume from
partition: 0,
// Offset to start reading at when readFromLastOffset is false
offset: 0,
// false => start from the `offset` value above; true => resume from the last read offset
readFromLastOffset: false
}
}
};
Each message in Kafka is associated with a unique numeric ID called an offset. If the key readFromLastOffset is set to true, it will resume reading from the last read offset; if set to false, it will start reading from 0 or from the number specified in the offset key.
Initialize the Parser.
// Create the parser and start the Kafka consumer using the config above.
let parser = new oodebe.Parser();
parser.initializeConsumer(consumerConfig);
Create configuration for Indexer. Refer below for reference.
// ElasticSearch indexer configuration: per-topic index settings plus the
// ElasticSearch connection host.
let indexerConfig = {
"source": {
"npm": {
"indexer" : {
// Target ElasticSearch index name
index: 'npmjs',
// Document type within the index
type: 'repos',
fields: {
// Key under which the parsed document is stored
doc: 'doc'
}
}
}
},
"elastic" : {
// host:port of the ElasticSearch server
host: "localhost:9200"
}
};
Initialize the indexer.
// Create the indexer and connect it to ElasticSearch using the config above.
let indexer = new oodebe.Indexer();
indexer.initializeIndexer(indexerConfig);
Parsing and Indexing
While reading the messages from Kafka, for each message we will receive the message event.
parser.on('message', function (topic, html, $) {
  // Extract the configured fields from the parsed page
  const parsedFields = parser.parseHTML(topic, $);
  // Store the raw HTML together with the parsed fields in ElasticSearch
  indexer.index(topic, html, parsedFields);
});
Events
It emits following events for Parser and Indexer.
// 'error': emitted by the parser with the topic and the error encountered.
parser.on('error',function(topic,error) {
});
// 'offsetOutOfRange': presumably emitted when the requested Kafka offset
// does not exist — confirm against the module's documentation.
parser.on('offsetOutOfRange',function(error) {
});
// 'error': emitted by the indexer with the error encountered.
indexer.on('error',function(error) {
});
// 'success': emitted after a document is indexed.
indexer.on('success',function() {
});
Finding HTML Selector path
One of the easiest ways is to use the Chrome developer tools. For example, if you want to crawl the GitHub link of all the NPM packages, then follow the steps mentioned below:
- visit any npm package page and open up the chrome inspector.
- Traverse to the target HTML element
- Right-click the element, then choose Copy → Copy selector
Complete code
// Complete example: consume crawled pages from Kafka, extract fields with
// CSS selectors, and index the results into ElasticSearch.
const oodebe = require('elastic-indexer');

// Kafka consumer / parser configuration (topic "npm").
let consumerConfig = {
  "source": {
    "npm": {
      parser: {
        // Field name -> { html: CSS selector, method: extraction method }
        'description': {
          html: 'body > div.container.content > div.content-column > p',
          method: 'html'
        },
        'github': {
          html: 'body > div.container.content > div.sidebar > ul:nth-child(3) > li:nth-child(3) > a',
          method: 'html'
        }
      },
      partition: 0,
      offset: 0,
      // false => start from `offset`; true => resume from the last read offset
      readFromLastOffset: false
    }
  }
};

// ElasticSearch indexer configuration.
let indexerConfig = {
  "source": {
    "npm": {
      "indexer" : {
        index: 'npmjs',
        type: 'repos',
        fields: {
          doc: 'doc'
        }
      }
    }
  },
  "elastic" : {
    host: "localhost:9200"
  }
};

let parser = new oodebe.Parser();
let indexer = new oodebe.Indexer();

// Initialize parser and consumer
parser.initializeConsumer(consumerConfig);
indexer.initializeIndexer(indexerConfig);

// Register events
parser.on('message', function(topic, html, $) {
  // Parse the configured fields out of the HTML, then index the result.
  let parseData = parser.parseHTML(topic, $);
  indexer.index(topic, html, parseData);
});
parser.on('error', function(topic, error) {
  console.log(topic);
  console.log(error);
});
parser.on('offsetOutOfRange', function(error) {
  console.log(error);
});
indexer.on('error', function(error) {
  console.log(error);
});
indexer.on('success', function() {
  console.log("indexed");
});
Running the app
Start the server.
node app.js
This starts the app, which parses each HTML message from Kafka and indexes it in ElasticSearch.
7 years ago