1.0.0 • Published 7 years ago

elastic-indexer v1.0.0

Weekly downloads
-
License
ISC
Repository
-
Last release
7 years ago

msa-elastic-indexer

Installation

Install dependencies.

npm install elastic-indexer

Run kafka

First start zookeeper.

bin/zookeeper-server-start.sh config/zookeeper.properties 

Then run kafka.

bin/kafka-server-start.sh config/server.properties

Parser

Require the module (assuming the file is named app.js).

const oodebe = require('elastic-indexer');

Create configuration of parser. Refer below for reference.

// Kafka consumer + parser configuration.
// Keys under `source` are Kafka topic names (here: "npm"). Each entry in
// `parser` maps an output field name to a CSS selector plus the Cheerio
// extraction method used on the matched element.
const consumerConfig = {
  source: {
    npm: {
      parser: {
        description: {
          html: 'body > div.container.content > div.content-column > p',
          method: 'html'
        },
        github: {
          html: 'body > div.container.content > div.sidebar > ul:nth-child(3) > li:nth-child(3) > a',
          method: 'html'
        }
      },
      // Kafka partition to consume from.
      partition: 0,
      // Starting offset, used when readFromLastOffset is false.
      offset: 0,
      // true: resume from the last read offset; false: start at `offset`.
      readFromLastOffset: false
    }
  }
};

Each message in Kafka is associated with a unique numeric ID called an offset. If the key readFromLastOffset is set to true, the consumer resumes reading from the last offset it read; if set to false, it starts reading from 0 or from the number specified in the offset key.

Initialize the Parser.

// Create the parser and start the Kafka consumer with the config above.
let parser = new oodebe.Parser();
parser.initializeConsumer(consumerConfig);

Create configuration for Indexer. Refer below for reference.

// Elasticsearch indexer configuration.
// `source.npm.indexer` selects the target index/type for parsed documents;
// `elastic.host` is the Elasticsearch endpoint.
const indexerConfig = {
  source: {
    npm: {
      indexer: {
        index: 'npmjs', // target Elasticsearch index
        type: 'repos',  // document type within that index
        fields: {
          doc: 'doc'
        }
      }
    }
  },
  elastic: {
    host: 'localhost:9200'
  }
};

Initialize the indexer.

// Create the indexer and connect it to Elasticsearch with the config above.
let indexer = new oodebe.Indexer();
indexer.initializeIndexer(indexerConfig);

Parsing and Indexing

While reading messages from Kafka, we will receive a message event for each message.

// On each Kafka message: extract the configured fields from the HTML,
// then store the raw HTML together with the extracted data in Elasticsearch.
parser.on('message', (topic, html, $) => {
  const extracted = parser.parseHTML(topic, $);
  indexer.index(topic, html, extracted);
});

Events

It emits following events for Parser and Indexer.

// Emitted when parsing a message for `topic` fails.
parser.on('error',function(topic,error) {
});

// Emitted when the requested Kafka offset does not exist.
parser.on('offsetOutOfRange',function(error) {
});

// Emitted when indexing a document into Elasticsearch fails.
indexer.on('error',function(error) {
});

// Emitted after a document has been indexed successfully.
indexer.on('success',function() {
});

Finding HTML Selector path

One of the easiest ways is to use the Chrome developer tools. For example, if you want to crawl the GitHub link of every NPM package, follow the steps mentioned below:

  • Visit any npm package page and open the Chrome inspector.
  • Traverse to the target HTML element.
  • Right-click on it, choose Copy, then Copy selector.

Imgur

Complete code

// Complete example: consume crawled HTML pages from Kafka, extract the
// configured fields, and index the results into Elasticsearch.
const oodebe = require('elastic-indexer');

// Parser/consumer configuration: CSS selectors to extract per topic,
// plus Kafka partition/offset settings.
const consumerConfig = {
  source: {
    npm: {
      parser: {
        description: {
          html: 'body > div.container.content > div.content-column > p',
          method: 'html'
        },
        github: {
          html: 'body > div.container.content > div.sidebar > ul:nth-child(3) > li:nth-child(3) > a',
          method: 'html'
        }
      },
      partition: 0,
      offset: 0,
      readFromLastOffset: false
    }
  }
};

// Indexer configuration: target Elasticsearch index/type and host.
const indexerConfig = {
  source: {
    npm: {
      indexer: {
        index: 'npmjs',
        type: 'repos',
        fields: {
          doc: 'doc'
        }
      }
    }
  },
  elastic: {
    host: 'localhost:9200'
  }
};

const parser = new oodebe.Parser();
const indexer = new oodebe.Indexer();

// Initialize the consumer (parser) and the indexer
parser.initializeConsumer(consumerConfig);
indexer.initializeIndexer(indexerConfig);

// Register events

// For every Kafka message: parse the HTML, then index the result.
parser.on('message', function (topic, html, $) {
  const parseData = parser.parseHTML(topic, $);
  indexer.index(topic, html, parseData);
});

// Parsing failed for a message on `topic`.
parser.on('error', function (topic, error) {
  console.error(topic);
  console.error(error);
});

// The requested Kafka offset does not exist.
parser.on('offsetOutOfRange', function (error) {
  console.error(error);
});

// Indexing into Elasticsearch failed.
indexer.on('error', function (error) {
  console.error(error);
});

// A document was indexed successfully.
indexer.on('success', function () {
  console.log('indexed');
});

Running the app

Start the app.

node app.js

This starts the app, which parses each HTML message and stores the result in Elasticsearch.