0.0.1 • Published 9 years ago

new-scraping v0.0.1

Weekly downloads
1
License
ISC
Repository
github
Last release
9 years ago

Tasks Structure

'use strict';

// Every module-level name is declared once, up front. The list must contain
// each name assigned below: in strict mode, assigning to an undeclared
// identifier throws a ReferenceError.
var CONSTANT1, Class, taskName;

// Private helpers of the task file
function helperFunction () {
}
// ...

// Dependencies
Class = require('class');
// ...

// Constants
CONSTANT1 = 'foo';
// ...

// Task declaration, identified by scraper, agent and task name
taskName = Yakuza.task('scraper', 'agent', 'taskName');

// Builder hook of the task; its callback receives the job
taskName.builder(function (job) {
  // ...
});

// Main hook of the task: issues the request, parses the response and reports
// the outcome through `task.success` or `task.fail`.
taskName.main(function (task, http, params) {
  var opts;

  // Options template from which the request options below are built
  opts = http.optionsTemplate({
    'headers': LOGIN_HEADERS,
    'follow_max': 1
  });

  // Request: Request summary
  // ========================
  // Detailed explanation if necessary.
  // lorem ipsum dolor sit amet
  // ...
  http.get('FormRetrieval', opts.build({url: URL_FORM}), function (err, res, body) {
    var a, b, c;

    if (err) {
      task.fail(err, 'Request error');
      // Stop here so a failed request is never parsed or reported as a success
      return;
    }

    // Parsing: Parsing explanation
    // ----------------------------
    try {
      // Parse account name
      superComplexOperation(/[a-b]+/g).match();
    } catch (error) {
      task.fail(error, 'Failed parsing form');
      return;
    }

    task.success(data);
  });
});

Handling Asynchrony

Lodash Loops

Stopping a loop

  
  // DO NOT DO
  // Here `return false` is used only to cut the `_.each` loop short; the code
  // after the loop gets no flag telling it that the iteration was interrupted.
  _.each(collection, function (elem) {
    return false;
  });
  
  // Do this instead
  // `_.every` stops at the first callback that returns false and hands that
  // outcome back: `noErrors` is true only if every element was processed
  // without hitting the catch block.
  noErrors = _.every(collection, function (elem) {
    try {
      // .. Parsing
    } catch (error) {
      // .. Handle error
      return false;
    }
    return true;
  });
  
  // Leave the enclosing function as soon as any element failed
  if (!noErrors) {
    return;
  }
  
  // Won't reach this line if error happened

Handling asynchronous requests

  var promise1, promise2;
  
  getChecking = Q.defer();
  getCredit = Q.defer();
  
  // Request: Get Checking accounts
  // ==============================
  http.get(..., function (){
    // Parsing:
    try {
      // Parse stuff
      getChecking.resolve(result);
    catch (error) {
      getChecking.reject({error: error, message: 'Something happened'});
      return; 
    }
  });
  
  // Request: Get Credit accounts
  // ==============================
  http.get(..., function (){
    // Parsing:
    try {
      // Parse stuff
      getCredit.resolve(result);
    catch (error) {
      getCredit.reject({error: error, message: 'Something happened'});
      return; 
    }
  });
  
  Q.all([getChecking, getCredit]).then(function (results) {
    // results[0] => get checking results
    // results[1] => get credit results
    
    task.success(...);
  }, function (error) {
  
    task.fail(error.error, error.message);
  });

Directory structure

  • app.js
  • banks
    • bank1
      • login
        • login.task.js
        • ...
      • bank1.agent.js
      • Readme.md
      • ...
    • banks.scraper.js

Component declarations

  • Should always be CapitalizedCamelCase: SomeTask, SomeScraper

Documenting scrapers

  • Agents should have a Readme.md with details about the bank and important info/links etc.
  • Tasks should have explanations for all requests unless they are extremely trivial
  • Tasks can have a general explanation, detailing how requests are made by the task.
  • Code which is not easily understood on its own should be preceded by a comment that briefly explains it
  • Shares should be preceded by a comment explaining why it is necessary and which tasks require the shared value
  • Parsing blocks should be explained in a preceding comment block

Example:

// This awesome task handles tasking taskfulness, it is important to keep track of
// the awesomes received and re-send them to the awesome-receiver and bla bla bla
Yakuza.task('Scraper', 'Agent', 'Task').main(function (task, http, params) {
  var awesomeOpts;

  awesomeOpts = http.build({
    url: URL_AWESOME,
    data: {
      important: 'things'
    },
    follow_max: 1
  });

  // Request: send important things
  // ==============================
  // important things need to be sent to set important session cookies
  // NOTE(review): `awesomeOpts` already comes from `http.build` above, so this
  // second `http.build` call may be redundant - confirm against the http API.
  http.post(http.build(awesomeOpts), function (err, res, body) {

    // Parsing: retrieve important things
    // ----------------------------------
    // This parses important things and skips the last two rows so that nothing explodes and stuff
    try {
      // Replaces all letters with 'awesome' because why not
      body.replace(/[a-zA-Z]+/g, 'awesome');
    } catch (error) {
      // ...
    }

    // Share: Used by `FooTask` for its requests
    task.share('importantThings', body);

    task.success('hi');
  });
});
0.0.1

9 years ago