Sunday 15 April 2012

Grab categories from Amazon aStores

Grab categories from Amazon aStores
This task uses a JavaScript sandbox with jsoup support to grab categories from Amazon aStores.
Grab categories from Amazon aStores
  1. Create a JavaScript sandbox with jsoup support.
  2. Create the following JavaScript:
javascript
/**
 * Sandbox entry point: grabs the category list of the 'paesia' aStore and
 * logs one report per category (title, node id, parent node id and the
 * parent's title) through env.info.
 *
 * @param env  sandbox helper object; this function uses its newHashMap, info
 *             and error members and hands it on to grabCategory.
 * @param args not used by this function.
 */
function main(env, args) {
  var astore = 'paesia';

  // Renders the four-line report for one category.
  function describe(title, node, parentNode, parentTitle) {
    return '\r\nTitle: ' + title +
      '\r\nNode: ' + node +
      '\r\nParent: ' + parentNode +
      '\r\nParent Title: ' + parentTitle;
  }

  try {
    var categories = grabCategory(astore, env);

    // Index every category by node id so parents can be looked up by id.
    var byNode = env.newHashMap();
    var k = 0;
    while (k < categories.size()) {
      var entry = categories.get(k);
      byNode.put(entry.get('node'), entry);
      k++;
    }

    var n = 0;
    while (n < categories.size()) {
      var current = categories.get(n);
      var nodeId = current.get('node');
      var label = current.get('title');
      var parentId = current.get('parent');
      var parentEntry = byNode.get(parentId);
      // Categories whose parent id is not in the index get an empty parent title.
      var parentLabel = parentEntry != null ? parentEntry.get('title') : '';
      env.info(describe(label, nodeId, parentId, parentLabel));
      n++;
    }
  } catch (e) {
    env.error(e);
  }
}

/**
 * Extracts the category node id from an aStore link.
 *
 * The id is the text after the last 'node=' in the link, cut at the next '&'.
 *
 * @param url link target (href) to inspect.
 * @return the node id, or null when the link carries no 'node=' parameter or
 *         the parameter is empty.
 */
function extractNodeId(url) {
  var start = url.lastIndexOf('node=');
  if (start < 0) {
    return null;
  }
  var node = url.substring(start + 5);
  var end = node.indexOf('&');
  if (end >= 0) {
    node = node.substring(0, end);
  }
  // An empty id would collide with the '' that marks "no parent".
  // String(...) is used so the check does not depend on whether the sandbox
  // hands back a script string or a host string (NOTE(review): confirm for
  // the sandbox engine in use).
  return String(node) === '' ? null : node;
}

/**
 * Appends one category entry to tag for every link in elements whose node id
 * has not been recorded in nodelist yet, and records that id in nodelist.
 *
 * @param elements list of link elements (size/get; each with text/attr).
 * @param parent   node id stored as the 'parent' of every new entry.
 * @param tag      list receiving the new category maps.
 * @param nodelist list of node ids already known; new ids are appended.
 * @param env      sandbox helper used to create the category maps.
 */
function collectCategories(elements, parent, tag, nodelist, env) {
  for (var i = 0; i < elements.size(); i++) {
    var element = elements.get(i);
    var node = extractNodeId(element.attr('href'));
    if (node === null) continue;
    // Skip categories that were already recorded.
    if (nodelist.indexOf(node) >= 0) continue;
    var item = env.newHashMap();
    item.put('title', element.text());
    item.put('node', node);
    item.put('parent', parent);
    tag.add(item);
    nodelist.add(node);
  }
}

/**
 * Collects the categories of an aStore by walking its category links.
 *
 * The landing page is read first; every link under '#searchbrowse' that
 * carries a node id becomes a category with parent ''. Then the page of each
 * known node is read in turn and the links under '#searchbrowse .indent'
 * become categories whose parent is that node. Each node id is recorded once.
 *
 * Any exception is reported through env.error; the entries collected up to
 * that point are still returned.
 *
 * @param astore store name; the pages are fetched from
 *               'http://astore.amazon.com/' + astore + '-20'.
 * @param env    sandbox helper providing newArrayList, newHashMap, newURL,
 *               newJsoup and error.
 * @return list of maps with the keys 'title', 'node' and 'parent'.
 */
function grabCategory(astore, env) {
  var tag = env.newArrayList();
  try {
    var nodelist = env.newArrayList();
    var base = 'http://astore.amazon.com/' + astore + '-20';

    var doc = env.newJsoup().parse(env.newURL(base), 60000);
    collectCategories(doc.select('#searchbrowse a'), '', tag, nodelist, env);

    // nodelist grows while it is being walked, so categories found on a
    // sub-page are visited later in this same loop.
    var no = 0;
    while (no < nodelist.size()) {
      var current = nodelist.get(no);
      doc = env.newJsoup().parse(env.newURL(base + '?node=' + current), 60000);
      collectCategories(doc.select('#searchbrowse .indent a'), current, tag, nodelist, env);
      no++;
    }
  } catch (e) {
    env.error(e);
  }
  return tag;
}
    

  Protected by Copyscape Online Copyright Protection

No comments:

Post a Comment