Monday 16 April 2012

Grab categories from Newegg

Grab categories from Newegg
This task uses a JavaScript sandbox with jsoup support to grab categories from Newegg.
Grab categories from Newegg
  1. Create a JavaScript sandbox with jsoup support
  2. Create the JavaScript as follows:
javascript
/**
 * Sandbox entry point: grabs the category entries and appends each one,
 * in order, to the list stored under the 'links' key of `args`.
 *
 * @param env  sandbox environment object, passed through to grabCategory
 * @param args argument container; `args.get('links')` must return a list
 *             exposing `add(item)`
 */
function main(env, args) {
  var output = args.get('links');
  var categories = grabCategory(env);
  var index = 0;
  // The grabbed list exposes size()/get(i) rather than JS array indexing.
  while (index < categories.size()) {
    output.add(categories.get(index));
    index += 1;
  }
}

/**
 * Fetches the Newegg site map page and collects one entry per sub-category
 * link found under each top-level category heading.
 *
 * Every entry is a map (from `env.newHashMap()`) with two keys:
 *   - 'title': "<sub-category> | <top-level category>", with any trailing
 *              " (...)" suffix removed from both parts
 *   - 'url':   the link's href resolved against the site map URL, as a string
 *
 * Any exception is reported through `env.error` and the entries gathered up
 * to that point are returned.
 *
 * @param env sandbox environment providing newArrayList, newHashMap, newURL,
 *            newJsoup and error
 * @returns the list created by `env.newArrayList()`, filled with entry maps
 */
function grabCategory(env) {
  // Drops a trailing " (...)" suffix from a label (presumably an item count
  // shown on the site map -- confirm against the live page). Labels without
  // such a suffix are returned unchanged.
  function stripSuffix(label) {
    var open = label.lastIndexOf(' (');
    var close = label.lastIndexOf(')');
    if (open >= 0 && close >= 0 && open < close) {
      return label.substring(0, open);
    }
    return label;
  }

  var tag = env.newArrayList();
  try {
    var link = env.newURL('http://www.newegg.com/Info/SiteMap.aspx');
    // Second argument is presumably the fetch timeout in milliseconds.
    var doc = env.newJsoup().parse(link, 60000);
    var elements = doc.select('h5 a.nolone');
    for (var i = 0; i < elements.size(); i++) {
      var element = elements.get(i);
      var topcat = stripSuffix(element.text());

      // A heading with no following sibling element has no sub-category
      // block. Skip just that heading instead of letting the null
      // dereference abort the whole scrape via the catch below.
      var holder = element.parent();
      var sibling = holder == null ? null : holder.nextElementSibling();
      if (sibling == null) {
        continue;
      }

      var children = sibling.select('a.nolone');
      for (var j = 0; j < children.size(); j++) {
        var child = children.get(j);
        var title = stripSuffix(child.text()) + ' | ' + topcat;
        // Appending '' turns the URL object into a plain string.
        var url = env.newURL(link, child.attr('href')) + '';
        var item = env.newHashMap();
        item.put('title', title);
        item.put('url', url);
        tag.add(item);
      }
    }
  } catch (e) {
    env.error(e);
  }
  return tag;
}
    

  Protected by Copyscape Online Copyright Protection

No comments:

Post a Comment