Monday 16 April 2012

Grab categories from Walmart

Grab categories from Walmart
This task uses a JavaScript sandbox with jsoup support to grab categories from Walmart.
Grab categories from Walmart
  1. Create a JavaScript sandbox with jsoup support.
  2. Create the following JavaScript:
javascript
/**
 * Script entry point: appends every category returned by grabCategory(env)
 * to the list stored under the 'links' key of `args`.
 *
 * @param env  Sandbox environment object; passed through to grabCategory.
 * @param args Object exposing get(name); get('links') must return a list
 *             that supports add(item).
 */
function main(env, args) {
  var output = args.get('links');
  var categories = grabCategory(env);
  var index = 0;
  // Copy one by one, preserving the order produced by grabCategory.
  while (index < categories.size()) {
    output.add(categories.get(index));
    index += 1;
  }
}

/**
 * Collects browsable categories from the Walmart site in two passes.
 *
 * Pass 1 loads the home page and, for every element matching
 * '.MainCategory', gathers the links found under that element's parent.
 * Only links whose resolved URL starts with the '/cp/' or '/browse/' prefix
 * of www.walmart.com are kept, each URL at most once.
 *
 * Pass 2 loads each page kept in pass 1 and looks for '.browseIn' elements:
 *   - none found: the top-level entry itself is returned, but only when its
 *     URL is a '/browse/' URL;
 *   - otherwise: one entry per '.browseIn' element, titled
 *     '<sub title> | <top title>'.
 *
 * Failures are reported through env.error and never thrown: a failing
 * category page is skipped, a failing home page yields an empty result.
 *
 * @param env Sandbox environment; this function uses newArrayList(),
 *            newHashMap(), newURL(spec), newURL(context, spec), newJsoup()
 *            and error(e).
 * @returns The list created by env.newArrayList(), filled with maps created
 *          by env.newHashMap() that hold the keys 'title' and 'url'.
 */
function grabCategory(env) {
  var HOME_URL = 'http://www.walmart.com/index.gsp';
  var CP_PREFIX = 'http://www.walmart.com/cp/';
  var BROWSE_PREFIX = 'http://www.walmart.com/browse/';
  var TIMEOUT_MS = 60000;

  // True only when `text` begins with `prefix`. A plain "contains" test
  // would also accept foreign URLs that embed a Walmart URL in their query.
  function startsWith(text, prefix) {
    return text.indexOf(prefix) === 0;
  }

  // Membership test for the plain objects used as string sets below.
  function hasKey(set, key) {
    return Object.prototype.hasOwnProperty.call(set, key);
  }

  // Builds one result entry in the shape callers receive.
  function newItem(title, url) {
    var item = env.newHashMap();
    item.put('title', title);
    item.put('url', url);
    return item;
  }

  var tag = env.newArrayList();
  try {
    var link = env.newURL(HOME_URL);
    var doc = env.newJsoup().parse(link, TIMEOUT_MS);
    var headings = doc.select('.MainCategory');
    var topcat = [];
    var seen = {};
    var i, j, url;

    // Pass 1: top-level categories linked from the home page.
    for (i = 0; i < headings.size(); i++) {
      var anchors = headings.get(i).parent().select('a');
      for (j = 0; j < anchors.size(); j++) {
        var anchor = anchors.get(j);
        // Skip the heading link itself. hasClass still matches when the
        // element carries additional classes, like the selector above does.
        if (anchor.hasClass('MainCategory')) continue;
        // String(...) yields a script string, so the strict comparisons and
        // object keys below behave the same whatever the host returns.
        url = String(env.newURL(link, anchor.attr('href')));
        if (!startsWith(url, CP_PREFIX) && !startsWith(url, BROWSE_PREFIX)) continue;
        if (hasKey(seen, url)) continue;
        seen[url] = true;
        var title = anchor.text();
        topcat.push({ title: title, url: url, item: newItem(title, url) });
      }
    }

    // Pass 2: sub-categories listed on each top-level page.
    seen = {};
    for (i = 0; i < topcat.length; i++) {
      var top = topcat[i];
      try {
        var pageUrl = env.newURL(top.url);
        var subs = env.newJsoup().parse(pageUrl, TIMEOUT_MS).select('.browseIn');
        if (subs.size() === 0) {
          // No sub-categories: keep the page itself if it is a listing page.
          if (startsWith(top.url, BROWSE_PREFIX) && !hasKey(seen, top.url)) {
            seen[top.url] = true;
            tag.add(top.item);
          }
          continue;
        }
        for (j = 0; j < subs.size(); j++) {
          var sub = subs.get(j);
          var href = String(sub.attr('href'));
          // An element without an href would only resolve to the page itself.
          if (href === '') continue;
          // Relative links are relative to the page they appear on, not to
          // the home page.
          url = String(env.newURL(pageUrl, href));
          if (hasKey(seen, url)) continue;
          seen[url] = true;
          tag.add(newItem(sub.text() + ' | ' + top.title, url));
        }
      } catch (e) {
        // One broken category page must not abort the whole crawl.
        env.error(e);
      }
    }
  } catch (e) {
    env.error(e);
  }
  return tag;
}
    

  Protected by Copyscape Online Copyright Protection

No comments:

Post a Comment