Monday 16 April 2012

Grab products from Walmart

Grab products from Walmart
This task uses a JavaScript sandbox with jsoup support to grab products from Walmart.
Grab products from Walmart
  1. Create a JavaScript sandbox with jsoup support
  2. Create the following JavaScript
javascript
/**
 * Sandbox entry point: scrapes the hard-coded Walmart category listing and
 * appends every product found to the caller-supplied output list.
 *
 * @param {Object} env  Sandbox environment object; passed through untouched
 *                      to grabProduct.
 * @param {Object} args Argument container; args.get('links') must return a
 *                      list exposing add(item), which receives the products.
 */
function main(env, args) {
  var categoryUrl = 'http://www.walmart.com/browse/Vitamins/Herbals/_/N-8z7v?browsein=true&_refineresult=true&_refineresult=true?=32_0&ref=+418891&catNavId=1005863&povid=cat1005863-env250565-moduleA030812-lLinkLHN3Herbals';
  // Number of listing pages to walk.
  var pageLimit = 1;
  var output = args.get('links');
  var products = grabProduct(categoryUrl, pageLimit, env);

  // Copy the scraped products, in order, into the output list.
  var index = 0;
  while (index < products.size()) {
    output.add(products.get(index));
    index += 1;
  }
}

/**
 * Scrape product records from a paginated category listing.
 *
 * For each page 1..maxpage the listing is fetched at
 * catUrl + '&ic=32_' + offset (offset advances by 32 per page). Every '.item'
 * element that contains a '.ListItemLink' yields one record; the product's
 * own page is then fetched to enrich the record. Products whose URL was
 * already seen are skipped.
 *
 * Failures are best-effort: an error while fetching a product page is
 * reported through env.error and the record is still returned without the
 * detail fields; an error while processing a listing page is reported and
 * the loop moves on to the next page.
 *
 * @param {string} catUrl   Category listing URL (must already contain a query
 *                          string, since '&ic=...' is appended).
 * @param {number} maxpage  Number of listing pages to walk.
 * @param {Object} env      Sandbox environment providing newArrayList(),
 *                          newHashMap(), newURL(), newJsoup() and error().
 * @returns {Object} The list created by env.newArrayList(), holding one map
 *   per product with keys 'title' and 'url', plus 'price', 'small-image',
 *   'description', 'specifications' and 'large-image' when available.
 */
function grabProduct(catUrl, maxpage, env) {
  var tag = env.newArrayList();
  // URLs already emitted, used to drop duplicates across items and pages.
  var saved = env.newArrayList();
  for (var no = 1; no <= maxpage; no++) {
    try {
      var start = (no - 1) * 32;
      var link = env.newURL(catUrl + '&ic=32_' + start);
      var doc = fetchDocument(env, link + '');
      var elements = doc.select('.item');
      for (var i = 0; i < elements.size(); i++) {
        var element = elements.get(i);
        var child = element.select('.ListItemLink').first();
        if (child == null) continue;
        var title = child.text();
        // Listing hrefs are resolved against the listing page URL.
        var url = env.newURL(link, child.attr('href')) + '';
        if (saved.indexOf(url) >= 0) continue;
        saved.add(url);
        var item = env.newHashMap();
        item.put('title', title);
        item.put('url', url);
        child = element.select('.PriceDisplay').first();
        if (child != null) {
          item.put('price', child.text());
        }
        child = element.select('img.prodImg').first();
        if (child != null) {
          item.put('small-image', env.newURL(link, child.attr('src')) + '');
        }
        try {
          var cdoc = fetchDocument(env, url);
          child = cdoc.select('.ItemSectionContent .BodyXL').first();
          if (child != null) {
            item.put('description', child.html());
          }
          child = cdoc.select('.ItemSectionContent .SpecTable').first();
          if (child != null) {
            // Store the markup of the element that wraps the spec table.
            item.put('specifications', child.parent().html());
          }
          child = cdoc.select('#mainImage').first();
          if (child != null) {
            // The image lives on the product page, so a relative src must be
            // resolved against the product URL, not the listing URL.
            item.put('large-image', env.newURL(env.newURL(url), child.attr('src')) + '');
          }
        } catch (e) {
          // Detail page is optional: report and keep the basic record.
          env.error(e);
        }

        tag.add(item);
      }
    } catch (e) {
      env.error(e);
    }
  }
  return tag;
}

/**
 * Private helper for grabProduct: download a page through jsoup with the
 * scraper's fixed user agent and a 60000 ms timeout, then parse the body.
 *
 * @param {Object} env Sandbox environment providing newJsoup().
 * @param {string} url Address of the page to download.
 * @returns {Object} The document returned by env.newJsoup().parse(body).
 */
function fetchDocument(env, url) {
  var body = env.newJsoup()
    .connect(url)
    .userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101')
    .timeout(60000)
    .execute()
    .body();
  return env.newJsoup().parse(body);
}
    

  Protected by Copyscape Online Copyright Protection

No comments:

Post a Comment