Sunday, 15 April 2012

Grab products from BestBuy

Grab products from BestBuy
This task uses a JavaScript sandbox with jsoup support to grab products from BestBuy.
Grab products from BestBuy
  1. Create a JavaScript sandbox with jsoup support
  2. Create the following JavaScript
javascript
/**
 * Script entry point (presumably invoked by the sandbox with its `env`
 * facade and an `args` map -- confirm against the sandbox documentation).
 *
 * Grabs the products of one hard-coded BestBuy category listing (first page
 * only) and appends every grabbed product to the list stored under the
 * 'links' key of `args`.
 *
 * @param env  sandbox facade, passed through unchanged to grabProduct
 * @param args map-like object; `args.get('links')` must return a list with `add`
 */
function main(env, args) {
  var categoryUrl = 'http://www.bestbuy.com/site/olstemplatemapper.jsp?id=pcat17080&type=page&qp=cabcat0100000%23%230%23%23wv~~cabcat0101000%23%23-1%23%23wv~~q466173746c696d69747067735f323236~~nf862%7C%7C53616d73756e67&list=y&nrp=15&sc=TVVideoSP&sp=-bestsellingsort+skuid&usc=abcat0100000';
  var pageLimit = 1;

  // Grab first, then look up the output list -- same order as before.
  var found = grabProduct(categoryUrl, pageLimit, env);
  var target = args.get('links');

  var idx = 0;
  while (idx < found.size()) {
    target.add(found.get(idx));
    idx += 1;
  }
}

/**
 * Walks listing pages 1..maxpage of a category and collects one map per
 * product found in a `.hproduct` element.
 *
 * Each listing page is requested as `catUrl + '&cp=' + pageNumber`. A failure
 * while loading or scanning a listing page is reported through `env.error`
 * and the next page is tried. A failure while processing a single product
 * (for example its detail page cannot be fetched) is also reported through
 * `env.error`, and only that product is skipped -- the remaining products on
 * the page are still collected.
 *
 * @param catUrl  category listing URL (string) without the `cp` page parameter
 * @param maxpage number of the last listing page to visit (pages start at 1)
 * @param env     sandbox facade providing newArrayList, newHashMap, newURL,
 *                newJsoup and error
 * @return the list created by `env.newArrayList()`, filled with product maps
 */
function grabProduct(catUrl, maxpage, env) {
  var tag = env.newArrayList();
  for (var no = 1; no <= maxpage; no++) {
    try {
      var link = env.newURL(catUrl + '&cp=' + no);
      var doc = env.newJsoup().parse(link, 60000);
      var elements = doc.select('.hproduct');
      for (var i = 0; i < elements.size(); i++) {
        // Isolate each product so that one broken detail page does not
        // discard every product that follows it on the same listing page.
        try {
          var item = grabProductItem(elements.get(i), link, env);
          if (item != null) {
            tag.add(item);
          }
        } catch (itemError) {
          env.error(itemError);
        }
      }
    } catch (pageError) {
      env.error(pageError);
    }
  }
  return tag;
}

/**
 * Builds the map for one `.hproduct` listing element, or returns null when
 * the element has no `.info-main .name a` link.
 *
 * Keys written: 'title' and 'url' always; 'attributes', 'description' and
 * 'small-image' when present in the listing element; 'overview',
 * 'specifications' and 'price' when present on the product detail page.
 *
 * @param element listing element for one product
 * @param link    URL object of the listing page the element came from
 * @param env     sandbox facade
 */
function grabProductItem(element, link, env) {
  var child = element.select('.info-main .name a').first();
  if (child == null) {
    return null;
  }
  var listingBase = link + '';
  var url = env.newURL(link, child.attr('href')) + '';
  var item = env.newHashMap();
  item.put('title', child.text());
  item.put('url', url);

  // Fragments taken from the listing page resolve against the listing URL.
  child = element.select('.attributes').first();
  if (child != null) {
    item.put('attributes', absolutizedHtml(child.html(), listingBase, env));
  }
  child = element.select('.description').first();
  if (child != null) {
    item.put('description', absolutizedHtml(child.html(), listingBase, env));
  }
  child = element.select('.image-col a.uri img.thumb').first();
  if (child != null) {
    item.put('small-image', env.newURL(link, child.attr('src')) + '');
  }

  // Fragments taken from the detail page must resolve against the detail
  // page's own URL, not the listing URL, or relative links point to the
  // wrong place.
  var cdoc = env.newJsoup().parse(env.newURL(url), 60000);
  child = cdoc.select('#esrbcontent').first();
  if (child != null) {
    item.put('overview', absolutizedHtml(child.parent().html(), url, env));
  }
  child = cdoc.select('#tabbed-specifications').first();
  if (child != null) {
    item.put('specifications', absolutizedHtml(child.html(), url, env));
  }

  // '.price', when present, overwrites whatever '.prciest' supplied.
  // NOTE(review): '.prciest' may be a misspelled selector -- confirm against
  // the page markup before changing it.
  child = cdoc.select('.prciest').first();
  if (child != null) {
    item.put('price', child.text());
  }
  child = cdoc.select('.price').first();
  if (child != null) {
    item.put('price', child.text());
  }
  return item;
}

/**
 * Parses an HTML fragment with `baseUrl` as its base, lets buildURL rewrite
 * its link/image URLs, and returns the resulting body HTML.
 */
function absolutizedHtml(html, baseUrl, env) {
  var bdoc = env.newJsoup().parse(html, baseUrl);
  buildURL(bdoc, baseUrl, env);
  return bdoc.select('body').first().html();
}

/**
 * Rewrites, in place, the `href` of every anchor and the `src` of every
 * image in `doc` to the URL obtained by resolving it against `baseUrl`.
 *
 * Only elements that actually carry the attribute are touched: an anchor
 * without `href` (or an image without `src`) would otherwise read back as an
 * empty string and be given a new attribute derived from the base URL alone.
 *
 * @param doc     parsed document offering `select(cssQuery)`
 * @param baseUrl base URL (string) used to resolve relative references;
 *                an invalid value makes `env.newURL` throw to the caller
 * @param env     sandbox facade providing `newURL`
 */
function buildURL(doc, baseUrl, env) {
  var base = env.newURL(baseUrl);
  absolutizeAttribute(doc.select('a[href]'), 'href', base, env);
  absolutizeAttribute(doc.select('img[src]'), 'src', base, env);
}

/**
 * Replaces attribute `name` on each element of `elements` with its value
 * resolved against `base`.
 */
function absolutizeAttribute(elements, name, base, env) {
  for (var i = 0; i < elements.size(); i++) {
    var element = elements.get(i);
    try {
      element.attr(name, env.newURL(base, element.attr(name)) + '');
    } catch (e) {
      // Deliberate best-effort: a reference that cannot be resolved keeps
      // its original value instead of aborting the whole rewrite.
    }
  }
}
    

  Protected by Copyscape Online Copyright Protection

No comments:

Post a Comment