Sunday 15 April 2012

Grab products from Amazon aStores

Grab products from Amazon aStores
This task uses a JavaScript sandbox with jsoup support to grab products from Amazon aStores.
Grab products from Amazon aStores
  1. Create a JavaScript sandbox with jsoup support
  2. Create the following JavaScript
javascript
/**
 * Sandbox entry point: grabs the products of one aStore category and appends
 * them to the caller-supplied `links` collection.
 *
 * The store id ('paesia'), category node ('100') and page limit (2) are fixed
 * here; edit them to target another store.
 *
 * @param {Object} env  Sandbox environment; `env.error` receives any failure.
 * @param {Object} args Argument map; `args.get('links')` must return a
 *                      collection exposing `add(item)`.
 */
function main(env, args) {
  var storeId = 'paesia';
  var categoryNode = '100';
  var pageLimit = 2;
  try {
    var found = grabProduct(storeId, categoryNode, pageLimit, env);
    var sink = args.get('links');
    var idx = 0;
    while (idx < found.size()) {
      sink.add(found.get(idx));
      idx += 1;
    }
  } catch (err) {
    // Best-effort task: report the failure to the sandbox instead of aborting.
    env.error(err);
  }
}

/**
 * Grab the featured products of an Amazon aStore category, page by page.
 *
 * For every page from 1 to `maxpage` the listing at
 * `http://astore.amazon.com/<astore>-20?node=<node>&page=<n>` is parsed, each
 * listed product is collected, and its detail page is then fetched to fill in
 * the remaining fields. Failures are reported through `env.error` and never
 * abort the whole run: a failing page is skipped, and a product whose detail
 * page fails is still returned with whatever fields were already collected.
 *
 * Each returned item is a map (from `env.newHashMap()`) that may hold the
 * keys: code, title, url, small-image, large-image, description, details,
 * editorial-reviews, list-price, offer-price, buy-url.
 *
 * @param {string} astore  aStore id (the part before `-20` in the store URL).
 * @param {string} node    Category node id.
 * @param {number} maxpage Last listing page to fetch (pages start at 1).
 * @param {Object} env     Sandbox environment providing newArrayList,
 *                         newHashMap, newURL, newJsoup, getKeys and error.
 * @returns {Object} List (from `env.newArrayList()`) of product maps.
 */
function grabProduct(astore, node, maxpage, env) {
  var tag = env.newArrayList();
  for (var no = 1; no <= maxpage; no++) {
    try {
      var alink = env.newURL('http://astore.amazon.com/' + astore + '-20?node=' + node + '&page=' + no);
      var doc = env.newJsoup().parse(alink, 60000);
      var map = collectListedProducts(doc, alink, env);

      var keys = env.getKeys(map);
      for (var i = 0; i < keys.size(); i++) {
        try {
          fillProductDetails(map.get(keys.get(i)), env);
        } catch (e) {
          // Keep the product with the fields gathered so far.
          env.error(e);
        }
      }

      for (var k = 0; k < keys.size(); k++) {
        tag.add(map.get(keys.get(k)));
      }
    } catch (e) {
      // A broken listing page must not stop the remaining pages.
      env.error(e);
    }
  }
  return tag;
}

/**
 * Return the product code that follows the last '/detail/' in a link, or
 * null when the link does not point at a product detail page.
 */
function extractProductCode(href) {
  var marker = '/detail/';
  var pos = href.lastIndexOf(marker);
  return pos < 0 ? null : href.substring(pos + marker.length);
}

/**
 * Build a map of product code -> item from one listing page: the text row
 * supplies code, title and absolute url; the image row then adds the
 * thumbnail and, when the image has alt text, overrides the title with it.
 */
function collectListedProducts(doc, pageUrl, env) {
  var map = env.newHashMap();

  var elements = doc.select('#featuredProducts .textrow a');
  for (var i = 0; i < elements.size(); i++) {
    var element = elements.get(i);
    var href = element.attr('href');
    var code = extractProductCode(href);
    if (code === null) continue;
    var item = env.newHashMap();
    item.put('code', code);
    item.put('title', element.text());
    // Resolve against the listing page so relative links become absolute.
    item.put('url', env.newURL(pageUrl, href) + '');
    map.put(code, item);
  }

  elements = doc.select('#featuredProducts .imagerow a');
  for (var j = 0; j < elements.size(); j++) {
    var link = elements.get(j);
    var imageCode = extractProductCode(link.attr('href'));
    if (imageCode === null) continue;
    var listed = map.get(imageCode);
    if (listed == null) continue;
    var img = link.select('img').first();
    if (img == null) continue;
    var alt = img.attr('alt');
    // String(...) makes the length check work whether the engine hands back
    // a wrapped Java string or a native JavaScript string.
    if (alt != null && String(alt).length > 0) {
      listed.put('title', alt);
    }
    listed.put('small-image', img.attr('src'));
  }

  return map;
}

/**
 * Return the inner HTML of the first element matching `selector`, with
 * everything up to and including `heading` removed (when `heading` is given
 * and present) and with link/image URLs rewritten by buildURL. Returns null
 * when the element is missing or the result still contains an '<html' tag.
 */
function extractSectionHtml(doc, selector, heading, pageUrl, env) {
  var element = doc.select(selector).first();
  if (element == null) return null;
  var html = element.html();
  if (heading != null) {
    var pos = html.indexOf(heading);
    if (pos >= 0) {
      html = html.substring(pos + heading.length);
    }
  }
  // Re-parse the fragment on its own so its URLs can be rewritten in place.
  var fragment = env.newJsoup().parse(html, pageUrl);
  buildURL(fragment, pageUrl, env);
  html = fragment.select('body').first().html();
  return html.indexOf('<html') < 0 ? html : null;
}

/**
 * Fetch the detail page stored under the item's 'url' key and add the large
 * image, description, details, editorial reviews, prices and buy link that
 * are present on it. Throws whatever the fetch or parsing throws.
 */
function fillProductDetails(item, env) {
  var pageUrl = item.get('url') + '';
  var doc = env.newJsoup().parse(env.newURL(pageUrl), 60000);

  var element = doc.select('#detailImage img').first();
  if (element != null) {
    item.put('large-image', element.attr('src'));
  }

  var sections = [
    ['description', '#productDescription', '<h2>Product Description</h2>'],
    ['details', '#productDetails', '<h2>Product Details</h2>'],
    ['editorial-reviews', '#editorialReviews', null]
  ];
  for (var s = 0; s < sections.length; s++) {
    var html = extractSectionHtml(doc, sections[s][1], sections[s][2], pageUrl, env);
    if (html !== null) {
      item.put(sections[s][0], html);
    }
  }

  element = doc.select('#detailListPrice').first();
  if (element != null) {
    item.put('list-price', element.text());
  }
  element = doc.select('#detailOfferPrice').first();
  if (element != null) {
    item.put('offer-price', element.text());
  }
  element = doc.select('#addToCartForm a').first();
  if (element != null) {
    item.put('buy-url', element.attr('href'));
  }
}

/**
 * Rewrite the link and image URLs of a parsed document to absolute URLs,
 * resolving each one against `baseUrl`. The document is modified in place.
 *
 * Only anchors that actually carry an `href` and images that carry a `src`
 * are touched, so named anchors and source-less images are left alone. A URL
 * that cannot be resolved is reported through `env.error` and left unchanged;
 * the remaining elements are still processed.
 *
 * @param {Object} doc     Parsed document exposing `select(cssQuery)`.
 * @param {string} baseUrl Absolute URL the relative references are resolved
 *                         against.
 * @param {Object} env     Sandbox environment providing newURL and error.
 */
function buildURL(doc, baseUrl, env) {
  var base = env.newURL(baseUrl);
  absolutizeAttribute(doc.select('a[href]'), 'href', base, env);
  absolutizeAttribute(doc.select('img[src]'), 'src', base, env);
}

/**
 * Replace attribute `name` on every element of the list with its value
 * resolved against `base`.
 */
function absolutizeAttribute(elements, name, base, env) {
  for (var i = 0; i < elements.size(); i++) {
    var element = elements.get(i);
    try {
      element.attr(name, env.newURL(base, element.attr(name)) + '');
    } catch (e) {
      // One unresolvable URL must not stop the rest of the rewriting.
      env.error(e);
    }
  }
}
    

  Protected by Copyscape Online Copyright Protection

No comments:

Post a Comment