Tuesday 22 May 2012

Grab products from HP Shopping

Grab products from HP Shopping
This task uses a JavaScript sandbox with jsoup support to grab products from HP Shopping.
Grab products from HP Shopping
  1. Create a JavaScript sandbox with jsoup support
  2. Create the following JavaScript
javascript
/**
 * Sandbox entry point: grabs the products of one fixed HP Shopping category
 * and appends each of them to the caller-supplied 'links' collection.
 *
 * @param env  sandbox helper object, passed through to grabProduct
 * @param args argument map; args.get('links') must return a collection with add()
 */
function main(env, args) {
  var categoryUrl = 'http://shopping.hp.com/en_US/home-office/-/products/Laptops/HP%20Pavilion';
  var sink = args.get('links');
  var found = grabProduct(categoryUrl, env);
  var idx = 0;
  while (idx < found.size()) {
    sink.add(found.get(idx));
    idx += 1;
  }
}

/**
 * Scrapes the product listing of a category page and of the pages linked
 * from its pagination bar.
 *
 * The category page is fetched once to collect pagination links, then every
 * collected page (the category page itself first) is fetched and each
 * '.listing-page-bucket' element is turned into a map with the keys
 * 'image-list', 'image', 'title', 'url', 'desc' and 'price'.
 *
 * Failures are reported through env.error and never rethrown: a failure while
 * reading the pagination bar still lets the category page itself be scraped,
 * and a failure on one page does not prevent the following pages from being
 * processed.
 *
 * @param catUrl URL of the category page
 * @param env    sandbox helper; newArrayList(), newHashMap(), newJsoup() and
 *               error() are used here
 * @returns the list created by env.newArrayList(), holding one map per product
 */
function grabProduct(catUrl, env) {
  // ES5 'var' is kept on purpose: the script targets the sandbox's JS engine.
  var USER_AGENT = 'Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101';
  var TIMEOUT_MS = 60000;

  var products = env.newArrayList();
  var pageUrls = env.newArrayList();
  pageUrls.add(catUrl);

  // Pass 1: collect the pagination links of the category page.
  try {
    var catDoc = env.newJsoup().connect(catUrl).userAgent(USER_AGENT).timeout(TIMEOUT_MS).get();
    var pager = catDoc.select('.pagination-results-container').first();
    if (pager != null) {
      var anchors = pager.select('a');
      // The last anchor is skipped, exactly as before.
      // NOTE(review): presumably it is a "next page" control - confirm against the site markup.
      for (var a = 0; a < anchors.size() - 1; a++) {
        var anchor = anchors.get(a);
        if (anchor.hasClass('option') || anchor.hasClass('pngFix')) continue;
        pageUrls.add(anchor.attr('href'));
      }
    }
  } catch (e) {
    env.error(e);
  }

  // Pass 2: scrape every listing page.
  for (var p = 0; p < pageUrls.size(); p++) {
    try {
      var pageUrl = pageUrls.get(p);
      var pageDoc = env.newJsoup().connect(pageUrl).userAgent(USER_AGENT).timeout(TIMEOUT_MS).get();
      var buckets = pageDoc.select('.listing-page-bucket');
      for (var b = 0; b < buckets.size(); b++) {
        var bucket = buckets.get(b);

        // 'image-list' is every swatch src joined by newlines; 'image' is the
        // src of the last swatch whose style is not exactly 'display:none'.
        var swatches = bucket.select('.color-selector-img img.pngFix');
        var imageList = '';
        var image = '';
        for (var s = 0; s < swatches.size(); s++) {
          var swatch = swatches.get(s);
          if (imageList.length > 0) imageList += '\n';
          imageList += swatch.attr('src') + '';
          if (swatch.attr('style') + '' != 'display:none') {
            image = swatch.attr('src');
          }
        }

        // A bucket without a title link is not treated as a product.
        var titleLink = bucket.select('.product-specs h3 a').first();
        if (titleLink == null) continue;
        var title = titleLink.text();
        var url = titleLink.attr('href');

        // The description is the second element sibling after the rating node.
        // Each step is null-checked: an unguarded chain would throw on a
        // missing sibling and the page-level catch would then discard this
        // product and every remaining product of the page.
        var desc = '';
        var rating = bucket.select('.product-specs .rating').first();
        if (rating != null) {
          var descNode = rating.nextElementSibling();
          if (descNode != null) descNode = descNode.nextElementSibling();
          if (descNode != null) desc = descNode.text();
        }

        // Prefer '#start-price'; fall back to '.price-value'.
        var price = '';
        var priceNode = bucket.select('#start-price').first();
        if (priceNode == null) priceNode = bucket.select('.price-value').first();
        if (priceNode != null) price = priceNode.text();

        var item = env.newHashMap();
        item.put('image-list', imageList);
        item.put('image', image);
        item.put('title', title);
        item.put('url', url);
        item.put('desc', desc);
        item.put('price', price);
        products.add(item);
      }
    } catch (e) {
      env.error(e);
    }
  }
  return products;
}
    

  Protected by Copyscape Online Copyright Protection

Grab categories from HP Shopping

Grab categories from HP Shopping
This task uses a JavaScript sandbox with jsoup support to grab categories from HP Shopping.
Grab categories from HP Shopping
  1. Create a JavaScript sandbox with jsoup support
  2. Create the following JavaScript
javascript
/**
 * Sandbox entry point: grabs the HP Shopping categories and appends each of
 * them to the caller-supplied 'links' collection.
 *
 * @param env  sandbox helper object, passed through to grabCategory
 * @param args argument map; args.get('links') must return a collection with add()
 */
function main(env, args) {
  var sink = args.get('links');
  var found = grabCategory(env);
  var idx = 0;
  while (idx < found.size()) {
    sink.add(found.get(idx));
    idx += 1;
  }
}

/**
 * Extracts the category pages advertised by the HP Shopping home page.
 *
 * The raw response body is searched for the script text that follows
 * 'var surveyInitData =' and precedes "$('body').hpOnSiteExit({". That text is
 * evaluated, and every entry of surveyData[0].configPages whose pageType is
 * 'category' becomes a map with the keys 'title' and 'url'.
 *
 * A missing marker is reported through env.info and yields an empty list;
 * any exception is reported through env.error and yields whatever was
 * collected up to that point.
 *
 * @param env sandbox helper; newArrayList(), newHashMap(), newURL(),
 *            newJsoup(), info() and error() are used here
 * @returns the list created by env.newArrayList(), one map per category
 */
function grabCategory(env) {
  var categories = env.newArrayList();
  var START_MARKER = 'var surveyInitData =';
  var END_MARKER = "$('body').hpOnSiteExit({";
  try {
    var homeUrl = env.newURL('http://shopping.hp.com');
    var request = env.newJsoup()
      .connect(homeUrl)
      .userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101');
    var body = request.timeout(60000).execute().body();

    var startAt = body.indexOf(START_MARKER);
    if (startAt < 0) {
      env.info('S1: javascript code not found');
      return categories;
    }
    var endAt = body.indexOf(END_MARKER, startAt);
    if (endAt < 0) {
      env.info('S2: javascript code not found');
      return categories;
    }

    var js = body.substring(startAt + START_MARKER.length, endAt);
    // NOTE(review): this evaluates text downloaded from a remote server, so the
    // remote side can run arbitrary code inside the sandbox. It is kept because
    // the payload may be a JavaScript literal rather than strict JSON - confirm
    // the format before switching to a real parser. The local must stay named
    // 'obj' because the evaluated string assigns to it.
    var obj = null;
    eval('obj = ' + js);

    var pages = obj.surveyData[0].configPages;
    var idx = 0;
    while (idx < pages.length) {
      var page = pages[idx];
      idx += 1;
      if (page.pageType == 'category') {
        var entry = env.newHashMap();
        entry.put('title', page.pageName);
        entry.put('url', page.fullpath);
        categories.add(entry);
      }
    }
  } catch (err) {
    env.error(err);
  }
  return categories;
}
    

  Protected by Copyscape Online Copyright Protection