Grab products from Amazon aStores
This task uses a JavaScript sandbox with jsoup support to grab products from Amazon aStores.
Grab products from Amazon aStores
- Create a JavaScript sandbox with jsoup support
- Create the JavaScript as follows:
javascript
/**
 * Sandbox entry point: grabs the products of one aStore category and appends
 * them to the list stored under the 'links' key of `args`.
 *
 * Any failure is reported through env.error() instead of being rethrown.
 *
 * @param env  sandbox helper object (factories for URLs, lists, maps, jsoup; error reporting)
 * @param args task arguments; args.get('links') must return a list with an add() method
 */
function main(env, args) {
  // Settings to edit: the store id, the category node and how many listing pages to read.
  var astore = 'paesia';
  var node = '100';
  var maxpage = 2;
  try {
    var products = grabProduct(astore, node, maxpage, env);
    var links = args.get('links');
    for (var i = 0; i < products.size(); i++) {
      links.add(products.get(i));
    }
  } catch (e) {
    env.error(e);
  }
}

/**
 * Reads listing pages 1..maxpage of an aStore category and returns one map per
 * product found. Each map holds 'code', 'title' and 'url', plus whichever of
 * 'small-image', 'large-image', 'description', 'details', 'editorial-reviews',
 * 'list-price', 'offer-price' and 'buy-url' could be found.
 *
 * A failure on one listing page or one product page is reported through
 * env.error() and does not stop the remaining pages/products.
 *
 * @param astore  store id; '-20' is appended to it to build the listing URL
 * @param node    category node id used in the listing URL query string
 * @param maxpage number of the last listing page to read (pages start at 1)
 * @param env     sandbox helper object
 * @returns the list created by env.newArrayList(), filled with product maps
 */
function grabProduct(astore, node, maxpage, env) {
  // Second argument of parse(url, ...) on every page fetch.
  // NOTE(review): presumably a timeout in milliseconds, as in jsoup's parse(URL, int) - confirm.
  var fetchTimeout = 60000;
  var tag = env.newArrayList();
  for (var no = 1; no <= maxpage; no++) {
    try {
      var listingUrl = env.newURL('http://astore.amazon.com/' + astore + '-20?node=' + node + '&page=' + no);
      var listing = env.newJsoup().parse(listingUrl, fetchTimeout);
      var map = collectListedProducts(listing, listingUrl, env);
      attachSmallImages(listing, map);

      var keys = env.getKeys(map);
      var i;
      for (i = 0; i < keys.size(); i++) {
        // One broken product page must not discard the other products of this listing.
        try {
          loadProductPage(map.get(keys.get(i)), fetchTimeout, env);
        } catch (e) {
          env.error(e);
        }
      }
      for (i = 0; i < keys.size(); i++) {
        tag.add(map.get(keys.get(i)));
      }
    } catch (e) {
      env.error(e);
    }
  }
  return tag;
}

/**
 * Returns the part of a product link that follows the last '/detail/', or
 * null when the link contains no '/detail/' segment.
 */
function productCodeOf(href) {
  var marker = '/detail/';
  var pos = href.lastIndexOf(marker);
  if (pos < 0) {
    return null;
  }
  return href.substring(pos + marker.length);
}

/**
 * Builds a code -> product map from the text links of a listing page. Each
 * product starts with its 'code', 'title' (link text) and absolute 'url'.
 */
function collectListedProducts(listing, listingUrl, env) {
  var map = env.newHashMap();
  var anchors = listing.select('#featuredProducts .textrow a');
  for (var i = 0; i < anchors.size(); i++) {
    var anchor = anchors.get(i);
    var href = anchor.attr('href');
    var code = productCodeOf(href);
    if (code == null) {
      continue;
    }
    var item = env.newHashMap();
    item.put('code', code);
    item.put('title', anchor.text());
    // "+ ''" turns the URL object into its string form before storing it.
    item.put('url', env.newURL(listingUrl, href) + '');
    map.put(code, item);
  }
  return map;
}

/**
 * Adds 'small-image' to the products already in `map`, using the image links
 * of the listing page. A non-empty image alt text replaces the product title.
 */
function attachSmallImages(listing, map) {
  var anchors = listing.select('#featuredProducts .imagerow a');
  for (var i = 0; i < anchors.size(); i++) {
    var anchor = anchors.get(i);
    var code = productCodeOf(anchor.attr('href'));
    if (code == null) {
      continue;
    }
    var item = map.get(code);
    if (item == null) {
      continue;
    }
    var img = anchor.select('img').first();
    if (img == null) {
      continue;
    }
    var title = img.attr('alt');
    // length() is called as a method, exactly as in the original script.
    // NOTE(review): that implies attr() hands back a Java string here, not a JS string - confirm.
    if (title.length() > 0) {
      item.put('title', title);
    }
    item.put('small-image', img.attr('src'));
  }
}

/**
 * Fetches the product page stored under item 'url' and copies the fields it
 * finds into `item`. Errors propagate to the caller.
 */
function loadProductPage(item, fetchTimeout, env) {
  var pageUrl = item.get('url') + '';
  var page = env.newJsoup().parse(env.newURL(pageUrl), fetchTimeout);

  putAttr(item, 'large-image', page, '#detailImage img', 'src');
  putSection(item, 'description', page, '#productDescription', '<h2>Product Description</h2>', pageUrl, env);
  putSection(item, 'details', page, '#productDetails', '<h2>Product Details</h2>', pageUrl, env);
  putSection(item, 'editorial-reviews', page, '#editorialReviews', null, pageUrl, env);
  putText(item, 'list-price', page, '#detailListPrice');
  putText(item, 'offer-price', page, '#detailOfferPrice');
  putAttr(item, 'buy-url', page, '#addToCartForm a', 'href');
}

/** Stores the text of the first element matching `selector` under `key`, if there is one. */
function putText(item, key, page, selector) {
  var element = page.select(selector).first();
  if (element != null) {
    item.put(key, element.text());
  }
}

/** Stores attribute `name` of the first element matching `selector` under `key`, if there is one. */
function putAttr(item, key, page, selector, name) {
  var element = page.select(selector).first();
  if (element != null) {
    item.put(key, element.attr(name));
  }
}

/**
 * Stores the inner HTML of the first element matching `selector` under `key`,
 * with everything up to and including `heading` removed (when `heading` is
 * given and present) and with link/image URLs made absolute.
 * Nothing is stored when the element is missing or the result contains '<html'.
 */
function putSection(item, key, page, selector, heading, pageUrl, env) {
  var element = page.select(selector).first();
  if (element == null) {
    return;
  }
  var html = element.html();
  if (heading != null) {
    var pos = html.indexOf(heading);
    if (pos >= 0) {
      html = html.substring(pos + heading.length);
    }
  }
  // Re-parse the fragment on its own so that only its links are rewritten.
  var fragment = env.newJsoup().parse(html, pageUrl);
  buildURL(fragment, pageUrl, env);
  html = fragment.select('body').first().html();
  if (html.indexOf('<html') < 0) {
    item.put(key, html);
  }
}

/**
 * Rewrites the href of every <a> and the src of every <img> in `doc` to an
 * absolute URL resolved against `baseUrl`.
 *
 * Elements without the attribute are left untouched, and an attribute value
 * that cannot be resolved is reported through env.error() and left as it was.
 *
 * @param doc     parsed document (or fragment) to modify in place
 * @param baseUrl URL, as a string, that relative references are resolved against
 * @param env     sandbox helper object
 */
function buildURL(doc, baseUrl, env) {
  var base = env.newURL(baseUrl);
  absolutizeAttr(doc.select('a'), 'href', base, env);
  absolutizeAttr(doc.select('img'), 'src', base, env);
}

/** Resolves attribute `name` of each element against `base`, one element at a time. */
function absolutizeAttr(elements, name, base, env) {
  for (var i = 0; i < elements.size(); i++) {
    var element = elements.get(i);
    var raw = element.attr(name) + '';
    // An empty value means the attribute is absent (or blank). Resolving it would
    // attach the page URL itself, e.g. turning <a name="top"> into a link.
    if (raw === '') {
      continue;
    }
    try {
      element.attr(name, env.newURL(base, raw) + '');
    } catch (e) {
      // A single unresolvable reference must not abort the whole product.
      env.error(e);
    }
  }
}
No comments:
Post a Comment