Grab products from Walmart
Grab products from Walmart
- Create a JavaScript sandbox with jsoup support.
- Create the following JavaScript:
javascript
/**
 * Sandbox entry point: scrapes the hard-coded category listing and appends
 * every product record to the caller-supplied `links` collection.
 *
 * @param {Object} env  Sandbox helper object; passed through to grabProduct.
 * @param {Object} args Argument container; `args.get('links')` must return a
 *                      collection exposing `add(item)`.
 */
function main(env, args) {
  var PAGES_TO_SCAN = 1;
  var CATEGORY_URL = 'http://www.walmart.com/browse/Vitamins/Herbals/_/N-8z7v?browsein=true&_refineresult=true&_refineresult=true?=32_0&ref=+418891&catNavId=1005863&povid=cat1005863-env250565-moduleA030812-lLinkLHN3Herbals';

  // Look up the output collection before scraping starts.
  var output = args.get('links');
  var found = grabProduct(CATEGORY_URL, PAGES_TO_SCAN, env);

  var idx = 0;
  while (idx < found.size()) {
    output.add(found.get(idx));
    idx += 1;
  }
}
/**
 * Scrapes product records from a paginated category listing.
 *
 * For each page 1..maxpage the listing is fetched, every `.item` element that
 * contains a `.ListItemLink` becomes one record, and the linked product page is
 * fetched to add detail fields. A product URL that has already been collected
 * (on this or an earlier page) is skipped.
 *
 * Record keys: 'title', 'url', and — when the matching element exists —
 * 'price', 'small-image', 'description', 'specifications', 'large-image'.
 *
 * Errors are reported through `env.error` and never thrown: a failing listing
 * page abandons the rest of that page only, and a failing product page still
 * yields the record with the listing-level fields.
 *
 * @param {string} catUrl  Category URL; the paging parameter is appended with
 *                         '&', so it is expected to already contain a query.
 * @param {number} maxpage Number of listing pages to scan (1-based, inclusive).
 * @param {Object} env     Sandbox helper providing newArrayList(), newHashMap(),
 *                         newURL(), newJsoup() and error().
 * @returns {Object} The list created by env.newArrayList(), holding one map
 *                   (from env.newHashMap()) per product.
 */
function grabProduct(catUrl, maxpage, env) {
  // Offset step between pages; also the first number of the 'ic' parameter
  // (presumably "items per page" — confirm against the site).
  var PAGE_SIZE = 32;
  var products = env.newArrayList();
  // Product URLs already collected, used as a set for constant-time lookups.
  var seen = {};

  for (var no = 1; no <= maxpage; no++) {
    try {
      var start = (no - 1) * PAGE_SIZE;
      var link = env.newURL(catUrl + '&ic=' + PAGE_SIZE + '_' + start);
      var doc = fetchDocument(env, link + '');
      var elements = doc.select('.item');

      for (var i = 0; i < elements.size(); i++) {
        var element = elements.get(i);
        var anchor = element.select('.ListItemLink').first();
        if (anchor == null) continue;

        // Resolve the href against the listing page it was found on.
        var url = env.newURL(link, anchor.attr('href')) + '';
        if (Object.prototype.hasOwnProperty.call(seen, url)) continue;
        seen[url] = true;

        var item = env.newHashMap();
        item.put('title', anchor.text());
        item.put('url', url);

        var price = element.select('.PriceDisplay').first();
        if (price != null) {
          item.put('price', price.text());
        }

        var thumb = element.select('img.prodImg').first();
        if (thumb != null) {
          item.put('small-image', env.newURL(link, thumb.attr('src')) + '');
        }

        // Detail fields are best-effort: keep the record even if the product
        // page cannot be fetched or parsed.
        try {
          addProductDetails(env, url, item);
        } catch (e) {
          env.error(e);
        }
        products.add(item);
      }
    } catch (e) {
      env.error(e);
    }
  }
  return products;
}

/**
 * Downloads `url` and returns the document parsed from the response body.
 * Uses the fixed user-agent string and the 60000 timeout value shared by all
 * requests in this script.
 */
function fetchDocument(env, url) {
  var html = env.newJsoup()
    .connect(url)
    .userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101')
    .timeout(60000)
    .execute()
    .body();
  return env.newJsoup().parse(html);
}

/**
 * Fetches the product page at `productUrl` and copies its description,
 * specifications and main image URL into `item` when they are present.
 */
function addProductDetails(env, productUrl, item) {
  var page = fetchDocument(env, productUrl);

  var description = page.select('.ItemSectionContent .BodyXL').first();
  if (description != null) {
    item.put('description', description.html());
  }

  var specs = page.select('.ItemSectionContent .SpecTable').first();
  if (specs != null) {
    // Store the markup of the element that contains the spec table.
    item.put('specifications', specs.parent().html());
  }

  var mainImage = page.select('#mainImage').first();
  if (mainImage != null) {
    // The src comes from the product page, so it must be resolved against the
    // product URL rather than the category listing URL.
    item.put('large-image', env.newURL(env.newURL(productUrl), mainImage.attr('src')) + '');
  }
}
No comments:
Post a Comment