Grab products from Walmart
Grab products from Walmart
- Create a JavaScript sandbox with jsoup support.
- Create a script like the following:
javascript
/**
 * Sandbox entry point: scrapes a hard-coded Walmart category listing and
 * appends every product found to the list stored under `links` in `args`.
 *
 * NOTE(review): written in ES5 style because the script appears to run in a
 * Java-hosted sandbox (Java-style collections and jsoup are reached through
 * `env`) - confirm the engine before modernizing the syntax.
 *
 * @param {object} env  Sandbox helper; this script uses `newArrayList`,
 *                      `newHashMap`, `newURL`, `newJsoup` and `error`.
 * @param {object} args Argument map; `args.get('links')` must return a list
 *                      exposing `add`.
 */
function main(env, args) {
  // NOTE(review): the query string contains a stray '?=32_0' segment. It is
  // kept byte-for-byte as found - confirm whether it is intentional.
  var catUrl = 'http://www.walmart.com/browse/Vitamins/Herbals/_/N-8z7v?browsein=true&_refineresult=true&_refineresult=true?=32_0&ref=+418891&catNavId=1005863&povid=cat1005863-env250565-moduleA030812-lLinkLHN3Herbals';
  var maxpage = 1;
  var links = args.get('links');
  var prods = grabProduct(catUrl, maxpage, env);
  for (var i = 0; i < prods.size(); i++) {
    links.add(prods.get(i));
  }
}

/**
 * Downloads `url` through jsoup and parses the response body.
 *
 * @param {object} env Sandbox helper providing `newJsoup`.
 * @param {string} url Absolute URL to download.
 * @returns {object} Parsed document as returned by `parse`.
 */
function fetchDocument(env, url) {
  var response = env.newJsoup()
    .connect(url)
    .userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101')
    .timeout(60000)
    .execute();
  return env.newJsoup().parse(response.body());
}

/**
 * Fetches a product page and copies its description, specifications and
 * large image URL into `item`. Fields already written stay in `item` if a
 * later step throws.
 *
 * @param {object} env        Sandbox helper providing `newJsoup` and `newURL`.
 * @param {object} item       Map receiving the extra fields via `put`.
 * @param {string} productUrl Absolute URL of the product page.
 */
function addProductDetails(env, item, productUrl) {
  var page = fetchDocument(env, productUrl);

  var description = page.select('.ItemSectionContent .BodyXL').first();
  if (description != null) {
    item.put('description', description.html());
  }

  var specTable = page.select('.ItemSectionContent .SpecTable').first();
  if (specTable != null) {
    // The HTML of the table's parent element is stored, not the table alone.
    item.put('specifications', specTable.parent().html());
  }

  var mainImage = page.select('#mainImage').first();
  if (mainImage != null) {
    // Resolve against the product page the tag came from, so a relative
    // `src` is not mistakenly resolved against the category listing URL.
    var base = env.newURL(productUrl);
    item.put('large-image', env.newURL(base, mainImage.attr('src')) + '');
  }
}

/**
 * Walks listing pages 1..`maxpage` of a category and collects one map per
 * distinct product URL. Each map has `title` and `url`, plus `price`,
 * `small-image`, `description`, `specifications` and `large-image` when the
 * matching element is found.
 *
 * Failures are reported through `env.error` and never abort the run: a
 * failing listing page is skipped, and a failing product page still yields
 * an item holding whatever was collected before the failure.
 *
 * @param {string} catUrl  Category URL; `&ic=32_<offset>` is appended per page.
 * @param {number} maxpage Number of listing pages to visit (1-based, inclusive).
 * @param {object} env     Sandbox helper (see `main`).
 * @returns {object} List created by `env.newArrayList()` holding the item maps.
 */
function grabProduct(catUrl, maxpage, env) {
  var products = env.newArrayList();
  var seenUrls = env.newArrayList();

  for (var pageNo = 1; pageNo <= maxpage; pageNo++) {
    try {
      // Offsets advance in steps of 32 (presumably the listing page size).
      var offset = (pageNo - 1) * 32;
      var pageUrl = env.newURL(catUrl + '&ic=32_' + offset);
      var listing = fetchDocument(env, pageUrl + '');
      var tiles = listing.select('.item');

      for (var i = 0; i < tiles.size(); i++) {
        var tile = tiles.get(i);

        var anchor = tile.select('.ListItemLink').first();
        if (anchor == null) {
          continue;
        }

        var title = anchor.text();
        var productUrl = env.newURL(pageUrl, anchor.attr('href')) + '';
        if (seenUrls.indexOf(productUrl) >= 0) {
          continue; // product already collected
        }
        seenUrls.add(productUrl);

        var item = env.newHashMap();
        item.put('title', title);
        item.put('url', productUrl);

        var price = tile.select('.PriceDisplay').first();
        if (price != null) {
          item.put('price', price.text());
        }

        var thumbnail = tile.select('img.prodImg').first();
        if (thumbnail != null) {
          item.put('small-image', env.newURL(pageUrl, thumbnail.attr('src')) + '');
        }

        try {
          addProductDetails(env, item, productUrl);
        } catch (e) {
          // Best effort: keep the listing-level data even without details.
          env.error(e);
        }

        products.add(item);
      }
    } catch (e) {
      // A failing listing page must not stop the remaining pages.
      env.error(e);
    }
  }

  return products;
}
No comments:
Post a Comment