Grab products from Walmart
Grab products from Walmart
- Create a JavaScript sandbox with jsoup support.
- Create the JavaScript as follows:
javascript
/**
 * Sandbox entry point: scrapes one page of a fixed Walmart category listing
 * and appends every product record to the list stored under 'links' in args.
 *
 * @param {object} env  - sandbox host object, passed through to grabProduct.
 * @param {object} args - must answer get('links') with a list exposing add().
 */
function main(env, args) {
  var categoryUrl = 'http://www.walmart.com/browse/Vitamins/Herbals/_/N-8z7v?browsein=true&_refineresult=true&_refineresult=true?=32_0&ref=+418891&catNavId=1005863&povid=cat1005863-env250565-moduleA030812-lLinkLHN3Herbals';
  var pageCount = 1;

  // Look up the output list before scraping, as the original did.
  var output = args.get('links');
  var products = grabProduct(categoryUrl, pageCount, env);

  var index = 0;
  while (index < products.size()) {
    output.add(products.get(index));
    index += 1;
  }
}
10 | |
/**
 * Scrapes product records from a paginated category listing.
 *
 * For every page from 1 to maxpage the listing at catUrl + '&ic=32_<offset>'
 * is fetched and each '.item' element is turned into a map holding 'title'
 * and 'url', plus 'price' and 'small-image' when the listing provides them.
 * The product's own page is then fetched to add 'description',
 * 'specifications' and 'large-image' when present.
 *
 * Failures are reported through env.error and never abort the run: a listing
 * page that fails is skipped, and a product page that fails still yields the
 * item with whatever fields were collected up to that point. A product URL
 * that was already seen (on this or an earlier page) is skipped.
 *
 * @param {string} catUrl  - category URL; it must already carry a query
 *                           string because '&ic=...' is appended to it.
 * @param {number} maxpage - number of listing pages to walk (1-based).
 * @param {object} env     - sandbox host object providing newArrayList,
 *                           newHashMap, newURL, newJsoup and error.
 * @returns {object} the list from env.newArrayList() filled with the maps
 *                   from env.newHashMap(), in discovery order.
 */
function grabProduct(catUrl, maxpage, env) {
  var USER_AGENT = 'Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101';
  var TIMEOUT_MS = 60000;
  var PAGE_SIZE = 32;

  // Downloads `address` and returns the parsed document.
  function fetchDocument(address) {
    var body = env.newJsoup()
      .connect(address)
      .userAgent(USER_AGENT)
      .timeout(TIMEOUT_MS)
      .execute()
      .body();
    return env.newJsoup().parse(body);
  }

  // Copies the detail-page fields of the product at `productUrl` into `item`.
  function addDetails(item, productUrl) {
    var page = fetchDocument(productUrl);

    var description = page.select('.ItemSectionContent .BodyXL').first();
    if (description != null) {
      item.put('description', description.html());
    }

    var specTable = page.select('.ItemSectionContent .SpecTable').first();
    if (specTable != null) {
      item.put('specifications', specTable.parent().html());
    }

    var mainImage = page.select('#mainImage').first();
    if (mainImage != null) {
      // The src was read from the product page, so it has to be resolved
      // against that page and not against the listing page it was linked from.
      // Assumes env.newURL(string) / env.newURL(base, spec) behave like the
      // java.net.URL constructors; both forms are already used in this function.
      var base = env.newURL(productUrl);
      item.put('large-image', env.newURL(base, mainImage.attr('src')) + '');
    }
  }

  var products = env.newArrayList();
  var seenUrls = env.newArrayList();

  for (var pageNo = 1; pageNo <= maxpage; pageNo++) {
    try {
      var offset = (pageNo - 1) * PAGE_SIZE;
      var listingUrl = env.newURL(catUrl + '&ic=' + PAGE_SIZE + '_' + offset);
      var entries = fetchDocument(listingUrl + '').select('.item');

      for (var i = 0; i < entries.size(); i++) {
        var entry = entries.get(i);

        var anchor = entry.select('.ListItemLink').first();
        if (anchor == null) continue;

        var productUrl = env.newURL(listingUrl, anchor.attr('href')) + '';
        if (seenUrls.indexOf(productUrl) >= 0) continue;
        seenUrls.add(productUrl);

        var item = env.newHashMap();
        item.put('title', anchor.text());
        item.put('url', productUrl);

        var price = entry.select('.PriceDisplay').first();
        if (price != null) {
          item.put('price', price.text());
        }

        var thumbnail = entry.select('img.prodImg').first();
        if (thumbnail != null) {
          item.put('small-image', env.newURL(listingUrl, thumbnail.attr('src')) + '');
        }

        // Best effort: a broken product page must not drop the listing data.
        try {
          addDetails(item, productUrl);
        } catch (e) {
          env.error(e);
        }

        products.add(item);
      }
    } catch (e) {
      env.error(e);
    }
  }

  return products;
}
/**
 * Sandbox entry point: scrapes one page of a fixed Walmart category listing
 * and appends every product record to the list stored under 'links' in args.
 *
 * @param {object} env  - sandbox host object, passed through to grabProduct.
 * @param {object} args - must answer get('links') with a list exposing add().
 */
function main(env, args) {
  var catUrl = 'http://www.walmart.com/browse/Vitamins/Herbals/_/N-8z7v?browsein=true&_refineresult=true&_refineresult=true?=32_0&ref=+418891&catNavId=1005863&povid=cat1005863-env250565-moduleA030812-lLinkLHN3Herbals';
  var maxpage = 1;
  var links = args.get('links');
  var prods = grabProduct(catUrl, maxpage, env);
  for (var i = 0; i < prods.size(); i++) {
    links.add(prods.get(i));
  }
}

/**
 * Scrapes product records from a paginated category listing.
 *
 * For every page from 1 to maxpage the listing at catUrl + '&ic=32_<offset>'
 * is fetched and each '.item' element is turned into a map holding 'title'
 * and 'url', plus 'price' and 'small-image' when the listing provides them.
 * The product's own page is then fetched to add 'description',
 * 'specifications' and 'large-image' when present.
 *
 * Failures are reported through env.error and never abort the run: a listing
 * page that fails is skipped, and a product page that fails still yields the
 * item with whatever fields were collected up to that point. A product URL
 * that was already seen (on this or an earlier page) is skipped.
 *
 * @param {string} catUrl  - category URL; it must already carry a query
 *                           string because '&ic=...' is appended to it.
 * @param {number} maxpage - number of listing pages to walk (1-based).
 * @param {object} env     - sandbox host object providing newArrayList,
 *                           newHashMap, newURL, newJsoup and error.
 * @returns {object} the list from env.newArrayList() filled with the maps
 *                   from env.newHashMap(), in discovery order.
 */
function grabProduct(catUrl, maxpage, env) {
  var USER_AGENT = 'Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101';
  var TIMEOUT_MS = 60000;
  var PAGE_SIZE = 32;

  // Downloads `address` and returns the parsed document.
  function fetchDocument(address) {
    var body = env.newJsoup()
      .connect(address)
      .userAgent(USER_AGENT)
      .timeout(TIMEOUT_MS)
      .execute()
      .body();
    return env.newJsoup().parse(body);
  }

  // Copies the detail-page fields of the product at `productUrl` into `item`.
  function addDetails(item, productUrl) {
    var page = fetchDocument(productUrl);

    var description = page.select('.ItemSectionContent .BodyXL').first();
    if (description != null) {
      item.put('description', description.html());
    }

    var specTable = page.select('.ItemSectionContent .SpecTable').first();
    if (specTable != null) {
      item.put('specifications', specTable.parent().html());
    }

    var mainImage = page.select('#mainImage').first();
    if (mainImage != null) {
      // The src was read from the product page, so it has to be resolved
      // against that page and not against the listing page it was linked from.
      // Assumes env.newURL(string) / env.newURL(base, spec) behave like the
      // java.net.URL constructors; both forms are already used in this function.
      var base = env.newURL(productUrl);
      item.put('large-image', env.newURL(base, mainImage.attr('src')) + '');
    }
  }

  var products = env.newArrayList();
  var seenUrls = env.newArrayList();

  for (var pageNo = 1; pageNo <= maxpage; pageNo++) {
    try {
      var offset = (pageNo - 1) * PAGE_SIZE;
      var listingUrl = env.newURL(catUrl + '&ic=' + PAGE_SIZE + '_' + offset);
      var entries = fetchDocument(listingUrl + '').select('.item');

      for (var i = 0; i < entries.size(); i++) {
        var entry = entries.get(i);

        var anchor = entry.select('.ListItemLink').first();
        if (anchor == null) continue;

        var productUrl = env.newURL(listingUrl, anchor.attr('href')) + '';
        if (seenUrls.indexOf(productUrl) >= 0) continue;
        seenUrls.add(productUrl);

        var item = env.newHashMap();
        item.put('title', anchor.text());
        item.put('url', productUrl);

        var price = entry.select('.PriceDisplay').first();
        if (price != null) {
          item.put('price', price.text());
        }

        var thumbnail = entry.select('img.prodImg').first();
        if (thumbnail != null) {
          item.put('small-image', env.newURL(listingUrl, thumbnail.attr('src')) + '');
        }

        // Best effort: a broken product page must not drop the listing data.
        try {
          addDetails(item, productUrl);
        } catch (e) {
          env.error(e);
        }

        products.add(item);
      }
    } catch (e) {
      env.error(e);
    }
  }

  return products;
}
No comments:
Post a Comment