Grab products from HP Shopping
This task uses a JavaScript sandbox with jsoup support to grab products from HP Shopping.
Grab products from HP Shopping
- Create a JavaScript sandbox with jsoup support
- Create the following JavaScript:
javascript
/**
 * Sandbox entry point: grabs the products listed under the HP Pavilion laptop
 * category and appends each one to the `links` collection supplied in `args`.
 *
 * @param {Object} env  Sandbox environment; handed unchanged to grabProduct.
 * @param {Object} args Argument container; `args.get('links')` must return a
 *                      collection exposing `add(item)`.
 * @throws {TypeError} When `args.get('links')` yields null/undefined.
 */
function main(env, args) {
  var catUrl = 'http://shopping.hp.com/en_US/home-office/-/products/Laptops/HP%20Pavilion';
  var links = args.get('links');
  // Check the output collection before crawling: otherwise every page is
  // downloaded first and the call only fails on the first `links.add`.
  if (links == null) {
    throw new TypeError("main: args.get('links') returned no collection to fill");
  }
  var prods = grabProduct(catUrl, env);
  for (var i = 0; i < prods.size(); i++) {
    links.add(prods.get(i));
  }
}
9 | |
/**
 * Downloads one page through the client returned by `env.newJsoup()`, using
 * the fixed user agent and 60000 timeout value this script applies everywhere.
 *
 * @param {*} url  Address handed to `connect`.
 * @param {Object} env  Sandbox environment exposing `newJsoup()`.
 * @returns {Object} The document returned by `get()`.
 */
function fetchPage(url, env) {
  return env.newJsoup()
    .connect(url)
    .userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101')
    .timeout(60000)
    .get();
}

/**
 * Builds the list of listing pages to visit: the category page itself plus
 * the distinct, non-empty links found in its first pagination container.
 *
 * @param {*} catUrl  Category page address; always the first entry.
 * @param {Object} env  Sandbox environment (`newArrayList`, `newJsoup`, `error`).
 * @returns {Object} List created by `env.newArrayList()` holding the page URLs.
 */
function listPageUrls(catUrl, env) {
  var urls = env.newArrayList();
  urls.add(catUrl);
  // Remember queued addresses so no page is fetched (and its products
  // emitted) more than once.
  var seen = {};
  seen[String(catUrl)] = true;
  try {
    var pager = fetchPage(catUrl, env).select('.pagination-results-container').first();
    if (pager != null) {
      var anchors = pager.select('a');
      // The final anchor is left out on purpose, as in the original loop bound
      // (presumably a "next page" control - TODO confirm against the live markup).
      for (var i = 0; i < anchors.size() - 1; i++) {
        var anchor = anchors.get(i);
        if (anchor.hasClass('option') || anchor.hasClass('pngFix')) continue;
        var href = anchor.attr('href');
        var key = String(href);
        if (key.length === 0 || Object.prototype.hasOwnProperty.call(seen, key)) continue;
        seen[key] = true;
        urls.add(href);
      }
    }
  } catch (e) {
    // Pagination is best effort: the category page is still processed.
    env.error(e);
  }
  return urls;
}

/**
 * Extracts one product record from a `.listing-page-bucket` element.
 *
 * @param {Object} bucket  Element wrapping a single product.
 * @param {Object} env  Sandbox environment exposing `newHashMap()`.
 * @returns {Object|null} Map with the keys `image-list`, `image`, `title`,
 *          `url`, `desc` and `price`, or null when the bucket has no title link.
 */
function readProduct(bucket, env) {
  var swatches = bucket.select('.color-selector-img img.pngFix');
  var imageList = '';
  var image = '';
  for (var k = 0; k < swatches.size(); k++) {
    var swatch = swatches.get(k);
    if (imageList.length > 0) imageList += '\n';
    imageList += swatch.attr('src') + '';
    // The last swatch whose style is not exactly 'display:none' wins.
    if (String(swatch.attr('style')) !== 'display:none') {
      image = swatch.attr('src');
    }
  }

  var titleLink = bucket.select('.product-specs h3 a').first();
  if (titleLink == null) return null;

  // The description is read from the second element after the rating node.
  // Guard both hops: a missing sibling used to throw and, through the
  // page-level catch, discard every remaining product of that page.
  var desc = '';
  var rating = bucket.select('.product-specs .rating').first();
  if (rating != null) {
    var afterRating = rating.nextElementSibling();
    var descNode = afterRating != null ? afterRating.nextElementSibling() : null;
    if (descNode != null) desc = descNode.text();
  }

  // Prefer '#start-price'; fall back to '.price-value'.
  var priceNode = bucket.select('#start-price').first();
  if (priceNode == null) priceNode = bucket.select('.price-value').first();
  var price = priceNode != null ? priceNode.text() : '';

  var it = env.newHashMap();
  it.put('image-list', imageList);
  it.put('image', image);
  it.put('title', titleLink.text());
  it.put('url', titleLink.attr('href'));
  it.put('desc', desc);
  it.put('price', price);
  return it;
}

/**
 * Grabs every product of a category: visits the category page and the pages
 * linked from its pagination container, and collects one record per
 * `.listing-page-bucket` element that has a title link.
 *
 * Failures are reported through `env.error` and never thrown: a page that
 * cannot be fetched or parsed is skipped and the crawl continues.
 * `var` is kept throughout because the sandbox engine's language level is not
 * visible from this script.
 *
 * @param {*} catUrl  Address of the category listing page.
 * @param {Object} env  Sandbox environment providing `newArrayList`,
 *                      `newHashMap`, `newJsoup` and `error`.
 * @returns {Object} List created by `env.newArrayList()` with one map per product.
 */
function grabProduct(catUrl, env) {
  var tag = env.newArrayList();
  var urls = listPageUrls(catUrl, env);
  for (var p = 0; p < urls.size(); p++) {
    try {
      var buckets = fetchPage(urls.get(p), env).select('.listing-page-bucket');
      for (var b = 0; b < buckets.size(); b++) {
        var product = readProduct(buckets.get(b), env);
        if (product != null) tag.add(product);
      }
    } catch (e) {
      env.error(e);
    }
  }
  return tag;
}
/**
 * Sandbox entry point: grabs the products listed under the HP Pavilion laptop
 * category and appends each one to the `links` collection supplied in `args`.
 *
 * @param {Object} env  Sandbox environment; handed unchanged to grabProduct.
 * @param {Object} args Argument container; `args.get('links')` must return a
 *                      collection exposing `add(item)`.
 * @throws {TypeError} When `args.get('links')` yields null/undefined.
 */
function main(env, args) {
  var catUrl = 'http://shopping.hp.com/en_US/home-office/-/products/Laptops/HP%20Pavilion';
  var links = args.get('links');
  // Validate the output collection first so a misconfigured call fails
  // before any page is downloaded rather than after the whole crawl.
  if (links == null) {
    throw new TypeError("main: args.get('links') returned no collection to fill");
  }
  var prods = grabProduct(catUrl, env);
  for (var i = 0; i < prods.size(); i++) {
    links.add(prods.get(i));
  }
}

/**
 * Fetches a page with the user agent and 60000 timeout value used by this
 * script, via the client returned by `env.newJsoup()`.
 *
 * @param {*} url  Address handed to `connect`.
 * @param {Object} env  Sandbox environment exposing `newJsoup()`.
 * @returns {Object} The document returned by `get()`.
 */
function fetchPage(url, env) {
  var conn = env.newJsoup().connect(url);
  conn = conn.userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101');
  return conn.timeout(60000).get();
}

/**
 * Returns the category page followed by the distinct, non-empty page links
 * found in its first `.pagination-results-container`.
 *
 * @param {*} catUrl  Category page address; always the first entry.
 * @param {Object} env  Sandbox environment (`newArrayList`, `newJsoup`, `error`).
 * @returns {Object} List created by `env.newArrayList()`.
 */
function listPageUrls(catUrl, env) {
  var urls = env.newArrayList();
  var queued = {};
  urls.add(catUrl);
  queued[String(catUrl)] = true;
  try {
    var root = fetchPage(catUrl, env).select('.pagination-results-container').first();
    if (root != null) {
      var anchors = root.select('a');
      // Stops one short of the end, exactly like the original loop
      // (presumably skipping a "next page" control - TODO confirm).
      var limit = anchors.size() - 1;
      for (var i = 0; i < limit; i++) {
        var anchor = anchors.get(i);
        if (anchor.hasClass('option')) continue;
        if (anchor.hasClass('pngFix')) continue;
        var href = anchor.attr('href');
        var key = String(href);
        // Empty or repeated links would only cause failed or duplicate fetches.
        if (key.length === 0) continue;
        if (Object.prototype.hasOwnProperty.call(queued, key)) continue;
        queued[key] = true;
        urls.add(href);
      }
    }
  } catch (e) {
    // Best effort: without pagination the category page is still crawled.
    env.error(e);
  }
  return urls;
}

/**
 * Turns one `.listing-page-bucket` element into a product map.
 *
 * @param {Object} bucket  Element wrapping a single product.
 * @param {Object} env  Sandbox environment exposing `newHashMap()`.
 * @returns {Object|null} Map keyed `image-list`, `image`, `title`, `url`,
 *          `desc`, `price`; null when the bucket has no title link.
 */
function readProduct(bucket, env) {
  var imgs = bucket.select('.color-selector-img img.pngFix');
  var imageList = '';
  var image = '';
  for (var k = 0; k < imgs.size(); k++) {
    var img = imgs.get(k);
    if (imageList.length > 0) imageList += '\n';
    imageList += img.attr('src') + '';
    // Keeps the last image whose style is not exactly 'display:none'.
    if (String(img.attr('style')) !== 'display:none') {
      image = img.attr('src');
    }
  }

  var titleLink = bucket.select('.product-specs h3 a').first();
  if (titleLink == null) return null;

  var desc = '';
  var rating = bucket.select('.product-specs .rating').first();
  if (rating != null) {
    // Description = second element after the rating. Each hop is checked so
    // a missing sibling no longer throws and aborts the rest of the page.
    var hop = rating.nextElementSibling();
    if (hop != null) hop = hop.nextElementSibling();
    if (hop != null) desc = hop.text();
  }

  var price = '';
  var priceNode = bucket.select('#start-price').first();
  if (priceNode == null) priceNode = bucket.select('.price-value').first();
  if (priceNode != null) price = priceNode.text();

  var it = env.newHashMap();
  it.put('image-list', imageList);
  it.put('image', image);
  it.put('title', titleLink.text());
  it.put('url', titleLink.attr('href'));
  it.put('desc', desc);
  it.put('price', price);
  return it;
}

/**
 * Crawls a category: the category page plus the pages linked from its
 * pagination container, collecting one record per product bucket that has a
 * title link. Errors are reported through `env.error`; a failing page is
 * skipped and the remaining pages are still processed.
 *
 * @param {*} catUrl  Address of the category listing page.
 * @param {Object} env  Sandbox environment providing `newArrayList`,
 *                      `newHashMap`, `newJsoup` and `error`.
 * @returns {Object} List created by `env.newArrayList()` with one map per product.
 */
function grabProduct(catUrl, env) {
  var tag = env.newArrayList();
  var urls = listPageUrls(catUrl, env);
  for (var i = 0; i < urls.size(); i++) {
    try {
      var doc = fetchPage(urls.get(i), env);
      var elements = doc.select('.listing-page-bucket');
      for (var j = 0; j < elements.size(); j++) {
        var it = readProduct(elements.get(j), env);
        if (it != null) tag.add(it);
      }
    } catch (e) {
      env.error(e);
    }
  }
  return tag;
}