Grab products from Amazon aStores
This task uses a JavaScript sandbox with jsoup support to grab products from Amazon aStores.
Grab products from Amazon aStores
- Create a JavaScript sandbox with jsoup support
- Create the following JavaScript:
javascript
/**
 * Script entry point: grabs the products of one hard-coded aStore category
 * and appends each of them to the list stored under 'links' in args.
 *
 * Any exception raised along the way is handed to env.error instead of
 * propagating to the caller.
 *
 * @param env  sandbox helper object; forwarded to grabProduct and used for
 *             error reporting
 * @param args argument container; only args.get('links') is read here
 */
function main(env, args) {
    // Fixed scrape target: store id, category node and number of listing pages.
    var storeId = 'paesia';
    var nodeId = '100';
    var pageCount = 2;

    try {
        var found = grabProduct(storeId, nodeId, pageCount, env);
        var sink = args.get('links');
        var index = 0;
        while (index < found.size()) {
            sink.add(found.get(index));
            index += 1;
        }
    } catch (failure) {
        env.error(failure);
    }
}
15 | |
/**
 * Collects product records from the listing pages of an aStore category.
 *
 * For every page from 1 to maxpage the listing is fetched, one record per
 * product link is created, thumbnails are attached, and each product's detail
 * page is then fetched to add further fields. A failure while processing a
 * page, or a single product's detail page, is reported through env.error and
 * does not stop the remaining work.
 *
 * Each record is a map created by env.newHashMap() and may contain the keys:
 * 'code', 'title', 'url', 'small-image', 'large-image', 'description',
 * 'details', 'editorial-reviews', 'list-price', 'offer-price', 'buy-url'.
 *
 * @param astore  store id inserted into the listing URL (before '-20')
 * @param node    category node id used as the 'node' query parameter
 * @param maxpage number of listing pages to visit, starting at page 1
 * @param env     sandbox helper object (newArrayList, newHashMap, newURL,
 *                newJsoup, getKeys, error)
 * @returns the list created by env.newArrayList(), filled with the records
 */
function grabProduct(astore, node, maxpage, env) {
    var tag = env.newArrayList();
    for (var no = 1; no <= maxpage; no++) {
        try {
            var pageUrl = env.newURL('http://astore.amazon.com/' + astore + '-20?node=' + node + '&page=' + no);
            var page = fetchDocument(pageUrl, env);

            var map = collectListedProducts(page, pageUrl, env);
            attachThumbnails(page, map);

            var keys = env.getKeys(map);
            for (var i = 0; i < keys.size(); i++) {
                try {
                    loadProductDetails(map.get(keys.get(i)), env);
                } catch (detailError) {
                    // Best effort: the record is still returned with whatever
                    // fields were filled in before the failure.
                    env.error(detailError);
                }
            }

            // Records are appended in the order reported by env.getKeys(map).
            for (var j = 0; j < keys.size(); j++) {
                tag.add(map.get(keys.get(j)));
            }
        } catch (pageError) {
            env.error(pageError);
        }
    }
    return tag;
}

/**
 * Fetches and parses a remote document.
 * The second argument of parse is 60000; presumably a timeout in
 * milliseconds -- confirm against the sandbox's jsoup wrapper.
 */
function fetchDocument(url, env) {
    return env.newJsoup().parse(url, 60000);
}

/**
 * Returns the part of href after the last '/detail/' segment,
 * or null when href contains no such segment.
 */
function extractProductCode(href) {
    var marker = '/detail/';
    var pos = href.lastIndexOf(marker);
    if (pos < 0) {
        return null;
    }
    return href.substring(pos + marker.length);
}

/**
 * Builds a map of product code -> record from the text links of a listing
 * page. Each record gets 'code', 'title' (link text) and 'url' (the link
 * resolved against the listing page URL, stored as a string).
 */
function collectListedProducts(page, pageUrl, env) {
    var anchors = page.select('#featuredProducts .textrow a');
    var products = env.newHashMap();
    for (var i = 0; i < anchors.size(); i++) {
        var anchor = anchors.get(i);
        var title = anchor.text();
        var href = anchor.attr('href');
        var code = extractProductCode(href);
        if (code == null) {
            continue;
        }
        var absoluteUrl = env.newURL(pageUrl, href) + '';
        var item = env.newHashMap();
        item.put('code', code);
        item.put('title', title);
        item.put('url', absoluteUrl);
        products.put(code, item);
    }
    return products;
}

/**
 * Adds 'small-image' to every record that has a matching image link on the
 * listing page and replaces its title by the image's alt text when that text
 * is not empty. Image links whose code is unknown are ignored.
 */
function attachThumbnails(page, products) {
    var anchors = page.select('#featuredProducts .imagerow a');
    for (var i = 0; i < anchors.size(); i++) {
        var anchor = anchors.get(i);
        var code = extractProductCode(anchor.attr('href'));
        if (code == null) {
            continue;
        }
        var item = products.get(code);
        if (item == null) {
            continue;
        }
        var img = anchor.select('img').first();
        if (img == null) {
            continue;
        }
        var alt = img.attr('alt');
        var src = img.attr('src');
        // length() is called as a method: attr presumably returns a Java
        // string in this sandbox -- kept exactly as the original script did.
        if (alt.length() > 0) {
            item.put('title', alt);
        }
        item.put('small-image', src);
    }
}

/**
 * Fetches the detail page stored under the record's 'url' and copies the
 * large image, description, details, editorial reviews, prices and buy link
 * into the record. Parts that are missing on the page are skipped.
 */
function loadProductDetails(item, env) {
    var detailUrl = env.newURL(item.get('url'));
    var doc = fetchDocument(detailUrl, env);

    var image = doc.select('#detailImage img').first();
    if (image != null) {
        item.put('large-image', image.attr('src'));
    }

    copySectionHtml(doc, '#productDescription', '<h2>Product Description</h2>', item, 'description', env);
    copySectionHtml(doc, '#productDetails', '<h2>Product Details</h2>', item, 'details', env);
    copySectionHtml(doc, '#editorialReviews', null, item, 'editorial-reviews', env);

    copyText(doc, '#detailListPrice', item, 'list-price');
    copyText(doc, '#detailOfferPrice', item, 'offer-price');

    var buyLink = doc.select('#addToCartForm a').first();
    if (buyLink != null) {
        item.put('buy-url', buyLink.attr('href'));
    }
}

/**
 * Stores the inner HTML of the first element matching selector under key.
 * When heading is given and found, everything up to and including it is
 * dropped. Links and images inside the fragment are made absolute through
 * buildURL. Nothing is stored if the resulting HTML contains '<html'.
 */
function copySectionHtml(doc, selector, heading, item, key, env) {
    var section = doc.select(selector).first();
    if (section == null) {
        return;
    }
    var html = section.html();
    if (heading != null) {
        var pos = html.indexOf(heading);
        if (pos >= 0) {
            html = html.substring(pos + heading.length);
        }
    }
    var pageUrl = item.get('url');
    var fragment = env.newJsoup().parse(html, pageUrl);
    buildURL(fragment, pageUrl, env);
    html = fragment.select('body').first().html();
    if (html.indexOf('<html') < 0) {
        item.put(key, html);
    }
}

/** Stores the text of the first element matching selector under key, if any. */
function copyText(doc, selector, item, key) {
    var element = doc.select(selector).first();
    if (element != null) {
        item.put(key, element.text());
    }
}
133 | |
/**
 * Rewrites the href of every <a> and the src of every <img> in doc to the
 * value produced by env.newURL(base, reference), i.e. resolved against
 * baseUrl. The document is modified in place; nothing is returned.
 *
 * Elements that do not carry the attribute at all (for example named anchors)
 * are left untouched instead of being pointed at the base page. If a single
 * reference cannot be resolved, the failure is reported through env.error and
 * the remaining elements are still processed.
 *
 * @param doc     parsed document/fragment offering select(cssQuery)
 * @param baseUrl URL (string form accepted by env.newURL) the references are
 *                resolved against
 * @param env     sandbox helper object (newURL, error)
 */
function buildURL(doc, baseUrl, env) {
    var base = env.newURL(baseUrl);
    absolutizeAttribute(doc.select('a'), 'href', base, env);
    absolutizeAttribute(doc.select('img'), 'src', base, env);
}

/**
 * Replaces attrName on each element with its value resolved against base.
 * Skips elements lacking the attribute; reports per-element failures.
 */
function absolutizeAttribute(elements, attrName, base, env) {
    for (var i = 0; i < elements.size(); i++) {
        var element = elements.get(i);
        if (!element.hasAttr(attrName)) {
            continue;
        }
        try {
            var resolved = env.newURL(base, element.attr(attrName));
            element.attr(attrName, resolved + '');
        } catch (e) {
            // NOTE(review): env.newURL presumably rejects references it cannot
            // parse (e.g. unsupported schemes); keep the original value then.
            env.error(e);
        }
    }
}
/**
 * Script entry point: grabs the products of one hard-coded aStore category
 * and appends each of them to the list stored under 'links' in args.
 *
 * Any exception raised along the way is handed to env.error instead of
 * propagating to the caller.
 *
 * @param env  sandbox helper object; forwarded to grabProduct and used for
 *             error reporting
 * @param args argument container; only args.get('links') is read here
 */
function main(env, args) {
    // Fixed scrape target: store id, category node and number of listing pages.
    var astore = 'paesia';
    var node = '100';
    var maxpage = 2;
    try {
        var products = grabProduct(astore, node, maxpage, env);
        var links = args.get('links');
        for (var i = 0; i < products.size(); i++) {
            links.add(products.get(i));
        }
    } catch (e) {
        env.error(e);
    }
}

/**
 * Collects product records from the listing pages of an aStore category.
 *
 * For every page from 1 to maxpage the listing is fetched, one record per
 * product link is created, thumbnails are attached, and each product's detail
 * page is then fetched to add further fields. A failure while processing a
 * page, or a single product's detail page, is reported through env.error and
 * does not stop the remaining work.
 *
 * Each record is a map created by env.newHashMap() and may contain the keys:
 * 'code', 'title', 'url', 'small-image', 'large-image', 'description',
 * 'details', 'editorial-reviews', 'list-price', 'offer-price', 'buy-url'.
 *
 * @param astore  store id inserted into the listing URL (before '-20')
 * @param node    category node id used as the 'node' query parameter
 * @param maxpage number of listing pages to visit, starting at page 1
 * @param env     sandbox helper object (newArrayList, newHashMap, newURL,
 *                newJsoup, getKeys, error)
 * @returns the list created by env.newArrayList(), filled with the records
 */
function grabProduct(astore, node, maxpage, env) {
    var tag = env.newArrayList();
    for (var no = 1; no <= maxpage; no++) {
        try {
            var pageUrl = env.newURL('http://astore.amazon.com/' + astore + '-20?node=' + node + '&page=' + no);
            var page = fetchDocument(pageUrl, env);

            var map = collectListedProducts(page, pageUrl, env);
            attachThumbnails(page, map);

            var keys = env.getKeys(map);
            for (var i = 0; i < keys.size(); i++) {
                try {
                    loadProductDetails(map.get(keys.get(i)), env);
                } catch (detailError) {
                    // Best effort: the record is still returned with whatever
                    // fields were filled in before the failure.
                    env.error(detailError);
                }
            }

            // Records are appended in the order reported by env.getKeys(map).
            for (var j = 0; j < keys.size(); j++) {
                tag.add(map.get(keys.get(j)));
            }
        } catch (pageError) {
            env.error(pageError);
        }
    }
    return tag;
}

/**
 * Fetches and parses a remote document.
 * The second argument of parse is 60000; presumably a timeout in
 * milliseconds -- confirm against the sandbox's jsoup wrapper.
 */
function fetchDocument(url, env) {
    return env.newJsoup().parse(url, 60000);
}

/**
 * Returns the part of href after the last '/detail/' segment,
 * or null when href contains no such segment.
 */
function extractProductCode(href) {
    var marker = '/detail/';
    var pos = href.lastIndexOf(marker);
    if (pos < 0) {
        return null;
    }
    return href.substring(pos + marker.length);
}

/**
 * Builds a map of product code -> record from the text links of a listing
 * page. Each record gets 'code', 'title' (link text) and 'url' (the link
 * resolved against the listing page URL, stored as a string).
 */
function collectListedProducts(page, pageUrl, env) {
    var anchors = page.select('#featuredProducts .textrow a');
    var products = env.newHashMap();
    for (var i = 0; i < anchors.size(); i++) {
        var anchor = anchors.get(i);
        var title = anchor.text();
        var href = anchor.attr('href');
        var code = extractProductCode(href);
        if (code == null) {
            continue;
        }
        var absoluteUrl = env.newURL(pageUrl, href) + '';
        var item = env.newHashMap();
        item.put('code', code);
        item.put('title', title);
        item.put('url', absoluteUrl);
        products.put(code, item);
    }
    return products;
}

/**
 * Adds 'small-image' to every record that has a matching image link on the
 * listing page and replaces its title by the image's alt text when that text
 * is not empty. Image links whose code is unknown are ignored.
 */
function attachThumbnails(page, products) {
    var anchors = page.select('#featuredProducts .imagerow a');
    for (var i = 0; i < anchors.size(); i++) {
        var anchor = anchors.get(i);
        var code = extractProductCode(anchor.attr('href'));
        if (code == null) {
            continue;
        }
        var item = products.get(code);
        if (item == null) {
            continue;
        }
        var img = anchor.select('img').first();
        if (img == null) {
            continue;
        }
        var alt = img.attr('alt');
        var src = img.attr('src');
        // length() is called as a method: attr presumably returns a Java
        // string in this sandbox -- kept exactly as the original script did.
        if (alt.length() > 0) {
            item.put('title', alt);
        }
        item.put('small-image', src);
    }
}

/**
 * Fetches the detail page stored under the record's 'url' and copies the
 * large image, description, details, editorial reviews, prices and buy link
 * into the record. Parts that are missing on the page are skipped.
 */
function loadProductDetails(item, env) {
    var detailUrl = env.newURL(item.get('url'));
    var doc = fetchDocument(detailUrl, env);

    var image = doc.select('#detailImage img').first();
    if (image != null) {
        item.put('large-image', image.attr('src'));
    }

    copySectionHtml(doc, '#productDescription', '<h2>Product Description</h2>', item, 'description', env);
    copySectionHtml(doc, '#productDetails', '<h2>Product Details</h2>', item, 'details', env);
    copySectionHtml(doc, '#editorialReviews', null, item, 'editorial-reviews', env);

    copyText(doc, '#detailListPrice', item, 'list-price');
    copyText(doc, '#detailOfferPrice', item, 'offer-price');

    var buyLink = doc.select('#addToCartForm a').first();
    if (buyLink != null) {
        item.put('buy-url', buyLink.attr('href'));
    }
}

/**
 * Stores the inner HTML of the first element matching selector under key.
 * When heading is given and found, everything up to and including it is
 * dropped. Links and images inside the fragment are made absolute through
 * buildURL. Nothing is stored if the resulting HTML contains '<html'.
 */
function copySectionHtml(doc, selector, heading, item, key, env) {
    var section = doc.select(selector).first();
    if (section == null) {
        return;
    }
    var html = section.html();
    if (heading != null) {
        var pos = html.indexOf(heading);
        if (pos >= 0) {
            html = html.substring(pos + heading.length);
        }
    }
    var pageUrl = item.get('url');
    var fragment = env.newJsoup().parse(html, pageUrl);
    buildURL(fragment, pageUrl, env);
    html = fragment.select('body').first().html();
    if (html.indexOf('<html') < 0) {
        item.put(key, html);
    }
}

/** Stores the text of the first element matching selector under key, if any. */
function copyText(doc, selector, item, key) {
    var element = doc.select(selector).first();
    if (element != null) {
        item.put(key, element.text());
    }
}

/**
 * Rewrites the href of every <a> and the src of every <img> in doc to the
 * value produced by env.newURL(base, reference), i.e. resolved against
 * baseUrl. The document is modified in place; nothing is returned.
 *
 * Elements that do not carry the attribute at all (for example named anchors)
 * are left untouched instead of being pointed at the base page. If a single
 * reference cannot be resolved, the failure is reported through env.error and
 * the remaining elements are still processed.
 *
 * @param doc     parsed document/fragment offering select(cssQuery)
 * @param baseUrl URL (string form accepted by env.newURL) the references are
 *                resolved against
 * @param env     sandbox helper object (newURL, error)
 */
function buildURL(doc, baseUrl, env) {
    var base = env.newURL(baseUrl);
    absolutizeAttribute(doc.select('a'), 'href', base, env);
    absolutizeAttribute(doc.select('img'), 'src', base, env);
}

/**
 * Replaces attrName on each element with its value resolved against base.
 * Skips elements lacking the attribute; reports per-element failures.
 */
function absolutizeAttribute(elements, attrName, base, env) {
    for (var i = 0; i < elements.size(); i++) {
        var element = elements.get(i);
        if (!element.hasAttr(attrName)) {
            continue;
        }
        try {
            var resolved = env.newURL(base, element.attr(attrName));
            element.attr(attrName, resolved + '');
        } catch (e) {
            // NOTE(review): env.newURL presumably rejects references it cannot
            // parse (e.g. unsupported schemes); keep the original value then.
            env.error(e);
        }
    }
}
No comments:
Post a Comment