Grab categories from Walmart
Grab categories from Walmart
- Create a JavaScript sandbox with jsoup support.
- Create the following JavaScript script:
javascript
/**
 * Sandbox entry point: copies every category entry produced by
 * grabCategory(env) into the list stored under 'links' in args.
 *
 * @param env  sandbox helper object; passed straight through to grabCategory.
 * @param args argument container; args.get('links') must return a list with add().
 */
function main(env, args) {
  // `var` is kept on purpose: the script shows no ES2015+ syntax, and the
  // sandbox engine may not support it.
  var links = args.get('links');
  var cats = grabCategory(env);
  for (var i = 0; i < cats.size(); i++) {
    links.add(cats.get(i));
  }
}
8 | |
/**
 * Resolves `href` against `base` through env.newURL and returns the absolute
 * URL as a string. Returns null (after reporting through env.error) when
 * env.newURL throws, so that one unusable link does not abort the caller.
 *
 * NOTE(review): env.newURL presumably wraps java.net.URL, which rejects
 * schemes such as `javascript:` — confirm against the sandbox.
 */
function resolveHref(env, base, href) {
  try {
    return String(env.newURL(base, href));
  } catch (e) {
    env.error(e);
    return null;
  }
}

/**
 * Collects category links starting from http://www.walmart.com/index.gsp.
 *
 * Phase 1: for every '.MainCategory' element on the start page, looks at the
 * anchors under its parent and keeps the distinct ones whose resolved URL
 * starts with the site's /cp/ or /browse/ prefix ("top categories").
 *
 * Phase 2: fetches each top category page. If the page has '.browseIn'
 * elements, each one becomes an entry titled "<element text> | <top title>";
 * otherwise the top category itself is kept when it is a /browse/ URL.
 *
 * Errors are reported through env.error; a failure on one category page does
 * not stop the remaining pages from being processed.
 *
 * @param env sandbox helper providing newArrayList, newHashMap, newURL,
 *            newJsoup and error.
 * @returns the list created by env.newArrayList(), holding maps with 'title'
 *          and 'url' entries (empty if the start page could not be processed).
 */
function grabCategory(env) {
  // `var` is kept on purpose: the script shows no ES2015+ syntax, and the
  // sandbox engine may not support it.
  var CP_PREFIX = 'http://www.walmart.com/cp/';
  var BROWSE_PREFIX = 'http://www.walmart.com/browse/';
  // Second argument of parse(); presumably a timeout in milliseconds — confirm.
  var FETCH_TIMEOUT = 60000;

  var tag = env.newArrayList();
  try {
    var link = env.newURL('http://www.walmart.com/index.gsp');
    var doc = env.newJsoup().parse(link, FETCH_TIMEOUT);
    var elements = doc.select('.MainCategory');
    var saved = env.newArrayList();
    var topcat = env.newArrayList();
    var i, j, url, item;

    for (i = 0; i < elements.size(); i++) {
      var children = elements.get(i).parent().select('a');
      for (j = 0; j < children.size(); j++) {
        var child = children.get(j);
        // Skip the heading anchor itself. String() makes the comparison
        // independent of whether the host returns a primitive string.
        if (String(child.attr('class')) === 'MainCategory') continue;
        url = resolveHref(env, link, child.attr('href'));
        if (url === null) continue;
        // Prefix match (not "contains"): an off-site URL that merely embeds
        // a walmart.com URL in its query string must not pass.
        if (url.indexOf(CP_PREFIX) !== 0 && url.indexOf(BROWSE_PREFIX) !== 0) continue;
        if (saved.indexOf(url) >= 0) continue;
        saved.add(url);
        item = env.newHashMap();
        item.put('title', child.text());
        item.put('url', url);
        topcat.add(item);
      }
    }

    // Reuse the same list to de-duplicate the final results.
    saved.clear();
    for (i = 0; i < topcat.size(); i++) {
      var tc = topcat.get(i);
      try {
        var pageUrl = env.newURL(tc.get('url'));
        doc = env.newJsoup().parse(pageUrl, FETCH_TIMEOUT);
        elements = doc.select('.browseIn');
        if (elements.size() === 0) {
          // No sub-categories listed: keep the top category itself, but only
          // when it is a /browse/ URL.
          url = tc.get('url');
          if (url.indexOf(BROWSE_PREFIX) === 0 && saved.indexOf(url) < 0) {
            saved.add(url);
            tag.add(tc);
          }
        } else {
          for (j = 0; j < elements.size(); j++) {
            var element = elements.get(j);
            // Resolve against the page that was actually fetched, so that
            // page-relative hrefs point below the category, not the homepage.
            url = resolveHref(env, pageUrl, element.attr('href'));
            if (url === null) continue;
            if (saved.indexOf(url) >= 0) continue;
            saved.add(url);
            item = env.newHashMap();
            item.put('title', element.text() + ' | ' + tc.get('title'));
            item.put('url', url);
            tag.add(item);
          }
        }
      } catch (e) {
        env.error(e);
      }
    }
  } catch (e) {
    env.error(e);
  }
  return tag;
}
/**
 * Sandbox entry point: copies every category entry produced by
 * grabCategory(env) into the list stored under 'links' in args.
 *
 * @param env  sandbox helper object; passed straight through to grabCategory.
 * @param args argument container; args.get('links') must return a list with add().
 */
function main(env, args) {
  // `var` is kept on purpose: the script shows no ES2015+ syntax, and the
  // sandbox engine may not support it.
  var links = args.get('links');
  var cats = grabCategory(env);
  for (var i = 0; i < cats.size(); i++) {
    links.add(cats.get(i));
  }
}

/**
 * Resolves `href` against `base` through env.newURL and returns the absolute
 * URL as a string. Returns null (after reporting through env.error) when
 * env.newURL throws, so that one unusable link does not abort the caller.
 *
 * NOTE(review): env.newURL presumably wraps java.net.URL, which rejects
 * schemes such as `javascript:` — confirm against the sandbox.
 */
function resolveHref(env, base, href) {
  try {
    return String(env.newURL(base, href));
  } catch (e) {
    env.error(e);
    return null;
  }
}

/**
 * Collects category links starting from http://www.walmart.com/index.gsp.
 *
 * Phase 1: for every '.MainCategory' element on the start page, looks at the
 * anchors under its parent and keeps the distinct ones whose resolved URL
 * starts with the site's /cp/ or /browse/ prefix ("top categories").
 *
 * Phase 2: fetches each top category page. If the page has '.browseIn'
 * elements, each one becomes an entry titled "<element text> | <top title>";
 * otherwise the top category itself is kept when it is a /browse/ URL.
 *
 * Errors are reported through env.error; a failure on one category page does
 * not stop the remaining pages from being processed.
 *
 * @param env sandbox helper providing newArrayList, newHashMap, newURL,
 *            newJsoup and error.
 * @returns the list created by env.newArrayList(), holding maps with 'title'
 *          and 'url' entries (empty if the start page could not be processed).
 */
function grabCategory(env) {
  var CP_PREFIX = 'http://www.walmart.com/cp/';
  var BROWSE_PREFIX = 'http://www.walmart.com/browse/';
  // Second argument of parse(); presumably a timeout in milliseconds — confirm.
  var FETCH_TIMEOUT = 60000;

  var tag = env.newArrayList();
  try {
    var link = env.newURL('http://www.walmart.com/index.gsp');
    var doc = env.newJsoup().parse(link, FETCH_TIMEOUT);
    var elements = doc.select('.MainCategory');
    var saved = env.newArrayList();
    var topcat = env.newArrayList();
    var i, j, url, item;

    for (i = 0; i < elements.size(); i++) {
      var children = elements.get(i).parent().select('a');
      for (j = 0; j < children.size(); j++) {
        var child = children.get(j);
        // Skip the heading anchor itself. String() makes the comparison
        // independent of whether the host returns a primitive string.
        if (String(child.attr('class')) === 'MainCategory') continue;
        url = resolveHref(env, link, child.attr('href'));
        if (url === null) continue;
        // Prefix match (not "contains"): an off-site URL that merely embeds
        // a walmart.com URL in its query string must not pass.
        if (url.indexOf(CP_PREFIX) !== 0 && url.indexOf(BROWSE_PREFIX) !== 0) continue;
        if (saved.indexOf(url) >= 0) continue;
        saved.add(url);
        item = env.newHashMap();
        item.put('title', child.text());
        item.put('url', url);
        topcat.add(item);
      }
    }

    // Reuse the same list to de-duplicate the final results.
    saved.clear();
    for (i = 0; i < topcat.size(); i++) {
      var tc = topcat.get(i);
      try {
        var pageUrl = env.newURL(tc.get('url'));
        doc = env.newJsoup().parse(pageUrl, FETCH_TIMEOUT);
        elements = doc.select('.browseIn');
        if (elements.size() === 0) {
          // No sub-categories listed: keep the top category itself, but only
          // when it is a /browse/ URL.
          url = tc.get('url');
          if (url.indexOf(BROWSE_PREFIX) === 0 && saved.indexOf(url) < 0) {
            saved.add(url);
            tag.add(tc);
          }
        } else {
          for (j = 0; j < elements.size(); j++) {
            var element = elements.get(j);
            // Resolve against the page that was actually fetched, so that
            // page-relative hrefs point below the category, not the homepage.
            url = resolveHref(env, pageUrl, element.attr('href'));
            if (url === null) continue;
            if (saved.indexOf(url) >= 0) continue;
            saved.add(url);
            item = env.newHashMap();
            item.put('title', element.text() + ' | ' + tc.get('title'));
            item.put('url', url);
            tag.add(item);
          }
        }
      } catch (e) {
        env.error(e);
      }
    }
  } catch (e) {
    env.error(e);
  }
  return tag;
}
No comments:
Post a Comment