Grab categories from Newegg
Grab categories from Newegg
- Create a JavaScript sandbox with jsoup support
- Create the following JavaScript:
javascript
/**
 * Sandbox entry point: copies every entry returned by grabCategory(env)
 * into the list stored under the 'links' key of args.
 *
 * @param env  sandbox environment object; passed through to grabCategory
 * @param args argument container; args.get('links') must return an object
 *             with an add() method that receives each category entry
 */
function main(env, args) {
  var links = args.get('links');
  var cats = grabCategory(env);
  // cats is a list created by the sandbox (size()/get(i)), not a JS array,
  // so it is walked by index.
  for (var i = 0; i < cats.size(); i++) {
    links.add(cats.get(i));
  }
}
8 | |
/**
 * Scrapes the Newegg site map and returns one entry per sub-category link.
 *
 * Every anchor matching 'h5 a.nolone' is treated as a top-level category.
 * The sub-category anchors ('a.nolone') are looked up in the element that
 * follows the heading anchor's parent. Each result is a map created with
 * env.newHashMap() holding:
 *   'title' - '<sub-category> | <top category>', both with the ' (…)' part
 *             removed (see stripParenSuffix)
 *   'url'   - the link's href resolved against the site-map URL, as a string
 *
 * Any exception is reported through env.error(e); the entries collected up
 * to that point are still returned.
 *
 * @param env sandbox environment providing newArrayList, newHashMap, newURL,
 *            newJsoup and error
 * @returns the list created by env.newArrayList(), possibly empty
 */
function grabCategory(env) {
  var tag = env.newArrayList();
  try {
    var link = env.newURL('http://www.newegg.com/Info/SiteMap.aspx');
    // Second argument is presumably jsoup's timeout in milliseconds.
    var doc = env.newJsoup().parse(link, 60000);
    var elements = doc.select('h5 a.nolone');
    for (var i = 0; i < elements.size(); i++) {
      var element = elements.get(i);
      var topcat = stripParenSuffix(element.text());
      var sibling = element.parent().nextElementSibling();
      if (sibling == null) {
        // A heading with nothing after it has no sub-categories. Skip it
        // instead of throwing, which would abort every remaining heading.
        continue;
      }
      var children = sibling.select('a.nolone');
      for (var j = 0; j < children.size(); j++) {
        var child = children.get(j);
        var item = env.newHashMap();
        item.put('title', stripParenSuffix(child.text()) + ' | ' + topcat);
        // + '' turns the URL object into its string form.
        item.put('url', env.newURL(link, child.attr('href')) + '');
        tag.add(item);
      }
    }
  } catch (e) {
    env.error(e);
  }
  return tag;
}

/**
 * Cuts text at its last ' (' when a ')' occurs somewhere after it, e.g.
 * 'Laptops (30)' -> 'Laptops'. Text without such a pair is returned as is.
 * Only lastIndexOf/substring are used so the argument may be a Java string.
 */
function stripParenSuffix(text) {
  var open = text.lastIndexOf(' (');
  var close = text.lastIndexOf(')');
  if (open >= 0 && close > open) {
    return text.substring(0, open);
  }
  return text;
}
/**
 * Sandbox entry point: copies every entry returned by grabCategory(env)
 * into the list stored under the 'links' key of args.
 *
 * @param env  sandbox environment object; passed through to grabCategory
 * @param args argument container; args.get('links') must return an object
 *             with an add() method
 */
function main(env, args) {
  var links = args.get('links');
  var cats = grabCategory(env);
  for (var i = 0; i < cats.size(); i++) {
    links.add(cats.get(i));
  }
}

/**
 * Scrapes the Newegg site map and returns one entry per sub-category link.
 *
 * Anchors matching 'h5 a.nolone' are the top-level categories; their
 * sub-category anchors ('a.nolone') are read from the element following the
 * heading anchor's parent. Each entry is an env.newHashMap() with:
 *   'title' - '<sub-category> | <top category>', ' (…)' parts removed
 *   'url'   - the href resolved against the site-map URL, as a string
 *
 * Exceptions are reported through env.error(e); whatever was collected
 * before the failure is still returned.
 *
 * @param env sandbox environment providing newArrayList, newHashMap, newURL,
 *            newJsoup and error
 * @returns the list created by env.newArrayList(), possibly empty
 */
function grabCategory(env) {
  var tag = env.newArrayList();
  try {
    var link = env.newURL('http://www.newegg.com/Info/SiteMap.aspx');
    // Second argument is presumably jsoup's timeout in milliseconds.
    var doc = env.newJsoup().parse(link, 60000);
    var elements = doc.select('h5 a.nolone');
    for (var i = 0; i < elements.size(); i++) {
      var element = elements.get(i);
      var topcat = trimParenSuffix(element.text());
      var sibling = element.parent().nextElementSibling();
      if (sibling == null) {
        // No element after the heading means no sub-category list. Skipping
        // keeps one odd heading from aborting the rest of the scan.
        continue;
      }
      var children = sibling.select('a.nolone');
      for (var j = 0; j < children.size(); j++) {
        var child = children.get(j);
        var title = trimParenSuffix(child.text()) + ' | ' + topcat;
        // + '' turns the URL object into its string form.
        var url = env.newURL(link, child.attr('href')) + '';
        var item = env.newHashMap();
        item.put('title', title);
        item.put('url', url);
        tag.add(item);
      }
    }
  } catch (e) {
    env.error(e);
  }
  return tag;
}

/**
 * Returns text cut at its last ' (' when a ')' occurs after that position
 * ('Laptops (30)' -> 'Laptops'); otherwise returns text unchanged.
 * Restricted to lastIndexOf/substring so a Java string argument also works.
 */
function trimParenSuffix(text) {
  var open = text.lastIndexOf(' (');
  var close = text.lastIndexOf(')');
  if (open >= 0 && close > open) {
    return text.substring(0, open);
  }
  return text;
}
No comments:
Post a Comment