Grab categories from Walmart
Grab categories from Walmart
- Create a JavaScript sandbox with jsoup support
- Create the JavaScript as follows:
javascript
/**
 * Sandbox entry point: appends every category returned by grabCategory()
 * to the list stored under the 'links' key of `args`.
 *
 * NOTE(review): `var` and ES5 syntax are kept on purpose; the script appears
 * to run inside a Java-hosted sandbox engine whose language level is not
 * visible here — confirm before modernising.
 *
 * @param {Object} env  Sandbox helper providing newArrayList(), newHashMap(),
 *                      newURL(), newJsoup() and error().
 * @param {Object} args Map-like object; args.get('links') must return a list
 *                      exposing add().
 */
function main(env, args) {
  var links = args.get('links');
  var cats = grabCategory(env);
  for (var i = 0; i < cats.size(); i++) {
    links.add(cats.get(i));
  }
}

/**
 * Crawls the Walmart index page for top-level category links, then visits
 * each of them to collect its '.browseIn' sub-category links.
 *
 * Every failure is reported through env.error(); a failure while processing
 * one top-level category does not stop the remaining ones, and a failure
 * while reading the index page yields an empty list.
 *
 * @param {Object} env Sandbox helper (see main()).
 * @returns {Object} List (from env.newArrayList()) of maps with the keys
 *                   'title' and 'url'.
 */
function grabCategory(env) {
  var tag = env.newArrayList();
  try {
    var link = env.newURL('http://www.walmart.com/index.gsp');
    var topcat = collectTopCategories(env, link);
    // URLs already emitted into `tag`, shared across all top-level categories.
    var saved = env.newArrayList();
    for (var i = 0; i < topcat.size(); i++) {
      try {
        collectSubCategories(env, topcat.get(i), saved, tag);
      } catch (e) {
        // Best effort: report and continue with the next top-level category.
        env.error(e);
      }
    }
  } catch (e) {
    env.error(e);
  }
  return tag;
}

/** Returns true when `url` contains a Walmart '/cp/' or '/browse/' address. */
function isCategoryUrl(url) {
  return url.indexOf('http://www.walmart.com/cp/') >= 0 ||
    url.indexOf('http://www.walmart.com/browse/') >= 0;
}

/** Builds a { title, url } map using the sandbox map factory. */
function newCategoryItem(env, title, url) {
  var item = env.newHashMap();
  item.put('title', title);
  item.put('url', url);
  return item;
}

/**
 * Reads the index page at `link` and returns the de-duplicated list of
 * top-level category items linked next to each '.MainCategory' element.
 */
function collectTopCategories(env, link) {
  // 60000 is presumably the fetch timeout in milliseconds — TODO confirm.
  var doc = env.newJsoup().parse(link, 60000);
  var headings = doc.select('.MainCategory');
  var seen = env.newArrayList();
  var topcat = env.newArrayList();
  for (var i = 0; i < headings.size(); i++) {
    var anchors = headings.get(i).parent().select('a');
    for (var j = 0; j < anchors.size(); j++) {
      var anchor = anchors.get(j);
      // Skip the heading link itself. Loose equality is deliberate: attr()
      // may return a host string object for which === could be false.
      if (anchor.attr('class') == 'MainCategory') continue;
      var url = env.newURL(link, anchor.attr('href')) + '';
      if (!isCategoryUrl(url)) continue;
      if (seen.indexOf(url) >= 0) continue;
      seen.add(url);
      topcat.add(newCategoryItem(env, anchor.text(), url));
    }
  }
  return topcat;
}

/**
 * Visits one top-level category page and appends its sub-categories to `tag`.
 * When the page has no '.browseIn' elements, the category itself is appended
 * if its URL is a '/browse/' address. `saved` holds the URLs already emitted.
 */
function collectSubCategories(env, tc, saved, tag) {
  var pageUrl = env.newURL(tc.get('url'));
  var doc = env.newJsoup().parse(pageUrl, 60000);
  var elements = doc.select('.browseIn');

  if (elements.size() == 0) {
    var ownUrl = tc.get('url');
    if (ownUrl.indexOf('http://www.walmart.com/browse/') >= 0 && saved.indexOf(ownUrl) < 0) {
      saved.add(ownUrl);
      tag.add(tc);
    }
    return;
  }

  for (var j = 0; j < elements.size(); j++) {
    var element = elements.get(j);
    var href = element.attr('href') + '';
    // An element without href would resolve to the page itself and create a
    // bogus category entry, so it is skipped.
    if (href === '') continue;
    // Resolve against the page the link was found on (not the index page) so
    // that document-relative hrefs point to the right location.
    var url = env.newURL(pageUrl, href) + '';
    if (saved.indexOf(url) >= 0) continue;
    saved.add(url);
    tag.add(newCategoryItem(env, element.text() + ' | ' + tc.get('title'), url));
  }
}
No comments:
Post a Comment