Grab categories from Walmart
Grab categories from Walmart
- Create a JavaScript sandbox with jsoup support.
- Create the following JavaScript:
javascript
/**
 * Sandbox entry point: appends every category produced by
 * grabCategory(env) to the list stored under the 'links' key of args.
 *
 * @param {Object} env  Sandbox environment handed through to grabCategory.
 * @param {Object} args Argument container; args.get('links') must return a
 *                      list-like object exposing add(item).
 */
function main(env, args) {
  var target = args.get('links');
  var categories = grabCategory(env);
  var index = 0;
  // Copy one by one, re-reading size() each round like a plain indexed loop.
  while (index < categories.size()) {
    target.add(categories.get(index));
    index += 1;
  }
}
/**
 * Collects Walmart category links in two passes.
 *
 * Pass 1 loads the home page, finds every element with class 'MainCategory',
 * and gathers the sibling anchors (all 'a' elements under the same parent)
 * whose resolved URL starts with the Walmart '/cp/' or '/browse/' prefix.
 *
 * Pass 2 loads each of those top-level pages and looks for '.browseIn'
 * elements. When a page has none, the top-level entry itself is kept if it is
 * a '/browse/' URL; otherwise one entry per distinct '.browseIn' link is
 * emitted with the title '<sub title> | <top title>'.
 *
 * Failures are reported through env.error and never thrown to the caller: a
 * link that cannot be resolved is skipped, a top-level page that cannot be
 * loaded is skipped, and a failure to load the home page yields an empty list.
 *
 * @param {Object} env Sandbox environment providing newArrayList(),
 *                     newHashMap(), newURL(), newJsoup() and error().
 * @returns {Object} List (created by env.newArrayList()) of maps with the
 *                   keys 'title' and 'url'.
 */
function grabCategory(env) {
  var HOME_URL = 'http://www.walmart.com/index.gsp';
  var CP_PREFIX = 'http://www.walmart.com/cp/';
  var BROWSE_PREFIX = 'http://www.walmart.com/browse/';
  var TIMEOUT_MS = 60000;

  // Prefix test; a plain "contains" check would also accept foreign URLs that
  // merely embed a Walmart URL (for example inside a redirect query string).
  function startsWith(text, prefix) {
    return text.indexOf(prefix) === 0;
  }

  // Membership test on a plain object used as a string set.
  function isSeen(set, key) {
    return Object.prototype.hasOwnProperty.call(set, key);
  }

  // Resolves href against base and returns it as a script string, or null
  // (after logging) when the sandbox cannot build a URL from it. NOTE(review):
  // env.newURL presumably wraps java.net.URL, which rejects hrefs such as
  // 'javascript:void(0)' - confirm against the sandbox implementation.
  function resolve(base, href) {
    try {
      return String(env.newURL(base, href));
    } catch (e) {
      env.error(e);
      return null;
    }
  }

  var tag = env.newArrayList();
  try {
    var link = env.newURL(HOME_URL);
    var homeDoc = env.newJsoup().parse(link, TIMEOUT_MS);
    var mainLinks = homeDoc.select('.MainCategory');
    var topcat = env.newArrayList();
    var seen = {};
    var i;
    var j;

    // Pass 1: top-level categories listed next to each main category link.
    for (i = 0; i < mainLinks.size(); i++) {
      var anchors = mainLinks.get(i).parent().select('a');
      for (j = 0; j < anchors.size(); j++) {
        var anchor = anchors.get(j);
        // The main category link itself is not a destination we want.
        if (String(anchor.attr('class')) === 'MainCategory') continue;
        var topUrl = resolve(link, anchor.attr('href'));
        if (topUrl === null) continue;
        if (!startsWith(topUrl, CP_PREFIX) && !startsWith(topUrl, BROWSE_PREFIX)) continue;
        if (isSeen(seen, topUrl)) continue;
        seen[topUrl] = true;
        var topItem = env.newHashMap();
        topItem.put('title', anchor.text());
        topItem.put('url', topUrl);
        topcat.add(topItem);
      }
    }

    // Pass 2: expand each top-level category into its '.browseIn' links.
    seen = {};
    for (i = 0; i < topcat.size(); i++) {
      var tc = topcat.get(i);
      try {
        var pageUrl = String(tc.get('url'));
        var page = env.newURL(pageUrl);
        var subLinks = env.newJsoup().parse(page, TIMEOUT_MS).select('.browseIn');
        if (subLinks.size() === 0) {
          // No sub-categories: keep the page itself, but only browse pages.
          if (startsWith(pageUrl, BROWSE_PREFIX) && !isSeen(seen, pageUrl)) {
            seen[pageUrl] = true;
            tag.add(tc);
          }
          continue;
        }
        for (j = 0; j < subLinks.size(); j++) {
          var sub = subLinks.get(j);
          // Resolve against the page the link was found on so that
          // document-relative hrefs point at the right location.
          var subUrl = resolve(page, sub.attr('href'));
          if (subUrl === null || isSeen(seen, subUrl)) continue;
          seen[subUrl] = true;
          var item = env.newHashMap();
          item.put('title', sub.text() + ' | ' + tc.get('title'));
          item.put('url', subUrl);
          tag.add(item);
        }
      } catch (e) {
        // One unreachable category page must not stop the remaining ones.
        env.error(e);
      }
    }
  } catch (e) {
    env.error(e);
  }
  return tag;
}
No comments:
Post a Comment