Grab google search results
This task uses a JavaScript sandbox with jsoup support to grab Google search results.
Grab google search results
- Create a JavaScript sandbox with jsoup support
- Create the JavaScript as follows:
javascript
/**
 * Sandbox entry point: runs a web search for the fixed term 'lucene',
 * asking for at most 50 results.
 *
 * @param env  sandbox helper object, passed through untouched to searchGoogle
 * @param args task arguments, passed through untouched to searchGoogle
 */
function main(env, args) {
  // Search term first, then the upper bound on collected results.
  searchGoogle('lucene', 50, env, args);
}
/**
 * Collects search results page by page until at least `max` have been
 * gathered or a page comes back empty, then drops any surplus so that no
 * more than `max` of the gathered entries remain.
 *
 * @param query search term handed to searchGooglePage
 * @param max   maximum number of results to keep
 * @param env   sandbox helper object, forwarded to searchGooglePage
 * @param args  task arguments; args.get('links') is the collection that
 *              searchGooglePage fills and that is trimmed here
 */
function searchGoogle(query, max, env, args) {
  var collected = args.get('links');
  var fetched;
  // `offset` doubles as the running total of results gathered so far.
  for (var offset = 0; offset < max; offset += fetched) {
    fetched = searchGooglePage(query, offset, env, args);
    if (fetched == 0) break;
  }
  // The last page may overshoot `max`; remove the extras from the tail,
  // highest index first, so earlier indices stay valid.
  // NOTE(review): the index arithmetic assumes the collection was empty
  // when this function started — confirm against the caller.
  var surplus = offset - max;
  while (surplus > 0) {
    collected.remove(max + surplus - 1);
    surplus--;
  }
}
/**
 * Requests one page of results from the Google AJAX web-search endpoint and
 * appends every hit to the collection returned by args.get('links').
 *
 * Each appended entry is a map created with env.newHashMap() holding the keys
 * 'url', 'title', 'titleNoFormatting', 'visibleUrl', 'content' and 'cacheUrl'.
 *
 * Any exception raised while fetching or parsing is reported through
 * env.error and not rethrown; entries added before the failure are kept and
 * counted.
 *
 * @param query search term; URL-encoded as UTF-8 before being sent
 * @param start value sent as the `start` request parameter
 * @param env   sandbox helper object (newURL, encodeURL, newJsoup,
 *              newJSONObject, newHashMap, error)
 * @param args  task arguments; args.get('links') receives the results
 * @returns number of entries appended by this call (0 on immediate failure)
 */
function searchGooglePage(query, start, env, args) {
  var links = args.get('links');
  var count = 0;
  try {
    // Only the encoded query goes into `q`; nothing else may be appended to
    // it, otherwise the search term sent to the service is corrupted.
    var requestUrl = env.newURL('http://ajax.googleapis.com/ajax/services/search/web?v=1.0&rsz=large&safe=active&q=' + env.encodeURL(query, 'UTF-8') + '&start=' + start);
    var conn = env.newJsoup().connect(requestUrl).timeout(60000);
    var body = conn.execute().body();
    var root = env.newJSONObject().fromObject(body);
    var resp = root.getJSONObject('responseData');
    var results = resp.getJSONArray('results');
    for (var i = 0; i < results.size(); i++) {
      var resItem = results.getJSONObject(i);
      // Kept in its own variable so the request URL above is not overwritten.
      var resultUrl = unescape(resItem.getString('url'), env);
      var title = resItem.getString('title');
      var titleNoFormatting = resItem.getString('titleNoFormatting');
      var visibleUrl = resItem.getString('visibleUrl');
      var content = unescape(resItem.getString('content'), env);
      var cacheUrl = unescape(resItem.getString('cacheUrl'), env);
      var item = env.newHashMap();
      item.put('url', resultUrl);
      item.put('title', title);
      item.put('titleNoFormatting', titleNoFormatting);
      item.put('visibleUrl', visibleUrl);
      item.put('content', content);
      item.put('cacheUrl', cacheUrl);
      links.add(item);
      count++;
    }
  } catch (e) {
    // Best effort: report the failure and return what was collected so far.
    env.error(e);
  }
  return count;
}
/**
 * Turns the literal six-character escape sequences \u003c, \u003e and \u003d
 * found in `src` into the characters '<', '>' and '=' and returns the result
 * as a JavaScript string.
 *
 * @param src text that may contain the escape sequences
 * @param env sandbox helper object; env.newString(src) supplies the string
 *            object on which replaceAll is called
 * @returns the decoded text as a JavaScript string
 */
function unescape(src, env) {
  var tag = env.newString(src);
  // NOTE(review): presumably env.newString returns a java.lang.String, whose
  // replaceAll takes a regular expression — confirm. The patterns therefore
  // escape the backslash so they match the literal text "\u003c" etc.
  // Writing '\u003c' directly in a JS literal is just the character '<' and
  // would replace that character with itself.
  tag = tag.replaceAll('\\\\u003c', '<');
  tag = tag.replaceAll('\\\\u003e', '>');
  tag = tag.replaceAll('\\\\u003d', '=');
  // Concatenating with '' converts the result to a JavaScript string.
  return tag + '';
}
No comments:
Post a Comment