Grab google search results
This task uses a JavaScript sandbox with jsoup support to grab Google search results.
Grab google search results
- Create a JavaScript sandbox with jsoup support
- Create the following JavaScript:
javascript
// NOTE: ES5 syntax (`var`, string concatenation) is kept on purpose; the
// script runs inside a host-provided sandbox whose engine is not visible
// here and may not support newer syntax.

/**
 * Sandbox entry point: searches for the fixed query 'lucene' and keeps at
 * most 50 results in the collection returned by `args.get('links')`.
 *
 * @param {Object} env  Host helper object; this script calls its newURL,
 *                      encodeURL, newJsoup, newJSONObject, newHashMap,
 *                      newString and error methods.
 * @param {Object} args Host argument container; `args.get('links')` must
 *                      return a collection exposing add(item) and
 *                      remove(index).
 */
function main(env, args) {
  var query = 'lucene';
  var max = 50;
  searchGoogle(query, max, env, args);
}

/**
 * Fetches result pages for `query` until at least `max` results have been
 * collected or a page yields nothing, then drops the surplus so that at
 * most `max` results remain.
 *
 * @param {string} query Search terms.
 * @param {number} max   Maximum number of results to keep.
 * @param {Object} env   Host helper object (see main).
 * @param {Object} args  Host argument container (see main).
 */
function searchGoogle(query, max, env, args) {
  var links = args.get('links');
  var start = 0;

  while (start < max) {
    var count = searchGooglePage(query, start, env, args);
    if (count === 0) {
      // Empty page or failed request: stop paging.
      break;
    }
    start += count;
  }

  // The last page may overshoot `max`; remove the extras from the tail.
  // NOTE(review): the indices assume `links` was empty when this function
  // started - confirm against the caller.
  for (var i = start - 1; i >= max; i--) {
    links.remove(i);
  }
}

/**
 * Requests one page of results beginning at offset `start` and appends a
 * map per result (url, title, titleNoFormatting, visibleUrl, content,
 * cacheUrl) to the 'links' collection.
 *
 * Any exception is reported through `env.error` and not rethrown; results
 * appended before the failure stay in the collection and are counted.
 *
 * @param {string} query Search terms; encoded with env.encodeURL.
 * @param {number} start Zero-based offset of the first result to request.
 * @param {Object} env   Host helper object (see main).
 * @param {Object} args  Host argument container (see main).
 * @returns {number} Number of results appended by this call.
 */
function searchGooglePage(query, start, env, args) {
  var links = args.get('links');
  var count = 0;

  try {
    // Fix: the original appended the literal 'lucene' after the encoded
    // query ('lucene&start='), which corrupted every search term.
    var requestUrl = env.newURL(
      'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&rsz=large&safe=active&q=' +
        env.encodeURL(query, 'UTF-8') +
        '&start=' +
        start
    );
    var conn = env.newJsoup().connect(requestUrl).timeout(60000);
    var body = conn.execute().body();

    var root = env.newJSONObject().fromObject(body);
    var resp = root.getJSONObject('responseData');
    var results = resp.getJSONArray('results');

    for (var i = 0; i < results.size(); i++) {
      var resItem = results.getJSONObject(i);

      var item = env.newHashMap();
      item.put('url', unescape(resItem.getString('url'), env));
      item.put('title', resItem.getString('title'));
      item.put('titleNoFormatting', resItem.getString('titleNoFormatting'));
      item.put('visibleUrl', resItem.getString('visibleUrl'));
      item.put('content', unescape(resItem.getString('content'), env));
      item.put('cacheUrl', unescape(resItem.getString('cacheUrl'), env));

      links.add(item);
      count++;
    }
  } catch (e) {
    env.error(e);
  }

  return count;
}

/**
 * Runs three replaceAll calls on a host string built from `src` and
 * returns the result coerced to a JavaScript string.
 *
 * NOTE(review): in JavaScript source the literals '\u003c', '\u003e' and
 * '\u003d' are the characters '<', '>' and '=' themselves, so as written
 * each call replaces a character with itself. Presumably the intent was to
 * decode escaped sequences; behaviour is left unchanged - confirm before
 * altering.
 *
 * @param {string} src Text to process.
 * @param {Object} env Host helper object; only newString is used.
 * @returns {string} The processed text as a JavaScript string.
 */
function unescape(src, env) {
  var tag = env.newString(src);
  tag = tag.replaceAll('\u003c', '<');
  tag = tag.replaceAll('\u003e', '>');
  tag = tag.replaceAll('\u003d', '=');
  return tag + '';
}
No comments:
Post a Comment