Grab google search results
This task uses a JavaScript sandbox with jsoup support to grab Google search results.
Grab google search results
- Create a JavaScript sandbox with jsoup support
- Create the following JavaScript:
javascript
/**
 * Sandbox entry point: runs a Google web search for the fixed query
 * 'lucene', asking for at most 50 results.
 *
 * @param env  sandbox helper object, handed through to searchGoogle
 * @param args task arguments, handed through to searchGoogle
 */
function main(env, args) {
    searchGoogle('lucene', 50, env, args);
}
6 | |
/**
 * Fetches result pages for `query` until at least `max` results have been
 * appended to the 'links' collection (or a page comes back empty), then
 * drops any surplus so that this call contributes at most `max` entries.
 *
 * @param query search terms
 * @param max   maximum number of results this call may leave in 'links'
 * @param env   sandbox helper object, handed through to searchGooglePage
 * @param args  task arguments; args.get('links') must return the collection
 *              results are appended to (used here via size() and remove(index);
 *              presumably a java.util.List - confirm against the sandbox)
 */
function searchGoogle(query, max, env, args) {
    var links = args.get('links');
    // Entries that were already present before this call. Trimming must be
    // relative to this offset, otherwise pre-existing content shifts the
    // indices and the wrong entries get removed.
    var base = links.size();
    var fetched = 0;
    while (fetched < max) {
        var count = searchGooglePage(query, fetched, env, args);
        // An empty page means no more results (or a failed request): stop.
        if (count === 0) break;
        fetched += count;
    }
    // The last page may overshoot `max`; remove the surplus from the tail,
    // highest index first so the remaining indices stay valid.
    for (var i = base + fetched - 1; i >= base + max; i--) {
        links.remove(i);
    }
}
19 | |
/**
 * Requests one page of results from the Google AJAX web-search endpoint and
 * appends one map per hit to the 'links' collection.
 *
 * Each appended map carries the keys 'url', 'title', 'titleNoFormatting',
 * 'visibleUrl', 'content' and 'cacheUrl'.
 *
 * Any exception raised while fetching or parsing is reported through
 * env.error and ends the page early; hits appended before the failure are
 * kept and counted.
 *
 * @param query search terms (URL-encoded here via env.encodeURL)
 * @param start offset of the first result to request
 * @param env   sandbox helper object providing newURL, encodeURL, newJsoup,
 *              newJSONObject, newHashMap and error
 * @param args  task arguments; args.get('links') is the target collection
 * @returns number of hits appended by this call
 */
function searchGooglePage(query, start, env, args) {
    var links = args.get('links');
    var count = 0;
    try {
        // The encoded query is followed directly by the start parameter.
        // (A stray literal 'lucene' used to be appended to every query here.)
        var requestUrl = env.newURL('http://ajax.googleapis.com/ajax/services/search/web?v=1.0&rsz=large&safe=active&q=' + env.encodeURL(query, 'UTF-8') + '&start=' + start);
        var conn = env.newJsoup().connect(requestUrl).timeout(60000);
        var body = conn.execute().body();
        var root = env.newJSONObject().fromObject(body);
        var resp = root.getJSONObject('responseData');
        var results = resp.getJSONArray('results');
        for (var i = 0; i < results.size(); i++) {
            var resItem = results.getJSONObject(i);
            // Read every field before building the map so that a failing
            // field leaves no half-filled entry behind.
            var resultUrl = unescape(resItem.getString('url'), env);
            var title = resItem.getString('title');
            var titleNoFormatting = resItem.getString('titleNoFormatting');
            var visibleUrl = resItem.getString('visibleUrl');
            var content = unescape(resItem.getString('content'), env);
            var cacheUrl = unescape(resItem.getString('cacheUrl'), env);
            var item = env.newHashMap();
            item.put('url', resultUrl);
            item.put('title', title);
            item.put('titleNoFormatting', titleNoFormatting);
            item.put('visibleUrl', visibleUrl);
            item.put('content', content);
            item.put('cacheUrl', cacheUrl);
            links.add(item);
            count++;
        }
    } catch (e) {
        env.error(e);
    }
    return count;
}
53 | |
/**
 * Turns the literal escape sequences \u003c, \u003e and \u003d (backslash,
 * 'u', four hex digits) that may be left in a result field into the
 * characters '<', '>' and '='.
 *
 * NOTE: this deliberately shares its name with the built-in global
 * unescape(); callers in this script rely on this two-argument version.
 *
 * @param src value to clean up
 * @param env sandbox helper object; env.newString(src) is used to obtain the
 *            string form of src
 * @returns the cleaned-up text as a JavaScript string
 */
function unescape(src, env) {
    // Concatenating with '' yields a native JavaScript string, so the
    // replacements below are plain literal substitutions rather than the
    // regex-based replaceAll of the wrapped string type.
    var text = env.newString(src) + '';
    // The backslash is doubled on purpose: a bare '\u003c' in JavaScript
    // source already IS '<', which would make the replacement a no-op.
    text = text.split('\\u003c').join('<');
    text = text.split('\\u003e').join('>');
    text = text.split('\\u003d').join('=');
    return text;
}
/**
 * Sandbox entry point: runs a Google web search for the fixed query
 * 'lucene', asking for at most 50 results.
 *
 * @param env  sandbox helper object, handed through to searchGoogle
 * @param args task arguments, handed through to searchGoogle
 */
function main(env, args) {
    var query = 'lucene';
    var max = 50;
    searchGoogle(query, max, env, args);
}

/**
 * Fetches result pages for `query` until at least `max` results have been
 * appended to the 'links' collection (or a page comes back empty), then
 * drops any surplus so that this call contributes at most `max` entries.
 *
 * @param query search terms
 * @param max   maximum number of results this call may leave in 'links'
 * @param env   sandbox helper object, handed through to searchGooglePage
 * @param args  task arguments; args.get('links') must return the collection
 *              results are appended to (used here via size() and remove(index);
 *              presumably a java.util.List - confirm against the sandbox)
 */
function searchGoogle(query, max, env, args) {
    var links = args.get('links');
    // Entries that were already present before this call. Trimming must be
    // relative to this offset, otherwise pre-existing content shifts the
    // indices and the wrong entries get removed.
    var base = links.size();
    var fetched = 0;
    while (fetched < max) {
        var count = searchGooglePage(query, fetched, env, args);
        // An empty page means no more results (or a failed request): stop.
        if (count === 0) break;
        fetched += count;
    }
    // The last page may overshoot `max`; remove the surplus from the tail,
    // highest index first so the remaining indices stay valid.
    for (var i = base + fetched - 1; i >= base + max; i--) {
        links.remove(i);
    }
}

/**
 * Requests one page of results from the Google AJAX web-search endpoint and
 * appends one map per hit to the 'links' collection.
 *
 * Each appended map carries the keys 'url', 'title', 'titleNoFormatting',
 * 'visibleUrl', 'content' and 'cacheUrl'.
 *
 * Any exception raised while fetching or parsing is reported through
 * env.error and ends the page early; hits appended before the failure are
 * kept and counted.
 *
 * @param query search terms (URL-encoded here via env.encodeURL)
 * @param start offset of the first result to request
 * @param env   sandbox helper object providing newURL, encodeURL, newJsoup,
 *              newJSONObject, newHashMap and error
 * @param args  task arguments; args.get('links') is the target collection
 * @returns number of hits appended by this call
 */
function searchGooglePage(query, start, env, args) {
    var links = args.get('links');
    var count = 0;
    try {
        // The encoded query is followed directly by the start parameter.
        // (A stray literal 'lucene' used to be appended to every query here.)
        var requestUrl = env.newURL('http://ajax.googleapis.com/ajax/services/search/web?v=1.0&rsz=large&safe=active&q=' + env.encodeURL(query, 'UTF-8') + '&start=' + start);
        var conn = env.newJsoup().connect(requestUrl).timeout(60000);
        var body = conn.execute().body();
        var root = env.newJSONObject().fromObject(body);
        var resp = root.getJSONObject('responseData');
        var results = resp.getJSONArray('results');
        for (var i = 0; i < results.size(); i++) {
            var resItem = results.getJSONObject(i);
            // Read every field before building the map so that a failing
            // field leaves no half-filled entry behind.
            var resultUrl = unescape(resItem.getString('url'), env);
            var title = resItem.getString('title');
            var titleNoFormatting = resItem.getString('titleNoFormatting');
            var visibleUrl = resItem.getString('visibleUrl');
            var content = unescape(resItem.getString('content'), env);
            var cacheUrl = unescape(resItem.getString('cacheUrl'), env);
            var item = env.newHashMap();
            item.put('url', resultUrl);
            item.put('title', title);
            item.put('titleNoFormatting', titleNoFormatting);
            item.put('visibleUrl', visibleUrl);
            item.put('content', content);
            item.put('cacheUrl', cacheUrl);
            links.add(item);
            count++;
        }
    } catch (e) {
        env.error(e);
    }
    return count;
}

/**
 * Turns the literal escape sequences \u003c, \u003e and \u003d (backslash,
 * 'u', four hex digits) that may be left in a result field into the
 * characters '<', '>' and '='.
 *
 * NOTE: this deliberately shares its name with the built-in global
 * unescape(); callers in this script rely on this two-argument version.
 *
 * @param src value to clean up
 * @param env sandbox helper object; env.newString(src) is used to obtain the
 *            string form of src
 * @returns the cleaned-up text as a JavaScript string
 */
function unescape(src, env) {
    // Concatenating with '' yields a native JavaScript string, so the
    // replacements below are plain literal substitutions rather than the
    // regex-based replaceAll of the wrapped string type.
    var text = env.newString(src) + '';
    // The backslash is doubled on purpose: a bare '\u003c' in JavaScript
    // source already IS '<', which would make the replacement a no-op.
    text = text.split('\\u003c').join('<');
    text = text.split('\\u003e').join('>');
    text = text.split('\\u003d').join('=');
    return text;
}
No comments:
Post a Comment