Grab search results from Yahoo
Grab search results from Yahoo
- Create a JavaScript sandbox with jsoup support.
- Create the following JavaScript script:
javascript
/**
 * Sandbox environment object handed to main(). run() and grab() reach
 * logging (info/error), URL encoding/decoding, the list/map factories and
 * jsoup through it.
 */
var g_env;

/**
 * Script entry point: stores the environment, then runs the search and logs
 * 'Starting' / 'Ending' around it.
 *
 * @param p_env  sandbox environment object; kept in g_env for run() and grab()
 * @param p_args script arguments (not used by this script)
 */
function main(p_env, p_args) {
  g_env = p_env;
  g_env.info('Starting');
  run();
  g_env.info('Ending');
}

/**
 * Fetches result pages 1..10 for the query 'lucene' and logs every hit as
 * "<running number> | <title> | <link>".
 * Any exception is reported through g_env.error() instead of being rethrown.
 */
function run() {
  try {
    var query = 'lucene';
    for (var pn = 1; pn <= 10; pn++) {
      var res = grab(query, pn);
      for (var i = 0; i < res.size(); i++) {
        var it = res.get(i);
        var title = it.get('title');
        var link = it.get('link');
        // Running number across pages, reserving 10 numbers per page.
        var no = (pn - 1) * 10 + i + 1;
        g_env.info(no + ' | ' + title + ' | ' + link);
      }
    }
  } catch (e) {
    g_env.error(e);
  }
}

/**
 * Downloads one Yahoo result page and extracts a title/link pair per hit.
 *
 * @param query  search terms; passed through g_env.encodeURL(query, 'UTF-8')
 * @param pageno 1-based page number; mapped to the 'b' start offset 1, 11, 21, ...
 * @returns the list created by g_env.newArrayList(), holding one map per hit
 *          with the keys 'title' and 'link'. Errors are logged through
 *          g_env.error() rather than thrown, so the list holds whatever was
 *          collected before a failure (possibly nothing).
 */
function grab(query, pageno) {
  var tag = g_env.newArrayList();
  try {
    var url = 'http://search.yahoo.com/search?p=' + g_env.encodeURL(query, 'UTF-8') + '&pstart=1&b=' + ((pageno - 1) * 10 + 1);
    var conn = g_env.newJsoup().connect(url);
    conn.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1');
    conn.timeout(60000);
    var doc = conn.get();
    var nodes = doc.select('#web .res');
    for (var i = 0; i < nodes.size(); i++) {
      var child = nodes.get(i).select('a.yschttl').first();
      // first() yields null when the result block has no title anchor.
      // Skip such blocks: calling child.text() on null would throw and the
      // outer catch would drop every remaining hit of this page.
      if (child == null) {
        continue;
      }
      var title = child.text();
      var link = child.attr('href');
      // When the href contains '**', keep only the part after it and
      // URL-decode it (presumably Yahoo's redirect wrapper around the real
      // target - confirm against live markup).
      var pos = link.indexOf('**');
      if (pos >= 0) {
        link = link.substring(pos + 2);
        link = g_env.decodeURL(link, 'UTF-8');
      }
      var it = g_env.newHashMap();
      it.put('title', title);
      it.put('link', link);
      tag.add(it);
    }
  } catch (e) {
    g_env.error(e);
  }
  return tag;
}
/**
 * Sandbox environment object handed to main(). run() and grab() reach
 * logging (info/error), URL encoding/decoding, the list/map factories and
 * jsoup through it.
 */
var g_env;

/**
 * Script entry point: stores the environment, then runs the search and logs
 * 'Starting' / 'Ending' around it.
 *
 * @param p_env  sandbox environment object; kept in g_env for run() and grab()
 * @param p_args script arguments (not used by this script)
 */
function main(p_env, p_args) {
  g_env = p_env;
  g_env.info('Starting');
  run();
  g_env.info('Ending');
}

/**
 * Fetches result pages 1..10 for the query 'lucene' and logs every hit as
 * "<running number> | <title> | <link>".
 * Any exception is reported through g_env.error() instead of being rethrown.
 */
function run() {
  try {
    var query = 'lucene';
    for (var pn = 1; pn <= 10; pn++) {
      var res = grab(query, pn);
      for (var i = 0; i < res.size(); i++) {
        var it = res.get(i);
        var title = it.get('title');
        var link = it.get('link');
        // Running number across pages, reserving 10 numbers per page.
        var no = (pn - 1) * 10 + i + 1;
        g_env.info(no + ' | ' + title + ' | ' + link);
      }
    }
  } catch (e) {
    g_env.error(e);
  }
}

/**
 * Downloads one Yahoo result page and extracts a title/link pair per hit.
 *
 * @param query  search terms; passed through g_env.encodeURL(query, 'UTF-8')
 * @param pageno 1-based page number; mapped to the 'b' start offset 1, 11, 21, ...
 * @returns the list created by g_env.newArrayList(), holding one map per hit
 *          with the keys 'title' and 'link'. Errors are logged through
 *          g_env.error() rather than thrown, so the list holds whatever was
 *          collected before a failure (possibly nothing).
 */
function grab(query, pageno) {
  var tag = g_env.newArrayList();
  try {
    var url = 'http://search.yahoo.com/search?p=' + g_env.encodeURL(query, 'UTF-8') + '&pstart=1&b=' + ((pageno - 1) * 10 + 1);
    var conn = g_env.newJsoup().connect(url);
    conn.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1');
    conn.timeout(60000);
    var doc = conn.get();
    var nodes = doc.select('#web .res');
    for (var i = 0; i < nodes.size(); i++) {
      var child = nodes.get(i).select('a.yschttl').first();
      // first() yields null when the result block has no title anchor.
      // Skip such blocks: calling child.text() on null would throw and the
      // outer catch would drop every remaining hit of this page.
      if (child == null) {
        continue;
      }
      var title = child.text();
      var link = child.attr('href');
      // When the href contains '**', keep only the part after it and
      // URL-decode it (presumably Yahoo's redirect wrapper around the real
      // target - confirm against live markup).
      var pos = link.indexOf('**');
      if (pos >= 0) {
        link = link.substring(pos + 2);
        link = g_env.decodeURL(link, 'UTF-8');
      }
      var it = g_env.newHashMap();
      it.put('title', title);
      it.put('link', link);
      tag.add(it);
    }
  } catch (e) {
    g_env.error(e);
  }
  return tag;
}