Wednesday, 25 July 2012

Grab search results from Yahoo

Grab search results from Yahoo
This task uses a JavaScript sandbox with jsoup support to grab search results from Yahoo.
Grab search results from Yahoo
  1. Create a JavaScript sandbox with jsoup support
  2. Create the JavaScript as follows:
javascript
1var g_env;
2
3function main(p_env, p_args) {
4 g_env = p_env;
5 g_env.info('Starting');
6 run();
7 g_env.info('Ending');
8}
9
10function run() {
11 try {
12 var query = 'lucene';
13 for (var pn = 1; pn <= 10; pn++) {
14 var res = grab(query, pn);
15 for (var i = 0; i < res.size(); i++) {
16 var it = res.get(i);
17 var title = it.get('title');
18 var link = it.get('link');
19 var no = (pn - 1) * 10 + i + 1;
20 g_env.info(no + ' | ' + title + ' | ' + link);
21 }
22 }
23 } catch (e) {
24 g_env.error(e);
25 }
26}
27
28function grab(query, pageno) {
29 var tag = g_env.newArrayList();
30 try {
31 var url = 'http://search.yahoo.com/search?p=' + g_env.encodeURL(query, 'UTF-8') + '&pstart=1&b=' + ((pageno - 1) * 10 + 1);
32 var conn = g_env.newJsoup().connect(url);
33 conn.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1');
34 conn.timeout(60000);
35 var doc = conn.get();
36 var nodes = doc.select('#web .res');
37 for (var i = 0; i < nodes.size(); i++) {
38 var node = nodes.get(i);
39 var child = node.select('a.yschttl').first();
40 var title = child.text();
41 var link = child.attr('href');
42 var pos = link.indexOf('**');
43 if (pos >= 0) {
44 link = link.substring(pos + 2);
45 link = g_env.decodeURL(link, 'UTF-8');
46 }
47 var it = g_env.newHashMap();
48 it.put('title', title);
49 it.put('link', link);
50 tag.add(it);
51 }
52 } catch (e) {
53 g_env.error(e);
54 }
55 return tag;
56}
// Sandbox environment object: assigned by main() and read by run() and grab()
// for logging, URL encoding/decoding, collections and jsoup access.
var g_env;

/**
 * Script entry point.
 *
 * Stores the supplied environment in the shared g_env variable, then runs the
 * job between a 'Starting' and an 'Ending' info log line.
 *
 * @param p_env  environment object; must provide info() here, and is used by
 *               run()/grab() through g_env
 * @param p_args arguments passed to the script; not used by this function
 */
function main(p_env, p_args) {
  // Publish the environment before anything else: run() reaches it via g_env.
  g_env = p_env;

  g_env.info('Starting');
  run();
  g_env.info('Ending');
}

/**
 * Fetches result pages 1 through 10 for the fixed query 'lucene' via grab()
 * and writes one info line per hit in the form "<rank> | <title> | <link>".
 *
 * The rank is derived from the page number assuming 10 hits per page, so a
 * page that yields fewer hits leaves a gap in the numbering.
 * Any exception is reported through g_env.error() and ends the run.
 */
function run() {
  try {
    var searchTerm = 'lucene';
    var page = 1;
    while (page <= 10) {
      var hits = grab(searchTerm, page);
      // Rank of the first hit on this page (1, 11, 21, ...).
      var firstRank = (page - 1) * 10 + 1;
      for (var idx = 0; idx < hits.size(); idx++) {
        var entry = hits.get(idx);
        var heading = entry.get('title');
        var target = entry.get('link');
        var rank = firstRank + idx;
        g_env.info(rank + ' | ' + heading + ' | ' + target);
      }
      page++;
    }
  } catch (err) {
    g_env.error(err);
  }
}

/**
 * Fetches one page of Yahoo web search results and extracts title/link pairs.
 *
 * @param query  search phrase; URL-encoded as UTF-8 before being placed in the
 *               request URL
 * @param pageno 1-based page number; turned into the `b` URL parameter as
 *               (pageno - 1) * 10 + 1, i.e. 10 results per page are assumed
 * @return the list created by g_env.newArrayList(), holding one map (from
 *         g_env.newHashMap()) per result with the keys 'title' and 'link'.
 *         If an exception occurs it is reported via g_env.error() and the
 *         results collected up to that point are returned.
 */
function grab(query, pageno) {
  var tag = g_env.newArrayList();
  try {
    var url = 'http://search.yahoo.com/search?p=' + g_env.encodeURL(query, 'UTF-8') + '&pstart=1&b=' + ((pageno - 1) * 10 + 1);
    var conn = g_env.newJsoup().connect(url);
    conn.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1');
    // NOTE(review): presumably milliseconds, as in jsoup's Connection.timeout - confirm for the sandbox wrapper.
    conn.timeout(60000);
    var doc = conn.get();
    var nodes = doc.select('#web .res');
    for (var i = 0; i < nodes.size(); i++) {
      var node = nodes.get(i);
      var child = node.select('a.yschttl').first();
      if (child == null) {
        // first() yields null when the container has no title anchor.
        // Skip only this entry; dereferencing it would throw and the outer
        // catch would discard every remaining result on the page.
        continue;
      }
      var title = child.text();
      var link = child.attr('href');
      // When the href contains '**', the part after it is taken as the
      // URL-encoded target address (looks like a redirect wrapper - confirm).
      var pos = link.indexOf('**');
      if (pos >= 0) {
        link = link.substring(pos + 2);
        link = g_env.decodeURL(link, 'UTF-8');
      }
      var it = g_env.newHashMap();
      it.put('title', title);
      it.put('link', link);
      tag.add(it);
    }
  } catch (e) {
    g_env.error(e);
  }
  return tag;
}

  Protected by Copyscape Online Copyright Protection

No comments:

Post a Comment