Wednesday 25 July 2012

Grab search results from Yahoo

Grab search results from Yahoo
This task uses a JavaScript sandbox with jsoup support to grab search results from Yahoo.
Grab search results from Yahoo
  1. Create a JavaScript sandbox with jsoup support
  2. Create the following JavaScript
javascript
// Shared handle to the sandbox environment object. main() stores its p_env
// argument here so that run() and grab() can call the environment's helpers
// (info, error, newArrayList, newHashMap, newJsoup, encodeURL, decodeURL).
var g_env;

/**
 * Script entry point.
 *
 * Publishes the environment object through the g_env global, then runs the
 * grabbing job between a 'Starting' and an 'Ending' info message.
 *
 * @param p_env  environment object; only its info() method is used directly here
 * @param p_args arguments passed to the script (not used by this function)
 */
function main(p_env, p_args) {
  // The global must be set before run() is called: run() and grab() only
  // reach the environment through g_env.
  g_env = p_env;

  p_env.info('Starting');
  run();
  g_env.info('Ending');
}

/**
 * Grabs result pages 1 through 10 for the fixed query 'lucene' and writes one
 * info line per result in the form "<number> | <title> | <link>".
 *
 * The running number is derived from the page number assuming ten results per
 * page, so the first entry of page 3 is reported as 21 regardless of how many
 * entries earlier pages actually returned.
 *
 * Any exception raised while iterating is passed to g_env.error() and ends
 * the run; nothing is rethrown.
 */
function run() {
  var keyword = 'lucene';
  var lastPage = 10;
  var perPage = 10;

  try {
    var page = 1;
    while (page <= lastPage) {
      var hits = grab(keyword, page);
      // Number of result slots that precede this page.
      var offset = (page - 1) * perPage;

      var idx = 0;
      while (idx < hits.size()) {
        var hit = hits.get(idx);
        var heading = hit.get('title');
        var target = hit.get('link');
        var rank = offset + idx + 1;
        g_env.info(rank + ' | ' + heading + ' | ' + target);
        idx++;
      }

      page++;
    }
  } catch (err) {
    g_env.error(err);
  }
}

/**
 * Fetches one page of Yahoo web search results and extracts the title and
 * link of every result entry.
 *
 * @param query  search phrase; URL-encoded as UTF-8 before being sent
 * @param pageno 1-based page number; converted to the 'b' start offset
 *               ((pageno - 1) * 10 + 1) of the request
 * @returns the list created by g_env.newArrayList(), holding one map (from
 *          g_env.newHashMap()) per result with the keys 'title' and 'link'.
 *          If fetching or parsing fails, the error is reported through
 *          g_env.error() and the entries collected so far are returned.
 */
function grab(query, pageno) {
  var tag = g_env.newArrayList();
  try {
    var url = 'http://search.yahoo.com/search?p=' + g_env.encodeURL(query, 'UTF-8') + '&pstart=1&b=' + ((pageno - 1) * 10 + 1);
    var conn = g_env.newJsoup().connect(url);
    conn.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1');
    conn.timeout(60000);
    var doc = conn.get();
    var nodes = doc.select('#web .res');
    for (var i = 0; i < nodes.size(); i++) {
      var node = nodes.get(i);
      var child = node.select('a.yschttl').first();
      if (child == null) {
        // first() yields null when the entry has no title anchor. Skip that
        // entry; dereferencing it would throw and, through the outer catch,
        // discard every remaining result on this page.
        continue;
      }
      var title = child.text();
      var link = child.attr('href');
      // NOTE(review): hrefs containing '**' look like redirect URLs that carry
      // the URL-encoded target after the marker; keep only the decoded target.
      var pos = link.indexOf('**');
      if (pos >= 0) {
        link = link.substring(pos + 2);
        link = g_env.decodeURL(link, 'UTF-8');
      }
      var it = g_env.newHashMap();
      it.put('title', title);
      it.put('link', link);
      tag.add(it);
    }
  } catch (e) {
    g_env.error(e);
  }
  return tag;
}
    

  Protected by Copyscape Online Copyright Protection

No comments:

Post a Comment