Saturday, 14 April 2012

Grab google search results

Grab google search results
This task uses a JavaScript sandbox with jsoup support to grab Google search results.
Grab google search results
  1. Create a JavaScript sandbox with jsoup support
  2. Create the following JavaScript:
javascript
/**
 * Script entry point: runs a Google search for a fixed term with a fixed
 * result limit by delegating to searchGoogle.
 *
 * @param env  sandbox environment object, forwarded unchanged
 * @param args task arguments object, forwarded unchanged
 */
function main(env, args) {
  var resultLimit = 50;
  var searchTerm = 'lucene';
  searchGoogle(searchTerm, resultLimit, env, args);
}

/**
 * Requests result pages through searchGooglePage until the running total
 * reaches `max` or a page reports zero results, then removes the list
 * positions from (total - 1) down to `max` so that at most `max` remain.
 *
 * @param query search terms, handed to searchGooglePage as-is
 * @param max   maximum number of results to keep
 * @param env   sandbox environment object, forwarded unchanged
 * @param args  task arguments; args.get('links') is the list that gets trimmed
 */
function searchGoogle(query, max, env, args) {
  var collected = args.get('links');
  var fetched = 0;
  var pageSize;
  // The running total doubles as the start offset of the next page.
  for (; fetched < max; fetched += pageSize) {
    pageSize = searchGooglePage(query, fetched, env, args);
    if (pageSize == 0) {
      break;
    }
  }
  // The final page may overshoot max; drop the surplus from the tail so the
  // remaining indexes stay valid while removing.
  var index = fetched - 1;
  while (index >= max) {
    collected.remove(index);
    index--;
  }
}

/**
 * Fetches one page of web-search results for `query`, beginning at result
 * offset `start`, and appends one map per result to the list obtained from
 * args.get('links').
 *
 * Each appended map carries the keys 'url', 'title', 'titleNoFormatting',
 * 'visibleUrl', 'content' and 'cacheUrl'; the 'url', 'content' and
 * 'cacheUrl' values are passed through unescape() first.
 *
 * Any exception raised while building the request, fetching or parsing is
 * reported through env.error and the results appended so far are kept.
 *
 * @param query search terms; URL-encoded here before being sent
 * @param start zero-based offset of the first result to request
 * @param env   sandbox environment providing newURL, encodeURL, newJsoup,
 *              newJSONObject, newHashMap and error
 * @param args  task arguments; args.get('links') receives the results
 * @returns the number of results appended by this call (0 on an empty page
 *          or when the request failed before any result was added)
 */
function searchGooglePage(query, start, env, args) {
  var links = args.get('links');
  var count = 0;
  try {
    // Only the encoded query belongs in q=; anything concatenated after it
    // would silently change the search terms.
    var requestUrl = env.newURL(
      'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&rsz=large&safe=active&q=' +
        env.encodeURL(query, 'UTF-8') +
        '&start=' + start
    );
    // NOTE(review): 60000 is presumably a timeout in milliseconds - confirm
    // against the connection object returned by env.newJsoup().
    var conn = env.newJsoup().connect(requestUrl).timeout(60000);
    var body = conn.execute().body();
    var root = env.newJSONObject().fromObject(body);
    var resp = root.getJSONObject('responseData');
    var results = resp.getJSONArray('results');
    var total = results.size();
    for (var i = 0; i < total; i++) {
      var resItem = results.getJSONObject(i);
      // Read every field before creating the map so a result with a missing
      // field is skipped as a whole instead of being added half-filled.
      var resultUrl = unescape(resItem.getString('url'), env);
      var title = resItem.getString('title');
      var titleNoFormatting = resItem.getString('titleNoFormatting');
      var visibleUrl = resItem.getString('visibleUrl');
      var content = unescape(resItem.getString('content'), env);
      var cacheUrl = unescape(resItem.getString('cacheUrl'), env);
      var item = env.newHashMap();
      item.put('url', resultUrl);
      item.put('title', title);
      item.put('titleNoFormatting', titleNoFormatting);
      item.put('visibleUrl', visibleUrl);
      item.put('content', content);
      item.put('cacheUrl', cacheUrl);
      links.add(item);
      count++;
    }
  } catch (e) {
    env.error(e);
  }
  return count;
}

/**
 * Replaces the literal escape texts `\u003c`, `\u003e` and `\u003d`
 * (backslash, 'u', four hex digits, as they appear in raw response text)
 * with the characters '<', '>' and '='.
 *
 * The patterns are written with an escaped backslash on purpose: a bare
 * '\u003c' string literal is already the character '<' once parsed, so it
 * could never match the escape text and the replacement would do nothing.
 * split/join is used because it matches the pattern literally and replaces
 * every occurrence.
 *
 * NOTE: declaring this function shadows the legacy global unescape() for the
 * rest of this script.
 *
 * @param src text that may contain the literal escape sequences
 * @param env sandbox environment; env.newString(src) supplies the string
 * @returns a JavaScript string with the three escape sequences decoded
 */
function unescape(src, env) {
  // Appending '' turns whatever env.newString returns into a JS string.
  var text = env.newString(src) + '';
  var replacements = [
    ['\\u003c', '<'],
    ['\\u003e', '>'],
    ['\\u003d', '=']
  ];
  for (var i = 0; i < replacements.length; i++) {
    text = text.split(replacements[i][0]).join(replacements[i][1]);
  }
  return text;
}
    

  Protected by Copyscape Online Copyright Protection

No comments:

Post a Comment