Grab google search results
  This task uses a JavaScript sandbox with jsoup support to grab Google search results.
Grab google search results
  - Create a JavaScript sandbox with jsoup support
- Create the JavaScript as follows
    javascript
    
    
/**
 * Script entry point (presumably invoked by the hosting sandbox — confirm).
 *
 * Runs one search with a fixed term and a fixed cap on the number of
 * results, handing the host-provided objects straight through.
 *
 * @param env  host helper object, forwarded unchanged to searchGoogle
 * @param args host argument container, forwarded unchanged to searchGoogle
 */
function main(env, args) {
  // Both values are hard-coded for this example task.
  var resultLimit = 50;
  var searchTerm = 'lucene';
  searchGoogle(searchTerm, resultLimit, env, args);
}
/**
 * Collects search results page by page into the 'links' list held by args,
 * then trims the list so that this call contributes at most `max` entries.
 *
 * Paging stops as soon as a page adds nothing or at least `max` results
 * have been gathered. Because whole pages are appended, the last page may
 * overshoot `max`; the surplus is removed from the tail.
 *
 * @param query search term passed to searchGooglePage
 * @param max   maximum number of results this call may leave in the list
 * @param env   host helper object, forwarded to searchGooglePage
 * @param args  host argument container; args.get('links') must return a
 *              list offering size(), add(item) and remove(index)
 *              (presumably a java.util.List — confirm against the host)
 */
function searchGoogle(query, max, env, args) {
  var links = args.get('links');
  // Entries already present before this call must not shift the trim
  // indexes, so remember where this call's results begin.
  var base = links.size();
  var start = 0;
  while (start < max) {
    var count = searchGooglePage(query, start, env, args);
    if (count === 0) break;
    start += count;
  }
  // A negative limit must never reach into entries that pre-date this call.
  var keep = max > 0 ? max : 0;
  // Remove from the tail backwards so remaining indexes stay valid.
  for (var i = base + start - 1; i >= base + keep; i--) {
    links.remove(i);
  }
}
/**
 * Fetches one page of search results and appends each result to the
 * 'links' list held by args.
 *
 * The response body is parsed as JSON; the entries of
 * responseData.results are copied field by field into a map created with
 * env.newHashMap() and added to the list.
 *
 * Any exception raised while fetching or parsing is reported through
 * env.error and does not propagate; results appended before the failure
 * stay in the list and are reflected in the returned count.
 *
 * @param query search term; URL-encoded as UTF-8 before being sent
 * @param start offset of the first result to request
 * @param env   host helper object providing newURL, encodeURL, newJsoup,
 *              newJSONObject, newHashMap and error
 * @param args  host argument container; args.get('links') is the target list
 * @return number of results appended by this call
 */
function searchGooglePage(query, start, env, args) {
  var links = args.get('links');
  var count = 0;
  try {
    // Only the encoded query belongs in the q parameter; a stray literal
    // appended here would silently change every search.
    var requestUrl = env.newURL('http://ajax.googleapis.com/ajax/services/search/web?v=1.0&rsz=large&safe=active&q=' + env.encodeURL(query, 'UTF-8') + '&start=' + start);
    // Timeout value is presumably in milliseconds (jsoup convention) — confirm.
    var conn = env.newJsoup().connect(requestUrl).timeout(60000);
    var body = conn.execute().body();
    var root = env.newJSONObject().fromObject(body);
    var resp = root.getJSONObject('responseData');
    var results = resp.getJSONArray('results');
    for (var i = 0; i < results.size(); i++) {
      var resItem = results.getJSONObject(i);
      // Kept separate from requestUrl so the request URL is not overwritten.
      var resultUrl = unescape(resItem.getString('url'), env);
      var title = resItem.getString('title');
      var titleNoFormatting = resItem.getString('titleNoFormatting');
      var visibleUrl = resItem.getString('visibleUrl');
      var content = unescape(resItem.getString('content'), env);
      var cacheUrl = unescape(resItem.getString('cacheUrl'), env);
      var item = env.newHashMap();
      item.put('url', resultUrl);
      item.put('title', title);
      item.put('titleNoFormatting', titleNoFormatting);
      item.put('visibleUrl', visibleUrl);
      item.put('content', content);
      item.put('cacheUrl', cacheUrl);
      links.add(item);
      count++;
    }
  } catch (e) {
    env.error(e);
  }
  return count;
}
/**
 * Passes src through env.newString, applies three replaceAll
 * substitutions in order, and returns the outcome coerced to a
 * JavaScript string.
 *
 * NOTE(review): in JavaScript source the escapes '\u003c', '\u003e' and
 * '\u003d' already denote the characters '<', '>' and '=', so each
 * substitution swaps a character for itself and the text appears to come
 * back unchanged. If the intent was to decode literal backslash-u
 * sequences, the patterns need extra escaping — confirm against the type
 * env.newString returns before changing them.
 *
 * @param src text to process
 * @param env host helper object providing newString
 * @return the processed text as a JavaScript string
 */
function unescape(src, env) {
  // [pattern, replacement] pairs, applied top to bottom.
  var substitutions = [
    ['\u003c', '<'],
    ['\u003e', '>'],
    ['\u003d', '=']
  ];
  var text = env.newString(src);
  for (var k = 0; k < substitutions.length; k++) {
    text = text.replaceAll(substitutions[k][0], substitutions[k][1]);
  }
  // Concatenating with '' coerces a host string object to a JS string.
  return text + '';
}
    
   
 

 
No comments:
Post a Comment