Wednesday, 25 July 2012

Grab search results from Bing

Grab search results from Bing
This task uses a JavaScript sandbox with jsoup support to grab search results from Bing.
Grab search results from Bing
  1. Create a JavaScript sandbox with jsoup support.
  2. Create the following JavaScript:
javascript
1var g_env;
2
3function main(p_env, p_args) {
4 g_env = p_env;
5 g_env.info('Starting');
6 run();
7 g_env.info('Ending');
8}
9
10function run() {
11 try {
12 var query = 'lucene';
13 for (var pn = 1; pn <= 10; pn++) {
14 var res = grab(query, pn);
15 for (var i = 0; i < res.size(); i++) {
16 var it = res.get(i);
17 var title = it.get('title');
18 var link = it.get('link');
19 var no = (pn - 1) * 10 + i + 1;
20 g_env.info(no + ' | ' + title + ' | ' + link);
21 }
22 }
23 } catch (e) {
24 g_env.error(e);
25 }
26}
27
28function grab(query, pageno) {
29 var tag = g_env.newArrayList();
30 try {
31 var url = 'http://www.bing.com/search?q=' + g_env.encodeURL(query, 'UTF-8') + '&first=' + ((pageno - 1) * 10 + 1);
32 var conn = g_env.newJsoup().connect(url);
33 conn.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1');
34 conn.timeout(60000);
35 var doc = conn.get();
36 var nodes = doc.select('#results .sa_wr');
37 for (var i = 0; i < nodes.size(); i++) {
38 var node = nodes.get(i);
39 var child = node.select('.sa_cc .sa_mc .sb_tlst a').first();
40 var title = child.text();
41 var link = child.attr('href');
42 var it = g_env.newHashMap();
43 it.put('title', title);
44 it.put('link', link);
45 tag.add(it);
46 }
47 } catch (e) {
48 g_env.error(e);
49 }
50 return tag;
51}
// Environment object supplied by the host; assigned in main() and then used by
// run() and grab() for logging (info/error) and for the helper factories
// (newArrayList, newHashMap, newJsoup, encodeURL).
var g_env;

/**
 * Script entry point: publishes the environment, then runs the grab job
 * between a 'Starting' and an 'Ending' log line.
 *
 * @param p_env   environment object; stored in g_env and used for logging.
 * @param p_args  arguments handed over by the caller; not used by this script.
 */
function main(p_env, p_args) {
  // Store the environment before anything else: run() reaches it via g_env.
  g_env = p_env;

  p_env.info('Starting');
  run();
  g_env.info('Ending');
}

/**
 * Logs the Bing hits for the fixed query 'lucene', pages 1 through 10.
 *
 * Each hit is written through g_env.info as "<no> | <title> | <link>", where
 * <no> assumes ten hits per page (page 2 starts at 11, and so on). An
 * exception raised while iterating is handed to g_env.error and ends the run.
 */
function run() {
  try {
    var query = 'lucene';
    var page = 1;
    while (page <= 10) {
      var hits = grab(query, page);
      // Running number of the slot just before this page's first hit.
      var offset = (page - 1) * 10;
      for (var idx = 0; idx < hits.size(); idx++) {
        var hit = hits.get(idx);
        var title = hit.get('title');
        var link = hit.get('link');
        g_env.info((offset + idx + 1) + ' | ' + title + ' | ' + link);
      }
      page++;
    }
  } catch (err) {
    g_env.error(err);
  }
}

/**
 * Fetches one page of Bing results for a query and extracts the title and
 * link of every hit.
 *
 * Best effort: an exception raised while fetching or parsing is reported via
 * g_env.error and whatever was collected up to that point is returned.
 *
 * @param query   search terms; URL-encoded as UTF-8 before being sent.
 * @param pageno  1-based page number, mapped to Bing's `first` offset
 *                assuming ten results per page (page 1 -> 1, page 2 -> 11).
 * @return the list created by g_env.newArrayList(), holding one map (from
 *         g_env.newHashMap()) per hit with the keys 'title' and 'link'.
 */
function grab(query, pageno) {
  var tag = g_env.newArrayList();
  try {
    var url = 'http://www.bing.com/search?q=' + g_env.encodeURL(query, 'UTF-8') + '&first=' + ((pageno - 1) * 10 + 1);
    // NOTE(review): newJsoup() presumably exposes jsoup's connect(); confirm in the sandbox docs.
    var conn = g_env.newJsoup().connect(url);
    conn.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1');
    conn.timeout(60000);
    var doc = conn.get();
    var nodes = doc.select('#results .sa_wr');
    for (var i = 0; i < nodes.size(); i++) {
      var node = nodes.get(i);
      var child = node.select('.sa_cc .sa_mc .sb_tlst a').first();
      // first() yields null when the block contains no title anchor. Skip
      // such a block instead of dereferencing null: the resulting exception
      // would land in the catch below and drop every remaining hit of the page.
      if (child == null) {
        continue;
      }
      var title = child.text();
      var link = child.attr('href');
      var it = g_env.newHashMap();
      it.put('title', title);
      it.put('link', link);
      tag.add(it);
    }
  } catch (e) {
    g_env.error(e);
  }
  return tag;
}

  Protected by Copyscape Online Copyright Protection

No comments:

Post a Comment