Saturday, 21 July 2012

Grab book/journal from ScienceDirect

Grab book/journal from ScienceDirect
This task uses a JavaScript sandbox with jsoup and Lucene support to grab books/journals from ScienceDirect.
Grab book/journal from ScienceDirect
  1. Create javascript sandbox with jsoup support
  2. Add Lucene support to javascript sandbox
  3. Create the following JavaScript:
javascript
1var g_site = 'sciencedirect.com';
2var g_env;
3
4function main(p_env, p_args) {
5 g_env = p_env;
6 run();
7}
8
9function newEntity() {
10 return g_env.newEntity();
11}
12
13function loadUrl(url) {
14 var conn = g_env.newJsoup().connect(url);
15 conn.userAgent('Mozilla/5.0 (Windows NT x.y; rv:10.0.1) Gecko/20100101 Firefox/10.0.1');
16 conn.timeout(60000);
17 return conn.get();
18}
19
20function run() {
21 g_env.info('Starting');
22 grabCategory('http://www.sciencedirect.com/science/browse/sub/physicalsciences');
23 grabCategory('http://www.sciencedirect.com/science/browse/sub/chemicaleng');
24 grabCategory('http://www.sciencedirect.com/science/browse/sub/chemistry');
25 grabCategory('http://www.sciencedirect.com/science/browse/sub/computerscience');
26 grabCategory('http://www.sciencedirect.com/science/browse/sub/earth');
27 grabCategory('http://www.sciencedirect.com/science/browse/sub/energy');
28 grabCategory('http://www.sciencedirect.com/science/browse/sub/engineering');
29 grabCategory('http://www.sciencedirect.com/science/browse/sub/materialsscience');
30 grabCategory('http://www.sciencedirect.com/science/browse/sub/mathematics');
31 grabCategory('http://www.sciencedirect.com/science/browse/sub/physics');
32 grabCategory('http://www.sciencedirect.com/science/browse/sub/lifesciences');
33 grabCategory('http://www.sciencedirect.com/science/browse/sub/agribio');
34 grabCategory('http://www.sciencedirect.com/science/browse/sub/biochemgenmolbiol');
35 grabCategory('http://www.sciencedirect.com/science/browse/sub/environmental');
36 grabCategory('http://www.sciencedirect.com/science/browse/sub/immunolmicrobiol');
37 grabCategory('http://www.sciencedirect.com/science/browse/sub/neuroscience');
38 grabCategory('http://www.sciencedirect.com/science/browse/sub/healthsciences');
39 grabCategory('http://www.sciencedirect.com/science/browse/sub/medicinedentistry');
40 grabCategory('http://www.sciencedirect.com/science/browse/sub/nursinghealth');
41 grabCategory('http://www.sciencedirect.com/science/browse/sub/pharmatox');
42 grabCategory('http://www.sciencedirect.com/science/browse/sub/vetscimed');
43 grabCategory('http://www.sciencedirect.com/science/browse/sub/socialscienceshumanities');
44 grabCategory('http://www.sciencedirect.com/science/browse/sub/artsandhumanities');
45 grabCategory('http://www.sciencedirect.com/science/browse/sub/busmanacc');
46 grabCategory('http://www.sciencedirect.com/science/browse/sub/decisionsciences');
47 grabCategory('http://www.sciencedirect.com/science/browse/sub/economics');
48 grabCategory('http://www.sciencedirect.com/science/browse/sub/psychology');
49 grabCategory('http://www.sciencedirect.com/science/browse/sub/socialsciences');
50 g_env.info('Ending');
51}
52
53function grabCategory(cat) {
54 try {
55 var pages = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0-9'];
56 for (var pn = 0; pn < pages.length; pn++) {
57 try {
58 var url = cat + '/' + pages[pn];
59 var doc = loadUrl(url);
60 var rows = doc.select('#content_browseimp tr.browseimpBrowseRow');
61 for (var i = 0; i < rows.size(); i++) {
62 var row = rows.get(i);
63 var title = g_env.newString('');
64 var link = g_env.newString('');
65 var kind = g_env.newString('');
66 var child = row.select('.browseColFirst a').first();
67 if (child != null) {
68 title = child.text();
69 link = child.attr('href');
70 link = g_env.newString(g_env.newURL(g_env.newURL(url), link) + '');
71 }
72 var child = row.select('.browseColFourth').first();
73 if (child != null) {
74 kind = child.text().trim();
75 }
76 if (title.length() > 0 && link.length() > 0 && kind.length() > 0) {
77 saveTitle(title, link, kind);
78 }
79 }
80 } catch (e) {
81 g_env.error(e);
82 }
83 }
84 } catch (e) {
85 g_env.error(e);
86 }
87}
88
89function saveTitle(title, link, kind) {
90 if (findTitleByLink(link) != null) return;
91 var schema = 's|link|s|title|s|kind';
92 var entity = newEntity();
93 entity.setSchema(schema);
94 entity.setKind(g_site + '_Title');
95 entity.setId(g_env.uniqid());
96 entity.setString('link', link);
97 entity.setString('title', title);
98 entity.setString('kind', kind);
99 entity.save();
100 g_env.info(kind + ' | ' + title + ' | ' + link);
101}
102
103function findTitleByLink(link) {
104 var pat = newEntity();
105 var res = pat.search(g_site + '_Title', pat.newTermQuery(pat.newTerm('link', link)), 1);
106 if (res.size() == 0) return null;
107 return res.get(0);
108}
// Site identifier; used as the prefix of the entity kind name (g_site + '_Title').
var g_site = 'sciencedirect.com';
// Environment object handed to main(); the other functions in this script reach
// logging, string/URL construction, jsoup and entity creation through it.
var g_env;

/**
 * Script entry point: stores the environment object and starts the crawl.
 *
 * @param p_env  environment object supplied by the caller; saved in the global g_env.
 * @param p_args not used by this script.
 */
function main(p_env, p_args) {
  g_env = p_env; // make the environment reachable from the other functions
  run();
}

/**
 * Convenience wrapper around g_env.newEntity().
 *
 * @returns whatever g_env.newEntity() returns; this script uses the result both
 *          to save records and to run searches.
 */
function newEntity() {
  return g_env.newEntity();
}

/**
 * Fetches a page through the sandbox's jsoup helper and returns the parsed result.
 *
 * The request is sent with a fixed Firefox-like User-Agent header and a timeout
 * value of 60000 (milliseconds, if the connection object is jsoup's Connection —
 * TODO confirm against the sandbox wrapper).
 *
 * @param url address of the page to fetch.
 * @returns whatever the connection's get() returns (callers use it as a document
 *          that supports select()).
 */
function loadUrl(url) {
  var browserUserAgent = 'Mozilla/5.0 (Windows NT x.y; rv:10.0.1) Gecko/20100101 Firefox/10.0.1';
  var requestTimeout = 60000;

  var connection = g_env.newJsoup().connect(url);
  // Configure step by step rather than chaining: the wrapper's return values
  // are not visible here, so chaining is not assumed to be supported.
  connection.userAgent(browserUserAgent);
  connection.timeout(requestTimeout);

  var page = connection.get();
  return page;
}

/**
 * Runs the whole crawl: logs 'Starting', calls grabCategory() once for each
 * ScienceDirect subject-browse URL listed below (in list order), then logs 'Ending'.
 */
function run() {
  // Order matters only in that it is the order in which categories are crawled.
  var categoryUrls = [
    'http://www.sciencedirect.com/science/browse/sub/physicalsciences',
    'http://www.sciencedirect.com/science/browse/sub/chemicaleng',
    'http://www.sciencedirect.com/science/browse/sub/chemistry',
    'http://www.sciencedirect.com/science/browse/sub/computerscience',
    'http://www.sciencedirect.com/science/browse/sub/earth',
    'http://www.sciencedirect.com/science/browse/sub/energy',
    'http://www.sciencedirect.com/science/browse/sub/engineering',
    'http://www.sciencedirect.com/science/browse/sub/materialsscience',
    'http://www.sciencedirect.com/science/browse/sub/mathematics',
    'http://www.sciencedirect.com/science/browse/sub/physics',
    'http://www.sciencedirect.com/science/browse/sub/lifesciences',
    'http://www.sciencedirect.com/science/browse/sub/agribio',
    'http://www.sciencedirect.com/science/browse/sub/biochemgenmolbiol',
    'http://www.sciencedirect.com/science/browse/sub/environmental',
    'http://www.sciencedirect.com/science/browse/sub/immunolmicrobiol',
    'http://www.sciencedirect.com/science/browse/sub/neuroscience',
    'http://www.sciencedirect.com/science/browse/sub/healthsciences',
    'http://www.sciencedirect.com/science/browse/sub/medicinedentistry',
    'http://www.sciencedirect.com/science/browse/sub/nursinghealth',
    'http://www.sciencedirect.com/science/browse/sub/pharmatox',
    'http://www.sciencedirect.com/science/browse/sub/vetscimed',
    'http://www.sciencedirect.com/science/browse/sub/socialscienceshumanities',
    'http://www.sciencedirect.com/science/browse/sub/artsandhumanities',
    'http://www.sciencedirect.com/science/browse/sub/busmanacc',
    'http://www.sciencedirect.com/science/browse/sub/decisionsciences',
    'http://www.sciencedirect.com/science/browse/sub/economics',
    'http://www.sciencedirect.com/science/browse/sub/psychology',
    'http://www.sciencedirect.com/science/browse/sub/socialsciences'
  ];

  g_env.info('Starting');
  for (var idx = 0; idx < categoryUrls.length; idx++) {
    grabCategory(categoryUrls[idx]);
  }
  g_env.info('Ending');
}

/**
 * Grabs every title listed under one ScienceDirect subject category.
 *
 * Loads the 27 browse pages of the category (cat + '/a' ... cat + '/z' and
 * cat + '/0-9'), walks the rows matching
 * '#content_browseimp tr.browseimpBrowseRow' and passes each complete row
 * (title, link and kind all non-empty) to saveTitle().
 *
 * Any error raised while handling a page is logged with g_env.error() and the
 * loop moves on to the next page; an error inside a row therefore skips the
 * remaining rows of that page only.
 *
 * @param cat base URL of the category, without a trailing slash.
 */
function grabCategory(cat) {
  try {
    var pages = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0-9'];
    for (var pn = 0; pn < pages.length; pn++) {
      try {
        var url = cat + '/' + pages[pn];
        var rows = loadUrl(url).select('#content_browseimp tr.browseimpBrowseRow');
        for (var i = 0; i < rows.size(); i++) {
          grabTitleRow(rows.get(i), url);
        }
      } catch (e) {
        g_env.error(e);
      }
    }
  } catch (e) {
    // Last-resort guard: reached only if something escapes the per-page
    // handler above (for example the logger call itself throwing).
    g_env.error(e);
  }
}

/**
 * Private helper of grabCategory(): extracts title, link and kind from one
 * browse-table row and saves the row through saveTitle() when all three are
 * non-empty.
 *
 * @param row     row element; must support select(cssQuery).first().
 * @param pageUrl URL of the page the row came from; base for resolving the href.
 */
function grabTitleRow(row, pageUrl) {
  var title = g_env.newString('');
  var link = g_env.newString('');
  var kind = g_env.newString('');

  var anchor = row.select('.browseColFirst a').first();
  if (anchor != null) {
    title = anchor.text();
    var href = anchor.attr('href');
    // Check the raw href BEFORE resolving it. Resolving an empty href against
    // the page URL would (assuming g_env.newURL follows java.net.URL(context,
    // spec) semantics — TODO confirm) yield the page URL itself, which would
    // slip past the emptiness check below and be stored as a bogus title link.
    if (href != null && href.length() > 0) {
      link = g_env.newString(g_env.newURL(g_env.newURL(pageUrl), href) + '');
    }
  }

  var kindCell = row.select('.browseColFourth').first();
  if (kindCell != null) {
    kind = kindCell.text().trim();
  }

  if (title.length() > 0 && link.length() > 0 && kind.length() > 0) {
    saveTitle(title, link, kind);
  }
}

/**
 * Stores one title record, unless findTitleByLink() already finds a record
 * for the same link (in which case nothing is written or logged).
 *
 * The record is created via newEntity(), given the schema
 * 's|link|s|title|s|kind', the kind g_site + '_Title' and an id from
 * g_env.uniqid(), then saved. A 'kind | title | link' line is logged afterwards.
 *
 * @param title display title of the book/journal.
 * @param link  absolute URL of the title; also the de-duplication key.
 * @param kind  value of the row's kind column, as passed in by the caller.
 */
function saveTitle(title, link, kind) {
  var alreadyStored = findTitleByLink(link);
  if (alreadyStored != null) {
    return;
  }

  var record = newEntity();
  record.setSchema('s|link|s|title|s|kind');
  record.setKind(g_site + '_Title');
  record.setId(g_env.uniqid());
  record.setString('link', link);
  record.setString('title', title);
  record.setString('kind', kind);
  record.save();

  var logLine = kind + ' | ' + title + ' | ' + link;
  g_env.info(logLine);
}

/**
 * Looks up a stored title record by its link.
 *
 * Builds a term query on the 'link' field and searches the g_site + '_Title'
 * kind; the third search argument 1 is presumably the maximum number of hits
 * (TODO confirm against the entity API).
 *
 * @param link link value to look for.
 * @returns the first hit, or null when the search returns no results.
 */
function findTitleByLink(link) {
  var searcher = newEntity();
  var linkTerm = searcher.newTerm('link', link);
  var linkQuery = searcher.newTermQuery(linkTerm);
  var hits = searcher.search(g_site + '_Title', linkQuery, 1);
  return hits.size() == 0 ? null : hits.get(0);
}

  Protected by Copyscape Online Copyright Protection

No comments:

Post a Comment