Saturday 21 July 2012

Grab book/journal from ScienceDirect

Grab book/journal from ScienceDirect
This task uses a JavaScript sandbox with jsoup and Lucene support to grab book and journal titles from ScienceDirect.
Grab book/journal from ScienceDirect
  1. Create a JavaScript sandbox with jsoup support
  2. Add Lucene support to the JavaScript sandbox
  3. Create the following JavaScript
javascript
// Site identifier; the script prefixes it to '_Title' to form the entity kind.
var g_site = 'sciencedirect.com';
// Sandbox environment object; assigned in main() and read by the other functions.
var g_env;

/**
 * Script entry point: stores the environment object in the global g_env and
 * then starts the crawl by calling run().
 *
 * @param p_env  environment object; the rest of the script calls its methods
 *               (info, error, newJsoup, newEntity, newString, newURL, uniqid)
 * @param p_args not used by this script
 */
function main(p_env, p_args) {
  // Publish the environment first: run() and everything it calls read g_env.
  g_env = p_env;
  run();
}

/**
 * Returns a fresh entity object obtained from the sandbox environment.
 *
 * @returns whatever g_env.newEntity() produces
 */
function newEntity() {
  var entity = g_env.newEntity();
  return entity;
}

/**
 * Fetches a page through the sandbox's jsoup connection and returns the
 * result of its get() call.
 *
 * @param url address of the page to fetch
 * @returns the value returned by the connection's get()
 */
function loadUrl(url) {
  // NOTE(review): 'Windows NT x.y' looks like an unfilled template placeholder
  // rather than a real platform token - confirm whether it is intentional.
  var browserUserAgent = 'Mozilla/5.0 (Windows NT x.y; rv:10.0.1) Gecko/20100101 Firefox/10.0.1';
  // Presumably milliseconds (jsoup convention) - verify against the sandbox wrapper.
  var timeoutValue = 60000;

  var connection = g_env.newJsoup().connect(url);
  connection.userAgent(browserUserAgent);
  connection.timeout(timeoutValue);
  return connection.get();
}

/**
 * Crawls every ScienceDirect subject category listed below, in order,
 * logging 'Starting' before the first one and 'Ending' after the last.
 */
function run() {
  // Browse URLs of the subject categories, visited in this order.
  var categoryUrls = [
    'http://www.sciencedirect.com/science/browse/sub/physicalsciences',
    'http://www.sciencedirect.com/science/browse/sub/chemicaleng',
    'http://www.sciencedirect.com/science/browse/sub/chemistry',
    'http://www.sciencedirect.com/science/browse/sub/computerscience',
    'http://www.sciencedirect.com/science/browse/sub/earth',
    'http://www.sciencedirect.com/science/browse/sub/energy',
    'http://www.sciencedirect.com/science/browse/sub/engineering',
    'http://www.sciencedirect.com/science/browse/sub/materialsscience',
    'http://www.sciencedirect.com/science/browse/sub/mathematics',
    'http://www.sciencedirect.com/science/browse/sub/physics',
    'http://www.sciencedirect.com/science/browse/sub/lifesciences',
    'http://www.sciencedirect.com/science/browse/sub/agribio',
    'http://www.sciencedirect.com/science/browse/sub/biochemgenmolbiol',
    'http://www.sciencedirect.com/science/browse/sub/environmental',
    'http://www.sciencedirect.com/science/browse/sub/immunolmicrobiol',
    'http://www.sciencedirect.com/science/browse/sub/neuroscience',
    'http://www.sciencedirect.com/science/browse/sub/healthsciences',
    'http://www.sciencedirect.com/science/browse/sub/medicinedentistry',
    'http://www.sciencedirect.com/science/browse/sub/nursinghealth',
    'http://www.sciencedirect.com/science/browse/sub/pharmatox',
    'http://www.sciencedirect.com/science/browse/sub/vetscimed',
    'http://www.sciencedirect.com/science/browse/sub/socialscienceshumanities',
    'http://www.sciencedirect.com/science/browse/sub/artsandhumanities',
    'http://www.sciencedirect.com/science/browse/sub/busmanacc',
    'http://www.sciencedirect.com/science/browse/sub/decisionsciences',
    'http://www.sciencedirect.com/science/browse/sub/economics',
    'http://www.sciencedirect.com/science/browse/sub/psychology',
    'http://www.sciencedirect.com/science/browse/sub/socialsciences'
  ];

  g_env.info('Starting');
  for (var idx = 0; idx < categoryUrls.length; idx++) {
    grabCategory(categoryUrls[idx]);
  }
  g_env.info('Ending');
}

/**
 * Crawls every alphabetical browse page ('a'..'z' and '0-9') of one category
 * and passes each complete row to saveTitle().
 *
 * A row is saved only when it yields a non-empty title, link and kind.
 * Errors are reported through g_env.error() once per page, so a failing page
 * does not prevent the remaining pages from being crawled.
 *
 * @param cat base URL of the category; '/' plus the page name is appended,
 *            so it should not end with a slash
 */
function grabCategory(cat) {
  var pages = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0-9'];
  for (var pn = 0; pn < pages.length; pn++) {
    try {
      var url = cat + '/' + pages[pn];
      var rows = loadUrl(url).select('#content_browseimp tr.browseimpBrowseRow');
      for (var i = 0; i < rows.size(); i++) {
        var row = rows.get(i);
        // Start from empty sandbox strings so the length() checks below work
        // even when a cell is missing from the row.
        var title = g_env.newString('');
        var link = g_env.newString('');
        var kind = g_env.newString('');

        var anchor = row.select('.browseColFirst a').first();
        if (anchor != null) {
          title = anchor.text();
          var href = anchor.attr('href');
          // Resolve only a non-empty href. An absolute URL built from an empty
          // href is never empty itself, so checking the link after resolution
          // would let rows without a real target through (with the browse
          // page's own URL as their link).
          if (href.length() > 0) {
            link = g_env.newString(g_env.newURL(g_env.newURL(url), href) + '');
          }
        }

        var kindCell = row.select('.browseColFourth').first();
        if (kindCell != null) {
          kind = kindCell.text().trim();
        }

        if (title.length() > 0 && link.length() > 0 && kind.length() > 0) {
          saveTitle(title, link, kind);
        }
      }
    } catch (e) {
      // Best effort: report the failure and continue with the next page.
      g_env.error(e);
    }
  }
}

/**
 * Stores one title as an entity of kind g_site + '_Title', unless
 * findTitleByLink() already finds an entity with the same link. Logs the
 * stored values through g_env.info() after saving.
 *
 * @param title display title of the book/journal
 * @param link  URL of the title; also the de-duplication key
 * @param kind  text taken from the row's kind column
 */
function saveTitle(title, link, kind) {
  var alreadyStored = findTitleByLink(link) != null;
  if (alreadyStored) {
    return;
  }

  var record = newEntity();
  // Presumably 's|<name>' pairs declare string fields - confirm against the
  // sandbox's entity API.
  record.setSchema('s|link|s|title|s|kind');
  record.setKind(g_site + '_Title');
  record.setId(g_env.uniqid());
  record.setString('link', link);
  record.setString('title', title);
  record.setString('kind', kind);
  record.save();

  g_env.info(kind + ' | ' + title + ' | ' + link);
}

/**
 * Looks up a stored title entity by its exact link.
 *
 * Runs a term query on the 'link' field against kind g_site + '_Title',
 * asking for at most one result.
 *
 * @param link URL to look for
 * @returns the first matching entity, or null when the search is empty
 */
function findTitleByLink(link) {
  var probe = newEntity();
  var linkQuery = probe.newTermQuery(probe.newTerm('link', link));
  var matches = probe.search(g_site + '_Title', linkQuery, 1);
  return matches.size() == 0 ? null : matches.get(0);
}
    

  Protected by Copyscape Online Copyright Protection

No comments:

Post a Comment