Saturday 21 July 2012

Grab article from ScienceDirect

Grab article from ScienceDirect
This task uses a JavaScript sandbox with jsoup and Lucene support to grab articles from ScienceDirect.
Grab article from ScienceDirect
  1. Create a JavaScript sandbox with jsoup support
  2. Add Lucene support to the JavaScript sandbox
  3. Create the following JavaScript
javascript
// Crawler configuration and shared state.

// When non-empty, run() crawls only this one title. Despite the name, the
// value is handed to grabTitle(), which looks the title up by its 'link' field.
var g_title = '';
// When false, run() calls clearCache() first so that titles already marked
// 'crawled' are crawled again.
var g_cache = true;
// Prefix of the entity kind that holds the titles: g_site + '_Title'.
var g_site = 'sciencedirect.com';
// Sandbox environment object; assigned by main() before anything else runs.
var g_env;
// Cookies captured by loadUrlCookieStart() and sent again by loadUrlCookie().
var g_cookie;

/**
 * Script entry point: stores the sandbox environment in g_env and starts the crawl.
 *
 * @param p_env  environment object; every other function reaches it through g_env
 * @param p_args not used by this script
 */
function main(p_env, p_args) {
  // g_env must be assigned before run(), which uses it immediately.
  g_env = p_env;
  run();
}

/**
 * Shorthand for asking the sandbox environment for a new entity object.
 *
 * @returns whatever g_env.newEntity() returns
 */
function newEntity() {
  var entity = g_env.newEntity();
  return entity;
}

/**
 * Opens a connection to `url` with the user agent and timeout shared by all
 * page loads in this script. Private helper of the loadUrl* functions.
 *
 * @param url address to connect to
 * @returns the connection object, not yet executed
 */
function openConnection(url) {
  var conn = g_env.newJsoup().connect(url);
  conn.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1');
  conn.timeout(60000);
  return conn;
}

/**
 * Loads `url` and remembers the connection's cookies in g_cookie so that
 * later loadUrlCookie() calls can send them again.
 *
 * @param url address to load
 * @returns the document returned by conn.get()
 */
function loadUrlCookieStart(url) {
  var conn = openConnection(url);
  var doc = conn.get();
  // Cookies are read only after the request has been made.
  g_cookie = conn.getCookies();
  return doc;
}

/**
 * Loads `url`, sending the cookies captured by the last loadUrlCookieStart().
 *
 * @param url address to load
 * @returns the document returned by conn.get()
 */
function loadUrlCookie(url) {
  var conn = openConnection(url);
  // If no cookies have been captured yet, send none rather than passing a
  // null map (jsoup's Connection.cookies() rejects null).
  if (g_cookie != null) {
    conn.cookies(g_cookie);
  }
  return conn.get();
}

/**
 * Loads `url` without any cookies.
 *
 * @param url address to load
 * @returns the document returned by conn.get()
 */
function loadUrl(url) {
  return openConnection(url).get();
}

/**
 * Drives the whole crawl.
 *
 * If g_title is non-empty only that title is grabbed. Otherwise the stored
 * titles that are not yet marked 'crawled' are fetched batch by batch and
 * grabbed one after another until no fresh title is left. When g_cache is
 * falsy the 'crawled' marks are cleared first.
 *
 * NOTE(review): the batch loop only ends once loadTitleFresh() comes back
 * empty, so it relies on grabTitle() marking every title it is handed.
 * grabTitle() returns without marking when its lookup by link finds nothing;
 * confirm that cannot happen for a link that came from the same store.
 */
function run() {
  g_env.info('Starting');
  var singleTitle = g_env.newString(g_title).length() > 0;
  if (!singleTitle) {
    if (!g_cache) {
      clearCache();
    }
    // Each pass asks for the next batch of titles not yet marked 'crawled'.
    for (var batch = loadTitleFresh(); batch.size() > 0; batch = loadTitleFresh()) {
      for (var idx = 0; idx < batch.size(); idx++) {
        grabTitle(batch.get(idx).getString('link'));
      }
    }
  } else {
    grabTitle(g_title);
  }
  g_env.info('Ending');
}

/**
 * Crawls the stored title whose 'link' field equals `link`, then marks it
 * 'crawled' and saves it.
 *
 * The title's 'kind' selects the crawler: 'Book', 'Book Series' or 'Journal'.
 * A title of any other kind is still marked 'crawled'. Nothing happens when
 * no title is found for the link.
 *
 * @param link value to look up in the title's 'link' field
 */
function grabTitle(link) {
  var entity = findTitleByLink(link);
  if (entity == null) {
    return;
  }

  // NOTE(review): getString() presumably returns a host (Java) string, so the
  // loose == comparisons are kept on purpose — confirm before tightening to ===.
  var kind = entity.getString('kind');
  var crawl = null;
  if (kind == 'Book') {
    crawl = grabBook;
  } else if (kind == 'Book Series') {
    crawl = grabBookSeries;
  } else if (kind == 'Journal') {
    crawl = grabJournal;
  }
  if (crawl != null) {
    crawl(entity.getString('title'), entity.getString('link'));
  }

  entity.setMark('crawled');
  entity.save();
}

/**
 * Resolves `p_href` against `p_base` and returns the absolute address as a
 * string created by the sandbox. Private helper of the grab* functions.
 *
 * @param p_base address of the page the href was found on
 * @param p_href href attribute value, absolute or relative
 */
function resolveLink(p_base, p_href) {
  return g_env.newString(g_env.newURL(g_env.newURL(p_base), p_href) + '');
}

/**
 * Loads an article page and returns the text of the element that contains
 * '#section_abstract', with a leading 'Abstract' and/or 'Summary' heading
 * removed. Private helper of the grab* functions.
 *
 * Best effort: a failure is logged through g_env.error() and whatever text
 * was gathered so far ('' if none) is returned.
 *
 * @param p_link absolute address of the article page
 * @returns the abstract text, or '' when the page has no abstract section
 */
function fetchAbstract(p_link) {
  var desc = '';
  try {
    var section = loadUrlCookie(p_link).select('#section_abstract').first();
    if (section != null) {
      desc = section.parent().text();
      // Drop the heading word that the parent's text starts with.
      if (desc.indexOf('Abstract') == 0) {
        desc = desc.substring(8);
      }
      if (desc.indexOf('Summary') == 0) {
        desc = desc.substring(7);
      }
    }
  } catch (e) {
    g_env.error(e);
  }
  return desc;
}

/**
 * Saves one article per result row. Private helper of the grab* functions.
 *
 * Each row is searched for its first '.cLink' element; rows without one are
 * skipped. Errors are not caught here so that the caller decides how much
 * work a failure aborts.
 *
 * @param p_rows   list of row elements (size()/get(i))
 * @param p_base   address of the page the rows were scraped from; hrefs are
 *                 resolved against it
 * @param p_suffix text appended to the article title before saving
 */
function grabArticles(p_rows, p_base, p_suffix) {
  for (var i = 0; i < p_rows.size(); i++) {
    var anchor = p_rows.get(i).select('.cLink').first();
    if (anchor == null) continue;
    var title = anchor.text();
    var link = resolveLink(p_base, anchor.attr('href'));
    saveArticle(title + p_suffix, link, fetchAbstract(link));
  }
}

/**
 * Crawls every volume page and saves its articles. Private helper of
 * grabJournal() and grabBookSeries().
 *
 * A failure inside one volume is logged and the next volume is tried.
 *
 * @param p_title  title of the journal or book series
 * @param p_links  list of volume page addresses
 * @param p_titles list of volume titles, parallel to p_links
 */
function grabVolumes(p_title, p_links, p_titles) {
  for (var i = 0; i < p_links.size(); i++) {
    var volTitle = p_titles.get(i);
    var volLink = p_links.get(i);
    try {
      // Loading the volume page also refreshes the cookies that the
      // article page loads send along.
      var doc = loadUrlCookieStart(volLink);
      // Article hrefs come from the volume page, so that page is the base.
      grabArticles(doc.select('#bodyMainResults .resultRow'), volLink,
          ' | ' + volTitle + ' | ' + p_title);
    } catch (e) {
      g_env.error(e);
    }
  }
}

/**
 * Crawls a journal: collects its volume links from the journal page, then
 * saves every article of every volume as
 * '<article> | <volume> | <journal>'.
 *
 * Errors are logged through g_env.error(); nothing is thrown.
 *
 * @param p_title journal title
 * @param p_link  address of the journal page
 */
function grabJournal(p_title, p_link) {
  try {
    var doc = loadUrl(p_link);
    var volLinks = g_env.newArrayList();
    var volTitles = g_env.newArrayList();
    var anchors = doc.select('#volumeIssueData .txtBold a');
    for (var i = 0; i < anchors.size(); i++) {
      var anchor = anchors.get(i);
      var title = anchor.text();
      volLinks.add(resolveLink(p_link, anchor.attr('href')));
      volTitles.add(title);
    }
    grabVolumes(p_title, volLinks, volTitles);
  } catch (e) {
    g_env.error(e);
  }
}

/**
 * Crawls a book: every '.cLink' entry of the book page is saved as
 * '<chapter> | <book>'.
 *
 * Errors are logged through g_env.error(); nothing is thrown.
 *
 * @param p_title book title
 * @param p_link  address of the book page
 */
function grabBook(p_title, p_link) {
  try {
    var doc = loadUrlCookieStart(p_link);
    // The selected elements are the '.cLink' anchors themselves; grabArticles()
    // selects '.cLink' again on each of them, exactly as the page loop always did.
    grabArticles(doc.select('.contentMain .nonSerialResultsList .cLink'), p_link,
        ' | ' + p_title);
  } catch (e) {
    g_env.error(e);
  }
}

/**
 * Crawls a book series: collects its volumes from the series page, then saves
 * every article of every volume as '<article> | <volume> | <series>'.
 *
 * A volume row without a link but with a <span> is treated as the volume
 * shown on the series page itself, so the series address is used for it.
 * Rows with neither are skipped.
 *
 * Errors are logged through g_env.error(); nothing is thrown.
 *
 * @param p_title series title
 * @param p_link  address of the series page
 */
function grabBookSeries(p_title, p_link) {
  try {
    var doc = loadUrl(p_link);
    var volLinks = g_env.newArrayList();
    var volTitles = g_env.newArrayList();
    var rows = doc.select('#volumeIssueData .txt');
    for (var i = 0; i < rows.size(); i++) {
      var row = rows.get(i);
      var anchor = row.select('a').first();
      var title;
      var link;
      if (anchor != null) {
        title = anchor.text();
        link = resolveLink(p_link, anchor.attr('href'));
      } else {
        var label = row.select('span').first();
        if (label == null) continue;
        title = label.text();
        link = p_link;
      }
      volLinks.add(link);
      volTitles.add(title);
    }
    grabVolumes(p_title, volLinks, volTitles);
  } catch (e) {
    g_env.error(e);
  }
}

/**
 * Stores an article as a 'Link' entity unless one with the same url already
 * exists, then logs what was stored.
 *
 * The 'site' field is filled with the host of `link`; if that cannot be
 * determined the error is logged and the entity is saved without it.
 *
 * @param title article title to store
 * @param link  article address; also the key used for the duplicate check
 * @param desc  article description (abstract text)
 */
function saveArticle(title, link, desc) {
  if (findLink(link) != null) {
    return;
  }

  var record = newEntity();
  record.setSchema('s|url|a|title|a|desc|s|fixed|d|score|s|site|s|inbound|s|code');
  record.setKind('Link');
  record.setId(g_env.uniqid());
  record.setString('url', link);
  record.setString('title', title);
  record.setString('desc', desc);
  record.setString('fixed', 'false');
  record.setString('inbound', '');
  record.setDouble('score', 0);
  record.setString('code', g_env.suniqid());
  try {
    record.setString('site', g_env.newURL(link).getHost());
  } catch (err) {
    g_env.error(err);
  }
  record.save();

  // The leading empty item makes the message start with a line break.
  var report = ['', 'Title: ' + title, 'Link: ' + link, 'Desc: ' + desc].join('\r\n');
  g_env.info(report);
}

/**
 * Removes the 'crawled' mark from every stored title so that all of them are
 * crawled again.
 *
 * Titles are processed in the batches returned by loadTitleCrawled(); the
 * loop ends when a batch comes back empty.
 */
function clearCache() {
  g_env.info('Start clearing cache');
  for (var batch = loadTitleCrawled(); batch.size() > 0; batch = loadTitleCrawled()) {
    for (var idx = 0; idx < batch.size(); idx++) {
      var title = batch.get(idx);
      title.setMark('');
      title.save();
    }
  }
  g_env.info('End clearing cache');
}

/**
 * Searches the g_site + '_Title' kind for titles whose mark is 'crawled'.
 *
 * @returns the search result for at most 10 titles
 */
function loadTitleCrawled() {
  var pat = newEntity();
  var query = pat.newBooleanQuery();
  var crawledTerm = pat.newTerm(pat.MARK, 'crawled');
  var isCrawled = pat.newTermQuery(crawledTerm);
  query.add(pat.newBooleanClause(isCrawled, pat.occurMust()));
  return pat.search(g_site + '_Title', query, 10);
}

/**
 * Searches the g_site + '_Title' kind for titles that are NOT marked
 * 'crawled'.
 *
 * The query is "match everything" combined with a must-not clause on the
 * 'crawled' mark.
 *
 * @returns the search result for at most 10 titles
 */
function loadTitleFresh() {
  var pat = newEntity();
  var query = pat.newBooleanQuery();

  var everything = pat.newMatchAllDocsQuery();
  query.add(pat.newBooleanClause(everything, pat.occurMust()));

  var crawledTerm = pat.newTerm(pat.MARK, 'crawled');
  var isCrawled = pat.newTermQuery(crawledTerm);
  query.add(pat.newBooleanClause(isCrawled, pat.occurMustNot()));

  return pat.search(g_site + '_Title', query, 10);
}

/**
 * Looks up the stored title whose 'link' field equals `link`.
 *
 * @param link value to match against the 'link' field
 * @returns the first matching title entity, or null when there is none
 */
function findTitleByLink(link) {
  var pat = newEntity();
  var byLink = pat.newTermQuery(pat.newTerm('link', link));
  var hits = pat.search(g_site + '_Title', byLink, 1);
  return hits.size() == 0 ? null : hits.get(0);
}

/**
 * Looks up the stored 'Link' entity whose 'url' field equals `link`.
 *
 * @param link value to match against the 'url' field
 * @returns the first matching entity, or null when there is none
 */
function findLink(link) {
  var pat = newEntity();
  var byUrl = pat.newTermQuery(pat.newTerm('url', link));
  var hits = pat.search('Link', byUrl, 1);
  return hits.size() == 0 ? null : hits.get(0);
}
    

  Protected by Copyscape Online Copyright Protection

No comments:

Post a Comment