Grab article from ScienceDirect
Grab article from ScienceDirect
- Create a JavaScript sandbox with jsoup support
- Add Lucene support to the JavaScript sandbox
- Create the JavaScript as follows:
javascript
// Link of a single title to crawl. When empty, run() crawls every title
// that is not yet marked 'crawled'.
var g_title = '';
// When false, run() calls clearCache() before crawling, which removes the
// 'crawled' mark from all titles.
var g_cache = true;
// Site name; the title queries search the kind g_site + '_Title'.
var g_site = 'sciencedirect.com';
// Sandbox environment object; assigned in main().
var g_env;
// Cookies captured by loadUrlCookieStart() and replayed by loadUrlCookie().
var g_cookie;
/**
 * Script entry point: stores the environment object in g_env and starts
 * the crawl by calling run().
 *
 * @param p_env  environment object supplied by the caller; kept in g_env.
 * @param p_args not used by this script.
 */
function main(p_env, p_args) {
g_env = p_env;
run();
}
/**
 * Creates a new entity through the environment object.
 *
 * @returns whatever g_env.newEntity() returns; the rest of the script uses
 *          it both as a record to save and as a factory for search queries.
 */
function newEntity() {
return g_env.newEntity();
}
/**
 * Opens a connection to the given URL with the user agent and timeout that
 * every request of this script uses. Shared by the three loaders below so
 * the settings are defined in exactly one place.
 *
 * @param url address to connect to.
 * @returns the configured connection object (not yet executed).
 */
function openConnection(url) {
  var conn = g_env.newJsoup().connect(url);
  conn.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1');
  conn.timeout(60000);
  return conn;
}

/**
 * Loads a page and remembers the connection's cookies in g_cookie so that
 * later loadUrlCookie() calls can replay them.
 *
 * @param url address of the page to load.
 * @returns the result of conn.get() for that page.
 */
function loadUrlCookieStart(url) {
  var conn = openConnection(url);
  var tag = conn.get();
  // Cookies are read only after the request has been executed.
  g_cookie = conn.getCookies();
  return tag;
}

/**
 * Loads a page, sending the cookies captured by loadUrlCookieStart().
 * When no cookies have been captured yet the request is sent without any,
 * instead of handing a null cookie map to the connection.
 *
 * @param url address of the page to load.
 * @returns the result of conn.get() for that page.
 */
function loadUrlCookie(url) {
  var conn = openConnection(url);
  // Loose comparison on purpose: covers both null and undefined.
  if (g_cookie != null) {
    conn.cookies(g_cookie);
  }
  return conn.get();
}

/**
 * Loads a page without any cookie handling.
 *
 * @param url address of the page to load.
 * @returns the result of conn.get() for that page.
 */
function loadUrl(url) {
  return openConnection(url).get();
}
/**
 * Runs the crawl.
 *
 * When g_title is non-empty only that title is crawled. Otherwise, after
 * optionally clearing the 'crawled' marks (g_cache == false), batches of
 * fresh titles are loaded with loadTitleFresh() and each one is passed to
 * grabTitle() until no fresh titles remain.
 *
 * A title only leaves the fresh set when grabTitle() manages to mark it.
 * If that never happens (for example the lookup by link finds nothing, or
 * two titles share a link and only the first match is marked) the same
 * batch would come back forever. To guarantee termination every link is
 * handled at most once per run, and the loop stops as soon as a batch
 * contains no link that has not been handled yet.
 */
function run() {
  g_env.info('Starting');
  if (g_env.newString(g_title).length() > 0) {
    grabTitle(g_title);
  } else {
    if (!g_cache) {
      clearCache();
    }
    // Links already handed to grabTitle() during this run. Keys carry a
    // prefix so they can never clash with built-in object properties.
    var seen = {};
    var rs = loadTitleFresh();
    while (rs.size() > 0) {
      var progressed = false;
      for (var i = 0; i < rs.size(); i++) {
        var link = rs.get(i).getString('link');
        var key = 'k:' + link;
        if (seen[key] === true) {
          continue;
        }
        seen[key] = true;
        progressed = true;
        grabTitle(link);
      }
      if (!progressed) {
        g_env.info('Stopping: remaining fresh titles could not be marked as crawled');
        break;
      }
      rs = loadTitleFresh();
    }
  }
  g_env.info('Ending');
}
/**
 * Crawls the title stored under the given link and then marks it.
 *
 * The title entity is looked up with findTitleByLink(); when none is found
 * nothing happens. Depending on the entity's 'kind' field the matching
 * grabber (grabBook, grabBookSeries or grabJournal) is called with the
 * entity's 'title' and 'link'. Any other kind is not crawled. In every case
 * where an entity was found it is marked 'crawled' and saved afterwards.
 *
 * @param link link of the title to crawl.
 */
function grabTitle(link) {
  var entity = findTitleByLink(link);
  if (entity == null) {
    return;
  }

  // Loose equality is kept deliberately: the field value comes from the
  // host environment and is presumably a host string object rather than a
  // JavaScript primitive (TODO confirm), so '===' could never match.
  var kind = entity.getString('kind');
  if (kind == 'Book') {
    grabBook(entity.getString('title'), entity.getString('link'));
  } else if (kind == 'Book Series') {
    grabBookSeries(entity.getString('title'), entity.getString('link'));
  } else if (kind == 'Journal') {
    grabJournal(entity.getString('title'), entity.getString('link'));
  }

  entity.setMark('crawled');
  entity.save();
}
/**
 * Resolves an href against the URL of the page it was found on.
 *
 * @param pageUrl URL of the page that contained the href.
 * @param href    value of the href attribute (absolute or relative).
 * @returns the absolute URL as a string created by g_env.newString().
 */
function resolveHref(pageUrl, href) {
  return g_env.newString(g_env.newURL(g_env.newURL(pageUrl), href) + '');
}

/**
 * Loads an article page (with the captured cookies) and extracts the text of
 * the parent of the '#section_abstract' element, with a leading 'Abstract'
 * and/or 'Summary' heading removed.
 *
 * Best effort: any failure is logged through g_env.error() and whatever has
 * been extracted so far (normally the empty string) is returned.
 *
 * @param link absolute URL of the article page.
 * @returns the abstract text, or '' when the page has no abstract section.
 */
function fetchAbstract(link) {
  var desc = '';
  try {
    var cdoc = loadUrlCookie(link);
    var section = cdoc.select('#section_abstract').first();
    if (section != null) {
      desc = section.parent().text();
      // The heading text is part of the parent's text; drop it.
      if (desc.indexOf('Abstract') == 0) {
        desc = desc.substring(8);
      }
      if (desc.indexOf('Summary') == 0) {
        desc = desc.substring(7);
      }
    }
  } catch (e) {
    g_env.error(e);
  }
  return desc;
}

/**
 * Saves one article per result row.
 *
 * For every row the first '.cLink' element supplies the article title and
 * href; rows without such an element are skipped.
 *
 * @param rows        list of row elements (size()/get(i)).
 * @param pageUrl     URL of the page the rows were taken from; hrefs are
 *                    resolved against it.
 * @param titleSuffix text appended to each article title.
 */
function saveArticleRows(rows, pageUrl, titleSuffix) {
  for (var j = 0; j < rows.size(); j++) {
    var anchor = rows.get(j).select('.cLink').first();
    if (anchor == null) continue;
    var title = anchor.text();
    var link = resolveHref(pageUrl, anchor.attr('href'));
    saveArticle(title + titleSuffix, link, fetchAbstract(link));
  }
}

/**
 * Crawls every collected volume page and saves its articles.
 *
 * Each volume is handled independently: a failure while loading or parsing
 * one volume is logged and the remaining volumes are still processed.
 *
 * @param p_title    title of the journal or book series.
 * @param vols_link  list of volume page URLs.
 * @param vols_title list of volume titles, parallel to vols_link.
 */
function grabVolumes(p_title, vols_link, vols_title) {
  for (var i = 0; i < vols_link.size(); i++) {
    var titleV = vols_title.get(i);
    var linkV = vols_link.get(i);
    try {
      // Starts a fresh cookie session that the article requests reuse.
      var doc = loadUrlCookieStart(linkV);
      // Article hrefs are resolved against the volume page they appear on,
      // not against the title page, so path-relative hrefs resolve correctly.
      saveArticleRows(
        doc.select('#bodyMainResults .resultRow'),
        linkV,
        ' | ' + titleV + ' | ' + p_title
      );
    } catch (e) {
      g_env.error(e);
    }
  }
}

/**
 * Crawls a journal: collects the volume links from the journal page
 * ('#volumeIssueData .txtBold a') and saves the articles of every volume as
 * 'article | volume | journal'. Errors are logged, never thrown.
 *
 * @param p_title title of the journal.
 * @param p_link  URL of the journal page.
 */
function grabJournal(p_title, p_link) {
  try {
    var doc = loadUrl(p_link);
    var vols_link = g_env.newArrayList();
    var vols_title = g_env.newArrayList();
    var anchors = doc.select('#volumeIssueData .txtBold a');
    for (var i = 0; i < anchors.size(); i++) {
      var anchor = anchors.get(i);
      var title = anchor.text();
      vols_link.add(resolveHref(p_link, anchor.attr('href')));
      vols_title.add(title);
    }
    grabVolumes(p_title, vols_link, vols_title);
  } catch (e) {
    g_env.error(e);
  }
}

/**
 * Crawls a book: every '.contentMain .nonSerialResultsList .cLink' entry of
 * the book page is saved as 'entry | book'. Errors are logged, never thrown.
 *
 * @param p_title title of the book.
 * @param p_link  URL of the book page.
 */
function grabBook(p_title, p_link) {
  try {
    var doc = loadUrlCookieStart(p_link);
    saveArticleRows(
      doc.select('.contentMain .nonSerialResultsList .cLink'),
      p_link,
      ' | ' + p_title
    );
  } catch (e) {
    g_env.error(e);
  }
}

/**
 * Crawls a book series: collects the volumes listed under
 * '#volumeIssueData .txt' and saves the articles of every volume as
 * 'article | volume | series'. Errors are logged, never thrown.
 *
 * A volume row without a link but with a <span> is taken to be the series
 * page itself, so p_link is used as its URL. Rows with neither are skipped.
 *
 * @param p_title title of the book series.
 * @param p_link  URL of the book series page.
 */
function grabBookSeries(p_title, p_link) {
  try {
    var doc = loadUrl(p_link);
    var vols_link = g_env.newArrayList();
    var vols_title = g_env.newArrayList();
    var rows = doc.select('#volumeIssueData .txt');
    for (var i = 0; i < rows.size(); i++) {
      var row = rows.get(i);
      var title = '';
      var link = '';
      var anchor = row.select('a').first();
      if (anchor == null) {
        var label = row.select('span').first();
        if (label == null) continue;
        title = label.text();
        link = p_link;
      } else {
        title = anchor.text();
        link = resolveHref(p_link, anchor.attr('href'));
      }
      vols_link.add(link);
      vols_title.add(title);
    }
    grabVolumes(p_title, vols_link, vols_title);
  } catch (e) {
    g_env.error(e);
  }
}
/**
 * Stores an article as a 'Link' entity unless one with the same URL exists.
 *
 * The entity receives a fresh id and code from the environment, the given
 * url/title/desc, and fixed defaults for 'fixed', 'inbound' and 'score'.
 * The 'site' field is set to the host of the link; if the host cannot be
 * determined the error is logged and the entity is saved without it.
 * After saving, title, link and description are written to the info log.
 *
 * @param title article title.
 * @param link  article URL; also the key used for the duplicate check.
 * @param desc  article description (abstract text, may be empty).
 */
function saveArticle(title, link, desc) {
  // Already stored: nothing to do.
  if (findLink(link) != null) {
    return;
  }

  var record = newEntity();
  record.setSchema('s|url|a|title|a|desc|s|fixed|d|score|s|site|s|inbound|s|code');
  record.setKind('Link');
  record.setId(g_env.uniqid());
  record.setString('url', link);
  record.setString('title', title);
  record.setString('desc', desc);
  record.setString('fixed', 'false');
  record.setString('inbound', '');
  record.setDouble('score', 0);
  record.setString('code', g_env.suniqid());

  try {
    record.setString('site', g_env.newURL(link).getHost());
  } catch (err) {
    g_env.error(err);
  }

  record.save();

  g_env.info([
    '\r\nTitle: ' + title,
    '\r\nLink: ' + link,
    '\r\nDesc: ' + desc
  ].join(''));
}
/**
 * Removes the 'crawled' mark from all titles.
 *
 * Batches of marked titles are fetched with loadTitleCrawled(); each title
 * gets an empty mark and is saved. The loop repeats until a fetch returns
 * no titles. Start and end are written to the info log.
 */
function clearCache() {
  g_env.info('Start clearing cache');
  for (var batch = loadTitleCrawled(); batch.size() > 0; batch = loadTitleCrawled()) {
    for (var idx = 0; idx < batch.size(); idx++) {
      var title = batch.get(idx);
      title.setMark('');
      title.save();
    }
  }
  g_env.info('End clearing cache');
}
/**
 * Searches for titles that carry the 'crawled' mark.
 *
 * @returns the result of searching the kind g_site + '_Title' with a limit
 *          of 10, using a boolean query with one required clause:
 *          mark field equals 'crawled'.
 */
function loadTitleCrawled() {
  var factory = newEntity();
  var query = factory.newBooleanQuery();

  var crawledTerm = factory.newTerm(factory.MARK, 'crawled');
  var crawledQuery = factory.newTermQuery(crawledTerm);
  query.add(factory.newBooleanClause(crawledQuery, factory.occurMust()));

  return factory.search(g_site + '_Title', query, 10);
}
/**
 * Searches for titles that do not carry the 'crawled' mark.
 *
 * The boolean query has two clauses: a required match-all clause and a
 * prohibited clause for mark field equals 'crawled'.
 *
 * @returns the result of searching the kind g_site + '_Title' with a limit
 *          of 10.
 */
function loadTitleFresh() {
  var factory = newEntity();
  var query = factory.newBooleanQuery();

  // Required clause: start from every document.
  var everything = factory.newMatchAllDocsQuery();
  query.add(factory.newBooleanClause(everything, factory.occurMust()));

  // Prohibited clause: drop the ones already marked.
  var crawledTerm = factory.newTerm(factory.MARK, 'crawled');
  var crawledQuery = factory.newTermQuery(crawledTerm);
  query.add(factory.newBooleanClause(crawledQuery, factory.occurMustNot()));

  return factory.search(g_site + '_Title', query, 10);
}
/**
 * Looks up a title entity by its 'link' field.
 *
 * @param link value the 'link' field must match (term query).
 * @returns the first hit of a search in g_site + '_Title' limited to one
 *          result, or null when there is no hit.
 */
function findTitleByLink(link) {
  var factory = newEntity();
  var byLink = factory.newTermQuery(factory.newTerm('link', link));
  var hits = factory.search(g_site + '_Title', byLink, 1);
  return hits.size() > 0 ? hits.get(0) : null;
}
/**
 * Looks up a stored 'Link' entity by its 'url' field.
 *
 * @param link value the 'url' field must match (term query).
 * @returns the first hit of a search in the kind 'Link' limited to one
 *          result, or null when there is no hit.
 */
function findLink(link) {
  var factory = newEntity();
  var byUrl = factory.newTermQuery(factory.newTerm('url', link));
  var hits = factory.search('Link', byUrl, 1);
  return hits.size() > 0 ? hits.get(0) : null;
}