Grab book/journal from ScienceDirect
This task uses a JavaScript sandbox with jsoup and Lucene support to grab book/journal titles from ScienceDirect.
Grab book/journal from ScienceDirect
- Create javascript sandbox with jsoup support
- Add Lucene support to javascript sandbox
- Create the following JavaScript:
javascript
// Site identifier; used as the prefix of the entity kind name (g_site + '_Title').
var g_site = 'sciencedirect.com';
// Sandbox environment object; assigned in main() from its first argument and
// used throughout the script for logging, entity creation and jsoup access.
var g_env;
/**
 * Script entry point: stores the supplied environment in the global g_env
 * and then starts the crawl by calling run().
 *
 * @param p_env  environment object; kept in g_env for use by the other functions
 * @param p_args not used by this function
 */
function main(p_env, p_args) {
g_env = p_env;
run();
}
/**
 * Convenience wrapper: returns whatever g_env.newEntity() returns.
 * The script uses the result both to build records and to run searches.
 */
function newEntity() {
return g_env.newEntity();
}
/**
 * Fetches a page through the environment's jsoup helper and returns the
 * result of the connection's get() call.
 *
 * The request is configured with a fixed browser-like User-Agent string and
 * a timeout value of 60000 (presumably milliseconds, as in jsoup's
 * Connection.timeout — confirm against the sandbox wrapper).
 *
 * @param url address of the page to fetch
 * @returns whatever the connection's get() returns (the callers use select() on it)
 */
function loadUrl(url) {
  var browserAgent = 'Mozilla/5.0 (Windows NT x.y; rv:10.0.1) Gecko/20100101 Firefox/10.0.1';
  var timeoutValue = 60000;

  var request = g_env.newJsoup().connect(url);
  request.userAgent(browserAgent);
  request.timeout(timeoutValue);
  return request.get();
}
/**
 * Crawls every ScienceDirect subject listing, in a fixed order.
 *
 * Logs 'Starting', calls grabCategory() once per subject with the subject's
 * browse URL, then logs 'Ending'.
 */
function run() {
  var browseRoot = 'http://www.sciencedirect.com/science/browse/sub/';
  // Subject slugs in crawl order; each is appended to browseRoot.
  var subjects = [
    'physicalsciences',
    'chemicaleng',
    'chemistry',
    'computerscience',
    'earth',
    'energy',
    'engineering',
    'materialsscience',
    'mathematics',
    'physics',
    'lifesciences',
    'agribio',
    'biochemgenmolbiol',
    'environmental',
    'immunolmicrobiol',
    'neuroscience',
    'healthsciences',
    'medicinedentistry',
    'nursinghealth',
    'pharmatox',
    'vetscimed',
    'socialscienceshumanities',
    'artsandhumanities',
    'busmanacc',
    'decisionsciences',
    'economics',
    'psychology',
    'socialsciences'
  ];

  g_env.info('Starting');
  for (var s = 0; s < subjects.length; s++) {
    grabCategory(browseRoot + subjects[s]);
  }
  g_env.info('Ending');
}
/**
 * Walks the alphabetical sub-pages of one subject listing and saves every
 * complete title row found on them.
 *
 * For each of the page suffixes 'a'..'z' and '0-9' the page cat + '/' + suffix
 * is loaded and its '#content_browseimp tr.browseimpBrowseRow' rows are
 * scanned. A row is passed to saveTitle() only when its title, absolute link
 * and kind are all non-empty.
 *
 * An error on one page is logged through g_env.error() and the remaining
 * pages are still processed; anything escaping that per-page handling is
 * logged by the outer handler.
 *
 * @param cat base URL of the subject listing (without trailing slash)
 */
function grabCategory(cat) {
  try {
    var suffixes = 'abcdefghijklmnopqrstuvwxyz'.split('').concat('0-9');
    var idx = 0;
    while (idx < suffixes.length) {
      try {
        var pageUrl = cat + '/' + suffixes[idx];
        var rowList = loadUrl(pageUrl).select('#content_browseimp tr.browseimpBrowseRow');
        for (var r = 0; r < rowList.size(); r++) {
          var row = rowList.get(r);

          // Defaults come from g_env.newString so that length() can be called
          // on them even when a cell is missing.
          var title = g_env.newString('');
          var link = g_env.newString('');
          var kind = g_env.newString('');

          var anchor = row.select('.browseColFirst a').first();
          if (anchor != null) {
            title = anchor.text();
            var href = anchor.attr('href');
            // Resolve the href against the page URL to get an absolute link.
            var base = g_env.newURL(pageUrl);
            link = g_env.newString(g_env.newURL(base, href) + '');
          }

          var kindCell = row.select('.browseColFourth').first();
          if (kindCell != null) {
            kind = kindCell.text().trim();
          }

          var complete = title.length() > 0 && link.length() > 0 && kind.length() > 0;
          if (complete) {
            saveTitle(title, link, kind);
          }
        }
      } catch (pageError) {
        g_env.error(pageError);
      }
      idx++;
    }
  } catch (fatal) {
    g_env.error(fatal);
  }
}
/**
 * Stores one title as a '<g_site>_Title' entity unless an entity with the
 * same link is already found by findTitleByLink().
 *
 * The entity receives the schema string 's|link|s|title|s|kind', a fresh id
 * from g_env.uniqid(), and the three string fields; after saving, a
 * 'kind | title | link' line is written to the info log.
 *
 * @param title display title of the book/journal
 * @param link  absolute URL of the title; also used as the duplicate key
 * @param kind  type label taken from the listing row
 */
function saveTitle(title, link, kind) {
  var existing = findTitleByLink(link);
  if (existing != null) {
    return;
  }

  var record = newEntity();
  record.setSchema('s|link|s|title|s|kind');
  record.setKind(g_site + '_Title');
  record.setId(g_env.uniqid());

  // Field name/value pairs, written in the same order as the schema lists them.
  var fields = [
    ['link', link],
    ['title', title],
    ['kind', kind]
  ];
  for (var f = 0; f < fields.length; f++) {
    record.setString(fields[f][0], fields[f][1]);
  }

  record.save();
  g_env.info(kind + ' | ' + title + ' | ' + link);
}
/**
 * Looks up a stored '<g_site>_Title' entity by its exact 'link' value.
 *
 * Builds a term query on the 'link' field and asks for at most one result.
 *
 * @param link link value to search for
 * @returns the first matching result, or null when the search returns nothing
 */
function findTitleByLink(link) {
  var finder = newEntity();
  var linkTerm = finder.newTerm('link', link);
  var query = finder.newTermQuery(linkTerm);
  var matches = finder.search(g_site + '_Title', query, 1);
  return matches.size() == 0 ? null : matches.get(0);
}
No comments:
Post a Comment