Grab book/journal from ScienceDirect
This task uses a JavaScript sandbox with jsoup and Lucene support to grab the list of books and journals from ScienceDirect.
Grab book/journal from ScienceDirect
- Create javascript sandbox with jsoup support
- Add Lucene support to javascript sandbox
- Create the following JavaScript
javascript
/*
 * ScienceDirect title grabber for the JavaScript sandbox.
 *
 * Walks every subject browse listing (one page per letter plus '0-9'),
 * reads the title/link/kind of each listed row and stores each link once
 * as a '<site>_Title' entity.
 *
 * The script is kept in ES5 syntax (var, string concatenation) on purpose:
 * it runs inside a host sandbox whose engine version is not visible here.
 */

// Site name; also the prefix of the entity kind used for stored titles.
var g_site = 'sciencedirect.com';

// Sandbox environment object handed to main(); every helper goes through it.
var g_env;

// Common prefix of all subject browse URLs.
var g_browseBase = 'http://www.sciencedirect.com/science/browse/sub/';

// Subject slugs, in the order they are crawled.
var g_subjects = [
  'physicalsciences',
  'chemicaleng',
  'chemistry',
  'computerscience',
  'earth',
  'energy',
  'engineering',
  'materialsscience',
  'mathematics',
  'physics',
  'lifesciences',
  'agribio',
  'biochemgenmolbiol',
  'environmental',
  'immunolmicrobiol',
  'neuroscience',
  'healthsciences',
  'medicinedentistry',
  'nursinghealth',
  'pharmatox',
  'vetscimed',
  'socialscienceshumanities',
  'artsandhumanities',
  'busmanacc',
  'decisionsciences',
  'economics',
  'psychology',
  'socialsciences'
];

// Page suffixes of a subject listing: 'a'..'z' followed by '0-9'.
var g_pageKeys = 'abcdefghijklmnopqrstuvwxyz'.split('').concat(['0-9']);

/**
 * Sandbox entry point: remembers the environment and starts the crawl.
 *
 * @param p_env  environment object supplied by the sandbox
 * @param p_args arguments supplied by the sandbox (not used by this script)
 */
function main(p_env, p_args) {
  g_env = p_env;
  run();
}

/**
 * Asks the environment for a fresh entity object.
 *
 * @returns whatever g_env.newEntity() returns
 */
function newEntity() {
  return g_env.newEntity();
}

/**
 * Name of the entity kind under which titles are stored and searched.
 *
 * @returns {string} g_site followed by '_Title'
 */
function titleKind() {
  return g_site + '_Title';
}

/**
 * Fetches a page through the environment's jsoup object.
 *
 * @param url address of the page to fetch
 * @returns the result of the connection's get() call
 *          (presumably a parsed jsoup Document -- confirm against the sandbox)
 */
function loadUrl(url) {
  var connection = g_env.newJsoup().connect(url);
  // Identify as a desktop Firefox build.
  connection.userAgent('Mozilla/5.0 (Windows NT x.y; rv:10.0.1) Gecko/20100101 Firefox/10.0.1');
  // Presumably milliseconds, i.e. 60 s, as in jsoup's Connection.timeout -- confirm.
  connection.timeout(60000);
  return connection.get();
}

/**
 * Crawls every subject listing in order, logging the start and the end.
 */
function run() {
  g_env.info('Starting');
  for (var s = 0; s < g_subjects.length; s++) {
    grabCategory(g_browseBase + g_subjects[s]);
  }
  g_env.info('Ending');
}

/**
 * Crawls all pages of one subject listing.
 *
 * A failure on one page is reported through g_env.error and the remaining
 * pages are still processed.
 *
 * @param cat subject listing URL without the trailing page suffix
 */
function grabCategory(cat) {
  try {
    for (var p = 0; p < g_pageKeys.length; p++) {
      try {
        grabPage(cat + '/' + g_pageKeys[p]);
      } catch (e) {
        g_env.error(e);
      }
    }
  } catch (e) {
    g_env.error(e);
  }
}

/**
 * Loads one listing page and processes each of its browse rows.
 *
 * @param url full address of the listing page
 */
function grabPage(url) {
  var rows = loadUrl(url).select('#content_browseimp tr.browseimpBrowseRow');
  for (var r = 0; r < rows.size(); r++) {
    grabRow(rows.get(r), url);
  }
}

/**
 * Reads title, link and kind from one browse row and saves the title when
 * all three are non-empty.
 *
 * @param row     row element taken from the listing page
 * @param pageUrl address of the page the row came from; the row's href is
 *                resolved against it
 */
function grabRow(row, pageUrl) {
  // Start from empty environment strings so the length() checks below work
  // even when a cell is missing.
  var title = g_env.newString('');
  var link = g_env.newString('');
  var kind = g_env.newString('');

  var anchor = row.select('.browseColFirst a').first();
  if (anchor != null) {
    title = anchor.text();
    var href = anchor.attr('href');
    var base = g_env.newURL(pageUrl);
    // '+ ""' turns the resolved URL object into a script string first.
    link = g_env.newString(g_env.newURL(base, href) + '');
  }

  var kindCell = row.select('.browseColFourth').first();
  if (kindCell != null) {
    kind = kindCell.text().trim();
  }

  if (hasText(title) && hasText(link) && hasText(kind)) {
    saveTitle(title, link, kind);
  }
}

/**
 * Tells whether a string object (one exposing a length() method) is non-empty.
 *
 * @param value string object to test
 * @returns {boolean} true when value.length() is greater than zero
 */
function hasText(value) {
  return value.length() > 0;
}

/**
 * Stores a title as a new entity unless an entity with the same link exists.
 *
 * @param title display title
 * @param link  absolute link of the title; used as the de-duplication key
 * @param kind  text of the row's fourth column
 */
function saveTitle(title, link, kind) {
  var existing = findTitleByLink(link);
  if (existing != null) {
    return;
  }

  var record = newEntity();
  // Presumably declares three string fields -- confirm against the entity API.
  record.setSchema('s|link|s|title|s|kind');
  record.setKind(titleKind());
  record.setId(g_env.uniqid());
  record.setString('link', link);
  record.setString('title', title);
  record.setString('kind', kind);
  record.save();

  g_env.info(kind + ' | ' + title + ' | ' + link);
}

/**
 * Looks up a stored title by its link.
 *
 * @param link link to search for
 * @returns the first matching search result, or null when there is none
 */
function findTitleByLink(link) {
  var probe = newEntity();
  var query = probe.newTermQuery(probe.newTerm('link', link));
  // Ask for at most one hit; only existence matters to the caller.
  var hits = probe.search(titleKind(), query, 1);
  return hits.size() == 0 ? null : hits.get(0);
}
No comments:
Post a Comment