Grab article from ScienceDirect
Grab article from ScienceDirect
- Create a JavaScript sandbox with jsoup support
- Add Lucene support to the JavaScript sandbox
- Create the script as follows
javascript
/*
 * ScienceDirect article grabber.
 *
 * Walks the stored "<site>_Title" entities (books, book series, journals),
 * loads their pages through the sandbox's jsoup wrapper and stores every
 * article found as a "Link" entity.
 *
 * NOTE(review): the script is written in ES5 style (var, loose equality) on
 * purpose. Values handed back by g_env, entities and jsoup elements are
 * presumably host (Java) strings, for which `==` compares by value while
 * `===` may not - confirm before "modernising" the comparisons.
 */

/** Link of a single title to crawl; when empty, every un-crawled title is processed. */
var g_title = '';
/** When false, the 'crawled' marks are cleared before crawling starts. */
var g_cache = true;
/** Site name; '<g_site>_Title' is the entity kind searched for titles. */
var g_site = 'sciencedirect.com';
/** Sandbox environment object, set by main(). */
var g_env;
/** Cookies captured by the most recent loadUrlCookieStart() call. */
var g_cookie;
/** User-Agent header sent with every request. */
var g_userAgent = 'Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1';
/** Value passed to the connection's timeout() setter for every request. */
var g_timeout = 60000;

/**
 * Script entry point.
 *
 * @param p_env  sandbox environment; stored in g_env and used by every helper
 * @param p_args unused
 */
function main(p_env, p_args) {
  g_env = p_env;
  run();
}

/** @returns a new entity obtained from the sandbox environment. */
function newEntity() {
  return g_env.newEntity();
}

/**
 * Creates a connection for `url` with the shared User-Agent and timeout.
 *
 * @param url address to connect to
 * @returns the configured connection (not yet executed)
 */
function newConnection(url) {
  var conn = g_env.newJsoup().connect(url);
  conn.userAgent(g_userAgent);
  conn.timeout(g_timeout);
  return conn;
}

/**
 * Loads `url` and remembers the connection's cookies in g_cookie so that
 * follow-up requests can reuse them via loadUrlCookie().
 *
 * @param url address to load
 * @returns the document returned by the connection
 */
function loadUrlCookieStart(url) {
  var conn = newConnection(url);
  var doc = conn.get();
  g_cookie = conn.getCookies();
  return doc;
}

/**
 * Loads `url`, sending the cookies stored in g_cookie.
 *
 * @param url address to load
 * @returns the document returned by the connection
 */
function loadUrlCookie(url) {
  var conn = newConnection(url);
  conn.cookies(g_cookie);
  return conn.get();
}

/**
 * Loads `url` without any cookie handling.
 *
 * @param url address to load
 * @returns the document returned by the connection
 */
function loadUrl(url) {
  return newConnection(url).get();
}

/**
 * Crawls either the single title named by g_title or, when g_title is empty,
 * every title that is not yet marked 'crawled' (in batches, until none is
 * left).
 */
function run() {
  g_env.info('Starting');
  if (g_env.newString(g_title).length() > 0) {
    grabTitle(g_title);
  } else {
    if (!g_cache) {
      clearCache();
    }
    var rs = loadTitleFresh();
    while (rs.size() > 0) {
      for (var i = 0; i < rs.size(); i++) {
        // Crawl the entity we already hold instead of looking it up again by
        // link: a failed lookup would leave it unmarked and this loop would
        // receive the same batch forever.
        crawlTitle(rs.get(i));
      }
      rs = loadTitleFresh();
    }
  }
  g_env.info('Ending');
}

/**
 * Crawls the title entity whose 'link' field equals `link`.
 * Does nothing when no such title is stored.
 *
 * @param link link of the title to crawl
 */
function grabTitle(link) {
  var et = findTitleByLink(link);
  if (et == null) return;
  crawlTitle(et);
}

/**
 * Dispatches a title entity to the grabber matching its 'kind' field, then
 * marks the entity 'crawled' and saves it. Unknown kinds are only marked.
 *
 * @param et title entity providing 'kind', 'title' and 'link'
 */
function crawlTitle(et) {
  var kind = et.getString('kind');
  var title = et.getString('title');
  var link = et.getString('link');
  if (kind == 'Book') {
    grabBook(title, link);
  } else if (kind == 'Book Series') {
    grabBookSeries(title, link);
  } else if (kind == 'Journal') {
    grabJournal(title, link);
  }
  et.setMark('crawled');
  et.save();
}

/**
 * Resolves `href` against `base` using the environment's URL helper.
 *
 * @param base address of the page the href was found on
 * @param href possibly relative link
 * @returns the resolved link as an environment string
 */
function resolveLink(base, href) {
  return g_env.newString(g_env.newURL(g_env.newURL(base), href) + '');
}

/**
 * Loads an article page and extracts its abstract: the text of the parent of
 * '#section_abstract', with a leading 'Abstract' and/or 'Summary' heading
 * removed. Failures are reported through g_env.error().
 *
 * @param link address of the article page
 * @returns the abstract text, or '' when it is missing or loading failed
 */
function fetchAbstract(link) {
  var desc = '';
  try {
    var page = loadUrlCookie(link);
    var marker = page.select('#section_abstract').first();
    if (marker != null) {
      desc = marker.parent().text();
      if (desc.indexOf('Abstract') == 0) {
        desc = desc.substring(8); // 'Abstract' is 8 characters long
      }
      if (desc.indexOf('Summary') == 0) {
        desc = desc.substring(7); // 'Summary' is 7 characters long
      }
    }
  } catch (e) {
    g_env.error(e);
  }
  return desc;
}

/**
 * Saves one article per row. The article anchor is the first '.cLink' match
 * of each row; rows without one are skipped.
 *
 * @param rows        element list to scan
 * @param baseLink    address of the page the rows came from, used to resolve
 *                    relative article links
 * @param titleSuffix text appended to each article title
 */
function grabArticleRows(rows, baseLink, titleSuffix) {
  for (var j = 0; j < rows.size(); j++) {
    var anchor = rows.get(j).select('.cLink').first();
    if (anchor == null) continue;
    var title = anchor.text();
    var link = resolveLink(baseLink, anchor.attr('href'));
    // saveArticle() ignores links that are already stored, so skip them
    // before paying for the abstract download.
    if (findLink(link) != null) continue;
    saveArticle(title + titleSuffix, link, fetchAbstract(link));
  }
}

/**
 * Loads every volume page and saves the articles listed in its
 * '#bodyMainResults .resultRow' rows. A failure in one volume is reported
 * and does not stop the remaining volumes.
 *
 * @param volumes array of {title, link} volume descriptors
 * @param p_title title of the owning journal or book series
 */
function grabVolumes(volumes, p_title) {
  for (var i = 0; i < volumes.length; i++) {
    var volume = volumes[i];
    try {
      var doc = loadUrlCookieStart(volume.link);
      grabArticleRows(
        doc.select('#bodyMainResults .resultRow'),
        volume.link,
        ' | ' + volume.title + ' | ' + p_title
      );
    } catch (e) {
      g_env.error(e);
    }
  }
}

/**
 * Crawls a journal: collects the volume links from
 * '#volumeIssueData .txtBold a' and saves the articles of every volume.
 * Article titles are stored as '<article> | <volume> | <journal>'.
 *
 * @param p_title journal title
 * @param p_link  address of the journal page
 */
function grabJournal(p_title, p_link) {
  try {
    var doc = loadUrl(p_link);
    var anchors = doc.select('#volumeIssueData .txtBold a');
    var volumes = [];
    for (var i = 0; i < anchors.size(); i++) {
      var anchor = anchors.get(i);
      volumes.push({
        title: anchor.text(),
        link: resolveLink(p_link, anchor.attr('href'))
      });
    }
    grabVolumes(volumes, p_title);
  } catch (e) {
    g_env.error(e);
  }
}

/**
 * Crawls a book: every '.contentMain .nonSerialResultsList .cLink' element
 * of the book page is one article. Article titles are stored as
 * '<article> | <book>'.
 *
 * @param p_title book title
 * @param p_link  address of the book page
 */
function grabBook(p_title, p_link) {
  try {
    var doc = loadUrlCookieStart(p_link);
    grabArticleRows(
      doc.select('.contentMain .nonSerialResultsList .cLink'),
      p_link,
      ' | ' + p_title
    );
  } catch (e) {
    g_env.error(e);
  }
}

/**
 * Crawls a book series: collects the volumes from '#volumeIssueData .txt'
 * and saves the articles of every volume. A row with an <a> links to the
 * volume page; a row with only a <span> is treated as a volume listed on the
 * series page itself; rows with neither are skipped. Article titles are
 * stored as '<article> | <volume> | <series>'.
 *
 * @param p_title series title
 * @param p_link  address of the series page
 */
function grabBookSeries(p_title, p_link) {
  try {
    var doc = loadUrl(p_link);
    var rows = doc.select('#volumeIssueData .txt');
    var volumes = [];
    for (var i = 0; i < rows.size(); i++) {
      var row = rows.get(i);
      var anchor = row.select('a').first();
      if (anchor != null) {
        volumes.push({
          title: anchor.text(),
          link: resolveLink(p_link, anchor.attr('href'))
        });
        continue;
      }
      var label = row.select('span').first();
      if (label == null) continue;
      volumes.push({ title: label.text(), link: p_link });
    }
    grabVolumes(volumes, p_title);
  } catch (e) {
    g_env.error(e);
  }
}

/**
 * Stores an article as a new 'Link' entity and logs it. Does nothing when a
 * 'Link' entity with the same url already exists.
 *
 * @param title article title
 * @param link  article address
 * @param desc  article abstract (may be '')
 */
function saveArticle(title, link, desc) {
  if (findLink(link) != null) return;
  var schema = 's|url|a|title|a|desc|s|fixed|d|score|s|site|s|inbound|s|code';
  var entity = newEntity();
  entity.setSchema(schema);
  entity.setKind('Link');
  entity.setId(g_env.uniqid());
  entity.setString('url', link);
  entity.setString('title', title);
  entity.setString('desc', desc);
  entity.setString('fixed', 'false');
  entity.setString('inbound', '');
  entity.setDouble('score', 0);
  entity.setString('code', g_env.suniqid());
  try {
    // The host is best-effort: the entity is still saved when the link
    // cannot be parsed.
    var t_url = g_env.newURL(link);
    var t_host = t_url.getHost();
    entity.setString('site', t_host);
  } catch (e) {
    g_env.error(e);
  }
  entity.save();
  var op = '\r\nTitle: ' + title;
  op += '\r\nLink: ' + link;
  op += '\r\nDesc: ' + desc;
  g_env.info(op);
}

/**
 * Resets the mark of every title currently marked 'crawled', batch by batch,
 * until no marked title is returned any more.
 */
function clearCache() {
  g_env.info('Start clearing cache');
  var rs = loadTitleCrawled();
  while (rs.size() > 0) {
    for (var i = 0; i < rs.size(); i++) {
      var et = rs.get(i);
      et.setMark('');
      et.save();
    }
    rs = loadTitleCrawled();
  }
  g_env.info('End clearing cache');
}

/** @returns up to 10 title entities whose mark is 'crawled'. */
function loadTitleCrawled() {
  var pat = newEntity();
  var bq = pat.newBooleanQuery();
  bq.add(pat.newBooleanClause(pat.newTermQuery(pat.newTerm(pat.MARK, 'crawled')), pat.occurMust()));
  return pat.search(g_site + '_Title', bq, 10);
}

/** @returns up to 10 title entities whose mark is not 'crawled'. */
function loadTitleFresh() {
  var pat = newEntity();
  var bq = pat.newBooleanQuery();
  // A match-all clause is combined with the must-not clause to select
  // "everything except crawled".
  bq.add(pat.newBooleanClause(pat.newMatchAllDocsQuery(), pat.occurMust()));
  bq.add(pat.newBooleanClause(pat.newTermQuery(pat.newTerm(pat.MARK, 'crawled')), pat.occurMustNot()));
  return pat.search(g_site + '_Title', bq, 10);
}

/**
 * @param link value of the 'link' field to look for
 * @returns the first matching title entity, or null when there is none
 */
function findTitleByLink(link) {
  var pat = newEntity();
  var res = pat.search(g_site + '_Title', pat.newTermQuery(pat.newTerm('link', link)), 1);
  if (res.size() == 0) return null;
  return res.get(0);
}

/**
 * @param link value of the 'url' field to look for
 * @returns the first matching 'Link' entity, or null when there is none
 */
function findLink(link) {
  var pat = newEntity();
  var res = pat.search('Link', pat.newTermQuery(pat.newTerm('url', link)), 1);
  if (res.size() == 0) return null;
  return res.get(0);
}
No comments:
Post a Comment