Grab book/journal from ScienceDirect
This task uses a JavaScript sandbox with jsoup and Lucene support to grab books and journals from ScienceDirect.
Grab book/journal from ScienceDirect
- Create a JavaScript sandbox with jsoup support
- Add Lucene support to the JavaScript sandbox
- Create the JavaScript as follows:
javascript
/**
 * ScienceDirect title grabber for the JavaScript sandbox.
 *
 * Walks the ScienceDirect "browse by subject" pages (one page per subject and
 * per initial letter), extracts every listed title together with its link and
 * publication kind, and stores each title once as a `<site>_Title` entity.
 *
 * The script is written in ES5 style (`var`, function declarations) to match
 * the original; the language level of the sandbox engine is not visible here.
 * Values returned by the sandbox/jsoup objects are treated as Java-style
 * strings (they are queried with `.length()`), exactly as the original did.
 */

// Site identifier; used as the prefix of the entity kind ("sciencedirect.com_Title").
var g_site = 'sciencedirect.com';
// Sandbox environment object, assigned once by main().
var g_env;

/**
 * Sandbox entry point.
 *
 * @param {Object} p_env  Sandbox environment (logging, jsoup, entities, ...).
 * @param {*}      p_args Script arguments; not used by this script.
 */
function main(p_env, p_args) {
  g_env = p_env;
  run();
}

/**
 * Creates a fresh entity through the sandbox environment.
 *
 * @returns {Object} A new, empty entity.
 */
function newEntity() {
  return g_env.newEntity();
}

/**
 * Fetches a page through the sandbox's jsoup connection.
 *
 * @param {string} url Absolute URL to fetch.
 * @returns {Object} The document returned by the connection's get().
 */
function loadUrl(url) {
  var conn = g_env.newJsoup().connect(url);
  conn.userAgent('Mozilla/5.0 (Windows NT x.y; rv:10.0.1) Gecko/20100101 Firefox/10.0.1');
  conn.timeout(60000);
  return conn.get();
}

/**
 * Grabs every subject category, logging 'Starting' before and 'Ending' after.
 */
function run() {
  var base = 'http://www.sciencedirect.com/science/browse/sub/';
  // Subject slugs, visited in this order.
  var subjects = [
    'physicalsciences',
    'chemicaleng',
    'chemistry',
    'computerscience',
    'earth',
    'energy',
    'engineering',
    'materialsscience',
    'mathematics',
    'physics',
    'lifesciences',
    'agribio',
    'biochemgenmolbiol',
    'environmental',
    'immunolmicrobiol',
    'neuroscience',
    'healthsciences',
    'medicinedentistry',
    'nursinghealth',
    'pharmatox',
    'vetscimed',
    'socialscienceshumanities',
    'artsandhumanities',
    'busmanacc',
    'decisionsciences',
    'economics',
    'psychology',
    'socialsciences'
  ];

  g_env.info('Starting');
  for (var s = 0; s < subjects.length; s++) {
    grabCategory(base + subjects[s]);
  }
  g_env.info('Ending');
}

/**
 * Grabs all letter pages ('a'..'z' and '0-9') of one subject category.
 * A page that fails is logged through g_env.error and skipped; the
 * remaining pages are still processed.
 *
 * @param {string} cat Category base URL (without trailing slash).
 */
function grabCategory(cat) {
  var pages = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
               'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0-9'];
  for (var pn = 0; pn < pages.length; pn++) {
    try {
      grabPage(cat + '/' + pages[pn]);
    } catch (e) {
      g_env.error(e);
    }
  }
}

/**
 * Loads one browse page and processes each of its title rows.
 * Each row is isolated in its own try/catch so that a single bad row (for
 * example a link that cannot be resolved) no longer discards the rest of
 * the page; the failure is logged through g_env.error instead.
 *
 * @param {string} url Absolute URL of the browse page.
 */
function grabPage(url) {
  var rows = loadUrl(url).select('#content_browseimp tr.browseimpBrowseRow');
  for (var i = 0; i < rows.size(); i++) {
    try {
      grabRow(rows.get(i), url);
    } catch (e) {
      g_env.error(e);
    }
  }
}

/**
 * Extracts title, absolute link and kind from one table row and saves the
 * title when all three are non-empty.
 *
 * @param {Object} row     Row element of the browse table.
 * @param {string} pageUrl URL of the page the row came from; base for
 *                         resolving the row's link.
 */
function grabRow(row, pageUrl) {
  var title = g_env.newString('');
  var link = g_env.newString('');
  var kind = g_env.newString('');

  var anchor = row.select('.browseColFirst a').first();
  if (anchor != null) {
    title = anchor.text();
    var href = anchor.attr('href');
    // Skip empty hrefs: resolving '' against the page URL would presumably
    // yield the browse page itself (java.net.URL-like semantics assumed),
    // which is not a usable title link.
    if (href.length() > 0) {
      link = g_env.newString(g_env.newURL(g_env.newURL(pageUrl), href) + '');
    }
  }

  var kindCell = row.select('.browseColFourth').first();
  if (kindCell != null) {
    kind = kindCell.text().trim();
  }

  if (title.length() > 0 && link.length() > 0 && kind.length() > 0) {
    saveTitle(title, link, kind);
  }
}

/**
 * Stores a title as a new entity unless one with the same link already
 * exists, then logs "kind | title | link".
 *
 * @param {*} title Title text.
 * @param {*} link  Absolute link of the title; used as the uniqueness key.
 * @param {*} kind  Publication kind as shown in the browse table.
 */
function saveTitle(title, link, kind) {
  if (findTitleByLink(link) != null) return;
  // Schema string as expected by the entity API; presumably
  // "type|name" pairs with 's' meaning string - confirm against the sandbox.
  var schema = 's|link|s|title|s|kind';
  var entity = newEntity();
  entity.setSchema(schema);
  entity.setKind(g_site + '_Title');
  entity.setId(g_env.uniqid());
  entity.setString('link', link);
  entity.setString('title', title);
  entity.setString('kind', kind);
  entity.save();
  g_env.info(kind + ' | ' + title + ' | ' + link);
}

/**
 * Looks up a stored title by its link using a term query on the 'link' field.
 *
 * @param {*} link Link to search for.
 * @returns {Object|null} The first matching entity, or null when none exists.
 */
function findTitleByLink(link) {
  var pat = newEntity();
  var res = pat.search(g_site + '_Title', pat.newTermQuery(pat.newTerm('link', link)), 1);
  if (res.size() == 0) return null;
  return res.get(0);
}
/**
 * ScienceDirect title grabber for the JavaScript sandbox.
 *
 * Walks the ScienceDirect "browse by subject" pages (one page per subject and
 * per initial letter), extracts every listed title together with its link and
 * publication kind, and stores each title once as a `<site>_Title` entity.
 *
 * The script is written in ES5 style (`var`, function declarations) to match
 * the original; the language level of the sandbox engine is not visible here.
 * Values returned by the sandbox/jsoup objects are treated as Java-style
 * strings (they are queried with `.length()`), exactly as the original did.
 */

// Site identifier; used as the prefix of the entity kind ("sciencedirect.com_Title").
var g_site = 'sciencedirect.com';
// Sandbox environment object, assigned once by main().
var g_env;

/**
 * Sandbox entry point.
 *
 * @param {Object} p_env  Sandbox environment (logging, jsoup, entities, ...).
 * @param {*}      p_args Script arguments; not used by this script.
 */
function main(p_env, p_args) {
  g_env = p_env;
  run();
}

/**
 * Creates a fresh entity through the sandbox environment.
 *
 * @returns {Object} A new, empty entity.
 */
function newEntity() {
  return g_env.newEntity();
}

/**
 * Fetches a page through the sandbox's jsoup connection.
 *
 * @param {string} url Absolute URL to fetch.
 * @returns {Object} The document returned by the connection's get().
 */
function loadUrl(url) {
  var conn = g_env.newJsoup().connect(url);
  conn.userAgent('Mozilla/5.0 (Windows NT x.y; rv:10.0.1) Gecko/20100101 Firefox/10.0.1');
  conn.timeout(60000);
  return conn.get();
}

/**
 * Grabs every subject category, logging 'Starting' before and 'Ending' after.
 */
function run() {
  var base = 'http://www.sciencedirect.com/science/browse/sub/';
  // Subject slugs, visited in this order.
  var subjects = [
    'physicalsciences',
    'chemicaleng',
    'chemistry',
    'computerscience',
    'earth',
    'energy',
    'engineering',
    'materialsscience',
    'mathematics',
    'physics',
    'lifesciences',
    'agribio',
    'biochemgenmolbiol',
    'environmental',
    'immunolmicrobiol',
    'neuroscience',
    'healthsciences',
    'medicinedentistry',
    'nursinghealth',
    'pharmatox',
    'vetscimed',
    'socialscienceshumanities',
    'artsandhumanities',
    'busmanacc',
    'decisionsciences',
    'economics',
    'psychology',
    'socialsciences'
  ];

  g_env.info('Starting');
  for (var s = 0; s < subjects.length; s++) {
    grabCategory(base + subjects[s]);
  }
  g_env.info('Ending');
}

/**
 * Grabs all letter pages ('a'..'z' and '0-9') of one subject category.
 * A page that fails is logged through g_env.error and skipped; the
 * remaining pages are still processed.
 *
 * @param {string} cat Category base URL (without trailing slash).
 */
function grabCategory(cat) {
  var pages = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
               'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0-9'];
  for (var pn = 0; pn < pages.length; pn++) {
    try {
      grabPage(cat + '/' + pages[pn]);
    } catch (e) {
      g_env.error(e);
    }
  }
}

/**
 * Loads one browse page and processes each of its title rows.
 * Each row is isolated in its own try/catch so that a single bad row (for
 * example a link that cannot be resolved) no longer discards the rest of
 * the page; the failure is logged through g_env.error instead.
 *
 * @param {string} url Absolute URL of the browse page.
 */
function grabPage(url) {
  var rows = loadUrl(url).select('#content_browseimp tr.browseimpBrowseRow');
  for (var i = 0; i < rows.size(); i++) {
    try {
      grabRow(rows.get(i), url);
    } catch (e) {
      g_env.error(e);
    }
  }
}

/**
 * Extracts title, absolute link and kind from one table row and saves the
 * title when all three are non-empty.
 *
 * @param {Object} row     Row element of the browse table.
 * @param {string} pageUrl URL of the page the row came from; base for
 *                         resolving the row's link.
 */
function grabRow(row, pageUrl) {
  var title = g_env.newString('');
  var link = g_env.newString('');
  var kind = g_env.newString('');

  var anchor = row.select('.browseColFirst a').first();
  if (anchor != null) {
    title = anchor.text();
    var href = anchor.attr('href');
    // Skip empty hrefs: resolving '' against the page URL would presumably
    // yield the browse page itself (java.net.URL-like semantics assumed),
    // which is not a usable title link.
    if (href.length() > 0) {
      link = g_env.newString(g_env.newURL(g_env.newURL(pageUrl), href) + '');
    }
  }

  var kindCell = row.select('.browseColFourth').first();
  if (kindCell != null) {
    kind = kindCell.text().trim();
  }

  if (title.length() > 0 && link.length() > 0 && kind.length() > 0) {
    saveTitle(title, link, kind);
  }
}

/**
 * Stores a title as a new entity unless one with the same link already
 * exists, then logs "kind | title | link".
 *
 * @param {*} title Title text.
 * @param {*} link  Absolute link of the title; used as the uniqueness key.
 * @param {*} kind  Publication kind as shown in the browse table.
 */
function saveTitle(title, link, kind) {
  if (findTitleByLink(link) != null) return;
  // Schema string as expected by the entity API; presumably
  // "type|name" pairs with 's' meaning string - confirm against the sandbox.
  var schema = 's|link|s|title|s|kind';
  var entity = newEntity();
  entity.setSchema(schema);
  entity.setKind(g_site + '_Title');
  entity.setId(g_env.uniqid());
  entity.setString('link', link);
  entity.setString('title', title);
  entity.setString('kind', kind);
  entity.save();
  g_env.info(kind + ' | ' + title + ' | ' + link);
}

/**
 * Looks up a stored title by its link using a term query on the 'link' field.
 *
 * @param {*} link Link to search for.
 * @returns {Object|null} The first matching entity, or null when none exists.
 */
function findTitleByLink(link) {
  var pat = newEntity();
  var res = pat.search(g_site + '_Title', pat.newTermQuery(pat.newTerm('link', link)), 1);
  if (res.size() == 0) return null;
  return res.get(0);
}
No comments:
Post a Comment