Grab article from ScienceDirect
Grab article from ScienceDirect
- Create a JavaScript sandbox with jsoup support
- Add Lucene support to the JavaScript sandbox
- Create the following JavaScript script
javascript
/** Link of a single title to crawl; when empty, run() crawls every title not yet marked 'crawled'. */
var g_title = '';

/** When false, run() calls clearCache() before crawling. */
var g_cache = true;

/** Site name; g_site + '_Title' is the kind searched for title entities. */
var g_site = 'sciencedirect.com';

/** Environment object handed to main(); every other function reaches the sandbox through it. */
var g_env;

/** Cookies stored by loadUrlCookieStart() and sent again by loadUrlCookie(). */
var g_cookie;
6 | |
/**
 * Script entry point (presumably invoked by the hosting sandbox - confirm).
 *
 * @param p_env  environment object; stored in the global g_env before the crawl starts
 * @param p_args accepted but never read by this script
 */
function main(p_env, p_args) {
    // Publish the environment first: run() and everything below it read g_env.
    g_env = p_env;
    run();
}
11 | |
/**
 * Shorthand for g_env.newEntity().
 *
 * @returns a new entity object created by the environment
 */
function newEntity() {
    var entity = g_env.newEntity();
    return entity;
}
15 | |
/**
 * Fetches `url` and stores the connection's cookies in the global g_cookie,
 * so that later loadUrlCookie() calls can send them again.
 *
 * @param url address of the page to load
 * @returns whatever the connection's get() returns (the loaded page)
 */
function loadUrlCookieStart(url) {
    var browserAgent = 'Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1';
    var request = g_env.newJsoup().connect(url);
    request.userAgent(browserAgent);
    request.timeout(60000); // presumably milliseconds, as in jsoup - confirm against the sandbox wrapper
    var page = request.get();
    // Cookies are read only after the request has been performed.
    g_cookie = request.getCookies();
    return page;
}
24 | |
/**
 * Fetches `url`, sending the cookies previously captured by loadUrlCookieStart().
 *
 * When no cookies have been captured yet (g_cookie is still null/undefined) the
 * request is sent without cookies instead of passing an undefined value to the
 * connection.
 *
 * @param url address of the page to load
 * @returns whatever the connection's get() returns (the loaded page)
 */
function loadUrlCookie(url) {
    var conn = g_env.newJsoup().connect(url);
    conn.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1');
    conn.timeout(60000); // presumably milliseconds, as in jsoup - confirm against the sandbox wrapper
    // `!= null` deliberately matches both null and undefined.
    if (g_cookie != null) {
        conn.cookies(g_cookie);
    }
    return conn.get();
}
32 | |
/**
 * Fetches `url` without reading or sending any cookies.
 *
 * @param url address of the page to load
 * @returns whatever the connection's get() returns (the loaded page)
 */
function loadUrl(url) {
    var browserAgent = 'Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1';
    var request = g_env.newJsoup().connect(url);
    request.userAgent(browserAgent);
    request.timeout(60000); // presumably milliseconds, as in jsoup - confirm against the sandbox wrapper
    var page = request.get();
    return page;
}
39 | |
/**
 * Drives the crawl.
 *
 * - When g_title is non-empty, only that title (identified by its link) is crawled.
 * - Otherwise, clearCache() is called first if g_cache is false, and then batches
 *   returned by loadTitleFresh() are crawled until no fresh title is left.
 *
 * The batch loop relies on grabTitle() marking each title as crawled so that it
 * drops out of the next loadTitleFresh() result. grabTitle() does not mark anything
 * when it cannot find the title by its link, so every link is attempted at most
 * once per run, and the loop stops when a batch contains nothing new. Without this
 * guard such a title would be returned, and re-crawled, forever.
 */
function run() {
    g_env.info('Starting');
    if (g_env.newString(g_title).length() > 0) {
        grabTitle(g_title);
    } else {
        if (!g_cache) {
            clearCache();
        }
        var attempted = {}; // links already handed to grabTitle() during this run
        var rs = loadTitleFresh();
        while (rs.size() > 0) {
            var progressed = false;
            for (var i = 0; i < rs.size(); i++) {
                var link = rs.get(i).getString('link');
                // The prefix keeps arbitrary link text away from built-in property names.
                var key = 'link:' + link;
                if (attempted[key] === true) continue;
                attempted[key] = true;
                progressed = true;
                grabTitle(link);
            }
            if (!progressed) {
                g_env.info('Stopping: remaining fresh titles were already attempted and are still not marked as crawled');
                break;
            }
            rs = loadTitleFresh();
        }
    }
    g_env.info('Ending');
}
59 | |
/**
 * Crawls the title entity whose 'link' field equals `link`, then marks it 'crawled'.
 *
 * The entity's 'kind' field selects the crawler: 'Book', 'Book Series' or 'Journal'.
 * Any other kind is not crawled, but the entity is still marked and saved.
 * Nothing happens when no title entity is found for the link.
 *
 * @param link value of the title's 'link' field
 */
function grabTitle(link) {
    var et = findTitleByLink(link);
    if (et == null) {
        return;
    }

    // Loose equality on purpose: getString() presumably returns a Java string in the sandbox.
    var kind = et.getString('kind');
    var crawler = null;
    if (kind == 'Book') {
        crawler = grabBook;
    } else if (kind == 'Book Series') {
        crawler = grabBookSeries;
    } else if (kind == 'Journal') {
        crawler = grabJournal;
    }
    if (crawler != null) {
        crawler(et.getString('title'), et.getString('link'));
    }

    et.setMark('crawled');
    et.save();
}
76 | |
/**
 * Crawls one journal.
 *
 * Reads the volume links from the journal page ('#volumeIssueData .txtBold a'),
 * then loads every volume page, and for each result row ('#bodyMainResults .resultRow')
 * saves the linked article via saveArticle() with the title
 * "<article> | <volume> | <journal>" and the article's abstract as description.
 *
 * Errors are reported through g_env.error() and never propagate: a failing volume is
 * skipped, a failing abstract lookup leaves the description empty.
 *
 * Article hrefs are resolved against the URL of the volume page they were found on
 * (previously the journal URL was used, which yields wrong addresses for relative hrefs).
 *
 * @param p_title journal title, appended to every saved article title
 * @param p_link  URL of the journal page
 */
function grabJournal(p_title, p_link) {
    // Resolves `href` against `base` and returns the absolute URL as a string.
    function resolve(base, href) {
        return g_env.newString(g_env.newURL(g_env.newURL(base), href) + '');
    }

    // Loads an article page and returns its abstract text; '' when there is none or loading fails.
    function fetchAbstract(articleLink) {
        var desc = '';
        try {
            var cdoc = loadUrlCookie(articleLink);
            var section = cdoc.select('#section_abstract').first();
            if (section != null) {
                desc = section.parent().text();
                // Drop the leading heading word(s) that are part of the parent's text.
                if (desc.indexOf('Abstract') == 0) {
                    desc = desc.substring(8);
                }
                if (desc.indexOf('Summary') == 0) {
                    desc = desc.substring(7);
                }
            }
        } catch (e) {
            g_env.error(e);
        }
        return desc;
    }

    // Saves every article listed on one volume page.
    function grabVolume(volTitle, volLink) {
        // Loading the volume page also refreshes g_cookie for the article requests.
        var volDoc = loadUrlCookieStart(volLink);
        var resultRows = volDoc.select('#bodyMainResults .resultRow');
        for (var j = 0; j < resultRows.size(); j++) {
            var anchor = resultRows.get(j).select('.cLink').first();
            if (anchor == null) continue;
            var title = anchor.text();
            var link = resolve(volLink, anchor.attr('href'));
            var desc = fetchAbstract(link);
            saveArticle(title + ' | ' + volTitle + ' | ' + p_title, link, desc);
        }
    }

    try {
        var doc = loadUrl(p_link);

        // Collect all volumes first; crawling starts only when every link resolved.
        var volumes = [];
        var anchors = doc.select('#volumeIssueData .txtBold a');
        for (var i = 0; i < anchors.size(); i++) {
            var volAnchor = anchors.get(i);
            volumes.push({
                title: volAnchor.text(),
                link: resolve(p_link, volAnchor.attr('href'))
            });
        }

        for (var v = 0; v < volumes.length; v++) {
            try {
                grabVolume(volumes[v].title, volumes[v].link);
            } catch (e) {
                // One broken volume must not stop the remaining ones.
                g_env.error(e);
            }
        }
    } catch (e) {
        g_env.error(e);
    }
}
131 | |
/**
 * Crawls one book.
 *
 * Loads the book page, and for every chapter link
 * ('.contentMain .nonSerialResultsList .cLink') saves the linked chapter via
 * saveArticle() with the title "<chapter> | <book>" and the chapter's abstract
 * as description.
 *
 * Errors are reported through g_env.error() and never propagate; a failing abstract
 * lookup leaves the description empty.
 *
 * `child` is now a local variable. It used to be assigned without a declaration,
 * which leaks a global and throws in strict mode.
 *
 * @param p_title book title, appended to every saved chapter title
 * @param p_link  URL of the book page
 */
function grabBook(p_title, p_link) {
    // Loads a chapter page and returns its abstract text; '' when there is none or loading fails.
    function fetchAbstract(chapterLink) {
        var desc = '';
        try {
            var cdoc = loadUrlCookie(chapterLink);
            var section = cdoc.select('#section_abstract').first();
            if (section != null) {
                desc = section.parent().text();
                // Drop the leading heading word(s) that are part of the parent's text.
                if (desc.indexOf('Abstract') == 0) {
                    desc = desc.substring(8);
                }
                if (desc.indexOf('Summary') == 0) {
                    desc = desc.substring(7);
                }
            }
        } catch (e) {
            g_env.error(e);
        }
        return desc;
    }

    try {
        // Loading the book page also refreshes g_cookie for the chapter requests.
        var doc = loadUrlCookieStart(p_link);
        var rows = doc.select('.contentMain .nonSerialResultsList .cLink');
        for (var j = 0; j < rows.size(); j++) {
            var row = rows.get(j);
            // Kept from the original: in jsoup, select() on an element may match the element itself.
            var child = row.select('.cLink').first();
            if (child == null) continue;
            var title = child.text();
            var link = g_env.newString(g_env.newURL(g_env.newURL(p_link), child.attr('href')) + '');
            var desc = fetchAbstract(link);
            saveArticle(title + ' | ' + p_title, link, desc);
        }
    } catch (e) {
        g_env.error(e);
    }
}
166 | |
/**
 * Crawls one book series.
 *
 * Reads the volume list from the series page ('#volumeIssueData .txt'). A row with an
 * anchor contributes that anchor's text and target. A row with only a span contributes
 * the span's text together with the series URL itself. Rows with neither are skipped.
 * Every volume page is then loaded, and for each result row
 * ('#bodyMainResults .resultRow') the linked article is saved via saveArticle() with the
 * title "<article> | <volume> | <series>" and the article's abstract as description.
 *
 * Errors are reported through g_env.error() and never propagate: a failing volume is
 * skipped, a failing abstract lookup leaves the description empty.
 *
 * Changes from the earlier version: `child` is a local variable (it used to be an
 * implicit global), and article hrefs are resolved against the URL of the volume page
 * they were found on instead of the series URL.
 *
 * @param p_title series title, appended to every saved article title
 * @param p_link  URL of the series page
 */
function grabBookSeries(p_title, p_link) {
    // Resolves `href` against `base` and returns the absolute URL as a string.
    function resolve(base, href) {
        return g_env.newString(g_env.newURL(g_env.newURL(base), href) + '');
    }

    // Loads an article page and returns its abstract text; '' when there is none or loading fails.
    function fetchAbstract(articleLink) {
        var desc = '';
        try {
            var cdoc = loadUrlCookie(articleLink);
            var section = cdoc.select('#section_abstract').first();
            if (section != null) {
                desc = section.parent().text();
                // Drop the leading heading word(s) that are part of the parent's text.
                if (desc.indexOf('Abstract') == 0) {
                    desc = desc.substring(8);
                }
                if (desc.indexOf('Summary') == 0) {
                    desc = desc.substring(7);
                }
            }
        } catch (e) {
            g_env.error(e);
        }
        return desc;
    }

    // Saves every article listed on one volume page.
    function grabVolume(volTitle, volLink) {
        // Loading the volume page also refreshes g_cookie for the article requests.
        var volDoc = loadUrlCookieStart(volLink);
        var resultRows = volDoc.select('#bodyMainResults .resultRow');
        for (var j = 0; j < resultRows.size(); j++) {
            var anchor = resultRows.get(j).select('.cLink').first();
            if (anchor == null) continue;
            var title = anchor.text();
            var link = resolve(volLink, anchor.attr('href'));
            var desc = fetchAbstract(link);
            saveArticle(title + ' | ' + volTitle + ' | ' + p_title, link, desc);
        }
    }

    try {
        var doc = loadUrl(p_link);

        // Collect all volumes first; crawling starts only when every link resolved.
        var volumes = [];
        var rows = doc.select('#volumeIssueData .txt');
        for (var i = 0; i < rows.size(); i++) {
            var row = rows.get(i);
            var child = row.select('a').first();
            if (child != null) {
                volumes.push({ title: child.text(), link: resolve(p_link, child.attr('href')) });
                continue;
            }
            // No anchor: fall back to the span and use the series page itself as the volume page.
            child = row.select('span').first();
            if (child == null) continue;
            volumes.push({ title: child.text(), link: p_link });
        }

        for (var v = 0; v < volumes.length; v++) {
            try {
                grabVolume(volumes[v].title, volumes[v].link);
            } catch (e) {
                // One broken volume must not stop the remaining ones.
                g_env.error(e);
            }
        }
    } catch (e) {
        g_env.error(e);
    }
}
231 | |
/**
 * Stores one article as a 'Link' entity, unless an entity with the same url exists
 * already (looked up with findLink()), and logs what was stored.
 *
 * @param title text stored in the 'title' field
 * @param link  article URL; stored in 'url', its host is stored in 'site'
 * @param desc  text stored in the 'desc' field
 */
function saveArticle(title, link, desc) {
    // Already stored: nothing to do.
    if (findLink(link) != null) {
        return;
    }

    var entity = newEntity();
    // Presumably "<type code>|<field name>" pairs describing the entity's fields - confirm.
    entity.setSchema('s|url|a|title|a|desc|s|fixed|d|score|s|site|s|inbound|s|code');
    entity.setKind('Link');
    entity.setId(g_env.uniqid());
    entity.setString('url', link);
    entity.setString('title', title);
    entity.setString('desc', desc);
    entity.setString('fixed', 'false');
    entity.setString('inbound', '');
    entity.setDouble('score', 0);
    entity.setString('code', g_env.suniqid());

    // 'site' is best effort: when the link cannot be parsed the entity is saved without it.
    try {
        entity.setString('site', g_env.newURL(link).getHost());
    } catch (e) {
        g_env.error(e);
    }
    entity.save();

    g_env.info('\r\nTitle: ' + title + '\r\nLink: ' + link + '\r\nDesc: ' + desc);
}
261 | |
/**
 * Resets the mark of every title currently marked 'crawled' to ''.
 *
 * Works batch by batch: loadTitleCrawled() is queried again after each batch
 * until it returns nothing.
 */
function clearCache() {
    g_env.info('Start clearing cache');
    for (var batch = loadTitleCrawled(); batch.size() > 0; batch = loadTitleCrawled()) {
        for (var k = 0; k < batch.size(); k++) {
            var title = batch.get(k);
            title.setMark('');
            title.save();
        }
    }
    g_env.info('End clearing cache');
}
275 | |
/**
 * Searches the g_site + '_Title' kind for titles whose mark is 'crawled'.
 *
 * @returns the search result; the third search argument (10) presumably caps the
 *          number of hits per call - confirm
 */
function loadTitleCrawled() {
    var pat = newEntity();
    var query = pat.newBooleanQuery();
    var isCrawled = pat.newTermQuery(pat.newTerm(pat.MARK, 'crawled'));
    query.add(pat.newBooleanClause(isCrawled, pat.occurMust()));
    return pat.search(g_site + '_Title', query, 10);
}
283 | |
/**
 * Searches the g_site + '_Title' kind for titles whose mark is NOT 'crawled'.
 *
 * The query combines a match-all clause (must) with a 'crawled' term clause (must not).
 *
 * @returns the search result; the third search argument (10) presumably caps the
 *          number of hits per call - confirm
 */
function loadTitleFresh() {
    var pat = newEntity();
    var query = pat.newBooleanQuery();
    var everything = pat.newBooleanClause(pat.newMatchAllDocsQuery(), pat.occurMust());
    query.add(everything);
    var isCrawled = pat.newTermQuery(pat.newTerm(pat.MARK, 'crawled'));
    var notCrawled = pat.newBooleanClause(isCrawled, pat.occurMustNot());
    query.add(notCrawled);
    return pat.search(g_site + '_Title', query, 10);
}
292 | |
/**
 * Looks up the title entity (kind g_site + '_Title') whose 'link' term equals `link`.
 *
 * @param link value to match against the 'link' field
 * @returns the first hit, or null when the search returns nothing
 */
function findTitleByLink(link) {
    var pat = newEntity();
    var byLink = pat.newTermQuery(pat.newTerm('link', link));
    var hits = pat.search(g_site + '_Title', byLink, 1);
    return hits.size() == 0 ? null : hits.get(0);
}
299 | |
/**
 * Looks up the 'Link' entity whose 'url' term equals `link`.
 *
 * @param link value to match against the 'url' field
 * @returns the first hit, or null when the search returns nothing
 */
function findLink(link) {
    var pat = newEntity();
    var byUrl = pat.newTermQuery(pat.newTerm('url', link));
    var hits = pat.search('Link', byUrl, 1);
    return hits.size() == 0 ? null : hits.get(0);
}
// Whole crawler script collapsed onto a few long lines.
// main(p_env, p_args) stores the environment in g_env and calls run(). run() crawls the title whose
// link is g_title or, when g_title is empty, every title not yet marked 'crawled' (after clearCache()
// if g_cache is false). grabBook / grabBookSeries / grabJournal walk the listing pages and pass each
// article to saveArticle(), which stores it as a 'Link' entity unless findLink() already finds its url.
var g_title = ''; var g_cache = true; var g_site = 'sciencedirect.com'; var g_env; var g_cookie; function main(p_env, p_args) { g_env = p_env; run(); } function newEntity() { return g_env.newEntity(); } function loadUrlCookieStart(url) { var conn = g_env.newJsoup().connect(url); conn.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1'); conn.timeout(60000); var tag = conn.get(); g_cookie = conn.getCookies(); return tag; } function loadUrlCookie(url) { var conn = g_env.newJsoup().connect(url); conn.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1'); conn.timeout(60000); conn.cookies(g_cookie); return conn.get(); } function loadUrl(url) { var conn = g_env.newJsoup().connect(url); conn.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1'); conn.timeout(60000); return conn.get(); } function run() { g_env.info('Starting'); if (g_env.newString(g_title).length() > 0) { grabTitle(g_title); } else { if (!g_cache) { clearCache(); } var rs = loadTitleFresh(); while (rs.size() > 0) { for (var i = 0; i < rs.size(); i++) { var et = rs.get(i); grabTitle(et.getString('link')); } rs = loadTitleFresh(); } } g_env.info('Ending'); } function grabTitle(link) { var et = findTitleByLink(link); if (et == null) return; var kind = et.getString('kind'); if (kind == 'Book') { grabBook(et.getString('title'), et.getString('link')); } if (kind == 'Book Series') { grabBookSeries(et.getString('title'), et.getString('link')); } if (kind == 'Journal') { grabJournal(et.getString('title'), et.getString('link')); } et.setMark('crawled'); et.save(); } function grabJournal(p_title, p_link) { try { var doc = loadUrl(p_link); var vols_link = g_env.newArrayList(); var vols_title = g_env.newArrayList(); var rows = doc.select('#volumeIssueData .txtBold a'); for (var i = 0; i < rows.size(); i++) { var child = rows.get(i); var title = child.text(); var link = child.attr('href'); link = 
g_env.newString(g_env.newURL(g_env.newURL(p_link), link) + ''); vols_link.add(link); vols_title.add(title); } for (var i = 0; i < vols_link.size(); i++) { var titleV = vols_title.get(i); var linkV = vols_link.get(i); try { doc = loadUrlCookieStart(linkV); rows = doc.select('#bodyMainResults .resultRow'); for (var j = 0; j < rows.size(); j++) { var row = rows.get(j); child = row.select('.cLink').first(); if (child == null) continue; var title = child.text(); var link = child.attr('href'); link = g_env.newString(g_env.newURL(g_env.newURL(p_link), link) + ''); var desc = ''; try { var cdoc = loadUrlCookie(link); child = cdoc.select('#section_abstract').first(); if (child != null) { child = child.parent(); desc = child.text(); if (desc.indexOf('Abstract') == 0) { desc = desc.substring(8); } if (desc.indexOf('Summary') == 0) { desc = desc.substring(7); } } } catch (e) { g_env.error(e); } saveArticle(title + ' | ' + titleV + ' | ' + p_title, link, desc); } } catch (e) { g_env.error(e); } } } catch (e) { g_env.error(e); } } function grabBook(p_title, p_link) { try { var doc = loadUrlCookieStart(p_link); var rows = doc.select('.contentMain .nonSerialResultsList .cLink'); for (var j = 0; j < rows.size(); j++) { var row = rows.get(j); child = row.select('.cLink').first(); if (child == null) continue; var title = child.text(); var link = child.attr('href'); link = g_env.newString(g_env.newURL(g_env.newURL(p_link), link) + ''); var desc = ''; try { var cdoc = loadUrlCookie(link); child = cdoc.select('#section_abstract').first(); if (child != null) { child = child.parent(); desc = child.text(); if (desc.indexOf('Abstract') == 0) { desc = desc.substring(8); } if (desc.indexOf('Summary') == 0) { desc = desc.substring(7); } } } catch (e) { g_env.error(e); } saveArticle(title + ' | ' + p_title, link, desc); } } catch (e) { g_env.error(e); } } function grabBookSeries(p_title, p_link) { try { var doc = loadUrl(p_link); var vols_link = g_env.newArrayList(); var vols_title = 
g_env.newArrayList(); var rows = doc.select('#volumeIssueData .txt'); for (var i = 0; i < rows.size(); i++) { var row = rows.get(i); child = row.select('a').first(); var title = ''; var link = ''; if (child == null) { child = row.select('span').first(); if (child == null) continue; title = child.text(); link = p_link; } else { title = child.text(); link = child.attr('href'); link = g_env.newString(g_env.newURL(g_env.newURL(p_link), link) + ''); } vols_link.add(link); vols_title.add(title); } for (var i = 0; i < vols_link.size(); i++) { var titleV = vols_title.get(i); var linkV = vols_link.get(i); try { doc = loadUrlCookieStart(linkV); rows = doc.select('#bodyMainResults .resultRow'); for (var j = 0; j < rows.size(); j++) { var row = rows.get(j); child = row.select('.cLink').first(); if (child == null) continue; var title = child.text(); var link = child.attr('href'); link = g_env.newString(g_env.newURL(g_env.newURL(p_link), link) + ''); var desc = ''; try { var cdoc = loadUrlCookie(link); child = cdoc.select('#section_abstract').first(); if (child != null) { child = child.parent(); desc = child.text(); if (desc.indexOf('Abstract') == 0) { desc = desc.substring(8); } if (desc.indexOf('Summary') == 0) { desc = desc.substring(7); } } } catch (e) { g_env.error(e); } saveArticle(title + ' | ' + titleV + ' | ' + p_title, link, desc); } } catch (e) { g_env.error(e); } } } catch (e) { g_env.error(e); } } function saveArticle(title, link, desc) { var src = findLink(link); if (src != null) return; var schema = 's|url|a|title|a|desc|s|fixed|d|score|s|site|s|inbound|s|code'; var entity = newEntity(); entity.setSchema(schema); entity.setKind('Link'); entity.setId(g_env.uniqid()); entity.setString('url', link); entity.setString('title', title); entity.setString('desc', desc); entity.setString('fixed', 'false'); entity.setString('inbound', ''); entity.setDouble('score', 0); entity.setString('code', g_env.suniqid()); try { var t_url = g_env.newURL(link); var t_host = 
t_url.getHost(); entity.setString('site', t_host); } catch (e) { g_env.error(e); } entity.save(); var op = '\r\nTitle: ' + title; op += '\r\nLink: ' + link; op += '\r\nDesc: ' + desc; g_env.info(op); } function clearCache() { g_env.info('Start clearing cache'); var rs = loadTitleCrawled(); while (rs.size() > 0) { for (var i = 0; i < rs.size(); i++) { var et = rs.get(i); et.setMark(''); et.save(); } rs = loadTitleCrawled(); } g_env.info('End clearing cache'); } function loadTitleCrawled() { var pat = newEntity(); var bq = pat.newBooleanQuery(); bq.add(pat.newBooleanClause(pat.newTermQuery(pat.newTerm(pat.MARK, 'crawled')), pat.occurMust())); var rs = pat.search(g_site + '_Title', bq, 10); return rs; } function loadTitleFresh() { var pat = newEntity(); var bq = pat.newBooleanQuery(); bq.add(pat.newBooleanClause(pat.newMatchAllDocsQuery(), pat.occurMust())); bq.add(pat.newBooleanClause(pat.newTermQuery(pat.newTerm(pat.MARK, 'crawled')), pat.occurMustNot())); var rs = pat.search(g_site + '_Title', bq, 10); return rs; } function findTitleByLink(link) { var pat = newEntity(); var res = pat.search(g_site + '_Title', pat.newTermQuery(pat.newTerm('link', link)), 1); if (res.size() == 0) return null; return res.get(0); } function findLink(link) { var pat = newEntity(); var res = pat.search('Link', pat.newTermQuery(pat.newTerm('url', link)), 1); if (res.size() == 0) return null; return res.get(0); }
No comments:
Post a Comment