Saturday, 21 July 2012

Grab article from ScienceDirect

Grab article from ScienceDirect
This task uses a JavaScript sandbox with jsoup and Lucene support to grab articles from ScienceDirect.
Grab article from ScienceDirect
  1. Create a JavaScript sandbox with jsoup support
  2. Add Lucene support to the JavaScript sandbox
  3. Create the JavaScript as follows
javascript
1var g_title = '';
2var g_cache = true;
3var g_site = 'sciencedirect.com';
4var g_env;
5var g_cookie;
6
7function main(p_env, p_args) {
8 g_env = p_env;
9 run();
10}
11
12function newEntity() {
13 return g_env.newEntity();
14}
15
16function loadUrlCookieStart(url) {
17 var conn = g_env.newJsoup().connect(url);
18 conn.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1');
19 conn.timeout(60000);
20 var tag = conn.get();
21 g_cookie = conn.getCookies();
22 return tag;
23}
24
25function loadUrlCookie(url) {
26 var conn = g_env.newJsoup().connect(url);
27 conn.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1');
28 conn.timeout(60000);
29 conn.cookies(g_cookie);
30 return conn.get();
31}
32
33function loadUrl(url) {
34 var conn = g_env.newJsoup().connect(url);
35 conn.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1');
36 conn.timeout(60000);
37 return conn.get();
38}
39
40function run() {
41 g_env.info('Starting');
42 if (g_env.newString(g_title).length() > 0) {
43 grabTitle(g_title);
44 } else {
45 if (!g_cache) {
46 clearCache();
47 }
48 var rs = loadTitleFresh();
49 while (rs.size() > 0) {
50 for (var i = 0; i < rs.size(); i++) {
51 var et = rs.get(i);
52 grabTitle(et.getString('link'));
53 }
54 rs = loadTitleFresh();
55 }
56 }
57 g_env.info('Ending');
58}
59
60function grabTitle(link) {
61 var et = findTitleByLink(link);
62 if (et == null) return;
63 var kind = et.getString('kind');
64 if (kind == 'Book') {
65 grabBook(et.getString('title'), et.getString('link'));
66 }
67 if (kind == 'Book Series') {
68 grabBookSeries(et.getString('title'), et.getString('link'));
69 }
70 if (kind == 'Journal') {
71 grabJournal(et.getString('title'), et.getString('link'));
72 }
73 et.setMark('crawled');
74 et.save();
75}
76
77function grabJournal(p_title, p_link) {
78 try {
79 var doc = loadUrl(p_link);
80 var vols_link = g_env.newArrayList();
81 var vols_title = g_env.newArrayList();
82 var rows = doc.select('#volumeIssueData .txtBold a');
83 for (var i = 0; i < rows.size(); i++) {
84 var child = rows.get(i);
85 var title = child.text();
86 var link = child.attr('href');
87 link = g_env.newString(g_env.newURL(g_env.newURL(p_link), link) + '');
88 vols_link.add(link);
89 vols_title.add(title);
90 }
91 for (var i = 0; i < vols_link.size(); i++) {
92 var titleV = vols_title.get(i);
93 var linkV = vols_link.get(i);
94 try {
95 doc = loadUrlCookieStart(linkV);
96 rows = doc.select('#bodyMainResults .resultRow');
97 for (var j = 0; j < rows.size(); j++) {
98 var row = rows.get(j);
99 child = row.select('.cLink').first();
100 if (child == null) continue;
101 var title = child.text();
102 var link = child.attr('href');
103 link = g_env.newString(g_env.newURL(g_env.newURL(p_link), link) + '');
104 var desc = '';
105 try {
106 var cdoc = loadUrlCookie(link);
107 child = cdoc.select('#section_abstract').first();
108 if (child != null) {
109 child = child.parent();
110 desc = child.text();
111 if (desc.indexOf('Abstract') == 0) {
112 desc = desc.substring(8);
113 }
114 if (desc.indexOf('Summary') == 0) {
115 desc = desc.substring(7);
116 }
117 }
118 } catch (e) {
119 g_env.error(e);
120 }
121 saveArticle(title + ' | ' + titleV + ' | ' + p_title, link, desc);
122 }
123 } catch (e) {
124 g_env.error(e);
125 }
126 }
127 } catch (e) {
128 g_env.error(e);
129 }
130}
131
132function grabBook(p_title, p_link) {
133 try {
134 var doc = loadUrlCookieStart(p_link);
135 var rows = doc.select('.contentMain .nonSerialResultsList .cLink');
136 for (var j = 0; j < rows.size(); j++) {
137 var row = rows.get(j);
138 child = row.select('.cLink').first();
139 if (child == null) continue;
140 var title = child.text();
141 var link = child.attr('href');
142 link = g_env.newString(g_env.newURL(g_env.newURL(p_link), link) + '');
143 var desc = '';
144 try {
145 var cdoc = loadUrlCookie(link);
146 child = cdoc.select('#section_abstract').first();
147 if (child != null) {
148 child = child.parent();
149 desc = child.text();
150 if (desc.indexOf('Abstract') == 0) {
151 desc = desc.substring(8);
152 }
153 if (desc.indexOf('Summary') == 0) {
154 desc = desc.substring(7);
155 }
156 }
157 } catch (e) {
158 g_env.error(e);
159 }
160 saveArticle(title + ' | ' + p_title, link, desc);
161 }
162 } catch (e) {
163 g_env.error(e);
164 }
165}
166
167function grabBookSeries(p_title, p_link) {
168 try {
169 var doc = loadUrl(p_link);
170 var vols_link = g_env.newArrayList();
171 var vols_title = g_env.newArrayList();
172 var rows = doc.select('#volumeIssueData .txt');
173 for (var i = 0; i < rows.size(); i++) {
174 var row = rows.get(i);
175 child = row.select('a').first();
176 var title = '';
177 var link = '';
178 if (child == null) {
179 child = row.select('span').first();
180 if (child == null) continue;
181 title = child.text();
182 link = p_link;
183 } else {
184 title = child.text();
185 link = child.attr('href');
186 link = g_env.newString(g_env.newURL(g_env.newURL(p_link), link) + '');
187 }
188 vols_link.add(link);
189 vols_title.add(title);
190 }
191 for (var i = 0; i < vols_link.size(); i++) {
192 var titleV = vols_title.get(i);
193 var linkV = vols_link.get(i);
194 try {
195 doc = loadUrlCookieStart(linkV);
196 rows = doc.select('#bodyMainResults .resultRow');
197 for (var j = 0; j < rows.size(); j++) {
198 var row = rows.get(j);
199 child = row.select('.cLink').first();
200 if (child == null) continue;
201 var title = child.text();
202 var link = child.attr('href');
203 link = g_env.newString(g_env.newURL(g_env.newURL(p_link), link) + '');
204 var desc = '';
205 try {
206 var cdoc = loadUrlCookie(link);
207 child = cdoc.select('#section_abstract').first();
208 if (child != null) {
209 child = child.parent();
210 desc = child.text();
211 if (desc.indexOf('Abstract') == 0) {
212 desc = desc.substring(8);
213 }
214 if (desc.indexOf('Summary') == 0) {
215 desc = desc.substring(7);
216 }
217 }
218 } catch (e) {
219 g_env.error(e);
220 }
221 saveArticle(title + ' | ' + titleV + ' | ' + p_title, link, desc);
222 }
223 } catch (e) {
224 g_env.error(e);
225 }
226 }
227 } catch (e) {
228 g_env.error(e);
229 }
230}
231
232function saveArticle(title, link, desc) {
233 var src = findLink(link);
234 if (src != null) return;
235 var schema = 's|url|a|title|a|desc|s|fixed|d|score|s|site|s|inbound|s|code';
236 var entity = newEntity();
237 entity.setSchema(schema);
238 entity.setKind('Link');
239 entity.setId(g_env.uniqid());
240 entity.setString('url', link);
241 entity.setString('title', title);
242 entity.setString('desc', desc);
243 entity.setString('fixed', 'false');
244 entity.setString('inbound', '');
245 entity.setDouble('score', 0);
246 entity.setString('code', g_env.suniqid());
247 try {
248 var t_url = g_env.newURL(link);
249 var t_host = t_url.getHost();
250 entity.setString('site', t_host);
251 } catch (e) {
252 g_env.error(e);
253 }
254 entity.save();
255
256 var op = '\r\nTitle: ' + title;
257 op += '\r\nLink: ' + link;
258 op += '\r\nDesc: ' + desc;
259 g_env.info(op);
260}
261
262function clearCache() {
263 g_env.info('Start clearing cache');
264 var rs = loadTitleCrawled();
265 while (rs.size() > 0) {
266 for (var i = 0; i < rs.size(); i++) {
267 var et = rs.get(i);
268 et.setMark('');
269 et.save();
270 }
271 rs = loadTitleCrawled();
272 }
273 g_env.info('End clearing cache');
274}
275
276function loadTitleCrawled() {
277 var pat = newEntity();
278 var bq = pat.newBooleanQuery();
279 bq.add(pat.newBooleanClause(pat.newTermQuery(pat.newTerm(pat.MARK, 'crawled')), pat.occurMust()));
280 var rs = pat.search(g_site + '_Title', bq, 10);
281 return rs;
282}
283
284function loadTitleFresh() {
285 var pat = newEntity();
286 var bq = pat.newBooleanQuery();
287 bq.add(pat.newBooleanClause(pat.newMatchAllDocsQuery(), pat.occurMust()));
288 bq.add(pat.newBooleanClause(pat.newTermQuery(pat.newTerm(pat.MARK, 'crawled')), pat.occurMustNot()));
289 var rs = pat.search(g_site + '_Title', bq, 10);
290 return rs;
291}
292
293function findTitleByLink(link) {
294 var pat = newEntity();
295 var res = pat.search(g_site + '_Title', pat.newTermQuery(pat.newTerm('link', link)), 1);
296 if (res.size() == 0) return null;
297 return res.get(0);
298}
299
300function findLink(link) {
301 var pat = newEntity();
302 var res = pat.search('Link', pat.newTermQuery(pat.newTerm('url', link)), 1);
303 if (res.size() == 0) return null;
304 return res.get(0);
305}
// Link of a single title to grab. When it is empty, run() instead processes
// every title that is not yet marked 'crawled'.
var g_title = '';
// When false, run() calls clearCache() first, which removes the 'crawled'
// mark from all titles before the crawl starts.
var g_cache = true;
// Site name; the title kind used in searches is g_site + '_Title'.
var g_site = 'sciencedirect.com';
// Environment object passed to main(); all sandbox services are reached through it.
var g_env;
// Cookies stored by loadUrlCookieStart() and sent again by loadUrlCookie().
var g_cookie;

/**
 * Entry point: stores the environment object in g_env and starts the crawl.
 *
 * @param p_env  environment object; the rest of the script calls it through g_env.
 * @param p_args not read by this function.
 */
function main(p_env, p_args) {
  g_env = p_env;
  run();
}

/**
 * Shorthand for g_env.newEntity().
 *
 * @returns the object produced by g_env.newEntity(); the script uses it both
 *          as a record to save and as a factory for search queries.
 */
function newEntity() {
  return g_env.newEntity();
}

/**
 * Fetches a page and keeps the cookies of that request in g_cookie so that
 * loadUrlCookie() can send them with later requests.
 *
 * @param url address of the page to fetch.
 * @returns the value returned by the connection's get() (the fetched page).
 */
function loadUrlCookieStart(url) {
  var request = g_env.newJsoup().connect(url);
  request.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1');
  request.timeout(60000); // presumably milliseconds - confirm against the sandbox API
  var page = request.get();
  // Cookies are read only after the request has been performed.
  g_cookie = request.getCookies();
  return page;
}

/**
 * Fetches a page, sending along the cookies currently held in g_cookie.
 *
 * @param url address of the page to fetch.
 * @returns the value returned by the connection's get() (the fetched page).
 */
function loadUrlCookie(url) {
  var request = g_env.newJsoup().connect(url);
  request.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1');
  request.timeout(60000); // presumably milliseconds - confirm against the sandbox API
  request.cookies(g_cookie);
  var page = request.get();
  return page;
}

/**
 * Fetches a page without sending or storing any cookies.
 *
 * @param url address of the page to fetch.
 * @returns the value returned by the connection's get() (the fetched page).
 */
function loadUrl(url) {
  var request = g_env.newJsoup().connect(url);
  request.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1');
  request.timeout(60000); // presumably milliseconds - confirm against the sandbox API
  var page = request.get();
  return page;
}

/**
 * Drives the crawl.
 *
 * If g_title is non-empty only that title is grabbed. Otherwise batches of
 * titles are fetched with loadTitleFresh() and grabbed one by one until a
 * batch comes back empty; when g_cache is false, clearCache() runs first.
 * 'Starting' and 'Ending' are logged around the work.
 */
function run() {
  g_env.info('Starting');
  var singleTitle = g_env.newString(g_title).length() > 0;
  if (singleTitle) {
    grabTitle(g_title);
  } else {
    if (!g_cache) {
      clearCache();
    }
    // Re-query after every batch; the loop ends when no title is returned.
    for (var batch = loadTitleFresh(); batch.size() > 0; batch = loadTitleFresh()) {
      for (var k = 0; k < batch.size(); k++) {
        grabTitle(batch.get(k).getString('link'));
      }
    }
  }
  g_env.info('Ending');
}

/**
 * Grabs one title identified by its link.
 *
 * The title record is looked up with findTitleByLink(); nothing happens when
 * it is not found. Depending on the record's 'kind' the matching grab function
 * is called, and afterwards the record is marked 'crawled' and saved - also
 * when the kind matches none of the three known values.
 *
 * @param link link of the title to grab.
 */
function grabTitle(link) {
  var record = findTitleByLink(link);
  if (record == null) {
    return;
  }
  // NOTE(review): loose == is kept on purpose; values handed out by the
  // sandbox may be Java objects - confirm before switching to ===.
  var kind = record.getString('kind');
  if (kind == 'Book') {
    grabBook(record.getString('title'), record.getString('link'));
  } else if (kind == 'Book Series') {
    grabBookSeries(record.getString('title'), record.getString('link'));
  } else if (kind == 'Journal') {
    grabJournal(record.getString('title'), record.getString('link'));
  }
  record.setMark('crawled');
  record.save();
}

/**
 * Crawls one journal and saves every article found.
 *
 * The journal page (p_link) is loaded and the links matched by
 * '#volumeIssueData .txtBold a' are collected as volumes/issues. Each of those
 * pages is then loaded, every '.resultRow' with a '.cLink' anchor is treated
 * as an article, its abstract is fetched and the article is stored through
 * saveArticle() with the title "article | volume | journal".
 *
 * Failures are logged with g_env.error() and never propagate: a failing
 * article page yields an empty description, a failing volume page skips that
 * volume, and a failure while reading the journal page ends the journal.
 *
 * @param p_title journal title, appended to each saved article title.
 * @param p_link  URL of the journal page.
 */
function grabJournal(p_title, p_link) {
  // Resolves href relative to base and returns the result as a sandbox string.
  function resolve(base, href) {
    return g_env.newString(g_env.newURL(g_env.newURL(base), href) + '');
  }

  // Loads an article page and returns its abstract text ('' if absent or on error).
  function fetchAbstract(articleLink) {
    var text = '';
    try {
      var marker = loadUrlCookie(articleLink).select('#section_abstract').first();
      if (marker != null) {
        text = marker.parent().text();
        // Drop a leading heading word: 8 = 'Abstract'.length, 7 = 'Summary'.length.
        if (text.indexOf('Abstract') == 0) {
          text = text.substring(8);
        }
        if (text.indexOf('Summary') == 0) {
          text = text.substring(7);
        }
      }
    } catch (e) {
      g_env.error(e);
    }
    return text;
  }

  try {
    var journalDoc = loadUrl(p_link);
    var vols_link = g_env.newArrayList();
    var vols_title = g_env.newArrayList();
    var anchors = journalDoc.select('#volumeIssueData .txtBold a');
    for (var i = 0; i < anchors.size(); i++) {
      var volAnchor = anchors.get(i);
      var volTitle = volAnchor.text();
      var volLink = resolve(p_link, volAnchor.attr('href'));
      vols_link.add(volLink);
      vols_title.add(volTitle);
    }
    for (var v = 0; v < vols_link.size(); v++) {
      var titleV = vols_title.get(v);
      var linkV = vols_link.get(v);
      try {
        // This request also refreshes g_cookie for the article requests below.
        var rows = loadUrlCookieStart(linkV).select('#bodyMainResults .resultRow');
        for (var j = 0; j < rows.size(); j++) {
          var anchor = rows.get(j).select('.cLink').first();
          if (anchor == null) continue;
          var title = anchor.text();
          // Resolve against the volume page the href was found on, not the
          // journal page; relative hrefs would otherwise point to the wrong place.
          var link = resolve(linkV, anchor.attr('href'));
          var desc = fetchAbstract(link);
          saveArticle(title + ' | ' + titleV + ' | ' + p_title, link, desc);
        }
      } catch (e) {
        g_env.error(e);
      }
    }
  } catch (e) {
    g_env.error(e);
  }
}

/**
 * Crawls one book and saves every entry found on its page.
 *
 * The book page (p_link) is loaded with loadUrlCookieStart(), which also
 * stores the cookies used for the following requests. Every element matched
 * by '.contentMain .nonSerialResultsList .cLink' is treated as an article
 * link; its abstract is fetched and the article is stored through
 * saveArticle() with the title "article | book".
 *
 * Failures are logged with g_env.error() and never propagate: a failing
 * article page yields an empty description, and a failure on the book page
 * ends the book.
 *
 * @param p_title book title, appended to each saved article title.
 * @param p_link  URL of the book page.
 */
function grabBook(p_title, p_link) {
  // Loads an article page and returns its abstract text ('' if absent or on error).
  function fetchAbstract(articleLink) {
    var text = '';
    try {
      var marker = loadUrlCookie(articleLink).select('#section_abstract').first();
      if (marker != null) {
        text = marker.parent().text();
        // Drop a leading heading word: 8 = 'Abstract'.length, 7 = 'Summary'.length.
        if (text.indexOf('Abstract') == 0) {
          text = text.substring(8);
        }
        if (text.indexOf('Summary') == 0) {
          text = text.substring(7);
        }
      }
    } catch (e) {
      g_env.error(e);
    }
    return text;
  }

  try {
    var doc = loadUrlCookieStart(p_link);
    var rows = doc.select('.contentMain .nonSerialResultsList .cLink');
    for (var j = 0; j < rows.size(); j++) {
      var row = rows.get(j);
      // Declared with var: the previous version assigned an undeclared
      // 'child', which leaked an implicit global.
      var child = row.select('.cLink').first();
      if (child == null) continue;
      var title = child.text();
      var link = g_env.newString(g_env.newURL(g_env.newURL(p_link), child.attr('href')) + '');
      var desc = fetchAbstract(link);
      saveArticle(title + ' | ' + p_title, link, desc);
    }
  } catch (e) {
    g_env.error(e);
  }
}

/**
 * Crawls one book series and saves every article found.
 *
 * The series page (p_link) is loaded and each element matched by
 * '#volumeIssueData .txt' is read as a volume: if it contains an anchor, the
 * anchor's text and resolved href are used; otherwise the text of its first
 * span is used together with p_link itself; entries with neither are skipped.
 * Each volume page is then loaded, every '.resultRow' with a '.cLink' anchor
 * is treated as an article, its abstract is fetched and the article is stored
 * through saveArticle() with the title "article | volume | series".
 *
 * Failures are logged with g_env.error() and never propagate: a failing
 * article page yields an empty description, a failing volume page skips that
 * volume, and a failure while reading the series page ends the series.
 *
 * @param p_title series title, appended to each saved article title.
 * @param p_link  URL of the series page.
 */
function grabBookSeries(p_title, p_link) {
  // Resolves href relative to base and returns the result as a sandbox string.
  function resolve(base, href) {
    return g_env.newString(g_env.newURL(g_env.newURL(base), href) + '');
  }

  // Loads an article page and returns its abstract text ('' if absent or on error).
  function fetchAbstract(articleLink) {
    var text = '';
    try {
      var marker = loadUrlCookie(articleLink).select('#section_abstract').first();
      if (marker != null) {
        text = marker.parent().text();
        // Drop a leading heading word: 8 = 'Abstract'.length, 7 = 'Summary'.length.
        if (text.indexOf('Abstract') == 0) {
          text = text.substring(8);
        }
        if (text.indexOf('Summary') == 0) {
          text = text.substring(7);
        }
      }
    } catch (e) {
      g_env.error(e);
    }
    return text;
  }

  try {
    var seriesDoc = loadUrl(p_link);
    var vols_link = g_env.newArrayList();
    var vols_title = g_env.newArrayList();
    var entries = seriesDoc.select('#volumeIssueData .txt');
    for (var i = 0; i < entries.size(); i++) {
      var entry = entries.get(i);
      // All locals are declared with var; the previous version assigned an
      // undeclared 'child', which leaked an implicit global.
      var volAnchor = entry.select('a').first();
      var volTitle = '';
      var volLink = '';
      if (volAnchor != null) {
        volTitle = volAnchor.text();
        volLink = resolve(p_link, volAnchor.attr('href'));
      } else {
        // No anchor: the entry is read from its span and points at p_link itself.
        var label = entry.select('span').first();
        if (label == null) continue;
        volTitle = label.text();
        volLink = p_link;
      }
      vols_link.add(volLink);
      vols_title.add(volTitle);
    }
    for (var v = 0; v < vols_link.size(); v++) {
      var titleV = vols_title.get(v);
      var linkV = vols_link.get(v);
      try {
        // This request also refreshes g_cookie for the article requests below.
        var rows = loadUrlCookieStart(linkV).select('#bodyMainResults .resultRow');
        for (var j = 0; j < rows.size(); j++) {
          var anchor = rows.get(j).select('.cLink').first();
          if (anchor == null) continue;
          var title = anchor.text();
          // Resolve against the volume page the href was found on, not the
          // series page; relative hrefs would otherwise point to the wrong place.
          var link = resolve(linkV, anchor.attr('href'));
          var desc = fetchAbstract(link);
          saveArticle(title + ' | ' + titleV + ' | ' + p_title, link, desc);
        }
      } catch (e) {
        g_env.error(e);
      }
    }
  } catch (e) {
    g_env.error(e);
  }
}

/**
 * Stores an article as a 'Link' entity unless one with the same url exists.
 *
 * The existence check uses findLink(link). For a new article an entity is
 * filled with url, title, desc and fixed defaults (fixed 'false', inbound '',
 * score 0), plus an id from g_env.uniqid() and a code from g_env.suniqid().
 * The 'site' field is set to the host of the link; if that cannot be
 * determined the error is logged and the entity is saved without it.
 * After saving, title, link and description are written to the info log.
 *
 * @param title article title.
 * @param link  article URL; also the key used for the duplicate check.
 * @param desc  article description (abstract text, possibly empty).
 */
function saveArticle(title, link, desc) {
  if (findLink(link) != null) {
    return;
  }
  var record = newEntity();
  // NOTE(review): the schema string format is defined by the sandbox - not visible here.
  record.setSchema('s|url|a|title|a|desc|s|fixed|d|score|s|site|s|inbound|s|code');
  record.setKind('Link');
  record.setId(g_env.uniqid());
  record.setString('url', link);
  record.setString('title', title);
  record.setString('desc', desc);
  record.setString('fixed', 'false');
  record.setString('inbound', '');
  record.setDouble('score', 0);
  record.setString('code', g_env.suniqid());
  try {
    var host = g_env.newURL(link).getHost();
    record.setString('site', host);
  } catch (e) {
    g_env.error(e);
  }
  record.save();

  g_env.info('\r\nTitle: ' + title + '\r\nLink: ' + link + '\r\nDesc: ' + desc);
}

/**
 * Removes the 'crawled' mark from all titles.
 *
 * Batches of marked titles are fetched with loadTitleCrawled(); every title
 * in a batch gets an empty mark and is saved. This repeats until a batch
 * comes back empty. Start and end are written to the info log.
 */
function clearCache() {
  g_env.info('Start clearing cache');
  var marked;
  // Re-query after each batch; saved titles no longer match the query.
  while ((marked = loadTitleCrawled()).size() > 0) {
    for (var k = 0; k < marked.size(); k++) {
      var title = marked.get(k);
      title.setMark('');
      title.save();
    }
  }
  g_env.info('End clearing cache');
}

/**
 * Searches for titles whose mark is 'crawled'.
 *
 * @returns the result of searching kind g_site + '_Title' with a limit
 *          argument of 10; callers read it through size() and get(i).
 */
function loadTitleCrawled() {
  var pat = newEntity();
  var query = pat.newBooleanQuery();
  var crawledTerm = pat.newTerm(pat.MARK, 'crawled');
  var mustBeCrawled = pat.newBooleanClause(pat.newTermQuery(crawledTerm), pat.occurMust());
  query.add(mustBeCrawled);
  return pat.search(g_site + '_Title', query, 10);
}

/**
 * Searches for titles that are not marked 'crawled'.
 *
 * The query combines a match-all clause (must) with a clause on the mark
 * 'crawled' (must not).
 *
 * @returns the result of searching kind g_site + '_Title' with a limit
 *          argument of 10; callers read it through size() and get(i).
 */
function loadTitleFresh() {
  var pat = newEntity();
  var query = pat.newBooleanQuery();

  var everything = pat.newBooleanClause(pat.newMatchAllDocsQuery(), pat.occurMust());
  query.add(everything);

  var crawledTerm = pat.newTerm(pat.MARK, 'crawled');
  var notCrawled = pat.newBooleanClause(pat.newTermQuery(crawledTerm), pat.occurMustNot());
  query.add(notCrawled);

  return pat.search(g_site + '_Title', query, 10);
}

/**
 * Looks up a title record by its 'link' field.
 *
 * @param link value to match against the 'link' field.
 * @returns the first hit of the search in kind g_site + '_Title',
 *          or null when the search returns nothing.
 */
function findTitleByLink(link) {
  var pat = newEntity();
  var byLink = pat.newTermQuery(pat.newTerm('link', link));
  var hits = pat.search(g_site + '_Title', byLink, 1);
  return hits.size() == 0 ? null : hits.get(0);
}

/**
 * Looks up a saved 'Link' entity by its 'url' field.
 *
 * @param link value to match against the 'url' field.
 * @returns the first hit of the search in kind 'Link',
 *          or null when the search returns nothing.
 */
function findLink(link) {
  var pat = newEntity();
  var byUrl = pat.newTermQuery(pat.newTerm('url', link));
  var hits = pat.search('Link', byUrl, 1);
  return hits.size() == 0 ? null : hits.get(0);
}

  Protected by Copyscape Online Copyright Protection

No comments:

Post a Comment