Wednesday, 25 July 2012

Grab search results from Yahoo

Grab search results from Yahoo
This task uses a JavaScript sandbox with jsoup support to grab search results from Yahoo.
Grab search results from Yahoo
  1. Create javascript sandbox with jsoup support
  2. Create the JavaScript as follows:
javascript
1var g_env;
2
3function main(p_env, p_args) {
4 g_env = p_env;
5 g_env.info('Starting');
6 run();
7 g_env.info('Ending');
8}
9
10function run() {
11 try {
12 var query = 'lucene';
13 for (var pn = 1; pn <= 10; pn++) {
14 var res = grab(query, pn);
15 for (var i = 0; i < res.size(); i++) {
16 var it = res.get(i);
17 var title = it.get('title');
18 var link = it.get('link');
19 var no = (pn - 1) * 10 + i + 1;
20 g_env.info(no + ' | ' + title + ' | ' + link);
21 }
22 }
23 } catch (e) {
24 g_env.error(e);
25 }
26}
27
28function grab(query, pageno) {
29 var tag = g_env.newArrayList();
30 try {
31 var url = 'http://search.yahoo.com/search?p=' + g_env.encodeURL(query, 'UTF-8') + '&pstart=1&b=' + ((pageno - 1) * 10 + 1);
32 var conn = g_env.newJsoup().connect(url);
33 conn.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1');
34 conn.timeout(60000);
35 var doc = conn.get();
36 var nodes = doc.select('#web .res');
37 for (var i = 0; i < nodes.size(); i++) {
38 var node = nodes.get(i);
39 var child = node.select('a.yschttl').first();
40 var title = child.text();
41 var link = child.attr('href');
42 var pos = link.indexOf('**');
43 if (pos >= 0) {
44 link = link.substring(pos + 2);
45 link = g_env.decodeURL(link, 'UTF-8');
46 }
47 var it = g_env.newHashMap();
48 it.put('title', title);
49 it.put('link', link);
50 tag.add(it);
51 }
52 } catch (e) {
53 g_env.error(e);
54 }
55 return tag;
56}
// Sandbox environment handle; assigned by main() and used by run() and grab().
var g_env;

/**
 * Script entry point.
 *
 * Stores the environment in the global g_env, then calls run() between a
 * 'Starting' and an 'Ending' log line.
 *
 * @param p_env  environment object; info() is called on it here
 * @param p_args not read by this function
 */
function main(p_env, p_args) {
  var announce = function (phase) {
    g_env.info(phase);
  };

  g_env = p_env;
  announce('Starting');
  run();
  announce('Ending');
}

/**
 * Grabs result pages 1..10 for the fixed query 'lucene' and logs every hit as
 * "<running number> | <title> | <link>".
 *
 * The running number assumes ten hits per page. Any exception is reported via
 * g_env.error() and ends the whole loop.
 */
function run() {
  try {
    var query = 'lucene';
    for (var page = 1; page <= 10; page++) {
      var hits = grab(query, page);
      var offset = (page - 1) * 10;
      for (var k = 0; k < hits.size(); k++) {
        var hit = hits.get(k);
        var title = hit.get('title');
        var link = hit.get('link');
        g_env.info((offset + k + 1) + ' | ' + title + ' | ' + link);
      }
    }
  } catch (e) {
    g_env.error(e);
  }
}

/**
 * Fetches one Yahoo result page and extracts its title/link pairs.
 *
 * @param query  search phrase; URL-encoded via g_env.encodeURL()
 * @param pageno 1-based page number; converted to the 'b' offset (1, 11, 21, ...)
 * @returns the list created by g_env.newArrayList(), holding one map per
 *          result with the keys 'title' and 'link'. If fetching or parsing
 *          throws, the error is reported through g_env.error() and the
 *          results collected so far are returned.
 */
function grab(query, pageno) {
  var tag = g_env.newArrayList();
  try {
    var url = 'http://search.yahoo.com/search?p=' + g_env.encodeURL(query, 'UTF-8') + '&pstart=1&b=' + ((pageno - 1) * 10 + 1);
    var conn = g_env.newJsoup().connect(url);
    conn.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1');
    conn.timeout(60000);
    var doc = conn.get();
    var nodes = doc.select('#web .res');
    for (var i = 0; i < nodes.size(); i++) {
      var child = nodes.get(i).select('a.yschttl').first();
      // A result block without a title anchor is skipped; dereferencing the
      // missing anchor would throw and discard the rest of the page.
      if (child == null) continue;
      var title = child.text();
      var link = child.attr('href');
      // When the href contains '**', the part after it is taken as the
      // URL-encoded target (presumably a redirect wrapper).
      var pos = link.indexOf('**');
      if (pos >= 0) {
        link = link.substring(pos + 2);
        link = g_env.decodeURL(link, 'UTF-8');
      }
      var it = g_env.newHashMap();
      it.put('title', title);
      it.put('link', link);
      tag.add(it);
    }
  } catch (e) {
    g_env.error(e);
  }
  return tag;
}

  Protected by Copyscape Online Copyright Protection

Grab search results from Bing

Grab search results from Bing
This task uses a JavaScript sandbox with jsoup support to grab search results from Bing.
Grab search results from Bing
  1. Create javascript sandbox with jsoup support
  2. Create the JavaScript as follows:
javascript
1var g_env;
2
3function main(p_env, p_args) {
4 g_env = p_env;
5 g_env.info('Starting');
6 run();
7 g_env.info('Ending');
8}
9
10function run() {
11 try {
12 var query = 'lucene';
13 for (var pn = 1; pn <= 10; pn++) {
14 var res = grab(query, pn);
15 for (var i = 0; i < res.size(); i++) {
16 var it = res.get(i);
17 var title = it.get('title');
18 var link = it.get('link');
19 var no = (pn - 1) * 10 + i + 1;
20 g_env.info(no + ' | ' + title + ' | ' + link);
21 }
22 }
23 } catch (e) {
24 g_env.error(e);
25 }
26}
27
28function grab(query, pageno) {
29 var tag = g_env.newArrayList();
30 try {
31 var url = 'http://www.bing.com/search?q=' + g_env.encodeURL(query, 'UTF-8') + '&first=' + ((pageno - 1) * 10 + 1);
32 var conn = g_env.newJsoup().connect(url);
33 conn.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1');
34 conn.timeout(60000);
35 var doc = conn.get();
36 var nodes = doc.select('#results .sa_wr');
37 for (var i = 0; i < nodes.size(); i++) {
38 var node = nodes.get(i);
39 var child = node.select('.sa_cc .sa_mc .sb_tlst a').first();
40 var title = child.text();
41 var link = child.attr('href');
42 var it = g_env.newHashMap();
43 it.put('title', title);
44 it.put('link', link);
45 tag.add(it);
46 }
47 } catch (e) {
48 g_env.error(e);
49 }
50 return tag;
51}
// Sandbox environment handle; assigned by main() and used by run() and grab().
var g_env;

/**
 * Script entry point.
 *
 * Stores the environment in the global g_env, then calls run() between a
 * 'Starting' and an 'Ending' log line.
 *
 * @param p_env  environment object; info() is called on it here
 * @param p_args not read by this function
 */
function main(p_env, p_args) {
  var announce = function (phase) {
    g_env.info(phase);
  };

  g_env = p_env;
  announce('Starting');
  run();
  announce('Ending');
}

/**
 * Grabs result pages 1..10 for the fixed query 'lucene' and logs every hit as
 * "<running number> | <title> | <link>".
 *
 * The running number assumes ten hits per page. Any exception is reported via
 * g_env.error() and ends the whole loop.
 */
function run() {
  try {
    var query = 'lucene';
    for (var page = 1; page <= 10; page++) {
      var hits = grab(query, page);
      var offset = (page - 1) * 10;
      for (var k = 0; k < hits.size(); k++) {
        var hit = hits.get(k);
        var title = hit.get('title');
        var link = hit.get('link');
        g_env.info((offset + k + 1) + ' | ' + title + ' | ' + link);
      }
    }
  } catch (e) {
    g_env.error(e);
  }
}

/**
 * Fetches one Bing result page and extracts its title/link pairs.
 *
 * @param query  search phrase; URL-encoded via g_env.encodeURL()
 * @param pageno 1-based page number; converted to the 'first' offset (1, 11, 21, ...)
 * @returns the list created by g_env.newArrayList(), holding one map per
 *          result with the keys 'title' and 'link'. If fetching or parsing
 *          throws, the error is reported through g_env.error() and the
 *          results collected so far are returned.
 */
function grab(query, pageno) {
  var tag = g_env.newArrayList();
  try {
    var url = 'http://www.bing.com/search?q=' + g_env.encodeURL(query, 'UTF-8') + '&first=' + ((pageno - 1) * 10 + 1);
    var conn = g_env.newJsoup().connect(url);
    conn.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1');
    conn.timeout(60000);
    var doc = conn.get();
    var nodes = doc.select('#results .sa_wr');
    for (var i = 0; i < nodes.size(); i++) {
      var child = nodes.get(i).select('.sa_cc .sa_mc .sb_tlst a').first();
      // A result block without a title anchor is skipped; dereferencing the
      // missing anchor would throw and discard the rest of the page.
      if (child == null) continue;
      var it = g_env.newHashMap();
      it.put('title', child.text());
      it.put('link', child.attr('href'));
      tag.add(it);
    }
  } catch (e) {
    g_env.error(e);
  }
  return tag;
}

  Protected by Copyscape Online Copyright Protection

Tuesday, 24 July 2012

Grab search results from Google

Grab search results from Google
This task uses a JavaScript sandbox with jsoup support to grab search results from Google.
Grab search results from Google
  1. Create javascript sandbox with jsoup support
  2. Create the JavaScript as follows:
javascript
1var g_env;
2
3function main(p_env, p_args) {
4 g_env = p_env;
5 g_env.info('Starting');
6 run();
7 g_env.info('Ending');
8}
9
10function run() {
11 try {
12 var query = 'lucene';
13 for (var pn = 1; pn <= 10; pn++) {
14 var res = grab(query, pn);
15 for (var i = 0; i < res.size(); i++) {
16 var it = res.get(i);
17 var title = it.get('title');
18 var link = it.get('link');
19 var no = (pn - 1) * 10 + i + 1;
20 g_env.info(no + ' | ' + title + ' | ' + link);
21 }
22 }
23 } catch (e) {
24 g_env.error(e);
25 }
26}
27
28function grab(query, pageno) {
29 var tag = g_env.newArrayList();
30 try {
31 var url = 'http://google.com/search?q=' + g_env.encodeURL(query, 'UTF-8') + '&start=' + ((pageno - 1) * 10);
32 var conn = g_env.newJsoup().connect(url);
33 conn.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1');
34 conn.timeout(60000);
35 var doc = conn.get();
36 var nodes = doc.select('#rso .g');
37 for (var i = 0; i < nodes.size(); i++) {
38 var node = nodes.get(i);
39 var child = node.select('.vsc .r .l');
40 var title = child.text();
41 var link = child.attr('href');
42 var it = g_env.newHashMap();
43 it.put('title', title);
44 it.put('link', link);
45 tag.add(it);
46 }
47 } catch (e) {
48 g_env.error(e);
49 }
50 return tag;
51}
// Sandbox environment handle; assigned by main() and used by run() and grab().
var g_env;

/**
 * Script entry point.
 *
 * Stores the environment in the global g_env, then calls run() between a
 * 'Starting' and an 'Ending' log line.
 *
 * @param p_env  environment object; info() is called on it here
 * @param p_args not read by this function
 */
function main(p_env, p_args) {
  var announce = function (phase) {
    g_env.info(phase);
  };

  g_env = p_env;
  announce('Starting');
  run();
  announce('Ending');
}

/**
 * Grabs result pages 1..10 for the fixed query 'lucene' and logs every hit as
 * "<running number> | <title> | <link>".
 *
 * The running number assumes ten hits per page. Any exception is reported via
 * g_env.error() and ends the whole loop.
 */
function run() {
  try {
    var query = 'lucene';
    for (var page = 1; page <= 10; page++) {
      var hits = grab(query, page);
      var offset = (page - 1) * 10;
      for (var k = 0; k < hits.size(); k++) {
        var hit = hits.get(k);
        var title = hit.get('title');
        var link = hit.get('link');
        g_env.info((offset + k + 1) + ' | ' + title + ' | ' + link);
      }
    }
  } catch (e) {
    g_env.error(e);
  }
}

/**
 * Fetches one Google result page and extracts its title/link pairs.
 *
 * @param query  search phrase; URL-encoded via g_env.encodeURL()
 * @param pageno 1-based page number; converted to the 'start' offset (0, 10, 20, ...)
 * @returns the list created by g_env.newArrayList(), holding one map per
 *          result with the keys 'title' and 'link'. If fetching or parsing
 *          throws, the error is reported through g_env.error() and the
 *          results collected so far are returned.
 */
function grab(query, pageno) {
  var tag = g_env.newArrayList();
  try {
    var url = 'http://google.com/search?q=' + g_env.encodeURL(query, 'UTF-8') + '&start=' + ((pageno - 1) * 10);
    var conn = g_env.newJsoup().connect(url);
    conn.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1');
    conn.timeout(60000);
    var doc = conn.get();
    var nodes = doc.select('#rso .g');
    for (var i = 0; i < nodes.size(); i++) {
      // Use only the first matching anchor so that title and link always come
      // from the same element, and skip blocks that have none instead of
      // recording an empty entry.
      var child = nodes.get(i).select('.vsc .r .l').first();
      if (child == null) continue;
      var it = g_env.newHashMap();
      it.put('title', child.text());
      it.put('link', child.attr('href'));
      tag.add(it);
    }
  } catch (e) {
    g_env.error(e);
  }
  return tag;
}

  Protected by Copyscape Online Copyright Protection

Saturday, 21 July 2012

Grab article from ScienceDirect

Grab article from ScienceDirect
This task uses a JavaScript sandbox with jsoup and Lucene support to grab articles from ScienceDirect.
Grab article from ScienceDirect
  1. Create javascript sandbox with jsoup support
  2. Add Lucene support to javascript sandbox
  3. Create the JavaScript as follows:
javascript
1var g_title = '';
2var g_cache = true;
3var g_site = 'sciencedirect.com';
4var g_env;
5var g_cookie;
6
7function main(p_env, p_args) {
8 g_env = p_env;
9 run();
10}
11
12function newEntity() {
13 return g_env.newEntity();
14}
15
16function loadUrlCookieStart(url) {
17 var conn = g_env.newJsoup().connect(url);
18 conn.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1');
19 conn.timeout(60000);
20 var tag = conn.get();
21 g_cookie = conn.getCookies();
22 return tag;
23}
24
25function loadUrlCookie(url) {
26 var conn = g_env.newJsoup().connect(url);
27 conn.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1');
28 conn.timeout(60000);
29 conn.cookies(g_cookie);
30 return conn.get();
31}
32
33function loadUrl(url) {
34 var conn = g_env.newJsoup().connect(url);
35 conn.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1');
36 conn.timeout(60000);
37 return conn.get();
38}
39
40function run() {
41 g_env.info('Starting');
42 if (g_env.newString(g_title).length() > 0) {
43 grabTitle(g_title);
44 } else {
45 if (!g_cache) {
46 clearCache();
47 }
48 var rs = loadTitleFresh();
49 while (rs.size() > 0) {
50 for (var i = 0; i < rs.size(); i++) {
51 var et = rs.get(i);
52 grabTitle(et.getString('link'));
53 }
54 rs = loadTitleFresh();
55 }
56 }
57 g_env.info('Ending');
58}
59
60function grabTitle(link) {
61 var et = findTitleByLink(link);
62 if (et == null) return;
63 var kind = et.getString('kind');
64 if (kind == 'Book') {
65 grabBook(et.getString('title'), et.getString('link'));
66 }
67 if (kind == 'Book Series') {
68 grabBookSeries(et.getString('title'), et.getString('link'));
69 }
70 if (kind == 'Journal') {
71 grabJournal(et.getString('title'), et.getString('link'));
72 }
73 et.setMark('crawled');
74 et.save();
75}
76
77function grabJournal(p_title, p_link) {
78 try {
79 var doc = loadUrl(p_link);
80 var vols_link = g_env.newArrayList();
81 var vols_title = g_env.newArrayList();
82 var rows = doc.select('#volumeIssueData .txtBold a');
83 for (var i = 0; i < rows.size(); i++) {
84 var child = rows.get(i);
85 var title = child.text();
86 var link = child.attr('href');
87 link = g_env.newString(g_env.newURL(g_env.newURL(p_link), link) + '');
88 vols_link.add(link);
89 vols_title.add(title);
90 }
91 for (var i = 0; i < vols_link.size(); i++) {
92 var titleV = vols_title.get(i);
93 var linkV = vols_link.get(i);
94 try {
95 doc = loadUrlCookieStart(linkV);
96 rows = doc.select('#bodyMainResults .resultRow');
97 for (var j = 0; j < rows.size(); j++) {
98 var row = rows.get(j);
99 child = row.select('.cLink').first();
100 if (child == null) continue;
101 var title = child.text();
102 var link = child.attr('href');
103 link = g_env.newString(g_env.newURL(g_env.newURL(p_link), link) + '');
104 var desc = '';
105 try {
106 var cdoc = loadUrlCookie(link);
107 child = cdoc.select('#section_abstract').first();
108 if (child != null) {
109 child = child.parent();
110 desc = child.text();
111 if (desc.indexOf('Abstract') == 0) {
112 desc = desc.substring(8);
113 }
114 if (desc.indexOf('Summary') == 0) {
115 desc = desc.substring(7);
116 }
117 }
118 } catch (e) {
119 g_env.error(e);
120 }
121 saveArticle(title + ' | ' + titleV + ' | ' + p_title, link, desc);
122 }
123 } catch (e) {
124 g_env.error(e);
125 }
126 }
127 } catch (e) {
128 g_env.error(e);
129 }
130}
131
132function grabBook(p_title, p_link) {
133 try {
134 var doc = loadUrlCookieStart(p_link);
135 var rows = doc.select('.contentMain .nonSerialResultsList .cLink');
136 for (var j = 0; j < rows.size(); j++) {
137 var row = rows.get(j);
138 child = row.select('.cLink').first();
139 if (child == null) continue;
140 var title = child.text();
141 var link = child.attr('href');
142 link = g_env.newString(g_env.newURL(g_env.newURL(p_link), link) + '');
143 var desc = '';
144 try {
145 var cdoc = loadUrlCookie(link);
146 child = cdoc.select('#section_abstract').first();
147 if (child != null) {
148 child = child.parent();
149 desc = child.text();
150 if (desc.indexOf('Abstract') == 0) {
151 desc = desc.substring(8);
152 }
153 if (desc.indexOf('Summary') == 0) {
154 desc = desc.substring(7);
155 }
156 }
157 } catch (e) {
158 g_env.error(e);
159 }
160 saveArticle(title + ' | ' + p_title, link, desc);
161 }
162 } catch (e) {
163 g_env.error(e);
164 }
165}
166
167function grabBookSeries(p_title, p_link) {
168 try {
169 var doc = loadUrl(p_link);
170 var vols_link = g_env.newArrayList();
171 var vols_title = g_env.newArrayList();
172 var rows = doc.select('#volumeIssueData .txt');
173 for (var i = 0; i < rows.size(); i++) {
174 var row = rows.get(i);
175 child = row.select('a').first();
176 var title = '';
177 var link = '';
178 if (child == null) {
179 child = row.select('span').first();
180 if (child == null) continue;
181 title = child.text();
182 link = p_link;
183 } else {
184 title = child.text();
185 link = child.attr('href');
186 link = g_env.newString(g_env.newURL(g_env.newURL(p_link), link) + '');
187 }
188 vols_link.add(link);
189 vols_title.add(title);
190 }
191 for (var i = 0; i < vols_link.size(); i++) {
192 var titleV = vols_title.get(i);
193 var linkV = vols_link.get(i);
194 try {
195 doc = loadUrlCookieStart(linkV);
196 rows = doc.select('#bodyMainResults .resultRow');
197 for (var j = 0; j < rows.size(); j++) {
198 var row = rows.get(j);
199 child = row.select('.cLink').first();
200 if (child == null) continue;
201 var title = child.text();
202 var link = child.attr('href');
203 link = g_env.newString(g_env.newURL(g_env.newURL(p_link), link) + '');
204 var desc = '';
205 try {
206 var cdoc = loadUrlCookie(link);
207 child = cdoc.select('#section_abstract').first();
208 if (child != null) {
209 child = child.parent();
210 desc = child.text();
211 if (desc.indexOf('Abstract') == 0) {
212 desc = desc.substring(8);
213 }
214 if (desc.indexOf('Summary') == 0) {
215 desc = desc.substring(7);
216 }
217 }
218 } catch (e) {
219 g_env.error(e);
220 }
221 saveArticle(title + ' | ' + titleV + ' | ' + p_title, link, desc);
222 }
223 } catch (e) {
224 g_env.error(e);
225 }
226 }
227 } catch (e) {
228 g_env.error(e);
229 }
230}
231
232function saveArticle(title, link, desc) {
233 var src = findLink(link);
234 if (src != null) return;
235 var schema = 's|url|a|title|a|desc|s|fixed|d|score|s|site|s|inbound|s|code';
236 var entity = newEntity();
237 entity.setSchema(schema);
238 entity.setKind('Link');
239 entity.setId(g_env.uniqid());
240 entity.setString('url', link);
241 entity.setString('title', title);
242 entity.setString('desc', desc);
243 entity.setString('fixed', 'false');
244 entity.setString('inbound', '');
245 entity.setDouble('score', 0);
246 entity.setString('code', g_env.suniqid());
247 try {
248 var t_url = g_env.newURL(link);
249 var t_host = t_url.getHost();
250 entity.setString('site', t_host);
251 } catch (e) {
252 g_env.error(e);
253 }
254 entity.save();
255
256 var op = '\r\nTitle: ' + title;
257 op += '\r\nLink: ' + link;
258 op += '\r\nDesc: ' + desc;
259 g_env.info(op);
260}
261
262function clearCache() {
263 g_env.info('Start clearing cache');
264 var rs = loadTitleCrawled();
265 while (rs.size() > 0) {
266 for (var i = 0; i < rs.size(); i++) {
267 var et = rs.get(i);
268 et.setMark('');
269 et.save();
270 }
271 rs = loadTitleCrawled();
272 }
273 g_env.info('End clearing cache');
274}
275
276function loadTitleCrawled() {
277 var pat = newEntity();
278 var bq = pat.newBooleanQuery();
279 bq.add(pat.newBooleanClause(pat.newTermQuery(pat.newTerm(pat.MARK, 'crawled')), pat.occurMust()));
280 var rs = pat.search(g_site + '_Title', bq, 10);
281 return rs;
282}
283
284function loadTitleFresh() {
285 var pat = newEntity();
286 var bq = pat.newBooleanQuery();
287 bq.add(pat.newBooleanClause(pat.newMatchAllDocsQuery(), pat.occurMust()));
288 bq.add(pat.newBooleanClause(pat.newTermQuery(pat.newTerm(pat.MARK, 'crawled')), pat.occurMustNot()));
289 var rs = pat.search(g_site + '_Title', bq, 10);
290 return rs;
291}
292
293function findTitleByLink(link) {
294 var pat = newEntity();
295 var res = pat.search(g_site + '_Title', pat.newTermQuery(pat.newTerm('link', link)), 1);
296 if (res.size() == 0) return null;
297 return res.get(0);
298}
299
300function findLink(link) {
301 var pat = newEntity();
302 var res = pat.search('Link', pat.newTermQuery(pat.newTerm('url', link)), 1);
303 if (res.size() == 0) return null;
304 return res.get(0);
305}
// Link of a single title to crawl; when empty, run() processes every title
// that is not yet marked 'crawled'.
var g_title = '';
// When false, run() calls clearCache() before crawling.
var g_cache = true;
// Site name; g_site + '_Title' is the entity kind searched for titles.
var g_site = 'sciencedirect.com';
// Sandbox environment handle; assigned by main().
var g_env;
// Cookies captured by loadUrlCookieStart() and sent by loadUrlCookie().
var g_cookie;

/**
 * Script entry point: stores the environment in the global g_env and starts
 * the crawl via run().
 *
 * @param p_env  environment object kept in g_env
 * @param p_args not read by this function
 */
function main(p_env, p_args) {
  var sandboxEnv = p_env;
  g_env = sandboxEnv;
  run();
}

/**
 * Shorthand for g_env.newEntity().
 *
 * @returns whatever g_env.newEntity() returns
 */
function newEntity() {
  var entity = g_env.newEntity();
  return entity;
}

/**
 * Fetches a page and stores the connection's cookies in the global g_cookie
 * so that loadUrlCookie() can send them with later requests.
 *
 * @param url address to fetch
 * @returns the document returned by the connection's get()
 */
function loadUrlCookieStart(url) {
  var jsoup = g_env.newJsoup();
  var request = jsoup.connect(url);
  request.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1');
  request.timeout(60000);
  var page = request.get();
  g_cookie = request.getCookies();
  return page;
}

/**
 * Fetches a page, sending the cookies currently held in the global g_cookie.
 *
 * @param url address to fetch
 * @returns the document returned by the connection's get()
 */
function loadUrlCookie(url) {
  var jsoup = g_env.newJsoup();
  var request = jsoup.connect(url);
  request.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1');
  request.timeout(60000);
  request.cookies(g_cookie);
  var page = request.get();
  return page;
}

/**
 * Fetches a page without sending any stored cookies.
 *
 * @param url address to fetch
 * @returns the document returned by the connection's get()
 */
function loadUrl(url) {
  var jsoup = g_env.newJsoup();
  var request = jsoup.connect(url);
  request.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1');
  request.timeout(60000);
  var page = request.get();
  return page;
}

/**
 * Drives the crawl between a 'Starting' and an 'Ending' log line.
 *
 * If g_title is non-empty, only that title is grabbed. Otherwise the crawl
 * marks are cleared first when g_cache is false, and then batches from
 * loadTitleFresh() are processed until a batch comes back empty.
 */
function run() {
  g_env.info('Starting');
  var singleTitle = g_env.newString(g_title).length() > 0;
  if (singleTitle) {
    grabTitle(g_title);
  } else {
    if (!g_cache) {
      clearCache();
    }
    // grabTitle() marks each title 'crawled', so each reload returns the
    // next set of titles that are still unmarked.
    for (var batch = loadTitleFresh(); batch.size() > 0; batch = loadTitleFresh()) {
      for (var k = 0; k < batch.size(); k++) {
        var entry = batch.get(k);
        grabTitle(entry.getString('link'));
      }
    }
  }
  g_env.info('Ending');
}

/**
 * Crawls the stored title identified by its link and then marks it 'crawled'.
 *
 * The grabber is chosen by the title's 'kind' field ('Book', 'Book Series' or
 * 'Journal'). A title of any other kind is still marked. Nothing happens when
 * no stored title has this link.
 *
 * @param link value of the stored title's 'link' field
 */
function grabTitle(link) {
  var et = findTitleByLink(link);
  if (et != null) {
    // Loose equality is kept on purpose: the value comes from the entity and
    // may not be a primitive JS string.
    var kind = et.getString('kind');
    if (kind == 'Book') {
      grabBook(et.getString('title'), et.getString('link'));
    } else if (kind == 'Book Series') {
      grabBookSeries(et.getString('title'), et.getString('link'));
    } else if (kind == 'Journal') {
      grabJournal(et.getString('title'), et.getString('link'));
    }
    et.setMark('crawled');
    et.save();
  }
}

/**
 * Crawls every listed volume/issue of a journal and saves its articles.
 *
 * Pass 1 collects the issue links found on the journal page. Pass 2 opens
 * each issue (capturing its cookies), visits every article row, tries to
 * read the abstract and hands the result to saveArticle() with the title
 * "<article> | <issue> | <journal>".
 *
 * Errors are reported through g_env.error() at three levels: a failing
 * abstract fetch leaves the description empty, a failing issue is skipped,
 * and a failure outside the issue loop ends the function.
 *
 * @param p_title journal title, appended to each saved article title
 * @param p_link  journal page URL; also the base for resolving hrefs
 */
function grabJournal(p_title, p_link) {
  try {
    var journalPage = loadUrl(p_link);
    var vols_link = g_env.newArrayList();
    var vols_title = g_env.newArrayList();

    // Pass 1: remember title and absolute link of every listed issue.
    var issueAnchors = journalPage.select('#volumeIssueData .txtBold a');
    for (var a = 0; a < issueAnchors.size(); a++) {
      var issueAnchor = issueAnchors.get(a);
      var issueTitle = issueAnchor.text();
      var issueHref = issueAnchor.attr('href');
      vols_link.add(g_env.newString(g_env.newURL(g_env.newURL(p_link), issueHref) + ''));
      vols_title.add(issueTitle);
    }

    // Pass 2: walk the articles of each issue.
    for (var v = 0; v < vols_link.size(); v++) {
      var titleV = vols_title.get(v);
      var linkV = vols_link.get(v);
      try {
        var issuePage = loadUrlCookieStart(linkV);
        var resultRows = issuePage.select('#bodyMainResults .resultRow');
        for (var r = 0; r < resultRows.size(); r++) {
          var articleAnchor = resultRows.get(r).select('.cLink').first();
          if (articleAnchor != null) {
            var title = articleAnchor.text();
            var href = articleAnchor.attr('href');
            var link = g_env.newString(g_env.newURL(g_env.newURL(p_link), href) + '');
            var desc = '';
            try {
              var abstractNode = loadUrlCookie(link).select('#section_abstract').first();
              if (abstractNode != null) {
                // The text of the abstract's parent is used; a leading
                // 'Abstract' and then a leading 'Summary' are cut off.
                desc = abstractNode.parent().text();
                if (desc.indexOf('Abstract') == 0) {
                  desc = desc.substring(8);
                }
                if (desc.indexOf('Summary') == 0) {
                  desc = desc.substring(7);
                }
              }
            } catch (e) {
              g_env.error(e);
            }
            saveArticle(title + ' | ' + titleV + ' | ' + p_title, link, desc);
          }
        }
      } catch (e) {
        g_env.error(e);
      }
    }
  } catch (e) {
    g_env.error(e);
  }
}

/**
 * Crawls the chapters listed on a book page and saves each one.
 *
 * The book page is loaded with loadUrlCookieStart() so its cookies are reused
 * for the chapter requests. Each chapter is passed to saveArticle() with the
 * title "<chapter> | <book>" and, when it can be read, its abstract text.
 *
 * A failing abstract fetch is reported through g_env.error() and leaves the
 * description empty; any other failure is reported and ends the function.
 *
 * @param p_title book title, appended to each saved chapter title
 * @param p_link  book page URL; also the base for resolving hrefs
 */
function grabBook(p_title, p_link) {
  try {
    // Declared locally: assigning it undeclared would create a global, and
    // throws a ReferenceError under strict mode.
    var child;
    var doc = loadUrlCookieStart(p_link);
    var rows = doc.select('.contentMain .nonSerialResultsList .cLink');
    for (var j = 0; j < rows.size(); j++) {
      var row = rows.get(j);
      child = row.select('.cLink').first();
      if (child == null) continue;
      var title = child.text();
      var link = child.attr('href');
      link = g_env.newString(g_env.newURL(g_env.newURL(p_link), link) + '');
      var desc = '';
      try {
        var cdoc = loadUrlCookie(link);
        child = cdoc.select('#section_abstract').first();
        if (child != null) {
          // The text of the abstract's parent is used; a leading 'Abstract'
          // and then a leading 'Summary' are cut off.
          child = child.parent();
          desc = child.text();
          if (desc.indexOf('Abstract') == 0) {
            desc = desc.substring(8);
          }
          if (desc.indexOf('Summary') == 0) {
            desc = desc.substring(7);
          }
        }
      } catch (e) {
        g_env.error(e);
      }
      saveArticle(title + ' | ' + p_title, link, desc);
    }
  } catch (e) {
    g_env.error(e);
  }
}

/**
 * Crawls every listed volume of a book series and saves its chapters.
 *
 * Pass 1 collects one entry per volume row: rows with an anchor contribute
 * the anchor's resolved link, rows with only a span contribute p_link itself,
 * and rows with neither are skipped. Pass 2 opens each volume (capturing its
 * cookies), visits every result row, tries to read the abstract and hands the
 * result to saveArticle() with the title "<chapter> | <volume> | <series>".
 *
 * Errors are reported through g_env.error() at three levels: a failing
 * abstract fetch leaves the description empty, a failing volume is skipped,
 * and a failure outside the volume loop ends the function.
 *
 * @param p_title series title, appended to each saved chapter title
 * @param p_link  series page URL; also the base for resolving hrefs
 */
function grabBookSeries(p_title, p_link) {
  try {
    // Declared locally: assigning it undeclared would create a global, and
    // throws a ReferenceError under strict mode.
    var child;
    var doc = loadUrl(p_link);
    var vols_link = g_env.newArrayList();
    var vols_title = g_env.newArrayList();
    var rows = doc.select('#volumeIssueData .txt');
    for (var i = 0; i < rows.size(); i++) {
      var row = rows.get(i);
      child = row.select('a').first();
      var title = '';
      var link = '';
      if (child == null) {
        // No anchor in this row: fall back to the span text and the series
        // page itself as the volume link.
        child = row.select('span').first();
        if (child == null) continue;
        title = child.text();
        link = p_link;
      } else {
        title = child.text();
        link = child.attr('href');
        link = g_env.newString(g_env.newURL(g_env.newURL(p_link), link) + '');
      }
      vols_link.add(link);
      vols_title.add(title);
    }
    for (var i = 0; i < vols_link.size(); i++) {
      var titleV = vols_title.get(i);
      var linkV = vols_link.get(i);
      try {
        doc = loadUrlCookieStart(linkV);
        rows = doc.select('#bodyMainResults .resultRow');
        for (var j = 0; j < rows.size(); j++) {
          var row = rows.get(j);
          child = row.select('.cLink').first();
          if (child == null) continue;
          var title = child.text();
          var link = child.attr('href');
          link = g_env.newString(g_env.newURL(g_env.newURL(p_link), link) + '');
          var desc = '';
          try {
            var cdoc = loadUrlCookie(link);
            child = cdoc.select('#section_abstract').first();
            if (child != null) {
              // The text of the abstract's parent is used; a leading
              // 'Abstract' and then a leading 'Summary' are cut off.
              child = child.parent();
              desc = child.text();
              if (desc.indexOf('Abstract') == 0) {
                desc = desc.substring(8);
              }
              if (desc.indexOf('Summary') == 0) {
                desc = desc.substring(7);
              }
            }
          } catch (e) {
            g_env.error(e);
          }
          saveArticle(title + ' | ' + titleV + ' | ' + p_title, link, desc);
        }
      } catch (e) {
        g_env.error(e);
      }
    }
  } catch (e) {
    g_env.error(e);
  }
}

/**
 * Stores an article as a 'Link' entity unless one with the same URL exists,
 * then logs its title, link and description.
 *
 * The 'site' field is set to the host of the link. If the link cannot be
 * parsed, the error is reported through g_env.error() and the entity is
 * saved without that field.
 *
 * @param title article title to store
 * @param link  article URL; also the key of the duplicate check
 * @param desc  article description (may be empty)
 */
function saveArticle(title, link, desc) {
  if (findLink(link) != null) return;

  var entity = newEntity();
  entity.setSchema('s|url|a|title|a|desc|s|fixed|d|score|s|site|s|inbound|s|code');
  entity.setKind('Link');
  entity.setId(g_env.uniqid());
  entity.setString('url', link);
  entity.setString('title', title);
  entity.setString('desc', desc);
  entity.setString('fixed', 'false');
  entity.setString('inbound', '');
  entity.setDouble('score', 0);
  entity.setString('code', g_env.suniqid());
  try {
    entity.setString('site', g_env.newURL(link).getHost());
  } catch (e) {
    g_env.error(e);
  }
  entity.save();

  g_env.info('\r\nTitle: ' + title + '\r\nLink: ' + link + '\r\nDesc: ' + desc);
}

/**
 * Resets the mark of every title currently marked 'crawled'.
 *
 * Titles are reloaded with loadTitleCrawled() after each batch, and each one
 * has its mark set to '' and is saved, until a reload returns nothing.
 * Progress is logged at start and end.
 */
function clearCache() {
  g_env.info('Start clearing cache');
  for (var batch = loadTitleCrawled(); batch.size() > 0; batch = loadTitleCrawled()) {
    for (var k = 0; k < batch.size(); k++) {
      var entry = batch.get(k);
      entry.setMark('');
      entry.save();
    }
  }
  g_env.info('End clearing cache');
}

/**
 * Searches for up to 10 stored titles of this site whose mark is 'crawled'.
 *
 * @returns the result of the entity search
 */
function loadTitleCrawled() {
  var pat = newEntity();
  var query = pat.newBooleanQuery();
  var crawledTerm = pat.newTerm(pat.MARK, 'crawled');
  var crawledQuery = pat.newTermQuery(crawledTerm);
  query.add(pat.newBooleanClause(crawledQuery, pat.occurMust()));
  return pat.search(g_site + '_Title', query, 10);
}

/**
 * Searches for up to 10 stored titles of this site that are not marked
 * 'crawled' (match-all combined with a must-not clause on the mark).
 *
 * @returns the result of the entity search
 */
function loadTitleFresh() {
  var pat = newEntity();
  var query = pat.newBooleanQuery();

  var everything = pat.newMatchAllDocsQuery();
  query.add(pat.newBooleanClause(everything, pat.occurMust()));

  var crawledTerm = pat.newTerm(pat.MARK, 'crawled');
  var crawledQuery = pat.newTermQuery(crawledTerm);
  query.add(pat.newBooleanClause(crawledQuery, pat.occurMustNot()));

  return pat.search(g_site + '_Title', query, 10);
}

/**
 * Looks up the stored title of this site whose 'link' field equals the
 * given value.
 *
 * @param link value to match against the 'link' field
 * @returns the first search hit, or null when the search returns nothing
 */
function findTitleByLink(link) {
  var pat = newEntity();
  var kindName = g_site + '_Title';
  var hits = pat.search(kindName, pat.newTermQuery(pat.newTerm('link', link)), 1);
  return hits.size() == 0 ? null : hits.get(0);
}

/**
 * Looks up the stored 'Link' entity whose 'url' field equals the given value.
 *
 * @param link value to match against the 'url' field
 * @returns the first search hit, or null when the search returns nothing
 */
function findLink(link) {
  var pat = newEntity();
  var urlQuery = pat.newTermQuery(pat.newTerm('url', link));
  var hits = pat.search('Link', urlQuery, 1);
  return hits.size() == 0 ? null : hits.get(0);
}

  Protected by Copyscape Online Copyright Protection

Grab book/journal from ScienceDirect

Grab book/journal from ScienceDirect
This task uses a JavaScript sandbox with jsoup and Lucene support to grab books and journals from ScienceDirect.
Grab book/journal from ScienceDirect
  1. Create javascript sandbox with jsoup support
  2. Add Lucene support to javascript sandbox
  3. Create the JavaScript as follows:
javascript
1var g_site = 'sciencedirect.com';
2var g_env;
3
4function main(p_env, p_args) {
5 g_env = p_env;
6 run();
7}
8
9function newEntity() {
10 return g_env.newEntity();
11}
12
13function loadUrl(url) {
14 var conn = g_env.newJsoup().connect(url);
15 conn.userAgent('Mozilla/5.0 (Windows NT x.y; rv:10.0.1) Gecko/20100101 Firefox/10.0.1');
16 conn.timeout(60000);
17 return conn.get();
18}
19
20function run() {
21 g_env.info('Starting');
22 grabCategory('http://www.sciencedirect.com/science/browse/sub/physicalsciences');
23 grabCategory('http://www.sciencedirect.com/science/browse/sub/chemicaleng');
24 grabCategory('http://www.sciencedirect.com/science/browse/sub/chemistry');
25 grabCategory('http://www.sciencedirect.com/science/browse/sub/computerscience');
26 grabCategory('http://www.sciencedirect.com/science/browse/sub/earth');
27 grabCategory('http://www.sciencedirect.com/science/browse/sub/energy');
28 grabCategory('http://www.sciencedirect.com/science/browse/sub/engineering');
29 grabCategory('http://www.sciencedirect.com/science/browse/sub/materialsscience');
30 grabCategory('http://www.sciencedirect.com/science/browse/sub/mathematics');
31 grabCategory('http://www.sciencedirect.com/science/browse/sub/physics');
32 grabCategory('http://www.sciencedirect.com/science/browse/sub/lifesciences');
33 grabCategory('http://www.sciencedirect.com/science/browse/sub/agribio');
34 grabCategory('http://www.sciencedirect.com/science/browse/sub/biochemgenmolbiol');
35 grabCategory('http://www.sciencedirect.com/science/browse/sub/environmental');
36 grabCategory('http://www.sciencedirect.com/science/browse/sub/immunolmicrobiol');
37 grabCategory('http://www.sciencedirect.com/science/browse/sub/neuroscience');
38 grabCategory('http://www.sciencedirect.com/science/browse/sub/healthsciences');
39 grabCategory('http://www.sciencedirect.com/science/browse/sub/medicinedentistry');
40 grabCategory('http://www.sciencedirect.com/science/browse/sub/nursinghealth');
41 grabCategory('http://www.sciencedirect.com/science/browse/sub/pharmatox');
42 grabCategory('http://www.sciencedirect.com/science/browse/sub/vetscimed');
43 grabCategory('http://www.sciencedirect.com/science/browse/sub/socialscienceshumanities');
44 grabCategory('http://www.sciencedirect.com/science/browse/sub/artsandhumanities');
45 grabCategory('http://www.sciencedirect.com/science/browse/sub/busmanacc');
46 grabCategory('http://www.sciencedirect.com/science/browse/sub/decisionsciences');
47 grabCategory('http://www.sciencedirect.com/science/browse/sub/economics');
48 grabCategory('http://www.sciencedirect.com/science/browse/sub/psychology');
49 grabCategory('http://www.sciencedirect.com/science/browse/sub/socialsciences');
50 g_env.info('Ending');
51}
52
53function grabCategory(cat) {
54 try {
55 var pages = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0-9'];
56 for (var pn = 0; pn < pages.length; pn++) {
57 try {
58 var url = cat + '/' + pages[pn];
59 var doc = loadUrl(url);
60 var rows = doc.select('#content_browseimp tr.browseimpBrowseRow');
61 for (var i = 0; i < rows.size(); i++) {
62 var row = rows.get(i);
63 var title = g_env.newString('');
64 var link = g_env.newString('');
65 var kind = g_env.newString('');
66 var child = row.select('.browseColFirst a').first();
67 if (child != null) {
68 title = child.text();
69 link = child.attr('href');
70 link = g_env.newString(g_env.newURL(g_env.newURL(url), link) + '');
71 }
72 var child = row.select('.browseColFourth').first();
73 if (child != null) {
74 kind = child.text().trim();
75 }
76 if (title.length() > 0 && link.length() > 0 && kind.length() > 0) {
77 saveTitle(title, link, kind);
78 }
79 }
80 } catch (e) {
81 g_env.error(e);
82 }
83 }
84 } catch (e) {
85 g_env.error(e);
86 }
87}
88
89function saveTitle(title, link, kind) {
90 if (findTitleByLink(link) != null) return;
91 var schema = 's|link|s|title|s|kind';
92 var entity = newEntity();
93 entity.setSchema(schema);
94 entity.setKind(g_site + '_Title');
95 entity.setId(g_env.uniqid());
96 entity.setString('link', link);
97 entity.setString('title', title);
98 entity.setString('kind', kind);
99 entity.save();
100 g_env.info(kind + ' | ' + title + ' | ' + link);
101}
102
103function findTitleByLink(link) {
104 var pat = newEntity();
105 var res = pat.search(g_site + '_Title', pat.newTermQuery(pat.newTerm('link', link)), 1);
106 if (res.size() == 0) return null;
107 return res.get(0);
108}
// Site name; combined with '_Title' to form the entity kind of stored titles.
var g_site = 'sciencedirect.com';
// Sandbox environment handle; assigned by main().
var g_env;

/**
 * Script entry point: stores the environment in the global g_env and starts
 * the crawl via run().
 *
 * @param p_env  environment object kept in g_env
 * @param p_args not read by this function
 */
function main(p_env, p_args) {
  var sandboxEnv = p_env;
  g_env = sandboxEnv;
  run();
}

/**
 * Shorthand for g_env.newEntity().
 *
 * @returns whatever g_env.newEntity() returns
 */
function newEntity() {
  var entity = g_env.newEntity();
  return entity;
}

/**
 * Fetches a page through the environment's jsoup wrapper.
 *
 * Opens a connection to `url`, sets a fixed Firefox user-agent string and a
 * timeout of 60000 on it, and returns the result of get().
 *
 * @param url address to fetch
 * @returns the object returned by the connection's get() (the callers use select() on it)
 */
function loadUrl(url) {
  var userAgent = 'Mozilla/5.0 (Windows NT x.y; rv:10.0.1) Gecko/20100101 Firefox/10.0.1';
  var timeoutMillis = 60000;
  var request = g_env.newJsoup().connect(url);
  request.userAgent(userAgent);
  request.timeout(timeoutMillis);
  return request.get();
}

/**
 * Crawls every ScienceDirect subject category in a fixed order.
 *
 * Logs 'Starting', calls grabCategory() once per subject URL (28 in total,
 * all under http://www.sciencedirect.com/science/browse/sub/), then logs
 * 'Ending'.
 */
function run() {
  var base = 'http://www.sciencedirect.com/science/browse/sub/';
  var subjects = [
    'physicalsciences',
    'chemicaleng',
    'chemistry',
    'computerscience',
    'earth',
    'energy',
    'engineering',
    'materialsscience',
    'mathematics',
    'physics',
    'lifesciences',
    'agribio',
    'biochemgenmolbiol',
    'environmental',
    'immunolmicrobiol',
    'neuroscience',
    'healthsciences',
    'medicinedentistry',
    'nursinghealth',
    'pharmatox',
    'vetscimed',
    'socialscienceshumanities',
    'artsandhumanities',
    'busmanacc',
    'decisionsciences',
    'economics',
    'psychology',
    'socialsciences'
  ];

  g_env.info('Starting');
  for (var k = 0; k < subjects.length; k++) {
    grabCategory(base + subjects[k]);
  }
  g_env.info('Ending');
}

/**
 * Walks the alphabetical index pages of one category and stores each listed title.
 *
 * For each of the 27 suffixes ('a'..'z' and '0-9') the page `cat + '/' + suffix`
 * is loaded, and every row matching '#content_browseimp tr.browseimpBrowseRow'
 * is read: the title and link come from the anchor in '.browseColFirst', the
 * kind from '.browseColFourth'. A row is passed to saveTitle() only when all
 * three values are non-empty.
 *
 * An error while handling one page is reported through g_env.error() and the
 * loop moves on to the next page.
 *
 * @param cat category URL without a trailing slash
 */
function grabCategory(cat) {
  try {
    var suffixes = 'abcdefghijklmnopqrstuvwxyz'.split('').concat('0-9');
    for (var s = 0; s < suffixes.length; s++) {
      try {
        var pageUrl = cat + '/' + suffixes[s];
        var rows = loadUrl(pageUrl).select('#content_browseimp tr.browseimpBrowseRow');
        for (var r = 0; r < rows.size(); r++) {
          var row = rows.get(r);
          var title = g_env.newString('');
          var link = g_env.newString('');
          var kind = g_env.newString('');

          var anchor = row.select('.browseColFirst a').first();
          if (anchor != null) {
            title = anchor.text();
            var href = anchor.attr('href');
            // Resolve the href against the page URL; + '' converts the URL object to a string.
            link = g_env.newString(g_env.newURL(g_env.newURL(pageUrl), href) + '');
          }

          var kindCell = row.select('.browseColFourth').first();
          if (kindCell != null) {
            kind = kindCell.text().trim();
          }

          var incomplete = title.length() == 0 || link.length() == 0 || kind.length() == 0;
          if (!incomplete) {
            saveTitle(title, link, kind);
          }
        }
      } catch (pageError) {
        g_env.error(pageError);
      }
    }
  } catch (outerError) {
    g_env.error(outerError);
  }
}

/**
 * Stores one title as an entity of kind `g_site + '_Title'`, unless an entity
 * with the same link is already found by findTitleByLink().
 *
 * After saving, logs "kind | title | link" through g_env.info().
 *
 * @param title title text
 * @param link  absolute link; also the key used for the duplicate check
 * @param kind  value taken from the row's fourth column
 */
function saveTitle(title, link, kind) {
  var existing = findTitleByLink(link);
  if (existing != null) {
    return;
  }

  var record = newEntity();
  record.setSchema('s|link|s|title|s|kind');
  record.setKind(g_site + '_Title');
  record.setId(g_env.uniqid());
  record.setString('link', link);
  record.setString('title', title);
  record.setString('kind', kind);
  record.save();

  var summary = kind + ' | ' + title + ' | ' + link;
  g_env.info(summary);
}

/**
 * Looks up a stored title entity by its link.
 *
 * Runs a term query on the 'link' field against kind `g_site + '_Title'`,
 * passing 1 as the third argument to search() (presumably a result limit —
 * confirm against the entity API).
 *
 * @param link link value to match
 * @returns the first hit, or null when the result list is empty
 */
function findTitleByLink(link) {
  var probe = newEntity();
  var term = probe.newTerm('link', link);
  var query = probe.newTermQuery(term);
  var hits = probe.search(g_site + '_Title', query, 1);
  return hits.size() == 0 ? null : hits.get(0);
}

  Protected by Copyscape Online Copyright Protection

Wednesday, 6 June 2012

Grab video from DailyMotion

Grab video from DailyMotion
This task uses a JavaScript sandbox with jsoup and Lucene support to grab videos from DailyMotion.
Grab video from DailyMotion
  1. Create javascript sandbox with jsoup support
  2. Add Lucene support to javascript sandbox
  3. Create the JavaScript as follows
javascript
1var env;
2var args;
3var maxpage = 1000;
4
5function main(penv, pargs) {
6 env = penv;
7 args = pargs;
8 env.info('Starting');
9 while (true) {
10 var queue_list = loadQueue();
11 env.info('Size: ' + queue_list.size());
12 while (queue_list.size() > 0) {
13 for (var i = 0; i < queue_list.size(); i++) {
14 var queue = queue_list.get(i);
15 grabVideo(queue);
16 queue.setString('crawled', 'true');
17 queue.save();
18 }
19 queue_list = loadQueue();
20 }
21 grabCategory('http://www.dailymotion.com/group/in_theaters_this_week/');
22 grabCategory('http://www.dailymotion.com/group/in_theaters_now/');
23 grabCategory('http://www.dailymotion.com/group/coming_soon/');
24 grabCategory('http://www.dailymotion.com/user/ReelzChannel/lang/en/search/movie+review/');
25 grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/interview+movie/channel/shortfilms/');
26 grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/movie+news/channel/shortfilms/');
27 grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/horror/channel/shortfilms/');
28 grabCategory('http://www.dailymotion.com/user/hollywoodtv/');
29 grabCategory('http://www.dailymotion.com/creative-official/tag/rock/lang/en/channel/music/');
30 grabCategory('http://www.dailymotion.com/creative-official/tag/pop/lang/en/channel/music/');
31 grabCategory('http://www.dailymotion.com/creative-official/tag/hop/lang/en/channel/music/');
32 grabCategory('http://www.dailymotion.com/creative-official/tag/alternative/lang/en/channel/music/');
33 grabCategory('http://www.dailymotion.com/creative-official/tag/dance/lang/en/channel/music/');
34 grabCategory('http://www.dailymotion.com/creative-official/tag/soul/lang/en/channel/music/');
35 grabCategory('http://www.dailymotion.com/creative-official/tag/latin/lang/en/channel/music/');
36 grabCategory('http://www.dailymotion.com/creative-official/tag/country/lang/en/channel/music/');
37 grabCategory('http://www.dailymotion.com/user/ClassicGameRoom/');
38 grabCategory('http://www.dailymotion.com/mychannel/ClassicGameRoom/');
39 grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/fight/channel/videogames/');
40 grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/strategy/channel/videogames/');
41 grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/shooter/channel/videogames/');
42 grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/action/channel/videogames/');
43 grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/sport/channel/videogames/');
44 grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/trailer/channel/videogames/');
45 grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/review/channel/videogames/');
46 grabCategory('http://www.dailymotion.com/creative-official/lang/en/search/nfl/');
47 grabCategory('http://www.dailymotion.com/creative-official/lang/en/search/nba/');
48 grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/nhl/channel/sport/');
49 grabCategory('http://www.dailymotion.com/user/TotalCollegeSports/');
50 grabCategory('http://www.dailymotion.com/user/UFC/');
51 grabCategory('http://www.dailymotion.com/user/sportsillustrated/');
52 grabCategory('http://www.dailymotion.com/user/transworld/');
53 grabCategory('http://www.dailymotion.com/user/rooftopcomedy/');
54 grabCategory('http://www.dailymotion.com/playlist/x1qzsd_MyDamnChannel_dicki/');
55 grabCategory('http://www.dailymotion.com/playlist/x1r5r4_MyDamnChannel_easy-to-assemble-season-3/');
56 grabCategory('http://www.dailymotion.com/user/Rhettandlink/');
57 grabCategory('http://www.dailymotion.com/user/epicmealtime/');
58 grabCategory('http://www.dailymotion.com/group/familyguy/');
59 grabCategory('http://www.dailymotion.com/creative/lang/en/channel/fun/');
60 grabCategory('http://www.dailymotion.com/group/nbcnightlynews/');
61 grabCategory('http://www.dailymotion.com/user/reuters/');
62 grabCategory('http://www.dailymotion.com/user/NewsLook/');
63 grabCategory('http://www.dailymotion.com/user/NYMag/');
64 grabCategory('http://www.dailymotion.com/user/itnnews/');
65 grabCategory('http://www.dailymotion.com/user/Buzz60/');
66 grabCategory('http://www.dailymotion.com/user/associatedpress/');
67 grabCategory('http://www.dailymotion.com/us/featured/channel/news/');
68 grabCategory('http://www.dailymotion.com/user/clevvertv/');
69 grabCategory('http://www.dailymotion.com/user/tvguide/');
70 grabCategory('http://www.dailymotion.com/user/splashnews/');
71 grabCategory('http://www.dailymotion.com/user/hollywoodbackstage/');
72 grabCategory('http://www.dailymotion.com/user/celebtv/');
73 grabCategory('http://www.dailymotion.com/user/maximotv/');
74 grabCategory('http://www.dailymotion.com/user/mojosupreme/');
75 grabCategory('http://www.dailymotion.com/user/DiagonalView/');
76 grabCategory('http://www.dailymotion.com/hub/x38_Motionmaker-documentaries/');
77 grabCategory('http://www.dailymotion.com/user/tysihelp/');
78 grabCategory('http://www.dailymotion.com/user/computerTV/');
79 grabCategory('http://www.dailymotion.com/user/soldierknowsbest/');
80 grabCategory('http://www.dailymotion.com/user/appjudgment/');
81 grabCategory('http://www.dailymotion.com/user/geekbeattv/');
82 grabCategory('http://www.dailymotion.com/user/allthingsscience/');
83 grabCategory('http://www.dailymotion.com/user/stuffwelike/');
84 grabCategory('http://www.dailymotion.com/user/lifehackershow/');
85 grabCategory('http://www.dailymotion.com/us/channel/auto/');
86 }
87 env.info('Ending');
88}
89
90function grabVideo(queue) {
91 try {
92 var url = queue.getString('url');
93 var title = queue.getString('title');
94 var image = queue.getString('image');
95 var conn = env.newJsoup().connect(url).userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101');
96 var doc = conn.timeout(60000).get();
97 var child = doc.select('#video_description').first();
98 var desc = env.newString('');
99 if (child != null) {
100 desc = child.text();
101 }
102 saveLink(title, url, desc, image, '');
103 } catch (e) {
104 env.error(e);
105 }
106}
107
108function saveLink(title, url, desc, image, price) {
109 url = env.newString(url);
110 var pos = url.lastIndexOf('&feature=');
111 if (pos >= 0) {
112 url = url.substring(0, pos);
113 }
114 if (findLinkByUrl(url)) return;
115 var schema = 's|url|a|title|a|desc|s|fixed|d|score|s|site|s|image|s|price';
116 var entity = env.newEntity();
117 entity.setSchema(schema);
118 entity.setKind('Link');
119 entity.setId(env.uniqid());
120 entity.setString('url', url);
121 entity.setString('title', title);
122 entity.setString('desc', desc);
123 entity.setString('fixed', 'true');
124 entity.setDouble('score', 100);
125 entity.setString('image', image);
126 entity.setString('price', price);
127 try {
128 var t_url = env.newURL(url);
129 var t_host = t_url.getHost();
130 entity.setString('site', t_host);
131 } catch (e) {
132 env.error(e);
133 }
134 entity.save();
135 env.info(title + ' | ' + url);
136}
137
138function findLinkByUrl(url) {
139 var entity = env.newEntity();
140 var query = entity.newTermQuery(entity.newTerm('url', url));
141 var size = entity.count('Link', query, 1);
142 return (size > 0);
143}
144
145function grabCategory(catUrl) {
146 env.info('Category: ' + catUrl);
147 for (var no = 1; no <= maxpage; no++) {
148 try {
149 var conn = env.newJsoup().connect(catUrl + no + '?mode=playlist').userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101');
150 var doc = conn.timeout(60000).get();
151 var items = doc.select('.dmpi_video_item');
152 if (items.size() == 0) break;
153 for (var i = 0; i < items.size(); i++) {
154 var item = items.get(i);
155 var child = item.select('.dmpi_video_title a').first();
156 if (child == null) continue;
157 var title = child.text().trim();
158 var url = env.newString(env.newURL(env.newURL('http://www.dailymotion.com'), child.attr('href')) + '');
159 child = item.select('.dmpi_video_preview a img').first();
160 var image = env.newString('');
161 if (child != null) {
162 var tmp = env.newString(child.attr('data-spr'));
163 tmp = tmp.replaceAll('jpeg_preview_sprite.jpg', 'jpeg_preview_medium.jpg');
164 image = env.newString(env.newURL(env.newURL('http://www.dailymotion.com'), tmp) + '');
165 }
166 markVideo(title, url, image);
167 }
168 } catch (e) {
169 env.error(e);
170 }
171 }
172}
173
174function markVideo(title, url, image) {
175 if (findQueueByUrl(url)) return;
176 var schema = 's|url|s|title|s|image|s|crawled';
177 var entity = env.newEntity();
178 entity.setSchema(schema);
179 entity.setKind('Queue_DailyMotion');
180 entity.setId(env.uniqid());
181 entity.setString('url', url);
182 entity.setString('title', title);
183 entity.setString('image', image);
184 entity.setString('crawled', 'false');
185 entity.save();
186}
187
188function findQueueByUrl(url) {
189 var entity = env.newEntity();
190 var query = entity.newTermQuery(entity.newTerm('url', url));
191 var size = entity.count('Queue_DailyMotion', query, 1);
192 return (size > 0);
193}
194
195function loadQueue() {
196 var entity = env.newEntity();
197 var tag = entity.search('Queue_DailyMotion', entity.newTermQuery(entity.newTerm('crawled', 'false')), 10);
198 return tag;
199}
// Environment object; assigned from main()'s first argument and used by every helper below.
var env;
// Second argument of main(); stored there and not read elsewhere in this script.
var args;
// Highest page number grabCategory() will request for a single category.
var maxpage = 1000;

/**
 * Script entry point for the DailyMotion crawler.
 *
 * Stores the arguments in the globals `env` and `args`, logs 'Starting', then
 * loops forever:
 *   1. drains the queue: loads batches with loadQueue() until one comes back
 *      empty, calling grabVideo() on each entry and then marking it
 *      crawled='true' and saving it;
 *   2. calls grabCategory() for each of the 65 category URLs, in order.
 *
 * The outer loop has no exit, so the final 'Ending' log line is only reached
 * if that changes.
 *
 * @param penv  environment object
 * @param pargs stored in the global `args`
 */
function main(penv, pargs) {
    env = penv;
    args = pargs;
    env.info('Starting');

    var categories = [
        'http://www.dailymotion.com/group/in_theaters_this_week/',
        'http://www.dailymotion.com/group/in_theaters_now/',
        'http://www.dailymotion.com/group/coming_soon/',
        'http://www.dailymotion.com/user/ReelzChannel/lang/en/search/movie+review/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/interview+movie/channel/shortfilms/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/movie+news/channel/shortfilms/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/horror/channel/shortfilms/',
        'http://www.dailymotion.com/user/hollywoodtv/',
        'http://www.dailymotion.com/creative-official/tag/rock/lang/en/channel/music/',
        'http://www.dailymotion.com/creative-official/tag/pop/lang/en/channel/music/',
        'http://www.dailymotion.com/creative-official/tag/hop/lang/en/channel/music/',
        'http://www.dailymotion.com/creative-official/tag/alternative/lang/en/channel/music/',
        'http://www.dailymotion.com/creative-official/tag/dance/lang/en/channel/music/',
        'http://www.dailymotion.com/creative-official/tag/soul/lang/en/channel/music/',
        'http://www.dailymotion.com/creative-official/tag/latin/lang/en/channel/music/',
        'http://www.dailymotion.com/creative-official/tag/country/lang/en/channel/music/',
        'http://www.dailymotion.com/user/ClassicGameRoom/',
        'http://www.dailymotion.com/mychannel/ClassicGameRoom/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/fight/channel/videogames/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/strategy/channel/videogames/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/shooter/channel/videogames/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/action/channel/videogames/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/sport/channel/videogames/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/trailer/channel/videogames/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/review/channel/videogames/',
        'http://www.dailymotion.com/creative-official/lang/en/search/nfl/',
        'http://www.dailymotion.com/creative-official/lang/en/search/nba/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/nhl/channel/sport/',
        'http://www.dailymotion.com/user/TotalCollegeSports/',
        'http://www.dailymotion.com/user/UFC/',
        'http://www.dailymotion.com/user/sportsillustrated/',
        'http://www.dailymotion.com/user/transworld/',
        'http://www.dailymotion.com/user/rooftopcomedy/',
        'http://www.dailymotion.com/playlist/x1qzsd_MyDamnChannel_dicki/',
        'http://www.dailymotion.com/playlist/x1r5r4_MyDamnChannel_easy-to-assemble-season-3/',
        'http://www.dailymotion.com/user/Rhettandlink/',
        'http://www.dailymotion.com/user/epicmealtime/',
        'http://www.dailymotion.com/group/familyguy/',
        'http://www.dailymotion.com/creative/lang/en/channel/fun/',
        'http://www.dailymotion.com/group/nbcnightlynews/',
        'http://www.dailymotion.com/user/reuters/',
        'http://www.dailymotion.com/user/NewsLook/',
        'http://www.dailymotion.com/user/NYMag/',
        'http://www.dailymotion.com/user/itnnews/',
        'http://www.dailymotion.com/user/Buzz60/',
        'http://www.dailymotion.com/user/associatedpress/',
        'http://www.dailymotion.com/us/featured/channel/news/',
        'http://www.dailymotion.com/user/clevvertv/',
        'http://www.dailymotion.com/user/tvguide/',
        'http://www.dailymotion.com/user/splashnews/',
        'http://www.dailymotion.com/user/hollywoodbackstage/',
        'http://www.dailymotion.com/user/celebtv/',
        'http://www.dailymotion.com/user/maximotv/',
        'http://www.dailymotion.com/user/mojosupreme/',
        'http://www.dailymotion.com/user/DiagonalView/',
        'http://www.dailymotion.com/hub/x38_Motionmaker-documentaries/',
        'http://www.dailymotion.com/user/tysihelp/',
        'http://www.dailymotion.com/user/computerTV/',
        'http://www.dailymotion.com/user/soldierknowsbest/',
        'http://www.dailymotion.com/user/appjudgment/',
        'http://www.dailymotion.com/user/geekbeattv/',
        'http://www.dailymotion.com/user/allthingsscience/',
        'http://www.dailymotion.com/user/stuffwelike/',
        'http://www.dailymotion.com/user/lifehackershow/',
        'http://www.dailymotion.com/us/channel/auto/'
    ];

    while (true) {
        // Phase 1: work through everything currently queued.
        var pending = loadQueue();
        env.info('Size: ' + pending.size());
        while (pending.size() > 0) {
            for (var q = 0; q < pending.size(); q++) {
                var entry = pending.get(q);
                grabVideo(entry);
                entry.setString('crawled', 'true');
                entry.save();
            }
            pending = loadQueue();
        }

        // Phase 2: rescan the category listings to refill the queue.
        for (var c = 0; c < categories.length; c++) {
            grabCategory(categories[c]);
        }
    }
    env.info('Ending');
}

/**
 * Fetches one queued video page and stores it as a link.
 *
 * Reads 'url', 'title' and 'image' from the queue entry, downloads the page,
 * takes the text of '#video_description' as the description (empty string
 * when that element is missing) and passes everything to saveLink() with an
 * empty price. Any error is reported through env.error() and swallowed.
 *
 * @param queue queue entry exposing getString()
 */
function grabVideo(queue) {
    try {
        var pageUrl = queue.getString('url');
        var videoTitle = queue.getString('title');
        var thumbnail = queue.getString('image');

        var request = env.newJsoup().connect(pageUrl).userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101');
        var page = request.timeout(60000).get();

        var descNode = page.select('#video_description').first();
        var emptyText = env.newString('');
        var description = descNode != null ? descNode.text() : emptyText;

        saveLink(videoTitle, pageUrl, description, thumbnail, '');
    } catch (err) {
        env.error(err);
    }
}

/**
 * Stores a video as an entity of kind 'Link' unless its URL is already stored.
 *
 * The URL is cut at the last '&feature=' (if present) before the duplicate
 * check and before saving. The entity gets fixed='true' and score=100; the
 * 'site' field is the URL's host, and is left unset (with the error reported
 * through env.error()) when the URL cannot be parsed. Logs "title | url"
 * after saving.
 *
 * @param title video title
 * @param url   video page URL
 * @param desc  description text
 * @param image thumbnail URL
 * @param price stored as-is in the 'price' field
 */
function saveLink(title, url, desc, image, price) {
    var cleanUrl = env.newString(url);
    var cut = cleanUrl.lastIndexOf('&feature=');
    if (cut >= 0) {
        cleanUrl = cleanUrl.substring(0, cut);
    }
    if (findLinkByUrl(cleanUrl)) {
        return;
    }

    var record = env.newEntity();
    record.setSchema('s|url|a|title|a|desc|s|fixed|d|score|s|site|s|image|s|price');
    record.setKind('Link');
    record.setId(env.uniqid());
    record.setString('url', cleanUrl);
    record.setString('title', title);
    record.setString('desc', desc);
    record.setString('fixed', 'true');
    record.setDouble('score', 100);
    record.setString('image', image);
    record.setString('price', price);
    try {
        var host = env.newURL(cleanUrl).getHost();
        record.setString('site', host);
    } catch (err) {
        env.error(err);
    }
    record.save();
    env.info(title + ' | ' + cleanUrl);
}

/**
 * Reports whether an entity of kind 'Link' with this exact 'url' value exists.
 *
 * @param url URL to look up
 * @returns true when the count returned for the term query is greater than 0
 */
function findLinkByUrl(url) {
    var probe = env.newEntity();
    var byUrl = probe.newTermQuery(probe.newTerm('url', url));
    var matches = probe.count('Link', byUrl, 1);
    return matches > 0;
}

/**
 * Pages through one DailyMotion listing and queues every video found.
 *
 * Requests `catUrl + pageNumber + '?mode=playlist'` for page numbers 1..maxpage
 * and stops at the first page that contains no '.dmpi_video_item' elements.
 * For each item the title and absolute video URL are taken from
 * '.dmpi_video_title a'; items without that anchor are skipped. The thumbnail
 * comes from the 'data-spr' attribute of '.dmpi_video_preview a img', with the
 * sprite file name swapped for the medium preview file name. Each video is
 * handed to markVideo().
 *
 * An error on one page is reported through env.error() and the loop continues
 * with the next page number.
 *
 * @param catUrl listing URL ending in '/', to which the page number is appended
 */
function grabCategory(catUrl) {
    env.info('Category: ' + catUrl);
    for (var no = 1; no <= maxpage; no++) {
        try {
            var conn = env.newJsoup().connect(catUrl + no + '?mode=playlist').userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101');
            var doc = conn.timeout(60000).get();
            var items = doc.select('.dmpi_video_item');
            // An empty page marks the end of the listing.
            if (items.size() == 0) break;
            for (var i = 0; i < items.size(); i++) {
                var item = items.get(i);
                var child = item.select('.dmpi_video_title a').first();
                if (child == null) continue;
                var title = child.text().trim();
                // + '' converts the resolved URL object to a string before wrapping it.
                var url = env.newString(env.newURL(env.newURL('http://www.dailymotion.com'), child.attr('href')) + '');
                child = item.select('.dmpi_video_preview a img').first();
                var image = env.newString('');
                if (child != null) {
                    var tmp = env.newString(child.attr('data-spr'));
                    // Fixed: the method name was garbled ("replaceAl?"), which made the script unparseable.
                    // NOTE(review): if tmp is a java.lang.String the first argument is a regex, so the
                    // dots match any character — harmless for these file names, but confirm.
                    tmp = tmp.replaceAll('jpeg_preview_sprite.jpg', 'jpeg_preview_medium.jpg');
                    image = env.newString(env.newURL(env.newURL('http://www.dailymotion.com'), tmp) + '');
                }
                markVideo(title, url, image);
            }
        } catch (e) {
            env.error(e);
        }
    }
}

/**
 * Adds a video to the 'Queue_DailyMotion' queue with crawled='false', unless
 * findQueueByUrl() reports that the URL is already queued.
 *
 * @param title video title
 * @param url   absolute video URL; also the key used for the duplicate check
 * @param image thumbnail URL
 */
function markVideo(title, url, image) {
    var alreadyQueued = findQueueByUrl(url);
    if (alreadyQueued) {
        return;
    }

    var item = env.newEntity();
    item.setSchema('s|url|s|title|s|image|s|crawled');
    item.setKind('Queue_DailyMotion');
    item.setId(env.uniqid());
    item.setString('url', url);
    item.setString('title', title);
    item.setString('image', image);
    item.setString('crawled', 'false');
    item.save();
}

/**
 * Reports whether the 'Queue_DailyMotion' queue already holds this URL.
 *
 * @param url URL to look up
 * @returns true when the count returned for the term query is greater than 0
 */
function findQueueByUrl(url) {
    var probe = env.newEntity();
    var byUrl = probe.newTermQuery(probe.newTerm('url', url));
    var matches = probe.count('Queue_DailyMotion', byUrl, 1);
    return matches > 0;
}

/**
 * Loads queue entries that have not been crawled yet.
 *
 * Searches kind 'Queue_DailyMotion' for crawled='false', passing 10 as the
 * third argument to search() (presumably a batch-size limit — confirm against
 * the entity API).
 *
 * @returns the list returned by search(); main() uses size() and get() on it
 */
function loadQueue() {
    var probe = env.newEntity();
    var notCrawled = probe.newTermQuery(probe.newTerm('crawled', 'false'));
    return probe.search('Queue_DailyMotion', notCrawled, 10);
}

  Protected by Copyscape Online Copyright Protection

Grab video from YouTube

Grab video from YouTube
This task uses a JavaScript sandbox with jsoup and Lucene support to grab videos from YouTube.
Grab video from YouTube
  1. Create javascript sandbox with jsoup support
  2. Add Lucene support to javascript sandbox
  3. Create the JavaScript as follows
javascript
1var env;
2var args;
3
4function main(penv, pargs) {
5 env = penv;
6 args = pargs;
7 env.info('Starting');
8 while (true) {
9 var queue_list = loadQueue();
10 env.info('Size: ' + queue_list.size());
11 while (queue_list.size() > 0) {
12 for (var i = 0; i < queue_list.size(); i++) {
13 var queue = queue_list.get(i);
14 grabVideo(queue);
15 queue.setString('crawled', 'true');
16 queue.save();
17 }
18 queue_list = loadQueue();
19 }
20 grabCategory('http://www.youtube.com/autos');
21 grabCategory('http://www.youtube.com/comedy');
22 grabCategory('http://www.youtube.com/entertainment');
23 grabCategory('http://www.youtube.com/film');
24 grabCategory('http://www.youtube.com/gaming');
25 grabCategory('http://www.youtube.com/howto');
26 grabCategory('http://www.youtube.com/activism');
27 grabCategory('http://www.youtube.com/people');
28 grabCategory('http://www.youtube.com/pets');
29 grabCategory('http://www.youtube.com/science');
30 grabCategory('http://www.youtube.com/videos?c=17');
31 grabCategory('http://www.youtube.com/travel');
32 }
33 env.info('Ending');
34}
35
36function grabVideo(queue) {
37 try {
38 var url = queue.getString('url');
39 var title = queue.getString('title');
40 var image = queue.getString('image');
41 var conn = env.newJsoup().connect(url).userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101');
42 var doc = conn.timeout(60000).get();
43 var child = doc.select('#eow-description').first();
44 var desc = env.newString('');
45 if (child != null) {
46 desc = child.text();
47 } else {
48 child = doc.select('#ded').first();
49 if (child != null) {
50 desc = child.text();
51 }
52 }
53 saveLink(title, url, desc, image, '');
54
55 var html = doc.html();
56 var pos1 = html.indexOf('var rvl =');
57 var pos2 = html.indexOf('var cml =');
58 if (pos1 < 0 || pos2 < 0 || pos1 >= pos2) return;
59 var js1 = html.substring(pos1 + 9, pos2);
60 var obj1 = null;
61 eval('obj1 = ' + js1);
62 if (obj1 == null) return;
63 for (var i = 0; i < obj1.length; i++) {
64 var item = obj1[i];
65 var url2 = env.newString('http://www.youtube.com/watch?v=' + item.k);
66 var title2 = item.t;
67 var image2 = env.newString(env.newURL(env.newURL('http://www.youtube.com'), item.i));
68 markVideo(title2, url2, image2);
69 env.info('Video: ' + title2 + ' | ' + url2 + ' | ' + image2);
70 }
71 } catch (e) {
72 env.error(e);
73 }
74}
75
76function saveLink(title, url, desc, image, price) {
77 url = env.newString(url);
78 var pos = url.lastIndexOf('&feature=');
79 if (pos >= 0) {
80 url = url.substring(0, pos);
81 }
82 if (findLinkByUrl(url)) return;
83 var schema = 's|url|a|title|a|desc|s|fixed|d|score|s|site|s|image|s|price';
84 var entity = env.newEntity();
85 entity.setSchema(schema);
86 entity.setKind('Link');
87 entity.setId(env.uniqid());
88 entity.setString('url', url);
89 entity.setString('title', title);
90 entity.setString('desc', desc);
91 entity.setString('fixed', 'true');
92 entity.setDouble('score', 100);
93 entity.setString('image', image);
94 entity.setString('price', price);
95 try {
96 var t_url = env.newURL(url);
97 var t_host = t_url.getHost();
98 entity.setString('site', t_host);
99 } catch (e) {
100 env.error(e);
101 }
102 entity.save();
103 env.info(title + ' | ' + url);
104}
105
106function findLinkByUrl(url) {
107 var entity = env.newEntity();
108 var query = entity.newTermQuery(entity.newTerm('url', url));
109 var size = entity.count('Link', query, 1);
110 return (size > 0);
111}
112
113function grabCategory(catUrl) {
114 try {
115 var conn = env.newJsoup().connect(catUrl).userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101');
116 var doc = conn.timeout(60000).get();
117 var items = doc.select('.browse-item');
118 for (var i = 0; i < items.size(); i++) {
119 var item = items.get(i);
120 var child = item.select('.browse-item-content h3 a').first();
121 if (child == null) continue;
122 var title = child.text().trim();
123 var url = env.newString(env.newURL(env.newURL('http://www.youtube.com'), child.attr('href')) + '');
124 child = item.select('.yt-thumb-clip-inner img').first();
125 var image = env.newString('');
126 if (child != null) {
127 image = env.newString(env.newURL(env.newURL('http://www.youtube.com'), child.attr('data-thumb')) + '');
128 }
129 markVideo(title, url, image);
130 }
131 } catch (e) {
132 env.error(e);
133 }
134}
135
136function markVideo(title, url, image) {
137 if (findQueueByUrl(url)) return;
138 var schema = 's|url|s|title|s|image|s|crawled';
139 var entity = env.newEntity();
140 entity.setSchema(schema);
141 entity.setKind('Queue_YouTube');
142 entity.setId(env.uniqid());
143 entity.setString('url', url);
144 entity.setString('title', title);
145 entity.setString('image', image);
146 entity.setString('crawled', 'false');
147 entity.save();
148}
149
150function findQueueByUrl(url) {
151 var entity = env.newEntity();
152 var query = entity.newTermQuery(entity.newTerm('url', url));
153 var size = entity.count('Queue_YouTube', query, 1);
154 return (size > 0);
155}
156
157function loadQueue() {
158 var entity = env.newEntity();
159 var tag = entity.search('Queue_YouTube', entity.newTermQuery(entity.newTerm('crawled', 'false')), 10);
160 return tag;
161}
// Environment object; assigned from main()'s first argument and used by every helper below.
var env;
// Second argument of main(); stored there by main().
var args;

/**
 * Script entry point for the YouTube crawler.
 *
 * Stores the arguments in the globals `env` and `args`, logs 'Starting', then
 * loops forever:
 *   1. drains the queue: loads batches with loadQueue() until one comes back
 *      empty, calling grabVideo() on each entry and then marking it
 *      crawled='true' and saving it;
 *   2. calls grabCategory() for each of the 12 category URLs, in order.
 *
 * The outer loop has no exit, so the final 'Ending' log line is only reached
 * if that changes.
 *
 * @param penv  environment object
 * @param pargs stored in the global `args`
 */
function main(penv, pargs) {
    env = penv;
    args = pargs;
    env.info('Starting');

    var categories = [
        'http://www.youtube.com/autos',
        'http://www.youtube.com/comedy',
        'http://www.youtube.com/entertainment',
        'http://www.youtube.com/film',
        'http://www.youtube.com/gaming',
        'http://www.youtube.com/howto',
        'http://www.youtube.com/activism',
        'http://www.youtube.com/people',
        'http://www.youtube.com/pets',
        'http://www.youtube.com/science',
        'http://www.youtube.com/videos?c=17',
        'http://www.youtube.com/travel'
    ];

    while (true) {
        // Phase 1: work through everything currently queued.
        var pending = loadQueue();
        env.info('Size: ' + pending.size());
        while (pending.size() > 0) {
            for (var q = 0; q < pending.size(); q++) {
                var entry = pending.get(q);
                grabVideo(entry);
                entry.setString('crawled', 'true');
                entry.save();
            }
            pending = loadQueue();
        }

        // Phase 2: rescan the category pages to refill the queue.
        for (var c = 0; c < categories.length; c++) {
            grabCategory(categories[c]);
        }
    }
    env.info('Ending');
}

/**
 * Fetches one queued video page, stores it as a link and queues the related
 * videos embedded in the page.
 *
 * Reads 'url', 'title' and 'image' from the queue entry and downloads the
 * page. The description is the text of '#eow-description', falling back to
 * '#ded', falling back to an empty string. The video is passed to saveLink()
 * with an empty price.
 *
 * The related-video list is the text between 'var rvl =' and 'var cml =' in
 * the page HTML. Each of its entries (fields k, t, i) is passed to markVideo()
 * as (title, watch URL, absolute image URL) and logged. If either marker is
 * missing or they are out of order, the function returns after saveLink().
 *
 * Any error is reported through env.error() and swallowed.
 *
 * @param queue queue entry exposing getString()
 */
function grabVideo(queue) {
    try {
        var url = queue.getString('url');
        var title = queue.getString('title');
        var image = queue.getString('image');
        var conn = env.newJsoup().connect(url).userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101');
        var doc = conn.timeout(60000).get();
        var child = doc.select('#eow-description').first();
        var desc = env.newString('');
        if (child != null) {
            desc = child.text();
        } else {
            child = doc.select('#ded').first();
            if (child != null) {
                desc = child.text();
            }
        }
        saveLink(title, url, desc, image, '');

        var html = doc.html();
        var pos1 = html.indexOf('var rvl =');
        var pos2 = html.indexOf('var cml =');
        if (pos1 < 0 || pos2 < 0 || pos1 >= pos2) return;
        // 9 is the length of 'var rvl =', so js1 starts right after the '='.
        var js1 = html.substring(pos1 + 9, pos2);
        var obj1 = null;
        // SECURITY NOTE(review): this evaluates text taken from a downloaded page, so whatever
        // the remote server puts between the two markers runs inside the sandbox. Left as-is
        // because the literal may not be strict JSON; replace with a real parser if possible.
        eval('obj1 = ' + js1);
        if (obj1 == null) return;
        for (var i = 0; i < obj1.length; i++) {
            var item = obj1[i];
            var url2 = env.newString('http://www.youtube.com/watch?v=' + item.k);
            var title2 = item.t;
            // + '' converts the resolved URL object to a string, matching every other
            // env.newURL call site in this script (it was missing here).
            var image2 = env.newString(env.newURL(env.newURL('http://www.youtube.com'), item.i) + '');
            markVideo(title2, url2, image2);
            env.info('Video: ' + title2 + ' | ' + url2 + ' | ' + image2);
        }
    } catch (e) {
        env.error(e);
    }
}

/**
 * Persists a 'Link' entity unless one with the same URL already exists.
 *
 * The URL is cut at its last '&feature=' occurrence before the duplicate
 * check, so the stored URL never carries that parameter (or anything after
 * it). The entity is stored with fixed='true' and score=100; 'site' is set
 * to the URL's host when the URL can be parsed, otherwise the parse error is
 * logged and the entity is saved without a site.
 *
 * @param title link title
 * @param url   link address
 * @param desc  description text
 * @param image thumbnail address
 * @param price price text (may be empty)
 */
function saveLink(title, url, desc, image, price) {
    var cleanUrl = env.newString(url);
    var featureAt = cleanUrl.lastIndexOf('&feature=');
    if (featureAt >= 0) {
        cleanUrl = cleanUrl.substring(0, featureAt);
    }

    if (findLinkByUrl(cleanUrl)) return;

    var link = env.newEntity();
    // presumably 'type|name' pairs understood by the entity store — TODO confirm
    link.setSchema('s|url|a|title|a|desc|s|fixed|d|score|s|site|s|image|s|price');
    link.setKind('Link');
    link.setId(env.uniqid());
    link.setString('url', cleanUrl);
    link.setString('title', title);
    link.setString('desc', desc);
    link.setString('fixed', 'true');
    link.setDouble('score', 100);
    link.setString('image', image);
    link.setString('price', price);

    // Best effort: a URL that cannot be parsed must not prevent the save.
    try {
        link.setString('site', env.newURL(cleanUrl).getHost());
    } catch (err) {
        env.error(err);
    }

    link.save();
    env.info(title + ' | ' + cleanUrl);
}

/**
 * Tells whether a 'Link' entity with the given URL is already stored.
 *
 * @param url value matched against the 'url' term
 * @returns true when the count query reports at least one match
 */
function findLinkByUrl(url) {
    var probe = env.newEntity();
    var byUrl = probe.newTerm('url', url);
    // The last argument (1) is presumably a cap on the count — TODO confirm.
    var matches = probe.count('Link', probe.newTermQuery(byUrl), 1);
    return matches > 0;
}

/**
 * Downloads a category page and queues every video tile found on it.
 *
 * Each '.browse-item' element contributes its heading link (title and href,
 * the href resolved against the site root) and, when present, the
 * 'data-thumb' attribute of its thumbnail image; tiles without a heading
 * link are ignored. Results are handed to markVideo().
 *
 * Any exception is reported through env.error() and not rethrown.
 *
 * @param catUrl address of the category page
 */
function grabCategory(catUrl) {
    try {
        var request = env.newJsoup().connect(catUrl);
        request = request.userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101');
        var page = request.timeout(60000).get();

        var tiles = page.select('.browse-item');
        for (var n = 0; n < tiles.size(); n += 1) {
            var tile = tiles.get(n);
            var anchor = tile.select('.browse-item-content h3 a').first();
            if (anchor != null) {
                var title = anchor.text().trim();

                // Resolve the (possibly relative) href against the site root.
                var linkBase = env.newURL('http://www.youtube.com');
                var linkUrl = env.newURL(linkBase, anchor.attr('href'));
                var url = env.newString(linkUrl + '');

                var thumb = tile.select('.yt-thumb-clip-inner img').first();
                var image = env.newString('');
                if (thumb != null) {
                    var thumbBase = env.newURL('http://www.youtube.com');
                    var thumbUrl = env.newURL(thumbBase, thumb.attr('data-thumb'));
                    image = env.newString(thumbUrl + '');
                }

                markVideo(title, url, image);
            }
        }
    } catch (err) {
        env.error(err);
    }
}

/**
 * Adds a video to the 'Queue_YouTube' crawl queue unless its URL is already
 * queued. New entries are stored with crawled='false'.
 *
 * @param title video title
 * @param url   video page address (also the duplicate key)
 * @param image thumbnail address
 */
function markVideo(title, url, image) {
    var alreadyQueued = findQueueByUrl(url);
    if (alreadyQueued) return;

    var job = env.newEntity();
    // presumably 'type|name' pairs understood by the entity store — TODO confirm
    job.setSchema('s|url|s|title|s|image|s|crawled');
    job.setKind('Queue_YouTube');
    job.setId(env.uniqid());
    job.setString('url', url);
    job.setString('title', title);
    job.setString('image', image);
    job.setString('crawled', 'false');
    job.save();
}

/**
 * Tells whether the 'Queue_YouTube' queue already holds an entry for a URL.
 *
 * @param url value matched against the 'url' term
 * @returns true when the count query reports at least one match
 */
function findQueueByUrl(url) {
    var probe = env.newEntity();
    var byUrl = probe.newTerm('url', url);
    // The last argument (1) is presumably a cap on the count — TODO confirm.
    var matches = probe.count('Queue_YouTube', probe.newTermQuery(byUrl), 1);
    return matches > 0;
}

/**
 * Fetches queue entries that have not been crawled yet.
 *
 * @returns whatever the entity search yields for 'Queue_YouTube' records
 *          whose 'crawled' term is 'false'; the third search argument (10)
 *          is presumably the maximum number of results — TODO confirm
 */
function loadQueue() {
    var store = env.newEntity();
    var pending = store.newTermQuery(store.newTerm('crawled', 'false'));
    return store.search('Queue_YouTube', pending, 10);
}

  Protected by Copyscape Online Copyright Protection