Grab video from YouTube
Grab video from YouTube
- Create a JavaScript sandbox with jsoup support
- Add Lucene support to the JavaScript sandbox
- Create the following JavaScript script:
javascript
// Sandbox handles shared by every function in this script; assigned in main().
var env;
var args;

/**
 * Script entry point.
 *
 * Stores the sandbox environment and arguments in the script-level
 * variables, then loops without a break: each pass drains the crawl queue
 * batch by batch (as returned by loadQueue()) and afterwards re-scans the
 * fixed list of category pages to enqueue newly listed videos.
 *
 * @param penv  sandbox environment object (logging, jsoup, entities, ...)
 * @param pargs script arguments; stored in `args` but not read in this function
 */
function main(penv, pargs) {
  env = penv;
  args = pargs;
  env.info('Starting');

  var categoryUrls = [
    'http://www.youtube.com/autos',
    'http://www.youtube.com/comedy',
    'http://www.youtube.com/entertainment',
    'http://www.youtube.com/film',
    'http://www.youtube.com/gaming',
    'http://www.youtube.com/howto',
    'http://www.youtube.com/activism',
    'http://www.youtube.com/people',
    'http://www.youtube.com/pets',
    'http://www.youtube.com/science',
    'http://www.youtube.com/videos?c=17',
    'http://www.youtube.com/travel'
  ];

  for (;;) {
    var pending = loadQueue();
    env.info('Size: ' + pending.size());

    // Keep fetching batches until no un-crawled entry is left.
    while (pending.size() > 0) {
      var idx = 0;
      while (idx < pending.size()) {
        var entry = pending.get(idx);
        grabVideo(entry);
        // Marked as crawled regardless of whether grabVideo succeeded,
        // so a failing page is not retried.
        entry.setString('crawled', 'true');
        entry.save();
        idx++;
      }
      pending = loadQueue();
    }

    for (var c = 0; c < categoryUrls.length; c++) {
      grabCategory(categoryUrls[c]);
    }
  }

  // Not reached as written: the loop above has no exit.
  env.info('Ending');
}
35 | |
/**
 * Crawls one queued watch page.
 *
 * Fetches the page named by the queue entry, stores it as a link via
 * saveLink() (with the page description when one is found), then reads the
 * related-video list embedded in the page's inline script and enqueues
 * each related video via markVideo().
 *
 * Any exception is reported through env.error() and not rethrown.
 *
 * @param queue queue entity providing getString('url' | 'title' | 'image')
 */
function grabVideo(queue) {
  try {
    var url = queue.getString('url');
    var title = queue.getString('title');
    var image = queue.getString('image');
    var conn = env.newJsoup().connect(url).userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101');
    var doc = conn.timeout(60000).get();

    // Description: primary element first, '#ded' as the fallback.
    var desc = env.newString('');
    var child = doc.select('#eow-description').first();
    if (child == null) {
      child = doc.select('#ded').first();
    }
    if (child != null) {
      desc = child.text();
    }
    saveLink(title, url, desc, image, '');

    // The related-video list sits between these two inline-script markers.
    var startMarker = 'var rvl =';
    var endMarker = 'var cml =';
    var html = doc.html();
    var pos1 = html.indexOf(startMarker);
    var pos2 = html.indexOf(endMarker);
    if (pos1 < 0 || pos2 < 0 || pos1 >= pos2) return;
    var js1 = html.substring(pos1 + startMarker.length, pos2);

    var obj1 = null;
    // SECURITY NOTE(review): this evaluates text taken from a fetched page,
    // so whatever appears between the markers runs inside the sandbox.
    // Kept as-is because the payload may not be strict JSON; consider a
    // real parser if the sandbox offers one.
    eval('obj1 = ' + js1);
    if (obj1 == null) return;

    for (var i = 0; i < obj1.length; i++) {
      var item = obj1[i];
      var url2 = env.newString('http://www.youtube.com/watch?v=' + item.k);
      var title2 = item.t;
      // `+ ''` turns the URL object into a string before env.newString(),
      // the same way grabCategory() does it.
      var image2 = env.newString(env.newURL(env.newURL('http://www.youtube.com'), item.i) + '');
      markVideo(title2, url2, image2);
      env.info('Video: ' + title2 + ' | ' + url2 + ' | ' + image2);
    }
  } catch (e) {
    env.error(e);
  }
}
75 | |
/**
 * Stores a crawled page as a 'Link' entity unless one with the same URL
 * already exists.
 *
 * The URL is cut at the last '&feature=' (that parameter and everything
 * after it is dropped) before the duplicate check and before saving.
 *
 * @param title page title
 * @param url   page URL
 * @param desc  page description
 * @param image thumbnail URL
 * @param price price text
 */
function saveLink(title, url, desc, image, price) {
  url = env.newString(url);
  var cut = url.lastIndexOf('&feature=');
  if (!(cut < 0)) {
    url = url.substring(0, cut);
  }

  if (findLinkByUrl(url)) {
    return;
  }

  var link = env.newEntity();
  // Presumably alternating type-code / field-name pairs - confirm against
  // the entity API.
  link.setSchema('s|url|a|title|a|desc|s|fixed|d|score|s|site|s|image|s|price');
  link.setKind('Link');
  link.setId(env.uniqid());
  link.setString('url', url);
  link.setString('title', title);
  link.setString('desc', desc);
  link.setString('fixed', 'true');
  link.setDouble('score', 100);
  link.setString('image', image);
  link.setString('price', price);

  // The host name is best-effort: a URL that cannot be parsed is logged
  // and the entity is still saved without 'site'.
  try {
    link.setString('site', env.newURL(url).getHost());
  } catch (e) {
    env.error(e);
  }

  link.save();
  env.info(title + ' | ' + url);
}
105 | |
/**
 * Reports whether a 'Link' entity with exactly this URL is already stored.
 *
 * @param url URL to look up in the 'url' field
 * @returns true when the count query finds at least one match
 */
function findLinkByUrl(url) {
  var store = env.newEntity();
  var urlTerm = store.newTerm('url', url);
  // The last argument (1) is presumably a result cap - one hit is enough.
  var hits = store.count('Link', store.newTermQuery(urlTerm), 1);
  return hits > 0;
}
112 | |
/**
 * Scans one category page and enqueues every video listed on it.
 *
 * For each '.browse-item' element, the title and link come from its
 * heading anchor and the thumbnail from its image element; the result is
 * handed to markVideo(). Items without a heading anchor are skipped.
 *
 * Any exception is reported through env.error() and not rethrown, which
 * also ends the scan of the current page.
 *
 * @param catUrl URL of the category page to fetch
 */
function grabCategory(catUrl) {
  try {
    var conn = env.newJsoup().connect(catUrl).userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101');
    var doc = conn.timeout(60000).get();
    var items = doc.select('.browse-item');
    for (var i = 0; i < items.size(); i++) {
      var item = items.get(i);
      var child = item.select('.browse-item-content h3 a').first();
      if (child == null) continue;
      var title = child.text().trim();
      var url = env.newString(env.newURL(env.newURL('http://www.youtube.com'), child.attr('href')) + '');

      child = item.select('.yt-thumb-clip-inner img').first();
      var image = env.newString('');
      if (child != null) {
        // Prefer 'data-thumb'; fall back to 'src' when it is missing or
        // empty. `'' + value` normalises host strings to script strings.
        var thumb = child.attr('data-thumb');
        if (thumb == null || ('' + thumb).length === 0) {
          thumb = child.attr('src');
        }
        // Resolving an empty reference would yield the site root, so the
        // image is left empty when no thumbnail reference exists.
        if (thumb != null && ('' + thumb).length > 0) {
          image = env.newString(env.newURL(env.newURL('http://www.youtube.com'), thumb) + '');
        }
      }
      markVideo(title, url, image);
    }
  } catch (e) {
    env.error(e);
  }
}
135 | |
/**
 * Adds a video to the crawl queue ('Queue_YouTube') unless an entry with
 * the same URL is already there. New entries start with crawled = 'false'.
 *
 * @param title video title
 * @param url   watch-page URL
 * @param image thumbnail URL
 */
function markVideo(title, url, image) {
  if (findQueueByUrl(url)) {
    return;
  }

  var entry = env.newEntity();
  entry.setSchema('s|url|s|title|s|image|s|crawled');
  entry.setKind('Queue_YouTube');
  entry.setId(env.uniqid());

  // Field values in the same order as the schema string above.
  var fields = [
    ['url', url],
    ['title', title],
    ['image', image],
    ['crawled', 'false']
  ];
  for (var f = 0; f < fields.length; f++) {
    entry.setString(fields[f][0], fields[f][1]);
  }

  entry.save();
}
149 | |
/**
 * Reports whether the crawl queue ('Queue_YouTube') already holds an entry
 * with exactly this URL.
 *
 * @param url URL to look up in the 'url' field
 * @returns true when the count query finds at least one match
 */
function findQueueByUrl(url) {
  var store = env.newEntity();
  var urlTerm = store.newTerm('url', url);
  // The last argument (1) is presumably a result cap - one hit is enough.
  var hits = store.count('Queue_YouTube', store.newTermQuery(urlTerm), 1);
  return hits > 0;
}
156 | |
/**
 * Fetches the next batch of queue entries whose 'crawled' field is 'false'.
 *
 * @returns the search result for 'Queue_YouTube'; callers use size()/get(i)
 */
function loadQueue() {
  var store = env.newEntity();
  var notCrawled = store.newTermQuery(store.newTerm('crawled', 'false'));
  // The last argument (10) is presumably the maximum batch size - confirm
  // against the entity API.
  return store.search('Queue_YouTube', notCrawled, 10);
}
// Sandbox handles shared by every function in this script; assigned in main().
var env;
var args;

// Values that were repeated verbatim in several functions.
var USER_AGENT = 'Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101';
var YOUTUBE_BASE = 'http://www.youtube.com';
var FETCH_TIMEOUT_MS = 60000;

/**
 * Script entry point.
 *
 * Stores the sandbox environment and arguments, then loops without a
 * break: each pass drains the crawl queue batch by batch and afterwards
 * re-scans the fixed list of category pages to enqueue new videos.
 *
 * @param penv  sandbox environment object (logging, jsoup, entities, ...)
 * @param pargs script arguments; stored in `args` but not read here
 */
function main(penv, pargs) {
  env = penv;
  args = pargs;
  env.info('Starting');

  var categoryUrls = [
    'http://www.youtube.com/autos',
    'http://www.youtube.com/comedy',
    'http://www.youtube.com/entertainment',
    'http://www.youtube.com/film',
    'http://www.youtube.com/gaming',
    'http://www.youtube.com/howto',
    'http://www.youtube.com/activism',
    'http://www.youtube.com/people',
    'http://www.youtube.com/pets',
    'http://www.youtube.com/science',
    'http://www.youtube.com/videos?c=17',
    'http://www.youtube.com/travel'
  ];

  while (true) {
    var queue_list = loadQueue();
    env.info('Size: ' + queue_list.size());

    // Keep fetching batches until no un-crawled entry is left.
    while (queue_list.size() > 0) {
      for (var i = 0; i < queue_list.size(); i++) {
        var queue = queue_list.get(i);
        grabVideo(queue);
        // Marked as crawled regardless of whether grabVideo succeeded,
        // so a failing page is not retried.
        queue.setString('crawled', 'true');
        queue.save();
      }
      queue_list = loadQueue();
    }

    for (var c = 0; c < categoryUrls.length; c++) {
      grabCategory(categoryUrls[c]);
    }
  }

  // Not reached as written: the loop above has no exit.
  env.info('Ending');
}

/** Fetches and parses a page with the script's user agent and timeout. */
function fetchDocument(url) {
  return env.newJsoup().connect(url).userAgent(USER_AGENT).timeout(FETCH_TIMEOUT_MS).get();
}

/**
 * Resolves a (possibly relative) reference against the site base and
 * returns it as a sandbox string. `+ ''` converts the URL object to a
 * string before env.newString() receives it.
 */
function absoluteUrl(ref) {
  return env.newString(env.newURL(env.newURL(YOUTUBE_BASE), ref) + '');
}

/**
 * Crawls one queued watch page: stores it as a link, then enqueues the
 * related videos listed in the page's inline script.
 *
 * Any exception is reported through env.error() and not rethrown.
 *
 * @param queue queue entity providing getString('url' | 'title' | 'image')
 */
function grabVideo(queue) {
  try {
    var url = queue.getString('url');
    var title = queue.getString('title');
    var image = queue.getString('image');
    var doc = fetchDocument(url);

    // Description: primary element first, '#ded' as the fallback.
    var desc = env.newString('');
    var child = doc.select('#eow-description').first();
    if (child == null) {
      child = doc.select('#ded').first();
    }
    if (child != null) {
      desc = child.text();
    }
    saveLink(title, url, desc, image, '');

    // The related-video list sits between these two inline-script markers.
    var startMarker = 'var rvl =';
    var endMarker = 'var cml =';
    var html = doc.html();
    var pos1 = html.indexOf(startMarker);
    var pos2 = html.indexOf(endMarker);
    if (pos1 < 0 || pos2 < 0 || pos1 >= pos2) return;
    var js1 = html.substring(pos1 + startMarker.length, pos2);

    var obj1 = null;
    // SECURITY NOTE(review): this evaluates text taken from a fetched page,
    // so whatever appears between the markers runs inside the sandbox.
    // Kept as-is because the payload may not be strict JSON; consider a
    // real parser if the sandbox offers one.
    eval('obj1 = ' + js1);
    if (obj1 == null) return;

    for (var i = 0; i < obj1.length; i++) {
      var item = obj1[i];
      var url2 = env.newString('http://www.youtube.com/watch?v=' + item.k);
      var title2 = item.t;
      var image2 = absoluteUrl(item.i);
      markVideo(title2, url2, image2);
      env.info('Video: ' + title2 + ' | ' + url2 + ' | ' + image2);
    }
  } catch (e) {
    env.error(e);
  }
}

/**
 * Stores a crawled page as a 'Link' entity unless one with the same URL
 * already exists. The URL is cut at the last '&feature=' before the
 * duplicate check and before saving.
 *
 * @param title page title
 * @param url   page URL
 * @param desc  page description
 * @param image thumbnail URL
 * @param price price text
 */
function saveLink(title, url, desc, image, price) {
  url = env.newString(url);
  var pos = url.lastIndexOf('&feature=');
  if (pos >= 0) {
    url = url.substring(0, pos);
  }
  if (findLinkByUrl(url)) return;

  // Presumably alternating type-code / field-name pairs - confirm against
  // the entity API.
  var schema = 's|url|a|title|a|desc|s|fixed|d|score|s|site|s|image|s|price';
  var entity = env.newEntity();
  entity.setSchema(schema);
  entity.setKind('Link');
  entity.setId(env.uniqid());
  entity.setString('url', url);
  entity.setString('title', title);
  entity.setString('desc', desc);
  entity.setString('fixed', 'true');
  entity.setDouble('score', 100);
  entity.setString('image', image);
  entity.setString('price', price);

  // The host name is best-effort: a URL that cannot be parsed is logged
  // and the entity is still saved without 'site'.
  try {
    var t_url = env.newURL(url);
    var t_host = t_url.getHost();
    entity.setString('site', t_host);
  } catch (e) {
    env.error(e);
  }
  entity.save();
  env.info(title + ' | ' + url);
}

/**
 * Reports whether a 'Link' entity with exactly this URL is already stored.
 *
 * @param url URL to look up in the 'url' field
 * @returns true when the count query finds at least one match
 */
function findLinkByUrl(url) {
  var entity = env.newEntity();
  var query = entity.newTermQuery(entity.newTerm('url', url));
  var size = entity.count('Link', query, 1);
  return (size > 0);
}

/**
 * Scans one category page and enqueues every video listed on it. Items
 * without a heading anchor are skipped.
 *
 * Any exception is reported through env.error() and not rethrown, which
 * also ends the scan of the current page.
 *
 * @param catUrl URL of the category page to fetch
 */
function grabCategory(catUrl) {
  try {
    var doc = fetchDocument(catUrl);
    var items = doc.select('.browse-item');
    for (var i = 0; i < items.size(); i++) {
      var item = items.get(i);
      var child = item.select('.browse-item-content h3 a').first();
      if (child == null) continue;
      var title = child.text().trim();
      var url = absoluteUrl(child.attr('href'));

      child = item.select('.yt-thumb-clip-inner img').first();
      var image = env.newString('');
      if (child != null) {
        // Prefer 'data-thumb'; fall back to 'src' when it is missing or
        // empty. `'' + value` normalises host strings to script strings.
        var thumb = child.attr('data-thumb');
        if (thumb == null || ('' + thumb).length === 0) {
          thumb = child.attr('src');
        }
        // Resolving an empty reference would yield the site root, so the
        // image is left empty when no thumbnail reference exists.
        if (thumb != null && ('' + thumb).length > 0) {
          image = absoluteUrl(thumb);
        }
      }
      markVideo(title, url, image);
    }
  } catch (e) {
    env.error(e);
  }
}

/**
 * Adds a video to the crawl queue ('Queue_YouTube') unless an entry with
 * the same URL is already there. New entries start with crawled = 'false'.
 *
 * @param title video title
 * @param url   watch-page URL
 * @param image thumbnail URL
 */
function markVideo(title, url, image) {
  if (findQueueByUrl(url)) return;
  var schema = 's|url|s|title|s|image|s|crawled';
  var entity = env.newEntity();
  entity.setSchema(schema);
  entity.setKind('Queue_YouTube');
  entity.setId(env.uniqid());
  entity.setString('url', url);
  entity.setString('title', title);
  entity.setString('image', image);
  entity.setString('crawled', 'false');
  entity.save();
}

/**
 * Reports whether the crawl queue already holds an entry with this URL.
 *
 * @param url URL to look up in the 'url' field
 * @returns true when the count query finds at least one match
 */
function findQueueByUrl(url) {
  var entity = env.newEntity();
  var query = entity.newTermQuery(entity.newTerm('url', url));
  var size = entity.count('Queue_YouTube', query, 1);
  return (size > 0);
}

/**
 * Fetches the next batch of queue entries whose 'crawled' field is 'false'.
 * The last search argument (10) is presumably the maximum batch size.
 *
 * @returns the search result for 'Queue_YouTube'; callers use size()/get(i)
 */
function loadQueue() {
  var entity = env.newEntity();
  var tag = entity.search('Queue_YouTube', entity.newTermQuery(entity.newTerm('crawled', 'false')), 10);
  return tag;
}
No comments:
Post a Comment