Grab video from DailyMotion
Grab video from DailyMotion
- Create a JavaScript sandbox with jsoup support
- Add Lucene support to the JavaScript sandbox
- Create the JavaScript as follows:
javascript
1 | var env; |
2 | var args; |
3 | var maxpage = 1000; |
4 | |
5 | function main(penv, pargs) { |
6 | env = penv; |
7 | args = pargs; |
8 | env.info('Starting'); |
9 | while (true) { |
10 | var queue_list = loadQueue(); |
11 | env.info('Size: ' + queue_list.size()); |
12 | while (queue_list.size() > 0) { |
13 | for (var i = 0; i < queue_list.size(); i++) { |
14 | var queue = queue_list.get(i); |
15 | grabVideo(queue); |
16 | queue.setString('crawled', 'true'); |
17 | queue.save(); |
18 | } |
19 | queue_list = loadQueue(); |
20 | } |
21 | grabCategory('http://www.dailymotion.com/group/in_theaters_this_week/'); |
22 | grabCategory('http://www.dailymotion.com/group/in_theaters_now/'); |
23 | grabCategory('http://www.dailymotion.com/group/coming_soon/'); |
24 | grabCategory('http://www.dailymotion.com/user/ReelzChannel/lang/en/search/movie+review/'); |
25 | grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/interview+movie/channel/shortfilms/'); |
26 | grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/movie+news/channel/shortfilms/'); |
27 | grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/horror/channel/shortfilms/'); |
28 | grabCategory('http://www.dailymotion.com/user/hollywoodtv/'); |
29 | grabCategory('http://www.dailymotion.com/creative-official/tag/rock/lang/en/channel/music/'); |
30 | grabCategory('http://www.dailymotion.com/creative-official/tag/pop/lang/en/channel/music/'); |
31 | grabCategory('http://www.dailymotion.com/creative-official/tag/hop/lang/en/channel/music/'); |
32 | grabCategory('http://www.dailymotion.com/creative-official/tag/alternative/lang/en/channel/music/'); |
33 | grabCategory('http://www.dailymotion.com/creative-official/tag/dance/lang/en/channel/music/'); |
34 | grabCategory('http://www.dailymotion.com/creative-official/tag/soul/lang/en/channel/music/'); |
35 | grabCategory('http://www.dailymotion.com/creative-official/tag/latin/lang/en/channel/music/'); |
36 | grabCategory('http://www.dailymotion.com/creative-official/tag/country/lang/en/channel/music/'); |
37 | grabCategory('http://www.dailymotion.com/user/ClassicGameRoom/'); |
38 | grabCategory('http://www.dailymotion.com/mychannel/ClassicGameRoom/'); |
39 | grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/fight/channel/videogames/'); |
40 | grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/strategy/channel/videogames/'); |
41 | grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/shooter/channel/videogames/'); |
42 | grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/action/channel/videogames/'); |
43 | grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/sport/channel/videogames/'); |
44 | grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/trailer/channel/videogames/'); |
45 | grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/review/channel/videogames/'); |
46 | grabCategory('http://www.dailymotion.com/creative-official/lang/en/search/nfl/'); |
47 | grabCategory('http://www.dailymotion.com/creative-official/lang/en/search/nba/'); |
48 | grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/nhl/channel/sport/'); |
49 | grabCategory('http://www.dailymotion.com/user/TotalCollegeSports/'); |
50 | grabCategory('http://www.dailymotion.com/user/UFC/'); |
51 | grabCategory('http://www.dailymotion.com/user/sportsillustrated/'); |
52 | grabCategory('http://www.dailymotion.com/user/transworld/'); |
53 | grabCategory('http://www.dailymotion.com/user/rooftopcomedy/'); |
54 | grabCategory('http://www.dailymotion.com/playlist/x1qzsd_MyDamnChannel_dicki/'); |
55 | grabCategory('http://www.dailymotion.com/playlist/x1r5r4_MyDamnChannel_easy-to-assemble-season-3/'); |
56 | grabCategory('http://www.dailymotion.com/user/Rhettandlink/'); |
57 | grabCategory('http://www.dailymotion.com/user/epicmealtime/'); |
58 | grabCategory('http://www.dailymotion.com/group/familyguy/'); |
59 | grabCategory('http://www.dailymotion.com/creative/lang/en/channel/fun/'); |
60 | grabCategory('http://www.dailymotion.com/group/nbcnightlynews/'); |
61 | grabCategory('http://www.dailymotion.com/user/reuters/'); |
62 | grabCategory('http://www.dailymotion.com/user/NewsLook/'); |
63 | grabCategory('http://www.dailymotion.com/user/NYMag/'); |
64 | grabCategory('http://www.dailymotion.com/user/itnnews/'); |
65 | grabCategory('http://www.dailymotion.com/user/Buzz60/'); |
66 | grabCategory('http://www.dailymotion.com/user/associatedpress/'); |
67 | grabCategory('http://www.dailymotion.com/us/featured/channel/news/'); |
68 | grabCategory('http://www.dailymotion.com/user/clevvertv/'); |
69 | grabCategory('http://www.dailymotion.com/user/tvguide/'); |
70 | grabCategory('http://www.dailymotion.com/user/splashnews/'); |
71 | grabCategory('http://www.dailymotion.com/user/hollywoodbackstage/'); |
72 | grabCategory('http://www.dailymotion.com/user/celebtv/'); |
73 | grabCategory('http://www.dailymotion.com/user/maximotv/'); |
74 | grabCategory('http://www.dailymotion.com/user/mojosupreme/'); |
75 | grabCategory('http://www.dailymotion.com/user/DiagonalView/'); |
76 | grabCategory('http://www.dailymotion.com/hub/x38_Motionmaker-documentaries/'); |
77 | grabCategory('http://www.dailymotion.com/user/tysihelp/'); |
78 | grabCategory('http://www.dailymotion.com/user/computerTV/'); |
79 | grabCategory('http://www.dailymotion.com/user/soldierknowsbest/'); |
80 | grabCategory('http://www.dailymotion.com/user/appjudgment/'); |
81 | grabCategory('http://www.dailymotion.com/user/geekbeattv/'); |
82 | grabCategory('http://www.dailymotion.com/user/allthingsscience/'); |
83 | grabCategory('http://www.dailymotion.com/user/stuffwelike/'); |
84 | grabCategory('http://www.dailymotion.com/user/lifehackershow/'); |
85 | grabCategory('http://www.dailymotion.com/us/channel/auto/'); |
86 | } |
87 | env.info('Ending'); |
88 | } |
89 | |
90 | function grabVideo(queue) { |
91 | try { |
92 | var url = queue.getString('url'); |
93 | var title = queue.getString('title'); |
94 | var image = queue.getString('image'); |
95 | var conn = env.newJsoup().connect(url).userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101'); |
96 | var doc = conn.timeout(60000).get(); |
97 | var child = doc.select('#video_description').first(); |
98 | var desc = env.newString(''); |
99 | if (child != null) { |
100 | desc = child.text(); |
101 | } |
102 | saveLink(title, url, desc, image, ''); |
103 | } catch (e) { |
104 | env.error(e); |
105 | } |
106 | } |
107 | |
108 | function saveLink(title, url, desc, image, price) { |
109 | url = env.newString(url); |
110 | var pos = url.lastIndexOf('&feature='); |
111 | if (pos >= 0) { |
112 | url = url.substring(0, pos); |
113 | } |
114 | if (findLinkByUrl(url)) return; |
115 | var schema = 's|url|a|title|a|desc|s|fixed|d|score|s|site|s|image|s|price'; |
116 | var entity = env.newEntity(); |
117 | entity.setSchema(schema); |
118 | entity.setKind('Link'); |
119 | entity.setId(env.uniqid()); |
120 | entity.setString('url', url); |
121 | entity.setString('title', title); |
122 | entity.setString('desc', desc); |
123 | entity.setString('fixed', 'true'); |
124 | entity.setDouble('score', 100); |
125 | entity.setString('image', image); |
126 | entity.setString('price', price); |
127 | try { |
128 | var t_url = env.newURL(url); |
129 | var t_host = t_url.getHost(); |
130 | entity.setString('site', t_host); |
131 | } catch (e) { |
132 | env.error(e); |
133 | } |
134 | entity.save(); |
135 | env.info(title + ' | ' + url); |
136 | } |
137 | |
138 | function findLinkByUrl(url) { |
139 | var entity = env.newEntity(); |
140 | var query = entity.newTermQuery(entity.newTerm('url', url)); |
141 | var size = entity.count('Link', query, 1); |
142 | return (size > 0); |
143 | } |
144 | |
145 | function grabCategory(catUrl) { |
146 | env.info('Category: ' + catUrl); |
147 | for (var no = 1; no <= maxpage; no++) { |
148 | try { |
149 | var conn = env.newJsoup().connect(catUrl + no + '?mode=playlist').userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101'); |
150 | var doc = conn.timeout(60000).get(); |
151 | var items = doc.select('.dmpi_video_item'); |
152 | if (items.size() == 0) break; |
153 | for (var i = 0; i < items.size(); i++) { |
154 | var item = items.get(i); |
155 | var child = item.select('.dmpi_video_title a').first(); |
156 | if (child == null) continue; |
157 | var title = child.text().trim(); |
158 | var url = env.newString(env.newURL(env.newURL('http://www.dailymotion.com'), child.attr('href')) + ''); |
159 | child = item.select('.dmpi_video_preview a img').first(); |
160 | var image = env.newString(''); |
161 | if (child != null) { |
162 | var tmp = env.newString(child.attr('data-spr')); |
163 | tmp = tmp.replaceAll('jpeg_preview_sprite.jpg', 'jpeg_preview_medium.jpg'); |
164 | image = env.newString(env.newURL(env.newURL('http://www.dailymotion.com'), tmp) + ''); |
165 | } |
166 | markVideo(title, url, image); |
167 | } |
168 | } catch (e) { |
169 | env.error(e); |
170 | } |
171 | } |
172 | } |
173 | |
174 | function markVideo(title, url, image) { |
175 | if (findQueueByUrl(url)) return; |
176 | var schema = 's|url|s|title|s|image|s|crawled'; |
177 | var entity = env.newEntity(); |
178 | entity.setSchema(schema); |
179 | entity.setKind('Queue_DailyMotion'); |
180 | entity.setId(env.uniqid()); |
181 | entity.setString('url', url); |
182 | entity.setString('title', title); |
183 | entity.setString('image', image); |
184 | entity.setString('crawled', 'false'); |
185 | entity.save(); |
186 | } |
187 | |
188 | function findQueueByUrl(url) { |
189 | var entity = env.newEntity(); |
190 | var query = entity.newTermQuery(entity.newTerm('url', url)); |
191 | var size = entity.count('Queue_DailyMotion', query, 1); |
192 | return (size > 0); |
193 | } |
194 | |
195 | function loadQueue() { |
196 | var entity = env.newEntity(); |
197 | var tag = entity.search('Queue_DailyMotion', entity.newTermQuery(entity.newTerm('crawled', 'false')), 10); |
198 | return tag; |
199 | } |
/*
 * DailyMotion crawler script for the sandbox.
 *
 * Everything reaches the outside world through the `env` object handed to
 * main(): logging (info/error), HTML fetching (newJsoup), URL handling
 * (newURL), string creation (newString) and entity storage (newEntity).
 * NOTE(review): the exact semantics of those host methods are not visible
 * here; comments below describe only how this script uses them.
 *
 * Flow: listing pages are scanned and every video found is stored as a
 * 'Queue_DailyMotion' entity with crawled='false'; queued entries are then
 * fetched one by one and stored as 'Link' entities.
 */

// Host environment and arguments, assigned by main().
var env;
var args;

// Upper bound on listing pages requested per category.
var maxpage = 1000;

// User-agent string sent with every request.
var USER_AGENT = 'Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101';

// Per-request timeout passed to the connection (presumably milliseconds — confirm against jsoup).
var FETCH_TIMEOUT = 60000;

// Base used to resolve relative links found on listing pages.
var SITE_ROOT = 'http://www.dailymotion.com';

// Listing pages to crawl, visited in this order on every pass of main().
var CATEGORY_URLS = [
  'http://www.dailymotion.com/group/in_theaters_this_week/',
  'http://www.dailymotion.com/group/in_theaters_now/',
  'http://www.dailymotion.com/group/coming_soon/',
  'http://www.dailymotion.com/user/ReelzChannel/lang/en/search/movie+review/',
  'http://www.dailymotion.com/us/creative-official/lang/en/search/interview+movie/channel/shortfilms/',
  'http://www.dailymotion.com/us/creative-official/lang/en/search/movie+news/channel/shortfilms/',
  'http://www.dailymotion.com/us/creative-official/lang/en/search/horror/channel/shortfilms/',
  'http://www.dailymotion.com/user/hollywoodtv/',
  'http://www.dailymotion.com/creative-official/tag/rock/lang/en/channel/music/',
  'http://www.dailymotion.com/creative-official/tag/pop/lang/en/channel/music/',
  'http://www.dailymotion.com/creative-official/tag/hop/lang/en/channel/music/',
  'http://www.dailymotion.com/creative-official/tag/alternative/lang/en/channel/music/',
  'http://www.dailymotion.com/creative-official/tag/dance/lang/en/channel/music/',
  'http://www.dailymotion.com/creative-official/tag/soul/lang/en/channel/music/',
  'http://www.dailymotion.com/creative-official/tag/latin/lang/en/channel/music/',
  'http://www.dailymotion.com/creative-official/tag/country/lang/en/channel/music/',
  'http://www.dailymotion.com/user/ClassicGameRoom/',
  'http://www.dailymotion.com/mychannel/ClassicGameRoom/',
  'http://www.dailymotion.com/us/creative-official/lang/en/search/fight/channel/videogames/',
  'http://www.dailymotion.com/us/creative-official/lang/en/search/strategy/channel/videogames/',
  'http://www.dailymotion.com/us/creative-official/lang/en/search/shooter/channel/videogames/',
  'http://www.dailymotion.com/us/creative-official/lang/en/search/action/channel/videogames/',
  'http://www.dailymotion.com/us/creative-official/lang/en/search/sport/channel/videogames/',
  'http://www.dailymotion.com/us/creative-official/lang/en/search/trailer/channel/videogames/',
  'http://www.dailymotion.com/us/creative-official/lang/en/search/review/channel/videogames/',
  'http://www.dailymotion.com/creative-official/lang/en/search/nfl/',
  'http://www.dailymotion.com/creative-official/lang/en/search/nba/',
  'http://www.dailymotion.com/us/creative-official/lang/en/search/nhl/channel/sport/',
  'http://www.dailymotion.com/user/TotalCollegeSports/',
  'http://www.dailymotion.com/user/UFC/',
  'http://www.dailymotion.com/user/sportsillustrated/',
  'http://www.dailymotion.com/user/transworld/',
  'http://www.dailymotion.com/user/rooftopcomedy/',
  'http://www.dailymotion.com/playlist/x1qzsd_MyDamnChannel_dicki/',
  'http://www.dailymotion.com/playlist/x1r5r4_MyDamnChannel_easy-to-assemble-season-3/',
  'http://www.dailymotion.com/user/Rhettandlink/',
  'http://www.dailymotion.com/user/epicmealtime/',
  'http://www.dailymotion.com/group/familyguy/',
  'http://www.dailymotion.com/creative/lang/en/channel/fun/',
  'http://www.dailymotion.com/group/nbcnightlynews/',
  'http://www.dailymotion.com/user/reuters/',
  'http://www.dailymotion.com/user/NewsLook/',
  'http://www.dailymotion.com/user/NYMag/',
  'http://www.dailymotion.com/user/itnnews/',
  'http://www.dailymotion.com/user/Buzz60/',
  'http://www.dailymotion.com/user/associatedpress/',
  'http://www.dailymotion.com/us/featured/channel/news/',
  'http://www.dailymotion.com/user/clevvertv/',
  'http://www.dailymotion.com/user/tvguide/',
  'http://www.dailymotion.com/user/splashnews/',
  'http://www.dailymotion.com/user/hollywoodbackstage/',
  'http://www.dailymotion.com/user/celebtv/',
  'http://www.dailymotion.com/user/maximotv/',
  'http://www.dailymotion.com/user/mojosupreme/',
  'http://www.dailymotion.com/user/DiagonalView/',
  'http://www.dailymotion.com/hub/x38_Motionmaker-documentaries/',
  'http://www.dailymotion.com/user/tysihelp/',
  'http://www.dailymotion.com/user/computerTV/',
  'http://www.dailymotion.com/user/soldierknowsbest/',
  'http://www.dailymotion.com/user/appjudgment/',
  'http://www.dailymotion.com/user/geekbeattv/',
  'http://www.dailymotion.com/user/allthingsscience/',
  'http://www.dailymotion.com/user/stuffwelike/',
  'http://www.dailymotion.com/user/lifehackershow/',
  'http://www.dailymotion.com/us/channel/auto/'
];

/**
 * Script entry point: alternates forever between draining the pending
 * queue and re-scanning every category listing.
 *
 * @param penv  host environment object; stored in the global `env`
 * @param pargs script arguments; stored in the global `args` (not read by this script)
 */
function main(penv, pargs) {
  env = penv;
  args = pargs;
  env.info('Starting');
  while (true) {
    var queue_list = loadQueue();
    env.info('Size: ' + queue_list.size());
    // loadQueue() returns one batch at a time, so keep reloading until a
    // batch comes back empty.
    while (queue_list.size() > 0) {
      for (var i = 0; i < queue_list.size(); i++) {
        var queue = queue_list.get(i);
        grabVideo(queue);
        // Marked as crawled even when grabVideo() logged an error, so a
        // failing entry is not picked up again by loadQueue().
        queue.setString('crawled', 'true');
        queue.save();
      }
      queue_list = loadQueue();
    }
    for (var c = 0; c < CATEGORY_URLS.length; c++) {
      grabCategory(CATEGORY_URLS[c]);
    }
  }
  // Not reached while the loop above has no exit; kept from the original.
  env.info('Ending');
}

/**
 * Fetches a page with the crawler's user agent and timeout.
 *
 * @param url address to request
 * @returns the parsed document returned by the connection's get()
 */
function fetchDocument(url) {
  var conn = env.newJsoup().connect(url).userAgent(USER_AGENT);
  return conn.timeout(FETCH_TIMEOUT).get();
}

/**
 * Resolves a link against SITE_ROOT and returns it as an env string.
 *
 * @param href link taken from a listing page
 * @returns env.newString() of the resolved URL's string form
 */
function resolveUrl(href) {
  return env.newString(env.newURL(env.newURL(SITE_ROOT), href) + '');
}

/**
 * Fetches the page of one queued video, reads its description from the
 * '#video_description' element (empty when absent) and stores the video
 * through saveLink(). Any exception is logged with env.error().
 *
 * @param queue queue entity providing 'url', 'title' and 'image'
 */
function grabVideo(queue) {
  try {
    var url = queue.getString('url');
    var title = queue.getString('title');
    var image = queue.getString('image');
    var doc = fetchDocument(url);
    var child = doc.select('#video_description').first();
    var desc = env.newString('');
    if (child != null) {
      desc = child.text();
    }
    saveLink(title, url, desc, image, '');
  } catch (e) {
    env.error(e);
  }
}

/**
 * Stores a video as a 'Link' entity unless one with the same URL exists.
 * Everything from the last '&feature=' onward is cut off the URL first.
 *
 * @param title video title
 * @param url   video page URL
 * @param desc  description text
 * @param image preview image URL
 * @param price stored verbatim in the 'price' field
 */
function saveLink(title, url, desc, image, price) {
  url = env.newString(url);
  var pos = url.lastIndexOf('&feature=');
  if (pos >= 0) {
    url = url.substring(0, pos);
  }
  if (findLinkByUrl(url)) return;
  var schema = 's|url|a|title|a|desc|s|fixed|d|score|s|site|s|image|s|price';
  var entity = env.newEntity();
  entity.setSchema(schema);
  entity.setKind('Link');
  entity.setId(env.uniqid());
  entity.setString('url', url);
  entity.setString('title', title);
  entity.setString('desc', desc);
  entity.setString('fixed', 'true');
  entity.setDouble('score', 100);
  entity.setString('image', image);
  entity.setString('price', price);
  try {
    // 'site' is best-effort: if the host cannot be read the entity is
    // still saved without it.
    var t_url = env.newURL(url);
    var t_host = t_url.getHost();
    entity.setString('site', t_host);
  } catch (e) {
    env.error(e);
  }
  entity.save();
  env.info(title + ' | ' + url);
}

/**
 * @param url URL to look up
 * @returns true when at least one 'Link' entity matches the 'url' term
 */
function findLinkByUrl(url) {
  var entity = env.newEntity();
  var query = entity.newTermQuery(entity.newTerm('url', url));
  // Third argument is presumably a result cap — TODO confirm.
  var size = entity.count('Link', query, 1);
  return (size > 0);
}

/**
 * Walks the numbered pages of one listing (catUrl + page + '?mode=playlist')
 * and queues every video found. Stops at the first page with no
 * '.dmpi_video_item' elements or after `maxpage` pages. An exception on a
 * page is logged and the loop moves on to the next page number.
 *
 * @param catUrl listing base URL; the page number is appended directly
 */
function grabCategory(catUrl) {
  env.info('Category: ' + catUrl);
  for (var no = 1; no <= maxpage; no++) {
    try {
      var doc = fetchDocument(catUrl + no + '?mode=playlist');
      var items = doc.select('.dmpi_video_item');
      if (items.size() == 0) break;
      for (var i = 0; i < items.size(); i++) {
        var item = items.get(i);
        var child = item.select('.dmpi_video_title a').first();
        if (child == null) continue;
        var title = child.text().trim();
        var url = resolveUrl(child.attr('href'));
        child = item.select('.dmpi_video_preview a img').first();
        var image = env.newString('');
        if (child != null) {
          var tmp = env.newString(child.attr('data-spr'));
          // Swap the sprite-sheet file name for the medium preview image.
          // Fixed: the call name was garbled ("replaceAl?"), which made the
          // whole script a syntax error.
          // NOTE(review): if newString yields a java.lang.String, the first
          // argument is treated as a regex; the '.' then matches any
          // character, which is harmless for this file name.
          tmp = tmp.replaceAll('jpeg_preview_sprite.jpg', 'jpeg_preview_medium.jpg');
          image = resolveUrl(tmp);
        }
        markVideo(title, url, image);
      }
    } catch (e) {
      env.error(e);
    }
  }
}

/**
 * Queues a video as a 'Queue_DailyMotion' entity with crawled='false',
 * unless the URL is already queued.
 *
 * @param title video title
 * @param url   video page URL
 * @param image preview image URL
 */
function markVideo(title, url, image) {
  if (findQueueByUrl(url)) return;
  var schema = 's|url|s|title|s|image|s|crawled';
  var entity = env.newEntity();
  entity.setSchema(schema);
  entity.setKind('Queue_DailyMotion');
  entity.setId(env.uniqid());
  entity.setString('url', url);
  entity.setString('title', title);
  entity.setString('image', image);
  entity.setString('crawled', 'false');
  entity.save();
}

/**
 * @param url URL to look up
 * @returns true when at least one 'Queue_DailyMotion' entity matches the 'url' term
 */
function findQueueByUrl(url) {
  var entity = env.newEntity();
  var query = entity.newTermQuery(entity.newTerm('url', url));
  // Third argument is presumably a result cap — TODO confirm.
  var size = entity.count('Queue_DailyMotion', query, 1);
  return (size > 0);
}

/**
 * Loads the next batch of queue entries whose 'crawled' term is 'false'.
 *
 * @returns the search result list (read by callers via size() and get(i));
 *          the 10 passed to search() is presumably the batch size — TODO confirm
 */
function loadQueue() {
  var entity = env.newEntity();
  var tag = entity.search('Queue_DailyMotion', entity.newTermQuery(entity.newTerm('crawled', 'false')), 10);
  return tag;
}
No comments:
Post a Comment