Wednesday, 6 June 2012

Grab video from DailyMotion

Grab video from DailyMotion
This task uses a JavaScript sandbox with jsoup and Lucene support to grab videos from DailyMotion.
Grab video from DailyMotion
  1. Create a JavaScript sandbox with jsoup support
  2. Add Lucene support to the JavaScript sandbox
  3. Create the following JavaScript
javascript
1var env;
2var args;
3var maxpage = 1000;
4
5function main(penv, pargs) {
6 env = penv;
7 args = pargs;
8 env.info('Starting');
9 while (true) {
10 var queue_list = loadQueue();
11 env.info('Size: ' + queue_list.size());
12 while (queue_list.size() > 0) {
13 for (var i = 0; i < queue_list.size(); i++) {
14 var queue = queue_list.get(i);
15 grabVideo(queue);
16 queue.setString('crawled', 'true');
17 queue.save();
18 }
19 queue_list = loadQueue();
20 }
21 grabCategory('http://www.dailymotion.com/group/in_theaters_this_week/');
22 grabCategory('http://www.dailymotion.com/group/in_theaters_now/');
23 grabCategory('http://www.dailymotion.com/group/coming_soon/');
24 grabCategory('http://www.dailymotion.com/user/ReelzChannel/lang/en/search/movie+review/');
25 grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/interview+movie/channel/shortfilms/');
26 grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/movie+news/channel/shortfilms/');
27 grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/horror/channel/shortfilms/');
28 grabCategory('http://www.dailymotion.com/user/hollywoodtv/');
29 grabCategory('http://www.dailymotion.com/creative-official/tag/rock/lang/en/channel/music/');
30 grabCategory('http://www.dailymotion.com/creative-official/tag/pop/lang/en/channel/music/');
31 grabCategory('http://www.dailymotion.com/creative-official/tag/hop/lang/en/channel/music/');
32 grabCategory('http://www.dailymotion.com/creative-official/tag/alternative/lang/en/channel/music/');
33 grabCategory('http://www.dailymotion.com/creative-official/tag/dance/lang/en/channel/music/');
34 grabCategory('http://www.dailymotion.com/creative-official/tag/soul/lang/en/channel/music/');
35 grabCategory('http://www.dailymotion.com/creative-official/tag/latin/lang/en/channel/music/');
36 grabCategory('http://www.dailymotion.com/creative-official/tag/country/lang/en/channel/music/');
37 grabCategory('http://www.dailymotion.com/user/ClassicGameRoom/');
38 grabCategory('http://www.dailymotion.com/mychannel/ClassicGameRoom/');
39 grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/fight/channel/videogames/');
40 grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/strategy/channel/videogames/');
41 grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/shooter/channel/videogames/');
42 grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/action/channel/videogames/');
43 grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/sport/channel/videogames/');
44 grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/trailer/channel/videogames/');
45 grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/review/channel/videogames/');
46 grabCategory('http://www.dailymotion.com/creative-official/lang/en/search/nfl/');
47 grabCategory('http://www.dailymotion.com/creative-official/lang/en/search/nba/');
48 grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/nhl/channel/sport/');
49 grabCategory('http://www.dailymotion.com/user/TotalCollegeSports/');
50 grabCategory('http://www.dailymotion.com/user/UFC/');
51 grabCategory('http://www.dailymotion.com/user/sportsillustrated/');
52 grabCategory('http://www.dailymotion.com/user/transworld/');
53 grabCategory('http://www.dailymotion.com/user/rooftopcomedy/');
54 grabCategory('http://www.dailymotion.com/playlist/x1qzsd_MyDamnChannel_dicki/');
55 grabCategory('http://www.dailymotion.com/playlist/x1r5r4_MyDamnChannel_easy-to-assemble-season-3/');
56 grabCategory('http://www.dailymotion.com/user/Rhettandlink/');
57 grabCategory('http://www.dailymotion.com/user/epicmealtime/');
58 grabCategory('http://www.dailymotion.com/group/familyguy/');
59 grabCategory('http://www.dailymotion.com/creative/lang/en/channel/fun/');
60 grabCategory('http://www.dailymotion.com/group/nbcnightlynews/');
61 grabCategory('http://www.dailymotion.com/user/reuters/');
62 grabCategory('http://www.dailymotion.com/user/NewsLook/');
63 grabCategory('http://www.dailymotion.com/user/NYMag/');
64 grabCategory('http://www.dailymotion.com/user/itnnews/');
65 grabCategory('http://www.dailymotion.com/user/Buzz60/');
66 grabCategory('http://www.dailymotion.com/user/associatedpress/');
67 grabCategory('http://www.dailymotion.com/us/featured/channel/news/');
68 grabCategory('http://www.dailymotion.com/user/clevvertv/');
69 grabCategory('http://www.dailymotion.com/user/tvguide/');
70 grabCategory('http://www.dailymotion.com/user/splashnews/');
71 grabCategory('http://www.dailymotion.com/user/hollywoodbackstage/');
72 grabCategory('http://www.dailymotion.com/user/celebtv/');
73 grabCategory('http://www.dailymotion.com/user/maximotv/');
74 grabCategory('http://www.dailymotion.com/user/mojosupreme/');
75 grabCategory('http://www.dailymotion.com/user/DiagonalView/');
76 grabCategory('http://www.dailymotion.com/hub/x38_Motionmaker-documentaries/');
77 grabCategory('http://www.dailymotion.com/user/tysihelp/');
78 grabCategory('http://www.dailymotion.com/user/computerTV/');
79 grabCategory('http://www.dailymotion.com/user/soldierknowsbest/');
80 grabCategory('http://www.dailymotion.com/user/appjudgment/');
81 grabCategory('http://www.dailymotion.com/user/geekbeattv/');
82 grabCategory('http://www.dailymotion.com/user/allthingsscience/');
83 grabCategory('http://www.dailymotion.com/user/stuffwelike/');
84 grabCategory('http://www.dailymotion.com/user/lifehackershow/');
85 grabCategory('http://www.dailymotion.com/us/channel/auto/');
86 }
87 env.info('Ending');
88}
89
90function grabVideo(queue) {
91 try {
92 var url = queue.getString('url');
93 var title = queue.getString('title');
94 var image = queue.getString('image');
95 var conn = env.newJsoup().connect(url).userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101');
96 var doc = conn.timeout(60000).get();
97 var child = doc.select('#video_description').first();
98 var desc = env.newString('');
99 if (child != null) {
100 desc = child.text();
101 }
102 saveLink(title, url, desc, image, '');
103 } catch (e) {
104 env.error(e);
105 }
106}
107
108function saveLink(title, url, desc, image, price) {
109 url = env.newString(url);
110 var pos = url.lastIndexOf('&feature=');
111 if (pos >= 0) {
112 url = url.substring(0, pos);
113 }
114 if (findLinkByUrl(url)) return;
115 var schema = 's|url|a|title|a|desc|s|fixed|d|score|s|site|s|image|s|price';
116 var entity = env.newEntity();
117 entity.setSchema(schema);
118 entity.setKind('Link');
119 entity.setId(env.uniqid());
120 entity.setString('url', url);
121 entity.setString('title', title);
122 entity.setString('desc', desc);
123 entity.setString('fixed', 'true');
124 entity.setDouble('score', 100);
125 entity.setString('image', image);
126 entity.setString('price', price);
127 try {
128 var t_url = env.newURL(url);
129 var t_host = t_url.getHost();
130 entity.setString('site', t_host);
131 } catch (e) {
132 env.error(e);
133 }
134 entity.save();
135 env.info(title + ' | ' + url);
136}
137
138function findLinkByUrl(url) {
139 var entity = env.newEntity();
140 var query = entity.newTermQuery(entity.newTerm('url', url));
141 var size = entity.count('Link', query, 1);
142 return (size > 0);
143}
144
145function grabCategory(catUrl) {
146 env.info('Category: ' + catUrl);
147 for (var no = 1; no <= maxpage; no++) {
148 try {
149 var conn = env.newJsoup().connect(catUrl + no + '?mode=playlist').userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101');
150 var doc = conn.timeout(60000).get();
151 var items = doc.select('.dmpi_video_item');
152 if (items.size() == 0) break;
153 for (var i = 0; i < items.size(); i++) {
154 var item = items.get(i);
155 var child = item.select('.dmpi_video_title a').first();
156 if (child == null) continue;
157 var title = child.text().trim();
158 var url = env.newString(env.newURL(env.newURL('http://www.dailymotion.com'), child.attr('href')) + '');
159 child = item.select('.dmpi_video_preview a img').first();
160 var image = env.newString('');
161 if (child != null) {
162 var tmp = env.newString(child.attr('data-spr'));
163 tmp = tmp.replaceAll('jpeg_preview_sprite.jpg', 'jpeg_preview_medium.jpg');
164 image = env.newString(env.newURL(env.newURL('http://www.dailymotion.com'), tmp) + '');
165 }
166 markVideo(title, url, image);
167 }
168 } catch (e) {
169 env.error(e);
170 }
171 }
172}
173
174function markVideo(title, url, image) {
175 if (findQueueByUrl(url)) return;
176 var schema = 's|url|s|title|s|image|s|crawled';
177 var entity = env.newEntity();
178 entity.setSchema(schema);
179 entity.setKind('Queue_DailyMotion');
180 entity.setId(env.uniqid());
181 entity.setString('url', url);
182 entity.setString('title', title);
183 entity.setString('image', image);
184 entity.setString('crawled', 'false');
185 entity.save();
186}
187
188function findQueueByUrl(url) {
189 var entity = env.newEntity();
190 var query = entity.newTermQuery(entity.newTerm('url', url));
191 var size = entity.count('Queue_DailyMotion', query, 1);
192 return (size > 0);
193}
194
195function loadQueue() {
196 var entity = env.newEntity();
197 var tag = entity.search('Queue_DailyMotion', entity.newTermQuery(entity.newTerm('crawled', 'false')), 10);
198 return tag;
199}
// Upper bound on the page number grabCategory() requests for a single listing.
var maxpage = 1000;
// Second argument passed to main(); stored here but not read by any function visible in this script.
var args;
// Sandbox environment object passed to main(); every helper reaches logging, jsoup,
// entity and URL factories through it.
var env;

/**
 * Script entry point: stores the sandbox handles, then alternates forever between
 * draining the crawl queue and re-scanning the category listings.
 *
 * Each pass first processes every queued entry returned by loadQueue() (fetching the
 * video page via grabVideo() and flagging the entry as crawled), reloading the queue
 * until it comes back empty, and then calls grabCategory() for every listing URL.
 *
 * The outer loop has no exit condition, so the trailing 'Ending' log line is never reached.
 *
 * @param penv  sandbox environment object; kept in the module-level `env`
 * @param pargs script arguments; kept in the module-level `args`
 */
function main(penv, pargs) {
    // Listing pages to scan, visited in exactly this order on every pass.
    var categoryUrls = [
        'http://www.dailymotion.com/group/in_theaters_this_week/',
        'http://www.dailymotion.com/group/in_theaters_now/',
        'http://www.dailymotion.com/group/coming_soon/',
        'http://www.dailymotion.com/user/ReelzChannel/lang/en/search/movie+review/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/interview+movie/channel/shortfilms/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/movie+news/channel/shortfilms/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/horror/channel/shortfilms/',
        'http://www.dailymotion.com/user/hollywoodtv/',
        'http://www.dailymotion.com/creative-official/tag/rock/lang/en/channel/music/',
        'http://www.dailymotion.com/creative-official/tag/pop/lang/en/channel/music/',
        'http://www.dailymotion.com/creative-official/tag/hop/lang/en/channel/music/',
        'http://www.dailymotion.com/creative-official/tag/alternative/lang/en/channel/music/',
        'http://www.dailymotion.com/creative-official/tag/dance/lang/en/channel/music/',
        'http://www.dailymotion.com/creative-official/tag/soul/lang/en/channel/music/',
        'http://www.dailymotion.com/creative-official/tag/latin/lang/en/channel/music/',
        'http://www.dailymotion.com/creative-official/tag/country/lang/en/channel/music/',
        'http://www.dailymotion.com/user/ClassicGameRoom/',
        'http://www.dailymotion.com/mychannel/ClassicGameRoom/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/fight/channel/videogames/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/strategy/channel/videogames/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/shooter/channel/videogames/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/action/channel/videogames/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/sport/channel/videogames/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/trailer/channel/videogames/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/review/channel/videogames/',
        'http://www.dailymotion.com/creative-official/lang/en/search/nfl/',
        'http://www.dailymotion.com/creative-official/lang/en/search/nba/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/nhl/channel/sport/',
        'http://www.dailymotion.com/user/TotalCollegeSports/',
        'http://www.dailymotion.com/user/UFC/',
        'http://www.dailymotion.com/user/sportsillustrated/',
        'http://www.dailymotion.com/user/transworld/',
        'http://www.dailymotion.com/user/rooftopcomedy/',
        'http://www.dailymotion.com/playlist/x1qzsd_MyDamnChannel_dicki/',
        'http://www.dailymotion.com/playlist/x1r5r4_MyDamnChannel_easy-to-assemble-season-3/',
        'http://www.dailymotion.com/user/Rhettandlink/',
        'http://www.dailymotion.com/user/epicmealtime/',
        'http://www.dailymotion.com/group/familyguy/',
        'http://www.dailymotion.com/creative/lang/en/channel/fun/',
        'http://www.dailymotion.com/group/nbcnightlynews/',
        'http://www.dailymotion.com/user/reuters/',
        'http://www.dailymotion.com/user/NewsLook/',
        'http://www.dailymotion.com/user/NYMag/',
        'http://www.dailymotion.com/user/itnnews/',
        'http://www.dailymotion.com/user/Buzz60/',
        'http://www.dailymotion.com/user/associatedpress/',
        'http://www.dailymotion.com/us/featured/channel/news/',
        'http://www.dailymotion.com/user/clevvertv/',
        'http://www.dailymotion.com/user/tvguide/',
        'http://www.dailymotion.com/user/splashnews/',
        'http://www.dailymotion.com/user/hollywoodbackstage/',
        'http://www.dailymotion.com/user/celebtv/',
        'http://www.dailymotion.com/user/maximotv/',
        'http://www.dailymotion.com/user/mojosupreme/',
        'http://www.dailymotion.com/user/DiagonalView/',
        'http://www.dailymotion.com/hub/x38_Motionmaker-documentaries/',
        'http://www.dailymotion.com/user/tysihelp/',
        'http://www.dailymotion.com/user/computerTV/',
        'http://www.dailymotion.com/user/soldierknowsbest/',
        'http://www.dailymotion.com/user/appjudgment/',
        'http://www.dailymotion.com/user/geekbeattv/',
        'http://www.dailymotion.com/user/allthingsscience/',
        'http://www.dailymotion.com/user/stuffwelike/',
        'http://www.dailymotion.com/user/lifehackershow/',
        'http://www.dailymotion.com/us/channel/auto/'
    ];

    env = penv;
    args = pargs;
    env.info('Starting');

    for (;;) {
        // Phase 1: work through the queue batch by batch until nothing is pending.
        var batch = loadQueue();
        env.info('Size: ' + batch.size());
        while (batch.size() > 0) {
            for (var idx = 0; idx < batch.size(); idx++) {
                var entry = batch.get(idx);
                grabVideo(entry);
                // The entry is flagged as crawled whether or not grabVideo() managed to save a link.
                entry.setString('crawled', 'true');
                entry.save();
            }
            batch = loadQueue();
        }

        // Phase 2: re-scan every listing so newly found videos are queued for the next pass.
        for (var c = 0; c < categoryUrls.length; c++) {
            grabCategory(categoryUrls[c]);
        }
    }

    // Unreachable: the loop above never terminates.
    env.info('Ending');
}

/**
 * Fetches the page of one queued video and stores it as a link.
 *
 * Reads 'url', 'title' and 'image' from the queue entry, downloads the page, takes the
 * text of the first '#video_description' element as the description (empty when the
 * element is absent) and hands everything to saveLink() with an empty price.
 * Any exception is logged through env.error() and not rethrown.
 *
 * @param queue queue entry exposing getString(field)
 */
function grabVideo(queue) {
    try {
        var pageUrl = queue.getString('url');
        var heading = queue.getString('title');
        var thumbnail = queue.getString('image');

        var page = env.newJsoup()
            .connect(pageUrl)
            .userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101')
            .timeout(60000)
            .get();

        var descNode = page.select('#video_description').first();
        var summary = env.newString('');
        if (descNode != null) {
            summary = descNode.text();
        }

        saveLink(heading, pageUrl, summary, thumbnail, '');
    } catch (err) {
        env.error(err);
    }
}

/**
 * Persists one video as a 'Link' entity unless a link with the same URL already exists.
 *
 * The URL is cut at the last '&feature=' (when present) before the duplicate check and
 * before it is stored. The 'site' field is filled with the URL's host; if the host
 * cannot be determined the error is logged and the entity is saved without it.
 *
 * @param title link title
 * @param url   link URL
 * @param desc  link description
 * @param image preview image URL
 * @param price stored as-is in the 'price' field
 */
function saveLink(title, url, desc, image, price) {
    var rawUrl = env.newString(url);
    var cut = rawUrl.lastIndexOf('&feature=');
    var cleanUrl = (cut >= 0) ? rawUrl.substring(0, cut) : rawUrl;

    if (findLinkByUrl(cleanUrl)) {
        return;
    }

    var link = env.newEntity();
    link.setSchema('s|url|a|title|a|desc|s|fixed|d|score|s|site|s|image|s|price');
    link.setKind('Link');
    link.setId(env.uniqid());
    link.setString('url', cleanUrl);
    link.setString('title', title);
    link.setString('desc', desc);
    link.setString('fixed', 'true');
    link.setDouble('score', 100);
    link.setString('image', image);
    link.setString('price', price);

    // Best effort: a URL that cannot be parsed must not stop the link from being saved.
    try {
        link.setString('site', env.newURL(cleanUrl).getHost());
    } catch (err) {
        env.error(err);
    }

    link.save();
    env.info(title + ' | ' + cleanUrl);
}

/**
 * Reports whether a 'Link' entity with exactly this URL is already stored.
 *
 * @param url URL to look up via a term query on the 'url' field
 * @returns true when the count query (limit 1) finds at least one match
 */
function findLinkByUrl(url) {
    var store = env.newEntity();
    var urlTerm = store.newTerm('url', url);
    var matches = store.count('Link', store.newTermQuery(urlTerm), 1);
    return matches > 0;
}

/**
 * Walks the paginated playlist view of one listing and queues every video found on it.
 *
 * Pages are requested as catUrl + pageNumber + '?mode=playlist', starting at 1. The walk
 * stops at the first page that contains no '.dmpi_video_item' elements, or after
 * `maxpage` pages. For each item the title link gives the title and the (absolutised)
 * video URL; the preview image's 'data-spr' attribute, with the sprite file name swapped
 * for the medium preview, gives the image URL. Items without a title link are skipped.
 *
 * An exception while handling a page is logged through env.error() and the walk
 * continues with the next page number.
 *
 * @param catUrl listing URL; the page number is appended directly, so it is expected to
 *               end with '/'
 */
function grabCategory(catUrl) {
    env.info('Category: ' + catUrl);
    for (var no = 1; no <= maxpage; no++) {
        try {
            var conn = env.newJsoup().connect(catUrl + no + '?mode=playlist').userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101');
            var doc = conn.timeout(60000).get();
            var items = doc.select('.dmpi_video_item');
            // An empty page marks the end of the listing.
            if (items.size() == 0) break;
            for (var i = 0; i < items.size(); i++) {
                var item = items.get(i);
                var child = item.select('.dmpi_video_title a').first();
                if (child == null) continue;
                var title = child.text().trim();
                // Resolve the (possibly relative) href against the site root.
                var url = env.newString(env.newURL(env.newURL('http://www.dailymotion.com'), child.attr('href')) + '');
                child = item.select('.dmpi_video_preview a img').first();
                var image = env.newString('');
                if (child != null) {
                    var tmp = env.newString(child.attr('data-spr'));
                    // Fixed: the method name was garbled ("replaceAl?"), which made the whole script unparsable.
                    // NOTE(review): if env.newString() yields a java.lang.String, replaceAll() treats its first
                    // argument as a regex, so the '.' matches any character; harmless for this file name -- confirm.
                    tmp = tmp.replaceAll('jpeg_preview_sprite.jpg', 'jpeg_preview_medium.jpg');
                    image = env.newString(env.newURL(env.newURL('http://www.dailymotion.com'), tmp) + '');
                }
                markVideo(title, url, image);
            }
        } catch (e) {
            env.error(e);
        }
    }
}

/**
 * Adds a video to the crawl queue unless its URL is already queued.
 *
 * Creates a 'Queue_DailyMotion' entity holding the URL, title and image, with
 * 'crawled' set to 'false' so that loadQueue() will pick it up.
 *
 * @param title video title
 * @param url   video page URL, also used for the duplicate check
 * @param image preview image URL
 */
function markVideo(title, url, image) {
    if (!findQueueByUrl(url)) {
        var record = env.newEntity();
        record.setSchema('s|url|s|title|s|image|s|crawled');
        record.setKind('Queue_DailyMotion');
        record.setId(env.uniqid());

        // Field/value pairs, written in this order.
        var fields = [
            ['url', url],
            ['title', title],
            ['image', image],
            ['crawled', 'false']
        ];
        for (var k = 0; k < fields.length; k++) {
            record.setString(fields[k][0], fields[k][1]);
        }

        record.save();
    }
}

/**
 * Reports whether a 'Queue_DailyMotion' entity with exactly this URL already exists.
 *
 * @param url URL to look up via a term query on the 'url' field
 * @returns true when the count query (limit 1) finds at least one match
 */
function findQueueByUrl(url) {
    var store = env.newEntity();
    var urlTerm = store.newTerm('url', url);
    var matches = store.count('Queue_DailyMotion', store.newTermQuery(urlTerm), 1);
    return matches > 0;
}

/**
 * Loads the next batch of queue entries that have not been crawled yet.
 *
 * @returns the result of searching 'Queue_DailyMotion' for entries whose 'crawled'
 *          field is 'false', with 10 passed as the search's third argument
 */
function loadQueue() {
    var store = env.newEntity();
    var pendingTerm = store.newTerm('crawled', 'false');
    var pendingQuery = store.newTermQuery(pendingTerm);
    return store.search('Queue_DailyMotion', pendingQuery, 10);
}

  Protected by Copyscape Online Copyright Protection

No comments:

Post a Comment