Wednesday 6 June 2012

Grab video from DailyMotion

Grab video from DailyMotion
This task uses a JavaScript sandbox with jsoup and Lucene support to grab videos from DailyMotion.
Grab video from DailyMotion
  1. Create a JavaScript sandbox with jsoup support
  2. Add Lucene support to the JavaScript sandbox
  3. Create the JavaScript as follows:
javascript
// Sandbox environment object; assigned from main()'s first argument and used by every function in this script.
var env;
// Arguments handed to main(); stored here but not read anywhere in the code shown in this script.
var args;
// Highest page number grabCategory() will request for a single category URL.
var maxpage = 1000;

/**
 * Script entry point. Stores the sandbox environment and arguments in the
 * script-level globals, then loops forever: first it drains the pending
 * video queue (grabbing each queued video and flagging it as crawled), then
 * it re-scans every category listing to discover new videos.
 *
 * @param penv  sandbox environment object (provides info(), entities, jsoup, ...)
 * @param pargs arguments supplied by the sandbox
 */
function main(penv, pargs) {
    env = penv;
    args = pargs;
    env.info('Starting');

    // Category / channel / user listings to scan, visited in this order on every pass.
    var categoryUrls = [
        'http://www.dailymotion.com/group/in_theaters_this_week/',
        'http://www.dailymotion.com/group/in_theaters_now/',
        'http://www.dailymotion.com/group/coming_soon/',
        'http://www.dailymotion.com/user/ReelzChannel/lang/en/search/movie+review/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/interview+movie/channel/shortfilms/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/movie+news/channel/shortfilms/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/horror/channel/shortfilms/',
        'http://www.dailymotion.com/user/hollywoodtv/',
        'http://www.dailymotion.com/creative-official/tag/rock/lang/en/channel/music/',
        'http://www.dailymotion.com/creative-official/tag/pop/lang/en/channel/music/',
        'http://www.dailymotion.com/creative-official/tag/hop/lang/en/channel/music/',
        'http://www.dailymotion.com/creative-official/tag/alternative/lang/en/channel/music/',
        'http://www.dailymotion.com/creative-official/tag/dance/lang/en/channel/music/',
        'http://www.dailymotion.com/creative-official/tag/soul/lang/en/channel/music/',
        'http://www.dailymotion.com/creative-official/tag/latin/lang/en/channel/music/',
        'http://www.dailymotion.com/creative-official/tag/country/lang/en/channel/music/',
        'http://www.dailymotion.com/user/ClassicGameRoom/',
        'http://www.dailymotion.com/mychannel/ClassicGameRoom/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/fight/channel/videogames/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/strategy/channel/videogames/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/shooter/channel/videogames/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/action/channel/videogames/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/sport/channel/videogames/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/trailer/channel/videogames/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/review/channel/videogames/',
        'http://www.dailymotion.com/creative-official/lang/en/search/nfl/',
        'http://www.dailymotion.com/creative-official/lang/en/search/nba/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/nhl/channel/sport/',
        'http://www.dailymotion.com/user/TotalCollegeSports/',
        'http://www.dailymotion.com/user/UFC/',
        'http://www.dailymotion.com/user/sportsillustrated/',
        'http://www.dailymotion.com/user/transworld/',
        'http://www.dailymotion.com/user/rooftopcomedy/',
        'http://www.dailymotion.com/playlist/x1qzsd_MyDamnChannel_dicki/',
        'http://www.dailymotion.com/playlist/x1r5r4_MyDamnChannel_easy-to-assemble-season-3/',
        'http://www.dailymotion.com/user/Rhettandlink/',
        'http://www.dailymotion.com/user/epicmealtime/',
        'http://www.dailymotion.com/group/familyguy/',
        'http://www.dailymotion.com/creative/lang/en/channel/fun/',
        'http://www.dailymotion.com/group/nbcnightlynews/',
        'http://www.dailymotion.com/user/reuters/',
        'http://www.dailymotion.com/user/NewsLook/',
        'http://www.dailymotion.com/user/NYMag/',
        'http://www.dailymotion.com/user/itnnews/',
        'http://www.dailymotion.com/user/Buzz60/',
        'http://www.dailymotion.com/user/associatedpress/',
        'http://www.dailymotion.com/us/featured/channel/news/',
        'http://www.dailymotion.com/user/clevvertv/',
        'http://www.dailymotion.com/user/tvguide/',
        'http://www.dailymotion.com/user/splashnews/',
        'http://www.dailymotion.com/user/hollywoodbackstage/',
        'http://www.dailymotion.com/user/celebtv/',
        'http://www.dailymotion.com/user/maximotv/',
        'http://www.dailymotion.com/user/mojosupreme/',
        'http://www.dailymotion.com/user/DiagonalView/',
        'http://www.dailymotion.com/hub/x38_Motionmaker-documentaries/',
        'http://www.dailymotion.com/user/tysihelp/',
        'http://www.dailymotion.com/user/computerTV/',
        'http://www.dailymotion.com/user/soldierknowsbest/',
        'http://www.dailymotion.com/user/appjudgment/',
        'http://www.dailymotion.com/user/geekbeattv/',
        'http://www.dailymotion.com/user/allthingsscience/',
        'http://www.dailymotion.com/user/stuffwelike/',
        'http://www.dailymotion.com/user/lifehackershow/',
        'http://www.dailymotion.com/us/channel/auto/'
    ];

    while (true) {
        // Phase 1: work through the queue batch by batch until no uncrawled entry is left.
        var pending = loadQueue();
        env.info('Size: ' + pending.size());
        while (pending.size() > 0) {
            for (var idx = 0; idx < pending.size(); idx++) {
                var entry = pending.get(idx);
                grabVideo(entry);
                // Flagged as crawled whether or not grabVideo() succeeded (it logs its own errors).
                entry.setString('crawled', 'true');
                entry.save();
            }
            pending = loadQueue();
        }

        // Phase 2: re-scan every listing so newly published videos get queued.
        for (var c = 0; c < categoryUrls.length; c++) {
            grabCategory(categoryUrls[c]);
        }
    }
    // Never reached: the loop above has no exit condition.
    env.info('Ending');
}

/**
 * Fetches the page of one queued video, reads the text of its
 * '#video_description' element (empty when the element is missing) and
 * passes title, url, description and image on to saveLink().
 * Any exception is reported through env.error() and not rethrown.
 *
 * @param queue queue entity exposing getString() for 'url', 'title' and 'image'
 */
function grabVideo(queue) {
    try {
        var pageUrl = queue.getString('url');
        var videoTitle = queue.getString('title');
        var thumbnail = queue.getString('image');

        var page = env.newJsoup()
            .connect(pageUrl)
            .userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101')
            .timeout(60000)
            .get();

        var descNode = page.select('#video_description').first();
        var description = env.newString('');
        if (descNode != null) {
            description = descNode.text();
        }

        // Price is always saved as an empty string for these links.
        saveLink(videoTitle, pageUrl, description, thumbnail, '');
    } catch (err) {
        env.error(err);
    }
}

/**
 * Stores a video as a 'Link' entity unless a link with the same url already
 * exists. Anything from the last '&feature=' onwards is cut off the url
 * before the duplicate check and before saving.
 *
 * @param title link title
 * @param url   link url
 * @param desc  description text
 * @param image preview image url
 * @param price price text
 */
function saveLink(title, url, desc, image, price) {
    var linkUrl = env.newString(url);
    var cut = linkUrl.lastIndexOf('&feature=');
    if (cut >= 0) {
        linkUrl = linkUrl.substring(0, cut);
    }

    if (findLinkByUrl(linkUrl)) {
        return;
    }

    var link = env.newEntity();
    // NOTE(review): schema looks like type-code|field pairs — confirm the code meanings against the sandbox docs.
    link.setSchema('s|url|a|title|a|desc|s|fixed|d|score|s|site|s|image|s|price');
    link.setKind('Link');
    link.setId(env.uniqid());
    link.setString('url', linkUrl);
    link.setString('title', title);
    link.setString('desc', desc);
    link.setString('fixed', 'true');
    link.setDouble('score', 100);
    link.setString('image', image);
    link.setString('price', price);

    // The 'site' field is best-effort: a url that cannot be parsed is logged and the link is saved without it.
    try {
        link.setString('site', env.newURL(linkUrl).getHost());
    } catch (err) {
        env.error(err);
    }

    link.save();
    env.info(title + ' | ' + linkUrl);
}

/**
 * Tells whether a 'Link' entity with the given url is already stored.
 *
 * @param url url to look up via a term query on the 'url' field
 * @returns true when the count for that query is greater than zero
 */
function findLinkByUrl(url) {
    var finder = env.newEntity();
    var urlTerm = finder.newTerm('url', url);
    // The third argument 1 is passed through to count() unchanged.
    var matches = finder.count('Link', finder.newTermQuery(urlTerm), 1);
    return matches > 0;
}

/**
 * Walks the paginated playlist view of one category/listing url and queues
 * every video found on it via markVideo().
 *
 * Pages are requested as catUrl + pageNumber + '?mode=playlist', starting at
 * page 1 and going up to maxpage; the walk stops early at the first page that
 * contains no '.dmpi_video_item' element. An exception while handling a page
 * is reported through env.error() and the walk continues with the next page.
 *
 * @param catUrl listing url; the page number is appended directly to it
 */
function grabCategory(catUrl) {
    env.info('Category: ' + catUrl);
    for (var no = 1; no <= maxpage; no++) {
        try {
            var conn = env.newJsoup().connect(catUrl + no + '?mode=playlist').userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101');
            var doc = conn.timeout(60000).get();
            var items = doc.select('.dmpi_video_item');
            // An empty page means we have run past the last page of this listing.
            if (items.size() == 0) break;
            for (var i = 0; i < items.size(); i++) {
                var item = items.get(i);
                var child = item.select('.dmpi_video_title a').first();
                // Items without a title link cannot be queued; skip them.
                if (child == null) continue;
                var title = child.text().trim();
                // Resolve the (possibly relative) href against the site root.
                var url = env.newString(env.newURL(env.newURL('http://www.dailymotion.com'), child.attr('href')) + '');
                child = item.select('.dmpi_video_preview a img').first();
                var image = env.newString('');
                if (child != null) {
                    var tmp = env.newString(child.attr('data-spr'));
                    // Swap the sprite file name in 'data-spr' for the medium preview file name.
                    tmp = tmp.replaceAll('jpeg_preview_sprite.jpg', 'jpeg_preview_medium.jpg');
                    image = env.newString(env.newURL(env.newURL('http://www.dailymotion.com'), tmp) + '');
                }
                markVideo(title, url, image);
            }
        } catch (e) {
            env.error(e);
        }
    }
}

/**
 * Adds a video to the 'Queue_DailyMotion' queue with crawled = 'false',
 * unless an entry with the same url is already queued.
 *
 * @param title video title
 * @param url   video page url
 * @param image preview image url
 */
function markVideo(title, url, image) {
    if (findQueueByUrl(url)) {
        return;
    }

    var queued = env.newEntity();
    queued.setSchema('s|url|s|title|s|image|s|crawled');
    queued.setKind('Queue_DailyMotion');
    queued.setId(env.uniqid());
    queued.setString('url', url);
    queued.setString('title', title);
    queued.setString('image', image);
    // New entries start out uncrawled so loadQueue() picks them up.
    queued.setString('crawled', 'false');
    queued.save();
}

/**
 * Tells whether a 'Queue_DailyMotion' entry with the given url already exists.
 *
 * @param url url to look up via a term query on the 'url' field
 * @returns true when the count for that query is greater than zero
 */
function findQueueByUrl(url) {
    var finder = env.newEntity();
    var urlTerm = finder.newTerm('url', url);
    // The third argument 1 is passed through to count() unchanged.
    var matches = finder.count('Queue_DailyMotion', finder.newTermQuery(urlTerm), 1);
    return matches > 0;
}

/**
 * Searches 'Queue_DailyMotion' for entries whose 'crawled' field is 'false',
 * passing 10 as the third argument to search().
 *
 * @returns whatever entity.search() returns for that query
 */
function loadQueue() {
    var finder = env.newEntity();
    var uncrawled = finder.newTermQuery(finder.newTerm('crawled', 'false'));
    return finder.search('Queue_DailyMotion', uncrawled, 10);
}
    

  Protected by Copyscape Online Copyright Protection

No comments:

Post a Comment