Grab video from YouTube
  
Grab video from YouTube
  - Create a JavaScript sandbox with jsoup support
- Add Lucene support to the JavaScript sandbox
- Create the script as follows
    javascript
    
    
// Sandbox environment object. main() assigns it and every helper function in
// this script reads it (logging, entity storage, jsoup connections, URLs).
var env;
// Arguments passed to the script. main() assigns it; nothing in this script
// reads it afterwards.
var args;
/**
 * Entry point of the crawler script.
 *
 * Stores the sandbox environment and the script arguments in the script-level
 * `env` and `args` variables, then loops without end:
 *   1. drain the crawl queue, grabbing every queued video and flagging it as
 *      crawled, re-querying the queue until it comes back empty;
 *   2. re-scan each YouTube category page to discover new videos.
 *
 * @param penv  sandbox environment object used by all helpers
 * @param pargs script arguments; only stored here
 */
function main(penv, pargs) {
    env = penv;
    args = pargs;
    env.info('Starting');

    // Category pages visited, in this order, after each queue drain.
    var categoryUrls = [
        'http://www.youtube.com/autos',
        'http://www.youtube.com/comedy',
        'http://www.youtube.com/entertainment',
        'http://www.youtube.com/film',
        'http://www.youtube.com/gaming',
        'http://www.youtube.com/howto',
        'http://www.youtube.com/activism',
        'http://www.youtube.com/people',
        'http://www.youtube.com/pets',
        'http://www.youtube.com/science',
        'http://www.youtube.com/videos?c=17',
        'http://www.youtube.com/travel'
    ];

    for (;;) {
        var pending = loadQueue();
        env.info('Size: ' + pending.size());

        // Keep pulling batches until the queue reports no uncrawled entries.
        while (pending.size() > 0) {
            for (var idx = 0; idx < pending.size(); idx++) {
                var entry = pending.get(idx);
                grabVideo(entry);
                // The entry is flagged as crawled whether or not grabVideo
                // logged an error, so it is never picked up a second time.
                entry.setString('crawled', 'true');
                entry.save();
            }
            pending = loadQueue();
        }

        for (var c = 0; c < categoryUrls.length; c++) {
            grabCategory(categoryUrls[c]);
        }
    }

    // The loop above contains no break/return, so this is not reached as written.
    env.info('Ending');
}
/**
 * Crawls one queued YouTube watch page.
 *
 * Reads `url`, `title` and `image` from the queue entity, downloads the page,
 * extracts its description (`#eow-description`, falling back to `#ded`, else
 * an empty string) and stores the video through saveLink(). It then looks for
 * the inline script data between `var rvl =` and `var cml =` and queues every
 * entry of that list through markVideo().
 *
 * Each list entry is read as: `k` (appended to the watch URL), `t` (title) and
 * `i` (image path, resolved against http://www.youtube.com). Entries that are
 * null or have no `k` are skipped so they neither abort the remaining entries
 * nor get queued as a bogus "watch?v=undefined" URL.
 *
 * Any exception is reported through env.error() and not rethrown.
 *
 * @param queue queue entity exposing getString('url' | 'title' | 'image')
 */
function grabVideo(queue) {
    try {
        var url = queue.getString('url');
        var title = queue.getString('title');
        var image = queue.getString('image');
        var conn = env.newJsoup().connect(url).userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101');
        var doc = conn.timeout(60000).get();

        // Description: primary selector first, then the fallback selector.
        var desc = env.newString('');
        var node = doc.select('#eow-description').first();
        if (node == null) {
            node = doc.select('#ded').first();
        }
        if (node != null) {
            desc = node.text();
        }
        saveLink(title, url, desc, image, '');

        // The related-video list is embedded in the page as a script literal.
        var html = doc.html();
        var start = html.indexOf('var rvl =');
        var end = html.indexOf('var cml =');
        if (start < 0 || end < 0 || start >= end) return;
        var snippet = html.substring(start + 9, end); // 9 == 'var rvl ='.length

        var related = null;
        // SECURITY(review): this evaluates text taken from the downloaded page,
        // so the remote page can run arbitrary script with access to `env`.
        // Kept because the embedded literal may not be strict JSON; replace it
        // with a real parser if the data format allows.
        eval('related = ' + snippet);
        if (related == null) return;

        for (var i = 0; i < related.length; i++) {
            var item = related[i];
            // Skip malformed entries instead of failing or queueing junk.
            if (item == null || item.k == null) continue;
            var url2 = env.newString('http://www.youtube.com/watch?v=' + item.k);
            var title2 = item.t;
            var image2 = env.newString(env.newURL(env.newURL('http://www.youtube.com'), item.i));
            markVideo(title2, url2, image2);
            env.info('Video: ' + title2 + ' | ' + url2 + ' | ' + image2);
        }
    } catch (e) {
        env.error(e);
    }
}
/**
 * Stores a page as a `Link` entity unless one with the same URL already exists.
 *
 * The URL is cut at the last `&feature=` occurrence before the duplicate check
 * and before it is stored. The entity always gets `fixed` = 'true' and
 * `score` = 100; `site` is set to the URL's host when the URL can be parsed,
 * otherwise the parse error is logged and the entity is saved without it.
 *
 * @param title page title
 * @param url   page URL (converted with env.newString before use)
 * @param desc  page description
 * @param image thumbnail URL
 * @param price price text, stored as given
 */
function saveLink(title, url, desc, image, price) {
    var cleanUrl = env.newString(url);
    var cut = cleanUrl.lastIndexOf('&feature=');
    if (cut >= 0) {
        cleanUrl = cleanUrl.substring(0, cut);
    }
    if (findLinkByUrl(cleanUrl)) return;

    var link = env.newEntity();
    link.setSchema('s|url|a|title|a|desc|s|fixed|d|score|s|site|s|image|s|price');
    link.setKind('Link');
    link.setId(env.uniqid());

    // String fields written before the score, in this order.
    var leading = [
        ['url', cleanUrl],
        ['title', title],
        ['desc', desc],
        ['fixed', 'true']
    ];
    for (var n = 0; n < leading.length; n++) {
        link.setString(leading[n][0], leading[n][1]);
    }
    link.setDouble('score', 100);
    link.setString('image', image);
    link.setString('price', price);

    try {
        // Host name of the page; an unparsable URL is logged, not fatal.
        link.setString('site', env.newURL(cleanUrl).getHost());
    } catch (e) {
        env.error(e);
    }

    link.save();
    env.info(title + ' | ' + cleanUrl);
}
/**
 * Reports whether a `Link` entity with exactly this `url` term exists.
 *
 * @param url URL to look up
 * @returns true when the count for the term query is greater than zero
 */
function findLinkByUrl(url) {
    var lookup = env.newEntity();
    var byUrl = lookup.newTermQuery(lookup.newTerm('url', url));
    // The last argument (1) is presumably a cap on counted hits, enough for
    // an existence check; confirm against the entity API.
    return lookup.count('Link', byUrl, 1) > 0;
}
/**
 * Scans one YouTube category page and queues every video tile found on it.
 *
 * For each `.browse-item` element the title and link come from
 * `.browse-item-content h3 a` (tiles without that link are skipped) and the
 * thumbnail from the `data-thumb` attribute of `.yt-thumb-clip-inner img`
 * (empty string when there is no such image). Link and thumbnail are resolved
 * against http://www.youtube.com and handed to markVideo().
 *
 * Any exception is reported through env.error() and not rethrown.
 *
 * @param catUrl URL of the category page to download
 */
function grabCategory(catUrl) {
    try {
        var page = env.newJsoup()
            .connect(catUrl)
            .userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101')
            .timeout(60000)
            .get();
        var tiles = page.select('.browse-item');

        for (var n = 0; n < tiles.size(); n++) {
            var tile = tiles.get(n);
            var link = tile.select('.browse-item-content h3 a').first();
            if (link != null) {
                var title = link.text().trim();
                var linkBase = env.newURL('http://www.youtube.com');
                // `+ ''` turns the resolved URL object into a string.
                var url = env.newString(env.newURL(linkBase, link.attr('href')) + '');

                var thumb = tile.select('.yt-thumb-clip-inner img').first();
                var image = env.newString('');
                if (thumb != null) {
                    var thumbBase = env.newURL('http://www.youtube.com');
                    image = env.newString(env.newURL(thumbBase, thumb.attr('data-thumb')) + '');
                }

                markVideo(title, url, image);
            }
        }
    } catch (e) {
        env.error(e);
    }
}
/**
 * Adds a video to the `Queue_YouTube` crawl queue unless its URL is already
 * queued. New entries are stored with `crawled` = 'false'.
 *
 * @param title video title
 * @param url   watch-page URL; also the key used for the duplicate check
 * @param image thumbnail URL
 */
function markVideo(title, url, image) {
    if (!findQueueByUrl(url)) {
        var entry = env.newEntity();
        entry.setSchema('s|url|s|title|s|image|s|crawled');
        entry.setKind('Queue_YouTube');
        entry.setId(env.uniqid());

        var fields = [
            ['url', url],
            ['title', title],
            ['image', image],
            ['crawled', 'false']
        ];
        for (var n = 0; n < fields.length; n++) {
            entry.setString(fields[n][0], fields[n][1]);
        }
        entry.save();
    }
}
/**
 * Reports whether a `Queue_YouTube` entity with exactly this `url` term exists.
 *
 * @param url URL to look up
 * @returns true when the count for the term query is greater than zero
 */
function findQueueByUrl(url) {
    var lookup = env.newEntity();
    var byUrl = lookup.newTermQuery(lookup.newTerm('url', url));
    // The last argument (1) is presumably a cap on counted hits, enough for
    // an existence check; confirm against the entity API.
    return lookup.count('Queue_YouTube', byUrl, 1) > 0;
}
/**
 * Fetches the next batch of `Queue_YouTube` entities whose `crawled` term is
 * 'false'.
 *
 * @returns whatever entity.search() returns; main() uses it through
 *          size() and get(i)
 */
function loadQueue() {
    var store = env.newEntity();
    var uncrawled = store.newTermQuery(store.newTerm('crawled', 'false'));
    // The last argument (10) is presumably the batch size limit; confirm
    // against the entity API.
    return store.search('Queue_YouTube', uncrawled, 10);
}
    
   
 

 
No comments:
Post a Comment