Wednesday, 6 June 2012

Grab video from YouTube

Grab video from YouTube
This task uses a JavaScript sandbox with jsoup and Lucene support to grab videos from YouTube.
Grab video from YouTube
  1. Create javascript sandbox with jsoup support
  2. Add Lucene support to javascript sandbox
  3. Create the JavaScript as follows
javascript
1var env;
2var args;
3
4function main(penv, pargs) {
5 env = penv;
6 args = pargs;
7 env.info('Starting');
8 while (true) {
9 var queue_list = loadQueue();
10 env.info('Size: ' + queue_list.size());
11 while (queue_list.size() > 0) {
12 for (var i = 0; i < queue_list.size(); i++) {
13 var queue = queue_list.get(i);
14 grabVideo(queue);
15 queue.setString('crawled', 'true');
16 queue.save();
17 }
18 queue_list = loadQueue();
19 }
20 grabCategory('http://www.youtube.com/autos');
21 grabCategory('http://www.youtube.com/comedy');
22 grabCategory('http://www.youtube.com/entertainment');
23 grabCategory('http://www.youtube.com/film');
24 grabCategory('http://www.youtube.com/gaming');
25 grabCategory('http://www.youtube.com/howto');
26 grabCategory('http://www.youtube.com/activism');
27 grabCategory('http://www.youtube.com/people');
28 grabCategory('http://www.youtube.com/pets');
29 grabCategory('http://www.youtube.com/science');
30 grabCategory('http://www.youtube.com/videos?c=17');
31 grabCategory('http://www.youtube.com/travel');
32 }
33 env.info('Ending');
34}
35
36function grabVideo(queue) {
37 try {
38 var url = queue.getString('url');
39 var title = queue.getString('title');
40 var image = queue.getString('image');
41 var conn = env.newJsoup().connect(url).userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101');
42 var doc = conn.timeout(60000).get();
43 var child = doc.select('#eow-description').first();
44 var desc = env.newString('');
45 if (child != null) {
46 desc = child.text();
47 } else {
48 child = doc.select('#ded').first();
49 if (child != null) {
50 desc = child.text();
51 }
52 }
53 saveLink(title, url, desc, image, '');
54
55 var html = doc.html();
56 var pos1 = html.indexOf('var rvl =');
57 var pos2 = html.indexOf('var cml =');
58 if (pos1 < 0 || pos2 < 0 || pos1 >= pos2) return;
59 var js1 = html.substring(pos1 + 9, pos2);
60 var obj1 = null;
61 eval('obj1 = ' + js1);
62 if (obj1 == null) return;
63 for (var i = 0; i < obj1.length; i++) {
64 var item = obj1[i];
65 var url2 = env.newString('http://www.youtube.com/watch?v=' + item.k);
66 var title2 = item.t;
67 var image2 = env.newString(env.newURL(env.newURL('http://www.youtube.com'), item.i));
68 markVideo(title2, url2, image2);
69 env.info('Video: ' + title2 + ' | ' + url2 + ' | ' + image2);
70 }
71 } catch (e) {
72 env.error(e);
73 }
74}
75
76function saveLink(title, url, desc, image, price) {
77 url = env.newString(url);
78 var pos = url.lastIndexOf('&feature=');
79 if (pos >= 0) {
80 url = url.substring(0, pos);
81 }
82 if (findLinkByUrl(url)) return;
83 var schema = 's|url|a|title|a|desc|s|fixed|d|score|s|site|s|image|s|price';
84 var entity = env.newEntity();
85 entity.setSchema(schema);
86 entity.setKind('Link');
87 entity.setId(env.uniqid());
88 entity.setString('url', url);
89 entity.setString('title', title);
90 entity.setString('desc', desc);
91 entity.setString('fixed', 'true');
92 entity.setDouble('score', 100);
93 entity.setString('image', image);
94 entity.setString('price', price);
95 try {
96 var t_url = env.newURL(url);
97 var t_host = t_url.getHost();
98 entity.setString('site', t_host);
99 } catch (e) {
100 env.error(e);
101 }
102 entity.save();
103 env.info(title + ' | ' + url);
104}
105
106function findLinkByUrl(url) {
107 var entity = env.newEntity();
108 var query = entity.newTermQuery(entity.newTerm('url', url));
109 var size = entity.count('Link', query, 1);
110 return (size > 0);
111}
112
113function grabCategory(catUrl) {
114 try {
115 var conn = env.newJsoup().connect(catUrl).userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101');
116 var doc = conn.timeout(60000).get();
117 var items = doc.select('.browse-item');
118 for (var i = 0; i < items.size(); i++) {
119 var item = items.get(i);
120 var child = item.select('.browse-item-content h3 a').first();
121 if (child == null) continue;
122 var title = child.text().trim();
123 var url = env.newString(env.newURL(env.newURL('http://www.youtube.com'), child.attr('href')) + '');
124 child = item.select('.yt-thumb-clip-inner img').first();
125 var image = env.newString('');
126 if (child != null) {
127 image = env.newString(env.newURL(env.newURL('http://www.youtube.com'), child.attr('data-thumb')) + '');
128 }
129 markVideo(title, url, image);
130 }
131 } catch (e) {
132 env.error(e);
133 }
134}
135
136function markVideo(title, url, image) {
137 if (findQueueByUrl(url)) return;
138 var schema = 's|url|s|title|s|image|s|crawled';
139 var entity = env.newEntity();
140 entity.setSchema(schema);
141 entity.setKind('Queue_YouTube');
142 entity.setId(env.uniqid());
143 entity.setString('url', url);
144 entity.setString('title', title);
145 entity.setString('image', image);
146 entity.setString('crawled', 'false');
147 entity.save();
148}
149
150function findQueueByUrl(url) {
151 var entity = env.newEntity();
152 var query = entity.newTermQuery(entity.newTerm('url', url));
153 var size = entity.count('Queue_YouTube', query, 1);
154 return (size > 0);
155}
156
157function loadQueue() {
158 var entity = env.newEntity();
159 var tag = entity.search('Queue_YouTube', entity.newTermQuery(entity.newTerm('crawled', 'false')), 10);
160 return tag;
161}
// Sandbox environment object; assigned from main()'s first argument and used by every
// function in this script for logging (env.info / env.error), jsoup connections,
// string/URL construction and entity storage.
var env;
// Second argument given to main(); stored here but not read by any function in this script.
var args;

/**
 * Crawler entry point.
 *
 * Stores the sandbox handles in the script-level `env` / `args` variables and then
 * loops forever: first the queue of uncrawled videos is drained batch by batch
 * (each entry is crawled, flagged as crawled and saved), then every category page
 * is scanned so new videos can be queued for the next pass.
 *
 * @param penv  sandbox environment object; becomes the global `env`
 * @param pargs arguments supplied by the sandbox; becomes the global `args`
 */
function main(penv, pargs) {
    env = penv;
    args = pargs;
    env.info('Starting');

    // Category pages visited after each full drain of the queue, in this order.
    var categoryUrls = [
        'http://www.youtube.com/autos',
        'http://www.youtube.com/comedy',
        'http://www.youtube.com/entertainment',
        'http://www.youtube.com/film',
        'http://www.youtube.com/gaming',
        'http://www.youtube.com/howto',
        'http://www.youtube.com/activism',
        'http://www.youtube.com/people',
        'http://www.youtube.com/pets',
        'http://www.youtube.com/science',
        'http://www.youtube.com/videos?c=17',
        'http://www.youtube.com/travel'
    ];

    for (;;) {
        var pending = loadQueue();
        env.info('Size: ' + pending.size());

        // Keep fetching batches until no uncrawled entry is left.
        while (pending.size() > 0) {
            for (var idx = 0; idx < pending.size(); idx++) {
                var entry = pending.get(idx);
                grabVideo(entry);
                // The entry is flagged even when grabVideo logged an error internally.
                entry.setString('crawled', 'true');
                entry.save();
            }
            pending = loadQueue();
        }

        for (var c = 0; c < categoryUrls.length; c++) {
            grabCategory(categoryUrls[c]);
        }
    }

    // Never reached: the loop above has no exit condition.
    env.info('Ending');
}

/**
 * Crawls one queued video page.
 *
 * Fetches the page named by the queue entry, stores it as a Link (with the text of
 * `#eow-description`, or `#ded` as a fallback, as description) and queues every
 * related video found in the page's inline `var rvl = ...` script data.
 * Any exception is logged through env.error and not rethrown.
 *
 * @param queue queue entity exposing getString('url' | 'title' | 'image')
 */
function grabVideo(queue) {
    try {
        var pageUrl = queue.getString('url');
        var pageTitle = queue.getString('title');
        var pageImage = queue.getString('image');

        var request = env.newJsoup().connect(pageUrl);
        request = request.userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101');
        var page = request.timeout(60000).get();

        // Description: primary selector first, '#ded' only when the primary is absent.
        var descNode = page.select('#eow-description').first();
        var description = env.newString('');
        if (descNode == null) {
            descNode = page.select('#ded').first();
        }
        if (descNode != null) {
            description = descNode.text();
        }
        saveLink(pageTitle, pageUrl, description, pageImage, '');

        // The related-video list is the script text between these two declarations.
        var startMarker = 'var rvl =';
        var endMarker = 'var cml =';
        var markup = page.html();
        var startAt = markup.indexOf(startMarker);
        var endAt = markup.indexOf(endMarker);
        if (!(startAt >= 0 && endAt >= 0 && startAt < endAt)) {
            return;
        }
        var relatedSource = markup.substring(startAt + startMarker.length, endAt);

        // NOTE(review): eval() executes text scraped from a remote page inside the sandbox.
        // Kept as-is because the payload is a script literal that may not be strict JSON;
        // confirm the sandbox limits what such code can reach, or switch to a real parser.
        var obj1 = null;
        eval('obj1 = ' + relatedSource);
        if (obj1 == null) {
            return;
        }

        for (var n = 0; n < obj1.length; n++) {
            var related = obj1[n];
            var relatedUrl = env.newString('http://www.youtube.com/watch?v=' + related.k);
            var relatedTitle = related.t;
            var siteRoot = env.newURL('http://www.youtube.com');
            var relatedImage = env.newString(env.newURL(siteRoot, related.i));
            markVideo(relatedTitle, relatedUrl, relatedImage);
            env.info('Video: ' + relatedTitle + ' | ' + relatedUrl + ' | ' + relatedImage);
        }
    } catch (e) {
        env.error(e);
    }
}

/**
 * Stores a page as a 'Link' entity unless one with the same URL already exists.
 *
 * The URL is cut at the last '&feature=' (when present) before the duplicate check
 * and before it is stored. The 'site' field is filled with the URL's host; if the
 * host cannot be determined the error is logged and the entity is saved without it.
 *
 * @param title link title
 * @param url   page URL
 * @param desc  description text
 * @param image thumbnail URL
 * @param price price text
 */
function saveLink(title, url, desc, image, price) {
    var link = env.newString(url);
    var featureAt = link.lastIndexOf('&feature=');
    if (featureAt >= 0) {
        link = link.substring(0, featureAt);
    }

    if (findLinkByUrl(link)) {
        return;
    }

    var record = env.newEntity();
    record.setSchema('s|url|a|title|a|desc|s|fixed|d|score|s|site|s|image|s|price');
    record.setKind('Link');
    record.setId(env.uniqid());
    record.setString('url', link);
    record.setString('title', title);
    record.setString('desc', desc);
    record.setString('fixed', 'true');
    record.setDouble('score', 100);
    record.setString('image', image);
    record.setString('price', price);

    // Best effort: a URL whose host cannot be read still gets saved.
    try {
        record.setString('site', env.newURL(link).getHost());
    } catch (e) {
        env.error(e);
    }

    record.save();
    env.info(title + ' | ' + link);
}

/**
 * Tells whether a 'Link' entity with the given URL is already stored.
 *
 * @param url URL to look up in the 'url' field
 * @returns true when the count query reports at least one match
 */
function findLinkByUrl(url) {
    var store = env.newEntity();
    var urlTerm = store.newTerm('url', url);
    var matches = store.count('Link', store.newTermQuery(urlTerm), 1);
    return matches > 0;
}

/**
 * Scans one category page and queues every video listed on it.
 *
 * For each '.browse-item' the title link and thumbnail are resolved against
 * http://www.youtube.com and handed to markVideo(). Any exception is logged
 * through env.error and not rethrown.
 *
 * Differences from a naive attribute read: jsoup's attr() yields an empty string
 * for a missing attribute, and resolving '' against the base would produce the
 * bare site root. Therefore
 *   - entries whose link has no href are skipped instead of queueing the site root;
 *   - the thumbnail falls back from 'data-thumb' to 'src', and stays empty when
 *     neither attribute is present.
 *
 * @param catUrl absolute URL of the category page to scan
 */
function grabCategory(catUrl) {
    try {
        var conn = env.newJsoup().connect(catUrl).userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101');
        var doc = conn.timeout(60000).get();
        // Base for resolving relative links; built once instead of per attribute.
        var base = env.newURL('http://www.youtube.com');
        var items = doc.select('.browse-item');
        for (var i = 0; i < items.size(); i++) {
            var item = items.get(i);
            var link = item.select('.browse-item-content h3 a').first();
            if (link == null) continue;

            var href = link.attr('href');
            // "+ ''" turns the sandbox string into a script string before comparing.
            if ((href + '') === '') continue;

            var title = link.text().trim();
            var url = env.newString(env.newURL(base, href) + '');

            var image = env.newString('');
            var thumb = item.select('.yt-thumb-clip-inner img').first();
            if (thumb != null) {
                // Prefer 'data-thumb'; presumably 'src' is only a placeholder when
                // 'data-thumb' is set - verify against the page markup.
                var thumbSrc = thumb.attr('data-thumb');
                if ((thumbSrc + '') === '') {
                    thumbSrc = thumb.attr('src');
                }
                if ((thumbSrc + '') !== '') {
                    image = env.newString(env.newURL(base, thumbSrc) + '');
                }
            }
            markVideo(title, url, image);
        }
    } catch (e) {
        env.error(e);
    }
}

/**
 * Adds a video to the 'Queue_YouTube' crawl queue unless its URL is already queued.
 *
 * New entries are stored with 'crawled' set to 'false' so loadQueue() picks them up.
 *
 * @param title video title
 * @param url   video page URL (used as-is for the duplicate check)
 * @param image thumbnail URL
 */
function markVideo(title, url, image) {
    if (!findQueueByUrl(url)) {
        var pending = env.newEntity();
        pending.setSchema('s|url|s|title|s|image|s|crawled');
        pending.setKind('Queue_YouTube');
        pending.setId(env.uniqid());
        pending.setString('url', url);
        pending.setString('title', title);
        pending.setString('image', image);
        pending.setString('crawled', 'false');
        pending.save();
    }
}

/**
 * Tells whether the 'Queue_YouTube' queue already holds an entry for the given URL.
 *
 * @param url URL to look up in the 'url' field
 * @returns true when the count query reports at least one match
 */
function findQueueByUrl(url) {
    var store = env.newEntity();
    var byUrl = store.newTermQuery(store.newTerm('url', url));
    return store.count('Queue_YouTube', byUrl, 1) > 0;
}

/**
 * Fetches the next batch of uncrawled queue entries.
 *
 * @returns the result of searching 'Queue_YouTube' for entries whose 'crawled'
 *          field is 'false', with 10 passed as the search limit
 */
function loadQueue() {
    var store = env.newEntity();
    var notCrawled = store.newTerm('crawled', 'false');
    var pendingQuery = store.newTermQuery(notCrawled);
    return store.search('Queue_YouTube', pendingQuery, 10);
}

  Protected by Copyscape Online Copyright Protection

No comments:

Post a Comment