Wednesday 25 July 2012

Grab search results from Yahoo

Grab search results from Yahoo
This task uses a JavaScript sandbox with jsoup support to grab search results from Yahoo.
Grab search results from Yahoo
  1. Create javascript sandbox with jsoup support
  2. Create the following JavaScript
javascript
// Sandbox environment object; assigned from main()'s p_env argument and used by run() and grab().
var g_env;

/**
 * Sandbox entry point.
 * Stores the supplied environment in the global g_env, then performs the
 * crawl between a 'Starting' and an 'Ending' info message.
 *
 * @param p_env  sandbox environment object (used here for logging)
 * @param p_args arguments handed over by the sandbox; not used by this script
 */
function main(p_env, p_args) {
  g_env = p_env;
  p_env.info('Starting');
  run();
  p_env.info('Ending');
}

/**
 * Logs the hits of result pages 1..10 for the fixed query 'lucene'.
 * Each hit becomes one info line "<no> | <title> | <link>", where <no> is
 * derived from the page number and the position on the page (10 per page).
 * Any exception is reported through g_env.error.
 */
function run() {
  try {
    var query = 'lucene';
    var pn = 1;
    while (pn <= 10) {
      var hits = grab(query, pn);
      for (var idx = 0; idx < hits.size(); idx++) {
        var hit = hits.get(idx);
        var title = hit.get('title');
        var link = hit.get('link');
        var no = (pn - 1) * 10 + idx + 1;
        g_env.info(no + ' | ' + title + ' | ' + link);
      }
      pn++;
    }
  } catch (err) {
    g_env.error(err);
  }
}

/**
 * Fetches one Yahoo result page and collects the title and link of each hit.
 *
 * @param query  search terms; URL-encoded as UTF-8 before being put in the request
 * @param pageno 1-based page number, mapped to the 'b' offset (1, 11, 21, ...)
 * @return list created by g_env.newArrayList() holding one map per hit with
 *         the keys 'title' and 'link'; if fetching or parsing fails, the error
 *         is reported through g_env.error and the hits gathered so far are returned
 */
function grab(query, pageno) {
  var tag = g_env.newArrayList();
  try {
    var url = 'http://search.yahoo.com/search?p=' + g_env.encodeURL(query, 'UTF-8') + '&pstart=1&b=' + ((pageno - 1) * 10 + 1);
    var conn = g_env.newJsoup().connect(url);
    conn.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1');
    conn.timeout(60000);
    var doc = conn.get();
    var nodes = doc.select('#web .res');
    for (var i = 0; i < nodes.size(); i++) {
      var node = nodes.get(i);
      var child = node.select('a.yschttl').first();
      // first() yields null when a result container has no title anchor;
      // skip it so that one odd entry does not abort the rest of the page.
      if (child == null) continue;
      var title = child.text();
      var link = child.attr('href');
      // When the href contains '**', keep only what follows it and URL-decode
      // that part (presumably a redirect wrapper around the real target).
      var pos = link.indexOf('**');
      if (pos >= 0) {
        link = link.substring(pos + 2);
        link = g_env.decodeURL(link, 'UTF-8');
      }
      var it = g_env.newHashMap();
      it.put('title', title);
      it.put('link', link);
      tag.add(it);
    }
  } catch (e) {
    g_env.error(e);
  }
  return tag;
}
    

  Protected by Copyscape Online Copyright Protection

Grab search results from Bing

Grab search results from Bing
This task uses a JavaScript sandbox with jsoup support to grab search results from Bing.
Grab search results from Bing
  1. Create javascript sandbox with jsoup support
  2. Create the following JavaScript
javascript
// Sandbox environment object; assigned from main()'s p_env argument and used by run() and grab().
var g_env;

/**
 * Entry point called by the sandbox.
 * Publishes the environment as the global g_env and wraps the crawl in
 * 'Starting' / 'Ending' info messages.
 *
 * @param p_env  sandbox environment object (used here for logging)
 * @param p_args sandbox-supplied arguments; not used by this script
 */
function main(p_env, p_args) {
  g_env = p_env;
  p_env.info('Starting');
  run();
  p_env.info('Ending');
}

/**
 * Walks result pages 1..10 for the fixed query 'lucene' and writes one info
 * line per hit in the form "<no> | <title> | <link>"; <no> is computed from
 * the page number and the hit's position on the page (10 per page).
 * Exceptions are handed to g_env.error.
 */
function run() {
  try {
    var query = 'lucene';
    for (var page = 1; page <= 10; page++) {
      var found = grab(query, page);
      for (var k = 0; k < found.size(); k++) {
        var entry = found.get(k);
        var title = entry.get('title');
        var link = entry.get('link');
        var no = (page - 1) * 10 + k + 1;
        g_env.info(no + ' | ' + title + ' | ' + link);
      }
    }
  } catch (problem) {
    g_env.error(problem);
  }
}

/**
 * Fetches one Bing result page and collects the title and link of each hit.
 *
 * @param query  search terms; URL-encoded as UTF-8 before being put in the request
 * @param pageno 1-based page number, mapped to the 'first' offset (1, 11, 21, ...)
 * @return list created by g_env.newArrayList() holding one map per hit with
 *         the keys 'title' and 'link'; if fetching or parsing fails, the error
 *         is reported through g_env.error and the hits gathered so far are returned
 */
function grab(query, pageno) {
  var tag = g_env.newArrayList();
  try {
    var url = 'http://www.bing.com/search?q=' + g_env.encodeURL(query, 'UTF-8') + '&first=' + ((pageno - 1) * 10 + 1);
    var conn = g_env.newJsoup().connect(url);
    conn.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1');
    conn.timeout(60000);
    var doc = conn.get();
    var nodes = doc.select('#results .sa_wr');
    for (var i = 0; i < nodes.size(); i++) {
      var node = nodes.get(i);
      var child = node.select('.sa_cc .sa_mc .sb_tlst a').first();
      // first() yields null when a result container has no title anchor;
      // skip it so that one odd entry does not abort the rest of the page.
      if (child == null) continue;
      var title = child.text();
      var link = child.attr('href');
      var it = g_env.newHashMap();
      it.put('title', title);
      it.put('link', link);
      tag.add(it);
    }
  } catch (e) {
    g_env.error(e);
  }
  return tag;
}
    

  Protected by Copyscape Online Copyright Protection

Tuesday 24 July 2012

Grab search results from Google

Grab search results from Google
This task uses a JavaScript sandbox with jsoup support to grab search results from Google.
Grab search results from Google
  1. Create javascript sandbox with jsoup support
  2. Create the following JavaScript
javascript
// Sandbox environment object; assigned from main()'s p_env argument and used by run() and grab().
var g_env;

/**
 * Script entry point invoked by the sandbox.
 * Saves the environment into the global g_env and runs the crawl, logging
 * 'Starting' before and 'Ending' after it.
 *
 * @param p_env  sandbox environment object (used here for logging)
 * @param p_args sandbox-supplied arguments; not used by this script
 */
function main(p_env, p_args) {
  g_env = p_env;
  p_env.info('Starting');
  run();
  p_env.info('Ending');
}

/**
 * Requests result pages 1 through 10 for the fixed query 'lucene' and logs
 * every hit as "<no> | <title> | <link>", with <no> computed from the page
 * number and the hit's index on that page (10 per page).
 * Exceptions are passed to g_env.error.
 */
function run() {
  try {
    var query = 'lucene';
    var pn;
    for (pn = 1; pn <= 10; pn += 1) {
      var pageHits = grab(query, pn);
      var pos = 0;
      while (pos < pageHits.size()) {
        var row = pageHits.get(pos);
        var title = row.get('title');
        var link = row.get('link');
        var no = (pn - 1) * 10 + pos + 1;
        g_env.info(no + ' | ' + title + ' | ' + link);
        pos += 1;
      }
    }
  } catch (failure) {
    g_env.error(failure);
  }
}

/**
 * Fetches one Google result page and collects the title and link of each hit.
 *
 * @param query  search terms; URL-encoded as UTF-8 before being put in the request
 * @param pageno 1-based page number, mapped to the 0-based 'start' offset (0, 10, 20, ...)
 * @return list created by g_env.newArrayList() holding one map per hit with
 *         the keys 'title' and 'link'; if fetching or parsing fails, the error
 *         is reported through g_env.error and the hits gathered so far are returned
 */
function grab(query, pageno) {
  var tag = g_env.newArrayList();
  try {
    var url = 'http://google.com/search?q=' + g_env.encodeURL(query, 'UTF-8') + '&start=' + ((pageno - 1) * 10);
    var conn = g_env.newJsoup().connect(url);
    conn.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1');
    conn.timeout(60000);
    var doc = conn.get();
    var nodes = doc.select('#rso .g');
    for (var i = 0; i < nodes.size(); i++) {
      var node = nodes.get(i);
      // Use the first matching title link, as the other grabbers do, and skip
      // result blocks that have none instead of recording a blank entry.
      var child = node.select('.vsc .r .l').first();
      if (child == null) continue;
      var title = child.text();
      var link = child.attr('href');
      var it = g_env.newHashMap();
      it.put('title', title);
      it.put('link', link);
      tag.add(it);
    }
  } catch (e) {
    g_env.error(e);
  }
  return tag;
}
    

  Protected by Copyscape Online Copyright Protection

Saturday 21 July 2012

Grab article from ScienceDirect

Grab article from ScienceDirect
This task uses a JavaScript sandbox with jsoup and Lucene support to grab articles from ScienceDirect.
Grab article from ScienceDirect
  1. Create javascript sandbox with jsoup support
  2. Add Lucene support to javascript sandbox
  3. Create the following JavaScript
javascript
// When non-empty, run() crawls only this one title. Despite the name, the value
// is handed to grabTitle(), which looks the title up by its link.
var g_title = '';
// When false, run() calls clearCache() before crawling.
var g_cache = true;
// Prefix of the '<site>_Title' entity kind used by the title queries.
var g_site = 'sciencedirect.com';
// Sandbox environment object; assigned in main() from p_env.
var g_env;
// Cookies captured by loadUrlCookieStart() and sent again by loadUrlCookie().
var g_cookie;

/**
 * Sandbox entry point: stores the environment in the global g_env and starts
 * the crawl via run().
 *
 * @param p_env  sandbox environment object
 * @param p_args accepted but not used by this function
 */
function main(p_env, p_args) {
  g_env = p_env;
  run();
}

// Returns a new entity obtained from g_env.newEntity().
function newEntity() {
  return g_env.newEntity();
}

/**
 * Loads a page and remembers the cookies of that request in the global
 * g_cookie so that loadUrlCookie() can send them again.
 *
 * @param url address to fetch (60 s timeout, Firefox user agent)
 * @return the document returned by the connection's get()
 */
function loadUrlCookieStart(url) {
  var agent = 'Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1';
  var session = g_env.newJsoup().connect(url);
  session.userAgent(agent);
  session.timeout(60000);
  var page = session.get();
  g_cookie = session.getCookies();
  return page;
}

/**
 * Loads a page while sending the cookies currently stored in g_cookie.
 *
 * @param url address to fetch (60 s timeout, Firefox user agent)
 * @return the document returned by the connection's get()
 */
function loadUrlCookie(url) {
  var agent = 'Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1';
  var session = g_env.newJsoup().connect(url);
  session.userAgent(agent);
  session.timeout(60000);
  session.cookies(g_cookie);
  var page = session.get();
  return page;
}

/**
 * Loads a page without any cookie handling.
 *
 * @param url address to fetch (60 s timeout, Firefox user agent)
 * @return the document returned by the connection's get()
 */
function loadUrl(url) {
  var agent = 'Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1';
  var request = g_env.newJsoup().connect(url);
  request.userAgent(agent);
  request.timeout(60000);
  var page = request.get();
  return page;
}

/**
 * Drives the crawl between a 'Starting' and an 'Ending' info message.
 * With a non-empty g_title only that title is grabbed. Otherwise the cache is
 * cleared first when g_cache is false, and batches from loadTitleFresh() are
 * grabbed until a batch comes back empty.
 */
function run() {
  g_env.info('Starting');
  var singleTitle = g_env.newString(g_title).length() > 0;
  if (singleTitle) {
    grabTitle(g_title);
  } else {
    if (!g_cache) {
      clearCache();
    }
    for (var batch = loadTitleFresh(); batch.size() > 0; batch = loadTitleFresh()) {
      for (var k = 0; k < batch.size(); k++) {
        grabTitle(batch.get(k).getString('link'));
      }
    }
  }
  g_env.info('Ending');
}

/**
 * Crawls the title stored under the given link and marks it as 'crawled'.
 * The title's kind selects the grabber ('Book', 'Book Series' or 'Journal');
 * any other kind is only marked. Nothing happens when no title is found.
 *
 * @param link link of the title entity to process
 */
function grabTitle(link) {
  var et = findTitleByLink(link);
  if (et == null) return;
  var kind = et.getString('kind');
  // Loose equality is intentional: kind comes from the entity store and may
  // not be a primitive JavaScript string.
  var grabber = null;
  if (kind == 'Book') {
    grabber = grabBook;
  } else if (kind == 'Book Series') {
    grabber = grabBookSeries;
  } else if (kind == 'Journal') {
    grabber = grabJournal;
  }
  if (grabber != null) {
    grabber(et.getString('title'), et.getString('link'));
  }
  et.setMark('crawled');
  et.save();
}

/**
 * Crawls all articles of a journal.
 * First collects the volume/issue links from the journal page, then loads
 * each volume (starting a cookie session), and for every result row fetches
 * the article page to extract its abstract before calling saveArticle().
 * Errors are reported through g_env.error at the level where they occur, so
 * a failing article or volume does not stop the remaining ones.
 *
 * @param p_title journal title; appended to every saved article title
 * @param p_link  journal URL; also the base for resolving relative links
 */
function grabJournal(p_title, p_link) {
  try {
    var journalPage = loadUrl(p_link);
    var volLinks = g_env.newArrayList();
    var volTitles = g_env.newArrayList();
    var volAnchors = journalPage.select('#volumeIssueData .txtBold a');
    var n;
    for (n = 0; n < volAnchors.size(); n++) {
      var volAnchor = volAnchors.get(n);
      var volName = volAnchor.text();
      var volHref = volAnchor.attr('href');
      volLinks.add(g_env.newString(g_env.newURL(g_env.newURL(p_link), volHref) + ''));
      volTitles.add(volName);
    }
    for (n = 0; n < volLinks.size(); n++) {
      var titleV = volTitles.get(n);
      var linkV = volLinks.get(n);
      try {
        var listing = loadUrlCookieStart(linkV);
        var resultRows = listing.select('#bodyMainResults .resultRow');
        for (var r = 0; r < resultRows.size(); r++) {
          var entry = resultRows.get(r).select('.cLink').first();
          if (entry == null) continue;
          var artTitle = entry.text();
          var artHref = entry.attr('href');
          var artLink = g_env.newString(g_env.newURL(g_env.newURL(p_link), artHref) + '');
          var desc = '';
          try {
            var marker = loadUrlCookie(artLink).select('#section_abstract').first();
            if (marker != null) {
              desc = marker.parent().text();
              // Drop a leading 'Abstract' and then a leading 'Summary' heading.
              if (desc.indexOf('Abstract') == 0) {
                desc = desc.substring(8);
              }
              if (desc.indexOf('Summary') == 0) {
                desc = desc.substring(7);
              }
            }
          } catch (articleErr) {
            g_env.error(articleErr);
          }
          saveArticle(artTitle + ' | ' + titleV + ' | ' + p_title, artLink, desc);
        }
      } catch (volumeErr) {
        g_env.error(volumeErr);
      }
    }
  } catch (journalErr) {
    g_env.error(journalErr);
  }
}

/**
 * Crawls all chapters/articles of a book.
 * Loads the book page (starting a cookie session), and for every '.cLink'
 * entry of the result list fetches the linked page to extract its abstract
 * before calling saveArticle(). Errors are reported through g_env.error; a
 * failing abstract fetch still saves the entry with an empty description.
 *
 * @param p_title book title; appended to every saved article title
 * @param p_link  book URL; also the base for resolving relative links
 */
function grabBook(p_title, p_link) {
  try {
    var doc = loadUrlCookieStart(p_link);
    var rows = doc.select('.contentMain .nonSerialResultsList .cLink');
    // Declared locally: the original assigned 'child' without 'var', which
    // created an implicit global.
    var child;
    for (var j = 0; j < rows.size(); j++) {
      var row = rows.get(j);
      // NOTE(review): rows already are '.cLink' elements; this relies on
      // select() being able to match the element itself - confirm.
      child = row.select('.cLink').first();
      if (child == null) continue;
      var title = child.text();
      var link = child.attr('href');
      link = g_env.newString(g_env.newURL(g_env.newURL(p_link), link) + '');
      var desc = '';
      try {
        var cdoc = loadUrlCookie(link);
        child = cdoc.select('#section_abstract').first();
        if (child != null) {
          child = child.parent();
          desc = child.text();
          // Drop a leading 'Abstract' and then a leading 'Summary' heading.
          if (desc.indexOf('Abstract') == 0) {
            desc = desc.substring(8);
          }
          if (desc.indexOf('Summary') == 0) {
            desc = desc.substring(7);
          }
        }
      } catch (e) {
        g_env.error(e);
      }
      saveArticle(title + ' | ' + p_title, link, desc);
    }
  } catch (e) {
    g_env.error(e);
  }
}

/**
 * Crawls all articles of a book series.
 * Collects the volumes from the series page: a row with an anchor contributes
 * the anchor's resolved link, a row with only a span stands for the series
 * page itself (p_link), and rows with neither are ignored. Each volume is
 * then loaded (starting a cookie session) and every result row's article page
 * is fetched to extract its abstract before calling saveArticle(). Errors are
 * reported through g_env.error at the level where they occur.
 *
 * @param p_title series title; appended to every saved article title
 * @param p_link  series URL; also the base for resolving relative links
 */
function grabBookSeries(p_title, p_link) {
  try {
    var doc = loadUrl(p_link);
    var vols_link = g_env.newArrayList();
    var vols_title = g_env.newArrayList();
    var rows = doc.select('#volumeIssueData .txt');
    // Declared locally: the original assigned 'child' without 'var', which
    // created an implicit global.
    var child;
    for (var i = 0; i < rows.size(); i++) {
      var row = rows.get(i);
      child = row.select('a').first();
      var title = '';
      var link = '';
      if (child == null) {
        child = row.select('span').first();
        if (child == null) continue;
        title = child.text();
        link = p_link;
      } else {
        title = child.text();
        link = child.attr('href');
        link = g_env.newString(g_env.newURL(g_env.newURL(p_link), link) + '');
      }
      vols_link.add(link);
      vols_title.add(title);
    }
    for (var i = 0; i < vols_link.size(); i++) {
      var titleV = vols_title.get(i);
      var linkV = vols_link.get(i);
      try {
        doc = loadUrlCookieStart(linkV);
        rows = doc.select('#bodyMainResults .resultRow');
        for (var j = 0; j < rows.size(); j++) {
          var row = rows.get(j);
          child = row.select('.cLink').first();
          if (child == null) continue;
          var title = child.text();
          var link = child.attr('href');
          link = g_env.newString(g_env.newURL(g_env.newURL(p_link), link) + '');
          var desc = '';
          try {
            var cdoc = loadUrlCookie(link);
            child = cdoc.select('#section_abstract').first();
            if (child != null) {
              child = child.parent();
              desc = child.text();
              // Drop a leading 'Abstract' and then a leading 'Summary' heading.
              if (desc.indexOf('Abstract') == 0) {
                desc = desc.substring(8);
              }
              if (desc.indexOf('Summary') == 0) {
                desc = desc.substring(7);
              }
            }
          } catch (e) {
            g_env.error(e);
          }
          saveArticle(title + ' | ' + titleV + ' | ' + p_title, link, desc);
        }
      } catch (e) {
        g_env.error(e);
      }
    }
  } catch (e) {
    g_env.error(e);
  }
}

/**
 * Stores an article as a 'Link' entity unless one with the same url exists,
 * then logs its title, link and description.
 * The 'site' field is set to the host of the link; if the link cannot be
 * parsed, the error is reported and the entity is saved without it.
 *
 * @param title article title
 * @param link  article URL (stored in the 'url' field)
 * @param desc  article description / abstract
 */
function saveArticle(title, link, desc) {
  if (findLink(link) != null) return;

  var record = newEntity();
  record.setSchema('s|url|a|title|a|desc|s|fixed|d|score|s|site|s|inbound|s|code');
  record.setKind('Link');
  record.setId(g_env.uniqid());
  record.setString('url', link);
  record.setString('title', title);
  record.setString('desc', desc);
  record.setString('fixed', 'false');
  record.setString('inbound', '');
  record.setDouble('score', 0);
  record.setString('code', g_env.suniqid());
  try {
    record.setString('site', g_env.newURL(link).getHost());
  } catch (hostErr) {
    g_env.error(hostErr);
  }
  record.save();

  g_env.info('\r\nTitle: ' + title + '\r\nLink: ' + link + '\r\nDesc: ' + desc);
}

/**
 * Resets the mark of every title currently marked 'crawled'.
 * Titles are fetched in batches via loadTitleCrawled() until a batch is
 * empty; progress is framed by two info messages.
 */
function clearCache() {
  g_env.info('Start clearing cache');
  for (var batch = loadTitleCrawled(); batch.size() > 0; batch = loadTitleCrawled()) {
    for (var k = 0; k < batch.size(); k++) {
      var stale = batch.get(k);
      stale.setMark('');
      stale.save();
    }
  }
  g_env.info('End clearing cache');
}

/**
 * Searches the '<g_site>_Title' kind for titles whose mark is 'crawled'.
 *
 * @return the search result (the search is asked for at most 10 hits)
 */
function loadTitleCrawled() {
  var probe = newEntity();
  var query = probe.newBooleanQuery();
  var crawledTerm = probe.newTerm(probe.MARK, 'crawled');
  var crawledQuery = probe.newTermQuery(crawledTerm);
  query.add(probe.newBooleanClause(crawledQuery, probe.occurMust()));
  return probe.search(g_site + '_Title', query, 10);
}

/**
 * Searches the '<g_site>_Title' kind for titles that are NOT marked
 * 'crawled' (match-all combined with a must-not clause on the mark).
 *
 * @return the search result (the search is asked for at most 10 hits)
 */
function loadTitleFresh() {
  var probe = newEntity();
  var query = probe.newBooleanQuery();
  var everything = probe.newMatchAllDocsQuery();
  query.add(probe.newBooleanClause(everything, probe.occurMust()));
  var crawledQuery = probe.newTermQuery(probe.newTerm(probe.MARK, 'crawled'));
  query.add(probe.newBooleanClause(crawledQuery, probe.occurMustNot()));
  return probe.search(g_site + '_Title', query, 10);
}

/**
 * Looks up a title entity of kind '<g_site>_Title' by its 'link' field.
 *
 * @param link link value to search for
 * @return the first matching entity, or null when the search has no hits
 */
function findTitleByLink(link) {
  var probe = newEntity();
  var byLink = probe.newTermQuery(probe.newTerm('link', link));
  var hits = probe.search(g_site + '_Title', byLink, 1);
  return hits.size() == 0 ? null : hits.get(0);
}

/**
 * Looks up an entity of kind 'Link' by its 'url' field.
 *
 * @param link URL value to search for
 * @return the first matching entity, or null when the search has no hits
 */
function findLink(link) {
  var probe = newEntity();
  var byUrl = probe.newTermQuery(probe.newTerm('url', link));
  var hits = probe.search('Link', byUrl, 1);
  return hits.size() == 0 ? null : hits.get(0);
}
    

  Protected by Copyscape Online Copyright Protection

Grab book/journal from ScienceDirect

Grab book/journal from ScienceDirect
This task uses a JavaScript sandbox with jsoup and Lucene support to grab books and journals from ScienceDirect.
Grab book/journal from ScienceDirect
  1. Create javascript sandbox with jsoup support
  2. Add Lucene support to javascript sandbox
  3. Create the following JavaScript
javascript
// Prefix of the '<site>_Title' entity kind written by saveTitle() and queried by findTitleByLink().
var g_site = 'sciencedirect.com';
// Sandbox environment object; assigned in main() from p_env.
var g_env;

/**
 * Sandbox entry point: stores the environment in the global g_env and starts
 * the crawl via run().
 *
 * @param p_env  sandbox environment object
 * @param p_args accepted but not used by this function
 */
function main(p_env, p_args) {
  g_env = p_env;
  run();
}

// Returns a new entity obtained from g_env.newEntity().
function newEntity() {
  return g_env.newEntity();
}

/**
 * Loads a page through the sandbox's jsoup connection.
 *
 * @param url address to fetch (60 s timeout, Firefox user agent)
 * @return the document returned by the connection's get()
 */
function loadUrl(url) {
  var agent = 'Mozilla/5.0 (Windows NT x.y; rv:10.0.1) Gecko/20100101 Firefox/10.0.1';
  var request = g_env.newJsoup().connect(url);
  request.userAgent(agent);
  request.timeout(60000);
  var page = request.get();
  return page;
}

/**
 * Grabs every ScienceDirect subject category listed below, in order,
 * between a 'Starting' and an 'Ending' info message.
 */
function run() {
  var categories = [
    'http://www.sciencedirect.com/science/browse/sub/physicalsciences',
    'http://www.sciencedirect.com/science/browse/sub/chemicaleng',
    'http://www.sciencedirect.com/science/browse/sub/chemistry',
    'http://www.sciencedirect.com/science/browse/sub/computerscience',
    'http://www.sciencedirect.com/science/browse/sub/earth',
    'http://www.sciencedirect.com/science/browse/sub/energy',
    'http://www.sciencedirect.com/science/browse/sub/engineering',
    'http://www.sciencedirect.com/science/browse/sub/materialsscience',
    'http://www.sciencedirect.com/science/browse/sub/mathematics',
    'http://www.sciencedirect.com/science/browse/sub/physics',
    'http://www.sciencedirect.com/science/browse/sub/lifesciences',
    'http://www.sciencedirect.com/science/browse/sub/agribio',
    'http://www.sciencedirect.com/science/browse/sub/biochemgenmolbiol',
    'http://www.sciencedirect.com/science/browse/sub/environmental',
    'http://www.sciencedirect.com/science/browse/sub/immunolmicrobiol',
    'http://www.sciencedirect.com/science/browse/sub/neuroscience',
    'http://www.sciencedirect.com/science/browse/sub/healthsciences',
    'http://www.sciencedirect.com/science/browse/sub/medicinedentistry',
    'http://www.sciencedirect.com/science/browse/sub/nursinghealth',
    'http://www.sciencedirect.com/science/browse/sub/pharmatox',
    'http://www.sciencedirect.com/science/browse/sub/vetscimed',
    'http://www.sciencedirect.com/science/browse/sub/socialscienceshumanities',
    'http://www.sciencedirect.com/science/browse/sub/artsandhumanities',
    'http://www.sciencedirect.com/science/browse/sub/busmanacc',
    'http://www.sciencedirect.com/science/browse/sub/decisionsciences',
    'http://www.sciencedirect.com/science/browse/sub/economics',
    'http://www.sciencedirect.com/science/browse/sub/psychology',
    'http://www.sciencedirect.com/science/browse/sub/socialsciences'
  ];
  g_env.info('Starting');
  for (var c = 0; c < categories.length; c++) {
    grabCategory(categories[c]);
  }
  g_env.info('Ending');
}

/**
 * Grabs all titles of one subject category.
 * Requests the sub-pages '<cat>/a' ... '<cat>/z' and '<cat>/0-9'; for each
 * browse row it reads title and link from the first column and the kind from
 * the fourth, and saves the title when all three are non-empty.
 * A failing sub-page is reported through g_env.error and skipped.
 *
 * @param cat category URL without a trailing slash
 */
function grabCategory(cat) {
  try {
    var pages = 'abcdefghijklmnopqrstuvwxyz'.split('').concat('0-9');
    for (var p = 0; p < pages.length; p++) {
      try {
        var url = cat + '/' + pages[p];
        var listing = loadUrl(url);
        var rows = listing.select('#content_browseimp tr.browseimpBrowseRow');
        for (var r = 0; r < rows.size(); r++) {
          var row = rows.get(r);
          var title = g_env.newString('');
          var link = g_env.newString('');
          var kind = g_env.newString('');
          var nameCell = row.select('.browseColFirst a').first();
          if (nameCell != null) {
            title = nameCell.text();
            var href = nameCell.attr('href');
            link = g_env.newString(g_env.newURL(g_env.newURL(url), href) + '');
          }
          var kindCell = row.select('.browseColFourth').first();
          if (kindCell != null) {
            kind = kindCell.text().trim();
          }
          if (title.length() <= 0 || link.length() <= 0 || kind.length() <= 0) continue;
          saveTitle(title, link, kind);
        }
      } catch (pageErr) {
        g_env.error(pageErr);
      }
    }
  } catch (err) {
    g_env.error(err);
  }
}

/**
 * Stores a title as a '<g_site>_Title' entity unless one with the same link
 * already exists, then logs "<kind> | <title> | <link>".
 *
 * @param title title text
 * @param link  absolute link of the title
 * @param kind  kind text taken from the browse table
 */
function saveTitle(title, link, kind) {
  var known = findTitleByLink(link);
  if (known != null) return;
  var record = newEntity();
  record.setSchema('s|link|s|title|s|kind');
  record.setKind(g_site + '_Title');
  record.setId(g_env.uniqid());
  record.setString('link', link);
  record.setString('title', title);
  record.setString('kind', kind);
  record.save();
  g_env.info(kind + ' | ' + title + ' | ' + link);
}

/**
 * Looks up a title entity of kind '<g_site>_Title' by its 'link' field.
 *
 * @param link link value to search for
 * @return the first matching entity, or null when the search has no hits
 */
function findTitleByLink(link) {
  var probe = newEntity();
  var byLink = probe.newTermQuery(probe.newTerm('link', link));
  var hits = probe.search(g_site + '_Title', byLink, 1);
  return hits.size() == 0 ? null : hits.get(0);
}
    

  Protected by Copyscape Online Copyright Protection

Wednesday 6 June 2012

Grab video from DailyMotion

Grab video from DailyMotion
This task uses a JavaScript sandbox with jsoup and Lucene support to grab videos from DailyMotion.
Grab video from DailyMotion
  1. Create javascript sandbox with jsoup support
  2. Add Lucene support to javascript sandbox
  3. Create the following JavaScript
javascript
// Sandbox environment object; assigned in main() from penv.
var env;
// Arguments passed to main(); stored there but not read by the code shown here.
var args;
// Highest page number grabCategory() requests for a single category.
var maxpage = 1000;

/**
 * Sandbox entry point; runs the crawler in an endless cycle.
 * Each cycle first drains the queue (batches from loadQueue(): every entry is
 * grabbed and then flagged crawled='true') and afterwards scans all category
 * pages listed below, which may enqueue new videos for the next cycle.
 *
 * @param penv  sandbox environment object, stored in the global env
 * @param pargs sandbox arguments, stored in the global args
 */
function main(penv, pargs) {
    env = penv;
    args = pargs;
    env.info('Starting');
    var categories = [
        'http://www.dailymotion.com/group/in_theaters_this_week/',
        'http://www.dailymotion.com/group/in_theaters_now/',
        'http://www.dailymotion.com/group/coming_soon/',
        'http://www.dailymotion.com/user/ReelzChannel/lang/en/search/movie+review/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/interview+movie/channel/shortfilms/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/movie+news/channel/shortfilms/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/horror/channel/shortfilms/',
        'http://www.dailymotion.com/user/hollywoodtv/',
        'http://www.dailymotion.com/creative-official/tag/rock/lang/en/channel/music/',
        'http://www.dailymotion.com/creative-official/tag/pop/lang/en/channel/music/',
        'http://www.dailymotion.com/creative-official/tag/hop/lang/en/channel/music/',
        'http://www.dailymotion.com/creative-official/tag/alternative/lang/en/channel/music/',
        'http://www.dailymotion.com/creative-official/tag/dance/lang/en/channel/music/',
        'http://www.dailymotion.com/creative-official/tag/soul/lang/en/channel/music/',
        'http://www.dailymotion.com/creative-official/tag/latin/lang/en/channel/music/',
        'http://www.dailymotion.com/creative-official/tag/country/lang/en/channel/music/',
        'http://www.dailymotion.com/user/ClassicGameRoom/',
        'http://www.dailymotion.com/mychannel/ClassicGameRoom/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/fight/channel/videogames/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/strategy/channel/videogames/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/shooter/channel/videogames/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/action/channel/videogames/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/sport/channel/videogames/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/trailer/channel/videogames/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/review/channel/videogames/',
        'http://www.dailymotion.com/creative-official/lang/en/search/nfl/',
        'http://www.dailymotion.com/creative-official/lang/en/search/nba/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/nhl/channel/sport/',
        'http://www.dailymotion.com/user/TotalCollegeSports/',
        'http://www.dailymotion.com/user/UFC/',
        'http://www.dailymotion.com/user/sportsillustrated/',
        'http://www.dailymotion.com/user/transworld/',
        'http://www.dailymotion.com/user/rooftopcomedy/',
        'http://www.dailymotion.com/playlist/x1qzsd_MyDamnChannel_dicki/',
        'http://www.dailymotion.com/playlist/x1r5r4_MyDamnChannel_easy-to-assemble-season-3/',
        'http://www.dailymotion.com/user/Rhettandlink/',
        'http://www.dailymotion.com/user/epicmealtime/',
        'http://www.dailymotion.com/group/familyguy/',
        'http://www.dailymotion.com/creative/lang/en/channel/fun/',
        'http://www.dailymotion.com/group/nbcnightlynews/',
        'http://www.dailymotion.com/user/reuters/',
        'http://www.dailymotion.com/user/NewsLook/',
        'http://www.dailymotion.com/user/NYMag/',
        'http://www.dailymotion.com/user/itnnews/',
        'http://www.dailymotion.com/user/Buzz60/',
        'http://www.dailymotion.com/user/associatedpress/',
        'http://www.dailymotion.com/us/featured/channel/news/',
        'http://www.dailymotion.com/user/clevvertv/',
        'http://www.dailymotion.com/user/tvguide/',
        'http://www.dailymotion.com/user/splashnews/',
        'http://www.dailymotion.com/user/hollywoodbackstage/',
        'http://www.dailymotion.com/user/celebtv/',
        'http://www.dailymotion.com/user/maximotv/',
        'http://www.dailymotion.com/user/mojosupreme/',
        'http://www.dailymotion.com/user/DiagonalView/',
        'http://www.dailymotion.com/hub/x38_Motionmaker-documentaries/',
        'http://www.dailymotion.com/user/tysihelp/',
        'http://www.dailymotion.com/user/computerTV/',
        'http://www.dailymotion.com/user/soldierknowsbest/',
        'http://www.dailymotion.com/user/appjudgment/',
        'http://www.dailymotion.com/user/geekbeattv/',
        'http://www.dailymotion.com/user/allthingsscience/',
        'http://www.dailymotion.com/user/stuffwelike/',
        'http://www.dailymotion.com/user/lifehackershow/',
        'http://www.dailymotion.com/us/channel/auto/'
    ];
    for (;;) {
        var batch = loadQueue();
        env.info('Size: ' + batch.size());
        for (; batch.size() > 0; batch = loadQueue()) {
            for (var k = 0; k < batch.size(); k++) {
                var entry = batch.get(k);
                grabVideo(entry);
                entry.setString('crawled', 'true');
                entry.save();
            }
        }
        for (var c = 0; c < categories.length; c++) {
            grabCategory(categories[c]);
        }
    }
    // Never reached: the cycle above has no exit.
    env.info('Ending');
}

/**
 * Fetches a queued video page, reads its description from
 * '#video_description' (empty when absent) and stores the video via
 * saveLink(). Errors are reported through env.error.
 *
 * @param queue queue entity providing the 'url', 'title' and 'image' fields
 */
function grabVideo(queue) {
    try {
        var url = queue.getString('url');
        var title = queue.getString('title');
        var image = queue.getString('image');
        var page = env.newJsoup()
            .connect(url)
            .userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101')
            .timeout(60000)
            .get();
        var descNode = page.select('#video_description').first();
        var desc = env.newString('');
        if (descNode != null) {
            desc = descNode.text();
        }
        saveLink(title, url, desc, image, '');
    } catch (err) {
        env.error(err);
    }
}

/**
 * Stores a video as a 'Link' entity unless its url is already known, then
 * logs "<title> | <url>".
 * Everything from the last '&feature=' onwards is cut from the url first.
 * The 'site' field is set to the url's host; if the url cannot be parsed,
 * the error is reported and the entity is saved without it.
 *
 * @param title video title
 * @param url   video URL
 * @param desc  video description
 * @param image preview image URL
 * @param price price text stored in the 'price' field
 */
function saveLink(title, url, desc, image, price) {
    var cleanUrl = env.newString(url);
    var cut = cleanUrl.lastIndexOf('&feature=');
    if (cut >= 0) {
        cleanUrl = cleanUrl.substring(0, cut);
    }
    if (findLinkByUrl(cleanUrl)) return;

    var record = env.newEntity();
    record.setSchema('s|url|a|title|a|desc|s|fixed|d|score|s|site|s|image|s|price');
    record.setKind('Link');
    record.setId(env.uniqid());
    record.setString('url', cleanUrl);
    record.setString('title', title);
    record.setString('desc', desc);
    record.setString('fixed', 'true');
    record.setDouble('score', 100);
    record.setString('image', image);
    record.setString('price', price);
    try {
        record.setString('site', env.newURL(cleanUrl).getHost());
    } catch (hostErr) {
        env.error(hostErr);
    }
    record.save();
    env.info(title + ' | ' + cleanUrl);
}

/**
 * Tells whether a 'Link' entity with the given url already exists.
 *
 * @param url value of the 'url' field to look for
 * @return true when the count for that term query is greater than zero
 */
function findLinkByUrl(url) {
    var probe = env.newEntity();
    var byUrl = probe.newTermQuery(probe.newTerm('url', url));
    return probe.count('Link', byUrl, 1) > 0;
}

/**
 * Scans the playlist pages of one category and queues every video found.
 * Pages '<catUrl>1?mode=playlist', '<catUrl>2?mode=playlist', ... are
 * requested up to maxpage; scanning stops at the first page without video
 * items. A page that fails is reported through env.error and the next page
 * is tried.
 *
 * @param catUrl category URL; the page number is appended directly, so the
 *               URL is expected to end with '/'
 */
function grabCategory(catUrl) {
    env.info('Category: ' + catUrl);
    for (var no = 1; no <= maxpage; no++) {
        try {
            var conn = env.newJsoup().connect(catUrl + no + '?mode=playlist').userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101');
            var doc = conn.timeout(60000).get();
            var items = doc.select('.dmpi_video_item');
            if (items.size() == 0) break;
            for (var i = 0; i < items.size(); i++) {
                var item = items.get(i);
                var child = item.select('.dmpi_video_title a').first();
                if (child == null) continue;
                var title = child.text().trim();
                var url = env.newString(env.newURL(env.newURL('http://www.dailymotion.com'), child.attr('href')) + '');
                child = item.select('.dmpi_video_preview a img').first();
                var image = env.newString('');
                if (child != null) {
                    var tmp = env.newString(child.attr('data-spr'));
                    // Swap the sprite preview for the medium-sized preview image.
                    // (The method name was garbled in the original, making the script unparsable.)
                    tmp = tmp.replaceAll('jpeg_preview_sprite.jpg', 'jpeg_preview_medium.jpg');
                    image = env.newString(env.newURL(env.newURL('http://www.dailymotion.com'), tmp) + '');
                }
                markVideo(title, url, image);
            }
        } catch (e) {
            env.error(e);
        }
    }
}

/**
 * Adds a video to the 'Queue_DailyMotion' queue with crawled='false',
 * unless a queue entry with the same url already exists.
 *
 * @param title video title
 * @param url   video URL
 * @param image preview image URL
 */
function markVideo(title, url, image) {
    var alreadyQueued = findQueueByUrl(url);
    if (alreadyQueued) return;
    var record = env.newEntity();
    record.setSchema('s|url|s|title|s|image|s|crawled');
    record.setKind('Queue_DailyMotion');
    record.setId(env.uniqid());
    record.setString('url', url);
    record.setString('title', title);
    record.setString('image', image);
    record.setString('crawled', 'false');
    record.save();
}

/**
 * Tells whether a 'Queue_DailyMotion' entry with the given url exists.
 *
 * @param url value of the 'url' field to look for
 * @return true when the count for that term query is greater than zero
 */
function findQueueByUrl(url) {
    var probe = env.newEntity();
    var byUrl = probe.newTermQuery(probe.newTerm('url', url));
    return probe.count('Queue_DailyMotion', byUrl, 1) > 0;
}

/**
 * Searches 'Queue_DailyMotion' for entries whose 'crawled' field is 'false'.
 *
 * @return the search result (the search is asked for at most 10 hits)
 */
function loadQueue() {
    var probe = env.newEntity();
    var pending = probe.newTermQuery(probe.newTerm('crawled', 'false'));
    return probe.search('Queue_DailyMotion', pending, 10);
}
    

  Protected by Copyscape Online Copyright Protection

Grab video from YouTube

Grab video from YouTube
This task uses a JavaScript sandbox with jsoup and Lucene support to grab videos from YouTube.
Grab video from YouTube
  1. Create javascript sandbox with jsoup support
  2. Add Lucene support to javascript sandbox
  3. Create the following JavaScript
javascript
// Sandbox environment object; assigned in main() from penv.
var env;
// Arguments passed to main(); stored there but not read by the code shown here.
var args;

/**
 * Sandbox entry point; runs the crawler in an endless cycle.
 * Each cycle first drains the queue (batches from loadQueue(): every entry is
 * grabbed and then flagged crawled='true') and afterwards scans the category
 * pages listed below, which may enqueue new videos for the next cycle.
 *
 * @param penv  sandbox environment object, stored in the global env
 * @param pargs sandbox arguments, stored in the global args
 */
function main(penv, pargs) {
    env = penv;
    args = pargs;
    env.info('Starting');
    var categories = [
        'http://www.youtube.com/autos',
        'http://www.youtube.com/comedy',
        'http://www.youtube.com/entertainment',
        'http://www.youtube.com/film',
        'http://www.youtube.com/gaming',
        'http://www.youtube.com/howto',
        'http://www.youtube.com/activism',
        'http://www.youtube.com/people',
        'http://www.youtube.com/pets',
        'http://www.youtube.com/science',
        'http://www.youtube.com/videos?c=17',
        'http://www.youtube.com/travel'
    ];
    for (;;) {
        var batch = loadQueue();
        env.info('Size: ' + batch.size());
        for (; batch.size() > 0; batch = loadQueue()) {
            for (var k = 0; k < batch.size(); k++) {
                var entry = batch.get(k);
                grabVideo(entry);
                entry.setString('crawled', 'true');
                entry.save();
            }
        }
        for (var c = 0; c < categories.length; c++) {
            grabCategory(categories[c]);
        }
    }
    // Never reached: the cycle above has no exit.
    env.info('Ending');
}

/**
 * Fetches a queued video page, stores the video and queues its related videos.
 * The description is read from '#eow-description', falling back to '#ded',
 * and the video is saved via saveLink(). The related-video list is then taken
 * from the script text between 'var rvl =' and 'var cml =' in the page HTML;
 * each entry's k/t/i properties are used as video id, title and image path
 * for markVideo(). Errors are reported through env.error.
 *
 * @param queue queue entity providing the 'url', 'title' and 'image' fields
 */
function grabVideo(queue) {
    try {
        var url = queue.getString('url');
        var title = queue.getString('title');
        var image = queue.getString('image');
        var conn = env.newJsoup().connect(url).userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101');
        var doc = conn.timeout(60000).get();
        var child = doc.select('#eow-description').first();
        var desc = env.newString('');
        if (child != null) {
            desc = child.text();
        } else {
            child = doc.select('#ded').first();
            if (child != null) {
                desc = child.text();
            }
        }
        saveLink(title, url, desc, image, '');

        var html = doc.html();
        var pos1 = html.indexOf('var rvl =');
        var pos2 = html.indexOf('var cml =');
        if (pos1 < 0 || pos2 < 0 || pos1 >= pos2) return;
        // 9 is the length of 'var rvl ='.
        var js1 = html.substring(pos1 + 9, pos2);
        var obj1 = null;
        // SECURITY NOTE(review): this evaluates script text taken from a remote
        // page. Kept because the payload is a JavaScript literal followed by
        // ';', not strict JSON; consider a restricted parser.
        eval('obj1 = ' + js1);
        if (obj1 == null) return;
        for (var i = 0; i < obj1.length; i++) {
            var item = obj1[i];
            var url2 = env.newString('http://www.youtube.com/watch?v=' + item.k);
            var title2 = item.t;
            // Convert the resolved URL object to a string before wrapping it,
            // as every other newString(newURL(...)) call in this script does.
            var image2 = env.newString(env.newURL(env.newURL('http://www.youtube.com'), item.i) + '');
            markVideo(title2, url2, image2);
            env.info('Video: ' + title2 + ' | ' + url2 + ' | ' + image2);
        }
    } catch (e) {
        env.error(e);
    }
}

/**
 * Stores a video as a 'Link' entity unless its url is already known, then
 * logs "<title> | <url>".
 * Everything from the last '&feature=' onwards is cut from the url first.
 * The 'site' field is set to the url's host; if the url cannot be parsed,
 * the error is reported and the entity is saved without it.
 *
 * @param title video title
 * @param url   video URL
 * @param desc  video description
 * @param image preview image URL
 * @param price price text stored in the 'price' field
 */
function saveLink(title, url, desc, image, price) {
    var cleanUrl = env.newString(url);
    var cut = cleanUrl.lastIndexOf('&feature=');
    if (cut >= 0) {
        cleanUrl = cleanUrl.substring(0, cut);
    }
    if (findLinkByUrl(cleanUrl)) return;

    var record = env.newEntity();
    record.setSchema('s|url|a|title|a|desc|s|fixed|d|score|s|site|s|image|s|price');
    record.setKind('Link');
    record.setId(env.uniqid());
    record.setString('url', cleanUrl);
    record.setString('title', title);
    record.setString('desc', desc);
    record.setString('fixed', 'true');
    record.setDouble('score', 100);
    record.setString('image', image);
    record.setString('price', price);
    try {
        record.setString('site', env.newURL(cleanUrl).getHost());
    } catch (hostErr) {
        env.error(hostErr);
    }
    record.save();
    env.info(title + ' | ' + cleanUrl);
}

/**
 * Tells whether a 'Link' entity with the given url already exists.
 *
 * @param url value of the 'url' field to look for
 * @return true when the count for that term query is greater than zero
 */
function findLinkByUrl(url) {
    var probe = env.newEntity();
    var byUrl = probe.newTermQuery(probe.newTerm('url', url));
    return probe.count('Link', byUrl, 1) > 0;
}

/**
 * Scans one YouTube category page and queues every video listed on it.
 * Items without a title link are skipped; the image is left empty when an
 * item has no thumbnail element. Errors are reported through env.error.
 *
 * @param catUrl URL of the category page
 */
function grabCategory(catUrl) {
    try {
        var page = env.newJsoup()
            .connect(catUrl)
            .userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101')
            .timeout(60000)
            .get();
        var entries = page.select('.browse-item');
        for (var k = 0; k < entries.size(); k++) {
            var entry = entries.get(k);
            var anchor = entry.select('.browse-item-content h3 a').first();
            if (anchor == null) continue;
            var title = anchor.text().trim();
            var url = env.newString(env.newURL(env.newURL('http://www.youtube.com'), anchor.attr('href')) + '');
            var thumb = entry.select('.yt-thumb-clip-inner img').first();
            var image = env.newString('');
            if (thumb != null) {
                image = env.newString(env.newURL(env.newURL('http://www.youtube.com'), thumb.attr('data-thumb')) + '');
            }
            markVideo(title, url, image);
        }
    } catch (err) {
        env.error(err);
    }
}

/**
 * Adds a video to the 'Queue_YouTube' queue with crawled='false', unless a
 * queue entry with the same url already exists.
 *
 * @param title video title
 * @param url   video URL
 * @param image preview image URL
 */
function markVideo(title, url, image) {
    var alreadyQueued = findQueueByUrl(url);
    if (alreadyQueued) return;
    var record = env.newEntity();
    record.setSchema('s|url|s|title|s|image|s|crawled');
    record.setKind('Queue_YouTube');
    record.setId(env.uniqid());
    record.setString('url', url);
    record.setString('title', title);
    record.setString('image', image);
    record.setString('crawled', 'false');
    record.save();
}

/**
 * Tells whether a 'Queue_YouTube' entry with the given url exists.
 *
 * @param url value of the 'url' field to look for
 * @return true when the count for that term query is greater than zero
 */
function findQueueByUrl(url) {
    var probe = env.newEntity();
    var byUrl = probe.newTermQuery(probe.newTerm('url', url));
    return probe.count('Queue_YouTube', byUrl, 1) > 0;
}

/**
 * Searches 'Queue_YouTube' for entries whose 'crawled' field is 'false'.
 *
 * @return the search result (the search is asked for at most 10 hits)
 */
function loadQueue() {
    var probe = env.newEntity();
    var pending = probe.newTermQuery(probe.newTerm('crawled', 'false'));
    return probe.search('Queue_YouTube', pending, 10);
}
    

  Protected by Copyscape Online Copyright Protection