Wednesday, 25 July 2012

Grab search results from Yahoo

Grab search results from Yahoo
This task uses a JavaScript sandbox with jsoup support to grab search results from Yahoo.
Grab search results from Yahoo
  1. Create a JavaScript sandbox with jsoup support
  2. Create the JavaScript as follows
javascript
1var g_env;
2
3function main(p_env, p_args) {
4 g_env = p_env;
5 g_env.info('Starting');
6 run();
7 g_env.info('Ending');
8}
9
10function run() {
11 try {
12 var query = 'lucene';
13 for (var pn = 1; pn <= 10; pn++) {
14 var res = grab(query, pn);
15 for (var i = 0; i < res.size(); i++) {
16 var it = res.get(i);
17 var title = it.get('title');
18 var link = it.get('link');
19 var no = (pn - 1) * 10 + i + 1;
20 g_env.info(no + ' | ' + title + ' | ' + link);
21 }
22 }
23 } catch (e) {
24 g_env.error(e);
25 }
26}
27
28function grab(query, pageno) {
29 var tag = g_env.newArrayList();
30 try {
31 var url = 'http://search.yahoo.com/search?p=' + g_env.encodeURL(query, 'UTF-8') + '&pstart=1&b=' + ((pageno - 1) * 10 + 1);
32 var conn = g_env.newJsoup().connect(url);
33 conn.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1');
34 conn.timeout(60000);
35 var doc = conn.get();
36 var nodes = doc.select('#web .res');
37 for (var i = 0; i < nodes.size(); i++) {
38 var node = nodes.get(i);
39 var child = node.select('a.yschttl').first();
40 var title = child.text();
41 var link = child.attr('href');
42 var pos = link.indexOf('**');
43 if (pos >= 0) {
44 link = link.substring(pos + 2);
45 link = g_env.decodeURL(link, 'UTF-8');
46 }
47 var it = g_env.newHashMap();
48 it.put('title', title);
49 it.put('link', link);
50 tag.add(it);
51 }
52 } catch (e) {
53 g_env.error(e);
54 }
55 return tag;
56}
// Sandbox environment object; assigned in main() and used by run()/grab() for
// logging (info/error) and for the list, map, URL-encoding and jsoup helpers.
var g_env;

/**
 * Script entry point: publishes the environment object through g_env and
 * runs the crawl, logging one message before and one after.
 *
 * @param p_env  environment object; info() is called on it here
 * @param p_args script arguments (not read by this function)
 */
function main(p_env, p_args) {
  g_env = p_env;
  p_env.info('Starting');
  run();
  g_env.info('Ending');
}

/**
 * Requests result pages 1 through 10 for the fixed query 'lucene' and logs
 * each hit as "<running number> | <title> | <link>".
 * Any exception raised on the way is handed to g_env.error.
 */
function run() {
  try {
    var keyword = 'lucene';
    var page = 1;
    while (page <= 10) {
      var hits = grab(keyword, page);
      var idx = 0;
      while (idx < hits.size()) {
        var hit = hits.get(idx);
        var hitTitle = hit.get('title');
        var hitLink = hit.get('link');
        // The running number is computed as if every page held ten hits.
        var seq = (page - 1) * 10 + idx + 1;
        g_env.info(seq + ' | ' + hitTitle + ' | ' + hitLink);
        idx++;
      }
      page++;
    }
  } catch (err) {
    g_env.error(err);
  }
}

/**
 * Downloads one Yahoo result page and extracts title/link pairs.
 *
 * @param query  search phrase; URL-encoded with g_env.encodeURL before use
 * @param pageno 1-based page number; sent as the b offset ((pageno - 1) * 10 + 1)
 * @returns list from g_env.newArrayList() holding one map per hit with the
 *          keys 'title' and 'link'; holds whatever was collected before a failure
 */
function grab(query, pageno) {
  var tag = g_env.newArrayList();
  try {
    var url = 'http://search.yahoo.com/search?p=' + g_env.encodeURL(query, 'UTF-8') + '&pstart=1&b=' + ((pageno - 1) * 10 + 1);
    var conn = g_env.newJsoup().connect(url);
    conn.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1');
    conn.timeout(60000);
    var doc = conn.get();
    var nodes = doc.select('#web .res');
    for (var i = 0; i < nodes.size(); i++) {
      var node = nodes.get(i);
      var child = node.select('a.yschttl').first();
      // A result block without a title anchor used to throw here and abort
      // the remainder of the page; skip just that block instead.
      if (child == null) continue;
      var title = child.text();
      var link = child.attr('href');
      // When the href contains '**', the part after it is the URL-encoded target.
      var pos = link.indexOf('**');
      if (pos >= 0) {
        link = link.substring(pos + 2);
        link = g_env.decodeURL(link, 'UTF-8');
      }
      var it = g_env.newHashMap();
      it.put('title', title);
      it.put('link', link);
      tag.add(it);
    }
  } catch (e) {
    g_env.error(e);
  }
  return tag;
}

  Protected by Copyscape Online Copyright Protection

Grab search results from Bing

Grab search results from Bing
This task uses a JavaScript sandbox with jsoup support to grab search results from Bing.
Grab search results from Bing
  1. Create a JavaScript sandbox with jsoup support
  2. Create the JavaScript as follows
javascript
1var g_env;
2
3function main(p_env, p_args) {
4 g_env = p_env;
5 g_env.info('Starting');
6 run();
7 g_env.info('Ending');
8}
9
10function run() {
11 try {
12 var query = 'lucene';
13 for (var pn = 1; pn <= 10; pn++) {
14 var res = grab(query, pn);
15 for (var i = 0; i < res.size(); i++) {
16 var it = res.get(i);
17 var title = it.get('title');
18 var link = it.get('link');
19 var no = (pn - 1) * 10 + i + 1;
20 g_env.info(no + ' | ' + title + ' | ' + link);
21 }
22 }
23 } catch (e) {
24 g_env.error(e);
25 }
26}
27
28function grab(query, pageno) {
29 var tag = g_env.newArrayList();
30 try {
31 var url = 'http://www.bing.com/search?q=' + g_env.encodeURL(query, 'UTF-8') + '&first=' + ((pageno - 1) * 10 + 1);
32 var conn = g_env.newJsoup().connect(url);
33 conn.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1');
34 conn.timeout(60000);
35 var doc = conn.get();
36 var nodes = doc.select('#results .sa_wr');
37 for (var i = 0; i < nodes.size(); i++) {
38 var node = nodes.get(i);
39 var child = node.select('.sa_cc .sa_mc .sb_tlst a').first();
40 var title = child.text();
41 var link = child.attr('href');
42 var it = g_env.newHashMap();
43 it.put('title', title);
44 it.put('link', link);
45 tag.add(it);
46 }
47 } catch (e) {
48 g_env.error(e);
49 }
50 return tag;
51}
// Sandbox environment object; assigned in main() and used by run()/grab() for
// logging (info/error) and for the list, map, URL-encoding and jsoup helpers.
var g_env;

/**
 * Script entry point: stores the environment object in g_env, then runs the
 * crawl between a 'Starting' and an 'Ending' log message.
 *
 * @param p_env  environment object; info() is called on it here
 * @param p_args script arguments (not read by this function)
 */
function main(p_env, p_args) {
  g_env = p_env;
  p_env.info('Starting');
  run();
  g_env.info('Ending');
}

/**
 * Walks result pages 1..10 for the fixed query 'lucene' and logs each hit as
 * "<running number> | <title> | <link>". Exceptions go to g_env.error.
 */
function run() {
  try {
    var term = 'lucene';
    for (var pageNo = 1; pageNo <= 10; pageNo += 1) {
      var found = grab(term, pageNo);
      for (var k = 0; k < found.size(); k += 1) {
        var entry = found.get(k);
        var entryTitle = entry.get('title');
        var entryLink = entry.get('link');
        // Numbering is computed as if every page held ten hits.
        var ordinal = (pageNo - 1) * 10 + k + 1;
        g_env.info(ordinal + ' | ' + entryTitle + ' | ' + entryLink);
      }
    }
  } catch (failure) {
    g_env.error(failure);
  }
}

/**
 * Downloads one Bing result page and extracts title/link pairs.
 *
 * @param query  search phrase; URL-encoded with g_env.encodeURL before use
 * @param pageno 1-based page number; sent as first=((pageno - 1) * 10 + 1)
 * @returns list from g_env.newArrayList() holding one map per hit with the
 *          keys 'title' and 'link'; holds whatever was collected before a failure
 */
function grab(query, pageno) {
  var tag = g_env.newArrayList();
  try {
    var url = 'http://www.bing.com/search?q=' + g_env.encodeURL(query, 'UTF-8') + '&first=' + ((pageno - 1) * 10 + 1);
    var conn = g_env.newJsoup().connect(url);
    conn.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1');
    conn.timeout(60000);
    var doc = conn.get();
    var nodes = doc.select('#results .sa_wr');
    for (var i = 0; i < nodes.size(); i++) {
      var node = nodes.get(i);
      var child = node.select('.sa_cc .sa_mc .sb_tlst a').first();
      // A result block without a title anchor used to throw here and abort
      // the remainder of the page; skip just that block instead.
      if (child == null) continue;
      var title = child.text();
      var link = child.attr('href');
      var it = g_env.newHashMap();
      it.put('title', title);
      it.put('link', link);
      tag.add(it);
    }
  } catch (e) {
    g_env.error(e);
  }
  return tag;
}

  Protected by Copyscape Online Copyright Protection

Tuesday, 24 July 2012

Grab search results from Google

Grab search results from Google
This task uses a JavaScript sandbox with jsoup support to grab search results from Google.
Grab search results from Google
  1. Create a JavaScript sandbox with jsoup support
  2. Create the JavaScript as follows
javascript
1var g_env;
2
3function main(p_env, p_args) {
4 g_env = p_env;
5 g_env.info('Starting');
6 run();
7 g_env.info('Ending');
8}
9
10function run() {
11 try {
12 var query = 'lucene';
13 for (var pn = 1; pn <= 10; pn++) {
14 var res = grab(query, pn);
15 for (var i = 0; i < res.size(); i++) {
16 var it = res.get(i);
17 var title = it.get('title');
18 var link = it.get('link');
19 var no = (pn - 1) * 10 + i + 1;
20 g_env.info(no + ' | ' + title + ' | ' + link);
21 }
22 }
23 } catch (e) {
24 g_env.error(e);
25 }
26}
27
28function grab(query, pageno) {
29 var tag = g_env.newArrayList();
30 try {
31 var url = 'http://google.com/search?q=' + g_env.encodeURL(query, 'UTF-8') + '&start=' + ((pageno - 1) * 10);
32 var conn = g_env.newJsoup().connect(url);
33 conn.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1');
34 conn.timeout(60000);
35 var doc = conn.get();
36 var nodes = doc.select('#rso .g');
37 for (var i = 0; i < nodes.size(); i++) {
38 var node = nodes.get(i);
39 var child = node.select('.vsc .r .l');
40 var title = child.text();
41 var link = child.attr('href');
42 var it = g_env.newHashMap();
43 it.put('title', title);
44 it.put('link', link);
45 tag.add(it);
46 }
47 } catch (e) {
48 g_env.error(e);
49 }
50 return tag;
51}
// Sandbox environment object; assigned in main() and used by run()/grab() for
// logging (info/error) and for the list, map, URL-encoding and jsoup helpers.
var g_env;

/**
 * Script entry point: remembers the environment object in g_env and runs the
 * crawl, bracketed by 'Starting' and 'Ending' log messages.
 *
 * @param p_env  environment object; info() is called on it here
 * @param p_args script arguments (not read by this function)
 */
function main(p_env, p_args) {
  g_env = p_env;
  p_env.info('Starting');
  run();
  g_env.info('Ending');
}

/**
 * Grabs ten result pages for the fixed query 'lucene' and writes every hit to
 * the log as "<running number> | <title> | <link>". Exceptions are passed to
 * g_env.error.
 */
function run() {
  try {
    var phrase = 'lucene';
    var pageIndex = 1;
    do {
      var results = grab(phrase, pageIndex);
      for (var pos = 0; pos < results.size(); pos++) {
        var item = results.get(pos);
        var itemTitle = item.get('title');
        var itemLink = item.get('link');
        // Numbering is computed as if every page held ten hits.
        var number = (pageIndex - 1) * 10 + pos + 1;
        g_env.info(number + ' | ' + itemTitle + ' | ' + itemLink);
      }
      pageIndex++;
    } while (pageIndex <= 10);
  } catch (problem) {
    g_env.error(problem);
  }
}

/**
 * Downloads one Google result page and extracts title/link pairs.
 *
 * @param query  search phrase; URL-encoded with g_env.encodeURL before use
 * @param pageno 1-based page number; sent as start=((pageno - 1) * 10)
 * @returns list from g_env.newArrayList() holding one map per hit with the
 *          keys 'title' and 'link'; holds whatever was collected before a failure
 */
function grab(query, pageno) {
  var tag = g_env.newArrayList();
  try {
    var url = 'http://google.com/search?q=' + g_env.encodeURL(query, 'UTF-8') + '&start=' + ((pageno - 1) * 10);
    var conn = g_env.newJsoup().connect(url);
    conn.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1');
    conn.timeout(60000);
    var doc = conn.get();
    var nodes = doc.select('#rso .g');
    for (var i = 0; i < nodes.size(); i++) {
      var node = nodes.get(i);
      // Take the first matching title anchor, as the Yahoo and Bing scripts do.
      // Reading text()/attr() from the whole selection recorded an empty
      // title/link pair for every '.g' block that has no title link.
      var child = node.select('.vsc .r .l').first();
      if (child == null) continue;
      var title = child.text();
      var link = child.attr('href');
      var it = g_env.newHashMap();
      it.put('title', title);
      it.put('link', link);
      tag.add(it);
    }
  } catch (e) {
    g_env.error(e);
  }
  return tag;
}

  Protected by Copyscape Online Copyright Protection

Saturday, 21 July 2012

Grab article from ScienceDirect

Grab article from ScienceDirect
This task uses a JavaScript sandbox with jsoup and Lucene support to grab articles from ScienceDirect.
Grab article from ScienceDirect
  1. Create a JavaScript sandbox with jsoup support
  2. Add Lucene support to the JavaScript sandbox
  3. Create the JavaScript as follows
javascript
1var g_title = '';
2var g_cache = true;
3var g_site = 'sciencedirect.com';
4var g_env;
5var g_cookie;
6
7function main(p_env, p_args) {
8 g_env = p_env;
9 run();
10}
11
12function newEntity() {
13 return g_env.newEntity();
14}
15
16function loadUrlCookieStart(url) {
17 var conn = g_env.newJsoup().connect(url);
18 conn.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1');
19 conn.timeout(60000);
20 var tag = conn.get();
21 g_cookie = conn.getCookies();
22 return tag;
23}
24
25function loadUrlCookie(url) {
26 var conn = g_env.newJsoup().connect(url);
27 conn.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1');
28 conn.timeout(60000);
29 conn.cookies(g_cookie);
30 return conn.get();
31}
32
33function loadUrl(url) {
34 var conn = g_env.newJsoup().connect(url);
35 conn.userAgent('Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1');
36 conn.timeout(60000);
37 return conn.get();
38}
39
40function run() {
41 g_env.info('Starting');
42 if (g_env.newString(g_title).length() > 0) {
43 grabTitle(g_title);
44 } else {
45 if (!g_cache) {
46 clearCache();
47 }
48 var rs = loadTitleFresh();
49 while (rs.size() > 0) {
50 for (var i = 0; i < rs.size(); i++) {
51 var et = rs.get(i);
52 grabTitle(et.getString('link'));
53 }
54 rs = loadTitleFresh();
55 }
56 }
57 g_env.info('Ending');
58}
59
60function grabTitle(link) {
61 var et = findTitleByLink(link);
62 if (et == null) return;
63 var kind = et.getString('kind');
64 if (kind == 'Book') {
65 grabBook(et.getString('title'), et.getString('link'));
66 }
67 if (kind == 'Book Series') {
68 grabBookSeries(et.getString('title'), et.getString('link'));
69 }
70 if (kind == 'Journal') {
71 grabJournal(et.getString('title'), et.getString('link'));
72 }
73 et.setMark('crawled');
74 et.save();
75}
76
77function grabJournal(p_title, p_link) {
78 try {
79 var doc = loadUrl(p_link);
80 var vols_link = g_env.newArrayList();
81 var vols_title = g_env.newArrayList();
82 var rows = doc.select('#volumeIssueData .txtBold a');
83 for (var i = 0; i < rows.size(); i++) {
84 var child = rows.get(i);
85 var title = child.text();
86 var link = child.attr('href');
87 link = g_env.newString(g_env.newURL(g_env.newURL(p_link), link) + '');
88 vols_link.add(link);
89 vols_title.add(title);
90 }
91 for (var i = 0; i < vols_link.size(); i++) {
92 var titleV = vols_title.get(i);
93 var linkV = vols_link.get(i);
94 try {
95 doc = loadUrlCookieStart(linkV);
96 rows = doc.select('#bodyMainResults .resultRow');
97 for (var j = 0; j < rows.size(); j++) {
98 var row = rows.get(j);
99 child = row.select('.cLink').first();
100 if (child == null) continue;
101 var title = child.text();
102 var link = child.attr('href');
103 link = g_env.newString(g_env.newURL(g_env.newURL(p_link), link) + '');
104 var desc = '';
105 try {
106 var cdoc = loadUrlCookie(link);
107 child = cdoc.select('#section_abstract').first();
108 if (child != null) {
109 child = child.parent();
110 desc = child.text();
111 if (desc.indexOf('Abstract') == 0) {
112 desc = desc.substring(8);
113 }
114 if (desc.indexOf('Summary') == 0) {
115 desc = desc.substring(7);
116 }
117 }
118 } catch (e) {
119 g_env.error(e);
120 }
121 saveArticle(title + ' | ' + titleV + ' | ' + p_title, link, desc);
122 }
123 } catch (e) {
124 g_env.error(e);
125 }
126 }
127 } catch (e) {
128 g_env.error(e);
129 }
130}
131
132function grabBook(p_title, p_link) {
133 try {
134 var doc = loadUrlCookieStart(p_link);
135 var rows = doc.select('.contentMain .nonSerialResultsList .cLink');
136 for (var j = 0; j < rows.size(); j++) {
137 var row = rows.get(j);
138 child = row.select('.cLink').first();
139 if (child == null) continue;
140 var title = child.text();
141 var link = child.attr('href');
142 link = g_env.newString(g_env.newURL(g_env.newURL(p_link), link) + '');
143 var desc = '';
144 try {
145 var cdoc = loadUrlCookie(link);
146 child = cdoc.select('#section_abstract').first();
147 if (child != null) {
148 child = child.parent();
149 desc = child.text();
150 if (desc.indexOf('Abstract') == 0) {
151 desc = desc.substring(8);
152 }
153 if (desc.indexOf('Summary') == 0) {
154 desc = desc.substring(7);
155 }
156 }
157 } catch (e) {
158 g_env.error(e);
159 }
160 saveArticle(title + ' | ' + p_title, link, desc);
161 }
162 } catch (e) {
163 g_env.error(e);
164 }
165}
166
167function grabBookSeries(p_title, p_link) {
168 try {
169 var doc = loadUrl(p_link);
170 var vols_link = g_env.newArrayList();
171 var vols_title = g_env.newArrayList();
172 var rows = doc.select('#volumeIssueData .txt');
173 for (var i = 0; i < rows.size(); i++) {
174 var row = rows.get(i);
175 child = row.select('a').first();
176 var title = '';
177 var link = '';
178 if (child == null) {
179 child = row.select('span').first();
180 if (child == null) continue;
181 title = child.text();
182 link = p_link;
183 } else {
184 title = child.text();
185 link = child.attr('href');
186 link = g_env.newString(g_env.newURL(g_env.newURL(p_link), link) + '');
187 }
188 vols_link.add(link);
189 vols_title.add(title);
190 }
191 for (var i = 0; i < vols_link.size(); i++) {
192 var titleV = vols_title.get(i);
193 var linkV = vols_link.get(i);
194 try {
195 doc = loadUrlCookieStart(linkV);
196 rows = doc.select('#bodyMainResults .resultRow');
197 for (var j = 0; j < rows.size(); j++) {
198 var row = rows.get(j);
199 child = row.select('.cLink').first();
200 if (child == null) continue;
201 var title = child.text();
202 var link = child.attr('href');
203 link = g_env.newString(g_env.newURL(g_env.newURL(p_link), link) + '');
204 var desc = '';
205 try {
206 var cdoc = loadUrlCookie(link);
207 child = cdoc.select('#section_abstract').first();
208 if (child != null) {
209 child = child.parent();
210 desc = child.text();
211 if (desc.indexOf('Abstract') == 0) {
212 desc = desc.substring(8);
213 }
214 if (desc.indexOf('Summary') == 0) {
215 desc = desc.substring(7);
216 }
217 }
218 } catch (e) {
219 g_env.error(e);
220 }
221 saveArticle(title + ' | ' + titleV + ' | ' + p_title, link, desc);
222 }
223 } catch (e) {
224 g_env.error(e);
225 }
226 }
227 } catch (e) {
228 g_env.error(e);
229 }
230}
231
232function saveArticle(title, link, desc) {
233 var src = findLink(link);
234 if (src != null) return;
235 var schema = 's|url|a|title|a|desc|s|fixed|d|score|s|site|s|inbound|s|code';
236 var entity = newEntity();
237 entity.setSchema(schema);
238 entity.setKind('Link');
239 entity.setId(g_env.uniqid());
240 entity.setString('url', link);
241 entity.setString('title', title);
242 entity.setString('desc', desc);
243 entity.setString('fixed', 'false');
244 entity.setString('inbound', '');
245 entity.setDouble('score', 0);
246 entity.setString('code', g_env.suniqid());
247 try {
248 var t_url = g_env.newURL(link);
249 var t_host = t_url.getHost();
250 entity.setString('site', t_host);
251 } catch (e) {
252 g_env.error(e);
253 }
254 entity.save();
255
256 var op = '\r\nTitle: ' + title;
257 op += '\r\nLink: ' + link;
258 op += '\r\nDesc: ' + desc;
259 g_env.info(op);
260}
261
262function clearCache() {
263 g_env.info('Start clearing cache');
264 var rs = loadTitleCrawled();
265 while (rs.size() > 0) {
266 for (var i = 0; i < rs.size(); i++) {
267 var et = rs.get(i);
268 et.setMark('');
269 et.save();
270 }
271 rs = loadTitleCrawled();
272 }
273 g_env.info('End clearing cache');
274}
275
276function loadTitleCrawled() {
277 var pat = newEntity();
278 var bq = pat.newBooleanQuery();
279 bq.add(pat.newBooleanClause(pat.newTermQuery(pat.newTerm(pat.MARK, 'crawled')), pat.occurMust()));
280 var rs = pat.search(g_site + '_Title', bq, 10);
281 return rs;
282}
283
284function loadTitleFresh() {
285 var pat = newEntity();
286 var bq = pat.newBooleanQuery();
287 bq.add(pat.newBooleanClause(pat.newMatchAllDocsQuery(), pat.occurMust()));
288 bq.add(pat.newBooleanClause(pat.newTermQuery(pat.newTerm(pat.MARK, 'crawled')), pat.occurMustNot()));
289 var rs = pat.search(g_site + '_Title', bq, 10);
290 return rs;
291}
292
293function findTitleByLink(link) {
294 var pat = newEntity();
295 var res = pat.search(g_site + '_Title', pat.newTermQuery(pat.newTerm('link', link)), 1);
296 if (res.size() == 0) return null;
297 return res.get(0);
298}
299
300function findLink(link) {
301 var pat = newEntity();
302 var res = pat.search('Link', pat.newTermQuery(pat.newTerm('url', link)), 1);
303 if (res.size() == 0) return null;
304 return res.get(0);
305}
// Link of a single title to crawl; run() passes it to grabTitle(). When it is
// empty, run() instead loops over every title not yet marked 'crawled'.
var g_title = '';
// When false (and g_title is empty), run() calls clearCache() before crawling.
var g_cache = true;
// Prefix of the entity kind (g_site + '_Title') searched for title records.
var g_site = 'sciencedirect.com';
// Sandbox environment object, assigned in main().
var g_env;
// Cookies stored by loadUrlCookieStart() and sent again by loadUrlCookie().
var g_cookie;

/**
 * Script entry point: keeps the environment object in g_env, then starts the
 * crawl via run().
 *
 * @param p_env  environment object stored in g_env
 * @param p_args script arguments (not read by this function)
 */
function main(p_env, p_args) {
  g_env = p_env;
  run();
}

/**
 * Shorthand for g_env.newEntity().
 *
 * @returns whatever g_env.newEntity() returns
 */
function newEntity() {
  var fresh = g_env.newEntity();
  return fresh;
}

/**
 * Fetches a page and stores the connection's cookies in g_cookie so that
 * loadUrlCookie() can send them with later requests.
 *
 * @param url address to fetch
 * @returns the value returned by the connection's get()
 */
function loadUrlCookieStart(url) {
  var browserId = 'Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1';
  var waitMillis = 60000;
  var session = g_env.newJsoup().connect(url);
  session.userAgent(browserId);
  session.timeout(waitMillis);
  var page = session.get();
  // Cookies are read only after the request has been made.
  g_cookie = session.getCookies();
  return page;
}

/**
 * Fetches a page, sending along the cookies currently held in g_cookie.
 *
 * @param url address to fetch
 * @returns the value returned by the connection's get()
 */
function loadUrlCookie(url) {
  var browserId = 'Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1';
  var waitMillis = 60000;
  var session = g_env.newJsoup().connect(url);
  session.userAgent(browserId);
  session.timeout(waitMillis);
  session.cookies(g_cookie);
  var page = session.get();
  return page;
}

/**
 * Fetches a page without touching the stored cookies.
 *
 * @param url address to fetch
 * @returns the value returned by the connection's get()
 */
function loadUrl(url) {
  var browserId = 'Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1';
  var waitMillis = 60000;
  var session = g_env.newJsoup().connect(url);
  session.userAgent(browserId);
  session.timeout(waitMillis);
  var page = session.get();
  return page;
}

/**
 * Drives the crawl. With a non-empty g_title only that title is grabbed.
 * Otherwise the cache is cleared when g_cache is false, and batches from
 * loadTitleFresh() are processed until a batch comes back empty.
 */
function run() {
  g_env.info('Starting');
  var singleTitle = g_env.newString(g_title).length() > 0;
  if (singleTitle) {
    grabTitle(g_title);
  } else {
    if (!g_cache) {
      clearCache();
    }
    // Query again after each batch and stop on the first empty one.
    for (var batch = loadTitleFresh(); batch.size() > 0; batch = loadTitleFresh()) {
      for (var k = 0; k < batch.size(); k++) {
        var record = batch.get(k);
        grabTitle(record.getString('link'));
      }
    }
  }
  g_env.info('Ending');
}

/**
 * Looks up the stored title record for a link, dispatches on its 'kind'
 * ('Book', 'Book Series' or 'Journal') to the matching grab function, then
 * marks the record 'crawled' and saves it. Does nothing when no record exists.
 *
 * @param link link of the title to process
 */
function grabTitle(link) {
  var record = findTitleByLink(link);
  if (record == null) {
    return;
  }
  var kind = record.getString('kind');
  // The three kinds are mutually exclusive, so at most one branch runs.
  if (kind == 'Book') {
    grabBook(record.getString('title'), record.getString('link'));
  } else if (kind == 'Book Series') {
    grabBookSeries(record.getString('title'), record.getString('link'));
  } else if (kind == 'Journal') {
    grabJournal(record.getString('title'), record.getString('link'));
  }
  // The record is marked even when its kind matched none of the above.
  record.setMark('crawled');
  record.save();
}

/**
 * Crawls a journal: collects the volume/issue links from the journal page,
 * opens each volume, and for every article row fetches the article page,
 * extracts its abstract text and passes the result to saveArticle().
 * Errors are reported through g_env.error at the level where they occur:
 * a failed article fetch still saves the article with an empty description,
 * while a failed volume is skipped.
 *
 * @param p_title journal title, appended to every saved article title
 * @param p_link  journal page URL; also the base for resolving relative links
 */
function grabJournal(p_title, p_link) {
  try {
    var indexPage = loadUrl(p_link);
    var volumeLinks = g_env.newArrayList();
    var volumeTitles = g_env.newArrayList();

    // Pass 1: remember every volume's absolute link and title.
    var anchors = indexPage.select('#volumeIssueData .txtBold a');
    for (var a = 0; a < anchors.size(); a++) {
      var anchor = anchors.get(a);
      var volumeName = anchor.text();
      var volumeHref = anchor.attr('href');
      var volumeUrl = g_env.newString(g_env.newURL(g_env.newURL(p_link), volumeHref) + '');
      volumeLinks.add(volumeUrl);
      volumeTitles.add(volumeName);
    }

    // Pass 2: open each volume and save its articles.
    for (var v = 0; v < volumeLinks.size(); v++) {
      var titleV = volumeTitles.get(v);
      var linkV = volumeLinks.get(v);
      try {
        var volumePage = loadUrlCookieStart(linkV);
        var resultRows = volumePage.select('#bodyMainResults .resultRow');
        for (var r = 0; r < resultRows.size(); r++) {
          var articleAnchor = resultRows.get(r).select('.cLink').first();
          if (articleAnchor == null) {
            continue;
          }
          var articleTitle = articleAnchor.text();
          var articleHref = articleAnchor.attr('href');
          var articleUrl = g_env.newString(g_env.newURL(g_env.newURL(p_link), articleHref) + '');
          var summary = '';
          try {
            var articlePage = loadUrlCookie(articleUrl);
            var abstractMark = articlePage.select('#section_abstract').first();
            if (abstractMark != null) {
              // The text is taken from the marker's parent element.
              summary = abstractMark.parent().text();
              // Drop a leading 'Abstract' heading, then a leading 'Summary'.
              if (summary.indexOf('Abstract') == 0) {
                summary = summary.substring(8);
              }
              if (summary.indexOf('Summary') == 0) {
                summary = summary.substring(7);
              }
            }
          } catch (articleError) {
            g_env.error(articleError);
          }
          saveArticle(articleTitle + ' | ' + titleV + ' | ' + p_title, articleUrl, summary);
        }
      } catch (volumeError) {
        g_env.error(volumeError);
      }
    }
  } catch (journalError) {
    g_env.error(journalError);
  }
}

/**
 * Crawls a book: opens the book page (storing its cookies), and for every
 * chapter link fetches the chapter page, extracts its abstract text and
 * passes the result to saveArticle(). A failed chapter fetch is reported via
 * g_env.error and the chapter is still saved with an empty description.
 *
 * @param p_title book title, appended to every saved chapter title
 * @param p_link  book page URL; also the base for resolving relative links
 */
function grabBook(p_title, p_link) {
  try {
    // Declared locally: the original assigned to an undeclared `child`, which
    // created an implicit global (and is a ReferenceError in strict mode).
    var child;
    var doc = loadUrlCookieStart(p_link);
    var rows = doc.select('.contentMain .nonSerialResultsList .cLink');
    for (var j = 0; j < rows.size(); j++) {
      var row = rows.get(j);
      child = row.select('.cLink').first();
      if (child == null) continue;
      var title = child.text();
      var link = child.attr('href');
      link = g_env.newString(g_env.newURL(g_env.newURL(p_link), link) + '');
      var desc = '';
      try {
        var cdoc = loadUrlCookie(link);
        child = cdoc.select('#section_abstract').first();
        if (child != null) {
          // The text is taken from the marker's parent element.
          child = child.parent();
          desc = child.text();
          // Drop a leading 'Abstract' heading, then a leading 'Summary'.
          if (desc.indexOf('Abstract') == 0) {
            desc = desc.substring(8);
          }
          if (desc.indexOf('Summary') == 0) {
            desc = desc.substring(7);
          }
        }
      } catch (e) {
        g_env.error(e);
      }
      saveArticle(title + ' | ' + p_title, link, desc);
    }
  } catch (e) {
    g_env.error(e);
  }
}

/**
 * Crawls a book series: collects the volume entries from the series page,
 * opens each volume, and for every article row fetches the article page,
 * extracts its abstract text and passes the result to saveArticle().
 * A volume entry without a link (only a span) is crawled at the series page
 * URL itself. Errors are reported through g_env.error at the level where
 * they occur.
 *
 * @param p_title series title, appended to every saved article title
 * @param p_link  series page URL; also the base for resolving relative links
 */
function grabBookSeries(p_title, p_link) {
  try {
    // Declared locally: the original assigned to an undeclared `child`, which
    // created an implicit global (and is a ReferenceError in strict mode).
    var child;
    var doc = loadUrl(p_link);
    var vols_link = g_env.newArrayList();
    var vols_title = g_env.newArrayList();
    var rows = doc.select('#volumeIssueData .txt');
    for (var i = 0; i < rows.size(); i++) {
      var row = rows.get(i);
      child = row.select('a').first();
      var title = '';
      var link = '';
      if (child == null) {
        // No anchor: fall back to the span text and reuse the series URL.
        child = row.select('span').first();
        if (child == null) continue;
        title = child.text();
        link = p_link;
      } else {
        title = child.text();
        link = child.attr('href');
        link = g_env.newString(g_env.newURL(g_env.newURL(p_link), link) + '');
      }
      vols_link.add(link);
      vols_title.add(title);
    }
    for (var i = 0; i < vols_link.size(); i++) {
      var titleV = vols_title.get(i);
      var linkV = vols_link.get(i);
      try {
        doc = loadUrlCookieStart(linkV);
        rows = doc.select('#bodyMainResults .resultRow');
        for (var j = 0; j < rows.size(); j++) {
          var row = rows.get(j);
          child = row.select('.cLink').first();
          if (child == null) continue;
          var title = child.text();
          var link = child.attr('href');
          link = g_env.newString(g_env.newURL(g_env.newURL(p_link), link) + '');
          var desc = '';
          try {
            var cdoc = loadUrlCookie(link);
            child = cdoc.select('#section_abstract').first();
            if (child != null) {
              // The text is taken from the marker's parent element.
              child = child.parent();
              desc = child.text();
              // Drop a leading 'Abstract' heading, then a leading 'Summary'.
              if (desc.indexOf('Abstract') == 0) {
                desc = desc.substring(8);
              }
              if (desc.indexOf('Summary') == 0) {
                desc = desc.substring(7);
              }
            }
          } catch (e) {
            g_env.error(e);
          }
          saveArticle(title + ' | ' + titleV + ' | ' + p_title, link, desc);
        }
      } catch (e) {
        g_env.error(e);
      }
    }
  } catch (e) {
    g_env.error(e);
  }
}

/**
 * Stores an article as a new 'Link' entity unless findLink() already returns
 * one for the same URL, then logs the title, link and description.
 *
 * @param title article title to store
 * @param link  article URL; also used to derive the 'site' (host) field
 * @param desc  article description/abstract text
 */
function saveArticle(title, link, desc) {
  if (findLink(link) != null) {
    return;
  }

  var record = newEntity();
  record.setSchema('s|url|a|title|a|desc|s|fixed|d|score|s|site|s|inbound|s|code');
  record.setKind('Link');
  record.setId(g_env.uniqid());
  record.setString('url', link);
  record.setString('title', title);
  record.setString('desc', desc);
  record.setString('fixed', 'false');
  record.setString('inbound', '');
  record.setDouble('score', 0);
  record.setString('code', g_env.suniqid());
  try {
    // A link that cannot be parsed leaves 'site' unset; the record is saved anyway.
    record.setString('site', g_env.newURL(link).getHost());
  } catch (hostError) {
    g_env.error(hostError);
  }
  record.save();

  g_env.info('\r\nTitle: ' + title + '\r\nLink: ' + link + '\r\nDesc: ' + desc);
}

/**
 * Resets the mark of every title record currently marked 'crawled'.
 * Records come in batches from loadTitleCrawled(); the loop ends when a
 * batch comes back empty.
 */
function clearCache() {
  g_env.info('Start clearing cache');
  for (var batch = loadTitleCrawled(); batch.size() > 0; batch = loadTitleCrawled()) {
    for (var k = 0; k < batch.size(); k++) {
      var record = batch.get(k);
      record.setMark('');
      record.save();
    }
  }
  g_env.info('End clearing cache');
}

/**
 * Searches the g_site + '_Title' kind for records whose mark field equals
 * 'crawled', asking for at most 10 results.
 *
 * @returns the result of the entity search
 */
function loadTitleCrawled() {
  var probe = newEntity();
  var query = probe.newBooleanQuery();
  var markTerm = probe.newTerm(probe.MARK, 'crawled');
  var markQuery = probe.newTermQuery(markTerm);
  var required = probe.occurMust();
  query.add(probe.newBooleanClause(markQuery, required));
  return probe.search(g_site + '_Title', query, 10);
}

/**
 * Searches the g_site + '_Title' kind for records NOT marked 'crawled'
 * (match-all combined with a must-not clause on the mark field), asking for
 * at most 10 results.
 *
 * @returns the result of the entity search
 */
function loadTitleFresh() {
  var probe = newEntity();
  var query = probe.newBooleanQuery();

  // A must-not clause is paired with a match-all clause.
  var everything = probe.newMatchAllDocsQuery();
  query.add(probe.newBooleanClause(everything, probe.occurMust()));

  var markTerm = probe.newTerm(probe.MARK, 'crawled');
  var markQuery = probe.newTermQuery(markTerm);
  query.add(probe.newBooleanClause(markQuery, probe.occurMustNot()));

  return probe.search(g_site + '_Title', query, 10);
}

/**
 * Finds the title record whose 'link' field equals the given link.
 *
 * @param link link value to search for
 * @returns the first matching record, or null when the search is empty
 */
function findTitleByLink(link) {
  var probe = newEntity();
  var byLink = probe.newTermQuery(probe.newTerm('link', link));
  var hits = probe.search(g_site + '_Title', byLink, 1);
  return hits.size() == 0 ? null : hits.get(0);
}

/**
 * Finds the stored 'Link' entity whose 'url' field equals the given link.
 *
 * @param link URL to search for
 * @returns the first matching entity, or null when the search is empty
 */
function findLink(link) {
  var probe = newEntity();
  var byUrl = probe.newTermQuery(probe.newTerm('url', link));
  var hits = probe.search('Link', byUrl, 1);
  return hits.size() == 0 ? null : hits.get(0);
}

  Protected by Copyscape Online Copyright Protection

Grab book/journal from ScienceDirect

Grab book/journal from ScienceDirect
This task uses a JavaScript sandbox with jsoup and Lucene support to grab books/journals from ScienceDirect.
Grab book/journal from ScienceDirect
  1. Create a JavaScript sandbox with jsoup support
  2. Add Lucene support to the JavaScript sandbox
  3. Create the JavaScript as follows
javascript
1var g_site = 'sciencedirect.com';
2var g_env;
3
4function main(p_env, p_args) {
5 g_env = p_env;
6 run();
7}
8
9function newEntity() {
10 return g_env.newEntity();
11}
12
13function loadUrl(url) {
14 var conn = g_env.newJsoup().connect(url);
15 conn.userAgent('Mozilla/5.0 (Windows NT x.y; rv:10.0.1) Gecko/20100101 Firefox/10.0.1');
16 conn.timeout(60000);
17 return conn.get();
18}
19
20function run() {
21 g_env.info('Starting');
22 grabCategory('http://www.sciencedirect.com/science/browse/sub/physicalsciences');
23 grabCategory('http://www.sciencedirect.com/science/browse/sub/chemicaleng');
24 grabCategory('http://www.sciencedirect.com/science/browse/sub/chemistry');
25 grabCategory('http://www.sciencedirect.com/science/browse/sub/computerscience');
26 grabCategory('http://www.sciencedirect.com/science/browse/sub/earth');
27 grabCategory('http://www.sciencedirect.com/science/browse/sub/energy');
28 grabCategory('http://www.sciencedirect.com/science/browse/sub/engineering');
29 grabCategory('http://www.sciencedirect.com/science/browse/sub/materialsscience');
30 grabCategory('http://www.sciencedirect.com/science/browse/sub/mathematics');
31 grabCategory('http://www.sciencedirect.com/science/browse/sub/physics');
32 grabCategory('http://www.sciencedirect.com/science/browse/sub/lifesciences');
33 grabCategory('http://www.sciencedirect.com/science/browse/sub/agribio');
34 grabCategory('http://www.sciencedirect.com/science/browse/sub/biochemgenmolbiol');
35 grabCategory('http://www.sciencedirect.com/science/browse/sub/environmental');
36 grabCategory('http://www.sciencedirect.com/science/browse/sub/immunolmicrobiol');
37 grabCategory('http://www.sciencedirect.com/science/browse/sub/neuroscience');
38 grabCategory('http://www.sciencedirect.com/science/browse/sub/healthsciences');
39 grabCategory('http://www.sciencedirect.com/science/browse/sub/medicinedentistry');
40 grabCategory('http://www.sciencedirect.com/science/browse/sub/nursinghealth');
41 grabCategory('http://www.sciencedirect.com/science/browse/sub/pharmatox');
42 grabCategory('http://www.sciencedirect.com/science/browse/sub/vetscimed');
43 grabCategory('http://www.sciencedirect.com/science/browse/sub/socialscienceshumanities');
44 grabCategory('http://www.sciencedirect.com/science/browse/sub/artsandhumanities');
45 grabCategory('http://www.sciencedirect.com/science/browse/sub/busmanacc');
46 grabCategory('http://www.sciencedirect.com/science/browse/sub/decisionsciences');
47 grabCategory('http://www.sciencedirect.com/science/browse/sub/economics');
48 grabCategory('http://www.sciencedirect.com/science/browse/sub/psychology');
49 grabCategory('http://www.sciencedirect.com/science/browse/sub/socialsciences');
50 g_env.info('Ending');
51}
52
53function grabCategory(cat) {
54 try {
55 var pages = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0-9'];
56 for (var pn = 0; pn < pages.length; pn++) {
57 try {
58 var url = cat + '/' + pages[pn];
59 var doc = loadUrl(url);
60 var rows = doc.select('#content_browseimp tr.browseimpBrowseRow');
61 for (var i = 0; i < rows.size(); i++) {
62 var row = rows.get(i);
63 var title = g_env.newString('');
64 var link = g_env.newString('');
65 var kind = g_env.newString('');
66 var child = row.select('.browseColFirst a').first();
67 if (child != null) {
68 title = child.text();
69 link = child.attr('href');
70 link = g_env.newString(g_env.newURL(g_env.newURL(url), link) + '');
71 }
72 var child = row.select('.browseColFourth').first();
73 if (child != null) {
74 kind = child.text().trim();
75 }
76 if (title.length() > 0 && link.length() > 0 && kind.length() > 0) {
77 saveTitle(title, link, kind);
78 }
79 }
80 } catch (e) {
81 g_env.error(e);
82 }
83 }
84 } catch (e) {
85 g_env.error(e);
86 }
87}
88
89function saveTitle(title, link, kind) {
90 if (findTitleByLink(link) != null) return;
91 var schema = 's|link|s|title|s|kind';
92 var entity = newEntity();
93 entity.setSchema(schema);
94 entity.setKind(g_site + '_Title');
95 entity.setId(g_env.uniqid());
96 entity.setString('link', link);
97 entity.setString('title', title);
98 entity.setString('kind', kind);
99 entity.save();
100 g_env.info(kind + ' | ' + title + ' | ' + link);
101}
102
103function findTitleByLink(link) {
104 var pat = newEntity();
105 var res = pat.search(g_site + '_Title', pat.newTermQuery(pat.newTerm('link', link)), 1);
106 if (res.size() == 0) return null;
107 return res.get(0);
108}
// Site identifier for this crawl.
var g_site = 'sciencedirect.com';
// Sandbox environment object, assigned in main().
var g_env;

/**
 * Script entry point: keeps the environment object in g_env, then starts the
 * crawl via run().
 *
 * @param p_env  environment object stored in g_env
 * @param p_args script arguments (not read by this function)
 */
function main(p_env, p_args) {
  g_env = p_env;
  run();
}

/**
 * Shorthand for g_env.newEntity().
 *
 * @returns whatever g_env.newEntity() returns
 */
function newEntity() {
  var fresh = g_env.newEntity();
  return fresh;
}

/**
 * Downloads a page through the sandbox's jsoup connection and returns
 * whatever the connection's get() yields (the parsed document).
 *
 * @param url address of the page to fetch
 * @returns the result of the connection's get() call
 */
function loadUrl(url) {
  var browserId = 'Mozilla/5.0 (Windows NT x.y; rv:10.0.1) Gecko/20100101 Firefox/10.0.1';
  var timeout = 60000;

  var connection = g_env.newJsoup().connect(url);
  connection.userAgent(browserId);
  connection.timeout(timeout);

  var page = connection.get();
  return page;
}

/**
 * Crawls every ScienceDirect subject category in a fixed order.
 * Logs 'Starting' before the first category and 'Ending' after the last.
 */
function run() {
  // Category browse pages, visited in exactly this order.
  var categories = [
    'http://www.sciencedirect.com/science/browse/sub/physicalsciences',
    'http://www.sciencedirect.com/science/browse/sub/chemicaleng',
    'http://www.sciencedirect.com/science/browse/sub/chemistry',
    'http://www.sciencedirect.com/science/browse/sub/computerscience',
    'http://www.sciencedirect.com/science/browse/sub/earth',
    'http://www.sciencedirect.com/science/browse/sub/energy',
    'http://www.sciencedirect.com/science/browse/sub/engineering',
    'http://www.sciencedirect.com/science/browse/sub/materialsscience',
    'http://www.sciencedirect.com/science/browse/sub/mathematics',
    'http://www.sciencedirect.com/science/browse/sub/physics',
    'http://www.sciencedirect.com/science/browse/sub/lifesciences',
    'http://www.sciencedirect.com/science/browse/sub/agribio',
    'http://www.sciencedirect.com/science/browse/sub/biochemgenmolbiol',
    'http://www.sciencedirect.com/science/browse/sub/environmental',
    'http://www.sciencedirect.com/science/browse/sub/immunolmicrobiol',
    'http://www.sciencedirect.com/science/browse/sub/neuroscience',
    'http://www.sciencedirect.com/science/browse/sub/healthsciences',
    'http://www.sciencedirect.com/science/browse/sub/medicinedentistry',
    'http://www.sciencedirect.com/science/browse/sub/nursinghealth',
    'http://www.sciencedirect.com/science/browse/sub/pharmatox',
    'http://www.sciencedirect.com/science/browse/sub/vetscimed',
    'http://www.sciencedirect.com/science/browse/sub/socialscienceshumanities',
    'http://www.sciencedirect.com/science/browse/sub/artsandhumanities',
    'http://www.sciencedirect.com/science/browse/sub/busmanacc',
    'http://www.sciencedirect.com/science/browse/sub/decisionsciences',
    'http://www.sciencedirect.com/science/browse/sub/economics',
    'http://www.sciencedirect.com/science/browse/sub/psychology',
    'http://www.sciencedirect.com/science/browse/sub/socialsciences'
  ];

  g_env.info('Starting');
  for (var idx = 0; idx < categories.length; idx++) {
    grabCategory(categories[idx]);
  }
  g_env.info('Ending');
}

/**
 * Walks the alphabetical browse pages ('a'..'z' and '0-9') of one subject
 * category and hands every row that has a title, a link and a kind to
 * saveTitle().
 *
 * A failure on one page is logged through g_env.error and the crawl moves
 * on to the next page.
 *
 * @param cat base URL of the category; '/<page>' is appended for each page
 */
function grabCategory(cat) {
  try {
    var letters = 'abcdefghijklmnopqrstuvwxyz'.split('').concat(['0-9']);
    for (var p = 0; p < letters.length; p++) {
      try {
        var pageUrl = cat + '/' + letters[p];
        var page = loadUrl(pageUrl);
        var rowList = page.select('#content_browseimp tr.browseimpBrowseRow');
        for (var r = 0; r < rowList.size(); r++) {
          var entry = rowList.get(r);
          var title = g_env.newString('');
          var link = g_env.newString('');
          var kind = g_env.newString('');

          // First column: anchor carrying the title text and the (possibly relative) href.
          var anchor = entry.select('.browseColFirst a').first();
          if (anchor != null) {
            title = anchor.text();
            var href = anchor.attr('href');
            // Resolve the href against the page URL to get an absolute link.
            var base = g_env.newURL(pageUrl);
            link = g_env.newString(g_env.newURL(base, href) + '');
          }

          // Fourth column: text stored as the "kind" of the title.
          var kindCell = entry.select('.browseColFourth').first();
          if (kindCell != null) {
            kind = kindCell.text().trim();
          }

          // Rows missing any of the three values are skipped.
          if (title.length() == 0 || link.length() == 0 || kind.length() == 0) {
            continue;
          }
          saveTitle(title, link, kind);
        }
      } catch (pageErr) {
        g_env.error(pageErr);
      }
    }
  } catch (err) {
    g_env.error(err);
  }
}

/**
 * Stores one title as an entity of kind g_site + '_Title', unless an entity
 * with the same link is already found by findTitleByLink(). Logs the saved
 * values through g_env.info.
 *
 * @param title title text
 * @param link  link of the title; also the value checked for duplicates
 * @param kind  kind text taken from the browse row
 */
function saveTitle(title, link, kind) {
  if (findTitleByLink(link) != null) {
    // Already stored: nothing to do.
    return;
  }

  var record = newEntity();
  record.setSchema('s|link|s|title|s|kind');
  record.setKind(g_site + '_Title');
  record.setId(g_env.uniqid());
  record.setString('link', link);
  record.setString('title', title);
  record.setString('kind', kind);
  record.save();

  var line = kind + ' | ' + title + ' | ' + link;
  g_env.info(line);
}

/**
 * Looks up a stored title entity by its 'link' field.
 *
 * @param link link value to search for
 * @returns the first search hit, or null when the search returns nothing
 */
function findTitleByLink(link) {
  var probe = newEntity();
  var term = probe.newTerm('link', link);
  var query = probe.newTermQuery(term);
  var hits = probe.search(g_site + '_Title', query, 1);
  return hits.size() == 0 ? null : hits.get(0);
}

  Protected by Copyscape Online Copyright Protection

Wednesday, 6 June 2012

Grab video from DailyMotion

Grab video from DailyMotion
This task uses a JavaScript sandbox with jsoup and Lucene support to grab videos from DailyMotion.
Grab video from DailyMotion
  1. Create javascript sandbox with jsoup support
  2. Add Lucene support to javascript sandbox
  3. Create javascript as following
javascript
1var env;
2var args;
3var maxpage = 1000;
4
5function main(penv, pargs) {
6 env = penv;
7 args = pargs;
8 env.info('Starting');
9 while (true) {
10 var queue_list = loadQueue();
11 env.info('Size: ' + queue_list.size());
12 while (queue_list.size() > 0) {
13 for (var i = 0; i < queue_list.size(); i++) {
14 var queue = queue_list.get(i);
15 grabVideo(queue);
16 queue.setString('crawled', 'true');
17 queue.save();
18 }
19 queue_list = loadQueue();
20 }
21 grabCategory('http://www.dailymotion.com/group/in_theaters_this_week/');
22 grabCategory('http://www.dailymotion.com/group/in_theaters_now/');
23 grabCategory('http://www.dailymotion.com/group/coming_soon/');
24 grabCategory('http://www.dailymotion.com/user/ReelzChannel/lang/en/search/movie+review/');
25 grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/interview+movie/channel/shortfilms/');
26 grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/movie+news/channel/shortfilms/');
27 grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/horror/channel/shortfilms/');
28 grabCategory('http://www.dailymotion.com/user/hollywoodtv/');
29 grabCategory('http://www.dailymotion.com/creative-official/tag/rock/lang/en/channel/music/');
30 grabCategory('http://www.dailymotion.com/creative-official/tag/pop/lang/en/channel/music/');
31 grabCategory('http://www.dailymotion.com/creative-official/tag/hop/lang/en/channel/music/');
32 grabCategory('http://www.dailymotion.com/creative-official/tag/alternative/lang/en/channel/music/');
33 grabCategory('http://www.dailymotion.com/creative-official/tag/dance/lang/en/channel/music/');
34 grabCategory('http://www.dailymotion.com/creative-official/tag/soul/lang/en/channel/music/');
35 grabCategory('http://www.dailymotion.com/creative-official/tag/latin/lang/en/channel/music/');
36 grabCategory('http://www.dailymotion.com/creative-official/tag/country/lang/en/channel/music/');
37 grabCategory('http://www.dailymotion.com/user/ClassicGameRoom/');
38 grabCategory('http://www.dailymotion.com/mychannel/ClassicGameRoom/');
39 grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/fight/channel/videogames/');
40 grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/strategy/channel/videogames/');
41 grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/shooter/channel/videogames/');
42 grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/action/channel/videogames/');
43 grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/sport/channel/videogames/');
44 grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/trailer/channel/videogames/');
45 grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/review/channel/videogames/');
46 grabCategory('http://www.dailymotion.com/creative-official/lang/en/search/nfl/');
47 grabCategory('http://www.dailymotion.com/creative-official/lang/en/search/nba/');
48 grabCategory('http://www.dailymotion.com/us/creative-official/lang/en/search/nhl/channel/sport/');
49 grabCategory('http://www.dailymotion.com/user/TotalCollegeSports/');
50 grabCategory('http://www.dailymotion.com/user/UFC/');
51 grabCategory('http://www.dailymotion.com/user/sportsillustrated/');
52 grabCategory('http://www.dailymotion.com/user/transworld/');
53 grabCategory('http://www.dailymotion.com/user/rooftopcomedy/');
54 grabCategory('http://www.dailymotion.com/playlist/x1qzsd_MyDamnChannel_dicki/');
55 grabCategory('http://www.dailymotion.com/playlist/x1r5r4_MyDamnChannel_easy-to-assemble-season-3/');
56 grabCategory('http://www.dailymotion.com/user/Rhettandlink/');
57 grabCategory('http://www.dailymotion.com/user/epicmealtime/');
58 grabCategory('http://www.dailymotion.com/group/familyguy/');
59 grabCategory('http://www.dailymotion.com/creative/lang/en/channel/fun/');
60 grabCategory('http://www.dailymotion.com/group/nbcnightlynews/');
61 grabCategory('http://www.dailymotion.com/user/reuters/');
62 grabCategory('http://www.dailymotion.com/user/NewsLook/');
63 grabCategory('http://www.dailymotion.com/user/NYMag/');
64 grabCategory('http://www.dailymotion.com/user/itnnews/');
65 grabCategory('http://www.dailymotion.com/user/Buzz60/');
66 grabCategory('http://www.dailymotion.com/user/associatedpress/');
67 grabCategory('http://www.dailymotion.com/us/featured/channel/news/');
68 grabCategory('http://www.dailymotion.com/user/clevvertv/');
69 grabCategory('http://www.dailymotion.com/user/tvguide/');
70 grabCategory('http://www.dailymotion.com/user/splashnews/');
71 grabCategory('http://www.dailymotion.com/user/hollywoodbackstage/');
72 grabCategory('http://www.dailymotion.com/user/celebtv/');
73 grabCategory('http://www.dailymotion.com/user/maximotv/');
74 grabCategory('http://www.dailymotion.com/user/mojosupreme/');
75 grabCategory('http://www.dailymotion.com/user/DiagonalView/');
76 grabCategory('http://www.dailymotion.com/hub/x38_Motionmaker-documentaries/');
77 grabCategory('http://www.dailymotion.com/user/tysihelp/');
78 grabCategory('http://www.dailymotion.com/user/computerTV/');
79 grabCategory('http://www.dailymotion.com/user/soldierknowsbest/');
80 grabCategory('http://www.dailymotion.com/user/appjudgment/');
81 grabCategory('http://www.dailymotion.com/user/geekbeattv/');
82 grabCategory('http://www.dailymotion.com/user/allthingsscience/');
83 grabCategory('http://www.dailymotion.com/user/stuffwelike/');
84 grabCategory('http://www.dailymotion.com/user/lifehackershow/');
85 grabCategory('http://www.dailymotion.com/us/channel/auto/');
86 }
87 env.info('Ending');
88}
89
90function grabVideo(queue) {
91 try {
92 var url = queue.getString('url');
93 var title = queue.getString('title');
94 var image = queue.getString('image');
95 var conn = env.newJsoup().connect(url).userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101');
96 var doc = conn.timeout(60000).get();
97 var child = doc.select('#video_description').first();
98 var desc = env.newString('');
99 if (child != null) {
100 desc = child.text();
101 }
102 saveLink(title, url, desc, image, '');
103 } catch (e) {
104 env.error(e);
105 }
106}
107
108function saveLink(title, url, desc, image, price) {
109 url = env.newString(url);
110 var pos = url.lastIndexOf('&feature=');
111 if (pos >= 0) {
112 url = url.substring(0, pos);
113 }
114 if (findLinkByUrl(url)) return;
115 var schema = 's|url|a|title|a|desc|s|fixed|d|score|s|site|s|image|s|price';
116 var entity = env.newEntity();
117 entity.setSchema(schema);
118 entity.setKind('Link');
119 entity.setId(env.uniqid());
120 entity.setString('url', url);
121 entity.setString('title', title);
122 entity.setString('desc', desc);
123 entity.setString('fixed', 'true');
124 entity.setDouble('score', 100);
125 entity.setString('image', image);
126 entity.setString('price', price);
127 try {
128 var t_url = env.newURL(url);
129 var t_host = t_url.getHost();
130 entity.setString('site', t_host);
131 } catch (e) {
132 env.error(e);
133 }
134 entity.save();
135 env.info(title + ' | ' + url);
136}
137
138function findLinkByUrl(url) {
139 var entity = env.newEntity();
140 var query = entity.newTermQuery(entity.newTerm('url', url));
141 var size = entity.count('Link', query, 1);
142 return (size > 0);
143}
144
145function grabCategory(catUrl) {
146 env.info('Category: ' + catUrl);
147 for (var no = 1; no <= maxpage; no++) {
148 try {
149 var conn = env.newJsoup().connect(catUrl + no + '?mode=playlist').userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101');
150 var doc = conn.timeout(60000).get();
151 var items = doc.select('.dmpi_video_item');
152 if (items.size() == 0) break;
153 for (var i = 0; i < items.size(); i++) {
154 var item = items.get(i);
155 var child = item.select('.dmpi_video_title a').first();
156 if (child == null) continue;
157 var title = child.text().trim();
158 var url = env.newString(env.newURL(env.newURL('http://www.dailymotion.com'), child.attr('href')) + '');
159 child = item.select('.dmpi_video_preview a img').first();
160 var image = env.newString('');
161 if (child != null) {
162 var tmp = env.newString(child.attr('data-spr'));
163 tmp = tmp.replaceAll('jpeg_preview_sprite.jpg', 'jpeg_preview_medium.jpg');
164 image = env.newString(env.newURL(env.newURL('http://www.dailymotion.com'), tmp) + '');
165 }
166 markVideo(title, url, image);
167 }
168 } catch (e) {
169 env.error(e);
170 }
171 }
172}
173
174function markVideo(title, url, image) {
175 if (findQueueByUrl(url)) return;
176 var schema = 's|url|s|title|s|image|s|crawled';
177 var entity = env.newEntity();
178 entity.setSchema(schema);
179 entity.setKind('Queue_DailyMotion');
180 entity.setId(env.uniqid());
181 entity.setString('url', url);
182 entity.setString('title', title);
183 entity.setString('image', image);
184 entity.setString('crawled', 'false');
185 entity.save();
186}
187
188function findQueueByUrl(url) {
189 var entity = env.newEntity();
190 var query = entity.newTermQuery(entity.newTerm('url', url));
191 var size = entity.count('Queue_DailyMotion', query, 1);
192 return (size > 0);
193}
194
195function loadQueue() {
196 var entity = env.newEntity();
197 var tag = entity.search('Queue_DailyMotion', entity.newTermQuery(entity.newTerm('crawled', 'false')), 10);
198 return tag;
199}
// Sandbox environment object; assigned from the first argument of main().
var env;
// Script arguments; assigned from the second argument of main().
var args;
// Upper bound on the page number requested per category in grabCategory().
var maxpage = 1000;

/**
 * Script entry point for the DailyMotion crawler.
 *
 * Repeats forever: drain the queue of uncrawled videos (grab each one, mark
 * it crawled, save it), then scan every category listing for new videos.
 * The outer loop has no exit condition, so the final 'Ending' log line is
 * not reached in normal operation.
 *
 * @param penv  environment object, stored in the global env
 * @param pargs script arguments, stored in the global args
 */
function main(penv, pargs) {
    env = penv;
    args = pargs;

    // Listing pages scanned after each queue drain, in this order.
    var categories = [
        'http://www.dailymotion.com/group/in_theaters_this_week/',
        'http://www.dailymotion.com/group/in_theaters_now/',
        'http://www.dailymotion.com/group/coming_soon/',
        'http://www.dailymotion.com/user/ReelzChannel/lang/en/search/movie+review/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/interview+movie/channel/shortfilms/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/movie+news/channel/shortfilms/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/horror/channel/shortfilms/',
        'http://www.dailymotion.com/user/hollywoodtv/',
        'http://www.dailymotion.com/creative-official/tag/rock/lang/en/channel/music/',
        'http://www.dailymotion.com/creative-official/tag/pop/lang/en/channel/music/',
        'http://www.dailymotion.com/creative-official/tag/hop/lang/en/channel/music/',
        'http://www.dailymotion.com/creative-official/tag/alternative/lang/en/channel/music/',
        'http://www.dailymotion.com/creative-official/tag/dance/lang/en/channel/music/',
        'http://www.dailymotion.com/creative-official/tag/soul/lang/en/channel/music/',
        'http://www.dailymotion.com/creative-official/tag/latin/lang/en/channel/music/',
        'http://www.dailymotion.com/creative-official/tag/country/lang/en/channel/music/',
        'http://www.dailymotion.com/user/ClassicGameRoom/',
        'http://www.dailymotion.com/mychannel/ClassicGameRoom/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/fight/channel/videogames/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/strategy/channel/videogames/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/shooter/channel/videogames/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/action/channel/videogames/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/sport/channel/videogames/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/trailer/channel/videogames/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/review/channel/videogames/',
        'http://www.dailymotion.com/creative-official/lang/en/search/nfl/',
        'http://www.dailymotion.com/creative-official/lang/en/search/nba/',
        'http://www.dailymotion.com/us/creative-official/lang/en/search/nhl/channel/sport/',
        'http://www.dailymotion.com/user/TotalCollegeSports/',
        'http://www.dailymotion.com/user/UFC/',
        'http://www.dailymotion.com/user/sportsillustrated/',
        'http://www.dailymotion.com/user/transworld/',
        'http://www.dailymotion.com/user/rooftopcomedy/',
        'http://www.dailymotion.com/playlist/x1qzsd_MyDamnChannel_dicki/',
        'http://www.dailymotion.com/playlist/x1r5r4_MyDamnChannel_easy-to-assemble-season-3/',
        'http://www.dailymotion.com/user/Rhettandlink/',
        'http://www.dailymotion.com/user/epicmealtime/',
        'http://www.dailymotion.com/group/familyguy/',
        'http://www.dailymotion.com/creative/lang/en/channel/fun/',
        'http://www.dailymotion.com/group/nbcnightlynews/',
        'http://www.dailymotion.com/user/reuters/',
        'http://www.dailymotion.com/user/NewsLook/',
        'http://www.dailymotion.com/user/NYMag/',
        'http://www.dailymotion.com/user/itnnews/',
        'http://www.dailymotion.com/user/Buzz60/',
        'http://www.dailymotion.com/user/associatedpress/',
        'http://www.dailymotion.com/us/featured/channel/news/',
        'http://www.dailymotion.com/user/clevvertv/',
        'http://www.dailymotion.com/user/tvguide/',
        'http://www.dailymotion.com/user/splashnews/',
        'http://www.dailymotion.com/user/hollywoodbackstage/',
        'http://www.dailymotion.com/user/celebtv/',
        'http://www.dailymotion.com/user/maximotv/',
        'http://www.dailymotion.com/user/mojosupreme/',
        'http://www.dailymotion.com/user/DiagonalView/',
        'http://www.dailymotion.com/hub/x38_Motionmaker-documentaries/',
        'http://www.dailymotion.com/user/tysihelp/',
        'http://www.dailymotion.com/user/computerTV/',
        'http://www.dailymotion.com/user/soldierknowsbest/',
        'http://www.dailymotion.com/user/appjudgment/',
        'http://www.dailymotion.com/user/geekbeattv/',
        'http://www.dailymotion.com/user/allthingsscience/',
        'http://www.dailymotion.com/user/stuffwelike/',
        'http://www.dailymotion.com/user/lifehackershow/',
        'http://www.dailymotion.com/us/channel/auto/'
    ];

    env.info('Starting');
    while (true) {
        // Phase 1: work through pending queue entries, batch by batch.
        var batch = loadQueue();
        env.info('Size: ' + batch.size());
        while (batch.size() > 0) {
            for (var q = 0; q < batch.size(); q++) {
                var pending = batch.get(q);
                grabVideo(pending);
                // Marked as crawled whether or not grabVideo logged an error.
                pending.setString('crawled', 'true');
                pending.save();
            }
            batch = loadQueue();
        }

        // Phase 2: scan the listing pages for videos to enqueue.
        for (var c = 0; c < categories.length; c++) {
            grabCategory(categories[c]);
        }
    }
    env.info('Ending');
}

/**
 * Downloads the page of one queued video, reads its description from the
 * '#video_description' element (empty when the element is missing) and
 * passes the result to saveLink().
 *
 * Any error is logged through env.error and not rethrown.
 *
 * @param queue queue entity; its 'url', 'title' and 'image' strings are read
 */
function grabVideo(queue) {
    try {
        var pageUrl = queue.getString('url');
        var videoTitle = queue.getString('title');
        var thumb = queue.getString('image');

        var request = env.newJsoup().connect(pageUrl);
        request = request.userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101');
        var page = request.timeout(60000).get();

        var descNode = page.select('#video_description').first();
        var summary = env.newString('');
        if (descNode != null) {
            summary = descNode.text();
        }

        saveLink(videoTitle, pageUrl, summary, thumb, '');
    } catch (err) {
        env.error(err);
    }
}

/**
 * Stores a 'Link' entity for a video unless one with the same URL exists.
 *
 * Everything from the last '&feature=' onward is cut off the URL before
 * the duplicate check and before storing. The 'site' field is set to the
 * URL's host; if that lookup fails the error is logged and the entity is
 * saved without it.
 *
 * @param title video title
 * @param url   video page URL
 * @param desc  description text
 * @param image thumbnail URL
 * @param price stored verbatim in the 'price' field
 */
function saveLink(title, url, desc, image, price) {
    var cleanUrl = env.newString(url);
    var cut = cleanUrl.lastIndexOf('&feature=');
    if (cut >= 0) {
        cleanUrl = cleanUrl.substring(0, cut);
    }
    if (findLinkByUrl(cleanUrl)) {
        return;
    }

    var record = env.newEntity();
    record.setSchema('s|url|a|title|a|desc|s|fixed|d|score|s|site|s|image|s|price');
    record.setKind('Link');
    record.setId(env.uniqid());
    record.setString('url', cleanUrl);
    record.setString('title', title);
    record.setString('desc', desc);
    record.setString('fixed', 'true');
    record.setDouble('score', 100);
    record.setString('image', image);
    record.setString('price', price);
    try {
        var host = env.newURL(cleanUrl).getHost();
        record.setString('site', host);
    } catch (err) {
        env.error(err);
    }
    record.save();

    env.info(title + ' | ' + cleanUrl);
}

/**
 * Tells whether a 'Link' entity with the given URL is already stored.
 *
 * @param url value matched against the 'url' field
 * @returns true when the count for the term query is greater than zero
 */
function findLinkByUrl(url) {
    var probe = env.newEntity();
    var term = probe.newTerm('url', url);
    var matches = probe.count('Link', probe.newTermQuery(term), 1);
    return matches > 0;
}

/**
 * Scans the numbered listing pages of one category and enqueues every
 * video found through markVideo().
 *
 * Pages are requested as catUrl + <page number> + '?mode=playlist', from
 * page 1 up to maxpage, stopping at the first page without any
 * '.dmpi_video_item' element. An error on a page is logged through
 * env.error and the loop continues with the next page number.
 *
 * @param catUrl category URL; the page number is appended directly to it
 */
function grabCategory(catUrl) {
    env.info('Category: ' + catUrl);
    for (var no = 1; no <= maxpage; no++) {
        try {
            var conn = env.newJsoup().connect(catUrl + no + '?mode=playlist').userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101');
            var doc = conn.timeout(60000).get();
            var items = doc.select('.dmpi_video_item');
            // An empty page means we ran past the last page of the listing.
            if (items.size() == 0) break;
            for (var i = 0; i < items.size(); i++) {
                var item = items.get(i);
                var child = item.select('.dmpi_video_title a').first();
                // Items without a title link cannot be queued.
                if (child == null) continue;
                var title = child.text().trim();
                // Resolve the (possibly relative) href against the site root.
                var url = env.newString(env.newURL(env.newURL('http://www.dailymotion.com'), child.attr('href')) + '');
                child = item.select('.dmpi_video_preview a img').first();
                var image = env.newString('');
                if (child != null) {
                    var tmp = env.newString(child.attr('data-spr'));
                    // Swap the sprite-sheet file name for the medium preview image.
                    // The method name was garbled as "replaceAl?", which is a syntax error.
                    // NOTE(review): if tmp is a java.lang.String, replaceAll treats the first
                    // argument as a regex, so '.' matches any character; harmless for this name.
                    tmp = tmp.replaceAll('jpeg_preview_sprite.jpg', 'jpeg_preview_medium.jpg');
                    image = env.newString(env.newURL(env.newURL('http://www.dailymotion.com'), tmp) + '');
                }
                markVideo(title, url, image);
            }
        } catch (e) {
            env.error(e);
        }
    }
}

/**
 * Adds a video to the 'Queue_DailyMotion' queue with crawled = 'false',
 * unless an entry with the same URL already exists.
 *
 * @param title video title
 * @param url   video page URL; also the value checked for duplicates
 * @param image thumbnail URL
 */
function markVideo(title, url, image) {
    if (findQueueByUrl(url)) {
        return;
    }

    var item = env.newEntity();
    item.setSchema('s|url|s|title|s|image|s|crawled');
    item.setKind('Queue_DailyMotion');
    item.setId(env.uniqid());
    item.setString('url', url);
    item.setString('title', title);
    item.setString('image', image);
    item.setString('crawled', 'false');
    item.save();
}

/**
 * Tells whether the 'Queue_DailyMotion' queue already holds the given URL.
 *
 * @param url value matched against the 'url' field
 * @returns true when the count for the term query is greater than zero
 */
function findQueueByUrl(url) {
    var probe = env.newEntity();
    var term = probe.newTerm('url', url);
    var matches = probe.count('Queue_DailyMotion', probe.newTermQuery(term), 1);
    return matches > 0;
}

/**
 * Searches 'Queue_DailyMotion' for entries whose 'crawled' field is 'false'.
 *
 * @returns the search result; 10 is passed as the last search argument
 *          (presumably the maximum number of hits - confirm against the entity API)
 */
function loadQueue() {
    var probe = env.newEntity();
    var pending = probe.newTermQuery(probe.newTerm('crawled', 'false'));
    return probe.search('Queue_DailyMotion', pending, 10);
}

  Protected by Copyscape Online Copyright Protection

Grab video from YouTube

Grab video from YouTube
This task uses a JavaScript sandbox with jsoup and Lucene support to grab videos from YouTube.
Grab video from YouTube
  1. Create javascript sandbox with jsoup support
  2. Add Lucene support to javascript sandbox
  3. Create javascript as following
javascript
1var env;
2var args;
3
4function main(penv, pargs) {
5 env = penv;
6 args = pargs;
7 env.info('Starting');
8 while (true) {
9 var queue_list = loadQueue();
10 env.info('Size: ' + queue_list.size());
11 while (queue_list.size() > 0) {
12 for (var i = 0; i < queue_list.size(); i++) {
13 var queue = queue_list.get(i);
14 grabVideo(queue);
15 queue.setString('crawled', 'true');
16 queue.save();
17 }
18 queue_list = loadQueue();
19 }
20 grabCategory('http://www.youtube.com/autos');
21 grabCategory('http://www.youtube.com/comedy');
22 grabCategory('http://www.youtube.com/entertainment');
23 grabCategory('http://www.youtube.com/film');
24 grabCategory('http://www.youtube.com/gaming');
25 grabCategory('http://www.youtube.com/howto');
26 grabCategory('http://www.youtube.com/activism');
27 grabCategory('http://www.youtube.com/people');
28 grabCategory('http://www.youtube.com/pets');
29 grabCategory('http://www.youtube.com/science');
30 grabCategory('http://www.youtube.com/videos?c=17');
31 grabCategory('http://www.youtube.com/travel');
32 }
33 env.info('Ending');
34}
35
36function grabVideo(queue) {
37 try {
38 var url = queue.getString('url');
39 var title = queue.getString('title');
40 var image = queue.getString('image');
41 var conn = env.newJsoup().connect(url).userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101');
42 var doc = conn.timeout(60000).get();
43 var child = doc.select('#eow-description').first();
44 var desc = env.newString('');
45 if (child != null) {
46 desc = child.text();
47 } else {
48 child = doc.select('#ded').first();
49 if (child != null) {
50 desc = child.text();
51 }
52 }
53 saveLink(title, url, desc, image, '');
54
55 var html = doc.html();
56 var pos1 = html.indexOf('var rvl =');
57 var pos2 = html.indexOf('var cml =');
58 if (pos1 < 0 || pos2 < 0 || pos1 >= pos2) return;
59 var js1 = html.substring(pos1 + 9, pos2);
60 var obj1 = null;
61 eval('obj1 = ' + js1);
62 if (obj1 == null) return;
63 for (var i = 0; i < obj1.length; i++) {
64 var item = obj1[i];
65 var url2 = env.newString('http://www.youtube.com/watch?v=' + item.k);
66 var title2 = item.t;
67 var image2 = env.newString(env.newURL(env.newURL('http://www.youtube.com'), item.i));
68 markVideo(title2, url2, image2);
69 env.info('Video: ' + title2 + ' | ' + url2 + ' | ' + image2);
70 }
71 } catch (e) {
72 env.error(e);
73 }
74}
75
76function saveLink(title, url, desc, image, price) {
77 url = env.newString(url);
78 var pos = url.lastIndexOf('&feature=');
79 if (pos >= 0) {
80 url = url.substring(0, pos);
81 }
82 if (findLinkByUrl(url)) return;
83 var schema = 's|url|a|title|a|desc|s|fixed|d|score|s|site|s|image|s|price';
84 var entity = env.newEntity();
85 entity.setSchema(schema);
86 entity.setKind('Link');
87 entity.setId(env.uniqid());
88 entity.setString('url', url);
89 entity.setString('title', title);
90 entity.setString('desc', desc);
91 entity.setString('fixed', 'true');
92 entity.setDouble('score', 100);
93 entity.setString('image', image);
94 entity.setString('price', price);
95 try {
96 var t_url = env.newURL(url);
97 var t_host = t_url.getHost();
98 entity.setString('site', t_host);
99 } catch (e) {
100 env.error(e);
101 }
102 entity.save();
103 env.info(title + ' | ' + url);
104}
105
106function findLinkByUrl(url) {
107 var entity = env.newEntity();
108 var query = entity.newTermQuery(entity.newTerm('url', url));
109 var size = entity.count('Link', query, 1);
110 return (size > 0);
111}
112
113function grabCategory(catUrl) {
114 try {
115 var conn = env.newJsoup().connect(catUrl).userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101');
116 var doc = conn.timeout(60000).get();
117 var items = doc.select('.browse-item');
118 for (var i = 0; i < items.size(); i++) {
119 var item = items.get(i);
120 var child = item.select('.browse-item-content h3 a').first();
121 if (child == null) continue;
122 var title = child.text().trim();
123 var url = env.newString(env.newURL(env.newURL('http://www.youtube.com'), child.attr('href')) + '');
124 child = item.select('.yt-thumb-clip-inner img').first();
125 var image = env.newString('');
126 if (child != null) {
127 image = env.newString(env.newURL(env.newURL('http://www.youtube.com'), child.attr('data-thumb')) + '');
128 }
129 markVideo(title, url, image);
130 }
131 } catch (e) {
132 env.error(e);
133 }
134}
135
136function markVideo(title, url, image) {
137 if (findQueueByUrl(url)) return;
138 var schema = 's|url|s|title|s|image|s|crawled';
139 var entity = env.newEntity();
140 entity.setSchema(schema);
141 entity.setKind('Queue_YouTube');
142 entity.setId(env.uniqid());
143 entity.setString('url', url);
144 entity.setString('title', title);
145 entity.setString('image', image);
146 entity.setString('crawled', 'false');
147 entity.save();
148}
149
150function findQueueByUrl(url) {
151 var entity = env.newEntity();
152 var query = entity.newTermQuery(entity.newTerm('url', url));
153 var size = entity.count('Queue_YouTube', query, 1);
154 return (size > 0);
155}
156
157function loadQueue() {
158 var entity = env.newEntity();
159 var tag = entity.search('Queue_YouTube', entity.newTermQuery(entity.newTerm('crawled', 'false')), 10);
160 return tag;
161}
// Sandbox environment object; assigned from the first argument of main().
var env;
// Script arguments; assigned from the second argument of main().
var args;

/**
 * Script entry point for the YouTube crawler.
 *
 * Repeats forever: drain the queue of uncrawled videos (grab each one, mark
 * it crawled, save it), then scan every category page for new videos.
 * The outer loop has no exit condition, so the final 'Ending' log line is
 * not reached in normal operation.
 *
 * @param penv  environment object, stored in the global env
 * @param pargs script arguments, stored in the global args
 */
function main(penv, pargs) {
    env = penv;
    args = pargs;

    // Category pages scanned after each queue drain, in this order.
    var categories = [
        'http://www.youtube.com/autos',
        'http://www.youtube.com/comedy',
        'http://www.youtube.com/entertainment',
        'http://www.youtube.com/film',
        'http://www.youtube.com/gaming',
        'http://www.youtube.com/howto',
        'http://www.youtube.com/activism',
        'http://www.youtube.com/people',
        'http://www.youtube.com/pets',
        'http://www.youtube.com/science',
        'http://www.youtube.com/videos?c=17',
        'http://www.youtube.com/travel'
    ];

    env.info('Starting');
    while (true) {
        // Phase 1: work through pending queue entries, batch by batch.
        var batch = loadQueue();
        env.info('Size: ' + batch.size());
        while (batch.size() > 0) {
            for (var q = 0; q < batch.size(); q++) {
                var pending = batch.get(q);
                grabVideo(pending);
                // Marked as crawled whether or not grabVideo logged an error.
                pending.setString('crawled', 'true');
                pending.save();
            }
            batch = loadQueue();
        }

        // Phase 2: scan the category pages for videos to enqueue.
        for (var c = 0; c < categories.length; c++) {
            grabCategory(categories[c]);
        }
    }
    env.info('Ending');
}

/**
 * Downloads the page of one queued video, stores it through saveLink() and
 * enqueues the related videos listed on the page through markVideo().
 *
 * The description is read from '#eow-description', falling back to '#ded',
 * and is empty when neither element exists. Related videos are taken from
 * the page's inline script: the text between 'var rvl =' and 'var cml =' is
 * evaluated, and each resulting item's k/t/i properties are used as the
 * video id, title and image path.
 *
 * Any error is logged through env.error and not rethrown.
 *
 * @param queue queue entity; its 'url', 'title' and 'image' strings are read
 */
function grabVideo(queue) {
    try {
        var url = queue.getString('url');
        var title = queue.getString('title');
        var image = queue.getString('image');
        var conn = env.newJsoup().connect(url).userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101');
        var doc = conn.timeout(60000).get();
        var child = doc.select('#eow-description').first();
        var desc = env.newString('');
        if (child != null) {
            desc = child.text();
        } else {
            // Fallback element for pages without '#eow-description'.
            child = doc.select('#ded').first();
            if (child != null) {
                desc = child.text();
            }
        }
        saveLink(title, url, desc, image, '');

        var html = doc.html();
        var pos1 = html.indexOf('var rvl =');
        var pos2 = html.indexOf('var cml =');
        // Both markers must be present, in this order; otherwise there is nothing to extract.
        if (pos1 < 0 || pos2 < 0 || pos1 >= pos2) return;
        // 9 is the length of 'var rvl =', so js1 starts right after the '='.
        var js1 = html.substring(pos1 + 9, pos2);
        var obj1 = null;
        // SECURITY NOTE(review): this evaluates text taken from a downloaded page, so the
        // remote site can run arbitrary script inside the sandbox. Left in place because
        // the fragment is a JavaScript literal (not guaranteed to be valid JSON); replace
        // with a real parser if the sandbox is not fully isolated.
        eval('obj1 = ' + js1);
        if (obj1 == null) return;
        for (var i = 0; i < obj1.length; i++) {
            var item = obj1[i];
            var url2 = env.newString('http://www.youtube.com/watch?v=' + item.k);
            var title2 = item.t;
            // "+ ''" converts the URL object to a string before newString, as every
            // other newString(newURL(...)) call in this script does.
            var image2 = env.newString(env.newURL(env.newURL('http://www.youtube.com'), item.i) + '');
            markVideo(title2, url2, image2);
            env.info('Video: ' + title2 + ' | ' + url2 + ' | ' + image2);
        }
    } catch (e) {
        env.error(e);
    }
}

/**
 * Persists a crawled page as a 'Link' entity unless one with the same URL
 * already exists.
 *
 * The URL is cut at its last '&feature=' before the duplicate check and
 * before being stored. The host of the URL is stored as 'site'; if the host
 * cannot be derived the error is reported via env.error() and the entity is
 * saved without it.
 *
 * @param title link title
 * @param url   page URL (anything env.newString() accepts)
 * @param desc  description text
 * @param image image URL
 * @param price price text
 */
function saveLink(title, url, desc, image, price) {
    var cleanUrl = env.newString(url);
    var cut = cleanUrl.lastIndexOf('&feature=');
    if (cut >= 0) {
        cleanUrl = cleanUrl.substring(0, cut);
    }
    if (findLinkByUrl(cleanUrl)) {
        return;
    }

    var link = env.newEntity();
    link.setSchema('s|url|a|title|a|desc|s|fixed|d|score|s|site|s|image|s|price');
    link.setKind('Link');
    link.setId(env.uniqid());
    link.setString('url', cleanUrl);
    link.setString('title', title);
    link.setString('desc', desc);
    link.setString('fixed', 'true');
    link.setDouble('score', 100);
    link.setString('image', image);
    link.setString('price', price);
    try {
        // Best effort: a URL that cannot be parsed only costs the 'site' field.
        link.setString('site', env.newURL(cleanUrl).getHost());
    } catch (err) {
        env.error(err);
    }
    link.save();
    env.info(title + ' | ' + cleanUrl);
}

/**
 * Tells whether a 'Link' entity with exactly this URL is already stored.
 *
 * @param url URL to look up in the 'url' field
 * @returns true when the count query (limited to 1) finds a match
 */
function findLinkByUrl(url) {
    var store = env.newEntity();
    var byUrl = store.newTermQuery(store.newTerm('url', url));
    return store.count('Link', byUrl, 1) > 0;
}

/**
 * Fetches one YouTube category page and queues every listed video through
 * markVideo(title, url, image).
 *
 * @param catUrl URL of the category page
 *
 * Nothing is returned; any exception is reported via env.error() and swallowed.
 */
function grabCategory(catUrl) {
    try {
        var conn = env.newJsoup().connect(catUrl).userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101');
        var doc = conn.timeout(60000).get();
        // Base for resolving relative links; built once instead of per item.
        var base = env.newURL('http://www.youtube.com');
        var items = doc.select('.browse-item');
        for (var i = 0; i < items.size(); i++) {
            var item = items.get(i);
            var link = item.select('.browse-item-content h3 a').first();
            if (link == null) continue;
            var title = link.text().trim();
            var url = env.newString(env.newURL(base, link.attr('href')) + '');

            var image = env.newString('');
            var thumb = item.select('.yt-thumb-clip-inner img').first();
            if (thumb != null) {
                var src = thumb.attr('data-thumb');
                // An empty data-thumb would resolve to the bare site root, so
                // fall back to src. NOTE(review): assumes images without
                // data-thumb carry the real thumbnail in src - confirm.
                if (src == null || (src + '').length == 0) {
                    src = thumb.attr('src');
                }
                image = env.newString(env.newURL(base, src) + '');
            }
            markVideo(title, url, image);
        }
    } catch (e) {
        env.error(e);
    }
}

/**
 * Adds a video to the 'Queue_YouTube' crawl queue unless its URL is already
 * queued. New entries are stored with crawled = 'false'.
 *
 * @param title video title
 * @param url   watch-page URL (used as the duplicate key)
 * @param image thumbnail URL
 */
function markVideo(title, url, image) {
    if (findQueueByUrl(url)) {
        return;
    }
    var pending = env.newEntity();
    pending.setSchema('s|url|s|title|s|image|s|crawled');
    pending.setKind('Queue_YouTube');
    pending.setId(env.uniqid());
    pending.setString('url', url);
    pending.setString('title', title);
    pending.setString('image', image);
    pending.setString('crawled', 'false');
    pending.save();
}

/**
 * Tells whether the 'Queue_YouTube' queue already holds an entry for this URL.
 *
 * @param url URL to look up in the 'url' field
 * @returns true when the count query (limited to 1) finds a match
 */
function findQueueByUrl(url) {
    var store = env.newEntity();
    var byUrl = store.newTermQuery(store.newTerm('url', url));
    return store.count('Queue_YouTube', byUrl, 1) > 0;
}

/**
 * Loads the next batch of queue entries that have not been crawled yet.
 *
 * @returns the result of searching 'Queue_YouTube' for crawled = 'false',
 *          limited to 10 entries
 */
function loadQueue() {
    var store = env.newEntity();
    var pendingTerm = store.newTerm('crawled', 'false');
    var pendingQuery = store.newTermQuery(pendingTerm);
    return store.search('Queue_YouTube', pendingQuery, 10);
}

  Protected by Copyscape Online Copyright Protection

Tuesday, 22 May 2012

Grab products from HP Shopping

Grab products from HP Shopping
This task uses a JavaScript sandbox with jsoup support to grab products from HP Shopping.
Grab products from HP Shopping
  1. Create javascript sandbox with jsoup support
  2. Create javascript as following
javascript
1function main(env, args) {
2 var catUrl = 'http://shopping.hp.com/en_US/home-office/-/products/Laptops/HP%20Pavilion';
3 var links = args.get('links');
4 var prods = grabProduct(catUrl, env);
5 for (var i = 0; i < prods.size(); i++) {
6 links.add(prods.get(i));
7 }
8}
9
10function grabProduct(catUrl, env) {
11 var tag = env.newArrayList();
12 var urls = env.newArrayList();
13 urls.add(catUrl);
14 try {
15 var conn = env.newJsoup().connect(catUrl).userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101');
16 var doc = conn.timeout(60000).get();
17 var root = doc.select('.pagination-results-container').first();
18 if (root != null) {
19 var elements = root.select('a');
20 for (var i = 0; i < elements.size() - 1; i++) {
21 var element = elements.get(i);
22 if (element.hasClass('option')) continue;
23 if (element.hasClass('pngFix')) continue;
24 urls.add(element.attr('href'));
25 }
26 }
27 } catch (e) {
28 env.error(e);
29 }
30 for (var i = 0; i < urls.size(); i++) {
31 try {
32 var link = urls.get(i);
33 var conn = env.newJsoup().connect(link).userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101');
34 var doc = conn.timeout(60000).get();
35 var elements = doc.select('.listing-page-bucket');
36 for (var j = 0; j < elements.size(); j++) {
37 var element = elements.get(j);
38 var children = element.select('.color-selector-img img.pngFix');
39 var image_list = '';
40 var image = '';
41 for (var k = 0; k < children.size(); k++) {
42 var child = children.get(k);
43 if (image_list.length > 0) image_list += '\n';
44 image_list += child.attr('src') + '';
45 if (child.attr('style') + '' != 'display:none') {
46 image = child.attr('src');
47 }
48 }
49 var child = element.select('.product-specs h3 a').first();
50 if (child == null) continue;
51 var title = child.text();
52 var url = child.attr('href');
53 var desc = '';
54 child = element.select('.product-specs .rating').first();
55 if (child != null) {
56 child = child.nextElementSibling().nextElementSibling();
57 desc = child.text();
58 }
59 var price = '';
60 child = element.select('#start-price').first();
61 if (child != null) {
62 price = child.text();
63 } else {
64 child = element.select('.price-value').first();
65 if (child != null) {
66 price = child.text();
67 }
68 }
69 var it = env.newHashMap();
70 it.put('image-list', image_list);
71 it.put('image', image);
72 it.put('title', title);
73 it.put('url', url);
74 it.put('desc', desc);
75 it.put('price', price);
76 tag.add(it);
77 }
78 } catch (e) {
79 env.error(e);
80 }
81 }
82 return tag;
83}
/**
 * Script entry point: grabs the products of the HP Pavilion laptop category
 * and appends each one to the caller-supplied 'links' list.
 *
 * @param env  sandbox environment, passed on to grabProduct()
 * @param args argument map; args.get('links') is the output list
 */
function main(env, args) {
  var output = args.get('links');
  var products = grabProduct('http://shopping.hp.com/en_US/home-office/-/products/Laptops/HP%20Pavilion', env);
  var n = 0;
  while (n < products.size()) {
    output.add(products.get(n));
    n += 1;
  }
}

/**
 * Grabs every product listed in an HP Shopping category.
 *
 * The category page is fetched once to discover its pagination links; the
 * category URL and each distinct pagination URL are then fetched and every
 * '.listing-page-bucket' element is turned into a map with the keys
 * 'image-list', 'image', 'title', 'url', 'desc' and 'price'.
 *
 * @param catUrl URL of the category page
 * @param env    sandbox environment (newArrayList, newHashMap, newJsoup, error)
 * @returns list (from env.newArrayList()) of product maps; failures are
 *          reported via env.error() and the products gathered so far are kept
 */
function grabProduct(catUrl, env) {
  var userAgent = 'Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101';
  var tag = env.newArrayList();
  var urls = env.newArrayList();
  // URLs already scheduled, so a page linked twice is only fetched once.
  var seen = {};
  urls.add(catUrl);
  seen[catUrl + ''] = true;
  try {
    var conn = env.newJsoup().connect(catUrl).userAgent(userAgent);
    var doc = conn.timeout(60000).get();
    var root = doc.select('.pagination-results-container').first();
    if (root != null) {
      var anchors = root.select('a');
      // The last anchor is deliberately left out, as in the original loop
      // (presumably the "next page" link - TODO confirm).
      for (var i = 0; i < anchors.size() - 1; i++) {
        var anchor = anchors.get(i);
        if (anchor.hasClass('option')) continue;
        if (anchor.hasClass('pngFix')) continue;
        var href = anchor.attr('href');
        var key = href + '';
        if (key.length == 0) continue;
        if (Object.prototype.hasOwnProperty.call(seen, key)) continue;
        seen[key] = true;
        urls.add(href);
      }
    }
  } catch (e) {
    env.error(e);
  }
  for (var p = 0; p < urls.size(); p++) {
    try {
      var link = urls.get(p);
      var page = env.newJsoup().connect(link).userAgent(userAgent).timeout(60000).get();
      var buckets = page.select('.listing-page-bucket');
      for (var j = 0; j < buckets.size(); j++) {
        var bucket = buckets.get(j);

        // All colour-variant images, newline separated; 'image' ends up as
        // the last one whose style is not exactly 'display:none'.
        var swatches = bucket.select('.color-selector-img img.pngFix');
        var image_list = '';
        var image = '';
        for (var k = 0; k < swatches.size(); k++) {
          var swatch = swatches.get(k);
          if (image_list.length > 0) image_list += '\n';
          image_list += swatch.attr('src') + '';
          if (swatch.attr('style') + '' != 'display:none') {
            image = swatch.attr('src');
          }
        }

        var child = bucket.select('.product-specs h3 a').first();
        if (child == null) continue;
        var title = child.text();
        var url = child.attr('href');

        // The description is the second element after the rating. Each hop
        // is guarded so a shorter sibling list yields an empty description
        // instead of an exception that drops the rest of the page.
        var desc = '';
        child = bucket.select('.product-specs .rating').first();
        for (var hop = 0; hop < 2 && child != null; hop++) {
          child = child.nextElementSibling();
        }
        if (child != null) {
          desc = child.text();
        }

        // Prefer the "starting at" price, else the plain price value.
        var price = '';
        child = bucket.select('#start-price').first();
        if (child == null) {
          child = bucket.select('.price-value').first();
        }
        if (child != null) {
          price = child.text();
        }

        var it = env.newHashMap();
        it.put('image-list', image_list);
        it.put('image', image);
        it.put('title', title);
        it.put('url', url);
        it.put('desc', desc);
        it.put('price', price);
        tag.add(it);
      }
    } catch (e) {
      env.error(e);
    }
  }
  return tag;
}

  Protected by Copyscape Online Copyright Protection

Grab categories from HP Shopping

Grab categories from HP Shopping
This task uses a JavaScript sandbox with jsoup support to grab categories from HP Shopping.
Grab categories from HP Shopping
  1. Create javascript sandbox with jsoup support
  2. Create javascript as following
javascript
1function main(env, args) {
2 var links = args.get('links');
3 var cats = grabCategory(env);
4 for (var i = 0; i < cats.size(); i++) {
5 links.add(cats.get(i));
6 }
7}
8
9function grabCategory(env) {
10 var tag = env.newArrayList();
11 try {
12 var link = env.newURL('http://shopping.hp.com');
13 var conn = env.newJsoup().connect(link).userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101');
14 var html = conn.timeout(60000).execute().body();
15 var pat1 = 'var surveyInitData =';
16 var pat2 = "$('body').hpOnSiteExit({";
17 var pos1 = html.indexOf(pat1);
18 if (pos1 < 0) {
19 env.info('S1: javascript code not found');
20 return tag;
21 }
22 var pos2 = html.indexOf(pat2, pos1);
23 if (pos2 < 0) {
24 env.info('S2: javascript code not found');
25 return tag;
26 }
27 var js = html.substring(pos1 + pat1.length, pos2);
28 var obj = null;
29 eval('obj = ' + js);
30 var pages = obj.surveyData[0].configPages;
31 for (var i = 0; i < pages.length; i++) {
32 var pg = pages[i];
33 if (pg.pageType != 'category') continue;
34 var it = env.newHashMap();
35 it.put('title', pg.pageName);
36 it.put('url', pg.fullpath);
37 tag.add(it);
38 }
39 } catch (e) {
40 env.error(e);
41 }
42 return tag;
43}
/**
 * Script entry point: grabs the HP Shopping categories and appends each one
 * to the caller-supplied 'links' list.
 *
 * @param env  sandbox environment, passed on to grabCategory()
 * @param args argument map; args.get('links') is the output list
 */
function main(env, args) {
  var output = args.get('links');
  var categories = grabCategory(env);
  var n = 0;
  while (n < categories.size()) {
    output.add(categories.get(n));
    n += 1;
  }
}

/**
 * Extracts the category pages from the HP Shopping home page.
 *
 * The page embeds a script literal between 'var surveyInitData =' and
 * "$('body').hpOnSiteExit({"; every entry of its surveyData[0].configPages
 * whose pageType is 'category' becomes a map with 'title' and 'url'.
 *
 * @param env sandbox environment (newArrayList, newHashMap, newURL, newJsoup,
 *            info, error)
 * @returns list of category maps; empty when a marker is missing (logged via
 *          env.info) or when an exception occurs (logged via env.error)
 */
function grabCategory(env) {
  var startMarker = 'var surveyInitData =';
  var endMarker = "$('body').hpOnSiteExit({";
  var tag = env.newArrayList();
  try {
    var link = env.newURL('http://shopping.hp.com');
    var request = env.newJsoup().connect(link).userAgent('Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101');
    var html = request.timeout(60000).execute().body();

    var from = html.indexOf(startMarker);
    if (from < 0) {
      env.info('S1: javascript code not found');
      return tag;
    }
    var to = html.indexOf(endMarker, from);
    if (to < 0) {
      env.info('S2: javascript code not found');
      return tag;
    }

    // NOTE(review): eval() runs script text taken from a remote page; it
    // should only ever execute inside the sandbox.
    var parsed = null;
    eval('parsed = ' + html.substring(from + startMarker.length, to));

    var pages = parsed.surveyData[0].configPages;
    for (var idx = 0; idx < pages.length; idx += 1) {
      var page = pages[idx];
      if (page.pageType == 'category') {
        var entry = env.newHashMap();
        entry.put('title', page.pageName);
        entry.put('url', page.fullpath);
        tag.add(entry);
      }
    }
  } catch (e) {
    env.error(e);
  }
  return tag;
}

  Protected by Copyscape Online Copyright Protection

Wednesday, 25 April 2012

Add MySQL support to javascript sandbox

Add MySQL support to javascript sandbox
This task adds MySQL support to the JavaScript sandbox.
Add MySQL support to javascript sandbox
  1. Create javascript sandbox with jsoup support
  2. Create com.paesia.schema.script.safe.mysql.SMySQL class as following
  3. Modify com.paesia.schema.script.Machine class as following
Modify com.paesia.schema.script.Machine class
1............
2
3import com.paesia.schema.script.safe.mysql.SMySQL;
4
5public class Machine {
6
7 private Handler handler;
8
9 public static void run(Machine env, String js, Map args) throws Exception {
10 try {
11 Context cx = Context.enter();
12 cx.setClassShutter(new ClassShutter() {
13 public boolean visibleToScripts(String className) {
14...........
15 return false;
16 }
17 });
18
19...........
20
21 } catch (Exception e) {
22 throw e;
23 } finally {
24 Context.exit();
25 }
26 }
27...........
28
29 public SMySQL newMySQL() {
30 return new SMySQL();
31 }
32
33...........
34}
............

import com.paesia.schema.script.safe.mysql.SMySQL;

// Excerpt of the script host class. Lines made only of dots mark code the
// article left out, so this listing is illustrative rather than compilable.
public class Machine {

    private Handler handler;
 
    // Static entry point taking the Machine instance, the script source and an
    // argument map (see the "Call Machine.run() method" listing for a caller).
    public static void run(Machine env, String js, Map args) throws Exception {
        try {
            // Context / ClassShutter are presumably Rhino's
            // org.mozilla.javascript types - the imports are elided.
            Context cx = Context.enter();
            // The shutter decides, per class name, what scripts may see.
            cx.setClassShutter(new ClassShutter() {
                public boolean visibleToScripts(String className) {  
...........
                    // Anything not accepted by the elided checks above is hidden.
                    return false;
                }
            });   

...........

        } catch (Exception e) {
            throw e;
        } finally {
            // Always paired with the Context.enter() above.
            Context.exit();   
        }
    }
...........

    // Factory the script calls to obtain a new, unopened SMySQL helper.
    public SMySQL newMySQL() {
        return new SMySQL();
    }

...........
}
com.paesia.schema.script.safe.mysql.SMySQL class
1package com.paesia.schema.script.safe.mysql;
2
3import java.sql.Connection;
4import java.sql.DriverManager;
5import java.sql.PreparedStatement;
6import java.sql.ResultSet;
7import java.sql.ResultSetMetaData;
8import java.util.ArrayList;
9import java.util.HashMap;
10import java.util.List;
11import java.util.Map;
12
13public class SMySQL {
14
15 private Connection conn;
16
17 public void open(String server, String database, String username, String password) throws Exception {
18 close();
19 Class.forName("com.mysql.jdbc.Driver");
20 conn = DriverManager.getConnection("jdbc:mysql://" + server + "/" + database + "?"+ "user=" + username + "&password=" + password);
21 }
22
23 public void open(Map info) throws Exception {
24 String server = info.get("server") + "";
25 String database = info.get("database") + "";
26 String username = info.get("username") + "";
27 String password = info.get("password") + "";
28 open(server, database, username, password);
29 }
30
31 public void close() {
32 if (conn != null) {
33 try {
34 conn.close();
35 } catch (Exception e) {
36 }
37 conn = null;
38 }
39 }
40
41 public boolean execute(String sql, List params) throws Exception {
42 PreparedStatement prep = buildStatement(sql, params);
43 boolean tag = prep.execute();
44 prep.close();
45 return tag;
46 }
47
48 public boolean execute(String sql, Map... params) throws Exception {
49 PreparedStatement prep = buildStatement(sql, params);
50 boolean tag = prep.execute();
51 prep.close();
52 return tag;
53 }
54
55 public List query(String sql, List params) throws Exception {
56 PreparedStatement prep = buildStatement(sql, params);
57 ResultSet rs = prep.executeQuery();
58 List tag = parseResult(rs);
59 rs.close();
60 prep.close();
61 return tag;
62 }
63
64 public List query(String sql, Map... params) throws Exception {
65 PreparedStatement prep = buildStatement(sql, params);
66 ResultSet rs = prep.executeQuery();
67 List tag = parseResult(rs);
68 rs.close();
69 prep.close();
70 return tag;
71 }
72
73 public Map paramString(String value) {
74 Map tag = new HashMap();
75 tag.put("kind", "String");
76 tag.put("value", value);
77 return tag;
78 }
79
80 public Map paramBoolean(Boolean value) {
81 Map tag = new HashMap();
82 tag.put("kind", "Boolean");
83 tag.put("value", value);
84 return tag;
85 }
86
87 public Map paramInteger(Integer value) {
88 Map tag = new HashMap();
89 tag.put("kind", "Integer");
90 tag.put("value", value);
91 return tag;
92 }
93
94 public Map paramLong(Long value) {
95 Map tag = new HashMap();
96 tag.put("kind", "Long");
97 tag.put("value", value);
98 return tag;
99 }
100
101 public Map paramFloat(Float value) {
102 Map tag = new HashMap();
103 tag.put("kind", "Float");
104 tag.put("value", value);
105 return tag;
106 }
107
108 public Map paramDouble(Double value) {
109 Map tag = new HashMap();
110 tag.put("kind", "Double");
111 tag.put("value", value);
112 return tag;
113 }
114
115 protected PreparedStatement buildStatement(String sql, Map... params) throws Exception {
116 List args = new ArrayList();
117 if (params != null) {
118 for (int i = 0; i < params.length; i++) {
119 args.add(params[i]);
120 }
121 }
122 return buildStatement(sql, args);
123 }
124
125 protected PreparedStatement buildStatement(String sql, List params) throws Exception {
126 PreparedStatement prep = conn.prepareStatement(sql);
127 for (int i = 0; i < params.size(); i++) {
128 Map item = (Map)params.get(i);
129 String kind = item.get("kind") + "";
130 if ("String".equals(kind)) {
131 prep.setString(i + 1, (String)item.get("value"));
132 } else if ("Boolean".equals(kind)) {
133 prep.setBoolean(i + 1, (Boolean)item.get("value"));
134 } else if ("Integer".equals(kind)) {
135 prep.setInt(i + 1, (Integer)item.get("value"));
136 } else if ("Long".equals(kind)) {
137 prep.setLong(i + 1, (Long)item.get("value"));
138 } else if ("Float".equals(kind)) {
139 prep.setFloat(i + 1, (Float)item.get("value"));
140 } else if ("Double".equals(kind)) {
141 prep.setDouble(i + 1, (Double)item.get("value"));
142 } else {
143 prep.setString(i + 1, item.get("value") + "");
144 }
145 }
146 return prep;
147 }
148
149 protected List parseResult(ResultSet rs) throws Exception {
150 List tag = new ArrayList();
151 ResultSetMetaData md = rs.getMetaData();
152 while (rs.next()) {
153 Map item = new HashMap();
154 for (int i = 1; i <= md.getColumnCount(); i++) {
155 String name = md.getColumnName(i);
156 int type = md.getColumnType(i);
157 if (type == java.sql.Types.BIGINT) {
158 item.put(name, rs.getLong(name));
159 } else if (type == java.sql.Types.INTEGER || type == java.sql.Types.SMALLINT || type == java.sql.Types.TINYINT) {
160 item.put(name, rs.getInt(name));
161 } else if (type == java.sql.Types.DECIMAL || type == java.sql.Types.DOUBLE || type == java.sql.Types.NUMERIC || type == java.sql.Types.REAL) {
162 item.put(name, rs.getDouble(name));
163 } else if (type == java.sql.Types.FLOAT) {
164 item.put(name, rs.getFloat(name));
165 } else if (type == java.sql.Types.BIT || type == java.sql.Types.BOOLEAN) {
166 item.put(name, rs.getBoolean(name));
167 } else {
168 item.put(name, rs.getString(name));
169 }
170 }
171 tag.add(item);
172 }
173 return tag;
174 }
175
176}
package com.paesia.schema.script.safe.mysql;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.ResultSetMetaData;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class SMySQL {

    private Connection conn;
 
    public void open(String server, String database, String username, String password) throws Exception {
        close();
        Class.forName("com.mysql.jdbc.Driver");
        conn = DriverManager.getConnection("jdbc:mysql://" + server + "/" + database + "?"+ "user=" + username + "&password=" + password);
    }
 
    public void open(Map info) throws Exception {
        String server = info.get("server") + "";
        String database = info.get("database") + "";
        String username = info.get("username") + "";
        String password = info.get("password") + "";
        open(server, database, username, password);
    }
 
    public void close() {
        if (conn != null) {
            try {
                conn.close();
            } catch (Exception e) {
            }
            conn = null;
        }
    }
 
    public boolean execute(String sql, List params) throws Exception {
        PreparedStatement prep = buildStatement(sql, params);
        boolean tag = prep.execute();
        prep.close();
        return tag;
    }

    public boolean execute(String sql, Map... params) throws Exception {
        PreparedStatement prep = buildStatement(sql, params);
        boolean tag = prep.execute();
        prep.close();
        return tag;
    }

    public List query(String sql, List params) throws Exception {
        PreparedStatement prep = buildStatement(sql, params);
        ResultSet rs = prep.executeQuery();
        List tag = parseResult(rs);
        rs.close();
        prep.close();
        return tag;
    }

    public List query(String sql, Map... params) throws Exception {
        PreparedStatement prep = buildStatement(sql, params);
        ResultSet rs = prep.executeQuery();
        List tag = parseResult(rs);
        rs.close();
        prep.close();
        return tag;
    }
 
    public Map paramString(String value) {
        Map tag = new HashMap();
        tag.put("kind", "String");
        tag.put("value", value);
        return tag;
    }

    public Map paramBoolean(Boolean value) {
        Map tag = new HashMap();
        tag.put("kind", "Boolean");
        tag.put("value", value);
        return tag;
    }

    public Map paramInteger(Integer value) {
        Map tag = new HashMap();
        tag.put("kind", "Integer");
        tag.put("value", value);
        return tag;
    }

    public Map paramLong(Long value) {
        Map tag = new HashMap();
        tag.put("kind", "Long");
        tag.put("value", value);
        return tag;
    }
 
    public Map paramFloat(Float value) {
        Map tag = new HashMap();
        tag.put("kind", "Float");
        tag.put("value", value);
        return tag;
    }
 
    public Map paramDouble(Double value) {
        Map tag = new HashMap();
        tag.put("kind", "Double");
        tag.put("value", value);
        return tag;
    }

    protected PreparedStatement buildStatement(String sql, Map... params) throws Exception {
        List args = new ArrayList();
        if (params != null) {
            for (int i = 0; i < params.length; i++) {
                args.add(params[i]);
            }
        }
        return buildStatement(sql, args);
    }
 
    protected PreparedStatement buildStatement(String sql, List params) throws Exception {
        PreparedStatement prep = conn.prepareStatement(sql);
        for (int i = 0; i < params.size(); i++) {
            Map item = (Map)params.get(i);
            String kind = item.get("kind") + "";
            if ("String".equals(kind)) {
                prep.setString(i + 1, (String)item.get("value"));
            } else if ("Boolean".equals(kind)) {
                prep.setBoolean(i + 1, (Boolean)item.get("value"));
            } else if ("Integer".equals(kind)) {
                prep.setInt(i + 1, (Integer)item.get("value"));
            } else if ("Long".equals(kind)) {
                prep.setLong(i + 1, (Long)item.get("value"));
            } else if ("Float".equals(kind)) {
                prep.setFloat(i + 1, (Float)item.get("value"));
            } else if ("Double".equals(kind)) {
                prep.setDouble(i + 1, (Double)item.get("value"));
            } else {
                prep.setString(i + 1, item.get("value") + "");
            }
        }
        return prep;
    }

    protected List parseResult(ResultSet rs) throws Exception {
        List tag = new ArrayList();
        ResultSetMetaData md = rs.getMetaData();
        while (rs.next()) {
            Map item = new HashMap();
            for (int i = 1; i <= md.getColumnCount(); i++) {
                String name = md.getColumnName(i);
                int type = md.getColumnType(i);
                if (type == java.sql.Types.BIGINT) {
                    item.put(name, rs.getLong(name));
                } else if (type == java.sql.Types.INTEGER || type == java.sql.Types.SMALLINT || type == java.sql.Types.TINYINT) {
                    item.put(name, rs.getInt(name));
                } else if (type == java.sql.Types.DECIMAL || type == java.sql.Types.DOUBLE || type == java.sql.Types.NUMERIC || type == java.sql.Types.REAL) {
                    item.put(name, rs.getDouble(name));
                } else if (type == java.sql.Types.FLOAT) {
                    item.put(name, rs.getFloat(name));
                } else if (type == java.sql.Types.BIT || type == java.sql.Types.BOOLEAN) {
                    item.put(name, rs.getBoolean(name));
                } else {
                    item.put(name, rs.getString(name));
                }
            }
            tag.add(item);
        }
        return tag;
    }
 
}

  Protected by Copyscape Online Copyright Protection

Task tagged by [MySQL]

  1. Add MySQL support to javascript sandbox

Wednesday, 18 April 2012

Add Lucene support to javascript sandbox

Add Lucene support to javascript sandbox
This task adds Lucene support to the JavaScript sandbox.
Add Lucene support to javascript sandbox
  1. Create javascript sandbox with jsoup support
  2. Create com.paesia.schema.script.safe.lucene.SEntity class as following
  3. Create com.paesia.schema.script.LuceneHandler class as following
  4. Modify com.paesia.schema.script.Machine class as following
  5. Modify DataHandler class as following
  6. Create javascript as following
  7. Call Machine.run() method as following
Call Machine.run() method
1String dirIndex = "";
2String dirBackup = "";
3double systemQuota = 1024 * 1024;
4String js = loadJS();
5Map args = new HashMap();
6List links = new ArrayList();
7args.put("links", links);
8
9Machine env = new Machine(new DataHandler(dirIndex, dirBackup, systemQuota));
10Machine.run(env, js, args);
11
12for (int i = 0; i < links.size(); i++) {
13 Map item = (Map)links.get(i);
14 String line = "";
15 for (Object key : item.keySet()) {
16 line += "\r\n" + key + " : " + item.get(key);
17 }
18 logger.info("\r\n" + (i + 1) + " --------------------------------\r\n" + line + "\r\n");
19}
// Host-side example: runs a script through Machine and logs what it collected.
// Index and backup directories are left empty in this listing.
String dirIndex = "";
String dirBackup = "";
// NOTE(review): unit of the quota is not visible here (1024 * 1024 looks like bytes) - confirm.
double systemQuota = 1024 * 1024;
// loadJS() is defined elsewhere; it supplies the script source.
String js = loadJS();
Map args = new HashMap();
// Passed to the script under the "links" key; the result loop below reads it back.
List links = new ArrayList();
args.put("links", links);

Machine env = new Machine(new DataHandler(dirIndex, dirBackup, systemQuota));
Machine.run(env, js, args);
         
// Log every collected item as "key : value" lines under a numbered separator.
for (int i = 0; i < links.size(); i++) {
    Map item = (Map)links.get(i);
    String line = "";
    for (Object key : item.keySet()) {
        line += "\r\n" + key + " : " + item.get(key);
    }
    logger.info("\r\n" + (i + 1) + " --------------------------------\r\n" + line + "\r\n");
}   
Modify com.paesia.schema.script.Machine class
1............
2
3import com.paesia.schema.script.safe.lucene.SEntity;
4
5public class Machine {
6
7 private Handler handler;
8
9 public static void run(Machine env, String js, Map args) throws Exception {
10 try {
11 Context cx = Context.enter();
12 cx.setClassShutter(new ClassShutter() {
13 public boolean visibleToScripts(String className) {
14...........
15 if ("org.apache.lucene.search.Query".equals(className)) return true;
16 if ("org.apache.lucene.search.Filter".equals(className)) return true;
17 if ("org.apache.lucene.search.Sort".equals(className)) return true;
18 if ("org.apache.lucene.search.BooleanQuery".equals(className)) return true;
19 if ("org.apache.lucene.search.BooleanClause".equals(className)) return true;
20 if (className.startsWith("org.apache.lucene.search.BooleanClause$")) return true;
21 if ("org.apache.lucene.search.PhraseQuery".equals(className)) return true;
22 if ("org.apache.lucene.index.Term".equals(className)) return true;
23 if ("org.apache.lucene.search.MultiPhraseQuery".equals(className)) return true;
24 if ("org.apache.lucene.search.NGramPhraseQuery".equals(className)) return true;
25 if ("org.apache.lucene.search.NumericRangeQuery".equals(className)) return true;
26 if ("org.apache.lucene.search.PrefixQuery".equals(className)) return true;
27 if ("org.apache.lucene.search.TermQuery".equals(className)) return true;
28 if ("org.apache.lucene.search.TermRangeQuery".equals(className)) return true;
29 if ("org.apache.lucene.search.WildcardQuery".equals(className)) return true;
30 if ("org.apache.lucene.search.MatchAllDocsQuery".equals(className)) return true;
31 if ("org.apache.lucene.search.FieldValueFilter".equals(className)) return true;
32 if ("org.apache.lucene.search.NumericRangeFilter".equals(className)) return true;
33 if ("org.apache.lucene.search.PrefixFilter".equals(className)) return true;
34 if ("org.apache.lucene.search.QueryWrapperFilter".equals(className)) return true;
35 if ("org.apache.lucene.search.TermRangeFilter".equals(className)) return true;
36 if ("org.apache.lucene.search.SortField".equals(className)) return true;
37...........
38 return false;
39 }
40 });
41
42...........
43
44 } catch (Exception e) {
45 throw e;
46 } finally {
47 Context.exit();
48 }
49 }
50...........
51 public SEntity newEntity() {
52 SEntity.Handler seh = null;
53 if (handler != null) {
54 seh = handler.getEntityHandler();
55 }
56 return new SEntity(seh);
57 }
58
59 public static class Handler {
60...........
61 public SEntity.Handler getEntityHandler() { return null; }
62
63 }
64...........
65}
............

import com.paesia.schema.script.safe.lucene.SEntity;

/**
 * Host object handed to sandboxed scripts (excerpt).
 *
 * Rows of dots mark code that the listing leaves out; they are kept exactly
 * where the original excerpt had them.
 */
public class Machine {

    private Handler handler;

    /**
     * Runs a script inside a scripting context whose class shutter only lets
     * scripts see the class names accepted by visibleToScripts.
     *
     * NOTE(review): Context and ClassShutter look like the Rhino classes of
     * the same name; their imports are not part of this excerpt.
     *
     * @param env  machine instance made available to the script
     * @param js   script source
     * @param args arguments for the script
     * @throws Exception whatever the elided script execution throws
     */
    public static void run(Machine env, String js, Map args) throws Exception {
        // Enter before the try block: if enter() itself failed, the finally
        // clause must not call exit() for a context that was never entered.
        Context cx = Context.enter();
        try {
            cx.setClassShutter(new ClassShutter() {
                // Whitelist: every class name not accepted here is hidden from scripts.
                public boolean visibleToScripts(String className) {
...........
                    if ("org.apache.lucene.search.Query".equals(className)) return true;
                    if ("org.apache.lucene.search.Filter".equals(className)) return true;
                    if ("org.apache.lucene.search.Sort".equals(className)) return true;
                    if ("org.apache.lucene.search.BooleanQuery".equals(className)) return true;
                    if ("org.apache.lucene.search.BooleanClause".equals(className)) return true;
                    // Prefix match covers the nested classes of BooleanClause.
                    if (className.startsWith("org.apache.lucene.search.BooleanClause$")) return true;
                    if ("org.apache.lucene.search.PhraseQuery".equals(className)) return true;
                    if ("org.apache.lucene.index.Term".equals(className)) return true;
                    if ("org.apache.lucene.search.MultiPhraseQuery".equals(className)) return true;
                    if ("org.apache.lucene.search.NGramPhraseQuery".equals(className)) return true;
                    if ("org.apache.lucene.search.NumericRangeQuery".equals(className)) return true;
                    if ("org.apache.lucene.search.PrefixQuery".equals(className)) return true;
                    if ("org.apache.lucene.search.TermQuery".equals(className)) return true;
                    if ("org.apache.lucene.search.TermRangeQuery".equals(className)) return true;
                    if ("org.apache.lucene.search.WildcardQuery".equals(className)) return true;
                    if ("org.apache.lucene.search.MatchAllDocsQuery".equals(className)) return true;
                    if ("org.apache.lucene.search.FieldValueFilter".equals(className)) return true;
                    if ("org.apache.lucene.search.NumericRangeFilter".equals(className)) return true;
                    if ("org.apache.lucene.search.PrefixFilter".equals(className)) return true;
                    if ("org.apache.lucene.search.QueryWrapperFilter".equals(className)) return true;
                    if ("org.apache.lucene.search.TermRangeFilter".equals(className)) return true;
                    if ("org.apache.lucene.search.SortField".equals(className)) return true;
...........
                    return false;
                }
            });

...........

        } finally {
            // Exceptions propagate unchanged; the context is always released.
            Context.exit();
        }
    }
...........
    /**
     * Creates an entity wired to the entity handler supplied by this machine's
     * handler, or to no handler at all when the machine has none.
     *
     * @return a new SEntity
     */
    public SEntity newEntity() {
        SEntity.Handler seh = (handler == null) ? null : handler.getEntityHandler();
        return new SEntity(seh);
    }

    /**
     * Extension point of the machine; this base version supplies no entity
     * handler.
     */
    public static class Handler {
...........
        /** @return the entity handler to give to new entities; null here */
        public SEntity.Handler getEntityHandler() { return null; }

    }
...........
}
com.paesia.schema.script.safe.lucene.SEntity class
1package com.paesia.schema.script.safe.lucene;
2
3import java.io.ByteArrayInputStream;
4import java.io.ByteArrayOutputStream;
5import java.io.StringReader;
6import java.util.ArrayList;
7import java.util.Date;
8import java.util.List;
9import java.util.Properties;
10
11import org.apache.lucene.analysis.Analyzer;
12import org.apache.lucene.analysis.CachingTokenFilter;
13import org.apache.lucene.analysis.standard.StandardAnalyzer;
14import org.apache.lucene.index.Term;
15import org.apache.lucene.queryParser.MultiFieldQueryParser;
16import org.apache.lucene.search.BooleanClause;
17import org.apache.lucene.search.BooleanClause.Occur;
18import org.apache.lucene.search.BooleanQuery;
19import org.apache.lucene.search.FieldValueFilter;
20import org.apache.lucene.search.Filter;
21import org.apache.lucene.search.MatchAllDocsQuery;
22import org.apache.lucene.search.MultiPhraseQuery;
23import org.apache.lucene.search.NGramPhraseQuery;
24import org.apache.lucene.search.NumericRangeFilter;
25import org.apache.lucene.search.NumericRangeQuery;
26import org.apache.lucene.search.PhraseQuery;
27import org.apache.lucene.search.PrefixFilter;
28import org.apache.lucene.search.PrefixQuery;
29import org.apache.lucene.search.Query;
30import org.apache.lucene.search.QueryWrapperFilter;
31import org.apache.lucene.search.Sort;
32import org.apache.lucene.search.SortField;
33import org.apache.lucene.search.TermQuery;
34import org.apache.lucene.search.TermRangeFilter;
35import org.apache.lucene.search.TermRangeQuery;
36import org.apache.lucene.search.WildcardQuery;
37import org.apache.lucene.search.highlight.Highlighter;
38import org.apache.lucene.search.highlight.Scorer;
39import org.apache.lucene.search.highlight.SimpleFragmenter;
40import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
41import org.apache.lucene.util.Version;
42
43public class SEntity {
44
45 public static final String STRING = "s";
46 public static final String DOUBLE = "d";
47 public static final String FLOAT = "f";
48 public static final String INTEGER = "i";
49 public static final String LONG = "l";
50 public static final String ANALYZED = "a";
51
52 public static final String ALL_KINDS = "|s|d|f|i|l|a|";
53
54 public static final String SCHEMA = "F4f8cc93237f50";
55 public static final String ID = "F4f8cce61643dd";
56 public static final String CREATED = "F4f8cd83fcca31";
57 public static final String UPDATED = "F4f8cd84e2b74a";
58 public static final String KIND = "F4f8cd9c8ee13d";
59 public static final String MARK = "F4f8cda27d62fb";
60
61 protected Properties data = new Properties();
62 protected Properties schema = new Properties();
63 protected Handler handler = null;
64
65 public SEntity(Handler handler) {
66 this.handler = handler;
67 registerDefault();
68 }
69
70 public void register(String field, String type) {
71 if (ALL_KINDS.indexOf("|" + type + "|") < 0) return;
72 schema.put(field, type);
73 saveSchema();
74 }
75
76 public void setSchema(String src) {
77 String[] fields = src.split("\\|");
78 schema.clear();
79 for (int i = 0; i < fields.length && i + 1 < fields.length; i+= 2) {
80 register(fields[i + 1], fields[i]);
81 }
82 registerDefault();
83 saveSchema();
84 }
85
86 public String getSchema() {
87 String tag = data.getProperty(SCHEMA);
88 if (tag == null) tag = "";
89 return tag;
90 }
91
92 public void fromString(String src) {
93 data.clear();
94 schema.clear();
95 try {
96 ByteArrayInputStream bais = new ByteArrayInputStream(src.getBytes("UTF-8"));
97 data.load(bais);
98 bais.close();
99 } catch (Exception e) {
100 }
101 loadSchema();
102 }
103
104 public String toString() {
105 String tag = "";
106 try {
107 ByteArrayOutputStream baos = new ByteArrayOutputStream();
108 data.store(baos, "");
109 tag = baos.toString();
110 baos.close();
111 } catch (Exception e) {
112 }
113 return tag;
114 }
115
116 public String getString(String field) {
117 String tag = data.getProperty(field);
118 if (tag == null) tag = "";
119 return tag;
120 }
121
122 public void setString(String field, String value) {
123 if (schema.containsKey(field)) {
124 if (value == null) value = "";
125 data.setProperty(field, value);
126 }
127 }
128
129 public double getDouble(String field) {
130 double tag = 0;
131 try {
132 tag = Double.parseDouble(getString(field));
133 } catch (Exception e) {
134 tag = 0;
135 }
136 return tag;
137 }
138
139 public void setDouble(String field, double value) {
140 setString(field, Double.toString(value));
141 }
142
143 public float getFloat(String field) {
144 float tag = 0;
145 try {
146 tag = Float.parseFloat(getString(field));
147 } catch (Exception e) {
148 tag = 0;
149 }
150 return tag;
151 }
152
153 public void setFloat(String field, float value) {
154 setString(field, Float.toString(value));
155 }
156
157 public long getLong(String field) {
158 long tag = 0;
159 try {
160 tag = Long.parseLong(getString(field));
161 } catch (Exception e) {
162 tag = 0;
163 }
164 return tag;
165 }
166
167 public void setLong(String field, long value) {
168 setString(field, Long.toString(value));
169 }
170
171 public int getInteger(String field) {
172 int tag = 0;
173 try {
174 tag = Integer.parseInt(getString(field));
175 } catch (Exception e) {
176 tag = 0;
177 }
178 return tag;
179 }
180
181 public void setInteger(String field, int value) {
182 setString(field, Integer.toString(value));
183 }
184
185 public String getId() {
186 return getString(ID);
187 }
188
189 public void setId(String src) {
190 setString(ID, src);
191 }
192
193 public String getKind() {
194 return getString(KIND);
195 }
196
197 public void setKind(String src) {
198 setString(KIND, src);
199 }
200
201 public String getMark() {
202 return getString(MARK);
203 }
204
205 public void setMark(String src) {
206 setString(MARK, src);
207 }
208
209 public Date getCreated() {
210 return new Date(getLong(CREATED));
211 }
212
213 public Date getUpdated() {
214 return new Date(getLong(UPDATED));
215 }
216
217 public boolean exists() {
218 if (handler == null) {
219 return false;
220 } else {
221 return handler.exists(getId());
222 }
223 }
224
225 public void save() {
226 if (handler != null) {
227 long now = new Date().getTime();
228 if (handler.exists(getId())) {
229 setLong(UPDATED, now);
230 handler.update(this);
231 } else {
232 setLong(CREATED, now);
233 setLong(UPDATED, now);
234 handler.create(this);
235 }
236 }
237 }
238
239 public int count(String kind, Query query, int max) {
240 if (handler != null) {
241 return handler.count(kind, query, max);
242 }
243 return 0;
244 }
245
246 public int count(String kind, Query query, Sort sort, int max) {
247 if (handler != null) {
248 return handler.count(kind, query, sort, max);
249 }
250 return 0;
251 }
252
253 public int count(String kind, Query query, Filter filter, int max) {
254 if (handler != null) {
255 return handler.count(kind, query, filter, max);
256 }
257 return 0;
258 }
259
260 public int count(String kind, Query query, Filter filter, Sort sort, int max) {
261 if (handler != null) {
262 return handler.count(kind, query, filter, sort, max);
263 }
264 return 0;
265 }
266
267 public List<SEntity> search(String kind, Query query, int max) {
268 if (handler != null) {
269 return handler.search(kind, query, max);
270 }
271 return new ArrayList<SEntity>();
272 }
273
274 public List<SEntity> search(String kind, Query query, Sort sort, int max) {
275 if (handler != null) {
276 return handler.search(kind, query, sort, max);
277 }
278 return new ArrayList<SEntity>();
279 }
280
281 public List<SEntity> search(String kind, Query query, Filter filter, int max) {
282 if (handler != null) {
283 return handler.search(kind, query, filter, max);
284 }
285 return new ArrayList<SEntity>();
286 }
287
288 public List<SEntity> search(String kind, Query query, Filter filter, Sort sort, int max) {
289 if (handler != null) {
290 return handler.search(kind, query, filter, sort, max);
291 }
292 return new ArrayList<SEntity>();
293 }
294
295 public List<SEntity> search(String kind, Query query, int pagesize, int pageno) {
296 if (handler != null) {
297 return handler.search(kind, query, pagesize, pageno);
298 }
299 return new ArrayList<SEntity>();
300 }
301
302 public List<SEntity> search(String kind, Query query, Sort sort, int pagesize, int pageno) {
303 if (handler != null) {
304 return handler.search(kind, query, sort, pagesize, pageno);
305 }
306 return new ArrayList<SEntity>();
307 }
308
309 public List<SEntity> search(String kind, Query query, Filter filter, int pagesize, int pageno) {
310 if (handler != null) {
311 return handler.search(kind, query, filter, pagesize, pageno);
312 }
313 return new ArrayList<SEntity>();
314 }
315
316 public List<SEntity> search(String kind, Query query, Filter filter, Sort sort, int max, int pagesize, int pageno) {
317 if (handler != null) {
318 return handler.search(kind, query, filter, sort, pagesize, pageno);
319 }
320 return new ArrayList<SEntity>();
321 }
322
323 public void load(String id) {
324 if (handler != null) {
325 handler.load(id, this);
326 }
327 }
328
329 public BooleanQuery newBooleanQuery() {
330 return new BooleanQuery();
331 }
332
333 public BooleanClause newBooleanClause(Query query, Occur occur) {
334 return new BooleanClause(query, occur);
335 }
336
337 public Occur occurMust() {
338 return Occur.MUST;
339 }
340
341 public Occur occurMustNot() {
342 return Occur.MUST_NOT;
343 }
344
345 public Occur occurShould() {
346 return Occur.SHOULD;
347 }
348
349 public MatchAllDocsQuery newMatchAllDocsQuery() {
350 return new MatchAllDocsQuery();
351 }
352
353 public MultiPhraseQuery newMultiPhraseQuery() {
354 return new MultiPhraseQuery();
355 }
356
357 public PhraseQuery newPhraseQuery() {
358 return new PhraseQuery();
359 }
360
361 public NGramPhraseQuery newNGramPhraseQuery(int n) {
362 return new NGramPhraseQuery(n);
363 }
364
365 public Term newTerm(String field, String value) {
366 return new Term(field, value);
367 }
368
369 public NumericRangeQuery<Double> newDoubleRangeQuery(String field, Double min, Double max, boolean minInclusive, boolean maxInclusive) {
370 return NumericRangeQuery.newDoubleRange(field, min, max, minInclusive, maxInclusive);
371 }
372
373 public NumericRangeQuery<Double> newDoubleRangeQuery(String field, int precisionStep, Double min, Double max, boolean minInclusive, boolean maxInclusive) {
374 return NumericRangeQuery.newDoubleRange(field, precisionStep, min, max, minInclusive, maxInclusive);
375 }
376
377 public NumericRangeQuery<Float> newFloatRangeQuery(String field, Float min, Float max, boolean minInclusive, boolean maxInclusive) {
378 return NumericRangeQuery.newFloatRange(field, min, max, minInclusive, maxInclusive);
379 }
380
381 public NumericRangeQuery<Float> newFloatRangeQuery(String field, int precisionStep, Float min, Float max, boolean minInclusive, boolean maxInclusive) {
382 return NumericRangeQuery.newFloatRange(field, precisionStep, min, max, minInclusive, maxInclusive);
383 }
384
385 public NumericRangeQuery<Integer> newIntegerRangeQuery(String field, Integer min, Integer max, boolean minInclusive, boolean maxInclusive) {
386 return NumericRangeQuery.newIntRange(field, min, max, minInclusive, maxInclusive);
387 }
388
389 public NumericRangeQuery<Integer> newIntegerRangeQuery(String field, int precisionStep, Integer min, Integer max, boolean minInclusive, boolean maxInclusive) {
390 return NumericRangeQuery.newIntRange(field, precisionStep, min, max, minInclusive, maxInclusive);
391 }
392
393 public NumericRangeQuery<Long> newLongRangeQuery(String field, Long min, Long max, boolean minInclusive, boolean maxInclusive) {
394 return NumericRangeQuery.newLongRange(field, min, max, minInclusive, maxInclusive);
395 }
396
397 public NumericRangeQuery<Long> newLongRangeQuery(String field, int precisionStep, Long min, Long max, boolean minInclusive, boolean maxInclusive) {
398 return NumericRangeQuery.newLongRange(field, precisionStep, min, max, minInclusive, maxInclusive);
399 }
400
401 public PrefixQuery newPrefixQuery(Term term) {
402 return new PrefixQuery(term);
403 }
404
405 public TermQuery newTermQuery(Term term) {
406 return new TermQuery(term);
407 }
408
409 public TermRangeQuery newTermRangeQuery(String field, String lowerTerm, String upperTerm, boolean includeLower, boolean includeUpper) {
410 return new TermRangeQuery(field, lowerTerm, upperTerm, includeLower, includeUpper);
411 }
412
413 public WildcardQuery newWildcardQuery(Term term) {
414 return new WildcardQuery(term);
415 }
416
417 public FieldValueFilter newFieldValueFilter(String field, boolean negate) {
418 return new FieldValueFilter(field, negate);
419 }
420
421 public NumericRangeFilter<Double> newDoubleRangeFilter(String field, Double min, Double max, boolean minInclusive, boolean maxInclusive) {
422 return NumericRangeFilter.newDoubleRange(field, min, max, minInclusive, maxInclusive);
423 }
424
425 public NumericRangeFilter<Double> newDoubleRangeFilter(String field, int precisionStep, Double min, Double max, boolean minInclusive, boolean maxInclusive) {
426 return NumericRangeFilter.newDoubleRange(field, precisionStep, min, max, minInclusive, maxInclusive);
427 }
428
429 public NumericRangeFilter<Float> newFloatRangeFilter(String field, Float min, Float max, boolean minInclusive, boolean maxInclusive) {
430 return NumericRangeFilter.newFloatRange(field, min, max, minInclusive, maxInclusive);
431 }
432
433 public NumericRangeFilter<Float> newFloatRangeFilter(String field, int precisionStep, Float min, Float max, boolean minInclusive, boolean maxInclusive) {
434 return NumericRangeFilter.newFloatRange(field, precisionStep, min, max, minInclusive, maxInclusive);
435 }
436
437 public NumericRangeFilter<Integer> newIntegerRangeFilter(String field, Integer min, Integer max, boolean minInclusive, boolean maxInclusive) {
438 return NumericRangeFilter.newIntRange(field, min, max, minInclusive, maxInclusive);
439 }
440
441 public NumericRangeFilter<Integer> newIntegerRangeFilter(String field, int precisionStep, Integer min, Integer max, boolean minInclusive, boolean maxInclusive) {
442 return NumericRangeFilter.newIntRange(field, precisionStep, min, max, minInclusive, maxInclusive);
443 }
444
445 public NumericRangeFilter<Long> newLongRangeFilter(String field, Long min, Long max, boolean minInclusive, boolean maxInclusive) {
446 return NumericRangeFilter.newLongRange(field, min, max, minInclusive, maxInclusive);
447 }
448
449 public NumericRangeFilter<Long> newLongRangeFilter(String field, int precisionStep, Long min, Long max, boolean minInclusive, boolean maxInclusive) {
450 return NumericRangeFilter.newLongRange(field, precisionStep, min, max, minInclusive, maxInclusive);
451 }
452
453 public PrefixFilter newPrefixFilter(Term term) {
454 return new PrefixFilter(term);
455 }
456
457 public QueryWrapperFilter newQueryWrapperFilter(Query query) {
458 return new QueryWrapperFilter(query);
459 }
460
461 public TermRangeFilter newTermRangeFilter(String fieldName, String lowerTerm, String upperTerm, boolean includeLower, boolean includeUpper) {
462 return new TermRangeFilter(fieldName, lowerTerm, upperTerm, includeLower, includeUpper);
463 }
464
465 public SortField newSortField(String field, int type, boolean reverse) {
466 return new SortField(field, type, reverse);
467 }
468
469 public Sort newSort() {
470 return new Sort();
471 }
472
473 public Sort newSort(SortField... fields) {
474 return new Sort(fields);
475 }
476
477 public Sort newSort(SortField field) {
478 return new Sort(field);
479 }
480
481 public Query parseQuery(String[] queries, String[] fields) throws Exception {
482 return MultiFieldQueryParser.parse(Version.LUCENE_36, queries, fields, new StandardAnalyzer(Version.LUCENE_36));
483 }
484
485 public Query parseQuery(String[] queries, String[] fields, BooleanClause.Occur[] flags) throws Exception {
486 return MultiFieldQueryParser.parse(Version.LUCENE_36, queries, fields, flags, new StandardAnalyzer(Version.LUCENE_36));
487 }
488
489 public Query parseQuery(String query, String[] fields, BooleanClause.Occur[] flags) throws Exception {
490 return MultiFieldQueryParser.parse(Version.LUCENE_36, query, fields, flags, new StandardAnalyzer(Version.LUCENE_36));
491 }
492
493 public String highlight(Query query, String text, String field, int fragmentSize, int maxNumFragments, String separator) throws Exception {
494 Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
495 CachingTokenFilter tokenStream = new CachingTokenFilter(analyzer.tokenStream(field, new StringReader(text)));
496 SimpleHTMLFormatter formatter = new SimpleHTMLFormatter();
497 Scorer scorer = new org.apache.lucene.search.highlight.QueryScorer(query);
498 Highlighter highlighter = new Highlighter(formatter, scorer);
499 highlighter.setTextFragmenter(new SimpleFragmenter(fragmentSize));
500 tokenStream.reset();
501 String rv = highlighter.getBestFragments(tokenStream, text, maxNumFragments, separator);
502 return rv.length() == 0 ? text : rv;
503 }
504
505 protected void registerDefault() {
506 register(SCHEMA, "s");
507 register(ID, "s");
508 register(CREATED, "l");
509 register(UPDATED, "l");
510 register(KIND, "s");
511 register(MARK, "s");
512 }
513
514 protected void saveSchema() {
515 String tag = "";
516 for (Object key : schema.keySet()) {
517 if (tag.length() > 0) tag += "|";
518 tag += schema.get(key) + "|" + key;
519 }
520 data.put(SCHEMA, tag);
521 }
522
523 protected void loadSchema() {
524 String src = data.getProperty(SCHEMA);
525 if (src == null) src = "";
526 String[] fields = src.split("\\|");
527 schema.clear();
528 for (int i = 0; i < fields.length && i + 1 < fields.length; i+= 2) {
529 register(fields[i + 1], fields[i]);
530 }
531 registerDefault();
532
533 String tag = "";
534 for (Object key : schema.keySet()) {
535 if (tag.length() > 0) tag += "|";
536 tag += schema.get(key) + "|" + key;
537 }
538 data.put(SCHEMA, tag);
539 }
540
541 public void delete() {
542 delete(getId());
543 }
544
545 public void delete(String id) {
546 if (handler != null) {
547 handler.delete(id);
548 }
549 }
550
551 public SortField sortFieldDoc() {
552 return SortField.FIELD_DOC;
553 }
554
555 public SortField sortFieldScore() {
556 return SortField.FIELD_SCORE;
557 }
558
559 public int sortFieldLong() {
560 return SortField.LONG;
561 }
562
563 public int sortFieldInteger() {
564 return SortField.INT;
565 }
566
567 public int sortFieldDouble() {
568 return SortField.DOUBLE;
569 }
570
571 public int sortFieldFloat() {
572 return SortField.FLOAT;
573 }
574
575 public int sortFieldString() {
576 return SortField.STRING_VAL;
577 }
578
579 public double storageQuota() {
580 if (handler != null) {
581 return handler.storageQuota();
582 }
583 return 0;
584 }
585
586 public double storageSize() {
587 if (handler != null) {
588 return handler.storageSize();
589 }
590 return 0;
591 }
592
593 public static class Handler {
594
595 public boolean exists(String id) { return false; }
596 public void create(SEntity src) { }
597 public void update(SEntity src) { }
598 public void load(String id, SEntity src) { }
599 public void delete(String id) { }
600 public List<SEntity> search(String kind, Query query, int max) { return new ArrayList<SEntity>(); }
601 public List<SEntity> search(String kind, Query query, Sort sort, int max) { return new ArrayList<SEntity>(); }
602 public List<SEntity> search(String kind, Query query, Filter filter, int max) { return new ArrayList<SEntity>(); }
603 public List<SEntity> search(String kind, Query query, Filter filter, Sort sort, int max) { return new ArrayList<SEntity>(); }
604 public List<SEntity> search(String kind, Query query, int pagesize, int pageno) { return new ArrayList<SEntity>(); }
605 public List<SEntity> search(String kind, Query query, Sort sort, int pagesize, int pageno) { return new ArrayList<SEntity>(); }
606 public List<SEntity> search(String kind, Query query, Filter filter, int pagesize, int pageno) { return new ArrayList<SEntity>(); }
607 public List<SEntity> search(String kind, Query query, Filter filter, Sort sort, int pagesize, int pageno) { return new ArrayList<SEntity>(); }
608 public int count(String kind, Query query, int max) { return 0; }
609 public int count(String kind, Query query, Sort sort, int max) { return 0; }
610 public int count(String kind, Query query, Filter filter, int max) { return 0; }
611 public int count(String kind, Query query, Filter filter, Sort sort, int max) { return 0; }
612 public double storageQuota() { return 0; }
613 public double storageSize() { return 0; }
614
615 }
616
617}
package com.paesia.schema.script.safe.lucene;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Properties;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FieldValueFilter;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.MultiPhraseQuery;
import org.apache.lucene.search.NGramPhraseQuery;
import org.apache.lucene.search.NumericRangeFilter;
import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PrefixFilter;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryWrapperFilter;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeFilter;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.Scorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.util.Version;

public class SEntity {

    public static final String STRING = "s";
    public static final String DOUBLE = "d";
    public static final String FLOAT = "f";
    public static final String INTEGER = "i";
    public static final String LONG = "l";
    public static final String ANALYZED = "a";
 
    public static final String ALL_KINDS = "|s|d|f|i|l|a|";
 
    public static final String SCHEMA = "F4f8cc93237f50";
    public static final String ID = "F4f8cce61643dd";
    public static final String CREATED = "F4f8cd83fcca31";
    public static final String UPDATED = "F4f8cd84e2b74a";
    public static final String KIND = "F4f8cd9c8ee13d";
    public static final String MARK = "F4f8cda27d62fb";

    protected Properties data = new Properties();
    protected Properties schema = new Properties();
    protected Handler handler = null;
 
    /**
     * Creates an entity bound to the given handler and registers the built-in
     * fields through registerDefault().
     *
     * @param handler backend the persistence and search methods delegate to;
     *                may be null, in which case those methods skip the call
     */
    public SEntity(Handler handler) {
        this.handler = handler;
        registerDefault();
    }
 
    /**
     * Declares a field and its kind in the schema, then rewrites the
     * serialized schema through saveSchema().
     *
     * The call is silently ignored when the kind is not one of the
     * single-letter codes listed in ALL_KINDS, or when the field name is null
     * or contains the '|' separator.
     *
     * @param field field name to declare
     * @param type  kind code, e.g. STRING or LONG
     */
    public void register(String field, String type) {
        // Kinds are single letters. Without the length check a value such as
        // "s|d" is found inside ALL_KINDS and would be stored as a kind,
        // breaking the "kind|field|kind|field" layout of the saved schema.
        if (type == null || type.length() != 1) return;
        if (ALL_KINDS.indexOf("|" + type + "|") < 0) return;
        // '|' separates tokens in the saved schema, so a name cannot carry it;
        // a null name cannot be stored in the Properties table at all.
        if (field == null || field.indexOf('|') >= 0) return;
        schema.put(field, type);
        saveSchema();
    }
 
    /**
     * Replaces the schema with the one described by a '|'-separated list of
     * (kind, field) pairs, then re-adds the built-in fields and saves.
     *
     * @param src serialized schema in the form "kind|field|kind|field"
     */
    public void setSchema(String src) {
        final String[] tokens = src.split("\\|");
        schema.clear();
        // Walk the tokens two at a time; an unpaired trailing token is skipped.
        int pos = 0;
        while (pos + 1 < tokens.length) {
            register(tokens[pos + 1], tokens[pos]);
            pos += 2;
        }
        registerDefault();
        saveSchema();
    }
 
    /**
     * @return the serialized schema stored under SCHEMA, or an empty string
     *         when none is stored
     */
    public String getSchema() {
        final String stored = data.getProperty(SCHEMA);
        return stored == null ? "" : stored;
    }
 
    /**
     * Replaces this entity's data with the properties parsed from the given
     * text and rebuilds the schema from the loaded SCHEMA entry.
     *
     * Parsing is best effort: a null or malformed source does not throw, it
     * leaves whatever was parsed before the failure.
     *
     * @param src text in java.util.Properties format, e.g. from toString()
     */
    public void fromString(String src) {
        data.clear();
        schema.clear();
        try {
            // Parse the characters directly. Properties.load(InputStream)
            // decodes bytes as ISO-8859-1, so handing it UTF-8 bytes garbled
            // every non-ASCII character that was not written as a \\u escape.
            data.load(new StringReader(src));
        } catch (Exception e) {
            // Deliberately ignored (best effort, see above); this also covers
            // the NullPointerException raised for a null source.
        }
        loadSchema();
    }
 
    /**
     * Serializes the entity's data in java.util.Properties format.
     *
     * @return the serialized text, or an empty string if serialization fails
     */
    public String toString() {
        String tag = "";
        try {
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            data.store(baos, "");
            // Properties.store(OutputStream) writes ISO-8859-1; decode with
            // that charset instead of whatever the platform default happens
            // to be.
            tag = baos.toString("ISO-8859-1");
        } catch (Exception e) {
            // Best effort: fall through and return the empty string.
        }
        return tag;
    }
 
    /**
     * @param field field name
     * @return the stored text of the field, or an empty string when absent
     */
    public String getString(String field) {
        final String stored = data.getProperty(field);
        return stored == null ? "" : stored;
    }
 
    /**
     * Stores a text value; fields that are not registered in the schema are
     * ignored, and a null value is stored as an empty string.
     *
     * @param field field name
     * @param value text to store
     */
    public void setString(String field, String value) {
        if (!schema.containsKey(field)) {
            return;
        }
        data.setProperty(field, value == null ? "" : value);
    }
 
    /**
     * @param field field name
     * @return the field parsed as a double, or 0 when it is absent or not
     *         parseable
     */
    public double getDouble(String field) {
        try {
            return Double.parseDouble(getString(field));
        } catch (Exception e) {
            return 0;
        }
    }
 
    /**
     * Stores a double as its decimal text through setString.
     *
     * @param field field name
     * @param value number to store
     */
    public void setDouble(String field, double value) {
        final String text = String.valueOf(value);
        setString(field, text);
    }

    /**
     * @param field field name
     * @return the field parsed as a float, or 0 when it is absent or not
     *         parseable
     */
    public float getFloat(String field) {
        try {
            return Float.parseFloat(getString(field));
        } catch (Exception e) {
            return 0;
        }
    }
 
    /**
     * Stores a float as its decimal text through setString.
     *
     * @param field field name
     * @param value number to store
     */
    public void setFloat(String field, float value) {
        final String text = String.valueOf(value);
        setString(field, text);
    }

    /**
     * @param field field name
     * @return the field parsed as a long, or 0 when it is absent or not
     *         parseable
     */
    public long getLong(String field) {
        try {
            return Long.parseLong(getString(field));
        } catch (Exception e) {
            return 0;
        }
    }
 
    /**
     * Stores a long as its decimal text through setString.
     *
     * @param field field name
     * @param value number to store
     */
    public void setLong(String field, long value) {
        final String text = String.valueOf(value);
        setString(field, text);
    }

    /**
     * @param field field name
     * @return the field parsed as an int, or 0 when it is absent or not
     *         parseable
     */
    public int getInteger(String field) {
        try {
            return Integer.parseInt(getString(field));
        } catch (Exception e) {
            return 0;
        }
    }
 
    /**
     * Stores an int as its decimal text through setString.
     *
     * @param field field name
     * @param value number to store
     */
    public void setInteger(String field, int value) {
        final String text = String.valueOf(value);
        setString(field, text);
    }
 
    /** @return the text stored under the ID field, or an empty string when unset */
    public String getId() {
        return getString(ID);
    }
 
    /**
     * Stores the given text under the ID field.
     *
     * @param src new identifier
     */
    public void setId(String src) {
        setString(ID, src);
    }

    /** @return the text stored under the KIND field, or an empty string when unset */
    public String getKind() {
        return getString(KIND);
    }
 
    /**
     * Stores the given text under the KIND field.
     *
     * @param src new kind
     */
    public void setKind(String src) {
        setString(KIND, src);
    }
 
    /** @return the text stored under the MARK field, or an empty string when unset */
    public String getMark() {
        return getString(MARK);
    }
 
    /**
     * Stores the given text under the MARK field.
     *
     * @param src new mark
     */
    public void setMark(String src) {
        setString(MARK, src);
    }
 
    /**
     * @return the CREATED field as a Date built from its stored millisecond
     *         value; an unset field yields the Date for 0
     */
    public Date getCreated() {
        final long millis = getLong(CREATED);
        return new Date(millis);
    }
 
    /**
     * @return the UPDATED field as a Date built from its stored millisecond
     *         value; an unset field yields the Date for 0
     */
    public Date getUpdated() {
        final long millis = getLong(UPDATED);
        return new Date(millis);
    }
 
    /**
     * @return what the handler reports for this entity's id; false when there
     *         is no handler
     */
    public boolean exists() {
        return handler != null && handler.exists(getId());
    }
 
    /**
     * Persists this entity through the handler: update when the handler says
     * the id already exists, create otherwise. UPDATED is stamped with the
     * current time in both cases, CREATED only on creation. Does nothing
     * without a handler.
     */
    public void save() {
        if (handler == null) {
            return;
        }
        final long stamp = System.currentTimeMillis();
        final boolean known = handler.exists(getId());
        if (!known) {
            setLong(CREATED, stamp);
        }
        setLong(UPDATED, stamp);
        if (known) {
            handler.update(this);
        } else {
            handler.create(this);
        }
    }

    /**
     * Delegates to the handler's count with the same arguments.
     *
     * @return the handler's result, or 0 when there is no handler
     */
    public int count(String kind, Query query, int max) {
        return handler == null ? 0 : handler.count(kind, query, max);
    }
 
    /**
     * Delegates to the handler's sorted count with the same arguments.
     *
     * @return the handler's result, or 0 when there is no handler
     */
    public int count(String kind, Query query, Sort sort, int max) {
        return handler == null ? 0 : handler.count(kind, query, sort, max);
    }
 
    /**
     * Delegates to the handler's filtered count with the same arguments.
     *
     * @return the handler's result, or 0 when there is no handler
     */
    public int count(String kind, Query query, Filter filter, int max) {
        return handler == null ? 0 : handler.count(kind, query, filter, max);
    }
 
    /**
     * Delegates to the handler's filtered and sorted count with the same
     * arguments.
     *
     * @return the handler's result, or 0 when there is no handler
     */
    public int count(String kind, Query query, Filter filter, Sort sort, int max) {
        return handler == null ? 0 : handler.count(kind, query, filter, sort, max);
    }
 
    /**
     * Delegates to the handler's search with the same arguments.
     *
     * @return the handler's result, or a new empty list when there is no
     *         handler
     */
    public List<SEntity> search(String kind, Query query, int max) {
        if (handler == null) {
            return new ArrayList<SEntity>();
        }
        return handler.search(kind, query, max);
    }
 
    /**
     * Delegates to the handler's sorted search with the same arguments.
     *
     * @return the handler's result, or a new empty list when there is no
     *         handler
     */
    public List<SEntity> search(String kind, Query query, Sort sort, int max) {
        if (handler == null) {
            return new ArrayList<SEntity>();
        }
        return handler.search(kind, query, sort, max);
    }
 
    /**
     * Delegates to the handler's filtered search with the same arguments.
     *
     * @return the handler's result, or a new empty list when there is no
     *         handler
     */
    public List<SEntity> search(String kind, Query query, Filter filter, int max) {
        if (handler == null) {
            return new ArrayList<SEntity>();
        }
        return handler.search(kind, query, filter, max);
    }
 
    /**
     * Delegates to the handler's filtered and sorted search with the same
     * arguments.
     *
     * @return the handler's result, or a new empty list when there is no
     *         handler
     */
    public List<SEntity> search(String kind, Query query, Filter filter, Sort sort, int max) {
        if (handler == null) {
            return new ArrayList<SEntity>();
        }
        return handler.search(kind, query, filter, sort, max);
    }
 
    /**
     * Forwards to {@code handler.search(kind, query, pagesize, pageno)}.
     *
     * @return the handler's result, or a new empty list when no handler is set
     */
    public List<SEntity> search(String kind, Query query, int pagesize, int pageno) {
        if (handler == null) {
            return new ArrayList<SEntity>();
        }
        return handler.search(kind, query, pagesize, pageno);
    }
 
    /**
     * Forwards to {@code handler.search(kind, query, sort, pagesize, pageno)}.
     *
     * @return the handler's result, or a new empty list when no handler is set
     */
    public List<SEntity> search(String kind, Query query, Sort sort, int pagesize, int pageno) {
        return handler == null
                ? new ArrayList<SEntity>()
                : handler.search(kind, query, sort, pagesize, pageno);
    }
 
    /**
     * Forwards to {@code handler.search(kind, query, filter, pagesize, pageno)}.
     *
     * @return the handler's result, or a new empty list when no handler is set
     */
    public List<SEntity> search(String kind, Query query, Filter filter, int pagesize, int pageno) {
        if (handler == null) {
            return new ArrayList<SEntity>();
        }
        return handler.search(kind, query, filter, pagesize, pageno);
    }
 
    /**
     * Forwards to {@code handler.search(kind, query, filter, sort, pagesize, pageno)}.
     *
     * Note: the {@code max} argument is accepted but is not passed on to the
     * handler; only {@code pagesize} and {@code pageno} are forwarded.
     *
     * @return the handler's result, or a new empty list when no handler is set
     */
    public List<SEntity> search(String kind, Query query, Filter filter, Sort sort, int max, int pagesize, int pageno) {
        return handler == null
                ? new ArrayList<SEntity>()
                : handler.search(kind, query, filter, sort, pagesize, pageno);
    }
 
    /**
     * Asks the handler to load the record with the given id into this entity.
     * Does nothing when no handler is set.
     */
    public void load(String id) {
        if (handler == null) {
            return;
        }
        handler.load(id, this);
    }
 
    /** Returns a newly constructed {@link BooleanQuery}. */
    public BooleanQuery newBooleanQuery() {
        BooleanQuery created = new BooleanQuery();
        return created;
    }
 
    /** Returns a new {@link BooleanClause} built from the given query and occurrence flag. */
    public BooleanClause newBooleanClause(Query query, Occur occur) {
        BooleanClause clause = new BooleanClause(query, occur);
        return clause;
    }
 
    /** Returns the {@code Occur.MUST} constant. */
    public Occur occurMust() {
        Occur flag = Occur.MUST;
        return flag;
    }
 
    /** Returns the {@code Occur.MUST_NOT} constant. */
    public Occur occurMustNot() {
        Occur flag = Occur.MUST_NOT;
        return flag;
    }
 
    /** Returns the {@code Occur.SHOULD} constant. */
    public Occur occurShould() {
        Occur flag = Occur.SHOULD;
        return flag;
    }

    /** Returns a newly constructed {@link MatchAllDocsQuery}. */
    public MatchAllDocsQuery newMatchAllDocsQuery() {
        MatchAllDocsQuery created = new MatchAllDocsQuery();
        return created;
    }
 
    /** Returns a newly constructed {@link MultiPhraseQuery}. */
    public MultiPhraseQuery newMultiPhraseQuery() {
        MultiPhraseQuery created = new MultiPhraseQuery();
        return created;
    }
 
    /** Returns a newly constructed {@link PhraseQuery}. */
    public PhraseQuery newPhraseQuery() {
        PhraseQuery created = new PhraseQuery();
        return created;
    }
 
    /** Returns a new {@link NGramPhraseQuery} constructed with {@code n}. */
    public NGramPhraseQuery newNGramPhraseQuery(int n) {
        NGramPhraseQuery created = new NGramPhraseQuery(n);
        return created;
    }
 
    /** Returns a new {@link Term} for the given field name and value. */
    public Term newTerm(String field, String value) {
        Term term = new Term(field, value);
        return term;
    }
 
    /**
     * Builds a double range query through {@code NumericRangeQuery.newDoubleRange},
     * passing all arguments through unchanged.
     */
    public NumericRangeQuery<Double> newDoubleRangeQuery(String field, Double min, Double max, boolean minInclusive, boolean maxInclusive) {
        NumericRangeQuery<Double> range = NumericRangeQuery.newDoubleRange(field, min, max, minInclusive, maxInclusive);
        return range;
    }
 
    /**
     * Builds a double range query through {@code NumericRangeQuery.newDoubleRange},
     * additionally passing the caller-supplied {@code precisionStep}.
     */
    public NumericRangeQuery<Double> newDoubleRangeQuery(String field, int precisionStep, Double min, Double max, boolean minInclusive, boolean maxInclusive) {
        NumericRangeQuery<Double> range = NumericRangeQuery.newDoubleRange(field, precisionStep, min, max, minInclusive, maxInclusive);
        return range;
    }

    /**
     * Builds a float range query through {@code NumericRangeQuery.newFloatRange},
     * passing all arguments through unchanged.
     */
    public NumericRangeQuery<Float> newFloatRangeQuery(String field, Float min, Float max, boolean minInclusive, boolean maxInclusive) {
        NumericRangeQuery<Float> range = NumericRangeQuery.newFloatRange(field, min, max, minInclusive, maxInclusive);
        return range;
    }

    /**
     * Builds a float range query through {@code NumericRangeQuery.newFloatRange},
     * additionally passing the caller-supplied {@code precisionStep}.
     */
    public NumericRangeQuery<Float> newFloatRangeQuery(String field, int precisionStep, Float min, Float max, boolean minInclusive, boolean maxInclusive) {
        NumericRangeQuery<Float> range = NumericRangeQuery.newFloatRange(field, precisionStep, min, max, minInclusive, maxInclusive);
        return range;
    }

    /**
     * Builds an integer range query through {@code NumericRangeQuery.newIntRange},
     * passing all arguments through unchanged.
     */
    public NumericRangeQuery<Integer> newIntegerRangeQuery(String field, Integer min, Integer max, boolean minInclusive, boolean maxInclusive) {
        NumericRangeQuery<Integer> range = NumericRangeQuery.newIntRange(field, min, max, minInclusive, maxInclusive);
        return range;
    }
 
    /**
     * Builds an integer range query through {@code NumericRangeQuery.newIntRange},
     * additionally passing the caller-supplied {@code precisionStep}.
     */
    public NumericRangeQuery<Integer> newIntegerRangeQuery(String field, int precisionStep, Integer min, Integer max, boolean minInclusive, boolean maxInclusive) {
        NumericRangeQuery<Integer> range = NumericRangeQuery.newIntRange(field, precisionStep, min, max, minInclusive, maxInclusive);
        return range;
    }
 
    /**
     * Builds a long range query through {@code NumericRangeQuery.newLongRange},
     * passing all arguments through unchanged.
     */
    public NumericRangeQuery<Long> newLongRangeQuery(String field, Long min, Long max, boolean minInclusive, boolean maxInclusive) {
        NumericRangeQuery<Long> range = NumericRangeQuery.newLongRange(field, min, max, minInclusive, maxInclusive);
        return range;
    }

    /**
     * Builds a long range query through {@code NumericRangeQuery.newLongRange},
     * additionally passing the caller-supplied {@code precisionStep}.
     */
    public NumericRangeQuery<Long> newLongRangeQuery(String field, int precisionStep, Long min, Long max, boolean minInclusive, boolean maxInclusive) {
        NumericRangeQuery<Long> range = NumericRangeQuery.newLongRange(field, precisionStep, min, max, minInclusive, maxInclusive);
        return range;
    }
 
    /** Returns a new {@link PrefixQuery} built from the given term. */
    public PrefixQuery newPrefixQuery(Term term) {
        PrefixQuery created = new PrefixQuery(term);
        return created;
    }
 
    /** Returns a new {@link TermQuery} built from the given term. */
    public TermQuery newTermQuery(Term term) {
        TermQuery created = new TermQuery(term);
        return created;
    }
 
    /**
     * Returns a new {@link TermRangeQuery}; every argument is handed to its
     * constructor in the order received.
     */
    public TermRangeQuery newTermRangeQuery(String field, String lowerTerm, String upperTerm, boolean includeLower, boolean includeUpper) {
        TermRangeQuery range = new TermRangeQuery(field, lowerTerm, upperTerm, includeLower, includeUpper);
        return range;
    }
 
    /** Returns a new {@link WildcardQuery} built from the given term. */
    public WildcardQuery newWildcardQuery(Term term) {
        WildcardQuery created = new WildcardQuery(term);
        return created;
    }
 
    /** Returns a new {@link FieldValueFilter} built from the given field name and negate flag. */
    public FieldValueFilter newFieldValueFilter(String field, boolean negate) {
        FieldValueFilter created = new FieldValueFilter(field, negate);
        return created;
    }
 
    /**
     * Builds a double range filter through {@code NumericRangeFilter.newDoubleRange},
     * passing all arguments through unchanged.
     */
    public NumericRangeFilter<Double> newDoubleRangeFilter(String field, Double min, Double max, boolean minInclusive, boolean maxInclusive) {
        NumericRangeFilter<Double> range = NumericRangeFilter.newDoubleRange(field, min, max, minInclusive, maxInclusive);
        return range;
    }

    /**
     * Builds a double range filter through {@code NumericRangeFilter.newDoubleRange},
     * additionally passing the caller-supplied {@code precisionStep}.
     */
    public NumericRangeFilter<Double> newDoubleRangeFilter(String field, int precisionStep, Double min, Double max, boolean minInclusive, boolean maxInclusive) {
        NumericRangeFilter<Double> range = NumericRangeFilter.newDoubleRange(field, precisionStep, min, max, minInclusive, maxInclusive);
        return range;
    }

    /**
     * Builds a float range filter through {@code NumericRangeFilter.newFloatRange},
     * passing all arguments through unchanged.
     */
    public NumericRangeFilter<Float> newFloatRangeFilter(String field, Float min, Float max, boolean minInclusive, boolean maxInclusive) {
        NumericRangeFilter<Float> range = NumericRangeFilter.newFloatRange(field, min, max, minInclusive, maxInclusive);
        return range;
    }

    /**
     * Builds a float range filter through {@code NumericRangeFilter.newFloatRange},
     * additionally passing the caller-supplied {@code precisionStep}.
     */
    public NumericRangeFilter<Float> newFloatRangeFilter(String field, int precisionStep, Float min, Float max, boolean minInclusive, boolean maxInclusive) {
        NumericRangeFilter<Float> range = NumericRangeFilter.newFloatRange(field, precisionStep, min, max, minInclusive, maxInclusive);
        return range;
    }
 
    /**
     * Builds an integer range filter through {@code NumericRangeFilter.newIntRange},
     * passing all arguments through unchanged.
     */
    public NumericRangeFilter<Integer> newIntegerRangeFilter(String field, Integer min, Integer max, boolean minInclusive, boolean maxInclusive) {
        NumericRangeFilter<Integer> range = NumericRangeFilter.newIntRange(field, min, max, minInclusive, maxInclusive);
        return range;
    }

    /**
     * Builds an integer range filter through {@code NumericRangeFilter.newIntRange},
     * additionally passing the caller-supplied {@code precisionStep}.
     */
    public NumericRangeFilter<Integer> newIntegerRangeFilter(String field, int precisionStep, Integer min, Integer max, boolean minInclusive, boolean maxInclusive) {
        NumericRangeFilter<Integer> range = NumericRangeFilter.newIntRange(field, precisionStep, min, max, minInclusive, maxInclusive);
        return range;
    }
 
    /**
     * Builds a long range filter through {@code NumericRangeFilter.newLongRange},
     * passing all arguments through unchanged.
     */
    public NumericRangeFilter<Long> newLongRangeFilter(String field, Long min, Long max, boolean minInclusive, boolean maxInclusive) {
        NumericRangeFilter<Long> range = NumericRangeFilter.newLongRange(field, min, max, minInclusive, maxInclusive);
        return range;
    }

    /**
     * Builds a long range filter through {@code NumericRangeFilter.newLongRange},
     * additionally passing the caller-supplied {@code precisionStep}.
     */
    public NumericRangeFilter<Long> newLongRangeFilter(String field, int precisionStep, Long min, Long max, boolean minInclusive, boolean maxInclusive) {
        NumericRangeFilter<Long> range = NumericRangeFilter.newLongRange(field, precisionStep, min, max, minInclusive, maxInclusive);
        return range;
    }
 
    /** Returns a new {@link PrefixFilter} built from the given term. */
    public PrefixFilter newPrefixFilter(Term term) {
        PrefixFilter created = new PrefixFilter(term);
        return created;
    }
 
    /** Returns a new {@link QueryWrapperFilter} wrapping the given query. */
    public QueryWrapperFilter newQueryWrapperFilter(Query query) {
        QueryWrapperFilter wrapped = new QueryWrapperFilter(query);
        return wrapped;
    }
 
    /**
     * Returns a new {@link TermRangeFilter}; every argument is handed to its
     * constructor in the order received.
     */
    public TermRangeFilter newTermRangeFilter(String fieldName, String lowerTerm, String upperTerm, boolean includeLower, boolean includeUpper) {
        TermRangeFilter range = new TermRangeFilter(fieldName, lowerTerm, upperTerm, includeLower, includeUpper);
        return range;
    }
 
    /**
     * Returns a new {@link SortField} for the given field name, type constant
     * and reverse flag.
     */
    public SortField newSortField(String field, int type, boolean reverse) {
        SortField sortField = new SortField(field, type, reverse);
        return sortField;
    }
 
    /** Returns a {@link Sort} created with its no-argument constructor. */
    public Sort newSort() {
        Sort created = new Sort();
        return created;
    }

    /** Returns a {@link Sort} constructed from the given sort fields. */
    public Sort newSort(SortField... fields) {
        Sort created = new Sort(fields);
        return created;
    }

    /** Returns a {@link Sort} constructed from a single sort field. */
    public Sort newSort(SortField field) {
        Sort created = new Sort(field);
        return created;
    }
 
    /**
     * Parses {@code queries} against {@code fields} with
     * {@code MultiFieldQueryParser.parse}, using a fresh
     * {@code StandardAnalyzer} for {@code Version.LUCENE_36}.
     *
     * @throws Exception whatever the parser throws
     */
    public Query parseQuery(String[] queries, String[] fields) throws Exception {
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
        return MultiFieldQueryParser.parse(Version.LUCENE_36, queries, fields, analyzer);
    }
 
    /**
     * Parses {@code queries} against {@code fields} with the given occurrence
     * {@code flags} via {@code MultiFieldQueryParser.parse}, using a fresh
     * {@code StandardAnalyzer} for {@code Version.LUCENE_36}.
     *
     * @throws Exception whatever the parser throws
     */
    public Query parseQuery(String[] queries, String[] fields, BooleanClause.Occur[] flags) throws Exception {
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
        return MultiFieldQueryParser.parse(Version.LUCENE_36, queries, fields, flags, analyzer);
    }
 
    /**
     * Parses a single {@code query} string against {@code fields} with the
     * given occurrence {@code flags} via {@code MultiFieldQueryParser.parse},
     * using a fresh {@code StandardAnalyzer} for {@code Version.LUCENE_36}.
     *
     * @throws Exception whatever the parser throws
     */
    public Query parseQuery(String query, String[] fields, BooleanClause.Occur[] flags) throws Exception {
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
        return MultiFieldQueryParser.parse(Version.LUCENE_36, query, fields, flags, analyzer);
    }
 
    /**
     * Produces highlighted fragments of {@code text} for {@code query}.
     *
     * The text is tokenized for {@code field} with a {@code StandardAnalyzer},
     * wrapped in a {@code CachingTokenFilter}, and handed to a
     * {@code Highlighter} configured with a {@code SimpleHTMLFormatter}, a
     * {@code QueryScorer} and a {@code SimpleFragmenter} of {@code fragmentSize}.
     *
     * @return the fragments joined by {@code separator}, or the original
     *         {@code text} when the highlighter returns an empty string
     * @throws Exception whatever analysis or highlighting throws
     */
    public String highlight(Query query, String text, String field, int fragmentSize, int maxNumFragments, String separator) throws Exception {
        Analyzer std = new StandardAnalyzer(Version.LUCENE_36);
        CachingTokenFilter cached = new CachingTokenFilter(std.tokenStream(field, new StringReader(text)));
        Highlighter marker = new Highlighter(
                new SimpleHTMLFormatter(),
                new org.apache.lucene.search.highlight.QueryScorer(query));
        marker.setTextFragmenter(new SimpleFragmenter(fragmentSize));
        cached.reset();
        String fragments = marker.getBestFragments(cached, text, maxNumFragments, separator);
        if (fragments.length() == 0) {
            return text;
        }
        return fragments;
    }
 
    /**
     * Registers the built-in fields, in this order: SCHEMA, ID, CREATED,
     * UPDATED, KIND, MARK. CREATED and UPDATED are registered with type code
     * "l"; the others with type code "s".
     */
    protected void registerDefault() {
        // {field name, type code} pairs, kept in the original registration order.
        String[][] builtins = {
            { SCHEMA, "s" },
            { ID, "s" },
            { CREATED, "l" },
            { UPDATED, "l" },
            { KIND, "s" },
            { MARK, "s" },
        };
        for (String[] entry : builtins) {
            register(entry[0], entry[1]);
        }
    }
 
    /**
     * Serializes the registered schema into {@code data} under the SCHEMA key.
     *
     * The format is {@code type|name|type|name|...}: for every key of
     * {@code schema}, the mapped value is written first, then the key, all
     * separated by '|'. Iteration follows {@code schema.keySet()} order.
     */
    protected void saveSchema() {
        // StringBuilder avoids re-copying the whole string on every iteration.
        StringBuilder tag = new StringBuilder();
        for (Object key : schema.keySet()) {
            if (tag.length() > 0) {
                tag.append('|');
            }
            tag.append(schema.get(key)).append('|').append(key);
        }
        data.put(SCHEMA, tag.toString());
    }

    /**
     * Rebuilds {@code schema} from the serialized SCHEMA property in {@code data}.
     *
     * The property is read as '|'-separated {@code type|name} pairs; a missing
     * property is treated as empty and a trailing unpaired token is ignored.
     * After the stored pairs are registered, the default fields are registered
     * via {@link #registerDefault()} and the resulting schema is written back
     * to {@code data} via {@link #saveSchema()}.
     */
    protected void loadSchema() {
        String src = data.getProperty(SCHEMA);
        if (src == null) {
            src = "";
        }
        String[] fields = src.split("\\|");
        schema.clear();
        // Tokens come in (type, name) pairs; stop when no complete pair is left.
        for (int i = 0; i + 1 < fields.length; i += 2) {
            register(fields[i + 1], fields[i]);
        }
        registerDefault();

        // Re-serialize through the shared helper instead of duplicating its loop.
        saveSchema();
    }
 
    /** Deletes this entity by passing its own id to {@link #delete(String)}. */
    public void delete() {
        String ownId = getId();
        delete(ownId);
    }
 
    /**
     * Asks the handler to delete the record with the given id.
     * Does nothing when no handler is set.
     */
    public void delete(String id) {
        if (handler == null) {
            return;
        }
        handler.delete(id);
    }

    /** Returns the {@code SortField.FIELD_DOC} constant. */
    public SortField sortFieldDoc() {
        SortField byDoc = SortField.FIELD_DOC;
        return byDoc;
    }
    
    /** Returns the {@code SortField.FIELD_SCORE} constant. */
    public SortField sortFieldScore() {
        SortField byScore = SortField.FIELD_SCORE;
        return byScore;
    }
    
    /** Returns the {@code SortField.LONG} type constant. */
    public int sortFieldLong() {
        int type = SortField.LONG;
        return type;
    }
    
    /** Returns the {@code SortField.INT} type constant. */
    public int sortFieldInteger() {
        int type = SortField.INT;
        return type;
    }
    
    /** Returns the {@code SortField.DOUBLE} type constant. */
    public int sortFieldDouble() {
        int type = SortField.DOUBLE;
        return type;
    }
    
    /** Returns the {@code SortField.FLOAT} type constant. */
    public int sortFieldFloat() {
        int type = SortField.FLOAT;
        return type;
    }
    
    /** Returns the {@code SortField.STRING_VAL} type constant. */
    public int sortFieldString() {
        int type = SortField.STRING_VAL;
        return type;
    }

    /**
     * Forwards to {@code handler.storageQuota()}.
     *
     * @return the handler's result, or 0 when no handler is set
     */
    public double storageQuota() {
        if (handler == null) {
            return 0;
        }
        return handler.storageQuota();
    }

    /**
     * Forwards to {@code handler.storageSize()}.
     *
     * @return the handler's result, or 0 when no handler is set
     */
    public double storageSize() {
        if (handler == null) {
            return 0;
        }
        return handler.storageSize();
    }

    /**
     * Default storage backend with no behavior: mutating methods do nothing,
     * {@code exists} returns false, every {@code search} returns a new empty
     * list and every {@code count} and storage figure is 0. Subclasses override
     * the methods they support.
     */
    public static class Handler {

        /** Always returns false. */
        public boolean exists(String id) {
            return false;
        }

        /** Does nothing. */
        public void create(SEntity src) {
        }

        /** Does nothing. */
        public void update(SEntity src) {
        }

        /** Does nothing; {@code src} is left untouched. */
        public void load(String id, SEntity src) {
        }

        /** Does nothing. */
        public void delete(String id) {
        }

        /** Returns a new empty list. */
        public List<SEntity> search(String kind, Query query, int max) {
            return noResults();
        }

        /** Returns a new empty list. */
        public List<SEntity> search(String kind, Query query, Sort sort, int max) {
            return noResults();
        }

        /** Returns a new empty list. */
        public List<SEntity> search(String kind, Query query, Filter filter, int max) {
            return noResults();
        }

        /** Returns a new empty list. */
        public List<SEntity> search(String kind, Query query, Filter filter, Sort sort, int max) {
            return noResults();
        }

        /** Returns a new empty list. */
        public List<SEntity> search(String kind, Query query, int pagesize, int pageno) {
            return noResults();
        }

        /** Returns a new empty list. */
        public List<SEntity> search(String kind, Query query, Sort sort, int pagesize, int pageno) {
            return noResults();
        }

        /** Returns a new empty list. */
        public List<SEntity> search(String kind, Query query, Filter filter, int pagesize, int pageno) {
            return noResults();
        }

        /** Returns a new empty list. */
        public List<SEntity> search(String kind, Query query, Filter filter, Sort sort, int pagesize, int pageno) {
            return noResults();
        }

        /** Always returns 0. */
        public int count(String kind, Query query, int max) {
            return 0;
        }

        /** Always returns 0. */
        public int count(String kind, Query query, Sort sort, int max) {
            return 0;
        }

        /** Always returns 0. */
        public int count(String kind, Query query, Filter filter, int max) {
            return 0;
        }

        /** Always returns 0. */
        public int count(String kind, Query query, Filter filter, Sort sort, int max) {
            return 0;
        }

        /** Always returns 0. */
        public double storageQuota() {
            return 0;
        }

        /** Always returns 0. */
        public double storageSize() {
            return 0;
        }

        /** Creates a fresh, mutable, empty result list for each call. */
        private static List<SEntity> noResults() {
            return new ArrayList<SEntity>();
        }

    }
 
}
com.paesia.schema.script.LuceneHandler class
1package com.paesia.schema.script;
2
3import java.io.BufferedWriter;
4import java.io.File;
5import java.io.FileOutputStream;
6import java.io.OutputStreamWriter;
7import java.util.ArrayList;
8import java.util.List;
9import java.util.Timer;
10import java.util.TimerTask;
11import java.util.UUID;
12
13import org.apache.lucene.analysis.Analyzer;
14import org.apache.lucene.analysis.standard.StandardAnalyzer;
15import org.apache.lucene.document.Document;
16import org.apache.lucene.document.Field;
17import org.apache.lucene.document.Field.Index;
18import org.apache.lucene.document.Field.Store;
19import org.apache.lucene.document.NumericField;
20import org.apache.lucene.index.IndexReader;
21import org.apache.lucene.index.IndexWriter;
22import org.apache.lucene.index.IndexWriterConfig;
23import org.apache.lucene.index.IndexWriterConfig.OpenMode;
24import org.apache.lucene.index.Term;
25import org.apache.lucene.search.BooleanClause;
26import org.apache.lucene.search.BooleanClause.Occur;
27import org.apache.lucene.search.BooleanQuery;
28import org.apache.lucene.search.Filter;
29import org.apache.lucene.search.IndexSearcher;
30import org.apache.lucene.search.Query;
31import org.apache.lucene.search.Sort;
32import org.apache.lucene.search.TermQuery;
33import org.apache.lucene.search.TopDocs;
34import org.apache.lucene.store.FSDirectory;
35import org.apache.lucene.util.Version;
36
37import com.paesia.schema.script.safe.lucene.SEntity;
38
39public class LuceneHandler extends SEntity.Handler {
40
41 public static final String KIND_QUOTA = "C4f91ee1eb414a";
42 public static final String QUOTA_SYSTEM = "F4f91ee659b1ec";
43
44 protected String dirIndex = "";
45 protected String dirBackup = "";
46 protected double systemQuota = 0;
47
48 public LuceneHandler(String dirIndex, String dirBackup, double systemQuota) {
49 this.dirIndex = dirIndex;
50 this.dirBackup = dirBackup;
51 this.systemQuota = systemQuota;
52 }
53
54 public boolean exists(String id) {
55 boolean tag = false;
56 if (id.length() == 0) return tag;
57 try {
58 IndexReader reader = IndexReader.open(FSDirectory.open(new File(dirIndex)));
59 IndexSearcher searcher = new IndexSearcher(reader);
60 TopDocs td = searcher.search(new TermQuery(new Term(SEntity.ID, id)), 1);
61 if (td.totalHits > 0) {
62 tag = true;
63 }
64 searcher.close();
65 reader.close();
66 } catch (Exception e) {
67 }
68
69 return tag;
70 }
71
72 public void create(SEntity src) {
73 Monitor monitor = new Monitor();
74 Timer timer = new Timer();
75 timer.schedule(new CreateTask(timer, src, monitor), 1);
76 while (!monitor.finished) {
77 try {
78 Thread.sleep(10);
79 } catch (Exception e) {
80 }
81 }
82 timer = null;
83 }
84
85 protected boolean quotaCreate(SEntity src) {
86 boolean tag = false;
87 SEntity quota = findSystemQuota();
88 if (quota == null) {
89 quota = newSystemQuota();
90 }
91 double newSize = quota.getDouble("size") + ((double)src.toString().length() / 1048576.0);
92 if (newSize < 0) newSize = 0;
93 if (newSize < systemQuota) {
94 tag = true;
95 quota.setDouble("size", newSize);
96 quota.save();
97 }
98 return tag;
99 }
100
101 protected boolean quotaUpdate(SEntity src) {
102 boolean tag = false;
103 SEntity quota = findSystemQuota();
104 if (quota == null) {
105 quota = newSystemQuota();
106 }
107 double newSize = quota.getDouble("size") - ((double)getFileSize(src.getId(), src.getKind()) / 1048576.0) + ((double)src.toString().length() / 1048576.0);
108 if (newSize < 0) newSize = 0;
109 if (newSize < systemQuota) {
110 tag = true;
111 quota.setDouble("size", newSize);
112 quota.save();
113 }
114 return tag;
115 }
116
117 protected boolean quotaDelete(String id, String kind) {
118 boolean tag = false;
119 SEntity quota = findSystemQuota();
120 if (quota == null) {
121 quota = newSystemQuota();
122 }
123 double newSize = quota.getDouble("size") - ((double)getFileSize(id, kind) / 1048576.0);
124 if (newSize < 0) newSize = 0;
125 if (newSize < systemQuota) {
126 tag = true;
127 quota.setDouble("size", newSize);
128 quota.save();
129 }
130 return tag;
131 }
132
133 protected long getFileSize(String id, String kind) {
134 long tag = 0;
135 String fid = "";
136 for (int i = 0; i < id.length() && i + 1 < id.length(); i += 2) {
137 if (fid.length() > 0) fid += File.separator;
138 fid += id.substring(i, i + 2);
139 }
140 File file = new File(dirBackup, kind);
141 file = new File(file.getAbsolutePath(), fid);
142 String folder = file.getAbsolutePath();
143 file = new File(folder, id + ".txt");
144 if (file.exists()) {
145 tag = file.length();
146 }
147 return tag;
148 }
149
150 protected SEntity newSystemQuota() {
151 SEntity tag = new SEntity(this);
152 tag.setSchema("s|kind|d|size");
153 tag.setKind(KIND_QUOTA);
154 tag.setId(UUID.randomUUID().toString().replaceAll("-", ""));
155 tag.setString("kind", QUOTA_SYSTEM);
156 return tag;
157 }
158
159 protected SEntity findSystemQuota() {
160 List<SEntity> results = search(KIND_QUOTA, new TermQuery(new Term("kind", QUOTA_SYSTEM)), 1);
161 if (results.size() == 0) return null;
162 return results.get(0);
163 }
164
165 protected void createEntity(SEntity src) {
166 if (src.getId().length() == 0) return;
167 if (src.getKind().length() == 0) return;
168
169 try {
170 if (!src.getKind().equals(KIND_QUOTA)) {
171 if (!quotaCreate(src)) return;
172 }
173 backup(src);
174 Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
175 IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_36, analyzer);
176 iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
177 IndexWriter writer = new IndexWriter(FSDirectory.open(new File(dirIndex)), iwc);
178 Document doc = new Document();
179 write(src, doc);
180 writer.addDocument(doc);
181 writer.close();
182 } catch (Exception e) {
183 }
184 }
185
186 public void update(SEntity src) {
187 Monitor monitor = new Monitor();
188 Timer timer = new Timer();
189 timer.schedule(new UpdateTask(timer, src, monitor), 1);
190 while (!monitor.finished) {
191 try {
192 Thread.sleep(10);
193 } catch (Exception e) {
194 }
195 }
196 timer = null;
197 }
198
199 protected void updateEntity(SEntity src) {
200 if (src.getId().length() == 0) return;
201 if (src.getKind().length() == 0) return;
202
203 try {
204 if (!src.getKind().equals(KIND_QUOTA)) {
205 if (!quotaUpdate(src)) return;
206 }
207 backup(src);
208 Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
209 IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_36, analyzer);
210 iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
211 IndexWriter writer = new IndexWriter(FSDirectory.open(new File(dirIndex)), iwc);
212 Document doc = new Document();
213 write(src, doc);
214 writer.updateDocument(new Term(SEntity.ID, src.getId()), doc);
215 writer.close();
216 } catch (Exception e) {
217 }
218 }
219
220 public void load(String id, SEntity src) {
221 try {
222 IndexReader reader = IndexReader.open(FSDirectory.open(new File(dirIndex)));
223 IndexSearcher searcher = new IndexSearcher(reader);
224 TopDocs td = searcher.search(new TermQuery(new Term(SEntity.ID, id)), 1);
225 if (td.totalHits > 0) {
226 Document doc = searcher.doc(td.scoreDocs[0].doc);
227 if (allowLoad(id, doc.get(SEntity.KIND))) {
228 src.setSchema(doc.get(SEntity.SCHEMA));
229 read(src, doc);
230 }
231 }
232 searcher.close();
233 reader.close();
234 } catch (Exception e) {
235 }
236 }
237
238 protected boolean allowLoad(String id, String kind) {
239 return true;
240 }
241
242 public int count(String kind, Query query, Filter filter, Sort sort, int max) {
243 int tag = 0;
244 try {
245 IndexReader reader = IndexReader.open(FSDirectory.open(new File(dirIndex)));
246 IndexSearcher searcher = new IndexSearcher(reader);
247 BooleanQuery boolQuery = new BooleanQuery();
248 boolQuery.add(new BooleanClause(new TermQuery(new Term(SEntity.KIND, kind)), Occur.MUST));
249 if (query != null) {
250 boolQuery.add(new BooleanClause(query, Occur.MUST));
251 }
252 TopDocs td = null;
253 if (filter != null && sort != null) {
254 td = searcher.search(boolQuery, filter, max, sort);
255 } else if (filter != null) {
256 td = searcher.search(boolQuery, filter, max);
257 } else if (sort != null) {
258 td = searcher.search(boolQuery, max, sort);
259 } else {
260 td = searcher.search(boolQuery, max);
261 }
262 tag = td.totalHits;
263 searcher.close();
264 reader.close();
265 } catch (Exception e) {
266 }
267 return tag;
268 }
269
270 public int count(String kind, Query query, int max) {
271 return count(kind, query, null, null, max);
272 }
273
274 public int count(String kind, Query query, Sort sort, int max) {
275 return count(kind, query, null, sort, max);
276 }
277
278 public int count(String kind, Query query, Filter filter, int max) {
279 return count(kind, query, filter, null, max);
280 }
281
282 public List<SEntity> search(String kind, Query query, int max) {
283 return search(kind, query, null, null, max);
284 }
285
286 public List<SEntity> search(String kind, Query query, Sort sort, int max) {
287 return search(kind, query, null, sort, max);
288 }
289
290 public List<SEntity> search(String kind, Query query, Filter filter, int max) {
291 return search(kind, query, filter, null, max);
292 }
293
294 public List<SEntity> search(String kind, Query query, int pagesize, int pageno) {
295 return search(kind, query, null, null, pagesize, pageno);
296 }
297
298 public List<SEntity> search(String kind, Query query, Sort sort, int pagesize, int pageno) {
299 return search(kind, query, null, sort, pagesize, pageno);
300 }
301
302 public List<SEntity> search(String kind, Query query, Filter filter, int pagesize, int pageno) {
303 return search(kind, query, filter, null, pagesize, pageno);
304 }
305
306 public List<SEntity> search(String kind, Query query, Filter filter, Sort sort, int max) {
307 List<SEntity> tag = new ArrayList<SEntity>();
308 try {
309 IndexReader reader = IndexReader.open(FSDirectory.open(new File(dirIndex)));
310 IndexSearcher searcher = new IndexSearcher(reader);
311 BooleanQuery boolQuery = new BooleanQuery();
312 boolQuery.add(new BooleanClause(new TermQuery(new Term(SEntity.KIND, kind)), Occur.MUST));
313 if (query != null) {
314 boolQuery.add(new BooleanClause(query, Occur.MUST));
315 }
316 TopDocs td = null;
317 if (filter != null && sort != null) {
318 td = searcher.search(boolQuery, filter, max, sort);
319 } else if (filter != null) {
320 td = searcher.search(boolQuery, filter, max);
321 } else if (sort != null) {
322 td = searcher.search(boolQuery, max, sort);
323 } else {
324 td = searcher.search(boolQuery, max);
325 }
326 for (int i = 0; i < td.totalHits; i++) {
327 SEntity item = new SEntity(this);
328 Document doc = searcher.doc(td.scoreDocs[i].doc);
329 item.setSchema(doc.get(SEntity.SCHEMA));
330 read(item, doc);
331 tag.add(item);
332 }
333 searcher.close();
334 reader.close();
335 } catch (Exception e) {
336 }
337 return tag;
338 }
339
340 public List<SEntity> search(String kind, Query query, Filter filter, Sort sort, int pagesize, int pageno) {
341 List<SEntity> tag = new ArrayList<SEntity>();
342 try {
343 IndexReader reader = IndexReader.open(FSDirectory.open(new File(dirIndex)));
344 IndexSearcher searcher = new IndexSearcher(reader);
345 BooleanQuery boolQuery = new BooleanQuery();
346 boolQuery.add(new BooleanClause(new TermQuery(new Term(SEntity.KIND, kind)), Occur.MUST));
347 if (query != null) {
348 boolQuery.add(new BooleanClause(query, Occur.MUST));
349 }
350 if (pagesize <= 0) pagesize = 10;
351 if (pageno <= 0) pageno = 1;
352 int max = pageno * pagesize;
353 TopDocs td = null;
354 if (filter != null && sort != null) {
355 td = searcher.search(boolQuery, filter, max, sort);
356 } else if (filter != null) {
357 td = searcher.search(boolQuery, filter, max);
358 } else if (sort != null) {
359 td = searcher.search(boolQuery, max, sort);
360 } else {
361 td = searcher.search(boolQuery, max);
362 }
363 for (int i = (pageno - 1) * pagesize; i < td.totalHits && i < max; i++) {
364 SEntity item = new SEntity(this);
365 Document doc = searcher.doc(td.scoreDocs[i].doc);
366 item.setSchema(doc.get(SEntity.SCHEMA));
367 read(item, doc);
368 tag.add(item);
369 }
370 searcher.close();
371 reader.close();
372 } catch (Exception e) {
373 }
374 return tag;
375 }
376
377 protected void backup(SEntity src) {
378 String id = src.getId();
379 if (id.length() == 0) return;
380 String kind = src.getKind();
381 if (kind.length() == 0) return;
382 String fid = "";
383 for (int i = 0; i < id.length() && i + 1 < id.length(); i += 2) {
384 if (fid.length() > 0) fid += File.separator;
385 fid += id.substring(i, i + 2);
386 }
387 try {
388 File file = new File(dirBackup, kind);
389 file = new File(file.getAbsolutePath(), fid);
390 file.mkdirs();
391 String folder = file.getAbsolutePath();
392 BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(folder, id + ".txt"))));
393 writer.write(src.toString());
394 writer.close();
395 } catch (Exception e) {
396 }
397 }
398
399 protected void read(SEntity entity, Document doc) {
400 String schema = doc.get(SEntity.SCHEMA);
401 if (schema == null) schema = "";
402 String[] fields = schema.split("\\|");
403 for (int i = 0; i < fields.length && i + 1 < fields.length; i+= 2) {
404 String kind = fields[i];
405 String fname = fields[i + 1];
406 String val = doc.get(fname);
407 if (val == null) val = "";
408 if (SEntity.ALL_KINDS.indexOf("|" + kind + "|") < 0) continue;
409 entity.setString(fname, val);
410 }
411 }
412
413 protected void write(SEntity entity, Document doc) {
414 String schema = entity.getSchema();
415 if (schema == null) schema = "";
416 String[] fields = schema.split("\\|");
417 for (int i = 0; i < fields.length && i + 1 < fields.length; i+= 2) {
418 String kind = fields[i];
419 String fname = fields[i + 1];
420 if (SEntity.STRING.equalsIgnoreCase(kind)) {
421 Field field = new Field(fname, entity.getString(fname), Store.YES, Index.NOT_ANALYZED_NO_NORMS);
422 doc.add(field);
423 } else if (SEntity.DOUBLE.equalsIgnoreCase(kind)) {
424 NumericField field = new NumericField(fname, Store.YES, true);
425 field.setDoubleValue(entity.getDouble(fname));
426 doc.add(field);
427 } else if (SEntity.FLOAT.equalsIgnoreCase(kind)) {
428 NumericField field = new NumericField(fname, Store.YES, true);
429 field.setFloatValue(entity.getFloat(fname));
430 doc.add(field);
431 } else if (SEntity.INTEGER.equalsIgnoreCase(kind)) {
432 NumericField field = new NumericField(fname, Store.YES, true);
433 field.setIntValue(entity.getInteger(fname));
434 doc.add(field);
435 } else if (SEntity.LONG.equalsIgnoreCase(kind)) {
436 NumericField field = new NumericField(fname, Store.YES, true);
437 field.setLongValue(entity.getLong(fname));
438 doc.add(field);
439 } else if (SEntity.ANALYZED.equalsIgnoreCase(kind)) {
440 Field field = new Field(fname, entity.getString(fname), Store.YES, Index.ANALYZED);
441 doc.add(field);
442 }
443 }
444 }
445
446 public void delete(String id) {
447 Monitor monitor = new Monitor();
448 Timer timer = new Timer();
449 timer.schedule(new DeleteTask(timer, id, monitor), 1);
450 while (!monitor.finished) {
451 try {
452 Thread.sleep(10);
453 } catch (Exception e) {
454 }
455 }
456 timer = null;
457 }
458
459 protected void deleteEntity(String id) {
460 if (id.length() == 0) return;
461 String kind = "";
462
463 try {
464 IndexReader reader = IndexReader.open(FSDirectory.open(new File(dirIndex)));
465 IndexSearcher searcher = new IndexSearcher(reader);
466 TopDocs td = searcher.search(new TermQuery(new Term(SEntity.ID, id)), 1);
467 if (td.totalHits > 0) {
468 Document doc = searcher.doc(td.scoreDocs[0].doc);
469 kind = doc.get(SEntity.KIND);
470 }
471 searcher.close();
472 reader.close();
473 } catch (Exception e) {
474 }
475 if (kind.length() == 0) return;
476 if (!allowDelete(id, kind)) return;
477
478 try {
479 if (!kind.equals(KIND_QUOTA)) {
480 if (!quotaDelete(id, kind)) return;
481 }
482 removeBackup(id, kind);
483 Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
484 IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_36, analyzer);
485 iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
486 IndexWriter writer = new IndexWriter(FSDirectory.open(new File(dirIndex)), iwc);
487 writer.deleteDocuments(new Term(SEntity.ID, id));
488 writer.close();
489 } catch (Exception e) {
490 }
491 }
492
493 protected boolean allowDelete(String id, String kind) {
494 return true;
495 }
496
497 protected void removeBackup(String id, String kind) {
498 if (id.length() == 0) return;
499 if (kind.length() == 0) return;
500 String fid = "";
501 for (int i = 0; i < id.length() && i + 1 < id.length(); i += 2) {
502 if (fid.length() > 0) fid += File.separator;
503 fid += id.substring(i, i + 2);
504 }
505 try {
506 File file = new File(dirBackup, kind);
507 file = new File(file.getAbsolutePath(), fid);
508 String folder = file.getAbsolutePath();
509 file = new File(folder, id + ".txt");
510 file.delete();
511 } catch (Exception e) {
512 }
513 }
514
515 public double storageQuota() {
516 return systemQuota;
517 }
518
519 public double storageSize() {
520 SEntity tag = findSystemQuota();
521 if (tag == null) return 0;
522 return tag.getDouble("size");
523 }
524
525 private class DeleteTask extends TimerTask {
526
527 private String id;
528 private Timer timer;
529 private Monitor monitor;
530
531 public DeleteTask(Timer timer, String id, Monitor monitor) {
532 this.timer = timer;
533 this.id = id;
534 this.monitor = monitor;
535 }
536
537 @Override
538 public void run() {
539 deleteEntity(id);
540 monitor.finished = true;
541 timer.cancel();
542 timer.purge();
543 timer = null;
544 }
545
546 }
547
548 private class CreateTask extends TimerTask {
549
550 private SEntity entity;
551 private Timer timer;
552 private Monitor monitor;
553
554 public CreateTask(Timer timer, SEntity entity, Monitor monitor) {
555 this.timer = timer;
556 this.entity = entity;
557 this.monitor = monitor;
558 }
559
560 @Override
561 public void run() {
562 createEntity(entity);
563 monitor.finished = true;
564 timer.cancel();
565 timer.purge();
566 timer = null;
567 }
568
569 }
570
571 private class UpdateTask extends TimerTask {
572
573 private SEntity entity;
574 private Timer timer;
575 private Monitor monitor;
576
577 public UpdateTask(Timer timer, SEntity entity, Monitor monitor) {
578 this.timer = timer;
579 this.entity = entity;
580 this.monitor = monitor;
581 }
582
583 @Override
584 public void run() {
585 updateEntity(entity);
586 monitor.finished = true;
587 timer.cancel();
588 timer.purge();
589 timer = null;
590 }
591
592 }
593
594 private class Monitor {
595 public boolean finished = false;
596 }
597
598}
package com.paesia.schema.script;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.Timer;
import java.util.TimerTask;
import java.util.UUID;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import com.paesia.schema.script.safe.lucene.SEntity;

public class LuceneHandler extends SEntity.Handler {

    public static final String KIND_QUOTA = "C4f91ee1eb414a";
    public static final String QUOTA_SYSTEM = "F4f91ee659b1ec";
 
    protected String dirIndex = "";
    protected String dirBackup = "";
    protected double systemQuota = 0;

    /**
     * Creates a handler bound to an index directory, a backup directory and a
     * storage quota.
     *
     * @param dirIndex    path of the directory opened as the Lucene index
     * @param dirBackup   path of the directory under which backup text files are kept
     * @param systemQuota upper bound the tracked storage size is compared against
     */
    public LuceneHandler(String dirIndex, String dirBackup, double systemQuota) {
        this.systemQuota = systemQuota;
        this.dirBackup = dirBackup;
        this.dirIndex = dirIndex;
    }
 
    /**
     * Tells whether the index contains a document with the given entity id.
     *
     * The lookup is best-effort: any failure while opening or searching the
     * index is reported as "not found".
     *
     * @param id entity id to look up; null or empty yields false
     * @return true when at least one indexed document carries this id
     */
    public boolean exists(String id) {
        if (id == null || id.length() == 0) return false;
        IndexReader reader = null;
        IndexSearcher searcher = null;
        try {
            reader = IndexReader.open(FSDirectory.open(new File(dirIndex)));
            searcher = new IndexSearcher(reader);
            TopDocs td = searcher.search(new TermQuery(new Term(SEntity.ID, id)), 1);
            return td.totalHits > 0;
        } catch (Exception e) {
            // Deliberately best-effort: an unreadable index means "does not exist".
            return false;
        } finally {
            // Release the searcher and reader on every path, including failures.
            if (searcher != null) {
                try {
                    searcher.close();
                } catch (Exception e) {
                }
            }
            if (reader != null) {
                try {
                    reader.close();
                } catch (Exception e) {
                }
            }
        }
    }
 
    /**
     * Stores a new entity and blocks until the write has completed.
     *
     * The actual work is done by a {@link CreateTask} scheduled on a fresh
     * {@link Timer}; the calling thread polls the shared monitor every 10 ms
     * until the task reports completion.
     *
     * @param src entity to create
     */
    public void create(SEntity src) {
        Monitor monitor = new Monitor();
        Timer timer = new Timer();
        timer.schedule(new CreateTask(timer, src, monitor), 1);
        boolean interrupted = false;
        while (!monitor.finished) {
            try {
                Thread.sleep(10);
            } catch (InterruptedException e) {
                // Keep waiting for the write to finish, but remember the request
                // so the caller still sees it afterwards.
                interrupted = true;
            }
        }
        if (interrupted) {
            // Restore the interrupt status that Thread.sleep cleared.
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Reserves quota for a new entity.
     *
     * Adds the length of the entity's string form (divided by 1048576) to the
     * tracked size and persists the new size only when it stays below
     * {@code systemQuota}.
     *
     * @param src entity about to be created
     * @return true when the new size was accepted and saved, false otherwise
     */
    protected boolean quotaCreate(SEntity src) {
        SEntity quota = findSystemQuota();
        if (quota == null) {
            // No quota record yet: start from a fresh one.
            quota = newSystemQuota();
        }
        double current = quota.getDouble("size");
        double added = src.toString().length() / 1048576.0;
        double newSize = current + added;
        if (newSize < 0) {
            newSize = 0;
        }
        if (!(newSize < systemQuota)) {
            return false;
        }
        quota.setDouble("size", newSize);
        quota.save();
        return true;
    }

    /**
     * Re-computes the tracked size for an entity that is being overwritten.
     *
     * The size of the existing backup file is subtracted and the length of the
     * entity's new string form is added (both divided by 1048576). The result
     * is persisted only when it stays below {@code systemQuota}.
     *
     * @param src entity about to be updated
     * @return true when the new size was accepted and saved, false otherwise
     */
    protected boolean quotaUpdate(SEntity src) {
        SEntity quota = findSystemQuota();
        if (quota == null) {
            quota = newSystemQuota();
        }
        double current = quota.getDouble("size");
        double previous = getFileSize(src.getId(), src.getKind()) / 1048576.0;
        double replacement = src.toString().length() / 1048576.0;
        double newSize = current - previous + replacement;
        if (newSize < 0) {
            newSize = 0;
        }
        if (!(newSize < systemQuota)) {
            return false;
        }
        quota.setDouble("size", newSize);
        quota.save();
        return true;
    }

    /**
     * Releases the quota held by an entity that is about to be deleted.
     *
     * The size of the entity's backup file (divided by 1048576) is subtracted
     * from the tracked size, clamped at zero, and the result is always saved.
     * A deletion can only shrink the tracked size, so it is never refused for
     * quota reasons; refusing it would make it impossible to free space once
     * the store is at or above {@code systemQuota}.
     *
     * @param id   id of the entity being deleted
     * @param kind kind of the entity being deleted
     * @return always true: the deletion may proceed
     */
    protected boolean quotaDelete(String id, String kind) {
        SEntity quota = findSystemQuota();
        if (quota == null) {
            quota = newSystemQuota();
        }
        double newSize = quota.getDouble("size") - ((double)getFileSize(id, kind) / 1048576.0);
        if (newSize < 0) newSize = 0;
        quota.setDouble("size", newSize);
        quota.save();
        return true;
    }
 
    /**
     * Returns the length of the backup file of an entity.
     *
     * The file is located at
     * {@code dirBackup/kind/<id split into two-character folders>/<id>.txt}.
     *
     * @param id   entity id
     * @param kind entity kind
     * @return the file length as reported by {@link File#length()}, or 0 when
     *         the file does not exist
     */
    protected long getFileSize(String id, String kind) {
        // Build the nested folder path from consecutive two-character pieces
        // of the id; a trailing odd character is not used.
        StringBuilder shard = new StringBuilder();
        for (int i = 0; i + 1 < id.length(); i += 2) {
            if (shard.length() > 0) {
                shard.append(File.separator);
            }
            shard.append(id, i, i + 2);
        }
        File kindDir = new File(dirBackup, kind);
        File folder = new File(kindDir.getAbsolutePath(), shard.toString());
        File backupFile = new File(folder.getAbsolutePath(), id + ".txt");
        return backupFile.exists() ? backupFile.length() : 0;
    }
 
    /**
     * Builds a new, unsaved quota entity.
     *
     * The entity uses the schema {@code s|kind|d|size}, the kind
     * {@link #KIND_QUOTA}, a random UUID without dashes as id, and its
     * {@code kind} field set to {@link #QUOTA_SYSTEM}.
     *
     * @return the freshly initialised quota entity
     */
    protected SEntity newSystemQuota() {
        SEntity quota = new SEntity(this);
        quota.setSchema("s|kind|d|size");
        quota.setKind(KIND_QUOTA);
        String uuid = UUID.randomUUID().toString();
        quota.setId(uuid.replace("-", ""));
        quota.setString("kind", QUOTA_SYSTEM);
        return quota;
    }
 
    /**
     * Looks up the stored quota entity whose {@code kind} field equals
     * {@link #QUOTA_SYSTEM}.
     *
     * @return the first matching entity, or null when none is found
     */
    protected SEntity findSystemQuota() {
        TermQuery byQuotaKind = new TermQuery(new Term("kind", QUOTA_SYSTEM));
        List<SEntity> hits = search(KIND_QUOTA, byQuotaKind, 1);
        return hits.isEmpty() ? null : hits.get(0);
    }

    /**
     * Performs the actual creation of an entity: quota check, backup file,
     * then a new document in the index.
     *
     * Entities with a missing id or kind are ignored. Entities of kind
     * {@link #KIND_QUOTA} skip the quota check. Failures are swallowed
     * (best-effort), but the index writer is always closed so a failed write
     * does not leave the writer open.
     *
     * @param src entity to add
     */
    protected void createEntity(SEntity src) {
        String id = src.getId();
        if (id == null || id.length() == 0) return;
        String kind = src.getKind();
        if (kind == null || kind.length() == 0) return;

        IndexWriter writer = null;
        try {
            if (!kind.equals(KIND_QUOTA)) {
                if (!quotaCreate(src)) return;
            }
            backup(src);
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
            IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_36, analyzer);
            iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
            writer = new IndexWriter(FSDirectory.open(new File(dirIndex)), iwc);
            Document doc = new Document();
            write(src, doc);
            writer.addDocument(doc);
        } catch (Exception e) {
            // Best-effort by design: a failed create is silently dropped.
        } finally {
            // Close on every path; an unclosed writer would block later writers.
            if (writer != null) {
                try {
                    writer.close();
                } catch (Exception e) {
                }
            }
        }
    }
 
    /**
     * Overwrites an existing entity and blocks until the write has completed.
     *
     * The actual work is done by an {@link UpdateTask} scheduled on a fresh
     * {@link Timer}; the calling thread polls the shared monitor every 10 ms
     * until the task reports completion.
     *
     * @param src entity to update
     */
    public void update(SEntity src) {
        Monitor monitor = new Monitor();
        Timer timer = new Timer();
        timer.schedule(new UpdateTask(timer, src, monitor), 1);
        boolean interrupted = false;
        while (!monitor.finished) {
            try {
                Thread.sleep(10);
            } catch (InterruptedException e) {
                // Keep waiting for the write to finish, but remember the request
                // so the caller still sees it afterwards.
                interrupted = true;
            }
        }
        if (interrupted) {
            // Restore the interrupt status that Thread.sleep cleared.
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Performs the actual update of an entity: quota check, backup file, then
     * replacement of the indexed document that carries the same id.
     *
     * Entities with a missing id or kind are ignored. Entities of kind
     * {@link #KIND_QUOTA} skip the quota check. Failures are swallowed
     * (best-effort), but the index writer is always closed so a failed write
     * does not leave the writer open.
     *
     * @param src entity to write
     */
    protected void updateEntity(SEntity src) {
        String id = src.getId();
        if (id == null || id.length() == 0) return;
        String kind = src.getKind();
        if (kind == null || kind.length() == 0) return;

        IndexWriter writer = null;
        try {
            if (!kind.equals(KIND_QUOTA)) {
                if (!quotaUpdate(src)) return;
            }
            backup(src);
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
            IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_36, analyzer);
            iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
            writer = new IndexWriter(FSDirectory.open(new File(dirIndex)), iwc);
            Document doc = new Document();
            write(src, doc);
            writer.updateDocument(new Term(SEntity.ID, id), doc);
        } catch (Exception e) {
            // Best-effort by design: a failed update is silently dropped.
        } finally {
            // Close on every path; an unclosed writer would block later writers.
            if (writer != null) {
                try {
                    writer.close();
                } catch (Exception e) {
                }
            }
        }
    }
 
    /**
     * Fills {@code src} from the indexed document that carries the given id.
     *
     * Nothing is loaded when no document matches or when
     * {@link #allowLoad(String, String)} rejects it. Failures are swallowed
     * (best-effort); the reader and searcher are closed on every path.
     *
     * @param id  id of the entity to load
     * @param src entity that receives the schema and field values
     */
    public void load(String id, SEntity src) {
        IndexReader reader = null;
        IndexSearcher searcher = null;
        try {
            reader = IndexReader.open(FSDirectory.open(new File(dirIndex)));
            searcher = new IndexSearcher(reader);
            TopDocs td = searcher.search(new TermQuery(new Term(SEntity.ID, id)), 1);
            if (td.totalHits > 0) {
                Document doc = searcher.doc(td.scoreDocs[0].doc);
                if (allowLoad(id, doc.get(SEntity.KIND))) {
                    src.setSchema(doc.get(SEntity.SCHEMA));
                    read(src, doc);
                }
            }
        } catch (Exception e) {
            // Best-effort by design: on failure src is left as it was.
        } finally {
            if (searcher != null) {
                try {
                    searcher.close();
                } catch (Exception e) {
                }
            }
            if (reader != null) {
                try {
                    reader.close();
                } catch (Exception e) {
                }
            }
        }
    }
 
    /**
     * Hook consulted by {@code load} before an entity is populated.
     * This implementation permits every load.
     *
     * @param id   id of the entity being loaded
     * @param kind kind stored in the matching index document
     * @return true to let the load proceed
     */
    protected boolean allowLoad(String id, String kind) {
        return true;
    }
 
    /**
     * Counts the documents of a kind that match a query.
     *
     * The search combines a mandatory term on {@code SEntity.KIND} with the
     * optional {@code query}, and applies {@code filter} and {@code sort} when
     * given. Failures are swallowed (best-effort) and reported as 0; the
     * reader and searcher are closed on every path.
     *
     * @param kind   entity kind to restrict the search to
     * @param query  additional mandatory query, or null for none
     * @param filter Lucene filter, or null for none
     * @param sort   Lucene sort, or null for none
     * @param max    number of top hits requested from the searcher
     * @return the total hit count reported by the search, or 0 on failure
     */
    public int count(String kind, Query query, Filter filter, Sort sort, int max) {
        int tag = 0;
        IndexReader reader = null;
        IndexSearcher searcher = null;
        try {
            reader = IndexReader.open(FSDirectory.open(new File(dirIndex)));
            searcher = new IndexSearcher(reader);
            BooleanQuery boolQuery = new BooleanQuery();
            boolQuery.add(new BooleanClause(new TermQuery(new Term(SEntity.KIND, kind)), Occur.MUST));
            if (query != null) {
                boolQuery.add(new BooleanClause(query, Occur.MUST));
            }
            TopDocs td;
            if (filter != null && sort != null) {
                td = searcher.search(boolQuery, filter, max, sort);
            } else if (filter != null) {
                td = searcher.search(boolQuery, filter, max);
            } else if (sort != null) {
                td = searcher.search(boolQuery, max, sort);
            } else {
                td = searcher.search(boolQuery, max);
            }
            tag = td.totalHits;
        } catch (Exception e) {
            // Best-effort by design: a failed search counts as zero hits.
        } finally {
            if (searcher != null) {
                try {
                    searcher.close();
                } catch (Exception e) {
                }
            }
            if (reader != null) {
                try {
                    reader.close();
                } catch (Exception e) {
                }
            }
        }
        return tag;
    }

    /** Counts matches of {@code query} within {@code kind}, without filter or sort. */
    public int count(String kind, Query query, int max) {
        return count(kind, query, null, null, max);
    }

    /** Counts matches of {@code query} within {@code kind}, with a sort and no filter. */
    public int count(String kind, Query query, Sort sort, int max) {
        return count(kind, query, null, sort, max);
    }
 
    /** Counts matches of {@code query} within {@code kind}, with a filter and no sort. */
    public int count(String kind, Query query, Filter filter, int max) {
        return count(kind, query, filter, null, max);
    }
 
    /** Searches {@code kind} for up to {@code max} hits, without filter or sort. */
    public List<SEntity> search(String kind, Query query, int max) {
        return search(kind, query, null, null, max);
    }

    /** Searches {@code kind} for up to {@code max} hits, with a sort and no filter. */
    public List<SEntity> search(String kind, Query query, Sort sort, int max) {
        return search(kind, query, null, sort, max);
    }
 
    /** Searches {@code kind} for up to {@code max} hits, with a filter and no sort. */
    public List<SEntity> search(String kind, Query query, Filter filter, int max) {
        return search(kind, query, filter, null, max);
    }
 
    /** Returns one page of results for {@code kind}, without filter or sort. */
    public List<SEntity> search(String kind, Query query, int pagesize, int pageno) { 
        return search(kind, query, null, null, pagesize, pageno);
    }
 
    /** Returns one page of results for {@code kind}, with a sort and no filter. */
    public List<SEntity> search(String kind, Query query, Sort sort, int pagesize, int pageno) { 
        return search(kind, query, null, sort, pagesize, pageno);
    }
 
    /** Returns one page of results for {@code kind}, with a filter and no sort. */
    public List<SEntity> search(String kind, Query query, Filter filter, int pagesize, int pageno) {
        return search(kind, query, filter, null, pagesize, pageno);
    }
 
    /**
     * Returns the top {@code max} entities of a kind that match a query.
     *
     * The search combines a mandatory term on {@code SEntity.KIND} with the
     * optional {@code query}, and applies {@code filter} and {@code sort} when
     * given. Failures are swallowed (best-effort): the entities read so far
     * are returned. The reader and searcher are closed on every path.
     *
     * @param kind   entity kind to restrict the search to
     * @param query  additional mandatory query, or null for none
     * @param filter Lucene filter, or null for none
     * @param sort   Lucene sort, or null for none
     * @param max    maximum number of hits to retrieve
     * @return the matching entities, possibly empty, never null
     */
    public List<SEntity> search(String kind, Query query, Filter filter, Sort sort, int max) {
        List<SEntity> tag = new ArrayList<SEntity>();
        IndexReader reader = null;
        IndexSearcher searcher = null;
        try {
            reader = IndexReader.open(FSDirectory.open(new File(dirIndex)));
            searcher = new IndexSearcher(reader);
            BooleanQuery boolQuery = new BooleanQuery();
            boolQuery.add(new BooleanClause(new TermQuery(new Term(SEntity.KIND, kind)), Occur.MUST));
            if (query != null) {
                boolQuery.add(new BooleanClause(query, Occur.MUST));
            }
            TopDocs td;
            if (filter != null && sort != null) {
                td = searcher.search(boolQuery, filter, max, sort);
            } else if (filter != null) {
                td = searcher.search(boolQuery, filter, max);
            } else if (sort != null) {
                td = searcher.search(boolQuery, max, sort);
            } else {
                td = searcher.search(boolQuery, max);
            }
            // Iterate over the hits actually returned: totalHits counts every
            // match and can exceed the number of entries in scoreDocs.
            for (int i = 0; i < td.scoreDocs.length; i++) {
                SEntity item = new SEntity(this);
                Document doc = searcher.doc(td.scoreDocs[i].doc);
                item.setSchema(doc.get(SEntity.SCHEMA));
                read(item, doc);
                tag.add(item);
            }
        } catch (Exception e) {
            // Best-effort by design: return whatever was read before the failure.
        } finally {
            if (searcher != null) {
                try {
                    searcher.close();
                } catch (Exception e) {
                }
            }
            if (reader != null) {
                try {
                    reader.close();
                } catch (Exception e) {
                }
            }
        }
        return tag;
    }

    /**
     * Returns one page of the entities of a kind that match a query.
     *
     * The search retrieves the top {@code pageno * pagesize} hits and returns
     * the last page of them. Non-positive {@code pagesize} falls back to 10
     * and non-positive {@code pageno} to 1. Failures are swallowed
     * (best-effort); the reader and searcher are closed on every path.
     *
     * @param kind     entity kind to restrict the search to
     * @param query    additional mandatory query, or null for none
     * @param filter   Lucene filter, or null for none
     * @param sort     Lucene sort, or null for none
     * @param pagesize number of entities per page
     * @param pageno   1-based page number
     * @return the entities of the requested page, possibly empty, never null
     */
    public List<SEntity> search(String kind, Query query, Filter filter, Sort sort, int pagesize, int pageno) {
        List<SEntity> tag = new ArrayList<SEntity>();
        IndexReader reader = null;
        IndexSearcher searcher = null;
        try {
            reader = IndexReader.open(FSDirectory.open(new File(dirIndex)));
            searcher = new IndexSearcher(reader);
            BooleanQuery boolQuery = new BooleanQuery();
            boolQuery.add(new BooleanClause(new TermQuery(new Term(SEntity.KIND, kind)), Occur.MUST));
            if (query != null) {
                boolQuery.add(new BooleanClause(query, Occur.MUST));
            }
            if (pagesize <= 0) pagesize = 10;
            if (pageno <= 0) pageno = 1;
            int max = pageno * pagesize;
            TopDocs td;
            if (filter != null && sort != null) {
                td = searcher.search(boolQuery, filter, max, sort);
            } else if (filter != null) {
                td = searcher.search(boolQuery, filter, max);
            } else if (sort != null) {
                td = searcher.search(boolQuery, max, sort);
            } else {
                td = searcher.search(boolQuery, max);
            }
            // scoreDocs holds only the hits actually returned, so its length
            // is the safe upper bound for the page.
            for (int i = (pageno - 1) * pagesize; i < td.scoreDocs.length; i++) {
                SEntity item = new SEntity(this);
                Document doc = searcher.doc(td.scoreDocs[i].doc);
                item.setSchema(doc.get(SEntity.SCHEMA));
                read(item, doc);
                tag.add(item);
            }
        } catch (Exception e) {
            // Best-effort by design: return whatever was read before the failure.
        } finally {
            if (searcher != null) {
                try {
                    searcher.close();
                } catch (Exception e) {
                }
            }
            if (reader != null) {
                try {
                    reader.close();
                } catch (Exception e) {
                }
            }
        }
        return tag;
    }
 
    /**
     * Writes the entity's string form to its backup file.
     *
     * The file is located at
     * {@code dirBackup/kind/<id split into two-character folders>/<id>.txt};
     * missing folders are created. No charset is passed to the writer, so the
     * platform default encoding is used. Failures are swallowed (best-effort),
     * but the file is always closed.
     *
     * @param src entity to back up; ignored when its id or kind is empty
     */
    protected void backup(SEntity src) {
        String id = src.getId();
        if (id.length() == 0) return;
        String kind = src.getKind();
        if (kind.length() == 0) return;
        String fid = "";
        for (int i = 0; i + 1 < id.length(); i += 2) {
            if (fid.length() > 0) fid += File.separator;
            fid += id.substring(i, i + 2);
        }
        BufferedWriter writer = null;
        try {
            File file = new File(dirBackup, kind);
            file = new File(file.getAbsolutePath(), fid);
            file.mkdirs();
            String folder = file.getAbsolutePath();
            writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(folder, id + ".txt"))));
            writer.write(src.toString());
        } catch (Exception e) {
            // Best-effort by design: a failed backup does not abort the caller.
        } finally {
            // Close even when write() failed so the file handle is not leaked.
            if (writer != null) {
                try {
                    writer.close();
                } catch (Exception e) {
                }
            }
        }
    }
 
    /**
     * Copies stored field values from an index document into an entity.
     *
     * The document's schema field is read as a {@code |}-separated list of
     * {@code kind|name} pairs. For each pair whose kind appears in
     * {@code SEntity.ALL_KINDS}, the stored value (or "" when absent) is set
     * on the entity as a string.
     *
     * @param entity entity that receives the values
     * @param doc    index document to read from
     */
    protected void read(SEntity entity, Document doc) {
        String schema = doc.get(SEntity.SCHEMA);
        String[] parts = (schema == null ? "" : schema).split("\\|");
        int i = 0;
        while (i + 1 < parts.length) {
            String kind = parts[i];
            String fname = parts[i + 1];
            i += 2;
            // Skip pairs whose kind is not one of the known kinds.
            if (SEntity.ALL_KINDS.indexOf("|" + kind + "|") < 0) {
                continue;
            }
            String stored = doc.get(fname);
            entity.setString(fname, stored == null ? "" : stored);
        }
    }
 
    /**
     * Adds one Lucene field per schema entry of the entity to a document.
     *
     * The entity's schema is read as a {@code |}-separated list of
     * {@code kind|name} pairs. String kinds are stored un-analyzed without
     * norms, analyzed kinds are stored and analyzed, and double, float,
     * integer and long kinds become stored, indexed numeric fields. Pairs with
     * any other kind are skipped.
     *
     * @param entity entity supplying the schema and values
     * @param doc    document that receives the fields
     */
    protected void write(SEntity entity, Document doc) {
        String schema = entity.getSchema();
        String[] parts = (schema == null ? "" : schema).split("\\|");
        for (int i = 0; i + 1 < parts.length; i += 2) {
            String kind = parts[i];
            String fname = parts[i + 1];
            if (SEntity.STRING.equalsIgnoreCase(kind)) {
                doc.add(new Field(fname, entity.getString(fname), Store.YES, Index.NOT_ANALYZED_NO_NORMS));
                continue;
            }
            if (SEntity.DOUBLE.equalsIgnoreCase(kind)) {
                doc.add(new NumericField(fname, Store.YES, true).setDoubleValue(entity.getDouble(fname)));
                continue;
            }
            if (SEntity.FLOAT.equalsIgnoreCase(kind)) {
                doc.add(new NumericField(fname, Store.YES, true).setFloatValue(entity.getFloat(fname)));
                continue;
            }
            if (SEntity.INTEGER.equalsIgnoreCase(kind)) {
                doc.add(new NumericField(fname, Store.YES, true).setIntValue(entity.getInteger(fname)));
                continue;
            }
            if (SEntity.LONG.equalsIgnoreCase(kind)) {
                doc.add(new NumericField(fname, Store.YES, true).setLongValue(entity.getLong(fname)));
                continue;
            }
            if (SEntity.ANALYZED.equalsIgnoreCase(kind)) {
                doc.add(new Field(fname, entity.getString(fname), Store.YES, Index.ANALYZED));
            }
        }
    }
 
    /**
     * Deletes an entity and blocks until the deletion has completed.
     *
     * The actual work is done by a {@link DeleteTask} scheduled on a fresh
     * {@link Timer}; the calling thread polls the shared monitor every 10 ms
     * until the task reports completion.
     *
     * @param id id of the entity to delete
     */
    public void delete(String id) {
        Monitor monitor = new Monitor();
        Timer timer = new Timer();
        timer.schedule(new DeleteTask(timer, id, monitor), 1);
        boolean interrupted = false;
        while (!monitor.finished) {
            try {
                Thread.sleep(10);
            } catch (InterruptedException e) {
                // Keep waiting for the deletion to finish, but remember the
                // request so the caller still sees it afterwards.
                interrupted = true;
            }
        }
        if (interrupted) {
            // Restore the interrupt status that Thread.sleep cleared.
            Thread.currentThread().interrupt();
        }
    }
 
    /**
     * Performs the actual deletion of an entity.
     *
     * First the kind of the entity is looked up in the index. When it is found
     * and {@link #allowDelete(String, String)} agrees, the quota is adjusted
     * (skipped for {@link #KIND_QUOTA}), the backup file is removed and the
     * document is deleted from the index. Failures are swallowed
     * (best-effort); reader, searcher and writer are closed on every path.
     *
     * @param id id of the entity to delete; null or empty is ignored
     */
    protected void deleteEntity(String id) {
        if (id == null || id.length() == 0) return;
        String kind = null;

        IndexReader reader = null;
        IndexSearcher searcher = null;
        try {
            reader = IndexReader.open(FSDirectory.open(new File(dirIndex)));
            searcher = new IndexSearcher(reader);
            TopDocs td = searcher.search(new TermQuery(new Term(SEntity.ID, id)), 1);
            if (td.totalHits > 0) {
                Document doc = searcher.doc(td.scoreDocs[0].doc);
                kind = doc.get(SEntity.KIND);
            }
        } catch (Exception e) {
            // Best-effort by design: an unreadable index means nothing to delete.
        } finally {
            if (searcher != null) {
                try {
                    searcher.close();
                } catch (Exception e) {
                }
            }
            if (reader != null) {
                try {
                    reader.close();
                } catch (Exception e) {
                }
            }
        }
        // The stored kind can be absent (null); treat that like "not found"
        // instead of failing outside the try block.
        if (kind == null || kind.length() == 0) return;
        if (!allowDelete(id, kind)) return;

        IndexWriter writer = null;
        try {
            if (!kind.equals(KIND_QUOTA)) {
                // Must run before removeBackup: the quota is derived from the
                // size of the backup file.
                if (!quotaDelete(id, kind)) return;
            }
            removeBackup(id, kind);
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
            IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_36, analyzer);
            iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
            writer = new IndexWriter(FSDirectory.open(new File(dirIndex)), iwc);
            writer.deleteDocuments(new Term(SEntity.ID, id));
        } catch (Exception e) {
            // Best-effort by design: a failed delete is silently dropped.
        } finally {
            // Close on every path; an unclosed writer would block later writers.
            if (writer != null) {
                try {
                    writer.close();
                } catch (Exception e) {
                }
            }
        }
    }
 
    /**
     * Hook consulted by {@code deleteEntity} before an entity is removed.
     * This implementation permits every deletion.
     *
     * @param id   id of the entity being deleted
     * @param kind kind stored in the matching index document
     * @return true to let the deletion proceed
     */
    protected boolean allowDelete(String id, String kind) {
        return true;
    }
 
    /**
     * Deletes the backup file of an entity, if any.
     *
     * The file is located at
     * {@code dirBackup/kind/<id split into two-character folders>/<id>.txt}.
     * Failures are swallowed (best-effort).
     *
     * @param id   entity id; empty is ignored
     * @param kind entity kind; empty is ignored
     */
    protected void removeBackup(String id, String kind) {
        if (id.length() == 0 || kind.length() == 0) {
            return;
        }
        StringBuilder shard = new StringBuilder();
        for (int i = 0; i + 1 < id.length(); i += 2) {
            if (shard.length() > 0) {
                shard.append(File.separator);
            }
            shard.append(id, i, i + 2);
        }
        try {
            File kindDir = new File(dirBackup, kind);
            File folder = new File(kindDir.getAbsolutePath(), shard.toString());
            new File(folder.getAbsolutePath(), id + ".txt").delete();
        } catch (Exception e) {
        }
    }

    /**
     * Returns the quota this handler was constructed with.
     *
     * @return the value of {@code systemQuota}
     */
    public double storageQuota() {
        return systemQuota;
    }
 
    /**
     * Returns the storage size currently recorded in the quota entity.
     *
     * @return the {@code size} field of the stored quota entity, or 0 when no
     *         quota entity exists yet
     */
    public double storageSize() {
        SEntity quota = findSystemQuota();
        return quota == null ? 0 : quota.getDouble("size");
    }

    /**
     * Timer task that deletes one entity, then signals the waiting caller
     * through the shared monitor and shuts its timer down.
     */
    private class DeleteTask extends TimerTask {

        private String id;
        private Timer timer;
        private Monitor monitor;

        /**
         * @param timer   timer this task is scheduled on; cancelled after the run
         * @param id      id of the entity to delete
         * @param monitor flag holder polled by the thread waiting for completion
         */
        public DeleteTask(Timer timer, String id, Monitor monitor) {
            this.timer = timer;
            this.id = id;
            this.monitor = monitor;
        }

        @Override
        public void run() {
            try {
                deleteEntity(id);
            } finally {
                // Signal completion even when the work throws; otherwise the
                // caller polling monitor.finished would wait forever.
                monitor.finished = true;
                timer.cancel();
                timer.purge();
                timer = null;
            }
        }

    }

    /**
     * Timer task that creates one entity, then signals the waiting caller
     * through the shared monitor and shuts its timer down.
     */
    private class CreateTask extends TimerTask {

        private SEntity entity;
        private Timer timer;
        private Monitor monitor;

        /**
         * @param timer   timer this task is scheduled on; cancelled after the run
         * @param entity  entity to create
         * @param monitor flag holder polled by the thread waiting for completion
         */
        public CreateTask(Timer timer, SEntity entity, Monitor monitor) {
            this.timer = timer;
            this.entity = entity;
            this.monitor = monitor;
        }

        @Override
        public void run() {
            try {
                createEntity(entity);
            } finally {
                // Signal completion even when the work throws; otherwise the
                // caller polling monitor.finished would wait forever.
                monitor.finished = true;
                timer.cancel();
                timer.purge();
                timer = null;
            }
        }

    }

    /**
     * Timer task that updates one entity, then signals the waiting caller
     * through the shared monitor and shuts its timer down.
     */
    private class UpdateTask extends TimerTask {

        private SEntity entity;
        private Timer timer;
        private Monitor monitor;

        /**
         * @param timer   timer this task is scheduled on; cancelled after the run
         * @param entity  entity to update
         * @param monitor flag holder polled by the thread waiting for completion
         */
        public UpdateTask(Timer timer, SEntity entity, Monitor monitor) {
            this.timer = timer;
            this.entity = entity;
            this.monitor = monitor;
        }

        @Override
        public void run() {
            try {
                updateEntity(entity);
            } finally {
                // Signal completion even when the work throws; otherwise the
                // caller polling monitor.finished would wait forever.
                monitor.finished = true;
                timer.cancel();
                timer.purge();
                timer = null;
            }
        }

    }
 
    /**
     * Completion flag shared between a timer task (writer) and the thread
     * that scheduled it (reader).
     */
    private class Monitor {
        // volatile: written on the timer thread and polled from the calling
        // thread, so the write must be guaranteed visible to the poller.
        public volatile boolean finished = false;
    }
 
}
Modify DataHandler class
1public static class DataHandler extends Machine.Handler {
2
3 private String dirIndex;
4 private String dirBackup;
5 private double systemQuota;
6
7 public DataHandler(String dirIndex, String dirBackup, double systemQuota) {
8 this.dirIndex = dirIndex;
9 this.dirBackup = dirBackup;
10 this.systemQuota = systemQuota;
11 }
12
13 public SEntity.Handler getEntityHandler() {
14 return new LuceneHandler(dirIndex, dirBackup, systemQuota);
15 }
16
17..............
18}
/**
 * Machine handler excerpt that hands out Lucene-backed entity handlers.
 * The rest of the class is elided in this listing.
 */
public static class DataHandler extends Machine.Handler {

    private String dirIndex;
    private String dirBackup;
    private double systemQuota;
     
    /**
     * @param dirIndex    index directory passed on to each LuceneHandler
     * @param dirBackup   backup directory passed on to each LuceneHandler
     * @param systemQuota quota passed on to each LuceneHandler
     */
    public DataHandler(String dirIndex, String dirBackup, double systemQuota) {
        this.dirIndex = dirIndex;
        this.dirBackup = dirBackup;
        this.systemQuota = systemQuota;
    }
     
    /** Returns a new LuceneHandler built from the stored settings on every call. */
    public SEntity.Handler getEntityHandler() { 
        return new LuceneHandler(dirIndex, dirBackup, systemQuota);
    }

..............    
}
javascript
1function main(env, args) {
2 var no = 1;
3 if (no == 1) {
4 test01(env, args); // Grab products
5 }
6 if (no == 2) {
7 test02(env, args); // List all products
8 }
9 if (no == 3) {
10 test03(env, args); // Search products
11 }
12 if (no == 4) {
13 test04(env, args); // Delete products
14 }
15}
16
17function test04(env, args) {
18 var entity = env.newEntity();
19 var query = entity.newMatchAllDocsQuery();
20 var products = entity.search('Link', query, 3, 1);
21 for (var i = 0; i < products.size(); i++) {
22 products.get(i).delete();
23 }
24}
25
26function test03(env, args) {
27 var term = 'Sleeping';
28
29 var entity = env.newEntity();
30 var query = entity.parseQuery([term, term], ['desc', 'title'], [entity.occurShould(), entity.occurShould()]);
31 var size = entity.count('Link', query, 999999);
32 var products = entity.search('Link', query, entity.newSort(org.apache.lucene.search.SortField.FIELD_SCORE), 999999);
33 for (var i = 0; i < products.size(); i++) {
34 var title = env.newString(products.get(i).getString('title').getBytes('UTF-8'), 'UTF-8');
35 try {
36 title = entity.highlight(query, title, 'title', 50, 3, ' (...) ');
37 } catch (e) {
38 env.error(e);
39 }
40 var desc = env.newString(products.get(i).getString('desc').getBytes('UTF-8'), 'UTF-8');
41 try {
42 desc = entity.highlight(query, desc, 'desc', 50, 3, ' (...) ');
43 } catch (e) {
44 env.error(e);
45 }
46 printProduct(products.get(i), env, desc, title);
47 }
48}
49
50function test02(env, args) {
51 var entity = env.newEntity();
52 var query = entity.newMatchAllDocsQuery();
53 var size = entity.count('Link', query, 999999);
54 var products = entity.search('Link', query, 999999);
55 env.info('Size: ' + size);
56 for (var i = 0; i < products.size(); i++) {
57 printProduct(products.get(i), env);
58 }
59}
60
61function test01(env, args) {
62 var astore = 'paesia';
63 var node = '100';
64 var maxpage = 2;
65 var products = grabProduct(astore, node, maxpage, env);
66 for (var i = 0; i < products.size(); i++) {
67 var pro = products.get(i);
68 saveProduct(pro, env);
69 }
70 env.info('Saved: ' + products.size());
71}
72
73function printProduct(pro, env, descH, titleH) {
74 var line = '';
75 line += '\nId: ' + pro.getId();
76 line += '\nTitle: ' + pro.getString('title');
77 line += '\nUrl: ' + pro.getString('url');
78 line += '\nDescription: \n' + pro.getString('desc');
79 if (titleH != null) {
80 line += '\nTitle Highlight: \n' + titleH;
81 }
82 if (descH != null) {
83 line += '\nDescription Highlight: \n' + descH;
84 }
85 env.info('\n' + line + '\n');
86}
87
88function saveProduct(pro, env) {
89 var title = pro.get('title');
90 var url = pro.get('url');
91 if (title == null || title.length == 0 || url == null || url.length == 0) return;
92 if (findProductByUrl(url, env)) return;
93 var desc = pro.get('description');
94 if (desc == null) desc = '';
95 if (desc.length() > 0) {
96 var doc = env.newJsoup().parse(desc);
97 desc = doc.select('body').first().text();
98 }
99 var schema = 's|url|a|title|a|desc';
100 var entity = env.newEntity();
101 entity.setSchema(schema);
102 entity.setKind('Link');
103 entity.setId(env.uniqid());
104 entity.setString('url', url);
105 entity.setString('title', title);
106 entity.setString('desc', desc);
107 entity.save();
108}
109
110function findProductByUrl(url, env) {
111 var entity = env.newEntity();
112 var query = entity.newTermQuery(entity.newTerm('url', url));
113 var size = entity.count('Link', query, 1);
114 return (size > 0);
115}
116
117function grabProduct(astore, node, maxpage, env) {
118 var tag = env.newArrayList();
119 for (var no = 1; no <= maxpage; no++) {
120 try {
121 var alink = env.newURL('http://astore.amazon.com/' + astore + '-20?node=' + node + '&page=' + no);
122 var doc = env.newJsoup().parse(alink, 60000);
123 var elements = doc.select('#featuredProducts .textrow a');
124 var map = env.newHashMap();
125 for (var i = 0; i < elements.size(); i++) {
126 var element = elements.get(i);
127 var title = element.text();
128 var url = element.attr('href');
129 var pos = url.lastIndexOf('/detail/');
130 if (pos < 0) continue;
131 var code = url.substring(pos + 8);
132 var url = env.newURL(alink, url) + '';
133 var item = env.newHashMap();
134 item.put('code', code);
135 item.put('title', title);
136 item.put('url', url);
137 map.put(code, item);
138 }
139 elements = doc.select('#featuredProducts .imagerow a');
140 for (var i = 0; i < elements.size(); i++) {
141 var element = elements.get(i);
142 var url = element.attr('href');
143 var pos = url.lastIndexOf('/detail/');
144 if (pos < 0) continue;
145 var code = url.substring(pos + 8);
146 var item = map.get(code);
147 if (item == null) continue;
148 var child = element.select('img').first();
149 if (child == null) continue;
150 var title = child.attr('alt');
151 var smimg = child.attr('src');
152 if (title.length() > 0) {
153 item.put('title', title);
154 }
155 item.put('small-image', smimg);
156 }
157
158 var keys = env.getKeys(map);
159 for (var i = 0; i < keys.size(); i++) {
160 try {
161 var item = map.get(keys.get(i));
162 alink = env.newURL(item.get('url'));
163 doc = env.newJsoup().parse(alink, 60000);
164 var element = doc.select('#detailImage img').first();
165 if (element != null) {
166 item.put('large-image', element.attr('src'));
167 }
168 element = doc.select('#productDescription').first();
169 if (element != null) {
170 var desc = element.html();
171 var pattern = '<h2>Product Description</h2>';
172 var pos = desc.indexOf(pattern);
173 if (pos >= 0) {
174 desc = desc.substring(pos + pattern.length);
175 }
176 var bdoc = env.newJsoup().parse(desc, item.get('url'));
177 buildURL(bdoc, item.get('url'), env);
178 desc = bdoc.select('body').first().html();
179 if (desc.indexOf('<html') < 0) {
180 item.put('description', desc);
181 }
182 }
183 element = doc.select('#productDetails').first();
184 if (element != null) {
185 var desc = element.html();
186 var pattern = '<h2>Product Details</h2>';
187 var pos = desc.indexOf(pattern);
188 if (pos >= 0) {
189 desc = desc.substring(pos + pattern.length);
190 }
191 var bdoc = env.newJsoup().parse(desc, item.get('url'));
192 buildURL(bdoc, item.get('url'), env);
193 desc = bdoc.select('body').first().html();
194 if (desc.indexOf('<html') < 0) {
195 item.put('details', desc);
196 }
197 }
198 element = doc.select('#editorialReviews').first();
199 if (element != null) {
200 var desc = element.html();
201 var bdoc = env.newJsoup().parse(desc, item.get('url') + '');
202 buildURL(bdoc, item.get('url'), env);
203 desc = bdoc.select('body').first().html();
204 if (desc.indexOf('<html') < 0) {
205 item.put('editorial-reviews', desc);
206 }
207 }
208 element = doc.select('#detailListPrice').first();
209 if (element != null) {
210 item.put('list-price', element.text());
211 }
212 element = doc.select('#detailOfferPrice').first();
213 if (element != null) {
214 item.put('offer-price', element.text());
215 }
216 element = doc.select('#addToCartForm a').first();
217 if (element != null) {
218 item.put('buy-url', element.attr('href'));
219 }
220 } catch (e) {
221 env.error(e);
222 }
223 }
224
225 for (var i = 0; i < keys.size(); i++) {
226 tag.add(map.get(keys.get(i)));
227 }
228 } catch (e) {
229 env.error(e);
230 }
231 }
232 return tag;
233}
234
235function buildURL(doc, baseUrl, env) {
236 baseUrl = env.newURL(baseUrl);
237 var elements = doc.select('a');
238 for (var i = 0; i < elements.size(); i++) {
239 var element = elements.get(i);
240 var url = env.newURL(baseUrl, element.attr('href'));
241 element.attr('href', url + '');
242 }
243 elements = doc.select('img');
244 for (var i = 0; i < elements.size(); i++) {
245 var element = elements.get(i);
246 var url = env.newURL(baseUrl, element.attr('src'));
247 element.attr('src', url + '');
248 }
249}
/**
 * Script entry point: runs exactly one of the demo steps.
 * Edit `no` to choose which step is executed.
 *
 * @param env  sandbox environment object, passed through to the step
 * @param args script arguments, passed through to the step
 */
function main(env, args) {
  var no = 1;
  switch (no) {
    case 1:
      test01(env, args); // Grab products
      break;
    case 2:
      test02(env, args); // List all products
      break;
    case 3:
      test03(env, args); // Search products
      break;
    case 4:
      test04(env, args); // Delete products
      break;
  }
}

/**
 * Demo step 4: deletes the first page (page size 3) of 'Link' entities.
 *
 * @param env  sandbox environment object (provides newEntity)
 * @param args script arguments (unused)
 */
function test04(env, args) {
  var finder = env.newEntity();
  var found = finder.search('Link', finder.newMatchAllDocsQuery(), 3, 1);
  var idx = 0;
  while (idx < found.size()) {
    found.get(idx).delete();
    idx += 1;
  }
}

/**
 * Demo step 3: searches 'Link' entities whose description or title matches
 * a fixed term, reports the hit count, and prints every hit together with
 * highlighted fragments of its title and description.
 *
 * @param env  sandbox environment object (provides newEntity, newString,
 *             info and error)
 * @param args script arguments (unused)
 */
function test03(env, args) {
  var term = 'Sleeping';

  var entity = env.newEntity();
  var query = entity.parseQuery([term, term], ['desc', 'title'], [entity.occurShould(), entity.occurShould()]);
  var size = entity.count('Link', query, 999999);
  var products = entity.search('Link', query, entity.newSort(org.apache.lucene.search.SortField.FIELD_SCORE), 999999);
  // Report the count the same way the listing step does, instead of
  // computing it and throwing it away.
  env.info('Size: ' + size);
  for (var i = 0; i < products.size(); i++) {
    var product = products.get(i);
    var title = env.newString(product.getString('title').getBytes('UTF-8'), 'UTF-8');
    try {
      title = entity.highlight(query, title, 'title', 50, 3, ' (...) ');
    } catch (e) {
      // Fall back to the plain title when highlighting fails.
      env.error(e);
    }
    var desc = env.newString(product.getString('desc').getBytes('UTF-8'), 'UTF-8');
    try {
      desc = entity.highlight(query, desc, 'desc', 50, 3, ' (...) ');
    } catch (e) {
      // Fall back to the plain description when highlighting fails.
      env.error(e);
    }
    printProduct(product, env, desc, title);
  }
}

/**
 * Demo step 2: reports how many 'Link' entities exist and prints each one.
 *
 * @param env  sandbox environment object (provides newEntity and info)
 * @param args script arguments (unused)
 */
function test02(env, args) {
  var finder = env.newEntity();
  var matchAll = finder.newMatchAllDocsQuery();
  var total = finder.count('Link', matchAll, 999999);
  var found = finder.search('Link', matchAll, 999999);
  env.info('Size: ' + total);
  var idx = 0;
  while (idx < found.size()) {
    printProduct(found.get(idx), env);
    idx += 1;
  }
}

/**
 * Demo step 1: grabs products from the 'paesia' store (node '100', up to
 * 2 pages), passes each one to saveProduct, and reports how many products
 * were grabbed.
 *
 * @param env  sandbox environment object (provides info)
 * @param args script arguments (unused)
 */
function test01(env, args) {
  var grabbed = grabProduct('paesia', '100', 2, env);
  var idx = 0;
  while (idx < grabbed.size()) {
    saveProduct(grabbed.get(idx), env);
    idx += 1;
  }
  env.info('Saved: ' + grabbed.size());
}

/**
 * Logs one stored product through env.info as a multi-line block.
 *
 * The block always contains the id, title, url and description of the
 * entity; the highlighted title / description are appended only when the
 * corresponding argument is neither null nor undefined.
 *
 * @param pro    stored entity (getId(), getString(field))
 * @param env    sandbox environment; only info() is used
 * @param descH  optional highlighted description
 * @param titleH optional highlighted title
 */
function printProduct(pro, env, descH, titleH) {
  var line = '\nId: ' + pro.getId() +
    '\nTitle: ' + pro.getString('title') +
    '\nUrl: ' + pro.getString('url') +
    '\nDescription: \n' + pro.getString('desc');
  var hasTitleHighlight = titleH != null;
  var hasDescHighlight = descH != null;
  if (hasTitleHighlight) {
    line = line + '\nTitle Highlight: \n' + titleH;
  }
  if (hasDescHighlight) {
    line = line + '\nDescription Highlight: \n' + descH;
  }
  env.info('\n' + line + '\n');
}

/**
 * Persists one scraped product as a 'Link' entity.
 *
 * Nothing is stored when the product has no title or no url, or when
 * findProductByUrl reports that an entity with the same url already exists.
 * The description is reduced to plain text (the text of the parsed <body>)
 * before it is stored; a missing description is stored as an empty string.
 *
 * @param pro map-like product (get(key)) with 'title', 'url' and optionally
 *            'description'
 * @param env sandbox environment (newJsoup, newEntity, uniqid)
 */
function saveProduct(pro, env) {
  var title = pro.get('title');
  var url = pro.get('url');
  if (title == null || url == null) return;

  // Values coming out of the map may be script strings or host (Java) string
  // objects. The two disagree on `length` (property vs. method), so convert
  // to script strings first and use the `length` property throughout.
  title = String(title);
  url = String(url);
  if (title.length === 0 || url.length === 0) return;
  if (findProductByUrl(url, env)) return;

  var desc = pro.get('description');
  desc = (desc == null) ? '' : String(desc);
  if (desc.length > 0) {
    // Strip the markup: keep only the text content of the parsed body.
    var doc = env.newJsoup().parse(desc);
    desc = doc.select('body').first().text();
  }

  // NOTE(review): the schema string is passed through unchanged; it appears
  // to pair a type flag with each field name - confirm against the entity API.
  var schema = 's|url|a|title|a|desc';
  var entity = env.newEntity();
  entity.setSchema(schema);
  entity.setKind('Link');
  entity.setId(env.uniqid());
  entity.setString('url', url);
  entity.setString('title', title);
  entity.setString('desc', desc);
  entity.save();
}

/**
 * Tells whether a 'Link' entity with the given url is already stored.
 *
 * Counts the matches of a term query on the 'url' field, asking for at most
 * one match.
 *
 * @param url url to look for
 * @param env sandbox environment; only newEntity() is used
 * @returns {boolean} true when the count is greater than zero
 */
function findProductByUrl(url, env) {
  var store = env.newEntity();
  var byUrl = store.newTermQuery(store.newTerm('url', url));
  var hits = store.count('Link', byUrl, 1);
  return hits > 0;
}

/**
 * Scrapes products from the listing pages of an Amazon aStore.
 *
 * For every page 1..maxpage of
 * http://astore.amazon.com/<astore>-20?node=<node>&page=<n>
 * the listed products are collected, enriched with their thumbnail and with
 * the data found on each product's detail page, and appended to the result.
 *
 * Each product is a map (env.newHashMap()) with the keys 'code', 'title' and
 * 'url', plus - when present on the pages - 'small-image', 'large-image',
 * 'description', 'details', 'editorial-reviews', 'list-price', 'offer-price'
 * and 'buy-url'.
 *
 * Failures are logged with env.error and never propagate: a failing detail
 * page leaves that product partially filled, a failing listing page is
 * skipped.
 *
 * @param astore  aStore name used in the listing url
 * @param node    category node id used in the listing url
 * @param maxpage number of listing pages to visit (pages are 1-based)
 * @param env     sandbox environment (newArrayList, newHashMap, newURL,
 *                newJsoup, getKeys, error)
 * @returns the list created by env.newArrayList(), holding one map per product
 */
function grabProduct(astore, node, maxpage, env) {
  var tag = env.newArrayList();
  for (var no = 1; no <= maxpage; no++) {
    try {
      var pageUrl = env.newURL('http://astore.amazon.com/' + astore + '-20?node=' + node + '&page=' + no);
      // NOTE(review): 60000 is presumably a timeout in milliseconds - confirm.
      var listing = env.newJsoup().parse(pageUrl, 60000);

      // Products of this page, keyed by product code.
      var map = env.newHashMap();
      grabListedProducts(listing, pageUrl, map, env);
      grabThumbnails(listing, map);

      var keys = env.getKeys(map);
      for (var i = 0; i < keys.size(); i++) {
        try {
          grabProductDetails(map.get(keys.get(i)), env);
        } catch (detailError) {
          // Keep whatever was collected for this product and move on.
          env.error(detailError);
        }
      }

      for (var k = 0; k < keys.size(); k++) {
        tag.add(map.get(keys.get(k)));
      }
    } catch (pageError) {
      env.error(pageError);
    }
  }
  return tag;
}

/**
 * Helper of grabProduct: registers every text-row link that points to a
 * '/detail/<code>' page as a new product map under its code.
 */
function grabListedProducts(listing, pageUrl, map, env) {
  var links = listing.select('#featuredProducts .textrow a');
  for (var i = 0; i < links.size(); i++) {
    var link = links.get(i);
    var href = link.attr('href');
    var pos = href.lastIndexOf('/detail/');
    if (pos < 0) continue;
    // Everything after '/detail/' (8 characters) is the product code.
    var code = href.substring(pos + 8);
    var item = env.newHashMap();
    item.put('code', code);
    item.put('title', link.text());
    // Resolve the link against the listing page and store it as a string.
    item.put('url', env.newURL(pageUrl, href) + '');
    map.put(code, item);
  }
}

/**
 * Helper of grabProduct: for every image-row link of an already registered
 * product, stores the thumbnail and prefers the image's alt text as title.
 */
function grabThumbnails(listing, map) {
  var links = listing.select('#featuredProducts .imagerow a');
  for (var i = 0; i < links.size(); i++) {
    var link = links.get(i);
    var href = link.attr('href');
    var pos = href.lastIndexOf('/detail/');
    if (pos < 0) continue;
    var item = map.get(href.substring(pos + 8));
    if (item == null) continue;
    var img = link.select('img').first();
    if (img == null) continue;
    var alt = img.attr('alt');
    if (alt.length() > 0) {
      item.put('title', alt);
    }
    item.put('small-image', img.attr('src'));
  }
}

/**
 * Helper of grabProduct: loads the product's detail page and copies image,
 * HTML sections, prices and buy link into the product map.
 */
function grabProductDetails(item, env) {
  var detail = env.newJsoup().parse(env.newURL(item.get('url')), 60000);

  var image = detail.select('#detailImage img').first();
  if (image != null) {
    item.put('large-image', image.attr('src'));
  }

  grabHtmlSection(detail, '#productDescription', '<h2>Product Description</h2>', 'description', item, env);
  grabHtmlSection(detail, '#productDetails', '<h2>Product Details</h2>', 'details', item, env);
  grabHtmlSection(detail, '#editorialReviews', null, 'editorial-reviews', item, env);

  var listPrice = detail.select('#detailListPrice').first();
  if (listPrice != null) {
    item.put('list-price', listPrice.text());
  }
  var offerPrice = detail.select('#detailOfferPrice').first();
  if (offerPrice != null) {
    item.put('offer-price', offerPrice.text());
  }
  var buyLink = detail.select('#addToCartForm a').first();
  if (buyLink != null) {
    item.put('buy-url', buyLink.attr('href'));
  }
}

/**
 * Helper of grabProduct: stores the inner HTML of the first element matching
 * `selector` under `key`. When `heading` is given and found, only the HTML
 * after it is kept. Links and images are rewritten with buildURL relative to
 * the product url. Nothing is stored when the element is missing or when the
 * resulting HTML still contains an '<html' tag.
 */
function grabHtmlSection(detail, selector, heading, key, item, env) {
  var element = detail.select(selector).first();
  if (element == null) return;

  var html = element.html();
  if (heading != null) {
    var pos = html.indexOf(heading);
    if (pos >= 0) {
      html = html.substring(pos + heading.length);
    }
  }

  var fragment = env.newJsoup().parse(html, item.get('url'));
  buildURL(fragment, item.get('url'), env);
  html = fragment.select('body').first().html();
  if (html.indexOf('<html') < 0) {
    item.put(key, html);
  }
}

/**
 * Rewrites link and image addresses of a parsed document in place.
 *
 * Every 'href' of an <a> element and every 'src' of an <img> element is
 * replaced by the string form of env.newURL(base, value), where base is
 * env.newURL(baseUrl). Anchors are processed before images.
 *
 * @param doc     parsed document (select(selector))
 * @param baseUrl address the attribute values are resolved against
 * @param env     sandbox environment; only newURL() is used
 */
function buildURL(doc, baseUrl, env) {
  var base = env.newURL(baseUrl);
  // [selector, attribute] pairs, handled in this order.
  var targets = [['a', 'href'], ['img', 'src']];
  for (var t = 0; t < targets.length; t++) {
    var attrName = targets[t][1];
    var nodes = doc.select(targets[t][0]);
    for (var n = 0; n < nodes.size(); n++) {
      var node = nodes.get(n);
      var resolved = env.newURL(base, node.attr(attrName));
      node.attr(attrName, resolved + '');
    }
  }
}

  Protected by Copyscape Online Copyright Protection