Sunday, 15 April 2012

Grab products from Amazon aStores

Grab products from Amazon aStores
This task uses a JavaScript sandbox with jsoup support to grab products from Amazon aStores.
Grab products from Amazon aStores
  1. Create javascript sandbox with jsoup support
  2. Create the following JavaScript:
javascript
1function main(env, args) {
2 var astore = 'paesia';
3 var node = '100';
4 var maxpage = 2;
5 try {
6 var products = grabProduct(astore, node, maxpage, env);
7 var links = args.get('links');
8 for (var i = 0; i < products.size(); i++) {
9 links.add(products.get(i));
10 }
11 } catch (e) {
12 env.error(e);
13 }
14}
15
16function grabProduct(astore, node, maxpage, env) {
17 var tag = env.newArrayList();
18 for (var no = 1; no <= maxpage; no++) {
19 try {
20 var alink = env.newURL('http://astore.amazon.com/' + astore + '-20?node=' + node + '&page=' + no);
21 var doc = env.newJsoup().parse(alink, 60000);
22 var elements = doc.select('#featuredProducts .textrow a');
23 var map = env.newHashMap();
24 for (var i = 0; i < elements.size(); i++) {
25 var element = elements.get(i);
26 var title = element.text();
27 var url = element.attr('href');
28 var pos = url.lastIndexOf('/detail/');
29 if (pos < 0) continue;
30 var code = url.substring(pos + 8);
31 var url = env.newURL(alink, url) + '';
32 var item = env.newHashMap();
33 item.put('code', code);
34 item.put('title', title);
35 item.put('url', url);
36 map.put(code, item);
37 }
38 elements = doc.select('#featuredProducts .imagerow a');
39 for (var i = 0; i < elements.size(); i++) {
40 var element = elements.get(i);
41 var url = element.attr('href');
42 var pos = url.lastIndexOf('/detail/');
43 if (pos < 0) continue;
44 var code = url.substring(pos + 8);
45 var item = map.get(code);
46 if (item == null) continue;
47 var child = element.select('img').first();
48 if (child == null) continue;
49 var title = child.attr('alt');
50 var smimg = child.attr('src');
51 if (title.length() > 0) {
52 item.put('title', title);
53 }
54 item.put('small-image', smimg);
55 }
56
57 var keys = env.getKeys(map);
58 for (var i = 0; i < keys.size(); i++) {
59 try {
60 var item = map.get(keys.get(i));
61 alink = env.newURL(item.get('url'));
62 doc = env.newJsoup().parse(alink, 60000);
63 var element = doc.select('#detailImage img').first();
64 if (element != null) {
65 item.put('large-image', element.attr('src'));
66 }
67 element = doc.select('#productDescription').first();
68 if (element != null) {
69 var desc = element.html();
70 var pattern = '<h2>Product Description</h2>';
71 var pos = desc.indexOf(pattern);
72 if (pos >= 0) {
73 desc = desc.substring(pos + pattern.length);
74 }
75 var bdoc = env.newJsoup().parse(desc, item.get('url'));
76 buildURL(bdoc, item.get('url'), env);
77 desc = bdoc.select('body').first().html();
78 if (desc.indexOf('<html') < 0) {
79 item.put('description', desc);
80 }
81 }
82 element = doc.select('#productDetails').first();
83 if (element != null) {
84 var desc = element.html();
85 var pattern = '<h2>Product Details</h2>';
86 var pos = desc.indexOf(pattern);
87 if (pos >= 0) {
88 desc = desc.substring(pos + pattern.length);
89 }
90 var bdoc = env.newJsoup().parse(desc, item.get('url'));
91 buildURL(bdoc, item.get('url'), env);
92 desc = bdoc.select('body').first().html();
93 if (desc.indexOf('<html') < 0) {
94 item.put('details', desc);
95 }
96 }
97 element = doc.select('#editorialReviews').first();
98 if (element != null) {
99 var desc = element.html();
100 var bdoc = env.newJsoup().parse(desc, item.get('url') + '');
101 buildURL(bdoc, item.get('url'), env);
102 desc = bdoc.select('body').first().html();
103 if (desc.indexOf('<html') < 0) {
104 item.put('editorial-reviews', desc);
105 }
106 }
107 element = doc.select('#detailListPrice').first();
108 if (element != null) {
109 item.put('list-price', element.text());
110 }
111 element = doc.select('#detailOfferPrice').first();
112 if (element != null) {
113 item.put('offer-price', element.text());
114 }
115 element = doc.select('#addToCartForm a').first();
116 if (element != null) {
117 item.put('buy-url', element.attr('href'));
118 }
119 } catch (e) {
120 env.error(e);
121 }
122 }
123
124 for (var i = 0; i < keys.size(); i++) {
125 tag.add(map.get(keys.get(i)));
126 }
127 } catch (e) {
128 env.error(e);
129 }
130 }
131 return tag;
132}
133
134function buildURL(doc, baseUrl, env) {
135 baseUrl = env.newURL(baseUrl);
136 var elements = doc.select('a');
137 for (var i = 0; i < elements.size(); i++) {
138 var element = elements.get(i);
139 var url = env.newURL(baseUrl, element.attr('href'));
140 element.attr('href', url + '');
141 }
142 elements = doc.select('img');
143 for (var i = 0; i < elements.size(); i++) {
144 var element = elements.get(i);
145 var url = env.newURL(baseUrl, element.attr('src'));
146 element.attr('src', url + '');
147 }
148}
/**
 * Sandbox entry point: grabs products for a fixed aStore/node and appends
 * each one to the collection stored under the 'links' key of `args`.
 *
 * Any exception raised while grabbing or appending is reported through
 * `env.error` instead of being propagated.
 *
 * @param env  host object; passed on to grabProduct and used for error reporting
 * @param args host object; `args.get('links')` must yield something with `add()`
 */
function main(env, args) {
  // Fixed job settings: store id, category node, number of listing pages.
  var storeId = 'paesia';
  var categoryNode = '100';
  var pageLimit = 2;

  try {
    var found = grabProduct(storeId, categoryNode, pageLimit, env);
    var sink = args.get('links');
    var idx = 0;
    while (idx < found.size()) {
      sink.add(found.get(idx));
      idx += 1;
    }
  } catch (failure) {
    env.error(failure);
  }
}

/**
 * Grabs the products listed on pages 1..maxpage of an aStore category.
 *
 * For every page it loads
 * `http://astore.amazon.com/<astore>-20?node=<node>&page=<n>`, collects the
 * products linked from the featured-products text row, merges in the
 * thumbnails from the image row, then loads each product's detail page to
 * add further fields.
 *
 * Each product is a map (from `env.newHashMap()`) that may contain the keys:
 * code, title, url, small-image, large-image, description, details,
 * editorial-reviews, list-price, offer-price, buy-url.
 *
 * Failures are reported through `env.error` and never thrown: a failing
 * listing page is skipped, and a failing detail page leaves the product with
 * whatever fields were gathered before the failure.
 *
 * @param astore  aStore identifier placed in front of '-20' in the URL
 * @param node    category node id used for the `node` query parameter
 * @param maxpage number of listing pages to visit (pages start at 1)
 * @param env     host object providing newArrayList, newHashMap, newURL,
 *                newJsoup, getKeys and error
 * @returns the list created by `env.newArrayList()` holding all product maps
 */
function grabProduct(astore, node, maxpage, env) {
  var tag = env.newArrayList();
  for (var no = 1; no <= maxpage; no++) {
    try {
      var alink = env.newURL('http://astore.amazon.com/' + astore + '-20?node=' + node + '&page=' + no);
      // 60000 is passed straight to the host parser (presumably a timeout in ms - confirm).
      var doc = env.newJsoup().parse(alink, 60000);
      var map = env.newHashMap();
      collectListedProducts(doc, alink, map, env);
      attachListingImages(doc, map);

      var keys = env.getKeys(map);
      var i;
      for (i = 0; i < keys.size(); i++) {
        // Per-product guard: one broken detail page must not lose the others.
        try {
          loadProductPage(map.get(keys.get(i)), env);
        } catch (e) {
          env.error(e);
        }
      }

      for (i = 0; i < keys.size(); i++) {
        tag.add(map.get(keys.get(i)));
      }
    } catch (e) {
      env.error(e);
    }
  }
  return tag;
}

/**
 * Returns the part of `href` after the last '/detail/', or null when the
 * link does not contain '/detail/'.
 */
function productCodeOf(href) {
  var marker = '/detail/';
  var pos = href.lastIndexOf(marker);
  if (pos < 0) {
    return null;
  }
  return href.substring(pos + marker.length);
}

/**
 * Creates one product map (code, title, url) per detail link found under
 * '#featuredProducts .textrow a' and stores it in `map` keyed by its code.
 * The url is the link's href resolved against `pageUrl` via `env.newURL`.
 */
function collectListedProducts(doc, pageUrl, map, env) {
  var links = doc.select('#featuredProducts .textrow a');
  for (var i = 0; i < links.size(); i++) {
    var link = links.get(i);
    var title = link.text();
    var href = link.attr('href');
    var code = productCodeOf(href);
    if (code == null) continue;
    var url = env.newURL(pageUrl, href) + '';
    var item = env.newHashMap();
    item.put('code', code);
    item.put('title', title);
    item.put('url', url);
    map.put(code, item);
  }
}

/**
 * For every detail link under '#featuredProducts .imagerow a' whose code is
 * already in `map`, stores the first image's src as 'small-image' and, when
 * the image's alt text is non-empty, replaces the product title with it.
 */
function attachListingImages(doc, map) {
  var links = doc.select('#featuredProducts .imagerow a');
  for (var i = 0; i < links.size(); i++) {
    var link = links.get(i);
    var code = productCodeOf(link.attr('href'));
    if (code == null) continue;
    var item = map.get(code);
    if (item == null) continue;
    var img = link.select('img').first();
    if (img == null) continue;
    var title = img.attr('alt');
    var smimg = img.attr('src');
    // length() is a method call here: attr() hands back a host string object.
    if (title.length() > 0) {
      item.put('title', title);
    }
    item.put('small-image', smimg);
  }
}

/**
 * Loads the page at the product's 'url' and copies the large image, the
 * HTML sections, the prices and the buy link into `item` when present.
 * Exceptions propagate to the caller.
 */
function loadProductPage(item, env) {
  var doc = env.newJsoup().parse(env.newURL(item.get('url')), 60000);

  var image = doc.select('#detailImage img').first();
  if (image != null) {
    item.put('large-image', image.attr('src'));
  }

  storeHtmlSection(doc, '#productDescription', '<h2>Product Description</h2>', item, 'description', env);
  storeHtmlSection(doc, '#productDetails', '<h2>Product Details</h2>', item, 'details', env);
  storeHtmlSection(doc, '#editorialReviews', null, item, 'editorial-reviews', env);

  storeText(doc, '#detailListPrice', item, 'list-price');
  storeText(doc, '#detailOfferPrice', item, 'offer-price');

  var buy = doc.select('#addToCartForm a').first();
  if (buy != null) {
    item.put('buy-url', buy.attr('href'));
  }
}

/**
 * Stores the inner HTML of the first element matching `selector` under `key`.
 *
 * When `heading` is given and occurs in the HTML, everything up to and
 * including its first occurrence is dropped. The remainder is re-parsed with
 * the product url as base, passed through buildURL, and the resulting body
 * HTML is stored unless it contains '<html'.
 */
function storeHtmlSection(doc, selector, heading, item, key, env) {
  var element = doc.select(selector).first();
  if (element == null) return;

  var html = element.html();
  if (heading != null) {
    var pos = html.indexOf(heading);
    if (pos >= 0) {
      html = html.substring(pos + heading.length);
    }
  }

  // + '' hands the parser a plain string base URI for every section.
  var bdoc = env.newJsoup().parse(html, item.get('url') + '');
  buildURL(bdoc, item.get('url'), env);
  html = bdoc.select('body').first().html();
  if (html.indexOf('<html') < 0) {
    item.put(key, html);
  }
}

/** Stores the text of the first element matching `selector` under `key`, if any. */
function storeText(doc, selector, item, key) {
  var element = doc.select(selector).first();
  if (element != null) {
    item.put(key, element.text());
  }
}

/**
 * Rewrites the `href` of every anchor and the `src` of every image in `doc`
 * to the result of resolving it against `baseUrl` with `env.newURL`.
 *
 * Only elements that actually carry the attribute are touched. A link that
 * `env.newURL` rejects is reported through `env.error` and left unchanged,
 * so one bad link does not stop the remaining ones from being rewritten.
 *
 * @param doc     parsed document whose links are rewritten in place
 * @param baseUrl base address; converted once with `env.newURL(baseUrl)`
 *                (a failure of that conversion is thrown to the caller)
 * @param env     host object providing newURL and error
 */
function buildURL(doc, baseUrl, env) {
  var base = env.newURL(baseUrl);
  resolveLinkAttribute(doc.select('a[href]'), 'href', base, env);
  resolveLinkAttribute(doc.select('img[src]'), 'src', base, env);
}

/**
 * Replaces attribute `attrName` of each element in `elements` with its value
 * resolved against `base`; failures are reported per element via `env.error`.
 */
function resolveLinkAttribute(elements, attrName, base, env) {
  for (var i = 0; i < elements.size(); i++) {
    var element = elements.get(i);
    try {
      var resolved = env.newURL(base, element.attr(attrName));
      element.attr(attrName, resolved + '');
    } catch (e) {
      // Keep the original value and move on to the next element.
      env.error(e);
    }
  }
}

  Protected by Copyscape Online Copyright Protection

No comments:

Post a Comment