Extract summary of web page
This task uses Java and jsoup to extract a summary of a web page.
Extract summary of web page
- Create the Summary class as follows
- Call the Summary.extract() method as follows
Call Summary.extract() method
Summary class
1 | String link = "http://forums.digitalpoint.com"; |
2 | Document doc = Jsoup.parse(new URL(link), 60000); |
3 | String sum = Summary.extract(doc); |
4 | logger.info(sum); |
// Page to summarise.
final String pageUrl = "http://forums.digitalpoint.com";
// Download and parse the page; the second argument is jsoup's timeout in milliseconds.
final Document page = Jsoup.parse(new URL(pageUrl), 60000);
// Collect the sentence-like <div>/<p> text of the page and write it to the log.
final String summaryText = Summary.extract(page);
logger.info(summaryText);
1 | public static class Summary { |
2 | |
3 | public static String extract(Document doc) { |
4 | String tag = ""; |
5 | List<String> lines = new ArrayList<String>(); |
6 | |
7 | for (int i = 0; i < doc.children().size(); i++) { |
8 | summary(doc.child(i), lines); |
9 | } |
10 | |
11 | for (int i = 0; i < lines.size(); i++) { |
12 | if (tag.length() > 0) tag += "\r\n"; |
13 | tag += lines.get(i); |
14 | } |
15 | |
16 | return tag; |
17 | } |
18 | |
19 | private static void summary(Element ele, List<String> lines) { |
20 | if (ele.children().size() == 0 || allowedChildren(ele)) { |
21 | String[] tags = new String[] { "div", "p" }; |
22 | for (int i = 0; i < tags.length; i++) { |
23 | if (ele.tagName().equalsIgnoreCase(tags[i])) { |
24 | String text = ele.text().trim(); |
25 | if (text.length() > 0) { |
26 | if (text.endsWith(".") || text.endsWith("?")) { |
27 | lines.add(text); |
28 | } |
29 | } |
30 | } |
31 | } |
32 | } else { |
33 | for (int i = 0; i < ele.children().size(); i++) { |
34 | summary(ele.child(i), lines); |
35 | } |
36 | } |
37 | } |
38 | |
39 | private static boolean allowedChildren(Element ele) { |
40 | String[] tags = new String[] { "a", "b", "i", "strong" }; |
41 | for (int i = 0; i < ele.children().size(); i++) { |
42 | Element child = ele.child(i); |
43 | boolean found = false; |
44 | for (int j = 0; j < tags.length; j++) { |
45 | if (tags[j].equalsIgnoreCase(child.tagName())) { |
46 | found = true; |
47 | break; |
48 | } |
49 | } |
50 | if (!found) return false; |
51 | } |
52 | return true; |
53 | } |
54 | |
55 | } |
/**
 * Builds a plain-text summary of an HTML page from its sentence-like
 * {@code <div>} and {@code <p>} blocks.
 *
 * <p>An element is collected when it has no child elements, or only children
 * tagged {@code a}, {@code b}, {@code i} or {@code strong}, it is itself a
 * {@code div} or {@code p}, and its trimmed text is non-empty and ends with
 * a period or a question mark.
 */
public static class Summary {

    /** Tags whose text may be collected as a summary line. */
    private static final String[] TEXT_TAGS = { "div", "p" };

    /** Child tags that do not prevent an element from being treated as a text block. */
    private static final String[] INLINE_TAGS = { "a", "b", "i", "strong" };

    /** Separator placed between collected lines in the returned summary. */
    private static final String LINE_SEPARATOR = "\r\n";

    /**
     * Extracts the summary text of a parsed document.
     *
     * @param doc the parsed document to scan
     * @return the collected lines joined by {@code "\r\n"}, or an empty
     *         string when no element qualifies
     */
    public static String extract(Document doc) {
        List<String> lines = new ArrayList<String>();
        for (Element child : doc.children()) {
            summary(child, lines);
        }

        // A StringBuilder avoids re-copying the accumulated text on every append.
        StringBuilder joined = new StringBuilder();
        for (String line : lines) {
            if (joined.length() > 0) {
                joined.append(LINE_SEPARATOR);
            }
            joined.append(line);
        }
        return joined.toString();
    }

    /**
     * Walks the element tree depth-first and appends every qualifying text
     * block to {@code lines} in document order.
     *
     * @param ele   the element to inspect
     * @param lines accumulator that receives the collected text
     */
    private static void summary(Element ele, List<String> lines) {
        // Fetch the child list once instead of once per loop iteration.
        List<Element> children = ele.children();

        // A container with non-inline children is not a text block itself:
        // descend into it instead.
        if (!children.isEmpty() && !allowedChildren(ele)) {
            for (Element child : children) {
                summary(child, lines);
            }
            return;
        }

        if (!matchesAny(TEXT_TAGS, ele.tagName())) {
            return;
        }

        String text = ele.text().trim();
        // Keep only text that reads like a finished sentence or a question.
        if (text.length() > 0 && (text.endsWith(".") || text.endsWith("?"))) {
            lines.add(text);
        }
    }

    /**
     * Tells whether every direct child of {@code ele} is one of the inline tags.
     *
     * @param ele the element whose direct children are checked
     * @return {@code true} when all direct children are inline tags (also
     *         when there are no children), {@code false} otherwise
     */
    private static boolean allowedChildren(Element ele) {
        for (Element child : ele.children()) {
            if (!matchesAny(INLINE_TAGS, child.tagName())) {
                return false;
            }
        }
        return true;
    }

    /** Case-insensitively checks whether {@code tagName} is one of {@code tags}. */
    private static boolean matchesAny(String[] tags, String tagName) {
        for (String tag : tags) {
            if (tag.equalsIgnoreCase(tagName)) {
                return true;
            }
        }
        return false;
    }

}
No comments:
Post a Comment