Extract summary of web page
This task uses Java and jsoup to extract a summary of a web page.
Extract summary of web page
- Create the Summary class as follows
- Call the Summary.extract() method as follows
Call Summary.extract() method
String link = "http://forums.digitalpoint.com";
Document doc = Jsoup.parse(new URL(link), 60000);
String sum = Summary.extract(doc);
logger.info(sum);

Summary class
/**
 * Builds a plain-text summary of a parsed HTML page from its sentence-like text blocks.
 *
 * <p>A {@code div} or {@code p} element contributes one line to the summary when all of its
 * child elements (if any) are {@code a}, {@code b}, {@code i} or {@code strong}, and its trimmed
 * text is non-empty and ends with "." or "?". Elements that contain any other child element are
 * not tested themselves; their children are searched recursively instead.
 */
public static class Summary {

    /** Separator placed between consecutive summary lines. */
    private static final String LINE_SEPARATOR = "\r\n";

    /** Tags whose text may be taken as a summary line. */
    private static final String[] TEXT_BLOCK_TAGS = { "div", "p" };

    /** Child tags that do not stop an element from being treated as a single text block. */
    private static final String[] INLINE_TAGS = { "a", "b", "i", "strong" };

    /**
     * Extracts the summary of a document.
     *
     * @param doc the parsed document; may be {@code null}
     * @return the collected lines in document order joined by "\r\n", or an empty string when
     *         {@code doc} is {@code null} or no element qualifies
     */
    public static String extract(Document doc) {
        if (doc == null) {
            return "";
        }

        List<String> lines = new ArrayList<String>();
        for (Element child : doc.children()) {
            summary(child, lines);
        }

        // StringBuilder avoids re-copying the accumulated text for every appended line.
        StringBuilder joined = new StringBuilder();
        for (String line : lines) {
            if (joined.length() > 0) {
                joined.append(LINE_SEPARATOR);
            }
            joined.append(line);
        }
        return joined.toString();
    }

    /**
     * Walks {@code ele} depth-first and appends every qualifying text block to {@code lines}.
     *
     * @param ele   the element to inspect
     * @param lines accumulator that receives the summary lines in document order
     */
    private static void summary(Element ele, List<String> lines) {
        // Fetch the child list once instead of on every loop test and child(i) call.
        List<Element> children = ele.children();

        if (!hasOnlyInlineChildren(children)) {
            for (Element child : children) {
                summary(child, lines);
            }
            return;
        }

        // Leaf-like element: it is never descended into, whether or not it qualifies.
        if (!isOneOf(ele.tagName(), TEXT_BLOCK_TAGS)) {
            return;
        }
        String text = ele.text().trim();
        if (text.length() > 0 && (text.endsWith(".") || text.endsWith("?"))) {
            lines.add(text);
        }
    }

    /**
     * Tells whether every element in {@code children} is an inline formatting tag.
     *
     * @param children the direct child elements to check
     * @return {@code true} when the list is empty or contains only inline tags
     */
    private static boolean hasOnlyInlineChildren(List<Element> children) {
        for (Element child : children) {
            if (!isOneOf(child.tagName(), INLINE_TAGS)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Case-insensitive membership test of a tag name against a list of candidates.
     *
     * @param tagName    the tag name to look up
     * @param candidates the accepted tag names
     * @return {@code true} when {@code tagName} equals one of {@code candidates}, ignoring case
     */
    private static boolean isOneOf(String tagName, String[] candidates) {
        for (String candidate : candidates) {
            if (candidate.equalsIgnoreCase(tagName)) {
                return true;
            }
        }
        return false;
    }
}
No comments:
Post a Comment