Extract summary of web page
This task uses Java and jsoup to extract a summary of a web page.
Extract summary of web page
- Create the Summary class as follows
- Call the Summary.extract() method as follows
Call Summary.extract() method
// Fetch and parse the page, allowing up to 60 seconds for the request.
final String pageUrl = "http://forums.digitalpoint.com";
final int timeoutMillis = 60000;
Document page = Jsoup.parse(new URL(pageUrl), timeoutMillis);
// Build the summary text and write it to the log.
logger.info(Summary.extract(page));
Summary class
/**
 * Builds a plain-text summary of a parsed jsoup document.
 *
 * <p>The summary is made of the text of every {@code div} or {@code p} element that
 * either has no child elements or contains only {@code a}, {@code b}, {@code i} or
 * {@code strong} children, and whose trimmed text ends with {@code "."} or {@code "?"}.
 */
public static class Summary {

    /** Tags whose text is eligible to become a summary line. */
    private static final String[] TEXT_TAGS = { "div", "p" };

    /** Child tags an element may contain and still be treated as a text block. */
    private static final String[] INLINE_TAGS = { "a", "b", "i", "strong" };

    /** Separator placed between collected lines in the returned summary. */
    private static final String LINE_SEPARATOR = "\r\n";

    /**
     * Extracts the summary of the given document.
     *
     * @param doc the parsed document; {@code null} yields an empty summary
     * @return the collected lines in document order joined with {@code "\r\n"},
     *         or an empty string when nothing qualifies
     */
    public static String extract(Document doc) {
        if (doc == null) {
            return "";
        }
        List<String> lines = new ArrayList<String>();
        for (Element child : doc.children()) {
            summary(child, lines);
        }
        // A StringBuilder avoids re-copying the accumulated text on every append.
        StringBuilder joined = new StringBuilder();
        for (String line : lines) {
            if (joined.length() > 0) {
                joined.append(LINE_SEPARATOR);
            }
            joined.append(line);
        }
        return joined.toString();
    }

    /**
     * Walks the element tree depth-first and appends qualifying text to {@code lines}.
     *
     * @param ele   the element to inspect
     * @param lines accumulator that receives the summary lines
     */
    private static void summary(Element ele, List<String> lines) {
        // Fetch the child list once instead of once per loop iteration.
        List<Element> children = ele.children();
        if (children.isEmpty() || allowedChildren(ele)) {
            // Text-block candidate: it is never descended into, even if it is not a div/p.
            if (!matchesAny(ele.tagName(), TEXT_TAGS)) {
                return;
            }
            String text = ele.text().trim();
            // Keep only text that ends like a sentence or a question.
            if (text.length() > 0 && (text.endsWith(".") || text.endsWith("?"))) {
                lines.add(text);
            }
            return;
        }
        for (Element child : children) {
            summary(child, lines);
        }
    }

    /**
     * Tells whether every child element of {@code ele} is one of the inline tags.
     *
     * @param ele the element whose children are checked
     * @return {@code true} when all children are inline tags (also when there are none)
     */
    private static boolean allowedChildren(Element ele) {
        for (Element child : ele.children()) {
            if (!matchesAny(child.tagName(), INLINE_TAGS)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Case-insensitive membership test of a tag name against a list of candidates.
     *
     * @param tagName    the tag name to look up
     * @param candidates the accepted tag names
     * @return {@code true} when {@code tagName} equals one of the candidates, ignoring case
     */
    private static boolean matchesAny(String tagName, String[] candidates) {
        for (String candidate : candidates) {
            if (candidate.equalsIgnoreCase(tagName)) {
                return true;
            }
        }
        return false;
    }
}
No comments:
Post a Comment