Friday, 17 February 2012

Extract summary of web page

Extract summary of web page
This task uses Java and jsoup to extract a summary of a web page.
Extract summary of web page
  1. Create the Summary class as shown below
  2. Call the Summary.extract() method as shown below
Call Summary.extract() method
// Download and parse the page, then log the summary extracted from it.
// The second argument of Jsoup.parse(URL, int) is the fetch timeout in milliseconds.
final int timeoutMillis = 60000;
URL pageUrl = new URL("http://forums.digitalpoint.com");
Document page = Jsoup.parse(pageUrl, timeoutMillis);
logger.info(Summary.extract(page));
    
Summary class
/**
 * Builds a plain-text summary of a parsed HTML document.
 *
 * <p>The summary consists of the text of every {@code div} or {@code p} element that
 * has either no child elements or only inline children ({@code a}, {@code b},
 * {@code i}, {@code strong}), and whose trimmed text ends with {@code '.'} or
 * {@code '?'}. Elements with any other kind of child are not collected themselves;
 * their children are searched instead.
 */
public static class Summary {

    /** Separator written between two collected text blocks. */
    private static final String LINE_SEPARATOR = "\r\n";

    /** Tags whose text is eligible for the summary. */
    private static final String[] TEXT_TAGS = { "div", "p" };

    /** Child tags an element may contain and still be treated as one block of text. */
    private static final String[] INLINE_TAGS = { "a", "b", "i", "strong" };

    /**
     * Extracts the summary of the given document.
     *
     * @param doc the parsed document; may be {@code null}
     * @return the collected text blocks in document order, joined with {@code "\r\n"};
     *         an empty string when {@code doc} is {@code null} or nothing qualifies
     */
    public static String extract(Document doc) {
        if (doc == null) {
            return "";
        }

        List<String> lines = new ArrayList<String>();
        for (Element child : doc.children()) {
            summary(child, lines);
        }

        // StringBuilder avoids re-copying the whole result for every appended line.
        StringBuilder joined = new StringBuilder();
        for (String line : lines) {
            if (joined.length() > 0) {
                joined.append(LINE_SEPARATOR);
            }
            joined.append(line);
        }
        return joined.toString();
    }

    /**
     * Walks the element tree depth-first and appends every qualifying text block
     * to {@code lines}.
     *
     * @param ele   the element to inspect
     * @param lines accumulator for the collected text blocks
     */
    private static void summary(Element ele, List<String> lines) {
        // Fetch the child list once; it is needed for both the check and the descent.
        List<Element> children = ele.children();

        if (!allowedChildren(children)) {
            // Structural container: look for text blocks further down.
            for (Element child : children) {
                summary(child, lines);
            }
            return;
        }

        if (!matchesAny(ele.tagName(), TEXT_TAGS)) {
            return;
        }

        // The endsWith checks also reject empty text.
        String text = ele.text().trim();
        if (text.endsWith(".") || text.endsWith("?")) {
            lines.add(text);
        }
    }

    /**
     * Tells whether every element in {@code children} is one of the inline tags.
     *
     * @param children the child elements to check
     * @return {@code true} when the list is empty or contains only inline tags
     */
    private static boolean allowedChildren(List<Element> children) {
        for (Element child : children) {
            if (!matchesAny(child.tagName(), INLINE_TAGS)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Case-insensitively checks whether {@code tagName} equals one of {@code candidates}.
     *
     * @param tagName    the tag name to look up
     * @param candidates the accepted tag names
     * @return {@code true} when a candidate matches
     */
    private static boolean matchesAny(String tagName, String[] candidates) {
        for (String candidate : candidates) {
            if (candidate.equalsIgnoreCase(tagName)) {
                return true;
            }
        }
        return false;
    }

}
    

  Protected by Copyscape Online Copyright Protection

No comments:

Post a Comment