Html Parser

14982 ワード

/**
     * Parses an in-memory HTML fragment holding two tables, keeps only the
     * {@code TableTag} nodes, and logs the plain text of every cell.
     *
     * Markup shape under test: table / tr / td.
     */
    public void testTable() {
        Parser myParser = Parser.createParser("<body> " + "<table id=’table1′ >"
                + "<tr><td>1-11</td><td>1-12</td><td>1-13</td>"
                + "<tr><td>1-21</td><td>1-22</td><td>1-23</td>"
                + "<tr><td>1-31</td><td>1-32</td><td>1-33</td></table>"
                + "<table id=’table2′ >"
                + "<tr><td>2-11</td><td>2-12</td><td>2-13</td>"
                + "<tr><td>2-21</td><td>2-22</td><td>2-23</td>"
                + "<tr><td>2-31</td><td>2-32</td><td>2-33</td></table>"
                + "</body>", "GBK");
        NodeFilter tableFilter = new NodeClassFilter(TableTag.class);
        OrFilter lastFilter = new OrFilter();
        lastFilter.setPredicates(new NodeFilter[] { tableFilter });
        try {
            NodeList nodeList = myParser.parse(lastFilter);
            // Strict upper bound: index size() is one past the last matched node.
            for (int i = 0; i < nodeList.size(); i++) {
                if (!(nodeList.elementAt(i) instanceof TableTag)) {
                    continue;
                }
                TableTag tag = (TableTag) nodeList.elementAt(i);
                TableRow[] rows = tag.getRows();
                for (int j = 0; j < rows.length; j++) {
                    TableColumn[] td = rows[j].getColumns();
                    for (int k = 0; k < td.length; k++) {
                        logger.fatal("<td>" + td[k].toPlainTextString());
                    }
                }
            }
        } catch (ParserException e) {
            // Best-effort demo test: report the parse failure and carry on.
            e.printStackTrace();
        }
    }

/**
     * Walks every node of the document at http://www.google.com with an
     * anonymous {@code NodeVisitor} and logs the name and class of each tag.
     */
    public void testVisitorAll() {
        try {
            Parser parser = new Parser();
            parser.setURL("http://www.google.com");
            parser.setEncoding(parser.getEncoding());

            NodeVisitor visitor = new NodeVisitor() {
                public void visitTag(Tag tag) {
                    logger.fatal("testVisitorAll() Tag name is :"
                            + tag.getTagName() + " \nClass is :" + tag.getClass());
                }
            };
            parser.visitAllNodesWith(visitor);
        } catch (ParserException e) {
            e.printStackTrace();
        }
    }

    /**
     * Visits an inline HTML snippet with a {@code NodeVisitor} whose
     * {@code visitTag} branches on the concrete tag class (HeadTag, TitleTag,
     * LinkTag, anything else) and logs tag name, class and text; for LinkTag
     * it also logs the {@code href} attribute.
     */
    public void testTagVisitor() {
        try {
            Parser parser = new Parser(
                    "<head><title>dddd</title>"
                    + "<link href=http://www.yeeach.com/’/test01/css.css’ text=’text/css’ rel=’stylesheet’ />"
                    + "<link href=http://www.yeeach.com/’/test02/css.css’ text=’text/css’ rel=’stylesheet’ />"
                    + "</head>" + "<body>"
                    + "<a href=http://www.yeeach.com/’http://www.yeeach.com’>yeeach.com</a>"
                    + "</body>");
            NodeVisitor visitor = new NodeVisitor() {
                public void visitTag(Tag tag) {
                    if (tag instanceof HeadTag) {
                        logger.fatal("visitTag() HeadTag : Tag name is :"
                                + tag.getTagName() + "\nClass is :" + tag.getClass()
                                + "\nText is :" + tag.getText());
                    } else if (tag instanceof TitleTag) {
                        logger.fatal("visitTag() TitleTag : Tag name is :"
                                + tag.getTagName() + "\nClass is :" + tag.getClass()
                                + "\nText is :" + tag.getText());
                    } else if (tag instanceof LinkTag) {
                        logger.fatal("visitTag() LinkTag : Tag name is :"
                                + tag.getTagName() + "\nClass is :" + tag.getClass()
                                + "\nText is :" + tag.getText()
                                + "\ngetAttribute is :" + tag.getAttribute("href"));
                    } else {
                        logger.fatal("visitTag() : Tag name is :"
                                + tag.getTagName() + "\nClass is :" + tag.getClass()
                                + "\nText is :" + tag.getText());
                    }
                }
            };
            parser.visitAllNodesWith(visitor);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Feeds an inline HTML document to an {@code HtmlPage} visitor, then logs
     * the page title and the HTML of every node in the body list.
     */
    public void testHtmlPage() {
        String inputHTML = "<html>" + "<head>"
                + "<title>Welcome to the HTMLParser website</title>"
                + "</head>" + "<body>" + "Welcome to HTMLParser"
                + "<table id=’table1′ >"
                + "<tr><td>1-11</td><td>1-12</td><td>1-13</td>"
                + "<tr><td>1-21</td><td>1-22</td><td>1-23</td>"
                + "<tr><td>1-31</td><td>1-32</td><td>1-33</td></table>"
                + "<table id=’table2′ >"
                + "<tr><td>2-11</td><td>2-12</td><td>2-13</td>"
                + "<tr><td>2-21</td><td>2-22</td><td>2-23</td>"
                + "<tr><td>2-31</td><td>2-32</td><td>2-33</td></table>"
                + "</body>" + "</html>";
        Parser parser = new Parser();
        try {
            parser.setInputHTML(inputHTML);
            // setEncoding expects a charset name; the original passed
            // parser.getURL() here. Use the parser's own encoding, as the
            // other tests in this file do.
            parser.setEncoding(parser.getEncoding());
            HtmlPage page = new HtmlPage(parser);
            parser.visitAllNodesWith(page);
            logger.fatal("testHtmlPage -title is :" + page.getTitle());
            NodeList list = page.getBody();
            for (NodeIterator iterator = list.elements(); iterator.hasMoreNodes();) {
                Node node = iterator.nextNode();
                logger.fatal("testHtmlPage -node is :" + node.toHtml());
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
    }

    /**
     * Uses {@code LinkBean} to collect the links of http://www.google.com and
     * logs each returned URL.
     */
    public void testLinkBean() {
        LinkBean linkBean = new LinkBean();
        linkBean.setURL("http://www.google.com");
        URL[] urls = linkBean.getLinks();
        for (int i = 0; i < urls.length; i++) {
            logger.fatal("testLinkBean() -url is :" + urls[i]);
        }
    }
}

参照
もう一つ
/**
 * Demonstration tests for the HTML Parser visitor and filter APIs. Each test
 * logs what it finds through the class logger; none of them makes assertions,
 * and parse failures are printed rather than rethrown.
 */
public class ParserTestCase extends TestCase {
    private static final Logger logger = Logger.getLogger(ParserTestCase.class);

    /**
     * Creates a test case with the given name.
     *
     * @param name name handed to the {@code TestCase} constructor
     */
    public ParserTestCase(String name) {
        super(name);
    }

    /**
     * Collects ImageTag nodes from http://www.google.com with an
     * {@code ObjectFindingVisitor} and logs each image's URL, extracted
     * location and {@code SRC} attribute.
     */
    public void testImageVisitor() {
        try {
            ObjectFindingVisitor visitor = new ObjectFindingVisitor(ImageTag.class);
            Parser parser = new Parser();
            parser.setURL("http://www.google.com");
            parser.setEncoding(parser.getEncoding());
            parser.visitAllNodesWith(visitor);
            Node[] nodes = visitor.getTags();
            for (int i = 0; i < nodes.length; i++) {
                ImageTag imgLink = (ImageTag) nodes[i];
                logger.fatal("testImageVisitor() ImageURL = "
                        + imgLink.getImageURL());
                logger.fatal("testImageVisitor() ImageLocation = "
                        + imgLink.extractImageLocn());
                logger.fatal("testImageVisitor() SRC = "
                        + imgLink.getAttribute("SRC"));
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Extracts the nodes matching a {@code TagNameFilter} for "IMG" from
     * http://www.google.com and logs the HTML of each.
     */
    public void testNodeFilter() {
        try {
            NodeFilter filter = new TagNameFilter("IMG");
            Parser parser = new Parser();
            parser.setURL("http://www.google.com");
            parser.setEncoding(parser.getEncoding());
            NodeList list = parser.extractAllNodesThatMatch(filter);
            for (int i = 0; i < list.size(); i++) {
                logger.fatal("testNodeFilter() " + list.elementAt(i).toHtml());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Extracts the nodes matching a {@code NodeClassFilter} for LinkTag from
     * http://www.google.com and logs the link extracted from each.
     */
    public void testLinkTag() {
        try {
            NodeFilter filter = new NodeClassFilter(LinkTag.class);
            Parser parser = new Parser();
            parser.setURL("http://www.google.com");
            parser.setEncoding(parser.getEncoding());
            NodeList list = parser.extractAllNodesThatMatch(filter);
            for (int i = 0; i < list.size(); i++) {
                LinkTag node = (LinkTag) list.elementAt(i);
                logger.fatal("testLinkTag() Link is :" + node.extractLink());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Iterates the top-level nodes of an inline snippet containing two
     * stylesheet {@code link} elements and logs each node's text and class.
     */
    public void testLinkCSS() {
        try {
            Parser parser = new Parser();
            parser.setInputHTML("<head><title>Link Test</title>"
                    + "<link href=http://www.yeeach.com/’/test01/css.css’ text=’text/css’ rel=’stylesheet’ />"
                    + "<link href=http://www.yeeach.com/’/test02/css.css’ text=’text/css’ rel=’stylesheet’ />"
                    + "</head>" + "<body>");
            parser.setEncoding(parser.getEncoding());
            for (NodeIterator e = parser.elements(); e.hasMoreNodes();) {
                Node node = e.nextNode();
                logger.fatal("testLinkCSS()" + node.getText() + node.getClass());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Parses an inline form snippet through an {@code OrFilter} combining
     * InputTag and SelectTag class filters. Logs the name and value of each
     * input, and the option text of each OptionTag child of a select.
     */
    public void testOrFilter() {
        NodeFilter inputFilter = new NodeClassFilter(InputTag.class);
        NodeFilter selectFilter = new NodeClassFilter(SelectTag.class);

        try {
            Parser parser = new Parser();
            parser.setInputHTML("<head><title>OrFilter Test</title>"
                    + "<link href=http://www.yeeach.com/’/test01/css.css’ text=’text/css’ rel=’stylesheet’ />"
                    + "<link href=http://www.yeeach.com/’/test02/css.css’ text=’text/css’ rel=’stylesheet’ />"
                    + "</head>"
                    + "<body>"
                    + "<input type=’text’ value=’text1′ name=’text1′/>"
                    + "<input type=’text’ value=’text2′ name=’text2′/>"
                    + "<select><option id=’1′>1</option><option id=’2′>2</option><option id=’3′></option></select>"
                    + "<a href=http://www.yeeach.com/’http://www.yeeach.com’>yeeach.com</a>"
                    + "</body>");

            parser.setEncoding(parser.getEncoding());
            OrFilter lastFilter = new OrFilter();
            lastFilter.setPredicates(new NodeFilter[] { selectFilter, inputFilter });
            NodeList nodeList = parser.parse(lastFilter);
            // Strict upper bound: index size() is one past the last matched node.
            for (int i = 0; i < nodeList.size(); i++) {
                Node matched = nodeList.elementAt(i);
                if (matched instanceof InputTag) {
                    InputTag tag = (InputTag) matched;
                    logger.fatal("OrFilter tag name is :" + tag.getTagName()
                            + " ,tag value is:" + tag.getAttribute("value"));
                }
                if (matched instanceof SelectTag) {
                    logOptions((SelectTag) matched);
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
    }

    /** Logs the option text of every OptionTag child of the given select. */
    private void logOptions(SelectTag select) {
        NodeList children = select.getChildren();
        if (children == null) {
            return;
        }
        for (int j = 0; j < children.size(); j++) {
            Node child = children.elementAt(j);
            // Skip any child that is not an OptionTag instead of failing the cast.
            if (child instanceof OptionTag) {
                logger.fatal("OrFilter Option" + ((OptionTag) child).getOptionText());
            }
        }
    }
}