HtmlParserでHTMLファイルを解析する


htmlparserを初めて使ってから4か月が経ちました。忘れないように、ここで整理しておきます。
package epson;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.BodyTag;
import org.htmlparser.tags.HeadTag;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.MetaTag;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.tags.TableRow;
import org.htmlparser.tags.TableTag;
import org.htmlparser.tags.TitleTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.SimpleNodeIterator;
import org.htmlparser.visitors.TextExtractingVisitor;


public class HtmlAnalysis {
    /**
     * @param args
     */
    private String metaDataString;
    private String title;
    private String charset;
    private String contentType;
    private String content;
    private String link;
    
    
    private String localPath ;
    private Parser parser = null;
    private String htmlsource=null;
    
    public static final String META_KEYWORDS="keywords";
    public static final String META_AUTHOR="author";
    public static final String META_DESCRIPTION="description";
    public static final String META_HTTP_EQUIV="http-equiv";
    
    public HtmlAnalysis(String htmlsource){
    	this.htmlsource = htmlsource; 
    }
    
    public HtmlAnalysis(File htmlsource){
    	
    	try{
    	String resource = this.getContentByLocalFile(htmlsource);
    	this.htmlsource = resource;
    	}catch(Exception e){
    		
    	}
    }
    
    public void init() throws Exception{
    	try{
    	parser = new Parser(this.htmlsource);
    	}catch(Exception e){
    		throw e;
    	}
    }
    
    
    public String getMetaKeywords(){
    	String metaKeywords = "";
        	
    	try {
			NodeFilter nt = new NodeClassFilter(MetaTag.class) ;
			NodeList nodeList = parser.parse(nt);
			for (int i = 0 ; i< nodeList.size(); i++) {
				MetaTag mt =(MetaTag) nodeList.elementAt(i) ;
				String cont  = mt.getAttribute("name") ;
				
				if (cont!=null && cont.equalsIgnoreCase("Keywords")) {
					metaKeywords = mt.getAttribute("content");
					break;
				}
			}
		} catch (ParserException e) {
			e.printStackTrace();
		}
    	return metaKeywords;
    }

    public String getTitle() {
    	String title="";
       	
    	try {
			NodeFilter nt = new NodeClassFilter(TitleTag.class) ;
			NodeList nodeList = parser.parse(nt);
			for (int i = 0 ; i< nodeList.size(); i++) {
                TitleTag titlenode = (TitleTag) nodeList.elementAt(i) ;
                title = titlenode.getTitle();
                break;
			}    
		} catch (ParserException e) {
			e.printStackTrace();
		}

        return title;
    }

    public String getBody() {
    	String body="";
       	
    	try {
		NodeFilter nt = new NodeClassFilter(BodyTag.class) ;
		NodeList nodeList = parser.parse(nt);
		for (int i = 0 ; i< nodeList.size(); i++) {
                   BodyTag bodynode = (BodyTag) nodeList.elementAt(i) ;
                   body = bodynode.getChildrenHTML();
                   break;
		}    
		} catch (ParserException e) {
			e.printStackTrace();
		}

        return body;
    }

    public String getBodyOnload() {
    	String bodyonload=""; 	
    	try {
			NodeFilter nt = new NodeClassFilter(BodyTag.class) ;
			NodeList nodeList = parser.parse(nt);
			for (int i = 0 ; i< nodeList.size(); i++) {
				BodyTag bodynode = (BodyTag) nodeList.elementAt(i) ;
				bodyonload = bodynode.getAttribute("onload");
                               break;
			}  
            
		} catch (ParserException e) {
			e.printStackTrace();
		}

        return bodyonload;
    }    
    
    public String getHeadInfo() {
    	String head="";
       	
    	try {
			NodeFilter nt = new NodeClassFilter(HeadTag.class) ;
			NodeList nodeList = parser.parse(nt);
            
			HeadTag headnode = null;
			for (int i = 0 ; i< nodeList.size(); i++) {
				headnode = (HeadTag) nodeList.elementAt(i) ;
				break;
			}  
			
			
			if(headnode !=null){
				SimpleNodeIterator tag = headnode.children();
				int i=0;
				while(tag.hasMoreNodes()){
					Node node =tag.nextNode();
					if((node instanceof MetaTag) || node instanceof TitleTag){
						headnode.removeChild(i);
					}
					
					i++;
				}
			}
			
			head = headnode.getChildrenHTML();
            
            
		} catch (ParserException e) {
			e.printStackTrace();
		}

        return head;
    } 
    
    
    public String getMetaInfo(String keytype){
    	String metaInfo = "";
        	
    	try {
    		
			NodeFilter nt = new NodeClassFilter(MetaTag.class) ;
			NodeList nodeList = parser.parse(nt);
			
    		if(META_KEYWORDS.equalsIgnoreCase(keytype)
    			||
    			META_AUTHOR.equalsIgnoreCase(keytype)
    			||
    			META_DESCRIPTION.equalsIgnoreCase(keytype))
    		{

				for (int i = 0 ; i< nodeList.size(); i++) {
					MetaTag mt =(MetaTag) nodeList.elementAt(i) ;
					String cont  = mt.getAttribute("name") ;
					
					if (cont!=null && cont.equalsIgnoreCase(keytype)) {
						metaInfo = mt.getAttribute("content");
						break;
					}
				}
    		}else if(META_HTTP_EQUIV.equals(keytype)){
				for (int i = 0 ; i< nodeList.size(); i++) {
					MetaTag mt =(MetaTag) nodeList.elementAt(i) ;
					String cont  = mt.getAttribute("http-equiv") ;
					
					if (cont!=null && cont.equalsIgnoreCase(keytype)) {
						metaInfo = mt.getAttribute("content");
						break;
					}
				}
    		}else{
				for (int i = 0 ; i< nodeList.size(); i++) {
					MetaTag mt =(MetaTag) nodeList.elementAt(i) ;
					String cont  = mt.getAttribute("name") ;
					
					if (cont!=null) {
						
						if(META_KEYWORDS.equalsIgnoreCase(cont)
				    			||
				    			META_AUTHOR.equalsIgnoreCase(cont)
				    			||
				    			META_DESCRIPTION.equalsIgnoreCase(cont)){
							
							//
						}else{
							String tempmetaInfo = mt.getAttribute("content");
							metaInfo +="<"+cont+">"+tempmetaInfo+"</"+cont+">";
						}
							
						
					}
				}
    			
    		}
    		
    		
		} catch (ParserException e) {
			e.printStackTrace();
		}
    	return metaInfo;
    }
    
    
    public String  getContentByLocalFile (File path) throws IOException {
    	StringBuffer sbStr = new StringBuffer();
    	BufferedReader reader = null ;
    	String result = null ;
		try {
			reader = new BufferedReader(new FileReader(path));
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		}
		String temp = "";
		while((temp=reader.readLine())!=null)
		  {
		   sbStr.append(temp);
		   sbStr.append("\r
"); } reader.close(); result = sbStr.toString(); return result ; } public String getContentByUrl(String url){ return null ; } public void getmetaDataByVistor() { } public String getURLContent(String Url) { Parser parser = null; try { parser = new Parser(Url); String a=""; parser = new Parser(a); TextExtractingVisitor visitor = new TextExtractingVisitor(); parser.visitAllNodesWith(visitor); content = visitor.getExtractedText(); } catch (ParserException e1) { e1.printStackTrace(); } return content; } public NodeList getDiv(){ NodeList nodelist=null; NodeFilter[] nodeFilter=new NodeFilter[2]; try{ parser.setEncoding("GB2312");//set encode TagNameFilter divFilter=new TagNameFilter("div");//get the table content HasAttributeFilter divAttribute=new HasAttributeFilter("id","Cont_13");//hava the attribute "bgcolor" nodeFilter[0]=divFilter; nodeFilter[1]=divAttribute; AndFilter andFilter=new AndFilter(nodeFilter);//to link the three filter that above together nodelist=parser.extractAllNodesThatMatch(andFilter);//get the result that fit for the filter }catch(Exception e){ e.printStackTrace(); } return nodelist; } public NodeList getTable() throws ParserException{ NodeList nodelist=null; String dd = getDiv().toHtml(); Parser parser2 = new Parser(dd); TagNameFilter tableFilter=new TagNameFilter("table"); nodelist = parser2.extractAllNodesThatMatch(tableFilter); String htmlresult =""; for (int i = 0; i <= nodelist.size(); i++) { if (nodelist.elementAt(i) instanceof TableTag) { TableTag tag = (TableTag) nodelist.elementAt(i); TableRow[] rows = tag.getRows(); for (int j = 0; j < rows.length; j++) { TableRow tr = (TableRow) rows[j]; TableColumn[] td = tr.getColumns(); for (int k = 0; k < td.length; k++) { String result = td[k].toPlainTextString().trim().replace("\t", ""); if(k==0){ htmlresult += "<title>"+result+"</title>"; } else htmlresult += "<id>"+result+"</id>"; } } } } System.out.println(htmlresult); return nodelist; } public void testTable() { // Parser myParser; NodeList 
nodeList = null; // myParser = Parser.createParser("<body> " + "<table id=’table1′ >" // + "<tr><td>1-11</td><td>1-12</td><td>1-13</td>" // + "<tr><td>1-21</td><td>1-22</td><td>1-23</td>" // + "<tr><td>1-31</td><td>1-32</td><td>1-33</td></table>" // + "<table id=’table2′ >" // + "<tr><td>2-11</td><td>2-12</td><td>2-13</td>" // + "<tr><td>2-21</td><td>2-22</td><td>2-23</td>" // + "<tr><td>2-31</td><td>2-32</td><td>2-33</td></table>" // + "</body>", "GBK"); NodeFilter tableFilter = new NodeClassFilter(TableTag.class); OrFilter lastFilter = new OrFilter(); lastFilter.setPredicates(new NodeFilter[] { tableFilter }); try { nodeList = parser.parse(lastFilter); for (int i = 0; i <= nodeList.size(); i++) { if (nodeList.elementAt(i) instanceof TableTag) { TableTag tag = (TableTag) nodeList.elementAt(i); TableRow[] rows = tag.getRows(); for (int j = 0; j < rows.length; j++) { TableRow tr = (TableRow) rows[j]; TableColumn[] td = tr.getColumns(); for (int k = 0; k < td.length; k++) { System.out.println("<td>" + td[k].toPlainTextString()); } } } } } catch (ParserException e) { e.printStackTrace(); } } public String getImg() { String img=""; ImageTag imgnode=null; File file = new File("e:\\test\\jsp\\jsp\\test1.htm"); String imgRealPath=""; if(file.exists()) { file.delete(); try { file.createNewFile(); } catch (IOException e) { e.printStackTrace(); } }else{ try { file.createNewFile(); } catch (IOException e) { e.printStackTrace(); // TODO Auto-generated catch block } } try { NodeFilter nt = new NodeClassFilter(ImageTag.class) ; //BufferedWriter writer = new BufferedWriter(new OutputStreamWriter (new FileOutputStream (file))); NodeList nodeList = parser.parse(nt); for (int i = 0 ; i< nodeList.size(); i++){ int num=0; imgnode = (ImageTag)nodeList.elementAt(i); img = imgnode.getImageURL(); System.out.println(img); /* String[] filePath = file.getParent().split("\\\\"); String[] imgPath = img.split("/"); System.out.println(img+" "+file.getParent()); for(int j=0;j<imgPath.length;j++) 
{ if(imgPath[j].equals("..")) { num++; } } System.out.println(img.indexOf(":")+"img.indexOf(:)"+img); if(img.indexOf(":")!=-1) { imgRealPath=img; } else if(num>1) { System.out.println("img before replace"+img); img = img.replace("../",""); System.out.println("img num>1"+img+num); imgRealPath = filePath[filePath.length-1-num]+"/"+img; while((filePath.length-1-num)>0) { num++; imgRealPath = filePath[filePath.length-1-num]+imgRealPath; } System.out.println("imgRealPath"+imgRealPath+(filePath.length-1-num)); } else if(imgPath[0].equals(".")) { System.out.println(file.getParent()+"imgPath[0].equals(.)"); img = img.replace("./",""); imgRealPath=file.getParent()+"\\"+img; } else { for(int j=0;j<imgPath.length;j++) { if(imgPath[j].equals("..")) { imgPath[j] = (String)( imgPath[j].replace("..",filePath[j+1])); System.out.println(imgPath[j]); } if(!imgPath[j].equals("")) imgRealPath += "/"+imgPath[j]; } imgRealPath=filePath[0]+imgRealPath; } imgRealPath = imgRealPath.replaceAll("\\\\","/"); imgnode.setImageURL(imgRealPath); imgRealPath=""; writer.write(imgnode.toHtml()); */ } //writer.flush(); // writer.close (); } catch (Exception e) { e.printStackTrace(); } return imgRealPath; } public static void main(String[] args) { HtmlAnalysis htmlAnalysis= new HtmlAnalysis(new File("f:\\test.html")); try{ htmlAnalysis.init(); // System.out.println(htmlAnalysis.getMetaInfo("keywords")); // htmlAnalysis.parser.reset(); // System.out.println(htmlAnalysis.getMetaInfo("author")); // htmlAnalysis.parser.reset(); // System.out.println(htmlAnalysis.getMetaInfo("description")); // htmlAnalysis.parser.reset(); // System.out.println(htmlAnalysis.getMetaInfo("other")); // htmlAnalysis.parser.reset(); //System.out.println(htmlAnalysis.getTitle()); //htmlAnalysis.parser.reset(); //System.out.println(htmlAnalysis.getHeadInfo()); htmlAnalysis.getTable(); // htmlAnalysis.testTable(); }catch(Exception e){ } } public static void visitTag(Tag tag) { if (tag.getAttribute("class") != null) { 
System.out.println(" " + tag.getTagName() + tag.getAttribute("class")); } } public String getCharset() { return charset; } public void setCharset(String charset) { this.charset = charset; } public String getContentType() { return contentType; } public void setContentType(String contentType) { this.contentType = contentType; } public String getMetaDataString() { return metaDataString; } public void setMetaDataString(String metaDataString) { this.metaDataString = metaDataString; } public void setTitle(String title) { this.title = title; } public String getContent() { return content; } public void setContent(String content) { this.content = content; } }