HtmlParserでHTMLファイルを解析する
htmlparserを初めて使ってから4か月が経ちました。忘れないように、ここで整理しておきます。
package epson;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.BodyTag;
import org.htmlparser.tags.HeadTag;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.MetaTag;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.tags.TableRow;
import org.htmlparser.tags.TableTag;
import org.htmlparser.tags.TitleTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.SimpleNodeIterator;
import org.htmlparser.visitors.TextExtractingVisitor;
public class HtmlAnalysis {
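/*
 * Bean-style fields. Only content is written by a parsing method
 * (getURLContent); metaDataString, title, charset and contentType change only
 * through their setters, and link / localPath are not referenced anywhere
 * else in this class.
 */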
private String metaDataString;
private String title;
private String charset;
private String contentType;
private String content;
private String link;
private String localPath ;
private Parser parser = null;
private String htmlsource=null;
public static final String META_KEYWORDS="keywords";
public static final String META_AUTHOR="author";
public static final String META_DESCRIPTION="description";
public static final String META_HTTP_EQUIV="http-equiv";
/**
 * Creates an analyzer for the given source string.
 *
 * <p>The value is only stored here; it is handed to {@code new Parser(String)}
 * when {@link #init()} is called.
 *
 * @param htmlsource the resource that {@link #init()} will pass to the parser
 */
public HtmlAnalysis(String htmlsource) {
    this.htmlsource = htmlsource;
}
/**
 * Creates an analyzer for the contents of a local file.
 *
 * <p>The file is read immediately with {@link #getContentByLocalFile(File)}.
 * This constructor declares no checked exception, so a read failure cannot be
 * propagated to the caller; it is reported on standard error instead and the
 * source is left {@code null}.
 *
 * @param htmlsource the file whose text becomes the parser input
 */
public HtmlAnalysis(File htmlsource) {
    try {
        this.htmlsource = getContentByLocalFile(htmlsource);
    } catch (Exception e) {
        // Broad catch kept on purpose: this constructor has never thrown, and
        // callers may rely on that. The failure is no longer silent, though.
        this.htmlsource = null;
        System.err.println("HtmlAnalysis: could not read " + htmlsource);
        e.printStackTrace();
    }
}
/**
 * Builds the parser over the source supplied at construction time.
 *
 * <p>The {@code parser} field is {@code null} until this method has run, and
 * every extraction method of this class dereferences it, so call this first.
 *
 * @throws Exception whatever the {@code Parser} constructor throws; it is
 *         passed through unchanged
 */
public void init() throws Exception {
    this.parser = new Parser(this.htmlsource);
}
/**
 * Returns the {@code content} attribute of the first {@code <meta>} tag whose
 * {@code name} attribute is "keywords" (compared ignoring case).
 *
 * <p>{@link #init()} must have been called first.
 *
 * @return the keywords text, or an empty string when no such tag exists, the
 *         tag has no {@code content} attribute, or parsing fails (the parser
 *         error is printed to standard error)
 */
public String getMetaKeywords() {
    String metaKeywords = "";
    try {
        // A parse pass consumes the parser's input; rewind so this method also
        // works after another extraction method has already been called.
        parser.reset();
        NodeList metaTags = parser.parse(new NodeClassFilter(MetaTag.class));
        for (int i = 0; i < metaTags.size(); i++) {
            MetaTag meta = (MetaTag) metaTags.elementAt(i);
            if (META_KEYWORDS.equalsIgnoreCase(meta.getAttribute("name"))) {
                String value = meta.getAttribute("content");
                // Keep the "empty string means nothing found" contract even
                // when the tag lacks a content attribute.
                metaKeywords = (value == null) ? "" : value;
                break;
            }
        }
    } catch (ParserException e) {
        e.printStackTrace();
    }
    return metaKeywords;
}
/**
 * Returns the text of the first {@code <title>} tag of the parsed document.
 *
 * <p>This parses the document on every call; it does not read the
 * {@code title} field written by {@code setTitle}. {@link #init()} must have
 * been called first.
 *
 * @return the title text, or an empty string when the document has no title
 *         tag or parsing fails (the parser error is printed to standard error)
 */
public String getTitle() {
    String pageTitle = "";
    try {
        // A parse pass consumes the parser's input; rewind so this method also
        // works after another extraction method has already been called.
        parser.reset();
        NodeList titleTags = parser.parse(new NodeClassFilter(TitleTag.class));
        if (titleTags.size() > 0) {
            pageTitle = ((TitleTag) titleTags.elementAt(0)).getTitle();
        }
    } catch (ParserException e) {
        e.printStackTrace();
    }
    return pageTitle;
}
/**
 * Returns the inner HTML of the first {@code <body>} tag, i.e. the markup of
 * its children without the body tag itself.
 *
 * <p>{@link #init()} must have been called first.
 *
 * @return the body's children as HTML, or an empty string when there is no
 *         body tag or parsing fails (the parser error is printed to standard
 *         error)
 */
public String getBody() {
    String body = "";
    try {
        // A parse pass consumes the parser's input; rewind so this method also
        // works after another extraction method has already been called.
        parser.reset();
        NodeList bodyTags = parser.parse(new NodeClassFilter(BodyTag.class));
        if (bodyTags.size() > 0) {
            body = ((BodyTag) bodyTags.elementAt(0)).getChildrenHTML();
        }
    } catch (ParserException e) {
        e.printStackTrace();
    }
    return body;
}
/**
 * Returns the {@code onload} attribute of the first {@code <body>} tag.
 *
 * <p>{@link #init()} must have been called first.
 *
 * @return the onload handler text, or an empty string when there is no body
 *         tag, the tag has no {@code onload} attribute, or parsing fails (the
 *         parser error is printed to standard error)
 */
public String getBodyOnload() {
    String bodyonload = "";
    try {
        // A parse pass consumes the parser's input; rewind so this method also
        // works after another extraction method has already been called.
        parser.reset();
        NodeList bodyTags = parser.parse(new NodeClassFilter(BodyTag.class));
        if (bodyTags.size() > 0) {
            String handler = ((BodyTag) bodyTags.elementAt(0)).getAttribute("onload");
            // A missing attribute is reported like a missing body: empty, not null.
            bodyonload = (handler == null) ? "" : handler;
        }
    } catch (ParserException e) {
        e.printStackTrace();
    }
    return bodyonload;
}
/**
 * Returns the HTML of every child of the first {@code <head>} tag except its
 * {@code <meta>} and {@code <title>} children (for example scripts, styles and
 * links).
 *
 * <p>The parsed tree is not modified: unwanted children are skipped while the
 * output is assembled rather than removed from the head node, so no index can
 * shift under the iteration. {@link #init()} must have been called first.
 *
 * @return the remaining head markup, or an empty string when the document has
 *         no head tag or parsing fails (the parser error is printed to
 *         standard error)
 */
public String getHeadInfo() {
    StringBuffer head = new StringBuffer();
    try {
        // A parse pass consumes the parser's input; rewind so this method also
        // works after another extraction method has already been called.
        parser.reset();
        NodeList headTags = parser.parse(new NodeClassFilter(HeadTag.class));
        if (headTags.size() > 0) {
            HeadTag headnode = (HeadTag) headTags.elementAt(0);
            for (SimpleNodeIterator it = headnode.children(); it.hasMoreNodes();) {
                Node child = it.nextNode();
                boolean excluded = (child instanceof MetaTag) || (child instanceof TitleTag);
                if (!excluded) {
                    head.append(child.toHtml());
                }
            }
        }
    } catch (ParserException e) {
        e.printStackTrace();
    }
    return head.toString();
}
/**
 * Looks up {@code <meta>} information in the parsed document.
 *
 * <p>The result depends on {@code keytype} (compared ignoring case):
 * <ul>
 *   <li>"keywords", "author" or "description": the {@code content} of the first
 *       meta tag whose {@code name} attribute equals {@code keytype};</li>
 *   <li>"http-equiv": the {@code content} of the first meta tag that carries an
 *       {@code http-equiv} attribute;</li>
 *   <li>anything else, including {@code null}: every other named meta tag,
 *       rendered as {@code <name>content</name>} and concatenated in document
 *       order. The name is used verbatim as the element name and nothing is
 *       escaped.</li>
 * </ul>
 *
 * <p>{@link #init()} must have been called first.
 *
 * @param keytype one of the {@code META_*} constants, or any other value to
 *        collect the remaining named meta tags
 * @return the requested text, or an empty string when nothing matches or
 *         parsing fails (the parser error is printed to standard error)
 */
public String getMetaInfo(String keytype) {
    String metaInfo = "";
    try {
        // A parse pass consumes the parser's input; rewind so this method also
        // works after another extraction method has already been called.
        parser.reset();
        NodeList metaTags = parser.parse(new NodeClassFilter(MetaTag.class));
        if (isStandardMetaName(keytype)) {
            for (int i = 0; i < metaTags.size(); i++) {
                MetaTag meta = (MetaTag) metaTags.elementAt(i);
                if (keytype.equalsIgnoreCase(meta.getAttribute("name"))) {
                    String value = meta.getAttribute("content");
                    metaInfo = (value == null) ? "" : value;
                    break;
                }
            }
        } else if (META_HTTP_EQUIV.equalsIgnoreCase(keytype)) {
            for (int i = 0; i < metaTags.size(); i++) {
                MetaTag meta = (MetaTag) metaTags.elementAt(i);
                // Match on the presence of the attribute. Comparing its value
                // (e.g. "Content-Type") with the literal "http-equiv" can
                // practically never succeed.
                if (meta.getAttribute("http-equiv") != null) {
                    String value = meta.getAttribute("content");
                    metaInfo = (value == null) ? "" : value;
                    break;
                }
            }
        } else {
            StringBuffer others = new StringBuffer();
            for (int i = 0; i < metaTags.size(); i++) {
                MetaTag meta = (MetaTag) metaTags.elementAt(i);
                String name = meta.getAttribute("name");
                if (name == null || isStandardMetaName(name)) {
                    continue;
                }
                String value = meta.getAttribute("content");
                others.append("<").append(name).append(">");
                // A missing content attribute yields an empty element, not the text "null".
                others.append(value == null ? "" : value);
                others.append("</").append(name).append(">");
            }
            metaInfo = others.toString();
        }
    } catch (ParserException e) {
        e.printStackTrace();
    }
    return metaInfo;
}

/** Tells whether the name is keywords, author or description, ignoring case. */
private static boolean isStandardMetaName(String name) {
    return META_KEYWORDS.equalsIgnoreCase(name)
            || META_AUTHOR.equalsIgnoreCase(name)
            || META_DESCRIPTION.equalsIgnoreCase(name);
}
public String getContentByLocalFile (File path) throws IOException {
StringBuffer sbStr = new StringBuffer();
BufferedReader reader = null ;
String result = null ;
try {
reader = new BufferedReader(new FileReader(path));
} catch (FileNotFoundException e) {
e.printStackTrace();
}
String temp = "";
while((temp=reader.readLine())!=null)
{
sbStr.append(temp);
sbStr.append("\r
");
}
reader.close();
result = sbStr.toString();
return result ;
}
/**
 * Placeholder for fetching a document by URL; not implemented.
 *
 * @param url ignored
 * @return always {@code null}
 */
public String getContentByUrl(String url) {
    return null;
}

/**
 * Placeholder for visitor-based metadata extraction; currently does nothing.
 */
public void getmetaDataByVistor() {
}
/**
 * Extracts the plain text of the resource at the given location and stores it
 * in the {@code content} field.
 *
 * <p>A separate, local parser is used, so the parser created by
 * {@link #init()} is left untouched.
 *
 * @param Url the location handed to {@code new Parser(String)}
 * @return the extracted text; if parsing fails the error is printed to
 *         standard error and the previous value of the {@code content} field
 *         (possibly {@code null}) is returned
 */
public String getURLContent(String Url) {
    try {
        // Parse the requested resource itself. The parser must not be replaced
        // by one over an empty string, or the URL is never visited.
        Parser urlParser = new Parser(Url);
        TextExtractingVisitor visitor = new TextExtractingVisitor();
        urlParser.visitAllNodesWith(visitor);
        content = visitor.getExtractedText();
    } catch (ParserException e1) {
        e1.printStackTrace();
    }
    return content;
}
/**
 * Collects every {@code <div>} element whose {@code id} attribute is
 * {@code "Cont_13"}.
 *
 * <p>The parser encoding is set to GB2312 before extraction.
 * {@link #init()} must have been called first.
 *
 * @return the matching nodes; an empty list (never {@code null}) when nothing
 *         matches or when extraction fails, in which case the error is printed
 *         to standard error
 */
public NodeList getDiv() {
    NodeList matches = new NodeList();
    try {
        // An extraction pass consumes the parser's input; rewind so this method
        // also works after another extraction method has already been called.
        parser.reset();
        parser.setEncoding("GB2312");
        NodeFilter[] predicates = new NodeFilter[] {
            new TagNameFilter("div"),                 // element must be a div ...
            new HasAttributeFilter("id", "Cont_13")   // ... with id="Cont_13"
        };
        // AndFilter accepts a node only when both predicates accept it.
        matches = parser.extractAllNodesThatMatch(new AndFilter(predicates));
    } catch (Exception e) {
        // Best effort, as before: report the problem and return no matches.
        e.printStackTrace();
    }
    return matches;
}
/**
 * Extracts the tables found inside the div selected by {@link #getDiv()} and
 * prints a flat summary of their cells to standard output.
 *
 * <p>For every row, the trimmed, tab-free text of the first cell is emitted as
 * {@code <title>text</title>} and that of each further cell as
 * {@code <id>text</id>}; everything is concatenated into one printed line.
 *
 * @return the {@code table} nodes found inside the selected div; empty when the
 *         div is missing or contains no table
 * @throws ParserException if the div's HTML cannot be re-parsed
 */
public NodeList getTable() throws ParserException {
    NodeList tables = new NodeList();
    NodeList divs = getDiv();
    // Without a matching div there is nothing to re-parse; building a parser
    // over null or empty input would fail rather than yield "no tables".
    if (divs != null && divs.size() > 0) {
        Parser divParser = new Parser(divs.toHtml());
        tables = divParser.extractAllNodesThatMatch(new TagNameFilter("table"));
    }
    StringBuffer htmlresult = new StringBuffer();
    // Strictly less than size(): index size() is past the last element.
    for (int i = 0; i < tables.size(); i++) {
        Node candidate = tables.elementAt(i);
        if (!(candidate instanceof TableTag)) {
            continue;
        }
        TableRow[] rows = ((TableTag) candidate).getRows();
        for (int j = 0; j < rows.length; j++) {
            TableColumn[] cells = rows[j].getColumns();
            for (int k = 0; k < cells.length; k++) {
                String text = cells[k].toPlainTextString().trim().replace("\t", "");
                String element = (k == 0) ? "title" : "id";
                htmlresult.append("<").append(element).append(">")
                        .append(text)
                        .append("</").append(element).append(">");
            }
        }
    }
    System.out.println(htmlresult.toString());
    return tables;
}
/**
 * Debug helper: prints the text of every cell of every table in the parsed
 * document to standard output, one cell per line, each prefixed with
 * {@code <td>}.
 *
 * <p>{@link #init()} must have been called first. A parser error is printed to
 * standard error.
 */
public void testTable() {
    try {
        // A parse pass consumes the parser's input; rewind so this method also
        // works after another extraction method has already been called.
        parser.reset();
        NodeList tables = parser.parse(new NodeClassFilter(TableTag.class));
        // Strictly less than size(): index size() is past the last element.
        for (int i = 0; i < tables.size(); i++) {
            Node candidate = tables.elementAt(i);
            if (!(candidate instanceof TableTag)) {
                continue;
            }
            TableRow[] rows = ((TableTag) candidate).getRows();
            for (int j = 0; j < rows.length; j++) {
                TableColumn[] cells = rows[j].getColumns();
                for (int k = 0; k < cells.length; k++) {
                    System.out.println("<td>" + cells[k].toPlainTextString());
                }
            }
        }
    } catch (ParserException e) {
        e.printStackTrace();
    }
}
/**
 * Prints the URL of every {@code <img>} tag in the parsed document to standard
 * output, one per line.
 *
 * <p>Resolving image URLs to real paths is not implemented, so the returned
 * path is always empty. The method touches no files.
 * {@link #init()} must have been called first.
 *
 * @return always an empty string
 */
public String getImg() {
    String imgRealPath = "";
    try {
        // A parse pass consumes the parser's input; rewind so this method also
        // works after another extraction method has already been called.
        parser.reset();
        NodeList images = parser.parse(new NodeClassFilter(ImageTag.class));
        for (int i = 0; i < images.size(); i++) {
            ImageTag imgnode = (ImageTag) images.elementAt(i);
            System.out.println(imgnode.getImageURL());
        }
    } catch (Exception e) {
        // Best effort, as before: report the problem and fall through.
        e.printStackTrace();
    }
    return imgRealPath;
}
/**
 * Command-line smoke test: loads an HTML file, initializes the parser and runs
 * {@link #getTable()}, which prints its summary to standard output.
 *
 * @param args optional; {@code args[0]} is the HTML file to analyze. When no
 *        argument is given, {@code f:\test.html} is used.
 */
public static void main(String[] args) {
    String path = (args != null && args.length > 0) ? args[0] : "f:\\test.html";
    HtmlAnalysis htmlAnalysis = new HtmlAnalysis(new File(path));
    try {
        htmlAnalysis.init();
        htmlAnalysis.getTable();
    } catch (Exception e) {
        // Report the failure instead of exiting silently with no output.
        e.printStackTrace();
    }
}
/**
 * Prints a tag's name followed directly by its {@code class} attribute, after
 * a leading space, to standard output. Tags without a {@code class} attribute
 * print nothing.
 *
 * @param tag the tag to inspect
 */
public static void visitTag(Tag tag) {
    String cssClass = tag.getAttribute("class");
    if (cssClass == null) {
        return;
    }
    System.out.println(" " + tag.getTagName() + cssClass);
}
/** @return the value of the {@code charset} field; {@code null} until set */
public String getCharset() {
    return this.charset;
}

/** @param charset the value to store in the {@code charset} field */
public void setCharset(String charset) {
    this.charset = charset;
}

/** @return the value of the {@code contentType} field; {@code null} until set */
public String getContentType() {
    return this.contentType;
}

/** @param contentType the value to store in the {@code contentType} field */
public void setContentType(String contentType) {
    this.contentType = contentType;
}

/** @return the value of the {@code metaDataString} field; {@code null} until set */
public String getMetaDataString() {
    return this.metaDataString;
}

/** @param metaDataString the value to store in the {@code metaDataString} field */
public void setMetaDataString(String metaDataString) {
    this.metaDataString = metaDataString;
}

/**
 * Stores a value in the {@code title} field.
 *
 * <p>Note that {@code getTitle()} does not return this field; it extracts the
 * title from the parsed document.
 *
 * @param title the value to store
 */
public void setTitle(String title) {
    this.title = title;
}

/**
 * @return the value of the {@code content} field, which is written by
 *         {@code setContent} and by {@code getURLContent}
 */
public String getContent() {
    return this.content;
}

/** @param content the value to store in the {@code content} field */
public void setContent(String content) {
    this.content = content;
}
}