HttpClient と HTML Parser の簡単な使い方

3891 ワード

新華信託のニュース一覧ページを解析する。
package com.zte.util;

import java.util.ArrayList;
import java.util.List;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.methods.GetMethod;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.visitors.NodeVisitor;

import com.zte.entity.NewInfo;

public class ParseHtml {
   
	private static List<NewInfo> newInfos = new ArrayList<NewInfo>();
    private static int pageNumber = 1;
    private static boolean isFirstPage = true;
    
    public static List<NewInfo> getNewInfos(String url) throws Exception{
    	 HttpClient client = new HttpClient();   
	     client.getHostConfiguration().setProxy("10.130.40.13",8026);
	     HttpMethod method = new GetMethod(url);
	     client.executeMethod(method);   
         parseNew(method.getResponseBodyAsString());
         for(int page =2;page<=pageNumber ; page++) {
        	 method = new GetMethod( url + "&pagenum="+page);
    	     client.executeMethod(method);   
             parseNew(method.getResponseBodyAsString());
         }
         
	     method.releaseConnection();   
	     
	     return newInfos;
    }
    
	public static void parseNew(String content) {
		try {
			Parser parser = new Parser(content);
			NodeVisitor visitor = new NodeVisitorImpl();
			parser.visitAllNodesWith(visitor);

		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	private static class NodeVisitorImpl extends NodeVisitor {
		boolean tdTag = false;
		public void visitTag(Tag tag) {
			String href = tag.getAttribute("href");
			if (tag.getTagName().equalsIgnoreCase("TD")) {
				tdTag = true;
			} else if (tdTag && tag.getTagName().equalsIgnoreCase("A") && !href.contains("javascript")) {
				String title = tag.getFirstChild().toHtml().trim();
				String newTitle1 = title.replaceAll("\\s{1,}", " ");
				String newTitle = newTitle1.replace("&#8226;", ".");
				NewInfo newInfo = new NewInfo();
				newInfo.setHref(href);
				newInfo.setTitle(newTitle);
				newInfos.add(newInfo);
				tdTag = false;
			} 
			
			if(isFirstPage && tag.getTagName().equalsIgnoreCase("select")) {
				pageNumber = new Integer(tag.getLastChild().getFirstChild().toHtml().trim());
				isFirstPage = false;
			}
			
		}
	}

}
情報エンティティ
package com.zte.entity;

/**
 * Simple mutable holder for one link: its target address and its display title.
 * Both values are {@code null} until set.
 */
public class NewInfo {

	/** Link target as set through {@link #setHref(String)}. */
	private String href;

	/** Link title as set through {@link #setTitle(String)}. */
	private String title;

	/**
	 * Stores the link target.
	 *
	 * @param href value to store; kept as given
	 */
	public void setHref(final String href) {
		this.href = href;
	}

	/**
	 * @return the stored link target, or {@code null} if none was set
	 */
	public String getHref() {
		return this.href;
	}

	/**
	 * Stores the link title.
	 *
	 * @param title value to store; kept as given
	 */
	public void setTitle(final String title) {
		this.title = title;
	}

	/**
	 * @return the stored link title, or {@code null} if none was set
	 */
	public String getTitle() {
		return this.title;
	}

}
クライアントの呼び出し
/**
 * Command-line driver: fetches the list at a fixed URL through
 * {@code ParseHtml.getNewInfos} and prints each entry's link and title,
 * followed by the number of entries printed.
 */
public class ParseHtmlTest {

	/**
	 * @param args unused
	 * @throws Exception whatever {@code ParseHtml.getNewInfos} throws
	 */
	public static void main(String[] args) throws Exception {
		final String listUrl = "http://www.nct-china.com/NewsList.aspx?lmid=55";
		final List<NewInfo> entries = ParseHtml.getNewInfos(listUrl);

		// The index doubles as the running count of printed entries.
		int printed = 0;
		while (printed < entries.size()) {
			NewInfo entry = entries.get(printed);
			System.out.println("    :" + entry.getHref());
			System.out.println("    :" + entry.getTitle());
			printed++;
		}

		System.out.println(" " + printed + "   ");
	}
}