HttpClient GZip圧縮問題
4148 ワード
最近、会社の人手が足りず、臨時でWebクローラー(Webスクレイピング)関連の仕事をいくつか担当しました。いくつかのウェブサイトをクロールする際に、取得したページがgzip圧縮されているという問題に遭遇し、時間をかけて調べた結果、ようやく解決できました。後で見返せるように、ここに記録しておきます。
サンプルコード
説明:
実際には、HttpClientが提供するGzipDecompressingEntityクラスを使用して、gzip圧縮されたコンテンツを展開(解凍)します。
サンプルコード
package com.yulore.test;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.entity.GzipDecompressingEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.params.ConnRoutePNames;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.CoreConnectionPNames;
/**
 * Sample that fetches a page through an HTTP proxy with {@link DefaultHttpClient} and, when the
 * response is gzip-encoded, unwraps the body with {@link GzipDecompressingEntity} before reading
 * it as text.
 */
public class HttpClientTest02 {

    /** Socket read timeout, in milliseconds, set on the GET request. */
    private static final int SOCKET_TIMEOUT_MS = 1000 * 30;

    /** Charset used when the response does not declare a usable charset in Content-Type. */
    private static final Charset DEFAULT_CHARSET = Charset.forName("UTF-8");

    /** Captures the href of the anchor whose class is "nav_a shanghu"; compiled once. */
    private static final Pattern SHANGHU_LINK = Pattern.compile(
            "href=\"([\\S]*?)\"\\s*?class=\"nav_a\\s*?shanghu\"\\s*?target=\"_blank\"> ",
            Pattern.CASE_INSENSITIVE);

    /** Finds a {@code charset=...} parameter inside a Content-Type header value. */
    private static final Pattern CHARSET_PARAM = Pattern.compile(
            "charset\\s*=\\s*[\"']?([^\\s;\"']+)", Pattern.CASE_INSENSITIVE);

    public static void main(String[] args) {
        test();
    }

    /**
     * Downloads the sample page and prints the first link matched by {@link #SHANGHU_LINK} to
     * standard error. Does nothing beyond a diagnostic message when the download failed.
     */
    public static void test() {
        String url = "http://www.koubei.com/?spm=0.0.0.117.pR54PP&city=110100[0,1]";
        String content = httpGet(url);
        if (content == null) {
            // httpGet returns null on any failure or non-200 status; matching null would throw.
            System.err.println("no content fetched from " + url);
            return;
        }
        Matcher matcher = SHANGHU_LINK.matcher(content);
        if (matcher.find()) {
            String target = matcher.group(1);
            System.err.println("target=" + target);
        }
    }

    /**
     * Performs an HTTP GET through the configured proxy and returns the response body as text.
     *
     * <p>On a 301/302 response the Location header is only printed; the redirect is not followed
     * by this method. On a 200 response the body is read (gzip-decoded when the server says so)
     * and also echoed to standard output.
     *
     * @param url the absolute URL to fetch
     * @return the body text for a 200 response, or {@code null} for any other status or when an
     *         exception occurred (the exception is printed, not rethrown)
     */
    public static String httpGet(String url) {
        String ip = "xxxxx"; // placeholder proxy host; replace with a real address
        String content = null;
        DefaultHttpClient httpclient = null;
        try {
            httpclient = new DefaultHttpClient();

            // Route every request through the proxy.
            HttpHost proxy = new HttpHost(ip, 8080);
            httpclient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy);

            HttpGet httpget = new HttpGet(url);
            httpget.getParams().setParameter(CoreConnectionPNames.SO_TIMEOUT, SOCKET_TIMEOUT_MS);
            httpget.setHeader("User-Agent","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.79 Safari/537.1");
            httpget.setHeader("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
            // NOTE(review): this advertises deflate and sdch as well, but only gzip is decoded
            // below; a response in another encoding is reported and read undecoded.
            httpget.setHeader("Accept-Encoding", "gzip,deflate,sdch");

            HttpResponse resp = httpclient.execute(httpget);
            int statusCode = resp.getStatusLine().getStatusCode();
            if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY
                    || statusCode == HttpStatus.SC_MOVED_TEMPORARILY) {
                System.out.println(" ,,,");
                Header[] locationHeader = resp.getHeaders("Location");
                if (locationHeader != null && locationHeader.length > 0) {
                    String redirectUrl = locationHeader[0].getValue();
                    System.out.println("redirectUrl:" + redirectUrl);
                }
            } else if (statusCode == HttpStatus.SC_OK) {
                HttpEntity entity = resp.getEntity();
                if (entity == null) {
                    System.err.println("200 response without a body: " + url);
                } else {
                    Charset charset = resolveCharset(entity);
                    InputStream in = openDecodedStream(entity);
                    content = getHTMLContent(in, charset);
                    System.out.println("content:" + content);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Guard against a failure before the client was assigned.
            if (httpclient != null) {
                httpclient.getConnectionManager().shutdown();
            }
        }
        return content;
    }

    /**
     * Opens the entity's content stream, wrapping it for gzip decompression when the
     * Content-Encoding header is {@code gzip} or {@code x-gzip} (case-insensitive).
     */
    private static InputStream openDecodedStream(HttpEntity entity) throws IOException {
        Header encodingHeader = entity.getContentEncoding();
        String encoding = null;
        if (encodingHeader != null && encodingHeader.getValue() != null) {
            encoding = encodingHeader.getValue().trim();
        }
        if ("gzip".equalsIgnoreCase(encoding) || "x-gzip".equalsIgnoreCase(encoding)) {
            System.err.println("gzip");
            return new GzipDecompressingEntity(entity).getContent();
        }
        if (encoding != null && encoding.length() > 0 && !"identity".equalsIgnoreCase(encoding)) {
            System.err.println("unsupported Content-Encoding '" + encoding
                    + "'; body is read without decoding");
        }
        return entity.getContent();
    }

    /**
     * Returns the charset named in the entity's Content-Type header, or {@link #DEFAULT_CHARSET}
     * when the header is absent, has no charset parameter, or names an unknown charset.
     */
    private static Charset resolveCharset(HttpEntity entity) {
        Header contentType = entity.getContentType();
        if (contentType == null || contentType.getValue() == null) {
            return DEFAULT_CHARSET;
        }
        Matcher m = CHARSET_PARAM.matcher(contentType.getValue());
        if (!m.find()) {
            return DEFAULT_CHARSET;
        }
        try {
            return Charset.forName(m.group(1));
        } catch (IllegalArgumentException e) {
            // Covers both illegal and unsupported charset names.
            return DEFAULT_CHARSET;
        }
    }

    /**
     * Reads the whole stream as text in the given charset and closes it.
     *
     * <p>Lines are concatenated without their terminators. If an I/O error occurs mid-read, the
     * error is printed and whatever was read so far is returned.
     */
    private static String getHTMLContent(InputStream in, Charset charset) {
        StringBuilder sb = new StringBuilder();
        BufferedReader br = new BufferedReader(new InputStreamReader(in, charset));
        try {
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                br.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return sb.toString();
    }
}
説明:
実際には、HttpClientが提供するGzipDecompressingEntityクラスを使用して、gzip圧縮されたコンテンツを展開(解凍)します。