jsoupページキャプチャテスト

6615 ワード

package com.xy.xmweb.Controller;
/**
 *  
 */

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.xy.entity.INewsData;


public class JsoupFirstExtract {

	/**
	 * @param args
	 */
	public static void main(String[] args) {
		//parseHtml();		
		//parseBody();
		//parseUrl();
		System.out.println("=========================================");
		System.out.println("=========================================");
		System.out.println("=========================================");
		System.out.println("=========================================");
		//navigation();		
		//extractElement();
//		navigation();
		
		try {
			String httpCount = JsoupFirstExtract.clawer2("http://www1.xy.com/myoffice/news.do?optType=1&pageNo=1&pageSize=10");
			
			// jSoup   
	        // html doc   
			Document doc = Jsoup.parse(httpCount, "http://www1.xy.com/");
			Element body = doc.body();
			Element span = body.select("td").first();
			Elements links = span.getElementsByTag("a");
			for (Element element : links) {
				String linkAbsHref = element.absUrl("href");
				String linkText = element.text();
				System.out.println("linkAbsHref=:"+linkAbsHref);
				System.out.println(""+linkText+"");
			}
			
		} catch (Exception e) {
			e.printStackTrace();
		}
		
		int pageSize = 10;

		try {			
			//http://www1.xy.com/myoffice/news.do?optType=2&pageNo=78&pageSize=10
			Document doc = Jsoup.connect("http://www1.xy.com/myoffice/news.do?optType=2&pageNo=78&pageSize="+pageSize).timeout(10000).get(); 
			Elements as = doc.select("a[href]");
			System.out.println(as.size());
			if(pageSize > as.size()){
				pageSize = as.size();
			}
//			for (Element a : as) {
//				System.out.println(a.attr("href") + "###" + a.html()); 
//			}
			Elements tds = doc.select("td:not([title])");
//			for (Element td : tds) {
//				System.out.println(td.html()); 
//			}
			for(int i=0;i list = getIntfaceData("http://www1.xy.com/myoffice/news.do?optType=2&pageNo=78&pageSize=",10);
		if (list != null && list.size() > 0) {
			for (int i = 0; i < list.size(); i++) {
				INewsData newsData = list.get(i);
				System.out.println("=============newDate----getAhref-----:"+newsData.getAhref());
				System.out.println("=============newDate----getDatetime-----:"+newsData.getDatetime());
				System.out.println("=============newDate----getTitle-----:"+newsData.getTitle());
			}
		}
		
		}
	
	
	public static List getIntfaceData(String url, int pageSize) {
		
		List list = new ArrayList();
		try {
			//Document docconect = Jsoup.connect("http://www1.xy.com/myoffice/news.do?optType=2&pageNo=78&pageSize="+pageSize).timeout(10000).get();
			Document doc = Jsoup.connect(url+pageSize).timeout(10000).get();
//			Document doc = Jsoup.parse(docconect.toString(),"http://www1.xy.com/");
			Elements as = doc.select("a[href]");
			//System.out.println("====== ====="+as.size());
			if(pageSize > as.size()){
				pageSize = as.size();
			}
			Elements tds = doc.select("td:not([title])");
			for(int i=0;i

Parsed HTML into a doc."; Document doc = Jsoup.parse(html); System.out.println(doc); System.out.println("Print the html head --------------------"); System.out.println(doc.head()); System.out.println("Print the html body --------------------"); System.out.println(doc.body()); System.out.println("Print the html title --------------------"); System.out.println(doc.title()); } public static void parseBody() { String html = "
Lorem ipsum."; Document doc = Jsoup.parseBodyFragment(html); Element body = doc.body(); System.out.println("Print the body --------------------"); System.out.println(body); } public static void parseUrl() { try { Document doc = Jsoup.connect("http://www1.xy.com/myoffice/news.do?optType=1&pageNo=1&pageSize=10").get(); System.out.println("Print the Url --------------------"); System.out.println(doc); } catch (IOException e) {//TODO Auto-generated catch block e.printStackTrace(); } } public static void navigation() { String html="First parse"+ "
Parsed HTML into a doc.
"+ "hahaha"+ "bababa"+"
"; Document doc = Jsoup.parse(html, "http://192.168.3.84/gamestore/index.html"); Element content = doc.getElementById("content"); Elements links = content.getElementsByTag("a"); for (Element link : links) { String linkHref = link.attr("href"); String linkAbsHref = link.absUrl("href"); String linkText = link.text(); System.out.println(linkHref); System.out.println(linkAbsHref); System.out.println(linkText); } } public static void extractElement() { String html = "
An example link."; Document doc = Jsoup.parse(html); Element link = doc.select("a").first(); String text = doc.body().text();//"An example link"String linkHref = link.attr("href");//"http://example.com/"String linkText = link.text();//"example""String linkOuterH = link.outerHtml();//"
example"String linkInnerH = link.html();//"
example"System.out.println(text); System.out.println(linkHref); System.out.println(linkText); System.out.println(linkOuterH); System.out.println(linkInnerH); }/** * 一部のページにネストされたredirect接続がある場合、Server redirected too many timesのようなエラーが報告されます.*これは、このページの内部に他のページに転向するコードがあり、ループが多すぎるとプログラムエラーが発生するためです.このURLのページの内容だけをキャプチャしたい場合は、*を他のページにジャンプさせたくない場合は、以下のコードを使用します.*param myurl * @throws Exception */@SuppressWarnings("static-access") public static String clawer2(String myurl) throws Exception { URL urlmy = new URL(myurl); HttpURLConnection con = (HttpURLConnection) urlmy.openConnection(); con.setFollowRedirects(true); con.setInstanceFollowRedirects(false); con.connect(); BufferedReader br = new BufferedReader(new InputStreamReader(con.getInputStream(),"UTF-8")); String s = ""; StringBuffer sb = new StringBuffer(""); while ((s = br.readLine()) != null) { sb.append(s+"\r"); } return sb.toString(); } }
転載先:https://juejin.im/post/5aad0a7bf265da23793bedcc