jsoupページキャプチャテスト
6615 ワード
package com.xy.xmweb.Controller;
/**
*
*/
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.xy.entity.INewsData;
public class JsoupFirstExtract {
/**
* Manual smoke test for the jsoup-based page scraping in this class.
* <p>
* Prints separator lines, fetches the first news page through {@code clawer2},
* parses it with jsoup and prints the links found in the first {@code td} element,
* then requests a second listing page directly with {@code Jsoup.connect}, and finally
* prints the entries returned by {@code getIntfaceData}.
*
* @param args command-line arguments; not used
*/
public static void main(String[] args) {
//parseHtml();
//parseBody();
//parseUrl();
System.out.println("=========================================");
System.out.println("=========================================");
System.out.println("=========================================");
System.out.println("=========================================");
//navigation();
//extractElement();
// navigation();
try {
// Fetch the raw HTML of the first news page via clawer2.
String httpCount = JsoupFirstExtract.clawer2("http://www1.xy.com/myoffice/news.do?optType=1&pageNo=1&pageSize=10");
// Parse the fetched HTML with jsoup; the second argument is the base URI
// used below by absUrl("href") to resolve relative links.
Document doc = Jsoup.parse(httpCount, "http://www1.xy.com/");
Element body = doc.body();
// Only the first <td> of the page is inspected.
// NOTE(review): first() yields null when the page has no <td>; that case is not checked here.
Element span = body.select("td").first();
Elements links = span.getElementsByTag("a");
// Print the absolute URL and the visible text of every link in that cell.
for (Element element : links) {
String linkAbsHref = element.absUrl("href");
String linkText = element.text();
System.out.println("linkAbsHref=:"+linkAbsHref);
System.out.println(""+linkText+"");
}
} catch (Exception e) {
e.printStackTrace();
}
int pageSize = 10;
try {
//http://www1.xy.com/myoffice/news.do?optType=2&pageNo=78&pageSize=10
// Request the listing page directly through jsoup with a 10000 ms timeout.
Document doc = Jsoup.connect("http://www1.xy.com/myoffice/news.do?optType=2&pageNo=78&pageSize="+pageSize).timeout(10000).get();
Elements as = doc.select("a[href]");
System.out.println(as.size());
// Clamp pageSize to the number of links actually found on the page.
if(pageSize > as.size()){
pageSize = as.size();
}
// for (Element a : as) {
// System.out.println(a.attr("href") + "###" + a.html());
// }
// Select the <td> cells that carry no title attribute.
Elements tds = doc.select("td:not([title])");
// for (Element td : tds) {
// System.out.println(td.html());
// }
// NOTE(review): the next line is damaged in this copy of the file. The text between
// "i" and "list" (presumably the loop condition and body, the end of this try block
// and the declaration of "list") is missing, so the method does not compile as-is.
// TODO: restore from the original source.
for(int i=0;i list = getIntfaceData("http://www1.xy.com/myoffice/news.do?optType=2&pageNo=78&pageSize=",10);
// Print every entry returned by getIntfaceData.
if (list != null && list.size() > 0) {
for (int i = 0; i < list.size(); i++) {
INewsData newsData = list.get(i);
System.out.println("=============newDate----getAhref-----:"+newsData.getAhref());
System.out.println("=============newDate----getDatetime-----:"+newsData.getDatetime());
System.out.println("=============newDate----getTitle-----:"+newsData.getTitle());
}
}
}
public static List getIntfaceData(String url, int pageSize) {
List list = new ArrayList();
try {
//Document docconect = Jsoup.connect("http://www1.xy.com/myoffice/news.do?optType=2&pageNo=78&pageSize="+pageSize).timeout(10000).get();
Document doc = Jsoup.connect(url+pageSize).timeout(10000).get();
// Document doc = Jsoup.parse(docconect.toString(),"http://www1.xy.com/");
Elements as = doc.select("a[href]");
//System.out.println("====== ====="+as.size());
if(pageSize > as.size()){
pageSize = as.size();
}
Elements tds = doc.select("td:not([title])");
for(int i=0;i
Parsed HTML into a doc."; Document doc = Jsoup.parse(html); System.out.println(doc); System.out.println("Print the html head --------------------"); System.out.println(doc.head()); System.out.println("Print the html body --------------------"); System.out.println(doc.body()); System.out.println("Print the html title --------------------"); System.out.println(doc.title()); } public static void parseBody() { String html = "
Lorem ipsum."; Document doc = Jsoup.parseBodyFragment(html); Element body = doc.body(); System.out.println("Print the body --------------------"); System.out.println(body); } public static void parseUrl() { try { Document doc = Jsoup.connect("http://www1.xy.com/myoffice/news.do?optType=1&pageNo=1&pageSize=10").get(); System.out.println("Print the Url --------------------"); System.out.println(doc); } catch (IOException e) {//TODO Auto-generated catch block e.printStackTrace(); } } public static void navigation() { String html="First parse"+ "
Parsed HTML into a doc.
"+ "hahaha"+ "bababa"+"
"; Document doc = Jsoup.parse(html, "http://192.168.3.84/gamestore/index.html"); Element content = doc.getElementById("content"); Elements links = content.getElementsByTag("a"); for (Element link : links) { String linkHref = link.attr("href"); String linkAbsHref = link.absUrl("href"); String linkText = link.text(); System.out.println(linkHref); System.out.println(linkAbsHref); System.out.println(linkText); } } public static void extractElement() { String html = "
An example link."; Document doc = Jsoup.parse(html); Element link = doc.select("a").first(); String text = doc.body().text();//"An example link"String linkHref = link.attr("href");//"http://example.com/"String linkText = link.text();//"example""String linkOuterH = link.outerHtml();//"
example"String linkInnerH = link.html();//"
example"System.out.println(text); System.out.println(linkHref); System.out.println(linkText); System.out.println(linkOuterH); System.out.println(linkInnerH); }/** * 一部のページにネストされたredirect接続がある場合、Server redirected too many timesのようなエラーが報告されます.*これは、このページの内部に他のページに転向するコードがあり、ループが多すぎるとプログラムエラーが発生するためです.このURLのページの内容だけをキャプチャしたい場合は、*を他のページにジャンプさせたくない場合は、以下のコードを使用します.*param myurl * @throws Exception */@SuppressWarnings("static-access") public static String clawer2(String myurl) throws Exception { URL urlmy = new URL(myurl); HttpURLConnection con = (HttpURLConnection) urlmy.openConnection(); con.setFollowRedirects(true); con.setInstanceFollowRedirects(false); con.connect(); BufferedReader br = new BufferedReader(new InputStreamReader(con.getInputStream(),"UTF-8")); String s = ""; StringBuffer sb = new StringBuffer(""); while ((s = br.readLine()) != null) { sb.append(s+"\r"); } return sb.toString(); } }
転載元:https://juejin.im/post/5aad0a7bf265da23793bedcc