Javaクローラー、URL解析
1761 ワード
httpclientパッケージとjsoupパッケージを使用してURLを処理します。対象URL: https://news.ecnu.edu.cn/cf/4c/c1833a118604/page.psp 。c1833a118604〜c1833a118704 の範囲をクロールするため、まずURLを解析してページ番号を取得します。
次に、ページ情報をローカルにキャプチャします.
/**
 * Extracts the numeric article id from the default ECNU news URL.
 *
 * @return the number following the 'a' in the "cXXXXaNNNNNN" path segment
 *         (118604 for the hard-coded URL below)
 */
public static int subUrl() {
    return subUrl("https://news.ecnu.edu.cn/cf/4c/c1833a118604/page.psp");
}

/**
 * Generalized form: extracts the numeric article id from any URL whose
 * sixth slash-separated segment (index 5) looks like "cXXXXaNNNNNN".
 *
 * @param url a news URL of the form ".../cf/4c/c1833aNNNNNN/page.psp"
 * @return the digits after the 'a' separator, parsed as an int
 * @throws NumberFormatException if the segment does not end in digits
 */
public static int subUrl(String url) {
    String[] segments = url.split("/");
    String idSegment = segments[5]; // e.g. "c1833a118604"
    // Everything after the 'a' is the article/page number; the original
    // two-substring dance is equivalent to this single call.
    return Integer.parseInt(idSegment.substring(idSegment.indexOf('a') + 1));
}
次に、ページ情報をローカルにキャプチャします.
/**
 * Crawls 99 consecutive ECNU news pages, counting DOWN from the article id
 * returned by SubUrl.subUrl(), and saves each page's (jsoup-normalized) HTML
 * to sunbeam//result&lt;id&gt;.html encoded as UTF-8.
 */
public class HttpRequest {
    public static void main(String[] args) throws Exception {
        int page = SubUrl.subUrl();
        // Reuse ONE HttpClient for all requests. The original created a new
        // client per iteration and never closed it — a resource leak.
        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
            for (int i = 0; i < 99; i++) {
                String url = "https://news.ecnu.edu.cn/cf/4c/c1833a" + page + "/page.psp";
                String content;
                // try-with-resources closes the response even if reading throws.
                try (CloseableHttpResponse response = httpClient.execute(new HttpGet(url))) {
                    content = EntityUtils.toString(response.getEntity(), "utf-8");
                }
                // Parse and re-serialize through jsoup so the saved file is
                // normalized HTML (same output as the original's html() call).
                Document document = Jsoup.parse(content);
                String html = document.getElementsByTag("html").html();

                File file = new File("sunbeam//result" + page + ".html");
                File parent = file.getParentFile();
                if (parent != null && !parent.exists()) {
                    parent.mkdirs();
                }
                // FileOutputStream creates the file itself, so the original's
                // createNewFile() call was redundant. try-with-resources
                // guarantees the writer is closed even if write() throws.
                try (OutputStreamWriter osw =
                        new OutputStreamWriter(new FileOutputStream(file), "UTF-8")) {
                    osw.write(html);
                }
                // NOTE(review): the article text says the target range is
                // c1833a118604..c1833a118704, but this decrements — confirm
                // the intended direction with the author.
                page--;
            }
        }
    }
}