jsoupで小説をクロールする

2540 ワード

退屈な月曜日、姉にネット小説を1冊探してほしいと頼まれました。ダウンロードできるものが見つからなかったので、クローラーを書いて本文をtxtファイルに書き出すことにしました。とても簡単です。

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.Calendar;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;


/**
 * 
 * @author memode
 *
 */
public class Test_1 {
public static void atiricle(){
Document newsdoc;
long t1 = System.currentTimeMillis(); //            
String link = "http://www.tywx.com/ty109892/";
int num = 5846664; //     
int chapter = 0;   //    
String tmpLink = "http://www.tywx.com/ty109892/"+num+".html";
File file = new File("c:\\test.txt");  //      
try {
PrintStream ps = new PrintStream(new FileOutputStream(file));
while (true) {
//  15s     
if("404".equals(Jsoup.connect(tmpLink).timeout(15000).execute().statusCode())){
break;
}
newsdoc = Jsoup.connect(tmpLink)
.userAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.21 (KHTML, like Gecko) Chrome/19.0.1042.0 Safari/535.21")
.timeout(5000).get();  //      5s
//  
if(null==newsdoc.getElementsByAttributeValue("class","kfyd").first()){
break;
}
//  
String title = newsdoc.getElementsByAttributeValue("class","kfyd").first().select("h1").text();
System.out.println(title);
//        
tmpLink = link+ newsdoc.getElementsByAttributeValue("id","thumb").first().select("#pager_next").attr("href");
System.out.println(tmpLink);
//       
String news_tmp = newsdoc.getElementsByAttributeValue("id", "content").select("div").remove()
.html()
.replaceAll("<.*?script[^>]*?>[\\s\\S]*?<\\/.*?script.*?>*", " ")     //  script  
.replaceAll("(?i)<br[^>]*>
<br>", "
").replaceAll("&nbsp;", " ");   //         
chapter++;
//       
ps.append(title+"

");  
ps.append(news_tmp+"
");
}
} catch (IOException e) {
System.out.println("       net error!");
}
System.out.println("   "+chapter+" ");
long t2 = System.currentTimeMillis(); //            
Calendar c = Calendar.getInstance();
c.setTimeInMillis(t2 - t1);  
        System.out.println("  : " + c.get(Calendar.MINUTE) + "  "  
                + c.get(Calendar.SECOND) + "  " + c.get(Calendar.MILLISECOND)  
                + "   ");  
}
public static void main(String[] args) {
new Test_1().atiricle();
}
}

Leetcode 252 Meeting Rooms

LeetCode:Longest Increasing Path in a Matrix