jsoupで小説をクロールする
2540 ワード
退屈な月曜日、姉からネット上の小説を1冊探してほしいと頼まれました。ダウンロードできるものが見当たらなかったので、クローラーを書いてtxtファイルに書き出すことにしました。とても簡単です。
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.Calendar;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
/**
*
* @author memode
*
*/
public class Test_1 {
public static void atiricle(){
Document newsdoc;
long t1 = System.currentTimeMillis(); //
String link = "http://www.tywx.com/ty109892/";
int num = 5846664; //
int chapter = 0; //
String tmpLink = "http://www.tywx.com/ty109892/"+num+".html";
File file = new File("c:\\test.txt"); //
try {
PrintStream ps = new PrintStream(new FileOutputStream(file));
while (true) {
// 15s
if("404".equals(Jsoup.connect(tmpLink).timeout(15000).execute().statusCode())){
break;
}
newsdoc = Jsoup.connect(tmpLink)
.userAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.21 (KHTML, like Gecko) Chrome/19.0.1042.0 Safari/535.21")
.timeout(5000).get(); // 5s
//
if(null==newsdoc.getElementsByAttributeValue("class","kfyd").first()){
break;
}
//
String title = newsdoc.getElementsByAttributeValue("class","kfyd").first().select("h1").text();
System.out.println(title);
//
tmpLink = link+ newsdoc.getElementsByAttributeValue("id","thumb").first().select("#pager_next").attr("href");
System.out.println(tmpLink);
//
String news_tmp = newsdoc.getElementsByAttributeValue("id", "content").select("div").remove()
.html()
.replaceAll("<.*?script[^>]*?>[\\s\\S]*?<\\/.*?script.*?>*", " ") // script
.replaceAll("(?i)<br[^>]*>
<br>", "
").replaceAll(" ", " "); //
chapter++;
//
ps.append(title+"
");
ps.append(news_tmp+"
");
}
} catch (IOException e) {
System.out.println(" net error!");
}
System.out.println(" "+chapter+" ");
long t2 = System.currentTimeMillis(); //
Calendar c = Calendar.getInstance();
c.setTimeInMillis(t2 - t1);
System.out.println(" : " + c.get(Calendar.MINUTE) + " "
+ c.get(Calendar.SECOND) + " " + c.get(Calendar.MILLISECOND)
+ " ");
}
public static void main(String[] args) {
new Test_1().atiricle();
}
}