Java 画像データクローリング demo
3938 ワード
package com.xcx.spots.test;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover; import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.selector.Selectable; public class Danli implements PageProcessor {
} package com.xcx.spots.test;
/** @ProjectName: spots @Package: com.xcx.spots.test @ClassName: Demo @Author: nh @Description: @Date: 2020/7/2 13:48 @Version: 1.0 */import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.pipeline.Pipeline; public class Demo implements Pipeline { @Override public void process(ResultItems resultItems, Task task) {
/** @ProjectName: spots @Package: com.xcx.spots.test @ClassName: UrlFileDownloadUtil @Author: nh @Description: @Date: 2020/7/2 13:48 @Version: 1.0 */ import java.io.*; import java.net.URL; import java.util.UUID;
public class UrlFileDownloadUtil {
}
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover; import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.selector.Selectable; public class Danli implements PageProcessor {
// Crawl settings handed out by getSite(): retry count 3, sleep time 1000 and timeout 10000
// (presumably milliseconds — confirm against the WebMagic Site documentation).
private Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000);
/**
 * Handles one downloaded page.
 *
 * <ul>
 *   <li>The gallery index page ({@code http://www.mmonly.cc/ktmh/dmmn/}) only contributes
 *       follow-up requests: every album link plus the link selected from the pager.</li>
 *   <li>A picture page (URL matching {@code .../ktmh/dmmn/<digits>}) queues the link found
 *       under {@code #nl > a} and stores the picture address in the {@code "img"} field.</li>
 * </ul>
 *
 * Pages that produce no {@code "img"} field are marked as skipped so that the pipeline, which
 * reads {@code resultItems.get("img")}, is not invoked with a missing value.
 *
 * @param page the page handed over by the WebMagic downloader
 */
@Override
public void process(Page page) {
    String url = page.getUrl().toString();

    if ("http://www.mmonly.cc/ktmh/dmmn/".equals(url)) {
        // Album links shown on the index page.
        page.addTargetRequests(page.getHtml().$("div.item_t > div > div.ABox > a").links().all());
        // Second-to-last anchor of the pager.
        page.addTargetRequest(page.getHtml().$("#pageNum > a:nth-last-child(2)").links().toString());
        // Nothing to download from the index page itself.
        page.setSkip(true);
        return;
    }

    // regex(...) yields a Selectable, so a null comparison cannot tell whether the URL
    // matched; match() reports whether the pattern was actually found.
    if (!page.getUrl().regex("http://www.mmonly.cc/ktmh/dmmn/[\\d]+").match()) {
        page.setSkip(true);
        return;
    }

    // Link under #nl; only queued when the selector produced one.
    String next = page.getHtml().$("#nl > a").links().toString();
    if (next != null) {
        page.addTargetRequest(next);
    }

    // Read the src attribute directly instead of slicing the serialized <img> tag, which
    // depended on src being the last attribute of the tag.
    String img = page.getHtml().$("#big-pic p img", "src").toString();
    if (img == null) {
        // Fallback layout: the picture is wrapped in an anchor rather than a paragraph.
        img = page.getHtml().$("#big-pic a img", "src").toString();
    }
    if (img == null || img.isEmpty()) {
        // No picture on this page; keep the pipeline from running without an "img" field.
        page.setSkip(true);
        return;
    }
    page.putField("img", img);
}
/**
 * Supplies the crawl settings held in the {@code site} field.
 *
 * @return the {@code Site} instance configured for this processor
 */
@Override
public Site getSite() {
    return this.site;
}
/**
 * Entry point: builds a spider around {@code Danli}, seeds it with the gallery index URL,
 * installs a queue scheduler with a Bloom-filter duplicate remover, registers the
 * {@code Demo} pipeline and runs the crawl with 5 threads.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    Spider crawler = Spider.create(new Danli());
    crawler.addUrl("http://www.mmonly.cc/ktmh/dmmn/");

    BloomFilterDuplicateRemover seenUrls = new BloomFilterDuplicateRemover(10000);
    crawler.setScheduler(new QueueScheduler().setDuplicateRemover(seenUrls));

    crawler.addPipeline(new Demo());
    crawler.thread(5);
    crawler.run();
}
} package com.xcx.spots.test;
/**
String url = resultItems.get("img").toString();
UrlFileDownloadUtil.downloadPicture(url);
} } package com.xcx.spots.test; /**
/**
 * Helper that copies the content behind a URL into a local {@code .jpg} file with a random
 * (UUID-based) name.
 */
public class UrlFileDownloadUtil {

    /** Directory prefix used by {@link #downloadPicture(String)}. */
    private static final String DEFAULT_DIRECTORY = "D:\\image\\";

    /** Size of the copy buffer in bytes. */
    private static final int BUFFER_SIZE = 1024 * 100;

    /**
     * Downloads {@code url} into the default directory. Failures are printed, not thrown.
     *
     * @param url address of the picture to download
     */
    public static void downloadPicture(String url) {
        downloadPicture(url, DEFAULT_DIRECTORY);
    }

    /**
     * Downloads {@code url} into {@code directory} under a random file name ending in
     * {@code .jpg}. The call is best-effort: any failure is printed and reported through a
     * {@code null} return value instead of an exception.
     *
     * @param url       address of the picture to download
     * @param directory directory prefix, including its trailing separator; it is concatenated
     *                  with the file name as-is
     * @return the path of the written file, or {@code null} if the download failed
     */
    public static String downloadPicture(String url, String directory) {
        if (url == null || url.isEmpty()) {
            System.err.println("downloadPicture: no URL given");
            return null;
        }

        String target = directory + UUID.randomUUID().toString() + ".jpg";

        // Create the target directory on first use; without it every write would fail.
        File parent = new File(target).getParentFile();
        if (parent != null) {
            parent.mkdirs();
        }

        // try-with-resources closes both streams even when the copy fails part-way.
        try (InputStream in = new URL(url).openStream();
             OutputStream out = new FileOutputStream(target)) {
            byte[] buffer = new byte[BUFFER_SIZE];
            int length;
            while ((length = in.read(buffer)) != -1) {
                out.write(buffer, 0, length);
            }
        } catch (Exception e) {
            // Deliberately broad: one bad picture URL must not abort the caller.
            e.printStackTrace();
            return null;
        }

        System.out.println(" :" + target);
        return target;
    }
}