Java 画像データクローラー demo

3938 ワード

package com.xcx.spots.test;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover; import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.selector.Selectable; public class Danli implements PageProcessor {
// Crawl configuration returned by getSite(): 3 retries, a sleep time of 1000 and a timeout of 10000
// (presumably milliseconds -- confirm against the WebMagic Site documentation).
private Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000);

/**
 * Handles one downloaded page of the crawl.
 *
 * <p>The seed listing page only contributes new requests: every gallery link found on it plus the
 * link of the second-to-last pager anchor. A gallery detail page contributes the link(s) found
 * under {@code #nl > a} and publishes the picture address under the {@code img} result field.
 * Pages that yield no picture are marked as skipped so that no result without an {@code img}
 * field is published.
 *
 * @param page the page handed over by the spider
 */
@Override
public void process(Page page) {
    // Only the exact seed URL is treated as the listing page.
    if (page.getUrl().toString().equals("http://www.mmonly.cc/ktmh/dmmn/")) {
        // Queue every gallery linked from the listing.
        page.addTargetRequests(page.getHtml().$("div.item_t > div > div.ABox > a").links().all());

        // Queue the link of the second-to-last anchor inside the pager.
        String pagerLink = page.getHtml().$("#pageNum > a:nth-last-child(2)").links().toString();
        if (pagerLink != null) {
            page.addTargetRequest(pagerLink);
        }

        // The listing itself carries no picture, so nothing is published for it.
        page.setSkip(true);
        return;
    }

    // regex(...) returns a Selectable even when nothing matched, so comparing it with null can
    // never detect a miss; match() is the actual "did the pattern match" check.
    if (!page.getUrl().regex("http://www.mmonly.cc/ktmh/dmmn/[\\d]+").match()) {
        page.setSkip(true);
        return;
    }

    // Follow the link(s) under #nl (presumably the "next picture" navigation -- confirm on the site).
    String nextLink = page.getHtml().$("#nl > a").links().toString();
    if (nextLink != null) {
        page.addTargetRequest(nextLink);
    }

    // Read the src attribute directly instead of slicing the serialized <img> tag, which only
    // worked when src happened to be the last attribute. The picture sits either inside a <p>
    // or inside an <a> below #big-pic.
    String img = page.getHtml().$("#big-pic p img", "src").toString();
    if (img == null) {
        img = page.getHtml().$("#big-pic a img", "src").toString();
    }

    if (img == null || img.trim().isEmpty()) {
        // No picture on this page: publish nothing rather than failing on a null value.
        page.setSkip(true);
        return;
    }

    page.putField("img", img);
}

/**
 * Supplies the crawl configuration of this processor.
 *
 * @return the {@code Site} instance stored in the {@code site} field
 */
@Override
public Site getSite() {
    return this.site;
}

/**
 * Starts the crawl: seeds the spider with the listing URL, de-duplicates requests through a
 * Bloom filter sized with 10000, sends results to {@code Demo} and runs with 5 threads.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    BloomFilterDuplicateRemover duplicateRemover = new BloomFilterDuplicateRemover(10000);

    Spider spider = Spider.create(new Danli());
    spider.addUrl("http://www.mmonly.cc/ktmh/dmmn/");
    spider.setScheduler(new QueueScheduler().setDuplicateRemover(duplicateRemover));
    spider.addPipeline(new Demo());
    spider.thread(5);
    spider.run();
}

} package com.xcx.spots.test;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

/**
 * Pipeline that passes the picture address published under the {@code img} result field to
 * {@code UrlFileDownloadUtil.downloadPicture}.
 *
 * @ProjectName: spots
 * @Package: com.xcx.spots.test
 * @ClassName: Demo
 * @Author: nh
 * @Description:
 * @Date: 2020/7/2 13:48
 * @Version: 1.0
 */
public class Demo implements Pipeline {

    /**
     * Downloads the picture referenced by the {@code img} field of the given results.
     *
     * @param resultItems results published by the page processor
     * @param task the task the results belong to (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        Object img = resultItems.get("img");
        if (img == null) {
            // A page that published no "img" field has nothing to download;
            // calling toString() on the missing value would throw a NullPointerException.
            return;
        }
        UrlFileDownloadUtil.downloadPicture(img.toString());
    }
}

package com.xcx.spots.test;

  import java.io.File;
  import java.io.FileOutputStream;
  import java.io.IOException;
  import java.io.InputStream;
  import java.io.OutputStream;
  import java.net.URL;
  import java.util.UUID;

  /**
   * Downloads the content behind a URL into a randomly named {@code .jpg} file.
   *
   * @ProjectName: spots
   * @Package: com.xcx.spots.test
   * @ClassName: UrlFileDownloadUtil
   * @Author: nh
   * @Description:
   * @Date: 2020/7/2 13:48
   * @Version: 1.0
   */
  public class UrlFileDownloadUtil {

      /** Directory used by {@link #downloadPicture(String)}. */
      private static final String DEFAULT_DIRECTORY = "D:\\image\\";

      /** Size of the copy buffer in bytes. */
      private static final int BUFFER_SIZE = 1024 * 100;

      /**
       * Downloads {@code url} into the default directory. Failures are reported on the error
       * stream and never thrown, so one bad picture does not abort the caller.
       *
       * @param url address of the picture to download
       */
      public static void downloadPicture(String url) {
          downloadPicture(url, DEFAULT_DIRECTORY);
      }

      /**
       * Downloads {@code url} into {@code directory}, creating the directory when it is missing.
       * The file name is a random UUID with the extension {@code .jpg}.
       *
       * @param url address of the picture to download
       * @param directory directory the file is written to
       * @return the path of the written file, or {@code null} when nothing was saved
       */
      public static String downloadPicture(String url, String directory) {
          if (url == null || url.trim().isEmpty() || directory == null) {
              System.err.println("downloadPicture: missing url or directory, nothing downloaded");
              return null;
          }

          File targetDirectory = new File(directory);
          // mkdirs() first, isDirectory() second: another thread may have created the directory
          // between the two calls, which must not be reported as a failure.
          if (!targetDirectory.mkdirs() && !targetDirectory.isDirectory()) {
              System.err.println("downloadPicture: cannot create directory " + directory);
              return null;
          }

          File target = new File(targetDirectory, UUID.randomUUID().toString() + ".jpg");

          // try-with-resources closes both streams even when reading or writing fails.
          try (InputStream in = new URL(url).openStream();
               OutputStream out = new FileOutputStream(target)) {
              byte[] buffer = new byte[BUFFER_SIZE];
              int length;
              // read() signals end of stream with -1.
              while ((length = in.read(buffer)) != -1) {
                  out.write(buffer, 0, length);
              }
          } catch (IOException | RuntimeException e) {
              // Best effort, as before: report the problem and carry on.
              e.printStackTrace();
              // Do not leave a truncated picture behind.
              if (target.exists() && !target.delete()) {
                  System.err.println("downloadPicture: cannot remove partial file " + target.getPath());
              }
              return null;
          }

          System.out.println("    :" + target.getPath());
          return target.getPath();
      }
  }