Javaでウェブページのデータ(HTML、CSS、JS)をクロールする


最近、Javaのクローラー(Webスクレイピング)に触れる機会がありました。テキスト情報のクロールは一通りできたので、次は画像をどのように取得するのかを調べてみました。実例として、学校の公式サイトをクロールします。
pom.xml の依存関係:
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.3</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/commons-io/commons-io -->
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.5</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.5</version>
        </dependency>

構成情報の設定:
public class test {
     
    // Base URL of the site to crawl; site-relative resource paths are appended to it.
    private static final String URL = "http://www.ktbdqn.com/";
    // Character encoding used to decode the fetched page (must match the page's own encoding).
    private static final String ECODING = "GBK";
    // Matches one complete <img ...> tag. The pattern has to start with the tag name:
    // a bare "]*?>" only matches a closing '>' and no src attribute is ever found.
    private static final String IMGURL_REG = "<img[^>]*?>";
    // Matches one complete <link ...> tag (same reasoning as for IMGURL_REG).
    private static final String LINKURL_REG = "<link[^>]*?>";
    // Extracts a quoted src/background value ending in jpg/png/gif.
    // Group 3 is the path without the optional leading slash.
    private static final String IMGSRC_REG = "(?x)(src|SRC|background|BACKGROUND)=('|\")/?(([\\w-]+/)*([\\w-]+\\.(jpg|JPG|png|PNG|gif|GIF)))('|\")";
    // Extracts a quoted href value ending in css.
    // Group 3 is the path without the optional leading slash.
    private static final String LINKSRC_REG = "(?x)(href|HREF)=('|\")/?(([\\w-]+/)*([\\w-]+\\.(css|CSS|([\\w-]+/)*([\\w-]+\\.(css|CSS|([\\w-]+/)*([\\w-]+\\.(css|CSS)))))))('|\")";
    // Root directory under which downloaded style sheets are stored.
    private static final String SAVE_CSS_PATH = "d:\\cskt\\";
    // Root directory under which downloaded images are stored.
    private static final String SAVE_PATH = "d:\\";

ここで、取得したページが文字化けしないように注意してください。クロール対象ページの文字エンコーディングに合わせて、ECODING の値を設定します。
画像のダウンロード
/**
 * Downloads every image URL in the list and stores it below {@code SAVE_PATH},
 * recreating the directory part of the URL that follows the site base {@code URL}.
 *
 * <p>Each URL is processed independently: a failure is reported and the loop continues
 * with the next entry instead of abandoning the rest of the list. Streams are closed
 * even when a transfer fails part-way.
 *
 * @param listImgSrc absolute image URLs; {@code null} or an empty list means nothing to do
 */
public static void Download(List<String> listImgSrc) {
    if (listImgSrc == null || listImgSrc.isEmpty()) {
        return;
    }
    int count = 0;
    for (String url : listImgSrc) {
        try {
            String imageName = url.substring(url.lastIndexOf('/') + 1);
            // Path relative to the site root; URLs outside the base go directly under SAVE_PATH.
            String relative = url.startsWith(URL) ? url.substring(URL.length()) : "";
            // Keep only the directory part, including the trailing slash ("" when there is none).
            String relativeDir = relative.substring(0, relative.lastIndexOf('/') + 1);
            File targetDir = new File(SAVE_PATH + relativeDir);

            URLConnection con = new URL(url).openConnection();
            // Give up connecting after 5 seconds.
            con.setConnectTimeout(5 * 1000);

            try (InputStream is = con.getInputStream()) {
                // Create the directory only once the remote resource is actually reachable.
                if (!targetDir.exists()) {
                    targetDir.mkdirs();
                }
                try (OutputStream os = new FileOutputStream(new File(targetDir, imageName))) {
                    System.out.println(++count + ".    :" + url);
                    byte[] bs = new byte[1024];
                    int len;
                    while ((len = is.read(bs)) != -1) {
                        os.write(bs, 0, len);
                    }
                }
            }
            System.out.println(imageName + ":--    ");
            System.out.println();
        } catch (Exception e) {
            System.out.println("    ");
            // Report which URL failed and why instead of silently discarding the cause.
            System.err.println("Download failed for " + url + ": " + e);
        }
    }
}

スタイルのダウンロード
/**
 * Downloads every style sheet URL in the list and stores it below {@code SAVE_CSS_PATH},
 * recreating the directory part of the URL that follows the site base {@code URL}.
 *
 * <p>Each URL is processed independently: a failure is reported and the loop continues
 * with the next entry instead of abandoning the rest of the list. Streams are closed
 * even when a transfer fails part-way.
 *
 * @param listCssSrc absolute style sheet URLs; {@code null} or an empty list means nothing to do
 */
public static void DownCss(List<String> listCssSrc) {
    if (listCssSrc == null || listCssSrc.isEmpty()) {
        return;
    }
    int count = 0;
    for (String url : listCssSrc) {
        try {
            String cssName = url.substring(url.lastIndexOf('/') + 1);
            // Path relative to the site root; URLs outside the base go directly under SAVE_CSS_PATH.
            String relative = url.startsWith(URL) ? url.substring(URL.length()) : "";
            // Keep only the directory part, including the trailing slash ("" when there is none).
            String relativeDir = relative.substring(0, relative.lastIndexOf('/') + 1);
            File targetDir = new File(SAVE_CSS_PATH + relativeDir);

            URLConnection con = new URL(url).openConnection();
            // Give up connecting after 5 seconds.
            con.setConnectTimeout(5 * 1000);

            try (InputStream is = con.getInputStream()) {
                // Create the directory only once the remote resource is actually reachable.
                if (!targetDir.exists()) {
                    targetDir.mkdirs();
                }
                try (OutputStream os = new FileOutputStream(new File(targetDir, cssName))) {
                    System.out.println(++count + ".    :" + url);
                    byte[] bs = new byte[1024];
                    int len;
                    while ((len = is.read(bs)) != -1) {
                        os.write(bs, 0, len);
                    }
                }
            }
            System.out.println(cssName + ":--    ");
            System.out.println();
        } catch (Exception e) {
            System.out.println("    ");
            // Report which URL failed and why instead of silently discarding the cause.
            System.err.println("Download failed for " + url + ": " + e);
        }
    }
}

Webページの画像アドレスを取得
/**
 * Collects the image addresses referenced in the given HTML text.
 *
 * <p>Every fragment matched by {@code IMGURL_REG} is scanned with {@code IMGSRC_REG};
 * capture group 3 of each hit is taken as the image path. A path that does not contain
 * {@code http://} or {@code https://} is prefixed with the site base {@code URL}.
 *
 * @param htmlStr HTML source to scan
 * @return the image URLs in order of appearance (empty when none are found)
 */
private static List<String> getImgStr(String htmlStr) {
    final Pattern srcPattern = Pattern.compile(IMGSRC_REG);
    final List<String> found = new ArrayList<String>();
    final Matcher tagMatcher =
            Pattern.compile(IMGURL_REG, Pattern.CASE_INSENSITIVE).matcher(htmlStr);

    while (tagMatcher.find()) {
        // Look for src/background attributes inside the matched fragment only.
        final Matcher srcMatcher = srcPattern.matcher(tagMatcher.group());
        while (srcMatcher.find()) {
            final String path = srcMatcher.group(3);
            final boolean absolute = path.contains("http://") || path.contains("https://");
            found.add(absolute ? path : URL + path);
        }
    }
    return found;
}

Webページのスタイルアドレスの取得
/**
 * Collects the style sheet addresses referenced in the given HTML text.
 *
 * <p>Every fragment matched by {@code LINKURL_REG} is scanned with {@code LINKSRC_REG};
 * capture group 3 of each hit is taken as the style sheet path. A path that does not
 * contain {@code http://} or {@code https://} is prefixed with the site base {@code URL}.
 *
 * @param htmlStr HTML source to scan
 * @return the style sheet URLs in order of appearance (empty when none are found)
 */
private static List<String> getCssStr(String htmlStr) {
    final Pattern hrefPattern = Pattern.compile(LINKSRC_REG);
    final List<String> found = new ArrayList<String>();
    final Matcher tagMatcher =
            Pattern.compile(LINKURL_REG, Pattern.CASE_INSENSITIVE).matcher(htmlStr);

    while (tagMatcher.find()) {
        // Look for href attributes inside the matched fragment only.
        final Matcher hrefMatcher = hrefPattern.matcher(tagMatcher.group());
        while (hrefMatcher.find()) {
            final String path = hrefMatcher.group(3);
            final boolean absolute = path.contains("http://") || path.contains("https://");
            found.add(absolute ? path : URL + path);
        }
    }
    return found;
}

Webページのソース(HTML)の取得
public static String getHtmlResourceByUrl(String url, String encoding) {
     
        URL urlObj = null;
        URLConnection uc = null;
        InputStreamReader isr = null;
        BufferedReader reader = null;
        StringBuffer buffer = new StringBuffer();
        //       
        try {
     
            urlObj = new URL(url);
            //       
            uc = urlObj.openConnection();
            //        
            isr = new InputStreamReader(uc.getInputStream(), encoding);
            //                  
            reader = new BufferedReader(isr);
            //   
            String temp = null;
            while ((temp = reader.readLine()) != null) {
     //                     
                // System.out.println(temp+"
");
buffer.append(temp + "
"
); } } catch (Exception e) { e.printStackTrace(); } finally { // if (isr != null) { try { isr.close(); } catch (IOException e) { e.printStackTrace(); } } } return buffer.toString(); }

mainメソッド
/**
 * Entry point: fetches the page at {@code URL}, then downloads the images and the
 * style sheets it references.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) throws ClientProtocolException, IOException {
    // getJobInfo is defined outside this excerpt; it is invoked first, as before.
    getJobInfo(URL, ECODING);

    // Fetch the page source once and reuse it for both extraction passes.
    final String pageSource = getHtmlResourceByUrl(URL, ECODING);

    // Images: extract the addresses, then download them.
    final List<String> imageUrls = getImgStr(pageSource);
    Download(imageUrls);

    // Style sheets: extract the addresses, then download them.
    final List<String> styleUrls = getCssStr(pageSource);
    DownCss(styleUrls);
}