Java 簡単なクローラーの小さなデモ(情報のクローリングがメイン)

2146 ワード

/**
 * Small jsoup-based helper that downloads an HTML page and extracts link
 * entries (text and href) from elements matched by a CSS class.
 *
 * Created on 2017/11/29.
 */
public class Html {

    /** Timeout, in milliseconds, of the plain POST request that is tried first. */
    private static final int PLAIN_TIMEOUT_MS = 500000;

    /** Timeout, in milliseconds, of the parameterised fallback POST request. */
    private static final int FALLBACK_TIMEOUT_MS = 300000;

    /**
     * Fetches the document at {@code url} with an HTTP POST.
     *
     * <p>A plain POST is sent first. Only if that fails with an
     * {@link IOException} is a second POST attempted, this time carrying a
     * {@code query} form field, a {@code Mozilla} user agent and an
     * {@code auth} cookie. The previous implementation always issued both
     * requests and discarded the first result whenever the second succeeded;
     * the returned document is the same, but the redundant round-trip is gone.
     *
     * @param url address of the page to download
     * @return the parsed document, or {@code null} if both attempts failed
     */
    public Document getHtmlTextByUrl(String url) {
        try {
            return Jsoup.connect(url).timeout(PLAIN_TIMEOUT_MS).post();
        } catch (IOException e) {
            // Not fatal yet: report with context and try the fallback request below.
            System.err.println("POST to " + url + " failed, retrying with fallback parameters");
            e.printStackTrace();
        }
        try {
            return Jsoup.connect(url)
                    .data("query", "Java")
                    .userAgent("Mozilla")
                    .cookie("auth", "token")
                    .timeout(FALLBACK_TIMEOUT_MS)
                    .post();
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }


    /**
     * Selects elements from a document.
     *
     * <p>Despite the method name, {@code className} is handed to
     * {@code Document.select} unchanged, i.e. it is treated as a CSS selector.
     * To match by class the caller must include the leading dot
     * (for example {@code ".foo"}).
     *
     * @param doc       document to search; must not be {@code null}
     * @param className CSS selector to evaluate against {@code doc}
     * @return the elements matched by the selector
     */
    public Elements getElementByClass(Document doc, String className) {
        return doc.select(className);
    }

    /**
     * Downloads {@code url} and collects one row per child of every element
     * carrying the CSS class {@code type}.
     *
     * <p>For each such child, the child's own first child element is inspected
     * (children without any child element are skipped) and a four-slot
     * {@code String[]} is appended to the result:
     * <ul>
     *   <li>index 0 - left {@code null} (not populated)</li>
     *   <li>index 1 - own text of that first child element</li>
     *   <li>index 2 - its {@code href} attribute (empty string if absent)</li>
     *   <li>index 3 - the {@code type} argument</li>
     * </ul>
     *
     * @param name unused; retained so existing callers keep compiling
     * @param url  address of the page to scrape
     * @param type CSS class name, without the leading dot, of the container
     *             elements to scan
     * @return a list of {@code String[4]} rows as described above; empty if
     *         the page could not be fetched or nothing matched, never
     *         {@code null}
     */
    public ArrayList getProvince(String name, String url, String type) {
        ArrayList<String[]> result = new ArrayList<String[]>();
        Document doc = this.getHtmlTextByUrl(url);
        if (doc == null) {
            return result;
        }
        for (Element container : this.getElementByClass(doc, "." + type)) {
            for (Element cell : container.children()) {
                // Elements.first() yields null when the cell has no child elements.
                Element link = cell.children().first();
                if (link == null) {
                    continue;
                }
                String[] row = new String[4];
                row[1] = link.ownText();
                row[2] = link.attr("href");
                row[3] = type;
                result.add(row);
            }
        }
        return result;
    }

}