Pythonクローラー：校花網トップページの情報をスクレイピングする

2622 ワード

校花（キャンパスの人気女子学生）ランキングの情報(氏名、票数、画像)を取得して保存
http://www.ttpaihang.com/vote/rank.php?voteid=621

 
#!/usr/bin/env python 
# -*- coding:utf-8 -*-

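"""
    Scrape the ttpaihang.com vote ranking page and save the result as JSON.

    1. Request the ranking page and decode its HTML.
    2. Parse the HTML and locate each ranked entry.
    3. Extract the name, vote count and image URL of every entry.
    4. Write the collected entries to ./file/wanghong.json.
"""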
import json
import parser
import requests
import lxml.html

def parse_url(url, headers):
    """Download *url* and return the response body decoded as text.

    Args:
        url: Address of the page to fetch.
        headers: Mapping of HTTP request headers (e.g. a User-Agent).

    Returns:
        The response body decoded as GBK text.

    Raises:
        requests.RequestException: If the request fails or times out.
        UnicodeDecodeError: If the body is not valid GBK.
    """
    # The second positional parameter of requests.get() is ``params``, so the
    # headers must be passed by keyword; otherwise they are appended to the
    # query string and never sent as HTTP headers.
    # The timeout keeps the script from hanging forever on a stalled server.
    response = requests.get(url, headers=headers, timeout=10)
    # GBK is a superset of GB2312: every GB2312 page decodes identically, and
    # pages that contain GBK-only characters no longer raise.
    return response.content.decode("gbk")


def get_wanghong_data(html_content):
    """Extract the name, vote count and image URL of each ranked entry.

    Args:
        html_content: HTML text of the ranking page.

    Returns:
        A list of dicts with the keys ``"name"``, ``"sum"`` (the vote count
        text) and ``"img"`` (the image URL prefixed with the site origin).
        Centered table cells that do not contain all three pieces of data
        are skipped.
    """
    etree = lxml.html.etree
    # Named ``document`` rather than ``parser`` so the module-level
    # ``import parser`` is not shadowed.
    document = etree.HTML(html_content, etree.HTMLParser())

    name_path = "./table[@width='460']/tr/td[@class='main2_bt_td']/div[@align='left']/table[@width='100%']/tr/td[@width='75%']/span[@class='zthei']/a/text()"
    votes_path = "./table[@width='460']/tr/td[@class='zthong']/text()"
    image_path = "./table[@width='460']/tr/td[@width='155']/div[@align='center']/a/img/@src"

    data = []
    for cell in document.xpath("//td [@align='center']"):
        names = cell.xpath(name_path)
        votes = cell.xpath(votes_path)
        images = cell.xpath(image_path)
        # The selector matches every centered <td>; cells that are not a
        # ranking entry yield empty results, and indexing [0] on them would
        # raise IndexError, so they are skipped.
        if not (names and votes and images):
            continue
        data.append({
            "name": names[0],
            "sum": votes[0],
            "img": "http://www.ttpaihang.com" + images[0],
        })
    return data


def save_file(datas):
    """Serialize *datas* as pretty-printed JSON to ./file/wanghong.json.

    Args:
        datas: JSON-serializable object (here, the list of entry dicts).

    Raises:
        TypeError: If *datas* is not JSON-serializable.
        OSError: If the directory or file cannot be created or written.
    """
    import os

    output_path = "./file/wanghong.json"
    # open(..., "w") does not create missing parent directories, so make sure
    # ./file exists before writing.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    json_str = json.dumps(datas, ensure_ascii=False, indent=2)
    with open(output_path, "w", encoding="utf-8") as files:
        files.write(json_str)
    print("       !")


def main():
    """Fetch the ranking page, extract its entries and save them to disk."""
    # Browser-like User-Agent sent with the page request.
    request_headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/73.0.3683.86 Safari/537.36"
        )
    }
    ranking_url = "http://www.ttpaihang.com/vote/rank.php?voteid=621"

    # Download -> extract -> persist, each stage feeding the next.
    page_html = parse_url(ranking_url, request_headers)
    entries = get_wanghong_data(page_html)
    save_file(entries)



# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

部分結果図
python爬虫之爬取首页校花网的信息_第1张图片