Pythonクローラー:校花網のトップページの情報をスクレイピングする
2622 ワード
校花(ミスキャンパス)の情報(氏名、票数、画像)を取得して保存
部分結果図
http://www.ttpaihang.com/vote/rank.php?voteid=621
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
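1. Request the ranking page and decode the response body.
2. Parse the HTML into an element tree.
3. Extract each entry's name, vote count and image URL.
4. Save the extracted data to a JSON file.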
"""
import json
import parser
import requests
import lxml.html
def parse_url(url, headers):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        headers: Mapping of HTTP request headers (e.g. a User-Agent).

    Returns:
        The raw response body decoded with the gb2312 codec.

    Raises:
        UnicodeDecodeError: If the body contains bytes that are not valid
            gb2312.
    """
    # The second positional parameter of requests.get() is ``params`` (the
    # query string), so the headers have to be passed by keyword; otherwise
    # they are appended to the URL and never sent as HTTP headers.
    response = requests.get(url, headers=headers)
    return response.content.decode("gb2312")
def get_wanghong_data(html_content):
    """Extract the ranking entries from the page HTML.

    Every ``<td align="center">`` cell is inspected; a cell contributes one
    entry when it contains the nested name, vote-count and image nodes.

    Args:
        html_content: HTML text of the ranking page.

    Returns:
        A list of dicts, one per entry, with the keys ``"name"``, ``"sum"``
        (the text of the vote-count cell) and ``"img"`` (the image ``src``
        prefixed with the site root).
    """
    name_path = "./table[@width='460']/tr/td[@class='main2_bt_td']/div[@align='left']/table[@width='100%']/tr/td[@width='75%']/span[@class='zthei']/a/text()"
    votes_path = "./table[@width='460']/tr/td[@class='zthong']/text()"
    image_path = "./table[@width='460']/tr/td[@width='155']/div[@align='center']/a/img/@src"

    etree = lxml.html.etree
    # Named ``document`` rather than ``parser`` so the module-level
    # ``import parser`` is not shadowed inside this function.
    document = etree.HTML(html_content, etree.HTMLParser())

    data = []
    for cell in document.xpath("//td [@align='center']"):
        names = cell.xpath(name_path)
        votes = cell.xpath(votes_path)
        images = cell.xpath(image_path)
        # xpath() returns an empty list when nothing matches; centered cells
        # that are not ranking entries are skipped instead of raising
        # IndexError on the [0] lookups below.
        if not (names and votes and images):
            continue
        data.append(
            {
                "name": names[0],
                "sum": votes[0],
                "img": "http://www.ttpaihang.com" + images[0],
            }
        )
    return data
def save_file(datas):
    """Write ``datas`` to ``./file/wanghong.json`` as indented UTF-8 JSON.

    Args:
        datas: JSON-serializable object (here, the list of entry dicts).

    The output directory is created when it does not exist yet. Non-ASCII
    characters are written as-is rather than as ``\\uXXXX`` escapes.
    """
    import os  # local import: ``os`` is not imported at the top of the file

    out_path = "./file/wanghong.json"
    # open(..., "w") does not create missing parent directories, so make
    # sure ./file exists before writing.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    json_str = json.dumps(datas, ensure_ascii=False, indent=2)
    with open(out_path, "w", encoding="utf-8") as files:
        files.write(json_str)
    print(" !")
def main():
    """Run the whole scrape: download the page, extract entries, save them."""
    target = "http://www.ttpaihang.com/vote/rank.php?voteid=621"
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
    }

    # Step 1: fetch the ranking page as decoded text.
    page_text = parse_url(target, browser_headers)

    # Step 2: pull the per-entry records out of the HTML, then persist them.
    save_file(get_wanghong_data(page_text))
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
部分結果図