Python crawler: scraping historical weather data
Source code first.
This time I used BeautifulSoup to parse the HTML, which makes things very convenient.
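To show what that convenience looks like, here is a minimal, self-contained sketch of the find / find_all pattern the script below relies on (the HTML snippet is made up for illustration):

from bs4 import BeautifulSoup

# Invented HTML resembling the structure of the target page
html = """
<table>
  <tr><th>Date</th><th>Weather</th></tr>
  <tr><td>2016-01-01</td><td>Sunny</td></tr>
</table>
"""

soup = BeautifulSoup(html, features='lxml')
table = soup.find('table')               # first <table> in the document
for row in table.find_all('tr')[1:]:     # skip the header row
    cells = [td.getText() for td in row.find_all('td')]
    print(cells)                         # ['2016-01-01', 'Sunny']

The real script applies the same pattern to the monthly weather tables on tianqihoubao.com.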
import datetime
import pandas as pd
import re
import requests
import time
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}

def get_html(url):
    # Retry until the response actually contains the data table
    while True:
        r = requests.get(url, headers=headers)
        print('Requesting', url)
        if 'table' in r.text:
            print('Fetched successfully')
            return r.content
        else:
            print('No table in the response, retrying')
            time.sleep(1)

def parse_html(page_content):
    soup = BeautifulSoup(page_content, features='lxml')
    table = soup.find('table')
    item_list = table.find_all('tr')
    month = []
    for i in range(1, len(item_list)):  # skip the header row
        td = item_list[i].find_all('td')
        day = list()
        # Date
        day.append(parse_date(td[0].a.getText()))
        # Low and high temperatures
        nums = re.findall(r'-?\d+', td[2].getText())
        day.append(int(nums[1]))
        day.append(int(nums[0]))
        # Weather description and wind (strip all whitespace)
        pattern = re.compile(r'\s+')
        day.append(re.sub(pattern, '', td[1].getText()))
        day.append(re.sub(pattern, '', td[3].getText()))
        month.append(day)
    return month

def parse_date(text):
    # The date text has the form '2016年01月01日'
    y = text.find('年')
    m = text.find('月')
    d = text.find('日')
    return datetime.date(int(text[y - 4: y]), int(text[m - 2: m]), int(text[d - 2: d]))

def main():
    data = []
    for year in [2016, 2017]:
        for month in range(1, 13):
            print(f'Fetching data for {year}-{month}')
            month_str = '0' + str(month) if month < 10 else str(month)
            url = 'http://www.tianqihoubao.com/lishi/wuhan/month/' + str(year) + month_str + '.html'
            h = get_html(url)
            data.extend(parse_html(h))
    frame = pd.DataFrame(data, columns=['date', 'low_tp', 'high_tp', 'weather', 'wind'])
    frame.to_csv('weather.csv', index=False)

if __name__ == '__main__':
    main()
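Once the script has finished, the data can be read back with pandas; a minimal sketch, assuming weather.csv was written to the current directory:

import pandas as pd

# Read the scraped data back; parse_dates turns the date column into datetimes
frame = pd.read_csv('weather.csv', parse_dates=['date'])
print(frame.head())

# Example: monthly mean of the daily high temperature
monthly_high = frame.set_index('date')['high_tp'].resample('M').mean()
print(monthly_high)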