pythonを使用して住宅価格情報を取得
15867 ワード
貝殻網 (ke.com) から住宅価格情報を取得します。
基本的な手順は以前のブログ記事 (https://www.cnblogs.com/mrlayfolk/p/12319414.html) と同じです。不明な点があれば、そちらを参照してください。
次のコードは、3000 件のサンプルを取得するコードです。
基本的な手順は私のこのブログと同じです.https://www.cnblogs.com/mrlayfolk/p/12319414.html.よく知らないものは参考にしてください.
次のコードは3000個のサンプルを取得するコードです.
1 # encoding:utf-8
2
3 '''
4 Scrape housing listings from Beike (ke.com). Target URL: https://cd.ke.com/ershoufang/qingyang/l2/
5 Environment: python 3.7.3
6 Libraries: requests, BeautifulSoup, xlwt
7 '''
8
9 import logging
10 import xlwt
11 import requests
12 import string
13 from bs4 import BeautifulSoup
14
# HTTP headers sent with every page request made by this script.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/79.0.3945.130 Safari/537.36'
    ),
    'Host': 'cd.ke.com',
}
19
def save_info(content):
    """Write the scraped rows to ``./house_info.xls``.

    Row 0 of the ``house info`` sheet holds the column headers; each entry of
    ``content`` is written to the following rows, one row per entry.

    Args:
        content: Sequence of rows. Items 0-4 of every row are written to
            columns 0-4; a row with fewer than five items raises IndexError.
    """
    workbook = xlwt.Workbook(encoding='ascii')
    worksheet = workbook.add_sheet('house info')

    # NOTE(review): the header labels are blank apart from punctuation --
    # presumably non-ASCII text was lost from this listing; confirm against
    # the original source before relying on the header row.
    header_labels = (' ', ' ', ' ', ' ( )', ' ( / )')
    for col, label in enumerate(header_labels):
        worksheet.write(0, col, label)

    # Data rows start at spreadsheet row 1, directly below the header row.
    for row_idx, row in enumerate(content, start=1):
        for col in range(len(header_labels)):
            worksheet.write(row_idx, col, row[col])

    workbook.save('./house_info.xls')
41
42
def get_info():
    """Scrape the listing pages and return one row per listing.

    Requests ``https://cd.ke.com/ershoufang/qingyang/pg<N>l2/`` for N = 1..100
    using the module-level ``headers``, and collects five text fields from
    each page: title, position, house info, total price and unit price.

    Returns:
        A list of ``[title, position, house_info, total_price, unit_price]``
        lists of strings, in the order the fields were found.

    Raises:
        requests.RequestException: if a page request fails or times out.
        AttributeError: if a matched element lacks the expected child tag.
    """
    title_list = []
    position_list = []
    house_list = []
    total_price_list = []
    unit_price_list = []

    # Pages are numbered from 1, so the page number in the URL and the
    # progress number printed below are the same value.
    for page in range(1, 101):
        link = 'https://cd.ke.com/ershoufang/qingyang/pg%dl2/' % page
        r = requests.get(link, headers=headers, timeout=10)
        print(str(page), 'status_code: ', r.status_code)
        soup = BeautifulSoup(r.text, 'lxml')

        for item in soup.find_all('div', {'class': 'info clear'}):
            title_list.append(item.div.a.text.strip())
        for item in soup.find_all('div', {'class': 'positionInfo'}):
            position_list.append(item.a.text.strip())
        for item in soup.find_all('div', {'class': 'houseInfo'}):
            # Collapse the multi-line text into one token without whitespace.
            house = item.text.strip().replace('\n', ' ').replace(' ', '')
            house_list.append(house)
        for item in soup.find_all('div', {'class': 'totalPrice'}):
            total_price_list.append(item.span.text.strip())
        for item in soup.find_all('div', {'class': 'unitPrice'}):
            # NOTE(review): the second replace() can never match once all
            # spaces are removed -- presumably both literals lost non-ASCII
            # text in this listing; confirm against the original source.
            unit_price = item.span.text.strip().replace(' ', '').replace(' / ', '')
            unit_price_list.append(unit_price)

    columns = (title_list, position_list, house_list,
               total_price_list, unit_price_list)
    for column in columns:
        print(len(column))

    # The five fields are collected independently, so a listing that lacks
    # one element shifts every later row; report it instead of failing with
    # an IndexError after all pages have been downloaded.
    if len({len(column) for column in columns}) > 1:
        logging.warning(
            'field counts differ between columns; rows are truncated to the '
            'shortest column and may be misaligned')

    return [list(row) for row in zip(*columns)]
88
89
if __name__ == "__main__":
    # Scrape every listing page, then write the collected rows to the
    # spreadsheet in a single pass.
    save_info(get_info())