Python を使用して住宅価格情報を取得する

15867 ワード

貝殻網(ke.com)から住宅価格情報を取得します。
基本的な手順は、私のこちらのブログ記事と同じです: https://www.cnblogs.com/mrlayfolk/p/12319414.html 。不明な点があれば、そちらを参考にしてください。
次のコードは、3000 件のサンプルを取得するコードです。
 1 # encoding:utf-8
 2 
 3 '''
 4 Purpose: fetch housing price listings. URL: https://cd.ke.com/ershoufang/qingyang/l2/
 5 Environment: python 3.7.3
 6 Libraries: requests、BeautifulSoup、xlwt
 7 '''
 8 
 9 import logging
10 import xlwt
11 import requests
12 import string
13 from bs4 import BeautifulSoup
14 
# HTTP headers sent with every page request: a desktop Chrome User-Agent
# string and an explicit Host header for cd.ke.com.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/79.0.3945.130 Safari/537.36'
    ),
    'Host': 'cd.ke.com',
}
19 
 20 # Save the scraped rows to an Excel file
def save_info(content):
    """Write the scraped rows to ``./house_info.xls``.

    A single sheet named 'house info' is created. Row 0 receives five
    header cells; each row of ``content`` is then written starting at
    sheet row 1, using the first five items of the row (columns 0-4).

    Args:
        content: Sequence of rows, each indexable at positions 0..4.
    """
    book = xlwt.Workbook(encoding='ascii')
    sheet = book.add_sheet('house info')

    # Bold, underlined, italic 'Times New Roman' cell style.
    # NOTE(review): this style is assembled but never passed to write(),
    # so no cell uses it -- confirm whether the header row was meant to.
    header_font = xlwt.Font()
    header_font.name = 'Times New Roman'
    header_font.bold = True
    header_font.underline = True
    header_font.italic = True
    style = xlwt.XFStyle()
    style.font = header_font

    # NOTE(review): the header texts below are whitespace/punctuation only;
    # they look like they lost non-ASCII characters somewhere upstream --
    # verify against the original script.
    header_cells = ('  ', '  ', '    ', '  ( )', '  ( /   )')
    for col, text in enumerate(header_cells):
        sheet.write(0, col, text)

    # Data rows start directly below the header row.
    for row_no, record in enumerate(content, start=1):
        for col in range(5):
            sheet.write(row_no, col, record[col])

    book.save('./house_info.xls')
41 
42 
 43 # Scrape the listing fields from the result pages
 44 # Returned fields: title position houseinfo totalprice unitprice
def get_info():
    """Scrape listing fields from the cd.ke.com result pages.

    Requests pages 1..100 of
    ``https://cd.ke.com/ershoufang/qingyang/pg<N>l2/`` and, on every page,
    collects the text of each ``div`` with class ``info clear`` (title),
    ``positionInfo``, ``houseInfo``, ``totalPrice`` and ``unitPrice``.

    Returns:
        list[list[str]]: One
        ``[title, position, house, total_price, unit_price]`` row per
        listing, in page order.

    Raises:
        AttributeError: If a matched ``div`` lacks the nested tag the code
            reads from (for example ``item.div.a`` or ``item.span``).
        Exception: Whatever ``requests.get`` raises on a failed or timed-out
            request is propagated unchanged.
    """
    title_list = []
    position_list = []
    house_list = []
    totalPrice_list = []
    unitPrice_list = []

    # Pages are requested as pg1..pg100 so the URL matches the page number
    # shown in the progress line (the listing previously requested pg0..pg99
    # while printing 1..100).
    for page in range(1, 101):
        link = 'https://cd.ke.com/ershoufang/qingyang/pg%dl2/' % page
        r = requests.get(link, headers=headers, timeout=10)
        print(str(page), 'status_code: ', r.status_code)
        soup = BeautifulSoup(r.text, 'lxml')

        title_list.extend(
            item.div.a.text.strip()
            for item in soup.find_all('div', {'class': 'info clear'})
        )
        position_list.extend(
            item.a.text.strip()
            for item in soup.find_all('div', {'class': 'positionInfo'})
        )
        # Flatten the multi-line house description: newlines become spaces,
        # then every space is removed.
        house_list.extend(
            item.text.strip().replace('\n', ' ').replace(' ', '')
            for item in soup.find_all('div', {'class': 'houseInfo'})
        )
        totalPrice_list.extend(
            item.span.text.strip()
            for item in soup.find_all('div', {'class': 'totalPrice'})
        )
        # NOTE(review): these two replace() patterns contain only spaces and
        # '/', and the second can never match after the first; they look like
        # they lost non-ASCII characters upstream -- verify against the
        # original script.
        unitPrice_list.extend(
            item.span.text.strip().replace(' ', '').replace(' / ', '')
            for item in soup.find_all('div', {'class': 'unitPrice'})
        )

    columns = (title_list, position_list, house_list,
               totalPrice_list, unitPrice_list)
    for column in columns:
        print(len(column))

    # The five columns are gathered independently, so a listing missing one
    # field leaves them with different lengths. zip() stops at the shortest
    # column instead of raising IndexError after all pages were fetched;
    # the warning makes that truncation visible.
    if len({len(column) for column in columns}) > 1:
        print('warning: field counts differ; rows truncated to the shortest column')

    return [list(row) for row in zip(*columns)]


if __name__ == "__main__":
    all_info = get_info()
    save_info(all_info)