Python web scraper — Lianjia Suzhou sold-home prices (part 2)
2457 words
# -*- coding: utf-8 -*-
import bs4
import requests
import time  # stdlib: used to time the crawl and drive the progress display
def open_url(url):
    """Fetch *url* and return the ``requests.Response``.

    A desktop-Chrome User-Agent is sent because lianjia.com rejects the
    default ``python-requests`` agent; the 10-second timeout keeps one
    dead page from hanging the whole crawl.
    """
    # e.g. url = 'https://su.lianjia.com/chengjiao/gongyeyuan/pg1/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'
    }
    return requests.get(url, headers=headers, timeout=10)
# Listing pages are paginated as .../chengjiao/pg1, pg2, ...
host = 'https://su.lianjia.com/chengjiao/pg'
whvj = []  # detail-page URLs harvested from the listing pages
aa = []    # per-home sold price (record_price span text)
bb = []    # per-home transaction-detail tokens (content div)
cc = []    # per-home name field (last "name" div)
dd = []    # per-home page <title> text
count = 1            # current listing-page number
start = time.time()  # wall-clock start, reported at the end of the run
size = 0             # detail pages processed so far (used by the second pass)
q = 2                # crawl listing pages 1 .. q-1
while count < q:
    url = host + str(count)
    r = open_url(url)
    soup = bs4.BeautifulSoup(r.text, 'html.parser')
    count = count + 1
    # Each <a class="img"> on a listing page links to a sold-home detail page.
    targets = soup.find_all('a', class_="img")
    for each in targets:
        whvj.append(each['href'])
    # Crude in-place console progress bar for the listing pass.
    # NOTE(review): count was already incremented, so this shows the page
    # as complete before the next fetch — presumably intentional.
    print('\r' + " :" + int(count / q * 100) * "█"
          + "【" + str(round(float(count / q) * 100, 2)) + "%" + "】", end="")
# print(url)
# print(whvj)
count1 = 0  # unused in the visible code; kept for compatibility
# NOTE(review): the original issued an extra `requests.get(url, stream=True)`
# here whose response was never read or closed — a leaked streaming
# connection with no effect on the output. Removed.
chunk_size = 1024  # scales the (loosely labelled) "MB" figure in the progress bar
content_size = len(whvj)  # total detail pages to fetch
for i in whvj:
    soup1 = bs4.BeautifulSoup(open_url(i).text, 'html.parser')
    # Sold price lives in <span class="record_price">.
    djjx = soup1.find_all("span", class_="record_price")
    aa.append(djjx[0].text)
    # Transaction details: children of the first <div class="content">.
    # NOTE(review): this can append several entries to bb per page, while
    # aa/cc/dd grow one entry per page — the final zip-by-index over len(bb)
    # may misalign or overrun; confirm against the page layout.
    xbxi = soup1.find_all("div", class_="content")
    for each in xbxi[0]:
        bb.append(each.text.split())
    uijm = soup1.find_all("div", class_="name")
    cc.append(uijm[len(uijm) - 1].text)
    title = soup1.find_all("title")
    for each in title:
        dd.append(each.text)
    size = size + 1
    # In-place progress bar for the detail pass.
    print('\r' + " :" + int(size / content_size * 100) * "█"
          + " 【" + str(round(size / chunk_size / 1024, 2)) + "MB】"
          + "【" + str(round(float(size / content_size) * 100, 2)) + "%" + "】", end="")
##
# Assemble one text line per record and dump everything to ty.txt.
result = []
length = len(bb)
# NOTE(review): indexes dd/aa/whvj/cc by len(bb); since bb may grow faster
# than the others (several entries per page), this can raise IndexError or
# misalign rows — confirm intended pairing.
for i in range(length):
    # The trailing '\n' was a string literal broken across physical lines
    # in the pasted source (a syntax error); reconstructed here.
    result.append(str(dd[i]) + ' ' + str(aa[i]) + ' ' + str(whvj[i])
                  + ' ' + str(bb[i]) + '^' + str(cc[i]) + '\n')
end = time.time()
print(" :" + str(end - start) + " ")
# UTF-8 is required: titles and detail fields contain CJK text.
with open('ty.txt', 'w', encoding='utf-8') as f:
    for each in result:
        f.write(each)