中関村の携帯電話データのスクレイピング---1
2486 ワード
import re
import time
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import random
def get_value(url, html=None):
    """Parse one product-listing page into a column-oriented dict.

    The page is expected to contain a ``<div class="list-box">`` holding
    ``<div class="price-box">`` elements (price and date) and
    ``<div class="pro-intro">`` elements (product name and spec list).

    Args:
        url: Address of the page. Kept for interface compatibility; it is
            not used for parsing.
        html: Page markup to parse. When omitted, the module-level name
            ``f`` is used, which is how existing callers hand the
            downloaded text to this function.

    Returns:
        A dict mapping column name to a list of values. ``'NAME'``,
        ``'PRICE'`` and ``'DATE'`` are always present; every spec label met
        on the page adds one more column, padded with ``'N/A'`` for
        products that do not list it.

    Raises:
        AttributeError: If the page has no ``list-box`` container, or a
            product block has no ``<h3><a>`` name link.

    NOTE(review): PRICE/DATE are collected from ``price-box`` elements and
    NAME from ``pro-intro`` elements in two independent passes, so the lists
    only line up when the page has the same number of both.
    """
    if html is None:
        # Backward compatibility: callers store the page text in global ``f``.
        html = f
    soup = bs(html, 'html.parser')
    # Look the container up once instead of once per pass.
    list_box = soup.find('div', class_="list-box")

    Dic = {'NAME': [], 'PRICE': [], 'DATE': []}
    fixed_columns = ('NAME', 'PRICE', 'DATE')

    # Pass 1: price and date of every product.
    for price_box in list_box.find_all('div', class_="price-box"):
        price_tag = price_box.find('b', class_="price-type")
        Dic['PRICE'].append(
            price_tag.string if price_tag is not None else 'N/A')
        date_tag = price_box.find('span', class_="date")
        Dic['DATE'].append(
            date_tag.string if date_tag is not None else 'N/A')

    # Pass 2: name and spec list of every product.
    intros = list_box.find_all('div', class_="pro-intro")
    for row_index, intro in enumerate(intros):
        Dic['NAME'].append(intro.find('h3').a.string)

        specs = {}
        param_list = intro.find('ul', class_="param clearfix")
        # The first <li> is skipped, as in the original code.
        items = param_list.find_all('li')[1:] if param_list is not None else []
        for item in items:
            label_tag = item.span
            if label_tag is None or label_tag.string is None:
                # No usable label: nothing to file this entry under.
                continue
            # Drop the label's last character (its trailing separator).
            label = label_tag.string[:-1]
            if label not in Dic:
                # New column: back-fill the products already processed.
                Dic[label] = ['N/A'] * row_index
            specs[label] = item.get('title', 'N/A')

        # Extend every spec column by one row so the columns stay aligned.
        for key, column in Dic.items():
            if key not in fixed_columns:
                column.append(specs.get(key, 'N/A'))

    return Dic
# Crawl listing pages 1..40, build one DataFrame per page, then merge them
# and write the result to a CSV file.
total=[]
# The request header is the same for every page, so build it once.
hea={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:63.0) Gecko/20100101 Firefox/63.0'}
# Running number of rows collected so far (equals the merged frame's length).
row_count=0
for i in range(1,41):
    url='http://detail.zol.com.cn/cell_phone_index/subcate57_0_list_1_0_1_1_0_{0}.html'.format(i)
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, headers = hea, timeout=30)
    # Decode the response body as GBK.
    response.encoding='gbk'
    # ``f`` must stay a module-level name: get_value() reads the page text
    # from it rather than from its ``url`` argument.
    f=response.text
    d=get_value(url)
    data=pd.DataFrame(d)
    # NOTE(review): several column labels below look like they lost their
    # non-ASCII characters (blank and duplicated names) - restore the real
    # labels before relying on this selection.
    data=data.loc[:,['NAME','CPU ',' ',' ','CPU ',' ','RAM ','DATE','PRICE']]
    total.append(data)
    row_count+=len(data)
    # Progress line: page number, rows on this page, rows so far.
    print('---- ',i,' , ',len(d['NAME']),' , ',row_count,' ')
    # Pause between requests.
    time.sleep(3)
# Merge once after the loop instead of re-concatenating on every page.
D=pd.concat(total,ignore_index=True)
D.to_csv(r' 4.csv',encoding='gb18030')
print(' , ',len(D['NAME']),' ')
# Bare expression: the value is discarded when run as a script and is only
# displayed in an interactive session.
data.head()