python爬虫実戦:pyqueryを利用して猫眼映画TOP 100ランキングを取得する
前回はbeautifulsoupを使って猫眼映画TOP 10を取りましたが、今回は最近習ったpyqueryを利用して再び実戦しました.bs 4より使いやすい感じです.
次のようにコードを共有します.コミュニケーションを歓迎します.
from pyquery import PyQuery as pq
import requests
import os
import time
# --- Script configuration -------------------------------------------------
# NOTE: time.clock() was deprecated in Python 3.3 and removed in 3.8;
# time.perf_counter() is the documented replacement for measuring elapsed time.
begin = time.perf_counter()  # start timestamp for the elapsed-time report

file_path = 'D:\python3.6\scrapy\ '  # output directory (created if missing)
file_name = 'maoyan.txt'             # output file name
file = file_path+'\\'+file_name      # full path of the output file

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
start = "http://maoyan.com/board/4"  # first page of the TOP-100 board
flage = '?offset='                   # query-string prefix used for pagination
node = 'dd'                          # CSS selector: one <dd> per movie entry
step = 10                            # movies listed per page
length = 100                         # total number of ranked movies to fetch
def create_file(file_path, file):
    """Ensure *file_path* exists and (re)create *file* as an empty UTF-8 file.

    Creates the output directory when it is missing, then opens the output
    file in 'w' mode so every run starts from a clean (truncated) file.
    """
    if not os.path.exists(file_path):
        os.makedirs(file_path)
    # Mode 'w' creates the file if needed and truncates an existing one, so
    # the original seek(0)/truncate() calls were redundant.  Using a
    # with-statement also fixes the original resource leak: the handle opened
    # in the makedirs branch was never closed (and had no explicit encoding).
    with open(file, 'w', encoding='utf-8'):
        pass
create_file(file_path, file)  # make sure the output file exists and is empty

# Crawl the TOP-100 board page by page (`step` movies per page).
for n in range(0, length, step):
    # Build the page URL: the first page is the bare board URL, the
    # following pages add the '?offset=<n>' query parameter.
    if n == 0:
        url = start
    else:
        url = start + flage + str(n)
    page_no = n // step + 1  # 1-based page counter (the original n/10 produced a float)

    r = requests.get(url, headers=headers)  # fetch the page HTML
    doc = pq(r.text)                        # parse it with PyQuery
    for data in doc.find(node).items():     # iterate over the <dd> movie entries
        # Build a fresh dict per movie.  The original reused a single
        # mutable dict named `dict`, which shadowed the builtin and could
        # carry stale values between iterations.
        record = {
            'index': data.children('i').text(),
            'name': data.find('.name').text(),
            'star': data.find('.star').text(),
            'releasetime': data.find('.releasetime').text(),
            'score': data.find('.score').text(),
        }
        # Append one record per line.  The original source contained a
        # literal line break inside the string (a syntax error); '\n' is
        # what was intended.
        with open(file, 'a', encoding='utf-8') as f:
            f.write(str(record) + '\n')
    print(' %d !' % (page_no))

# time.clock() was removed in Python 3.8; perf_counter() is the replacement.
end = time.perf_counter()
print(" , :%f" % (end - begin))