Pythonクローラー実践:pyqueryを利用して猫眼映画TOP 100ランキングの内容を取得する-1


前回はbeautifulsoupを使って猫眼映画TOP 10を取得しましたが、今回は最近学んだpyqueryを使って、もう一度実践してみました。bs4より使いやすいと感じます。
コードを以下に共有します。ご意見・ご質問を歓迎します。
from pyquery import PyQuery as pq
import requests
import os 
import time

# Start-of-run timestamp for the elapsed-time report printed at the end.
# time.clock() was removed in Python 3.8; use it only where it still exists
# and fall back to time.perf_counter() everywhere else.
begin = getattr(time, 'clock', time.perf_counter)()

# Raw string: '\p', '\s' and '\ ' are invalid escape sequences in a normal
# literal (a warning today, an error in a future Python). The value is
# byte-identical to the original literal.
# NOTE(review): the path ends in two spaces - it looks like a folder name was
# lost here; confirm the intended directory.
file_path = r'D:\python3.6\scrapy\  '   # output directory
file_name = 'maoyan.txt'   # output file name
file = file_path+'\\'+file_name     # full path of the output file

# Browser-like User-Agent sent with every request.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}

start = "http://maoyan.com/board/4"  # URL of the first ranking page
flage='?offset='   # query-string prefix appended to `start` for later pages
node='dd'  # tag selected from each downloaded page
step=10  # offset increment between consecutive pages
length=100  # exclusive upper bound of the offset range


def create_file(file_path,file):
    """Make sure the output directory exists and leave ``file`` empty.

    Args:
        file_path: Directory that should hold the output file. It is created,
            together with any missing parent directories, when absent.
        file: Full path of the output file. It is created if missing and
            truncated to zero length if it already exists.

    Raises:
        OSError: If the directory or the file cannot be created.
    """
    if not os.path.exists(file_path):
        os.makedirs(file_path)

    # Mode 'w' creates the file or truncates an existing one, so no explicit
    # seek()/truncate() is required. The with-block closes the handle on
    # every path (the original leaked it when the directory was new).
    with open(file, 'w', encoding='utf-8'):
        pass

# Start every run from an empty output file.
create_file(file_path,file)

# Keep one append handle open for the whole run instead of reopening the
# file for every single record.
with open(file, 'a', encoding='utf-8') as f:
    for n in range(0,length,step):  # n is the offset of the current page
        # The first page is the bare board URL; later pages add ?offset=n.
        url = start if n == 0 else start+flage+str(n)
        # 1-based page number for the progress message. Derived from `step`
        # (not a hard-coded 10) and kept an int via floor division.
        i = n//step + 1

        # A timeout keeps a stalled connection from hanging the script forever.
        r = requests.get(url, headers=headers, timeout=10)
        doc = pq(r.text)  # parse the downloaded HTML with pyquery
        page = doc.find(node)  # every `node` element on the page

        for data in page.items():  # one element per ranking entry
            # A fresh mapping per entry; key order matches the original
            # output format.
            record = {
                'index': data.children('i').text(),
                'name': data.find('.name').text(),
                'star': data.find('.star').text(),
                'releasetime': data.find('.releasetime').text(),
                'score': data.find('.score').text(),
            }
            # One entry per line, written as the dict's text representation.
            f.write(str(record)+'\n')

        print('Page %d done!'%(i))

# time.clock() was removed in Python 3.8; use it where it still exists so the
# reading stays comparable with `begin`, otherwise use time.perf_counter().
end = getattr(time, 'clock', time.perf_counter)()
print("All pages done, elapsed: %f s"%(end-begin))