Pythonを使ってCSDNブログのアクセス数をスクレイピングで取得します。

1982 ワード

# -*- coding: utf-8 -*-


import urllib2
import re

#         
# Number of the article-list page to request next; incremented once per pass.
page_num = 1
# Loop condition: starts truthy, then holds the "next page" regex matches
# found on the most recently fetched page (an empty list ends the loop).
notLast = 1

account = "mrzhang628"

# Root URL of the account's blog; list pages live under /article/list/<n>.
baseUrl = 'http://blog.csdn.net/'+account

# Every request carries a browser-style User-Agent header (presumably because
# the site rejects the default urllib2 agent -- TODO confirm). The headers do
# not change between pages, so they are built once, outside the loop.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; windows NT)'
headers = {'User-Agent': user_agent}

# Patterns are compiled once instead of on every pass. re.S lets '.' span
# newlines inside the matched markup.
# NOTE(review): the runs of spaces inside the view-count and next-page
# patterns look like placeholders for link text that was lost; confirm them
# against the real page markup before relying on the matches.
title_pattern = re.compile(
    '<span class="link_title"><a href=".*?">(.*?)</a></span>', re.S)
view_pattern = re.compile(
    r'<span class="link_view".*?><a href=".*?" title="    ">  </a>\((.*?)\)</span>',
    re.S)
next_page_pattern = re.compile('<a href=".*?">  </a>', re.S)

while notLast:
    # Address of the list page for this pass.
    myUrl = baseUrl+'/article/list/' + str(page_num)

    # Request the numbered list page. Requesting baseUrl here would return
    # the same page on every pass and the loop would never advance.
    req = urllib2.Request(myUrl, headers=headers)

    # Errors from urlopen/read propagate to the caller unchanged; the
    # response is closed even when read() fails.
    myResponse = urllib2.urlopen(req)
    try:
        myPage = myResponse.read()
    finally:
        myResponse.close()

    print('----------------------------- %d ---------------------------------' % (page_num,))

    # Captured article titles, with surrounding whitespace removed.
    titleList = [raw_title.strip() for raw_title in title_pattern.findall(myPage)]

    # Captured view counts (the text between the parentheses), trimmed.
    viewList = [raw_views.strip() for raw_views in view_pattern.findall(myPage)]

    # Positions of the view counts found on this page, printed before the rows.
    index = list(range(len(viewList)))
    print(index)

    # One row per article: view count zero-padded to 4 characters, then title.
    # zip stops at the shorter list, so a page where the two patterns match a
    # different number of times no longer raises IndexError.
    for view_count, title in zip(viewList, titleList):
        print('   :%s   :%s' % (view_count.zfill(4), title))

    # Move on to the next list page.
    page_num = page_num + 1

    # Keep looping only while the page still contains a next-page link.
    notLast = next_page_pattern.findall(myPage)
問題点や不明な点がありましたら、ぜひご指摘ください。一緒に学んでいきましょう。ありがとうございます。