Pythonを使ってCSDNブログの閲覧数(アクセス数)をクロールして取得します。
1982 ワード
# -*- coding: utf-8 -*-
import urllib2
import re
#
page_num = 1
# ,
notLast = 1
account = "mrzhang628"
#
baseUrl = 'http://blog.csdn.net/'+account
while notLast:
# ,
myUrl = baseUrl+'/article/list/' + str(page_num)
# , csdn
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; windows NT)'
headers = {'User-Agent':user_agent}
#
req = urllib2.Request(baseUrl,headers=headers)
#request = urllib2.Request(myUrl);
#request.add_header(headers);
#print("req = ",str(req));
#
try:
myResponse = urllib2.urlopen(req) #timeout
except Exception, e:
raise
else:
pass #Python pass , 。
finally:
pass
myPage = myResponse.read()
#print("notLast = " ,str(notLast));
print '----------------------------- %d ---------------------------------' % (page_num,)
#
titles = re.findall('<span class="link_title"><a href=".*?">(.*?)</a></span>',myPage,re.S)
titleList=[] # [ ] 。 python
for items in titles:
titleList.append(str(items).lstrip().rstrip())#list.append(obj):
#
views = re.findall('<span class="link_view".*?><a href=".*?" title=" "> </a>\((.*?)\)</span>',myPage,re.S)
viewList=[]
for items in views:
viewList.append(str(items).lstrip().rstrip())
#
index = []
count = 0
while (count < len(viewList)):
index.insert(count,count)
count = count + 1
print(index)
#
for n in index: #range(len(titleList)):
print ' :%s :%s' % (viewList[n].zfill(4),titleList[n])
# 1
page_num = page_num + 1
# ‘ ’
notLast = re.findall('<a href=".*?"> </a>',myPage,re.S)
問題や不明な点があれば、お気軽にご指摘ください。一緒に学んでいきましょう。ありがとうございます。