Python Webクローラー実践:西刺(Xici)の無料プロキシIPをリアルタイムで取得する
9764 ワード
ネット上の達人のサンプルプログラムを参考に、マルチスレッド技術を利用して実装しました。Pythonのバージョンは2.7です。
#-*-coding:utf8-*-
import urllib2
import re
import threading
import time
# Shared module-level result lists.
# rawProxyList: proxies collected before validation (read by the __main__
# driver; presumably filled by the fetcher threads -- confirm).
rawProxyList = []
# checkedProxyList: (ip, port, address, time_used) tuples appended by the
# checker for proxies that passed validation.
checkedProxyList = []

# Listing pages to scrape: pages 1 through 5 of the proxy index.
targets = [r"http://www.xici.net.co/nn/%d" % page for page in range(1, 6)]
# print targets
#
p = re.compile(r''' 1:
checkedProxyList.append((proxy[0],proxy[1],proxy[2],timeused))
else:
continue
except Exception,e:
continue
def run(self):
    """Run this object's proxy check by delegating to ``checkProxy()``.

    NOTE(review): the enclosing class header is not visible in this part
    of the file; presumably this is a ``threading.Thread`` subclass whose
    ``start()`` invokes this method -- confirm.
    """
    self.checkProxy()
if __name__ == "__main__":
    # Phase 1: one fetcher thread per listing page. ProxyGet is defined
    # elsewhere in this file; presumably it fills rawProxyList -- confirm.
    getThreads = [ProxyGet(target) for target in targets]
    for fetcher in getThreads:
        fetcher.start()
    for fetcher in getThreads:
        fetcher.join()

    # Report how many raw proxies were collected.
    print('.' * 10 + " %s " % len(rawProxyList) + '.' * 10)

    # Phase 2: split the raw proxies into 20 contiguous slices and hand each
    # slice to its own ProxyCheck thread.
    checkerCount = 20
    # Ceiling division so that every proxy lands in some slice; trailing
    # slices are simply empty when there are fewer proxies than checkers.
    sliceSize = (len(rawProxyList) + checkerCount - 1) // checkerCount
    checkThreads = [
        ProxyCheck(rawProxyList[sliceSize * i:sliceSize * (i + 1)])
        for i in range(checkerCount)
    ]
    for checker in checkThreads:
        checker.start()
    for checker in checkThreads:
        checker.join()

    # Report how many proxies passed validation.
    print('.' * 10 + " %s " % len(checkedProxyList) + '.' * 10)

    # Phase 3: persist the validated proxies, fastest first. Index 3 of each
    # entry is the measured time; the `with` block guarantees the file is
    # closed even if a write fails.
    with open("proxy_list.txt", 'w+') as f:
        for proxy in sorted(checkedProxyList, key=lambda entry: entry[3]):
            fields = (proxy[0], proxy[1], proxy[2], proxy[3])
            print("checked proxy is: %s:%s\t%s\t%s" % fields)
            f.write("%s:%s\t%s\t%s\n" % fields)