Python ウェブクローラー実践 ― 西刺(Xici)の無料プロキシ IP をリアルタイムで取得する

9764 ワード

ネット上の達人のサンプルプログラムを参考に、マルチスレッドを利用して実装しました。Python のバージョンは 2.7 です。
#-*-coding:utf8-*-

import urllib2
import re
import threading
import time

# Module-level lists shared across the script: rawProxyList is sliced into
# per-thread chunks by the driver code, checkedProxyList receives
# (field0, field1, field2, timeused) tuples from the checker.
rawProxyList = []
checkedProxyList = []

# URLs of listing pages 1 through 5 that will be fetched.
targets = [r"http://www.xici.net.co/nn/%d" % page for page in range(1, 6)]
# print targets

#  
p = re.compile(r''' 1:
                    checkedProxyList.append((proxy[0],proxy[1],proxy[2],timeused))
                else:
                    continue
            except Exception,e:
                continue

    def run(self):
        self.checkProxy()

def split_proxies(proxies, parts=20):
    """Split ``proxies`` into at most ``parts`` consecutive, non-empty slices.

    Each slice holds ``ceil(len(proxies) / parts)`` items (the last one may be
    shorter), so concatenating the slices in order yields ``proxies`` again.

    Args:
        proxies: Sequence supporting ``len()`` and slicing.
        parts: Maximum number of slices to produce; must be positive.

    Returns:
        A list of slices of ``proxies``; empty when ``proxies`` is empty.

    Raises:
        ValueError: If ``parts`` is not positive.
    """
    if parts <= 0:
        raise ValueError("parts must be positive")
    # Ceiling division; ``//`` keeps this an integer on Python 2 and 3 alike.
    size = (len(proxies) + parts - 1) // parts
    if size == 0:
        return []
    return [proxies[start:start + size]
            for start in range(0, len(proxies), size)]


def main():
    """Fetch proxies from every target page, check them, and save the result.

    Runs one ProxyGet thread per entry in ``targets``, then spreads
    ``rawProxyList`` over up to 20 ProxyCheck threads, and finally writes the
    entries of ``checkedProxyList`` to ``proxy_list.txt`` ordered by their
    fourth field (the elapsed time recorded by the checker).
    """
    # One fetch thread per listing page; wait for all of them to finish.
    get_threads = [ProxyGet(target) for target in targets]
    for thread in get_threads:
        thread.start()
    for thread in get_threads:
        thread.join()

    print('.' * 10 + "     %s   " % len(rawProxyList) + '.' * 10)

    # Only non-empty slices get a checker thread, so no idle threads are
    # spawned when fewer than 20 slices are needed.
    check_threads = [ProxyCheck(chunk) for chunk in split_proxies(rawProxyList)]
    for thread in check_threads:
        thread.start()
    for thread in check_threads:
        thread.join()

    print('.' * 10 + "   %s       " % len(checkedProxyList) + '.' * 10)

    # Persist the checked proxies, fastest first; ``with`` guarantees the file
    # is closed even if a write fails.
    with open("proxy_list.txt", 'w+') as f:
        for proxy in sorted(checkedProxyList, key=lambda entry: entry[3]):
            print("checked proxy is: %s:%s\t%s\t%s" % (proxy[0], proxy[1], proxy[2], proxy[3]))
            f.write("%s:%s\t%s\t%s\n" % (proxy[0], proxy[1], proxy[2], proxy[3]))


if __name__ == "__main__":
    main()