Pythonでテキストファイルからアクセス可能なURLをランダムにN行取り出す

9594 ワード

仕事で、あるテキストファイルに含まれるURLが正常にアクセスできるかどうかを判定し、正常にアクセスできるURLの中からランダムにN行を取得する必要がありました。具体的な実装は次のとおりです。
 1 import urllib2,random
 2 from sets import Set
 3 
 4 def get_responses(url):
 5     global good_list
 6     global bad_list
 7     if not url.startswith("http:"):
 8         http_url = "http://" + url
 9     headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 5.1; rv:10.0.1) Gecko/20100101 Firefox/10.0.1',}
10     try:
11         request = urllib2.Request(http_url, headers=headers)
12         resp = urllib2.urlopen(request)
13         print url
14     except urllib2.URLError, e:
15         print e
16         bad_list.append(url)
17         return 0
18 
19     retcode = resp.getcode()
20     if retcode == 200:
21         good_list.append(url)
22         #return 1
23     else:
24         bad_list.append(url)
25         #return 0
26 
def readFile(path=r'C:\Users\888\Desktop\urls.txt'):
    """Check every URL listed in ``path`` and print a summary.

    Each non-blank line is stripped of surrounding whitespace and passed to
    ``get_responses``, which sorts it into the module-level ``good_list`` or
    ``bad_list``. If the file cannot be opened, a message is printed and
    nothing else happens.

    Args:
        path: Text file with one URL per line.
    """
    try:
        urllist = open(path, 'r')
    except IOError:
        print("file does not exist.\n")
        # Stop here: there is no file object to iterate over.
        return

    # The with-block closes the file even if a check raises.
    with urllist:
        for item in urllist:
            item = item.strip()
            if item:  # skip blank lines instead of probing "http://"
                get_responses(item)

    print("Total URLs: %d, Good URLs:%d, Bad URLs: %d." % (
        len(good_list) + len(bad_list), len(good_list), len(bad_list)))


def writeFile(linenum, path=r'C:\Users\888\Desktop\goodurl.txt'):
    """Write randomly chosen, distinct good URLs to ``path``, one per line.

    At most ``linenum`` URLs are written. When ``good_list`` holds fewer
    distinct entries than requested, all of them are written.

    Args:
        linenum: Number of URLs wanted; converted with ``int()``.
        path: Output file; it is overwritten.
    """
    # Sample from the de-duplicated URLs in one step. Drawing random indexes
    # until enough distinct URLs turned up never terminated when fewer than
    # ``linenum`` distinct URLs existed, and failed on an empty list.
    unique_urls = sorted(set(good_list))
    wanted = max(0, min(int(linenum), len(unique_urls)))
    result = random.sample(unique_urls, wanted)

    # Put the good URLs in the output file.
    try:
        goodurl = open(path, 'w')
    except IOError as e:
        print("cannot open %s for writing: %s" % (path, e))
        return

    with goodurl:
        goodurl.writelines(item + '\n' for item in result)

    print("The mission is done, Please check the goodurl.txt file")


if __name__ == "__main__":
    good_list = []
    bad_list = []
    readFile()
    writeFile(150)