Pythonでテキストファイルからランダムに N 行のデータを読み出す
9594 ワード
仕事で、あるテキストファイルに含まれる URL が正常にアクセスできるかどうかを判定し、正常にアクセスできた URL の中からランダムに N 行分を取得する必要がありました。具体的な実装は次のとおりです。
1 import urllib2,random
2 from sets import Set
3
def get_responses(url):
    """Request *url* once and record it in ``good_list`` or ``bad_list``.

    The entry is appended to the module-level ``good_list`` when the
    response status code is 200, and to ``bad_list`` when the request
    raises ``urllib2.URLError`` or answers with any other status code.
    The value appended is always the original *url* text, not the
    scheme-qualified form used for the request.

    Args:
        url: One URL entry, with or without a leading ``http://`` or
            ``https://`` scheme.

    Returns:
        0 when the request raised ``urllib2.URLError``; otherwise None.
    """
    # Entries that already carry a scheme are requested unchanged; only bare
    # host names get "http://" prepended. Previously http_url was left
    # unassigned for entries starting with "http:", which raised
    # UnboundLocalError, and "https://" entries were double-prefixed.
    if url.startswith(("http://", "https://")):
        http_url = url
    else:
        http_url = "http://" + url

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 5.1; rv:10.0.1) Gecko/20100101 Firefox/10.0.1',}
    try:
        request = urllib2.Request(http_url, headers=headers)
        resp = urllib2.urlopen(request)
    except urllib2.URLError as e:
        print(e)
        bad_list.append(url)
        return 0

    print(url)
    try:
        retcode = resp.getcode()
    finally:
        # Release the connection; only the status code is needed.
        resp.close()

    if retcode == 200:
        good_list.append(url)
    else:
        bad_list.append(url)
26
def readFile(path=r'C:\Users\888\Desktop\urls.txt'):
    """Probe every URL listed in a text file and print a summary.

    Each non-blank line of the file is passed to ``get_responses``, which
    sorts it into the module-level ``good_list`` / ``bad_list``. When the
    file cannot be opened a message is printed and the function returns
    without probing anything.

    Args:
        path: Text file with one URL per line. Defaults to the location
            that used to be hard-coded in this function.
    """
    try:
        urllist = open(path, 'r')
    except IOError:
        print("file does not exist.\n")
        # Nothing to iterate; falling through would hit an unbound name.
        return

    # The with-block closes the file even if a probe raises.
    with urllist:
        for item in urllist:
            item = item.strip()
            if not item:
                # Blank lines are not URLs; do not count them as bad.
                continue
            get_responses(item)

    print("Total URLs: %d, Good URLs:%d, Bad URLs: %d." % ((len(good_list) + len(bad_list)), len(good_list), len(bad_list)))
38
def writeFile(linenum, urls=None, path=r'C:\Users\888\Desktop\goodurl.txt'):
    """Write up to *linenum* distinct, randomly chosen URLs to a file.

    Duplicates in the input are collapsed first. When fewer distinct URLs
    are available than requested, all of them are written instead of
    looping forever; an empty input produces an empty file.

    Args:
        linenum: Number of URLs wanted; converted with ``int()``.
        urls: Candidate URLs. Defaults to the module-level ``good_list``.
        path: Output file, one URL per line. Defaults to the location
            that used to be hard-coded in this function.
    """
    if urls is None:
        urls = good_list

    # Deduplicate once, then sample without replacement; the count is
    # clamped so the request can never exceed what is available.
    unique_urls = list(set(urls))
    wanted = max(0, min(int(linenum), len(unique_urls)))
    result = random.sample(unique_urls, wanted)

    # Put the selected URLs in the output file.
    try:
        goodurl = open(path, 'w+')
    except IOError:
        print("file does not exist.\n")
        # No file handle to write to.
        return

    with goodurl:
        goodurl.writelines(item + '\n' for item in result)

    print("The mission is done, Please check the goodurl.txt file")
58
# Result lists shared by get_responses, readFile and writeFile. They live at
# module scope so the functions also work when this file is imported rather
# than run as a script.
good_list = []
bad_list = []

if __name__ == "__main__":
    # Probe every URL in the input file, then write 150 random good ones.
    readFile()
    writeFile(150)