pythonで爬虫を勉強します
3634 ワード
, , python2 3 , download , bug, url ,http , 。。。
#!/usr/bin/env python3
import os
import ssl
from html.parser import HTMLParser
import urllib.parse
import urllib.request
import http.client
# Force http.client to send requests as HTTP/1.0 by overriding its private
# version attributes (presumably to avoid IncompleteRead errors on chunked
# HTTP/1.1 responses -- TODO confirm the original motivation).
http.client.HTTPConnection._http_vsn = 10
http.client.HTTPConnection._http_vsn_str = 'HTTP/1.0'
# WARNING: replaces ssl's default HTTPS context factory with the unverified
# one, so HTTPS certificate verification is switched off process-wide for
# code that relies on the default context.
ssl._create_default_https_context = ssl._create_unverified_context
import sys
class pageDeal(HTMLParser):
    """Download one URL to a local file and collect the HTTPS links in it.

    Attributes:
        url: The URL passed to the constructor, unchanged.
        file: Relative local path the page is saved to (derived from the URL).
        data: Set of ``https://`` hrefs found in ``<a>`` tags fed so far.
    """

    def __init__(self, url):
        """Derive the local file path for *url* and create its directory."""
        super().__init__()
        self.url, self.file = self.getUrl(url)
        self.data = set()

    def getUrl(self, url):
        """Map *url* to a local file path and ensure its directory exists.

        The path is ``<host><url path>``; ``index.html`` is appended when the
        URL path has no file extension. If a regular file sits where the
        directory is needed, that file is deleted first.

        Returns:
            The tuple ``(url, filePath)``.
        """
        parsed = urllib.parse.urlparse(url)
        # Drop any "user:password@" prefix and ":port" suffix from the netloc.
        host = parsed.netloc.split('@')[-1].split(':')[0]
        filePath = '%s%s' % (host, parsed.path)
        if not os.path.splitext(parsed.path)[1]:
            filePath = os.path.join(filePath, 'index.html')
        linkDir = os.path.dirname(filePath)
        # NOTE(review): the URL path is used verbatim, so ".." segments in an
        # untrusted link could point outside the working directory.
        # An empty linkDir means "current directory": nothing to create, and
        # os.makedirs('') would raise FileNotFoundError.
        if linkDir and not os.path.isdir(linkDir):
            if os.path.exists(linkDir):
                os.unlink(linkDir)
            os.makedirs(linkDir)
        return url, filePath

    def handle_starttag(self, tag, attrs):
        """Record the href of every ``<a>`` tag that is an https:// URL."""
        if tag != 'a':
            return
        for name, value in attrs:
            # value is None for a bare attribute such as <a href>; a plain
            # substring test would also accept relative links that merely
            # contain "https", which urlretrieve cannot fetch.
            if name == 'href' and value and value.startswith('https://'):
                self.data.add(value)

    def download(self):
        """Fetch ``self.url`` into ``self.file``.

        Returns:
            The ``(filename, headers)`` tuple from ``urlretrieve`` on
            success, or a 1-tuple holding an error message on failure.
        """
        print(self.file, ' ', self.url)
        try:
            ret = urllib.request.urlretrieve(self.url, self.file)
        except (IOError, urllib.request.URLError,
                http.client.HTTPException, ValueError) as e:
            # ValueError: malformed / scheme-less URL.
            # HTTPException: e.g. IncompleteRead while reading the body.
            ret = (('*** ERROR: bad URL %s: %s' % (self.url, e)),)
        return ret

    def parserLink(self):
        """Parse the downloaded file and return the set of collected links.

        Returns ``self.data`` unchanged when the file cannot be read (for
        example because the download failed).
        """
        try:
            # Decode as UTF-8 and replace undecodable bytes so that pages in
            # another encoding do not abort the parse (assumes hrefs are
            # ASCII-compatible).
            with open(self.file, 'r', encoding='utf-8',
                      errors='replace') as f:
                page = f.read()
        except OSError:
            return self.data
        self.feed(page)
        return self.data
class sumInfo(object):
    """Per-crawl state derived from a start URL.

    Attributes:
        q: List seeded with the start URL (presumably the queue of URLs
            still to visit -- nothing in view consumes it yet).
        seen: Initially empty set (presumably URLs already visited).
        dom: Last two dot-separated labels of the start URL's host,
            e.g. ``example.com`` for ``www.example.com``.
    """

    # Class-level counter; not updated anywhere in the visible code.
    count = 0

    def __init__(self, url):
        """Initialise the state from the start *url*."""
        self.q = [url]
        self.seen = set()
        # Use the documented urllib.parse entry point rather than the
        # incidental re-export on urllib.request.
        parsed = urllib.parse.urlparse(url)
        # Drop any "user:password@" prefix and ":port" suffix from the netloc.
        host = parsed.netloc.split('@')[-1].split(':')[0]
        self.dom = '.'.join(host.split('.')[-2:])
def main(url=None):
    """Download one page, then download every HTTPS link found on it.

    Args:
        url: Page to start from. Defaults to the sample Stack Overflow
            question that used to be hard-coded here. (The interactive /
            command-line URL input was already disabled in the original;
            pass ``url`` instead.)
    """
    if url is None:
        url = 'https://stackoverflow.com/questions/36998191/typeerror-not-a-valid-non-string-sequence-or-mapping-object'
    mainPage = pageDeal(url)
    print(mainPage.download())
    inPage = mainPage.parserLink()
    print('URL: %s\nFILE: %s' % (mainPage.url, mainPage.file))
    print(inPage)
    # Only one level deep: the linked pages are saved but not parsed further.
    for iUrl in inPage:
        print(iUrl)
        iPage = pageDeal(iUrl)
        iPage.download()
# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
#ret = urllib.request.urlretrieve('https://stackoverflow.com/questions/36998191/typeerror-not-a-valid-non-string-sequence-or-mapping-object', 'w00444862.flag')