Pythonでクローラー(爬虫)を勉強します

3634 ワード

      ,        ,  python2 3     ,     download       ,      bug,  url       ,http     ,       。。。

#!/usr/bin/env python3


import os
import ssl
from html.parser import HTMLParser
import urllib.parse
import urllib.request
import http.client
# Override the private protocol-version attributes on HTTPConnection so that
# http.client sends requests as HTTP/1.0. This is a class-level change and
# therefore affects every connection made by this process.
# NOTE(review): presumably done to avoid IncompleteRead errors on chunked
# HTTP/1.1 responses — confirm with the author.
http.client.HTTPConnection._http_vsn = 10
http.client.HTTPConnection._http_vsn_str = 'HTTP/1.0'
# Swap the ssl module's default HTTPS context factory for the unverified one,
# so HTTPS requests are made without certificate verification.
# NOTE(review): insecure outside of experiments — confirm this is intended.
ssl._create_default_https_context = ssl._create_unverified_context

import sys


class pageDeal(HTMLParser):
    """Download one URL to a local file and collect the links found in it.

    Attributes:
        url: The URL passed to the constructor, unchanged.
        file: Local path (relative to the working directory) the page is
            saved to, derived from the URL's host and path.
        data: Set of ``href`` values collected from ``<a>`` tags.
    """

    def __init__(self, url):
        HTMLParser.__init__(self)
        self.url, self.file = self.getUrl(url)
        self.data = set()

    def getUrl(self, url):
        """Derive a local file path for *url* and create its directory.

        The path is ``<host><url path>``; when the URL path has no file
        extension, ``index.html`` is appended.  If a non-directory entry
        already occupies the target directory name it is removed first.

        Returns:
            A ``(url, file_path)`` tuple.
        """
        parsed = urllib.parse.urlparse(url)
        # Drop any "user:password@" prefix and ":port" suffix from the netloc.
        host = parsed.netloc.split('@')[-1].split(':')[0]
        filePath = '%s%s' % (host, parsed.path)
        if not os.path.splitext(parsed.path)[1]:
            filePath = os.path.join(filePath, 'index.html')
        # NOTE(review): the URL path is used verbatim, so ".." segments or a
        # host-less URL can point outside the working directory.
        linkDir = os.path.dirname(filePath)
        # dirname is '' for a bare file name; os.makedirs('') would raise.
        if linkDir and not os.path.isdir(linkDir):
            if os.path.exists(linkDir):
                # A plain file is in the way of the directory we need.
                os.unlink(linkDir)
            os.makedirs(linkDir)
        return url, filePath

    def handle_starttag(self, tag, attrs):
        """Record the ``href`` of every ``<a>`` tag whose value contains 'https'."""
        if tag != 'a':
            return
        for name, value in attrs:
            # HTMLParser reports a valueless attribute (``<a href>``) as None.
            if name == 'href' and value and 'https' in value:
                self.data.add(value)

    def download(self):
        """Fetch ``self.url`` into ``self.file``.

        Returns:
            The ``(filename, headers)`` tuple from ``urlretrieve`` on success,
            or a one-element tuple holding an error message on failure.
        """
        print(self.file, '  ', self.url)
        try:
            ret = urllib.request.urlretrieve(self.url, self.file)
        except (OSError, ValueError, http.client.HTTPException) as e:
            # OSError covers URLError/HTTPError; ValueError is raised for an
            # unknown URL type; HTTPException covers protocol-level failures
            # such as IncompleteRead.
            ret = (('*** ERROR: bad URL %s: %s' % (self.url, e)),)
        return ret

    def parserLink(self):
        """Parse the downloaded file and return the set of collected links.

        If the file cannot be opened or read (for example because the download
        failed), the links collected so far are returned unchanged.
        """
        try:
            # Pages are not guaranteed to be valid UTF-8; replace undecodable
            # bytes rather than abort the whole parse.
            with open(self.file, 'r', encoding='utf-8', errors='replace') as f:
                page = f.read()
        except OSError:
            return self.data
        self.feed(page)
        return self.data

class sumInfo(object):
    """Bookkeeping derived from a start URL.

    Attributes:
        q: List initialised with the start URL.
        seen: Initially empty set.
        dom: The last two dot-separated labels of the start URL's host
            (e.g. ``example.com`` for ``www.example.com``).
    """
    # NOTE(review): not updated anywhere in the visible code — presumably a
    # running total; confirm against callers.
    count = 0

    def __init__(self, url):
        self.q = [url]
        self.seen = set()
        # Use the documented urllib.parse entry point rather than the name
        # that urllib.request happens to re-export.
        parsed = urllib.parse.urlparse(url)
        # Drop any "user:password@" prefix and ":port" suffix from the netloc.
        host = parsed.netloc.split('@')[-1].split(':')[0]
        self.dom = '.'.join(host.split('.')[-2:])

def main():
    """Download a fixed start page, then download every link found in it."""
    # Disabled command-line / interactive URL selection, kept for reference:
    # if len(sys.argv) > 1:
    #     url = sys.argv[1]
    # else:
    #     try:
    #         url = input('input URL : ')
    #     except (KeyboardInterrupt, EOFError):
    #         url = ''
    # if not url:
    #     print('input not satify,exit')
    #     return
    # if not url.startswith('http://') and not url.startswith('ftp://'):
    #     url = 'http://%s' % url
    #url = 'http://tieba.baidu.com/p/2256306796'
    url = 'https://stackoverflow.com/questions/36998191/typeerror-not-a-valid-non-string-sequence-or-mapping-object'
    mainPage = pageDeal(url)
    print(mainPage.download())
    inPage = mainPage.parserLink()
    print('URL:  %s\nFILE: %s' % (mainPage.url, mainPage.file))
    print(inPage)
    # Fetch each collected link; the linked pages themselves are not parsed.
    for iUrl in inPage:
        print(iUrl)
        iPage = pageDeal(iUrl)
        iPage.download()
        #print('URL: %s\nFILE: %s' % (iPage.url, iPage.file))


if __name__ == '__main__':
    main()
    #ret = urllib.request.urlretrieve('https://stackoverflow.com/questions/36998191/typeerror-not-a-valid-non-string-sequence-or-mapping-object', 'w00444862.flag')