Pythonで省コードを取得

2639 ワード

import requests
import xml.etree.ElementTree as ET
from xml.parsers.expat import ParserCreate

In [2]:
class DefaultSaxHandler(object):
    """Expat callback holder that collects ``(title, href)`` pairs.

    Each element other than ``map`` that carries both a ``title`` and an
    ``href`` attribute is recorded as a ``(title, href)`` tuple in the list
    handed to the constructor.
    """

    def __init__(self, provinces):
        """Store the list that results are appended to.

        Args:
            provinces: list that receives the ``(title, href)`` tuples. It is
                kept by reference and mutated in place, not copied.
        """
        self.provinces = provinces

    def start_element(self, name, attrs):
        """Handle an opening tag.

        Args:
            name: tag name reported by the parser.
            attrs: mapping of attribute name to attribute value.
        """
        # The enclosing <map> element is only a container; skip it.
        if name == 'map':
            return
        title = attrs.get('title')
        href = attrs.get('href')
        # An element without both attributes describes no link; skip it
        # instead of failing the whole parse with a KeyError.
        if title is None or href is None:
            return
        self.provinces.append((title, href))

    def end_element(self, name):
        """Handle a closing tag; nothing needs to happen here."""
        pass

    def char_data(self, text):
        """Handle character data; text between tags is ignored."""
        pass

In [3]:
def get_province_entry(url, timeout=10):
    """Download a page and extract the links listed in its ``<map>`` element.

    Args:
        url: address of the page to download.
        timeout: seconds to wait for the server before ``requests`` gives up.

    Returns:
        A list of ``(title, href)`` tuples in document order, or an empty
        list when the page contains no ``<map>...</map>`` section.

    Raises:
        requests.RequestException: if the download fails or times out.
        UnicodeDecodeError: if the body is not valid GB2312.
        xml.parsers.expat.ExpatError: if the extracted fragment is not
            well-formed XML.
    """
    # The body is decoded as GB2312, the encoding the original code assumes.
    content = requests.get(url, timeout=timeout).content.decode('gb2312')

    # NOTE(review): both marker strings were empty in the file as found
    # (apparently stripped during extraction), which made the slice below
    # always empty. '<map' / '</map>' are inferred from the handler skipping
    # 'map' elements -- confirm against the live page.
    start_marker = '<map'
    end_marker = '</map>'
    provinces = []
    start = content.find(start_marker)
    if start == -1:
        return provinces
    end = content.find(end_marker, start)
    if end == -1:
        return provinces
    # Keep only the <map>...</map> fragment; the rest of the HTML page is
    # not fed to the XML parser.
    content = content[start:end + len(end_marker)].strip()

    # Wire the SAX-style handler into an expat parser.
    handler = DefaultSaxHandler(provinces)
    parser = ParserCreate()
    parser.StartElementHandler = handler.start_element
    parser.EndElementHandler = handler.end_element
    parser.CharacterDataHandler = handler.char_data
    # isfinal=True tells expat the document is complete, so a truncated
    # fragment is reported as an error instead of being silently accepted.
    parser.Parse(content, True)
    return provinces

In [5]:
# Fetch the province entries from the ip138 postal-code index and show them.
POST_INDEX_URL = 'http://www.ip138.com/post'
provinces = get_province_entry(url=POST_INDEX_URL)
print(provinces)
[('新疆', '/83/'), ('西藏', '/85/'), ('青海', '/81/'), ('甘肃', '/73/'), ('四川', '/61/'), ('云南', '/65/'), ('宁夏', '/75/'), ('内蒙古', '/01/'), ('黑龙江', '/15/'), ('吉林', '/13/'), ('辽宁', '/11/'), ('河北', '/50/'), ('北京', '/10/'), ('天津', '/30/'), ('陕西', '/71/'), ('山西', '/03/'), ('山东', '/25/'), ('河南', '/45/'), ('重庆', '/40/'), ('湖北', '/43/'), ('安徽', '/23/'), ('江苏', '/21/'), ('上海', '/20/'), ('贵州', '/55/'), ('广西', '/53/'), ('湖南', '/41/'), ('江西', '/33/'), ('浙江', '/31/'), ('福建', '/35/'), ('广东', '/51/'), ('海南', '/57/'), ('台湾', '/taiwang/'), ('澳门', '/aomen/'), ('香港', '/xianggang/')]

In [ ]: