Python 3 で猫眼映画 Top 100 の情報をスクレイピングする(正規表現 + requests)


import json
from multiprocessing.dummy import Pool
import requests
import re

def get_one_page(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would block a
            worker thread forever on a stalled connection.

    Returns:
        The decoded body of the HTTP response (``response.text``).

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    # Send a browser-like User-Agent with the request.
    # NOTE(review): presumably the site rejects the default requests
    # User-Agent -- confirm.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text

# One <dd>...</dd> element per movie; six capture groups in this order:
# rank, title, cast line, release line, integer part and fractional part of
# the score.
# NOTE(review): the tags inside this pattern were reconstructed (the copy this
# file came from had its markup stripped) -- verify against the live page.
_MOVIE_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(.*?)</i>'
    r'.*?name"><a.*?>(.*?)</a>'
    r'.*?star">(.*?)</p>'
    r'.*?releasetime">(.*?)</p>'
    r'.*?integer">(.*?)</i>'
    r'.*?fraction">(.*?)</i>'
    r'.*?</dd>',
    re.S,  # let '.' span the line breaks inside each <dd> element
)


def parse_one_page(html):
    """Yield one dict per movie entry found in *html*.

    Args:
        html: Markup of a board page as returned by ``get_one_page``.

    Yields:
        Dicts with the string fields ``index``, ``title``, ``actor``,
        ``time`` and ``score``. ``score`` is the integer and fractional
        parts concatenated (e.g. ``'9.'`` + ``'6'``). Nothing is yielded
        when the pattern does not match.
    """
    for index, title, actor, time, integer, fraction in _MOVIE_PATTERN.findall(html):
        yield {
            'index': index,
            # The title and cast text are surrounded by spaces and line
            # breaks in the markup.
            'title': title.strip(),
            'actor': actor.strip(),
            'time': time,
            'score': integer + fraction,
        }


def write_to_file(content, path='./output.txt'):
    """Append *content* to *path* as one JSON document per line.

    Args:
        content: JSON-serialisable object (here: one movie dict).
        path: Output file; defaults to ``./output.txt``.
    """
    # Append mode: main() calls this once per movie and must not overwrite
    # earlier entries.
    with open(path, 'a', encoding='utf-8') as output_file:
        # ensure_ascii=False keeps non-ASCII text readable in the file
        # instead of \uXXXX escapes.
        output_file.write(json.dumps(content, ensure_ascii=False) + '\n')


def _decode_line(line):
    """Decode one stored line, tolerating single-quoted legacy records."""
    try:
        return json.loads(line)
    except json.JSONDecodeError:
        # Fallback only: swapping quotes unconditionally would corrupt
        # valid JSON whose strings contain an apostrophe.
        return json.loads(line.replace('\'', '\"'))


def read_from_file(file):
    """Read back the records written by ``write_to_file``.

    Args:
        file: Path of a UTF-8 text file holding one JSON document per line.

    Returns:
        List of decoded records in file order. Blank lines are skipped;
        lines that cannot be decoded are reported on stdout and skipped.
    """
    data = []
    with open(file, 'r', encoding='utf-8') as lines:
        for line in lines:
            line = line.rstrip('\n')
            if not line:
                continue
            try:
                data.append(_decode_line(line))
            except json.JSONDecodeError:
                print('something wrong')
    return data


def main(offset):
    """Fetch the board page starting at *offset* and store every movie on it.

    Args:
        offset: Value substituted into the ``offset`` query parameter of
            the board URL.
    """
    url = 'http://maoyan.com/board/4?offset=%d' % offset
    html = get_one_page(url)
    for item in parse_one_page(html):
        write_to_file(item)


if __name__ == '__main__':
    # Fetch the ten pages (offsets 0, 10, ..., 90) on a thread pool. Pages
    # finish in arbitrary order, so the output file is not sorted by index.
    with Pool() as pool:
        pool.map(main, [i * 10 for i in range(10)])
    # Read the output file back and show what was collected.
    data = read_from_file('./output.txt')
    print(data)

出力結果:
{"index": "21", "title": "     ", "actor": "  :  ·  ,   ·  ·  ,  ·  ", "time": "    :1998-10-28(   )", "score": "9.2"}
{"index": "22", "title": "   3:    ", "actor": "  :   ·  ,  ·    ,  ·  ", "time": "    :2004-03-15", "score": "9.2"}
{"index": "2", "title": "      ", "actor": "  :  ·   ,  ·   ,  ·  ", "time": "    :1994-10-14(  )", "score": "9.5"}
{"index": "11", "title": "    ", "actor": "  :   ,   ,   ", "time": "    :1999-02-13(    )", "score": "9.2"}
{"index": "12", "title": "    ", "actor": "  :  · ,   ·  ,    ·    ", "time": "    :1939-12-15(  )", "score": "9.1"}

.
.
.
{"index": "97", "title": "   ", "actor": "  :   ,   ,   ", "time": "    :2011-02-17(  )", "score": "9.0"}
{"index": "98", "title": "     ", "actor": "  :   ·     , · ·   ,  ·   ", "time": "    :1966-12-23(   )", "score": "8.9"}
{"index": "99", "title": "    ", "actor": "  :  ·  ,   ·   ", "time": "    :2001-12-12(  )", "score": "9.1"}
{"index": "100", "title": "    ", "actor": "  :  ,   ,   ", "time": "    :2017-11-17", "score": "9.2"}