Python 3で猫眼映画Top 100の情報をスクレイピングする（正規表現+requests）
import json
from multiprocessing.dummy import Pool
import requests
import re
def get_one_page(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. It is
            forwarded to ``requests.get``; without it a stalled connection
            would block the calling worker indefinitely.

    Returns:
        The decoded body of the response (``response.text``). The HTTP
        status code is not inspected, so an error page is returned as-is.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed (connection failure, timeout, ...).
    """
    # Send a desktop-browser User-Agent instead of the library default.
    # NOTE(review): presumably the site rejects the default client
    # identifier -- confirm before removing.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text
# One <dd>...</dd> entry of the board page, capturing six fields in order:
# rank, title, cast line, release line, integer part and fractional part of
# the score.
# NOTE(review): the HTML tags in the original pattern were lost; the
# ``class="name"`` / ``class="star"`` anchors are reconstructed from the
# fields the function emits -- confirm against the live page markup.
_BOARD_ITEM_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(.*?)</i>'
    r'.*?class="name".*?<a.*?>(.*?)</a>'
    r'.*?class="star">(.*?)</p>'
    r'.*?releasetime">(.*?)</p>'
    r'.*?integer">(.*?)</i>'
    r'.*?fraction">(.*?)</i>'
    r'.*?</dd>',
    re.S,  # entries span several lines, so '.' must also match newlines
)


def parse_one_page(html):
    """Yield one dict per film entry found in *html*.

    Args:
        html: Text of a board page as returned by ``get_one_page``.

    Yields:
        Dicts with the string-valued keys ``index``, ``title``, ``actor``,
        ``time`` and ``score``. ``score`` is the integer and fractional
        parts of the rating concatenated (e.g. ``'9.'`` + ``'6'``).
        Nothing is yielded when no entry matches.
    """
    for index, title, actor, release_time, integer, fraction in (
            _BOARD_ITEM_PATTERN.findall(html)):
        yield {
            'index': index,
            # Title and cast are surrounded by the page's indentation and
            # line breaks; trim them so only the text remains.
            'title': title.strip(),
            'actor': actor.strip(),
            'time': release_time,
            'score': integer + fraction,
        }
def write_to_file(content):
    """Append *content* to ``./output.txt`` as one JSON document per line.

    Args:
        content: A JSON-serialisable object (the callers in this file pass
            the dicts produced by ``parse_one_page``).
    """
    # Mode 'a' appends, so earlier records are kept; the file is written as
    # UTF-8 regardless of the platform default encoding.
    with open('./output.txt', 'a', encoding='utf-8') as output_file:
        # ensure_ascii=False stores non-ASCII text verbatim. With the
        # default (True) it would be written as escapes such as
        # \u9b42\u65ad\u84dd\u6865 instead of readable characters.
        output_file.write(json.dumps(content, ensure_ascii=False) + '\n')
def read_from_file(file):
    """Load the JSON-lines file at *file* into a list.

    Args:
        file: Path of a UTF-8 text file holding one JSON document per line.

    Returns:
        A list with the decoded value of every line that could be parsed,
        in file order. Blank lines are skipped; for any other line that
        cannot be parsed, ``'something wrong'`` is printed and the line is
        left out.
    """
    data = []
    with open(file, 'r', encoding='utf-8') as lines:
        for line in lines:
            text = line.rstrip('\n')
            if not text.strip():
                continue
            try:
                # Parse the line untouched first: rewriting quotes up front
                # would corrupt valid JSON that contains an apostrophe.
                record = json.loads(text)
            except ValueError:
                try:
                    # Fallback for lines written with single quotes
                    # (e.g. a dict repr) rather than real JSON.
                    record = json.loads(text.replace('\'', '\"'))
                except ValueError:
                    print('something wrong')
                    continue
            data.append(record)
    return data
def main(offset):
    """Fetch one board page and append each parsed entry to the output file.

    Args:
        offset: Integer placed in the ``offset`` query parameter of the
            board URL to select which page is requested.
    """
    url = 'http://maoyan.com/board/4?offset=%d' % offset
    html = get_one_page(url)
    # parse_one_page is a generator, so entries are written one at a time
    # as they are matched.
    for item in parse_one_page(html):
        write_to_file(item)
if __name__ == '__main__':
    # Ten pages are requested, with offsets 0, 10, ..., 90. A plain loop
    # (``for i in range(10): main(i * 10)``) would fetch them one after
    # another; the thread pool runs main() for several offsets at once, so
    # records reach output.txt in completion order and their 'index' values
    # are not sorted.
    # The context manager shuts the pool down once map() has returned.
    with Pool() as pool:
        pool.map(main, [i * 10 for i in range(10)])
    # Read back everything in the output file (write_to_file appends, so
    # this includes records from earlier runs) and show it.
    data = read_from_file('./output.txt')
    print(data)
出力結果:
{"index": "21", "title": " ", "actor": " : · , · · , · ", "time": " :1998-10-28( )", "score": "9.2"}
{"index": "22", "title": " 3: ", "actor": " : · , · , · ", "time": " :2004-03-15", "score": "9.2"}
{"index": "2", "title": " ", "actor": " : · , · , · ", "time": " :1994-10-14( )", "score": "9.5"}
{"index": "11", "title": " ", "actor": " : , , ", "time": " :1999-02-13( )", "score": "9.2"}
{"index": "12", "title": " ", "actor": " : · , · , · ", "time": " :1939-12-15( )", "score": "9.1"}
.
.
.
{"index": "97", "title": " ", "actor": " : , , ", "time": " :2011-02-17( )", "score": "9.0"}
{"index": "98", "title": " ", "actor": " : · , · · , · ", "time": " :1966-12-23( )", "score": "8.9"}
{"index": "99", "title": " ", "actor": " : · , · ", "time": " :2001-12-12( )", "score": "9.1"}
{"index": "100", "title": " ", "actor": " : , , ", "time": " :2017-11-17", "score": "9.2"}