Python web scraping in practice: the Runoob Python 100 exercises, the Maoyan Top 100 movies, and Toutiao image galleries.


1. The Runoob Python tutorial's 100 exercises. Code (1):
#!/usr/bin/python
# coding:utf-8
import json
import re
from bs4 import BeautifulSoup
import requests
from requests.exceptions import RequestException


def get_one_page(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            response.encoding = 'utf-8'
            return response.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    datalist = []
    # NOTE: the HTML tags inside the original regular expressions were lost
    # when the post was extracted; the tags below are a plausible
    # reconstruction of the runoob exercise-page markup (title in <h1>,
    # body text in <p> tags).
    results = re.findall('<h1>(.*?)</h1>.*?<p>(.*?)</p>', html, re.S)
    for i in results:
        for j in i:
            datalist.append(j)
    pattern = re.compile('<p>(.*?)</p>', re.S)
    results = re.findall(pattern, html)
    for result in results:
        soup = BeautifulSoup(result, 'lxml')
        datalist.append(soup.get_text())
    soup = BeautifulSoup(html, 'lxml')
    for pre in soup.select('pre')[0:1]:
        datalist.append(pre.get_text())
    return datalist


def write_to_file(content):
    with open('result.txt', 'a', encoding='utf-8') as f:
        for i in content:
            f.write(i)
            f.write('\n')


def main(offset):
    url = 'http://www.runoob.com/python/python-exercise-example' + str(offset) + '.html'
    html = get_one_page(url)
    data = parse_one_page(html)
    write_to_file(data)


if __name__ == '__main__':
    for i in range(1, 101):
        main(offset=i)
        print("Example " + str(i) + " finished......")
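
The regex patterns in parse_one_page are brittle: any change to the runoob markup breaks them. A selector-based alternative is sketched below; it is not from the original post, the helper name parse_one_page_with_selectors is hypothetical, and it assumes the page keeps the #content layout that code (2) below relies on.

from bs4 import BeautifulSoup


def parse_one_page_with_selectors(html):
    # Assumes the exercise page keeps its "#content" container, with the
    # statement/analysis text in <p> tags and the sample code in a <pre> tag.
    soup = BeautifulSoup(html, 'lxml')
    content = soup.select_one('#content')
    if content is None:
        return []
    datalist = [p.get_text(strip=True) for p in content.select('p')]
    pre = content.select_one('pre')
    if pre is not None:
        datalist.append(pre.get_text())
    return datalist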

Code (2):
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from lxml import etree
def get_page(url):
    try:
        res = requests.get(url,timeout=4)
        res.encoding = 'utf-8'
        if res.status_code == 200:
            html = res.text
            return html.encode("utf-8")
    except Exception as e:
        for i in range(3):
            print(url,e)
            res = requests.get(url,timeout=4)
            res.encoding = 'utf-8'
            if res.status_code == 200:
                html = res.text
                return html.encode('utf-8')
def get_index(url):
    html = get_page(url)
    html = BeautifulSoup(html,'lxml')
    datas = html.find_all('ul')
    data = datas[2]
    data = BeautifulSoup(str(data),'lxml')
    for urls in data.find_all('a'):
        yield 'http://www.runoob.com' + urls.get('href')

def get_data(url):
    html = get_page(url)
    doc = pq(html)
    datas = etree.HTML(html)
    title = doc('#content h1').text()
    print('Title: ' + title)
    data = doc('#content p')
    name = pq(data[1]).text()
    num = pq(data[2]).text()
    n = data[3].text
    data = datas.xpath('//div[@class="hl-main"]/span/text()')
    code = ''.join(data)
    with open(r'python 100.txt', 'a+', encoding='utf-8') as f:
        f.write(title + '\n')
        f.write(name + '\n')
        f.write(num + '\n')
        f.write(n + '\n')
        f.write(code)
        f.write('\r\n')


def main():
    url = r'http://www.runoob.com/python/python-100-examples.html'
    for i in get_index(url):
        get_data(i)


if __name__ == '__main__':
    main()
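
The except branch in get_page retries the request, but a second failure inside that loop is not caught, so one bad page can still kill the whole crawl. A more explicit retry helper is sketched below; fetch_with_retries is illustrative and not part of the original code.

import time

import requests


def fetch_with_retries(url, retries=3, timeout=4):
    """Return the decoded page text, or None once all attempts fail."""
    for attempt in range(1, retries + 1):
        try:
            res = requests.get(url, timeout=timeout)
            res.encoding = 'utf-8'
            if res.status_code == 200:
                return res.text
        except requests.RequestException as e:
            print(url, 'attempt', attempt, 'failed:', e)
        time.sleep(1)  # brief pause before retrying
    return None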

2. The Maoyan Top 100 movies. Code:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import json
from multiprocessing import Pool
import requests
from requests.exceptions import RequestException
import re
def get_one_page(url):
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
    }
    try:
        response = requests.get(url,headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except Exception as e:
        print(e)
        return None
def parse_one_page(html):
    # NOTE: the closing HTML tags inside the original pattern were stripped when
    # the post was extracted; they are restored here from the Maoyan board markup.
    pattern = re.compile(
        '<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        '.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        '.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2],
            'actor': item[3].strip()[3:],   # drop the leading "主演：" label
            'time': item[4].strip()[5:],    # drop the leading "上映时间：" label
            'score': item[5] + item[6]      # integer part + fractional part
        }


def write_to_file(content):
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(offset):
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)


if __name__ == '__main__':
    # for i in range(10):
    #     main(i * 10)
    pool = Pool()
    pool.map(main, [i * 10 for i in range(10)])
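
write_to_file appends one JSON object per line, so result.txt ends up in JSON Lines format and can be loaded back for further analysis. A small sketch follows; load_results is illustrative and not part of the original script.

import json


def load_results(path='result.txt'):
    # Each non-empty line in result.txt is one JSON object written by write_to_file.
    movies = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                movies.append(json.loads(line))
    return movies


# Example: print the ten highest-ranked titles.
for movie in load_results()[:10]:
    print(movie['index'], movie['title'], movie['score'])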

3. Scraping Toutiao image galleries. Code:
#!/usr/bin/python
#coding:utf-8
import os
import json
import re
import pymongo
import requests
from config import *
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
from urllib.parse import urlencode
from hashlib import md5
from multiprocessing import Pool

client = pymongo.MongoClient(MONGO_URL,connect=False)
db = client[MONGO_DB]
def get_page_index(offset, keyword):
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 1,
        'from': 'search_tab',
        'pd': 'synthesis'
    }

    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        headers = {'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                   'accept-encoding': 'gzip, deflate, br', 'accept-language': 'zh-CN,zh;q=0.9',
                   'cache-control': 'max-age=0',
                   'cookie': 'tt_webid=6607376733821126151; tt_webid=6607376733821126151; WEATHER_CITY=%E5%8C%97%E4%BA%AC; tt_webid=6607376733821126151; UM_distinctid=1662fc5d1b478-04aa6684af3777-8383268-1fa400-1662fc5d1b556c; csrftoken=0e079708e36d9c1eeea96125f6b6309a; uuid="w:17e8c76a5628443999604cfc1482b920"; ccid=fba911a3338ceafebd52015ebe3fb4a9; CNZZDATA1259612802=1051770912-1538395942-https%253A%252F%252Fwww.google.com.hk%252F%7C1538488870; __tasessionId=g87q247qw1538490746687',
                   'referer': 'https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D',
                   'upgrade-insecure-requests': '1',
                   'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
                   }
        response = requests.get(url,headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print("         ")
        return None
def parse_page_index(html):
    data = json.loads(html)
    if data and 'data' in data.keys():
        for item in data.get('data'):
            yield item.get('article_url')
def get_page_detail(url):
    try:
        headers = {'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                   'accept-encoding': 'gzip, deflate, br', 'accept-language': 'zh-CN,zh;q=0.9',
                   'cache-control': 'max-age=0',
                   'cookie': 'tt_webid=6607376733821126151; tt_webid=6607376733821126151; WEATHER_CITY=%E5%8C%97%E4%BA%AC; tt_webid=6607376733821126151; UM_distinctid=1662fc5d1b478-04aa6684af3777-8383268-1fa400-1662fc5d1b556c; csrftoken=0e079708e36d9c1eeea96125f6b6309a; uuid="w:17e8c76a5628443999604cfc1482b920"; ccid=fba911a3338ceafebd52015ebe3fb4a9; CNZZDATA1259612802=1051770912-1538395942-https%253A%252F%252Fwww.google.com.hk%252F%7C1538488870; __tasessionId=g87q247qw1538490746687',
                   'referer': 'https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D',
                   'upgrade-insecure-requests': '1',
                   'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
                   }
        response = requests.get(url,headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print("         ",url)
        return None
def parse_page_detail(html, url):
    soup = BeautifulSoup(html,'lxml')
    title = soup.select('title')[0].get_text()
    print(title)
    images_pattern = re.compile('gallery: JSON.parse\("(.*?)"\),', re.S)
    result = re.search(images_pattern,html)
    if result:
        data = json.loads(result.group(1).replace('\\', ''))
        if data and 'sub_images' in data.keys():
            sub_images = data.get('sub_images')
            images = [item.get('url') for item in sub_images]
            for image in images:
                download_image(image)
            return {
                'title': title,
                'url':url,
                'images': images
            }
def download_image(url):
    print('Downloading image', url)
    try:
        headers = {'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                   'accept-encoding': 'gzip, deflate, br', 'accept-language': 'zh-CN,zh;q=0.9',
                   'cache-control': 'max-age=0',
                   'cookie': 'tt_webid=6607376733821126151; tt_webid=6607376733821126151; WEATHER_CITY=%E5%8C%97%E4%BA%AC; tt_webid=6607376733821126151; UM_distinctid=1662fc5d1b478-04aa6684af3777-8383268-1fa400-1662fc5d1b556c; csrftoken=0e079708e36d9c1eeea96125f6b6309a; uuid="w:17e8c76a5628443999604cfc1482b920"; ccid=fba911a3338ceafebd52015ebe3fb4a9; CNZZDATA1259612802=1051770912-1538395942-https%253A%252F%252Fwww.google.com.hk%252F%7C1538488870; __tasessionId=g87q247qw1538490746687',
                   'referer': 'https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D',
                   'upgrade-insecure-requests': '1',
                   'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
                   }
        response = requests.get(url,headers=headers)
        if response.status_code == 200:
            save_image(response.content)
        return None
    except RequestException:
        print("      ",url)
        return None

def save_image(content):
    file_path = '{0}/{1}.{2}'.format(os.getcwd(),md5(content).hexdigest(),'jpg')
    if not os.path.exists(file_path):
        with open(file_path,'wb') as f:
            f.write(content)
def save_to_mongo(result):
    """        """
    if db[MONGO_TABLE].insert_one(result):
        print('Saved to MongoDB:', result)
        return True
    return False

def main(offset):
    html = get_page_index(offset,KEY)
    for url in parse_page_index(html):
        html = get_page_detail(url)
        if html:
            result = parse_page_detail(html, url)
            if result:
                save_to_mongo(result)

if __name__ == '__main__':
    groups = [x * 20 for x in range(Group_start,Group_end + 1)]
    pool = Pool()
    pool.map(main,groups)

Configuration file (config.py):
MONGO_URL = '127.0.0.1'
MONGO_DB = 'toutiao'
MONGO_TABLE = 'toutiao'

Group_start = 1
Group_end = 20

KEY = '街拍'  # search keyword ("street snap"); matches the keyword encoded in the referer headers above
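
These values are picked up by the from config import * at the top of the script. After a run, the saved galleries can be checked straight from MongoDB; the following is a minimal query sketch, assuming the MONGO_URL / MONGO_DB / MONGO_TABLE settings above.

import pymongo

from config import MONGO_URL, MONGO_DB, MONGO_TABLE

client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

# Print the title and image count of every gallery saved by save_to_mongo().
for doc in db[MONGO_TABLE].find():
    print(doc['title'], len(doc.get('images', [])))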