Extracting web page content with Scrapy in Python

Last week I spent the week learning Python and Scrapy and went from zero to a complete, working web crawler. The research was painful at times, but also fun; that is what doing technology is about, isn't it?
First, the installation. There are a lot of pitfalls, and everyone ends up climbing over them alone. I work on Windows (no budget for a Mac), so I ran into all kinds of problems while installing, and there really are a lot of dependencies; I won't repeat an installation tutorial here. If you hit a Windows C/C++ build error during installation, it usually means the Windows compiler toolchain is missing. Most tutorials online tell you to install Visual Studio for this, which is far more than you need; installing the Windows SDK is enough. Once the install worked, I confirmed Scrapy was importable and then moved on to the crawler code, which is pasted below.
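A minimal sanity check, assuming a standard Scrapy install (run it in a Python shell):

# confirm that Scrapy imports cleanly and print the installed version
import scrapy
print(scrapy.__version__)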
The main spider class:
# -*- coding: utf-8 -*-

import scrapy
from scrapy.http import Request
from zjf.FsmzItems import FsmzItem
from scrapy.selector import Selector

# the spider class
class MySpider(scrapy.Spider):
    # spider name
    name = "MySpider"
    # allowed domains
    allowed_domains = ["nvsheng.com"]
    # start URLs (filled in from the command line in __init__)
    start_urls = []

    # flag: pagination links are only expanded for the first response
    x = 0

    # parse each response: extract the fields and, on the first page, follow the pagination links
    def parse(self, response):

        item = FsmzItem()

        sel = Selector(response)

        item['title'] = sel.xpath('//h1/text()').extract()
        item['text'] = sel.xpath('//*[@class="content"]/p/text()').extract()
        item['imags'] = sel.xpath('//div[@id="content"]/p/a/img/@src|//div[@id="content"]/p/img/@src').extract()

        if MySpider.x == 0:
            page_list = self.getUrl(response)
            for page_single in page_list:
                yield Request(page_single)

        MySpider.x += 1

        yield item


    # init: read the start URL passed in with -a on the command line
    # usage: scrapy crawl MySpider -a start_url="http://some_url"
    def __init__(self, *args, **kwargs):
        super(MySpider, self).__init__(*args, **kwargs)
        self.start_urls = [kwargs.get('start_url')]

    def getUrl(self, response):
        url_list = []
        select = Selector(response)
        page_list_tmp = select.xpath('//div[@class="viewnewpages"]/a[not(@class="next")]/@href').extract()
        for page_tmp in page_list_tmp:
            if page_tmp not in url_list:
                url_list.append("http://www.nvsheng.com/emotion/px/" + page_tmp)
        return url_list

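A side note on the extraction above: building a Selector(response) by hand works, but newer Scrapy releases expose the same selector directly on the response object, so the field extraction in parse() could just as well be written like this (same XPath expressions, only the entry point changes):

# equivalent extraction using the selector built into the response
item['title'] = response.xpath('//h1/text()').extract()
item['text'] = response.xpath('//*[@class="content"]/p/text()').extract()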
The pipeline class:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

from zjf import settings
import json
import random
import urllib.request
import requests
from requests_toolbelt.multipart.encoder import MultipartEncoder

class MyPipeline(object):

    flag = 1
    post_title = ''
    post_text = []
    post_text_imageUrl_list = []
    cs = []
    user_id= ''

    def __init__(self):
        MyPipeline.user_id = MyPipeline.getRandomUser('37619,18441390,18441391')

    #process the data
    def process_item(self, item, spider):

        # the same user_id is used for every item in this run
        user_id = MyPipeline.user_id

        # join the body paragraphs into a single string, text_str_tmp
        text = item['text']
        text_str_tmp = ""
        for s in text:
            text_str_tmp = text_str_tmp + s
        # print(text_str_tmp)

        # record the title only once, from the first page
        if MyPipeline.flag == 1:
            title = item['title']
            MyPipeline.post_title = MyPipeline.post_title + title[0]


        # download each image locally, then upload it to the target API
        text_insert_pic = ''
        text_insert_pic_w = ''
        text_insert_pic_h = ''
        for imag_url in item['imags']:
            img_name = imag_url.replace('/','').replace('.','').replace('|','').replace(':','')
            pic_dir = settings.IMAGES_STORE + '%s.jpg' %(img_name)
            urllib.request.urlretrieve(imag_url,pic_dir)

            # upload the image; the API responds with JSON
            upload_img_result = MyPipeline.uploadImage(pic_dir,'image/jpeg')
            # pull the hosted image URL and its dimensions out of the JSON
            text_insert_pic = upload_img_result['result']['image_url']
            text_insert_pic_w = upload_img_result['result']['w']
            text_insert_pic_h = upload_img_result['result']['h']

        # build the JSON fragment for this page
        if MyPipeline.flag == 1:
            cs_json = {"c":text_str_tmp,"i":"","w":text_insert_pic_w,"h":text_insert_pic_h}
        else:
            cs_json = {"c":text_str_tmp,"i":text_insert_pic,"w":text_insert_pic_w,"h":text_insert_pic_h}
        MyPipeline.cs.append(cs_json)

        MyPipeline.flag += 1

        return item

    # called when the spider is opened
    def open_spider(self,spider):
        pass

    # called when the spider is closed: assemble the whole post and upload it
    def close_spider(self,spider):
        strcs = json.dumps(MyPipeline.cs)
        jsonData = {"apisign":"99ea3eda4b45549162c4a741d58baa60","user_id":MyPipeline.user_id,"gid":30,"t":MyPipeline.post_title,"cs":strcs}
        MyPipeline.uploadPost(jsonData)

    # upload one image and return the API's JSON response
    @staticmethod
    def uploadImage(img_path, content_type):
        "uploadImage functions"
        #UPLOAD_IMG_URL = "http://api.qa.douguo.net/robot/uploadpostimage"
        UPLOAD_IMG_URL = "http://api.douguo.net/robot/uploadpostimage"
        # local test image path (kept for reference)
        #imgPath = 'D:\pics\http___img_nvsheng_com_uploads_allimg_170119_18-1f1191g440_jpg.jpg'

        m = MultipartEncoder(
            # fields={'user_id': '192323',
            #         'images': ('filename', open(imgPath, 'rb'), 'image/JPEG')}
            fields={'user_id': MyPipeline.user_id,
                    'apisign':'99ea3eda4b45549162c4a741d58baa60',
                    'image': ('filename', open(img_path , 'rb'),'image/jpeg')}
        )

        r = requests.post(UPLOAD_IMG_URL,data=m,headers={'Content-Type': m.content_type})
        return r.json()

    @staticmethod
    def uploadPost(jsonData):
        CREATE_POST_URL = "http://api.douguo.net/robot/uploadimagespost"
        reqPost = requests.post(CREATE_POST_URL,data=jsonData)

    @staticmethod
    def getRandomUser(userStr):
        # pick one user id at random from a comma-separated string
        user_list = str(userStr).split(',')
        user_chosen = random.choice(user_list)
        return user_chosen

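For the pipeline to run at all, it has to be enabled in the project's settings.py, and the IMAGES_STORE path it uses for the temporary image downloads has to point at an existing folder. A minimal sketch, assuming the pipeline lives in zjf/pipelines.py (the module path, the priority value and the folder are placeholders):

# settings.py (sketch)
ITEM_PIPELINES = {
    'zjf.pipelines.MyPipeline': 300,   # module path and priority are assumptions
}
IMAGES_STORE = 'D:/pics/'              # prefix the pipeline uses when saving downloaded images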
The Items class that holds the scraped fields:
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy

class FsmzItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    #tutor = scrapy.Field()
    #strongText = scrapy.Field()
    text = scrapy.Field()
    imags = scrapy.Field()

Run it from the command line:
scrapy crawl MySpider -a start_url="http://www.aaa.com"

With that, the spider crawls the content under aaa.com (as long as the start URL includes the http:// scheme and the domain is covered by the spider's allowed_domains).