Python Scrapy + MongoDB: crawling QingTing FM, Kuwo Tingshu, and Lanren Tingshu audiobooks

  • 1. Motivation: I wanted to bulk-download audiobooks, talk shows, and the like from the net; good sources are scarce, so I decided to build my own crawler.
  • 2. Stack: Wireshark, Scrapy, JSON, MongoDB.
  • 3. Plan: use Wireshark to analyze what the mobile apps return: the various link categories, lists, download addresses, and so on (all JSON).
  • 4. Plan: have Scrapy parse the JSON and generate the download links.
  • 5. Plan: store the results in MongoDB.
  • 6. Hard part: only the Wireshark analysis of the various addresses; everything else is basic Scrapy usage, all covered in the official documentation.
  • 7. Walkthrough: let me explain file by file, following the directory tree that tree /F generates (sketched below).
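Roughly the layout that scrapy startproject qtscrapy produces (a sketch: only the files discussed below are shown, and a real tree also contains scrapy.cfg and __init__.py boilerplate):
'''
qtscrapy
│  items.py
│  pipelines.py
│  settings.py
│
└─spiders
        qingting.py
'''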
1 items.py: field definitions; adjust to your needs.
'''
from scrapy import Item, Field

class QtscrapyItem(Item):
    id = Field()
    parent_info = Field()
    title = Field()
    update_time = Field()
    file_path = Field()
    source = Field()
'''
2 pipelines.py: item processing; adjust to your needs. QtscrapyPipeline dumps items to a JSON file, QtscrapyMongoPipeline writes them to MongoDB.
'''
import json
import codecs
import pymongo
from scrapy.conf import settings  # old-style global settings access

class QtscrapyPipeline(object):
    def __init__(self):
        self.file = codecs.open('qingting_209.json', 'wb', encoding='utf-8')

    def process_item(self, item, spider):
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        # print(line)
        self.file.write(line)
        return item

class QtscrapyMongoPipeline(object):
    def __init__(self):
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        dbName = settings['MONGODB_DBNAME']
        client = pymongo.MongoClient(host=host, port=port)
        tdb = client[dbName]
        self.post = tdb[settings['MONGODB_DOCNAME']]

    def process_item(self, item, spider):
        qtfm = dict(item)
        self.post.insert(qtfm)
        return item
'''
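Note that scrapy.conf and collection.insert() belong to old Scrapy/pymongo releases. On current versions the same Mongo pipeline would look roughly like this (a sketch, assuming Scrapy ≥ 1.0 and pymongo ≥ 3.0):
'''
import pymongo

class QtscrapyMongoPipeline(object):
    def __init__(self, host, port, db_name, doc_name):
        self.client = pymongo.MongoClient(host=host, port=port)
        self.post = self.client[db_name][doc_name]

    @classmethod
    def from_crawler(cls, crawler):
        # modern replacement for `from scrapy.conf import settings`
        s = crawler.settings
        return cls(s['MONGODB_HOST'], s['MONGODB_PORT'],
                   s['MONGODB_DBNAME'], s['MONGODB_DOCNAME'])

    def process_item(self, item, spider):
        self.post.insert_one(dict(item))  # insert() was removed from pymongo
        return item
'''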
3 settings.py: basic configuration; set up the database and register the pipeline classes defined in pipelines.py.
'''
ITEM_PIPELINES = {
    # 'qtscrapy.pipelines.QtscrapyPipeline': 300,
    'qtscrapy.pipelines.QtscrapyMongoPipeline': 300,
}
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 12345
MONGODB_DBNAME = 'qingtingDB'
MONGODB_DOCNAME = 'qingting'
'''
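With the pipeline registered, scrapy crawl qingting from the project root starts the crawl; the same thing from Python looks roughly like this (a sketch, assuming a recent Scrapy and the module path from the tree above):
'''
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from qtscrapy.spiders.qingting import qtscrapy  # the spider defined below

process = CrawlerProcess(get_project_settings())  # picks up settings.py
process.crawl(qtscrapy)
process.start()  # blocks until the crawl finishes
'''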
    └─spiders
4 qingting.py: the spiders themselves, each working its own magic.
'''
from scrapy.spiders import BaseSpider
from scrapy.http import Request
import sys, json
from qtscrapy.items import QtscrapyItem
from scrapy_redis.spiders import RedisSpider

# Python 2 only: force utf-8 as the default string encoding
reload(sys)
sys.setdefaultencoding("utf-8")
# 1. Kuwo Tingshu API addresses, captured with Wireshark:
#    http://ts.kuwo.cn/service/gethome.php?act=new_home
#    http://ts.kuwo.cn/service/getlist.v31.php?act=catlist&id=97
#    http://ts.kuwo.cn/service/getlist.v31.php?act=cat&id=21&type=hot
#    http://ts.kuwo.cn/service/getlist.v31.php?act=detail&id=100102396
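# The spider below walks that hierarchy step by step:
#   gethome -> act=catlist (get_kw_catlist) -> act=cat&type=hot (get_kw_cat)
#   -> act=detail (get_kw_jmlist, which yields the items)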
# 2. To run this spider distributed with Redis, declare it as
#    class qtscrapy(RedisSpider) and set redis_key instead of start_urls:
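#    (a sketch of the scrapy_redis convention: it reads start URLs from that
#     key, seeded with e.g. `redis-cli lpush qingting:start_urls <url>`)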
class qtscrapy(BaseSpider):
    name = "qingting"
    # redis_key = 'qingting:start_urls'
    base_url = "http://api2.qingting.fm/v6/media/recommends/guides/section/"
    start_urls = [
        "http://api2.qingting.fm/v6/media/recommends/guides/section/0",
        "http://ts.kuwo.cn/service/gethome.php?act=new_home",
        "http://api.mting.info/yyting/bookclient/ClientTypeResource.action?type=0&pageNum=0&pageSize=500&token=_4WfzpCah8ujgJZZzboaUGkJQvWGfEEL-zdukwv7lbY*&q=0&imei=ODY1MTY2MDIxNzMzNjI0",
    ]
    allowed_domains = ["api2.qingting.fm", "ts.kuwo.cn", "api.mting.info"]

    def parse(self, response):
        # 3. Dispatch on the response URL: one mixed spider handles all three
        #    sites instead of running several separate spiders.
        if "qingting" in response.url:
            qt_json = json.loads(response.body, encoding="utf-8")
            if qt_json["data"] is not None:
                for data in qt_json["data"]:
                    if data is not None:
                        for de in data["recommends"]:
                            if de["parent_info"] is None:
                                pass
                            else:
                                jm_url = "http://api2.qingting.fm/v6/media/channelondemands/%(parent_id)s/programs/curpage/1/pagesize/1000" % \
                                         de["parent_info"]
                                yield Request(jm_url, callback=self.get_qt_jmlist, meta={"de": de})
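            # Every qingting response re-yields sections 0-249 into parse();
            # Scrapy's default duplicate-request filter drops the repeats, so
            # this recursion terminates.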
            for i in range(0, 250):
                url = self.base_url + str(i)
                yield Request(url, callback=self.parse)
        if "kuwo" in response.url:
            kw_json = json.loads(response.body, encoding="utf-8")
            if kw_json["cats"] is not None:
                for data in kw_json["cats"]:
                    pp_id = data["Id"]
                    kw_url = "http://ts.kuwo.cn/service/getlist.v31.php?act=catlist&id=%s" % pp_id
                    yield Request(kw_url, callback=self.get_kw_catlist)
        if "mting" in response.url:
            # print(response)
            lr_json = json.loads(response.body, encoding="utf-8")
            if len(lr_json["list"]) > 0:
                for l in lr_json["list"]:
                    try:
                        lr_url = "http://api.mting.info/yyting/bookclient/ClientTypeResource.action?type=%(id)s&pageNum=0&pageSize=1000&sort=2&token=_4WfzpCah8ujgJZZzboaUGkJQvWGfEEL-zdukwv7lbY*&imei=ODY1MTY2MDIxNzMzNjI0" % l
                        yield Request(lr_url, callback=self.get_lr_booklist)
                    except:
                        pass
            for r in range(-10, 1000):
                lr_url = "http://api.mting.info/yyting/bookclient/ClientTypeResource.action?type=%s&pageNum=0&pageSize=1000&token=_4WfzpCah8ujgJZZzboaUGkJQvWGfEEL-zdukwv7lbY*&q=0&imei=ODY1MTY2MDIxNzMzNjI0" % r
                yield Request(lr_url, callback=self.parse)

    # 4. How many levels of recursion you need is dictated by the app's menu
    #    structure.
    def get_qt_jmlist(self, response):
        jm_json = json.loads(response.body, encoding="utf-8")
        de = response.meta["de"]
        for jm_data in jm_json["data"]:
            if jm_data is None:
                pass
            else:
                try:
                    file_path = "http://upod.qingting.fm/%(file_path)s?deviceid=ffffffff-ebbe-fdec-ffff-ffffb1c8b222" % \
                                jm_data["mediainfo"]["bitrates_url"][0]
                    item = QtscrapyItem()
                    # print(item)
                    # print(jm_data["id"])
                    item["id"] = str(jm_data["id"])
                    parent_info = "%(parent_id)s_%(parent_name)s" % de["parent_info"]
                    item["parent_info"] = parent_info
                    item["title"] = jm_data["title"]
                    item["update_time"] = str(jm_data["update_time"])[:str(jm_data["update_time"]).index(' ')]
                    item["file_path"] = file_path
                    item["source"] = "qingting"
                    yield item
                except:
                    pass
        pass
    def get_kw_catlist(self, response):
        try:
            kw_json = json.loads(response.body, encoding="utf-8")
            if kw_json["sign"] is not None:
                if kw_json["list"] is not None:
                    for data in kw_json["list"]:
                        p_id = data["Id"]
                        kw_p_url = "http://ts.kuwo.cn/service/getlist.v31.php?act=cat&id=%s&type=hot" % p_id
                        yield Request(kw_p_url, callback=self.get_kw_cat)
        except:
            print("*" * 300)
            print(self.name, kw_json)
            pass
    def get_kw_cat(self, response):
        try:
            kw_json = json.loads(response.body, encoding="utf-8")
            if kw_json["sign"] is not None:
                if kw_json["list"] is not None:
                    for data in kw_json["list"]:
                        id = data["Id"]
                        # fresh dict per request: a single shared dict would be
                        # mutated before earlier requests were handled
                        p_info = {"p_id": data["Id"], "p_name": data["Name"]}
                        kw_pp_url = "http://ts.kuwo.cn/service/getlist.v31.php?act=detail&id=%s" % id
                        yield Request(kw_pp_url, callback=self.get_kw_jmlist, meta={"p_info": p_info})
        except:
            print("*" * 300)
            print(self.name, kw_json)
            pass
    def get_kw_jmlist(self, response):
        jm_json = json.loads(response.body, encoding="utf-8")
        p_info = response.meta["p_info"]
        for jm_data in jm_json["Chapters"]:
            if jm_data is None:
                pass
            else:
                try:
                    file_path = "http://cxcnd.kuwo.cn/tingshu/res/WkdEWF5XS1BB/%s" % jm_data["Path"]
                    item = QtscrapyItem()
                    item["id"] = str(jm_data["Id"])
                    parent_info = "%(p_id)s_%(p_name)s" % p_info
                    item["parent_info"] = parent_info
                    item["title"] = jm_data["Name"]
                    item["update_time"] = ""
                    item["file_path"] = file_path
                    item["source"] = "kuwo"
                    yield item
                except:
                    pass
        pass
    def get_lr_booklist(self, response):
        s_lr_json = json.loads(response.body, encoding="utf-8")
        if len(s_lr_json["list"]) > 0:
            for s_lr in s_lr_json["list"]:
                s_lr_url = "http://api.mting.info/yyting/bookclient/ClientGetBookResource.action?bookId=%(id)s&pageNum=1&pageSize=2000&sortType=0&token=_4WfzpCah8ujgJZZzboaUGkJQvWGfEEL-zdukwv7lbY*&imei=ODY1MTY2MDIxNzMzNjI0" % s_lr
                meta = {}
                meta["id"] = s_lr["id"]
                meta["name"] = s_lr["name"]
                yield Request(s_lr_url, callback=self.get_lr_kmlist, meta={"meta": meta})
    def get_lr_kmlist(self, response):
        ss_lr_json = json.loads(response.body, encoding="utf-8")
        parent = response.meta["meta"]
        if len(ss_lr_json["list"]) > 0:
            for ss_lr in ss_lr_json["list"]:
                try:
                    item = QtscrapyItem()
                    item["id"] = str(ss_lr["id"])
                    parent_info = "%(id)s_%(name)s" % parent
                    item["parent_info"] = parent_info
                    item["title"] = ss_lr["name"]
                    item["update_time"] = ""
                    item["file_path"] = ss_lr["path"]
                    item["source"] = "lr"
                    yield item
                except:
                    pass

'''

5 Results: the crawl pulled in roughly 400,000 records.
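To sanity-check what landed in MongoDB, a quick pymongo session (a sketch, assuming pymongo ≥ 3.7 for count_documents; connection details match settings.py above):
'''
import pymongo

coll = pymongo.MongoClient('127.0.0.1', 12345)['qingtingDB']['qingting']
print(coll.count_documents({}))   # roughly 400,000 after a full run
print(coll.distinct('source'))    # expect ['qingting', 'kuwo', 'lr']
print(coll.find_one())            # inspect one stored record
'''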