【プログラミング】北京・上海・広州・深圳の賃貸情報クローラーを一行ずつ解説

57715 ワード

python

「万巻の書を読むより万里の道を行け」と言われます。プログラミング能力を高める良い方法の一つは自分でコードを書くことですが、まだ自分で書けないうちは、他の人が書いたコードを読み解くのも効率的な学習方法です。本記事では、Alfredデータ室さんが公開している、链家(Lianjia)の賃貸サイトをクロールするコードを一行ずつ読み解きます。筆者はコンピュータ専攻ではなく、少し興味がある程度なので、抜けや誤りが必ずあると思います。不足している点があれば、遠慮なくご指摘ください。完全なコードは文末にあります。元記事:「北上広深借家図鑑」
本記事では主にMongoDBデータベースの使い方に触れます。また、今回クロールするデータ量が多いため、classの使い方についても併せて学習します。
文書ディレクトリ

コード分析

知識総括

完全コード

付録

info

付録2

コード解析

import os
import re
import time
import requests
from pymongo import MongoClient
from info import rent_type, city_info

osモジュールは、ファイルやディレクトリを扱うための関数を提供します。reとrequestsは以前の記事で紹介しました。timeはその名の通り時間を扱うモジュールです。pymongoはMongoDBデータベースに接続するためのライブラリです。MongoDBは現在最も広く使われているNoSQLデータベースの一つで、データ形式としてBSON(JSONに似た形式)を使用します。詳しくない方は、リンク先を開いてしっかり学習することをお勧めします。infoは作者が事前に収集した北京・上海・広州・深圳の行政区情報で、クロール時に必要な賃貸タイプと都市情報をここからインポートします。具体的な内容は文末の付録で確認できます。

class Rent(object):
    """
    Crawler for Lianjia rental listings: holds the rent types and the per-city
    district info to crawl, plus a connection to the mongodb database.
    """

注釈のとおり.

    def __init__(self):
        """Load the crawl configuration and connect to MongoDB.

        Connection settings are read from the MONGODB_HOST, MONGODB_PORT and
        MONGODB_DATABASE environment variables, falling back to the defaults
        given below.
        """
        self.rent_type = rent_type
        self.city_info = city_info

        host = os.environ.get('MONGODB_HOST', '127.0.0.1')  # MongoDB host, defaults to 127.0.0.1
        port = os.environ.get('MONGODB_PORT', '27017')  # MongoDB port, defaults to 27017
        mongo_url = 'mongodb://{}:{}'.format(host, port)
        mongo_db = os.environ.get('MONGODB_DATABASE', 'Lianjia')
        client = MongoClient(mongo_url)
        self.db = client[mongo_db]
        self.db['zufang'].create_index('m_url', unique=True)  # unique index on m_url in the zufang collection

この関数は主にMongoDBデータベースへの接続を準備するためのものです。まず、事前に収集した賃貸タイプと都市情報を読み込みます。os.environ.get()はPythonのosモジュールで環境変数を取得するメソッドで、os.environは環境変数を保持する辞書(マッピング)です。MongoClient(mongo_url)で、上記のURLのデータベースサーバーに接続します。

    def get_data(self):
        """
        Iterate over every rent type, city and district and collect the
        business-circle keys found on each district page.
        :return: None
        """
        for ty, type_code in self.rent_type.items():  # rent type name and its code
            for city, info in self.city_info.items():  # city name and its info list
                for dist, dist_py in info[2].items():  # district name and the slug used in the URL
                    res_bc = requests.get('https://m.lianjia.com/chuzu/{}/zufang/{}/'.format(info[1], dist_py))
                    pa_bc = r"data-type=\"bizcircle\" data-key=\"(.*)\" class=\"oneline \">"
                    bc_list = re.findall(pa_bc, res_bc.text)
                    self._write_bc(bc_list)
                    bc_list = self._read_bc()  # list is written out and read back; NOTE(review): presumably so an interrupted crawl can resume from the saved list — confirm

関数の役割はコメントの通りです。self.rent_type.items()をprintするとdict_items([('整租', 200600000001), ('合租', 200600000002)])となるので、tyは前者の「整租」(一棟貸し)・「合租」(ルームシェア)、type_codeは後者のコードを指します。self.city_info.items()も同様で、前者が都市名、後者がその具体的な情報です。今回クロールする链家のモバイル版サイトは比較的単純で、カテゴリごとにURLのどこが変わるかは容易に見つけられます。変わるのは都市と各区のピンインの部分です。requests.getで取得したレスポンスに.textを付けると、そのページのソースコードを直接取得できます。re.findall(pa_bc, res_bc.text)は、その中から正規表現pa_bcに一致する内容をすべて取得します。取得した内容は、後述の関数でファイルに保存し、読み出します。

# Continuation of get_data: page through the listing API for every
# business circle collected above (30 listings per request).
if len(bc_list) > 0:
    for bc_name in bc_list:
        idx = 0  # zero-based page index
        has_more = 1  # loop flag, cleared once the last page has been fetched
        while has_more:
            try:
                url = 'https://app.api.lianjia.com/Rentplat/v1/house/list?city_id={}&condition={}' \
                      '/rt{}&limit=30&offset={}&request_ts={}&scene=list'.format(info[0],
                                                                               bc_name,
                                                                               type_code,
                                                                               idx*30,
                                                                               int(time.time()))
                res = requests.get(url=url, timeout=10)
                print('    {} {}-{} {} {}   ！'.format(city, dist, bc_name, ty, idx+1))
                item = {'city': city, 'type': ty, 'dist': dist}
                self._parse_record(res.json()['data']['list'], item)

                total = res.json()['data']['total']
                idx += 1
                # Stop once idx pages of 30 cover the reported total.
                if total/30 <= idx:
                    has_more = 0
                # time.sleep(random.random())
            except Exception:
                # Any failure leaves idx unchanged, so the same page is retried
                # after a pause. Exception (not a bare except) keeps Ctrl-C working.
                print('       ，    ！')
                time.sleep(5)

商圏リストの長さが0より大きければ、2つの変数(idxとhas_more)を初期化します。続いて、サイトを分析して見つけたAPIデータインタフェースにリクエストします。このAPIの探し方については筆者は理解しておらず、具体的にどうやるのか分からないので、詳しい方のご指導をお願いします。その後、1ページ取得するたびに後述の2つの関数が呼び出されて結果が解析・保存され、total/30がidx以下になった(残りのページがなくなった)時点で次の商圏のループに移ります。アクセス回数が多すぎてアクセス禁止にならないよう、休止時間を入れることが望ましいです。

    @staticmethod
    def _write_bc(bc_list):
        """
        Write the business-circle keys to bc_list.txt, one per line.
        :param bc_list: list of business-circle keys (strings)
        :return: None
        """
        with open('bc_list.txt', 'w') as f:
            for bc in bc_list:
                # The line terminator must be the escape sequence '\n'; a raw
                # line break inside the quotes is a syntax error.
                f.write(bc + '\n')

これより前のコードは主にJSONデータを解析するものでした。with open(...) as はファイルを開くための構文で、第1引数は開くファイル、第2引数は開くモードです。wは書き込みモードを表します。その他のモードは付録2を参照してください。
知識のまとめ

with open('bc_list.txt', 'w') as f:  # open bc_list.txt in write mode ('w'); the file is closed automatically when the with-block ends

完全なコード

import os
import re
import time
import requests
from pymongo import MongoClient
from info import rent_type, city_info


class Rent(object):
    """
    Crawler for Lianjia rental listings.

    Holds the rent types and per-city district info imported from ``info``
    and a handle to the MongoDB database that receives the parsed listings.
    """

    def __init__(self):
        """Load the crawl configuration and connect to MongoDB.

        Connection settings are read from the MONGODB_HOST, MONGODB_PORT and
        MONGODB_DATABASE environment variables, falling back to the defaults
        given below.
        """
        self.rent_type = rent_type
        self.city_info = city_info

        host = os.environ.get('MONGODB_HOST', '127.0.0.1')  # MongoDB host
        port = os.environ.get('MONGODB_PORT', '27017')  # MongoDB port
        mongo_url = 'mongodb://{}:{}'.format(host, port)
        mongo_db = os.environ.get('MONGODB_DATABASE', 'Lianjia')
        client = MongoClient(mongo_url)
        self.db = client[mongo_db]
        # Unique index on m_url; listings are upserted by that key below.
        self.db['zufang'].create_index('m_url', unique=True)

    def get_data(self):
        """
        Crawl every rent type / city / district combination and store the
        listings of each business circle found on the district page.
        :return: None
        """
        for ty, type_code in self.rent_type.items():  # rent type name and its code
            for city, info in self.city_info.items():  # city name and [city id, city slug, districts]
                for dist, dist_py in info[2].items():  # district name and its URL slug
                    # NOTE(review): this request has no timeout and sits outside
                    # any try block, so a network error here aborts the crawl.
                    res_bc = requests.get('https://m.lianjia.com/chuzu/{}/zufang/{}/'.format(info[1], dist_py))
                    pa_bc = r"data-type=\"bizcircle\" data-key=\"(.*)\" class=\"oneline \">"
                    bc_list = re.findall(pa_bc, res_bc.text)
                    # The list is written to bc_list.txt and read back before use.
                    self._write_bc(bc_list)
                    bc_list = self._read_bc()

                    for bc_name in bc_list:
                        self._crawl_bizcircle(info[0], bc_name, type_code, city, dist, ty)

    def _crawl_bizcircle(self, city_id, bc_name, type_code, city, dist, ty):
        """
        Fetch every result page (30 listings each) of one business circle and
        hand the records to _parse_record.
        :param city_id: numeric city id used by the listing API
        :param bc_name: business-circle key used in the API condition
        :param type_code: rent type code
        :param city: city name stored on each record
        :param dist: district name stored on each record
        :param ty: rent type name stored on each record
        :return: None
        """
        idx = 0  # zero-based page index
        has_more = True
        while has_more:
            try:
                url = 'https://app.api.lianjia.com/Rentplat/v1/house/list?city_id={}&condition={}' \
                      '/rt{}&limit=30&offset={}&request_ts={}&scene=list'.format(city_id,
                                                                               bc_name,
                                                                               type_code,
                                                                               idx*30,
                                                                               int(time.time()))
                res = requests.get(url=url, timeout=10)
                print('    {} {}-{} {} {}   ！'.format(city, dist, bc_name, ty, idx+1))
                item = {'city': city, 'type': ty, 'dist': dist}
                payload = res.json()['data']  # decode the response body only once
                self._parse_record(payload['list'], item)

                total = payload['total']
                idx += 1
                # Stop once idx pages of 30 cover the reported total.
                if total/30 <= idx:
                    has_more = False
                # time.sleep(random.random())
            except Exception:
                # idx is unchanged on failure, so the same page is retried after
                # a pause. Exception (not a bare except) lets Ctrl-C through.
                # NOTE(review): a page that fails permanently is retried forever.
                print('       ，    ！')
                time.sleep(5)

    def _parse_record(self, data, item):
        """
        Copy the fields of each listing into ``item``, enrich it with the
        coordinates scraped from the listing's detail page, and upsert it into
        the ``zufang`` collection keyed by ``m_url``.
        :param data: list of listing dicts from the API response (may be empty or None)
        :param item: dict pre-filled with city / type / dist; updated in place
        :return: None
        """
        if not data:
            return

        for rec in data:
            item['bedroom_num'] = rec.get('frame_bedroom_num')
            item['hall_num'] = rec.get('frame_hall_num')
            item['bathroom_num'] = rec.get('frame_bathroom_num')
            item['rent_area'] = rec.get('rent_area')
            item['house_title'] = rec.get('house_title')
            item['resblock_name'] = rec.get('resblock_name')
            item['bizcircle_name'] = rec.get('bizcircle_name')
            item['layout'] = rec.get('layout')
            item['rent_price_listing'] = rec.get('rent_price_listing')
            item['house_tag'] = self._parse_house_tags(rec.get('house_tags'))
            item['frame_orientation'] = rec.get('frame_orientation')
            item['m_url'] = rec.get('m_url')
            item['rent_price_unit'] = rec.get('rent_price_unit')

            try:
                res2 = requests.get(item['m_url'], timeout=5)
                pa_lon = r"longitude: '(.*)',"
                pa_lat = r"latitude: '(.*)'"
                # NOTE(review): this pattern matches any run of digits (even an
                # empty one) before a space; it looks truncated — confirm.
                pa_distance = r"(\d*) "
                item['longitude'] = re.findall(pa_lon, res2.text)[0]
                item['latitude'] = re.findall(pa_lat, res2.text)[0]
                distance = re.findall(pa_distance, res2.text)
                if len(distance) > 0:
                    item['distance'] = distance[0]
                else:
                    item['distance'] = None
            except Exception:
                # Best effort: if the detail page cannot be fetched or parsed,
                # the listing is still stored without location data.
                item['longitude'] = None
                item['latitude'] = None
                item['distance'] = None

            self.db['zufang'].update_one({'m_url': item['m_url']}, {'$set': item}, upsert=True)
            print('      :{}!'.format(item))

    @staticmethod
    def _parse_house_tags(house_tag):
        """
        Join the ``name`` of every tag into one space-separated string.
        :param house_tag: list of tag dicts, or None / empty when the listing has no tags
        :return: the joined tag names, or None when there are no tags
        """
        if not house_tag:
            return None
        return ' '.join(tag.get('name') for tag in house_tag).strip()

    @staticmethod
    def _write_bc(bc_list):
        """
        Write the business-circle keys to bc_list.txt, one per line.
        :param bc_list: list of business-circle keys (strings)
        :return: None
        """
        # Explicit UTF-8 so the file round-trips on any platform.
        with open('bc_list.txt', 'w', encoding='utf-8') as f:
            for bc in bc_list:
                f.write(bc + '\n')

    @staticmethod
    def _read_bc():
        """
        Read the business-circle keys back from bc_list.txt.
        :return: list of keys, one per line, with surrounding whitespace stripped
        """
        with open('bc_list.txt', 'r', encoding='utf-8') as f:
            return [bc.strip() for bc in f]


# Script entry point: build the crawler (connects to MongoDB) and run the crawl.
if __name__ == '__main__':
    rent = Rent()
    rent.get_data()

ふろく
info

# Contents of info.py: crawl configuration imported by the crawler.
# NOTE(review): the Chinese dict keys were lost in this copy of the listing
# (every key had become a run of spaces, so duplicate keys collapsed the
# dicts). They are reconstructed here from the pinyin slugs; verify against
# the original info.py.

# Rent type name -> type code used in the listing API condition.
rent_type = {'整租': 200600000001, '合租': 200600000002}

# City name -> [city id, city slug, {district name: district slug}].
city_info = {'北京': [110000, 'bj', {'东城': 'dongcheng', '西城': 'xicheng', '朝阳': 'chaoyang', '海淀': 'haidian',
                                   '丰台': 'fengtai', '石景山': 'shijingshan', '通州': 'tongzhou', '昌平': 'changping',
                                   '大兴': 'daxing', '亦庄开发区': 'yizhuangkaifaqu', '顺义': 'shunyi', '房山': 'fangshan',
                                   '门头沟': 'mentougou', '平谷': 'pinggu', '怀柔': 'huairou', '密云': 'miyun',
                                   '延庆': 'yanqing'}],
             '上海': [310000, 'sh', {'静安': 'jingan', '徐汇': 'xuhui', '黄浦': 'huangpu', '长宁': 'changning',
                                   '普陀': 'putuo', '浦东': 'pudong', '宝山': 'baoshan', '闸北': 'zhabei',
                                   '虹口': 'hongkou', '杨浦': 'yangpu', '闵行': 'minhang', '金山': 'jinshan',
                                   '嘉定': 'jiading', '崇明': 'chongming', '奉贤': 'fengxian', '松江': 'songjiang',
                                   '青浦': 'qingpu'}],
             '广州': [440100, 'gz', {'天河': 'tianhe', '越秀': 'yuexiu', '荔湾': 'liwan', '海珠': 'haizhu', '番禺': 'panyu',
                                   '白云': 'baiyun', '黄埔': 'huangpu', '从化': 'conghua', '增城': 'zengcheng',
                                   '花都': 'huadu', '南沙': 'nansha'}],
             '深圳': [440300, 'sz', {'罗湖区': 'luohuqu', '福田区': 'futianqu', '南山区': 'nanshanqu',
                                   '盐田区': 'yantianqu', '宝安区': 'baoanqu', '龙岗区': 'longgangqu',
                                   '龙华区': 'longhuaqu', '光明区': 'guangmingqu', '坪山区': 'pingshanqu',
                                   '大鹏新区': 'dapengxinqu'}]}

付録2

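w：書き込みモードで開く、
a：追記モードで開く(EOFから開始、必要なら新規ファイルを作成)
r+：読み書きモードで開く
w+：読み書きモードで開く(w を参照)
a+：読み書きモードで開く(a を参照)
rb：バイナリ読み込みモードで開く
wb：バイナリ書き込みモードで開く(w を参照)
ab：バイナリ追記モードで開く(a を参照)
rb+：バイナリ読み書きモードで開く(r+ を参照)
wb+：バイナリ読み書きモードで開く(w+ を参照)
ab+：バイナリ読み書きモードで開く(a+ を参照)fp.read([size])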

pandas条件に従ってエラーが報告されたピットを検索する

機械学習(5)--K-meansクラスタリングアルゴリズム