Pythonで链家(Lianjia)の物件データを取得する

3458 ワード

Webクローラーによるデータ解析

V 1版
注意: Sublime Textで新規作成した.pyファイルはCommand+Bで実行できませんでした。環境変数が設定されていない可能性があります。Spyderで新規作成したファイルは実行できます。

# V1: fetch one listing page from hz.lianjia.com, pull three fields out of the
# HTML with regular expressions, and collect them into a pandas DataFrame.
# This is Python 2 code (urllib2, reload(sys), print statements).
# NOTE(review): the regex patterns and the loop-body indentation below were
# damaged when this page was extracted; the code does not parse as shown.
import sys 
reload(sys) 
# Set the process-wide default string encoding to UTF-8.
# NOTE(review): the original comment was lost in extraction; it mentioned
# pandas DataFrame, presumably to avoid encoding errors there -- confirm.
sys.setdefaultencoding("utf-8")  
import pandas as pd 
import urllib2    
import urllib   
import time
import re
from bs4 import BeautifulSoup

# Fetch page 1 of the listing URL.
myurl="http://hz.lianjia.com/ershoufang/pg"+str(1)
req = urllib2.Request(myurl)   
myResponse = urllib2.urlopen(req) 
myPage = myResponse.read()
unmyPage = myPage.decode('utf-8') # decode the UTF-8 response bytes to unicode

# Extract the total price of each listing: the two captured groups of every
# match are concatenated into one string.
# NOTE(review): the HTML tag text of this pattern was stripped in extraction.
c1=re.findall('(.*?)(.*?)

',unmyPage,re.S)
totalPrice=[]
for item in c1:
newitem=item[0]+item[1]
newitem=str(newitem)
totalPrice.append(newitem)
# Extract the listing (house) information that follows data-el="region">.
# NOTE(review): the closing delimiter of this pattern was stripped in extraction.
c2=re.findall('data-el="region">(.*?)',unmyPage,re.S)
houseinfo=[]
for item in c2:
#item=item.encode('utf-8')
#print isinstance(item,str)
houseinfo.append(item)
# Extract the follow/attention information of each listing.
# NOTE(review): the HTML tag text of this pattern was stripped in extraction.
c3=re.findall('(.*?)',unmyPage,re.S)
followinfo=[]
for item in c3:
followinfo.append(item)
# Combine the three lists into one DataFrame, one column per field.
house=pd.DataFrame({'totalprice':totalPrice,'houseinfo':houseinfo,
'followinfo':followinfo})
# Show the element type of each column, then the first rows.
print type(house['totalprice'][0]) #str
print type(house['houseinfo'][0]) #unicode
print type(house['followinfo'][0]) #unicode
print house.head()
参考(文字列エンコーディングについて): http://ajucs.com/2015/11/10/Python-character-encoding-explained.html および http://wklken.me/posts/2013/08/31/python-extra-coding-intro.html
V 2版

import sys 
reload(sys) 
sys.setdefaultencoding("utf-8")  
import pandas as pd 
import urllib2    
import urllib   
import time
import re
from bs4 import BeautifulSoup

# V2: same extraction as V1, but looping over listing pages 1 and 2 and
# accumulating the results of every page into the shared lists below.
# NOTE(review): the regex pattern and the indentation of the inner loop were
# damaged in extraction, and the script is cut off shortly after this point.
totalPrice=[]
houseinfo=[]
followinfo=[]
for i in range(1,3):
    # Fetch page i of the listing URL and decode it from UTF-8.
    myurl="http://hz.lianjia.com/ershoufang/pg"+str(i)
    req = urllib2.Request(myurl)   
    myResponse = urllib2.urlopen(req) 
    myPage = myResponse.read()
    unmyPage = myPage.decode('utf-8') 

    # Extract the total price of each listing on this page.
    # NOTE(review): the HTML tag text of this pattern was stripped in extraction.
    c1=re.findall('(.*?)(.*?)

',unmyPage,re.S)
# Concatenate the two captured groups of each match into one price string.
for item in c1:
newitem=item[0]+item[1]
totalPrice.append(newitem)
#print newitem
# Match the listing (house) information
c2=re.findall('data-el="region">(.*?)

OSError:cannot open resourceエラーの原因と解決方法

Pythonフォーマット出力_String Formatting_小数点以下の桁数を制御する