Scraping 100,000 Product Listings from Ganji

Problems:

- When crawling Ganji, the raw result of the address selector comes back with stray pieces such as [' :'] and ' - ', so it has to be cleaned up.

The selector copied from the browser:

    soup.select('#wrapper > div.content.clearfix > div.leftBox > div > div > ul > li:nth-of-type(3) > a')

Taking the first match and calling stripped_strings removes the surrounding whitespace:

    soup.select('#wrapper > div.content.clearfix > div.leftBox > div > div > ul > li:nth-of-type(3) > a')[0].stripped_strings
    


When the selector matches several tags, map() can be used to pull the text out of each one, as in the example just below.
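
A tiny self-contained example of both calls (the HTML fragment here is invented purely for illustration):

    from bs4 import BeautifulSoup

    html = '<ul><li>  朝阳  </li><li> - </li><li> 望京 </li></ul>'  # made-up fragment
    soup = BeautifulSoup(html, 'lxml')

    # stripped_strings: the tag's text with surrounding whitespace removed
    print(list(soup.select('li')[0].stripped_strings))     # ['朝阳']

    # map(): collect the text of every matched tag in one go
    print(list(map(lambda x: x.text, soup.select('li'))))  # ['  朝阳  ', ' - ', ' 望京 ']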

- Partway through the crawl, requests raised:

    requests.exceptions.ChunkedEncodingError: ("Connection broken: ConnectionResetError(54, 'Connection reset by peer')", ConnectionResetError(54, 'Connection reset by peer'))

A write-up of the same error: [python requests chunked](http://blog.csdn.net/wangzuxi/article/details/40377467).

##### Solution
Take the Accept-Encoding value from the browser's Request Headers, add it to the headers dict, and pass the dict to requests.get:

```headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36', 'Connection':'keep-alive', 'Accept-Encoding':'gzip, deflate'}```
- After about 5 minutes of continuous crawling: ```requests.exceptions.ConnectionError: None: Max retries exceeded with url: /qitawupin/o111/ (Caused by None)```

###### Solution
Switch to proxy IPs, as sketched below.
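
A minimal sketch of the proxy idea, reusing the proxy addresses that appear in the get_third_url code further down (the helper name get_with_proxy and the retry count are illustrative, not part of the original code): pick a random proxy for each request and fall back to another one when the connection fails.

    import random
    import requests

    proxy_list = ['http://117.177.250.151:8081', 'http://111.85.219.250:3129', 'http://122.70.183.138:8118']

    def get_with_proxy(url, headers, retries=3):
        # Try up to `retries` different proxies before giving up
        for _ in range(retries):
            proxies = {'http': random.choice(proxy_list)}
            try:
                return requests.get(url, headers=headers, proxies=proxies, timeout=10)
            except requests.exceptions.RequestException:
                continue
        return None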
    
    
# Code

- Get the category (second-level) URLs
    

    import requests
    from bs4 import BeautifulSoup

    first_url = 'http://bj.ganji.com/wu/'
    base_url = 'http://bj.ganji.com'

    # A category URL looks like: http://bj.ganji.com/jiaju/

    def get_second_url(url):
        web_data = requests.get(url)
        soup = BeautifulSoup(web_data.text, 'lxml')
        second_urls = soup.select('dl.fenlei dt a')
        for second_url in second_urls:
            whole_second_url = base_url + second_url.get('href')
            print(whole_second_url)

Run get_second_url(first_url) and copy the printed whole_second_url links; in get_second_url.py they are kept as one multi-line string (also named whole_second_url) that the main program later imports and splits.
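
A minimal sketch of what that pasted string in get_second_url.py might look like (only the two category paths that appear elsewhere in this post are shown; the real list is longer):

    # get_second_url.py, after running get_second_url(first_url) once:
    whole_second_url = '''
    http://bj.ganji.com/jiaju/
    http://bj.ganji.com/qitawupin/
    '''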
    
- Collect the item (third-level) URLs from every list page of each category and save them to MongoDB
    

    import requests, time, pymongo, random
    from bs4 import BeautifulSoup

    client = pymongo.MongoClient('localhost', 27017)
    ganji = client['ganji']
    whole_third_url = ganji['whole_third_url']
    item_info = ganji['item_info']

    headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36', 'Connection':'keep-alive', 'Accept-Encoding':'gzip, deflate'}

    proxy_list = ['http://117.177.250.151:8081', 'http://111.85.219.250:3129', 'http://122.70.183.138:8118',]
    proxy_ip = random.choice(proxy_list)
    proxies = {'http': proxy_ip}

    def get_third_url(whole_second_url, pages):
        whole_url = '{}o{}/'.format(whole_second_url, str(pages))
        web_data = requests.get(whole_url, headers=headers, proxies=proxies)
        # time.sleep(5)
        soup = BeautifulSoup(web_data.text, 'lxml')
        # Pages past the last one have no "next" button, so skip them
        if soup.find_all('a', {'class': 'next'}):
            for link in soup.select('li.js-item a.ft-tit'):
                third_url = link.get('href')
                whole_third_url.insert_one({'url': third_url})
                # print(third_url)
        else:
            pass

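For a quick check, get_third_url can be run by hand on a single category list page (the jiaju category URL from the first step is used here):

    # Store the item links from page 1 of the furniture category
    get_third_url('http://bj.ganji.com/jiaju/', 1)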
    
- Parse each item's detail page and save the fields to MongoDB
    

    def get_item_info(url):
        web_data = requests.get(url)
        soup = BeautifulSoup(web_data.text, 'lxml')
        title = soup.select('h1.title-name')[0].text if soup.find_all('h1', {'class': 'title-name'}) else None
        # Listings that have been taken down no longer have a title in their html, so skip them
        if title is None:
            pass
        else:
            time = list(soup.select('i.pr-5')[0].stripped_strings) if soup.find('i', {'class': 'pr-5'}) else None
            type = soup.select('#wrapper > div.content.clearfix > div.leftBox > div:nth-of-type(3) > div > ul > li:nth-of-type(1) > span > a')[0].text if soup.find_all('ul', {'class': 'det-infor'}) else None
            price = soup.select('i.f22.fc-orange.f-type')[0].text if soup.find_all('i', {'class': 'f22 fc-orange f-type'}) else None
            address = list(map(lambda x: x.text, soup.select('#wrapper > div.content.clearfix > div.leftBox > div > div > ul > li:nth-of-type(3) > a'))) if soup.find_all('li') else None
            old_new = soup.select('ul.second-det-infor.clearfix > li:nth-of-type(2) > label')[0].text if soup.select('ul.second-det-infor.clearfix > li:nth-of-type(2) > label') else None
            item_info.insert_one({'title': title, 'time': time, 'type': type, 'price': price, 'address': address, 'old_new': old_new})
            print(title, time, type, price, address, old_new)
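
Once the links are in MongoDB, the detail pages can be scraped by walking the whole_third_url collection; a minimal sequential sketch, assuming the code above lives in get_third_url.py:

    from get_third_url import whole_third_url, get_item_info

    # Feed every stored link to the detail-page parser
    for record in whole_third_url.find():
        get_item_info(record['url'])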
    
- Main program: crawl every category in parallel with multiprocessing
    
    

    from multiprocessing import Pool
    from get_second_url import whole_second_url
    from get_third_url import get_third_url
    from get_third_url import get_item_info

    def get_all_links_from(whole_second_url):
        # Each category is crawled for list pages 1-120
        for i in range(1, 121):
            get_third_url(whole_second_url, i)

    if __name__ == '__main__':
        pool = Pool()
        pool.map(get_all_links_from, whole_second_url.split())
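
The same Pool pattern can also drive get_item_info once all the item links have been collected; a sketch, not part of the original main program:

    from multiprocessing import Pool
    from get_third_url import whole_third_url, get_item_info

    if __name__ == '__main__':
        db_urls = [record['url'] for record in whole_third_url.find()]
        pool = Pool()
        pool.map(get_item_info, db_urls)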

    
- Monitor the crawl

A separate script prints how many item links have been collected so far, refreshing every 5 seconds:
    

    import time
    from get_third_url import whole_third_url

    while True:
        # Number of item links collected so far
        print(whole_third_url.find().count())
        time.sleep(5)
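
In newer pymongo releases, Cursor.count() has been removed; Collection.count_documents() is the replacement. A small variant that also reports how many item records have been parsed (the extra item_info count is an addition for convenience):

    import time
    from get_third_url import whole_third_url, item_info

    while True:
        # count_documents({}) replaces the removed cursor.count()
        print(whole_third_url.count_documents({}), item_info.count_documents({}))
        time.sleep(5)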