Scraping 100,000 Product Listings from Ganji

Problems:

- When crawling Ganji, the raw result of the address selector comes back with stray pieces such as [' :'] and ' - ', so it has to be cleaned up.

The selector copied from the browser:

    soup.select('#wrapper > div.content.clearfix > div.leftBox > div > div > ul > li:nth-of-type(3) > a')

Taking the first match and calling stripped_strings removes the surrounding whitespace:

    soup.select('#wrapper > div.content.clearfix > div.leftBox > div > div > ul > li:nth-of-type(3) > a')[0].stripped_strings
    


When the selector matches several tags, map() can be used to pull the text out of each one, as in the example just below.
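
A tiny self-contained example of both calls (the HTML fragment here is invented purely for illustration):

    from bs4 import BeautifulSoup

    html = '<ul><li>  朝阳  </li><li> - </li><li> 望京 </li></ul>'  # made-up fragment
    soup = BeautifulSoup(html, 'lxml')

    # stripped_strings: the tag's text with surrounding whitespace removed
    print(list(soup.select('li')[0].stripped_strings))     # ['朝阳']

    # map(): collect the text of every matched tag in one go
    print(list(map(lambda x: x.text, soup.select('li'))))  # ['  朝阳  ', ' - ', ' 望京 ']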

- Partway through the crawl, requests raised:

    requests.exceptions.ChunkedEncodingError: ("Connection broken: ConnectionResetError(54, 'Connection reset by peer')", ConnectionResetError(54, 'Connection reset by peer'))

A write-up of the same error: [python requests chunked](http://blog.csdn.net/wangzuxi/article/details/40377467).

##### Solution
Take the Accept-Encoding value from the browser's Request Headers, add it to the headers dict, and pass the dict to requests.get:

```headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36', 'Connection':'keep-alive', 'Accept-Encoding':'gzip, deflate'}```
- After about 5 minutes of continuous crawling: ```requests.exceptions.ConnectionError: None: Max retries exceeded with url: /qitawupin/o111/ (Caused by None)```

###### Solution
Switch to proxy IPs, as sketched below.
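
A minimal sketch of the proxy idea, reusing the proxy addresses that appear in the get_third_url code further down (the helper name get_with_proxy and the retry count are illustrative, not part of the original code): pick a random proxy for each request and fall back to another one when the connection fails.

    import random
    import requests

    proxy_list = ['http://117.177.250.151:8081', 'http://111.85.219.250:3129', 'http://122.70.183.138:8118']

    def get_with_proxy(url, headers, retries=3):
        # Try up to `retries` different proxies before giving up
        for _ in range(retries):
            proxies = {'http': random.choice(proxy_list)}
            try:
                return requests.get(url, headers=headers, proxies=proxies, timeout=10)
            except requests.exceptions.RequestException:
                continue
        return None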
    
    
# Code

- Get the category (second-level) URLs
    

    import requests
    from bs4 import BeautifulSoup

    first_url = 'http://bj.ganji.com/wu/'
    base_url = 'http://bj.ganji.com'

    # A category URL looks like: http://bj.ganji.com/jiaju/

    def get_second_url(url):
        web_data = requests.get(url)
        soup = BeautifulSoup(web_data.text, 'lxml')
        second_urls = soup.select('dl.fenlei dt a')
        for second_url in second_urls:
            whole_second_url = base_url + second_url.get('href')
            print(whole_second_url)

Run get_second_url(first_url) and copy the printed whole_second_url links; in get_second_url.py they are kept as one multi-line string (also named whole_second_url) that the main program later imports and splits.
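
A minimal sketch of what that pasted string in get_second_url.py might look like (only the two category paths that appear elsewhere in this post are shown; the real list is longer):

    # get_second_url.py, after running get_second_url(first_url) once:
    whole_second_url = '''
    http://bj.ganji.com/jiaju/
    http://bj.ganji.com/qitawupin/
    '''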
    
- Collect the item (third-level) URLs from every list page of each category and save them to MongoDB
    

    import requests, time, pymongo, random
    from bs4 import BeautifulSoup

    client = pymongo.MongoClient('localhost', 27017)
    ganji = client['ganji']
    whole_third_url = ganji['whole_third_url']
    item_info = ganji['item_info']

    headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36', 'Connection':'keep-alive', 'Accept-Encoding':'gzip, deflate'}

    proxy_list = ['http://117.177.250.151:8081', 'http://111.85.219.250:3129', 'http://122.70.183.138:8118',]
    proxy_ip = random.choice(proxy_list)
    proxies = {'http': proxy_ip}

    def get_third_url(whole_second_url, pages):
        whole_url = '{}o{}/'.format(whole_second_url, str(pages))
        web_data = requests.get(whole_url, headers=headers, proxies=proxies)
        # time.sleep(5)
        soup = BeautifulSoup(web_data.text, 'lxml')
        # Pages past the last one have no "next" button, so skip them
        if soup.find_all('a', {'class': 'next'}):
            for link in soup.select('li.js-item a.ft-tit'):
                third_url = link.get('href')
                whole_third_url.insert_one({'url': third_url})
                # print(third_url)
        else:
            pass

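For a quick check, get_third_url can be run by hand on a single category list page (the jiaju category URL from the first step is used here):

    # Store the item links from page 1 of the furniture category
    get_third_url('http://bj.ganji.com/jiaju/', 1)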
    
- Parse each item's detail page and save the fields to MongoDB
    

    def get_item_info(url):
        web_data = requests.get(url)
        soup = BeautifulSoup(web_data.text, 'lxml')
        title = soup.select('h1.title-name')[0].text if soup.find_all('h1', {'class': 'title-name'}) else None
        # Listings that have been taken down no longer have a title in their html, so skip them
        if title is None:
            pass
        else:
            time = list(soup.select('i.pr-5')[0].stripped_strings) if soup.find('i', {'class': 'pr-5'}) else None
            type = soup.select('#wrapper > div.content.clearfix > div.leftBox > div:nth-of-type(3) > div > ul > li:nth-of-type(1) > span > a')[0].text if soup.find_all('ul', {'class': 'det-infor'}) else None
            price = soup.select('i.f22.fc-orange.f-type')[0].text if soup.find_all('i', {'class': 'f22 fc-orange f-type'}) else None
            address = list(map(lambda x: x.text, soup.select('#wrapper > div.content.clearfix > div.leftBox > div > div > ul > li:nth-of-type(3) > a'))) if soup.find_all('li') else None
            old_new = soup.select('ul.second-det-infor.clearfix > li:nth-of-type(2) > label')[0].text if soup.select('ul.second-det-infor.clearfix > li:nth-of-type(2) > label') else None
            item_info.insert_one({'title': title, 'time': time, 'type': type, 'price': price, 'address': address, 'old_new': old_new})
            print(title, time, type, price, address, old_new)
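
Once the links are in MongoDB, the detail pages can be scraped by walking the whole_third_url collection; a minimal sequential sketch, assuming the code above lives in get_third_url.py:

    from get_third_url import whole_third_url, get_item_info

    # Feed every stored link to the detail-page parser
    for record in whole_third_url.find():
        get_item_info(record['url'])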
    
- Main program: crawl every category in parallel with multiprocessing
    
    

    from multiprocessing import Pool
    from get_second_url import whole_second_url
    from get_third_url import get_third_url
    from get_third_url import get_item_info

    def get_all_links_from(whole_second_url):
        # Each category is crawled for list pages 1-120
        for i in range(1, 121):
            get_third_url(whole_second_url, i)

    if __name__ == '__main__':
        pool = Pool()
        pool.map(get_all_links_from, whole_second_url.split())
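
The same Pool pattern can also drive get_item_info once all the item links have been collected; a sketch, not part of the original main program:

    from multiprocessing import Pool
    from get_third_url import whole_third_url, get_item_info

    if __name__ == '__main__':
        db_urls = [record['url'] for record in whole_third_url.find()]
        pool = Pool()
        pool.map(get_item_info, db_urls)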

    
- Monitor the crawl

A separate script prints how many item links have been collected so far, refreshing every 5 seconds:
    

    import time
    from get_third_url import whole_third_url

    while True:
        # Number of item links collected so far
        print(whole_third_url.find().count())
        time.sleep(5)
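
In newer pymongo releases, Cursor.count() has been removed; Collection.count_documents() is the replacement. A small variant that also reports how many item records have been parsed (the extra item_info count is an addition for convenience):

    import time
    from get_third_url import whole_third_url, item_info

    while True:
        # count_documents({}) replaces the removed cursor.count()
        print(whole_third_url.count_documents({}), item_info.count_documents({}))
        time.sleep(5)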