データベースに格納されているリンクの内容をキャプチャ

5067 ワード

  • 以下は、情報をキャプチャする関数です。
    def get_item_info(url):
        """Scrape one listing page and store the extracted fields.

        Fetches ``url`` with ``requests`` and parses the HTML with
        BeautifulSoup (lxml parser).  When the page contains an
        ``h1.title-name`` element, a record is inserted into ``item_info``
        and printed; pages without that heading are skipped and nothing is
        stored.

        The record has the keys ``time``, ``type``, ``price``, ``address``
        and ``old_new``.  Each value is ``None`` when the element guarding
        it is not found on the page.

        Args:
            url: Address of the page to fetch.

        Returns:
            None.
        """
        web_data = requests.get(url)
        soup = BeautifulSoup(web_data.text, 'lxml')

        # Guard clause: without the title heading the page is skipped.
        if not soup.find_all('h1', {'class': 'title-name'}):
            return

        # Common prefix of the long positional selectors used below.
        left_box = '#wrapper > div.content.clearfix > div.leftBox'

        # NOTE: the original author also noted
        # 'ul.det-infor > li:nth-of-type(1) > span' as a selector here
        # (presumably a shorter alternative for the category -- confirm).
        type_nodes = soup.select(
            left_box
            + ' > div:nth-of-type(3) > div > ul > li:nth-of-type(1) > span > a'
        )
        address_nodes = soup.select(
            left_box + ' > div > div > ul > li:nth-of-type(3) > a'
        )
        # Selected once and reused for both the guard and the value.
        old_new_nodes = soup.select(
            left_box
            + ' > div:nth-of-type(4) > div.det-summary > div'
            + ' > div.second-dt-bewrite > ul > li:nth-of-type(1)'
        )

        data = {
            'time': (
                list(soup.select('i.pr-5')[0].stripped_strings)[0].split()[0]
                if soup.find('i', {'class': 'pr-5'}) else None
            ),
            # The guard element (ul.det-infor) is not the element indexed,
            # so also require a non-empty selection to avoid an IndexError.
            'type': (
                type_nodes[0].text
                if soup.find_all('ul', {'class': 'det-infor'}) and type_nodes
                else None
            ),
            'price': (
                soup.select('i.f22.fc-orange.f-type')[0].text
                if soup.find_all('i', {'class': 'f22 fc-orange f-type'})
                else None
            ),
            'address': (
                [node.text for node in address_nodes]
                if soup.find_all('li') else None
            ),
            'old_new': (
                list(old_new_nodes[0].stripped_strings)
                if old_new_nodes else None
            ),
        }

        item_info.insert_one(data)
        print(data)
    
  • 次は上の関数を呼び出すコードです。

  • from get_third_url import get_item_info
    from get_third_url import whole_third_url#whole_third_url 
    from multiprocessing import Pool
    import requests
    # Script entry point: creates a multiprocessing Pool and maps
    # get_item_info over the result of whole_third_url.find(...).
    if __name__ == '__main__':   
         pool = Pool()    
         try:       
            # NOTE: find() is given the list ['url'] as its first argument.
            # This is the call that raises the TypeError ("filter must be an
            # instance of dict ...") shown in the traceback after this snippet.
            pool.map(get_item_info,whole_third_url.find(['url']))    
        except requests.exceptions.InvalidSchema:        
            # requests' InvalidSchema is silently ignored here.
            pass
    

    すると、次のようなエラーが発生しました。
    Traceback (most recent call last):
      File "/Users/wangpegnfei/Desktop/Plan-for-combating-master/week2/week2_homework/myself/action2.py", line 10, in 
        pool.map(get_item_info,whole_third_url.find(['url']))
      File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pymongo/collection.py", line 1137, in find
        return Cursor(self, *args, **kwargs)
      File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pymongo/cursor.py", line 121, in __init__
        validate_is_mapping("filter", spec)
      File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pymongo/common.py", line 375, in validate_is_mapping
        "collections.Mapping" % (option,))
    TypeError: filter must be an instance of dict, bson.son.SON, or other type that inherits from collections.Mapping
    

    エラーの種類:フィルタ(find() の第1引数)は辞書(dict)のインスタンスでなければなりません。上のコードでは、辞書ではなくリスト ['url'] を渡していたことが原因です。
    2016.5.20 更新:上のコードには少し問題があったため、コードを少し変更しました。
  • 以下は、情報をキャプチャする関数です。
    def get_item_info(url, data=None):
        """Fetch one listing page, extract its fields and store them.

        Requests ``url`` using the module-level ``headers``.  A response
        whose status code is not 200 is skipped.  Otherwise the HTML is
        parsed with BeautifulSoup (lxml parser), a record is built, printed
        and inserted into ``item_info``.

        The record has the keys ``title``, ``price``, ``pub_date``,
        ``area``, ``cates``, ``state`` and ``url``.

        Args:
            url: Address of the page to fetch; also stored in the record.
            data: Unused; kept so existing callers keep working.  Any value
                passed in is overwritten by the record built here.

        Returns:
            None.
        """
        # NOTE(review): the original comment at this point was garbled in
        # extraction (only ", ip" survives); its intent cannot be recovered.
        wb_data = requests.get(url, headers=headers)
        if wb_data.status_code != 200:
            return

        soup = BeautifulSoup(wb_data.text, 'lxml')
        prices = soup.select('.f22.fc-orange.f-type')
        pub_dates = soup.select('.pr-5')
        areas = soup.select('ul.det-infor > li:nth-of-type(3) > a')
        cates = soup.select('ul.det-infor > li:nth-of-type(1) > span')
        print(areas)

        # Select the "state" items once; also require a non-empty selection
        # so that indexing [0] cannot raise IndexError.
        state = None
        state_items = soup.select('ul.second-det-infor.clearfix > li')
        if soup.find('ul', 'second-det-infor') and state_items:
            parts = state_items[0].text.split(':')
            # NOTE(review): a stripped string can never equal ' ', so as
            # written this branch is never taken and ``state`` stays None.
            # The expected label text was presumably lost from this literal
            # -- confirm the intended label before changing it.
            if parts[0].strip() == ' ':
                state = parts[-1].strip()

        data = {
            'title': soup.title.text.strip(),
            'price': prices[0].text.strip() if prices else 0,
            'pub_date': (
                pub_dates[0].text.strip().split(' ')[0] if pub_dates else ""
            ),
            # "-" entries are placeholders and are left out.
            'area': [
                area.text.strip()
                for area in areas
                if area.text.strip() != "-"
            ],
            'cates': [cate.text.strip() for cate in cates],
            'state': state,
            'url': url,
        }
        print(data)
        item_info.insert_one(data)
    
  • 次は上の関数を呼び出すコードです。

  • from get_third_url import get_item_info
    from get_third_url import whole_third_url
    from multiprocessing import Pool
    
    if __name__ == '__main__':
        # Gather every URL first and hand the whole list to a single
        # pool.map call.  Calling pool.map once per document (with a
        # one-element list) blocks on each page, so the workers would never
        # run in parallel.
        # str.split() is kept from the original: it strips surrounding
        # whitespace and yields each whitespace-separated URL in the field.
        urls = [
            url
            for doc in whole_third_url.find()
            for url in doc['url'].split()
        ]
        # The context manager releases the worker processes once map returns.
        with Pool() as pool:
            pool.map(get_item_info, urls)