IPプロキシを使用して求人情報を取得

7387 ワード

from bs4 import BeautifulSoup
import requests
import ip_proxy
from urllib import parse

# Request headers passed to the requests.get() calls that fetch job pages.
# The only entry is a desktop Chrome (Windows) User-Agent string.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
}


def _build_proxies(my_ip):
    """Return the ``proxies`` mapping for requests from the proxy held by *my_ip*.

    Both the ``http`` and ``https`` schemes are routed through the same
    ``http://<ip_proxy_str>`` endpoint.
    """
    proxy_url = 'http://' + my_ip.ip_proxy_str
    return {'http': proxy_url, 'https': proxy_url}


def get_boss_info(my_ip, detailed_url):
    """Fetch one job-detail page through the proxy and print its main fields.

    Prints, in order: the text of the first ``<h1>``, the text of the first
    ``span.badge``, the first ``div.info-primary > p`` paragraph, and the
    first ``div.text`` element.

    Args:
        my_ip: Object exposing ``ip_proxy_str`` (a ``host:port`` string).
        detailed_url: Absolute URL of the job-detail page.

    Raises:
        requests.RequestException: On network failure or the 5 s timeout.
        AttributeError, IndexError: When an expected element is missing from
            the returned HTML. Callers treat any exception as a cue to switch
            proxy and retry.
    """
    # Example: 'https://www.zhipin.com/job_detail/7e883f0c3a336cb51n142968FFM~.html?ka=search_list_1'
    response = requests.get(
        detailed_url,
        headers=headers,
        proxies=_build_proxies(my_ip),
        timeout=5,
    )

    soup = BeautifulSoup(response.text, 'lxml')

    title = soup.find('h1').text
    salary = soup.find('span', class_="badge").text.replace('\n', '').strip()
    print(title)
    print(salary)

    gezhong_info = soup.select('div.info-primary > p')[0].text.replace('\n', '').strip()
    print(gezhong_info)

    gangwei_info = soup.select('div.text')[0].text
    print(gangwei_info)


def get_detail_url(my_ip, url):
    """Fetch one search-result page and process every job link found on it.

    Each link's ``href`` is resolved against *url*, printed, and passed to
    ``get_boss_info``. A failing detail fetch is retried up to 3 times, with
    a fresh proxy requested after every failure.

    Args:
        my_ip: Object exposing ``ip_proxy_str`` and ``update_ip_proxy_str()``.
        url: Absolute URL of the search-result page.

    Raises:
        requests.RequestException: If fetching the list page itself fails
            (the caller retries with a new proxy).
    """
    # Example: 'https://www.zhipin.com/c101010100/h_101010100/?query=python&page=2&ka=page-2'
    response = requests.get(
        url,
        headers=headers,
        proxies=_build_proxies(my_ip),
        timeout=5,
    )
    soup = BeautifulSoup(response.text, 'lxml')

    a_ele_list = soup.select('div.job-list > ul > li div.info-primary > h3 > a')
    for a_ele in a_ele_list:
        # The href may be relative, so resolve it against the list-page URL.
        a_href = a_ele['href']
        href = parse.urljoin(url, a_href)
        print(' href: ' + href)

        # Best-effort retry: any failure swaps the proxy and tries again.
        for _ in range(3):
            try:
                get_boss_info(my_ip, href)
                break
            except Exception as e:
                print(e)
                my_ip.update_ip_proxy_str()


def get_all_info(my_ip):
    """Crawl search-result pages 1 to 3 of the hard-coded python job query.

    Each page is attempted up to 4 times; after every failure the error is
    printed and a new proxy is requested.

    Args:
        my_ip: Object exposing ``ip_proxy_str`` and ``update_ip_proxy_str()``.
    """
    base_url = 'https://www.zhipin.com/c101010100/h_101010100/?query=python&page=%s&ka=page-%s'
    for i in range(1, 4):
        # The page number appears twice in the URL template.
        url = base_url % (i, i)

        # Use a throwaway loop variable so the page counter is not shadowed.
        for _ in range(4):
            try:
                get_detail_url(my_ip, url)
                break
            except Exception as e:
                print(e)
                my_ip.update_ip_proxy_str()


if __name__ == '__main__':
    # Obtain an initial proxy, then crawl the listing pages.
    my_ip = ip_proxy.ip_getter()
    get_all_info(my_ip)


# -------------------------------------------------------------
# NOTE(review): everything below appears to be the contents of the separate
# ``ip_proxy`` module imported by the script above - confirm and keep it in
# its own file (ip_proxy.py).
import requests


class ip_getter(object):
    """Holds the current proxy address as a string in ``ip_proxy_str``."""

    def __init__(self):
        self.ip_proxy_str = get_ip_string()

    def update_ip_proxy_str(self):
        """Replace the current proxy with a newly fetched one and print it."""
        self.ip_proxy_str = get_ip_string()
        print('get one ip : ' + self.ip_proxy_str)


def get_ip_string():
    """Request one proxy address from the kdlapi endpoint and return it.

    Returns:
        The response body with surrounding whitespace removed, so that a
        trailing newline cannot end up inside the proxy URL.
    """
    url = 'http://dps.kdlapi.com/api/getdps/?orderid=963491899590153&num=1&pt=1&ut=1&dedup=1&sep=1'
    response = requests.get(url)
    return response.text.strip()