Python(2) - Scraping Stack Overflow

Scraping the Stack Overflow Jobs site

Example of the Scraping Process

  • Request the jobs page and load the response object containing the HTML.
  • Parse the HTML in the result object with BeautifulSoup.
  • Find the "div" that has the s-pagination class name to read the last page number (see the pagination sketch after this list).
  • Find the title, company name, location, and link for every job card included in each page.
  • Code
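
Why links[-2] in the code below yields the last page number: the pagination's final anchor is the "next" button, so the page count sits one link earlier. A minimal, self-contained sketch using a made-up HTML fragment (the real Stack Overflow markup may differ):

    from bs4 import BeautifulSoup

    # Hypothetical pagination fragment; the real markup may differ
    sample = """
    <div class="s-pagination">
      <a href="?pg=1">1</a>
      <a href="?pg=2">2</a>
      <a href="?pg=24">24</a>
      <a href="?pg=2">next</a>
    </div>
    """
    soup = BeautifulSoup(sample, "html.parser")
    pagination = soup.find("div", {"class": "s-pagination"})
    links = pagination.find_all("a")
    # The last link is "next", so the second-to-last is the last page number
    print(int(links[-2].get_text(strip=True)))  # -> 24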


    main.py

    from so import get_jobs as get_so_jobs
    jobs = get_so_jobs()
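    # Illustrative check, not part of the original script:
    print(f"Scraped {len(jobs)} Stack Overflow jobs")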

    so.py

    import requests
    from bs4 import BeautifulSoup

    # Listing URL, filtered to Python jobs
    URL = "https://stackoverflow.com/jobs?q=python"
    
    def get_last_page():
      # Request the listing page and parse the pagination links
      result = requests.get(URL)
      soup = BeautifulSoup(result.text, "html.parser")
      pagination = soup.find("div", {"class": "s-pagination"})
      links = pagination.find_all("a")
      # The last link is "next", so the one before it holds the last page number
      last_page = links[-2].get_text(strip=True)

      return int(last_page)
      
    def extract_job(html):
      # The job title is stored in the title attribute of the link inside the h2
      title = html.find("h2", {"class": "mb4"}).find("a")["title"]
      # The h3 holds exactly two direct child spans: company and location
      company, location = html.find("h3", {"class": "mb4"}).find_all("span", recursive=False)
      company = company.get_text(strip=True)
      location = location.get_text(strip=True)
      # Each job card carries its id in the data-jobid attribute
      job_id = html["data-jobid"]
      link = f"https://stackoverflow.com/jobs/{job_id}"
      # link = "https://stackoverflow.com/" + html.find("h2", {"class": "mb4"}).find("a")["href"]
      return {"title": title, "company": company, "location": location, "link": link}
    
    def extract_jobs(last_page):
      jobs = []
      for page in range(last_page):
        print(f"Scraping SO page {page + 1}")
        # Pages are 1-indexed in the query string
        res = requests.get(f"{URL}&pg={page+1}")
        html = BeautifulSoup(res.text, "html.parser")
        # Each job posting sits in a div with the "-job" class
        job_cards = html.find_all("div", {"class": "-job"})

        for job_card in job_cards:
          job = extract_job(job_card)
          jobs.append(job)

      return jobs
    
    
    def get_jobs():
      # Entry point: find how many pages exist, then scrape them all
      last_page = get_last_page()
      jobs = extract_jobs(last_page)

      return jobs
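
The list returned by get_jobs() can then be persisted. A minimal sketch of writing it to CSV with the standard library (the save_to_file name and the jobs.csv filename are assumptions, not part of the original code):

    import csv

    def save_to_file(jobs):
      # Column names match the dict keys returned by extract_job()
      with open("jobs.csv", mode="w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["title", "company", "location", "link"])
        writer.writeheader()
        writer.writerows(jobs)

main.py could then call save_to_file(jobs) right after get_so_jobs().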