21.1.27


  • リクエスト、BeautifulSoup 4インストール後のデフォルト設定とページ数スクロール
  • import requests
    from bs4 import BeautifulSoup
    
    incruit_result = requests.get("https://search.incruit.com/list/search.asp?col=job&src=gsw*search&kw=%b9%e9%bf%a3%b5%e5&startno=0")
    
    incruit_soup = BeautifulSoup(incruit_result.text, "html.parser")
    
    paging = incruit_soup.find("p", {"class" : "sqr_paging sqr_pg_mid"})
    
    pages = paging.find_all("a")
    
    spans = []
    for page in pages:
      spans.append(page.find("span"))
    spans = spans[:-2]
    # span으로 2~5페이지까지 나오고 마지막에 단추 두 개를 지웠음.
    結局、すべてのページジャンプボタンの前にあるspan値をスクレイピングするしかないようです。Chrome側で何か変化があったのでしょうか…
  • 最終ページ
  • を検索
    import requests
    from bs4 import BeautifulSoup
    
    # Fetch the first Incruit job-search results page.
    incruit_result = requests.get("https://search.incruit.com/list/search.asp?col=job&src=gsw*search&kw=%b9%e9%bf%a3%b5%e5&startno=0")
    
    incruit_soup = BeautifulSoup(incruit_result.text, "html.parser")
    
    # Pagination container that holds the page-number links.
    paging = incruit_soup.find("p", {"class" : "sqr_paging sqr_pg_mid"})
    
    links = paging.find_all("a")
    
    # Take the <span> of each link, dropping the two trailing nav buttons.
    pages = [link.find("span") for link in links][:-2]
    # Second-to-last remaining span is the highest page (page 5 here).
    max_page = pages[-2]
    要求最大
  • ページ
    (incruit.py)
  • import requests
    from bs4 import BeautifulSoup
    
    LIMIT = 20
    URL = "https://search.incruit.com/list/search.asp?col=job&src=gsw*search&kw=%b9%e9%bf%a3%b5%e5&startno={LIMIT}"
    
    def extract_incruit_pages():
      result = requests.get(URL)
      soup = BeautifulSoup(result.text, "html.parser")
      paging = soup.find("p", {"class" : "sqr_paging sqr_pg_mid"})
      links = paging.find_all("a")
      pages = []
      for link in links[:-2]:
        pages.append(int(link.string))
      max_page = pages[-2]
      return max_page
    
    def extract_incruit_jobs(last_page):
      for page in range(last_page):
        result = requests.get(f"{URL}&startno={page*LIMIT}")
        print(result.status_code)
    
    (main.py)
    from incruit import extract_incruit_pages, extract_incruit_jobs
    
    # Detect the last search-results page, then scrape every page up to it.
    extract_incruit_jobs(extract_incruit_pages())
    誤字に注意すること.単純なミスでもう少しで詰まるところだった.幸い、すべてのページで200が返ってきた.
  • 選抜職務
  • まず1ページ目の求人を取得した.以前とは異なり、最後にget_text()を使用すると、BeautifulSoupでコンテンツを抽出しやすくなります.
    import requests
    from bs4 import BeautifulSoup
    
    LIMIT = 20
    # Base search URL; the paging offset is appended as "startno" per request.
    # (The original string embedded a literal "{LIMIT}" because it was not an
    # f-string, and then a second, conflicting startno parameter was appended.)
    URL = "https://search.incruit.com/list/search.asp?col=job&src=gsw*search&kw=%b9%e9%bf%a3%b5%e5"
    
    def extract_incruit_pages():
      """Return the highest page number shown in the pagination bar."""
      result = requests.get(f"{URL}&startno=0")
      soup = BeautifulSoup(result.text, "html.parser")
      paging = soup.find("p", {"class" : "sqr_paging sqr_pg_mid"})
      links = paging.find_all("a")
      pages = []
      # The last two anchors are navigation buttons, not page numbers.
      for link in links[:-2]:
        pages.append(int(link.string))
      max_page = pages[-2]
      return max_page
    
    def extract_incruit_jobs(last_page):
      """Print the job titles found on the first result page.
    
      Work in progress: only page 0 is fetched and jobs is still returned
      empty; the full-page loop is deliberately commented out for now.
      """
      jobs = []
      #for page in range(last_page):
      result = requests.get(f"{URL}&startno={0*LIMIT}")
      soup = BeautifulSoup(result.text, "html.parser")
      # Do not reuse "result" (the HTTP response) as the loop variable.
      details = soup.find_all("p", {"class" : "detail"})
      for detail in details:
        title_case = detail.find("span", {"class" : "rcrtTitle"})
        title = title_case.find("a").get_text()
        print(title)
      return jobs
    main.pyは触らなかった.
  • 社名を抽出します.
  • こちらもNoneの値が出てきます.しかしこれはアンカータグに囲まれていないテキストであり,いずれにしてもスクレイピングしても何の結果も得られない.残りはif文でstrとして処理します.
    import requests
    from bs4 import BeautifulSoup
    
    LIMIT = 20
    # Base search URL; the paging offset is appended as "startno" per request.
    # (The original string embedded a literal "{LIMIT}" because it was not an
    # f-string, and then a second, conflicting startno parameter was appended.)
    URL = "https://search.incruit.com/list/search.asp?col=job&src=gsw*search&kw=%b9%e9%bf%a3%b5%e5"
    
    def extract_incruit_pages():
      """Return the highest page number shown in the pagination bar."""
      result = requests.get(f"{URL}&startno=0")
      soup = BeautifulSoup(result.text, "html.parser")
      paging = soup.find("p", {"class" : "sqr_paging sqr_pg_mid"})
      links = paging.find_all("a")
      pages = []
      # The last two anchors are navigation buttons, not page numbers.
      for link in links[:-2]:
        pages.append(int(link.string))
      max_page = pages[-2]
      return max_page
    
    def extract_incruit_jobs(last_page):
      """Print the company names found on the first result page.
    
      Work in progress: only page 0 is fetched, the collected title is not
      yet used, and jobs is still returned empty.
      """
      jobs = []
      #for page in range(last_page):
      result = requests.get(f"{URL}&startno={0*LIMIT}")
      soup = BeautifulSoup(result.text, "html.parser")
      # Do not reuse "result" (the HTTP response) as the loop variable.
      details = soup.find_all("p", {"class" : "detail"})
      for detail in details:
        title = detail.find("span", {"class" : "rcrtTitle"}).find("a").get_text()
    
      for heading in soup.find_all("h3"):
        anchor = heading.find("a")
        # Some <h3> headings carry bare text with no anchor, so find("a")
        # returns None; keep the original fallback output of "None".
        # (The original wrote `company == result.find("h3")`, an obfuscated
        # None comparison, since an <h3> contains no nested <h3>.)
        if anchor is None:
          company = str(None)
        else:
          company = str(anchor.string)
        print(company)
      return jobs
  • 枚だけ印刷したいのですが、他の資料と縛られています.このページは大変ですね.まず、役職と会社名のみを出力するスクロールで終了します.
  • (incruit.py)
    import requests
    from bs4 import BeautifulSoup
    
    LIMIT = 20
    # Base search URL; the paging offset is appended as "startno" per request.
    # (The original string embedded a literal "{LIMIT}" because it was not an
    # f-string, and then a second, conflicting startno parameter was appended.)
    URL = "https://search.incruit.com/list/search.asp?col=job&src=gsw*search&kw=%b9%e9%bf%a3%b5%e5"
    
    def extract_incruit_pages():
      """Return the highest page number shown in the pagination bar."""
      result = requests.get(f"{URL}&startno=0")
      soup = BeautifulSoup(result.text, "html.parser")
      paging = soup.find("p", {"class" : "sqr_paging sqr_pg_mid"})
      links = paging.find_all("a")
      pages = []
      # The last two anchors are navigation buttons, not page numbers.
      for link in links[:-2]:
        pages.append(int(link.string))
      max_page = pages[-2]
      return max_page
    
    def extract_incruit_jobs(last_page):
      """Print "company:title" lines for every result page.
    
      NOTE(review): titles and companies come from two independent element
      lists, so `title` is whatever the LAST <p class="detail"> on the page
      held — every company on a page is printed with that same title.
      Confirm the page structure before pairing these per posting.
      """
      jobs = []
      for page in range(last_page):
        result = requests.get(f"{URL}&startno={page*LIMIT}")
        soup = BeautifulSoup(result.text, "html.parser")
        details = soup.find_all("p", {"class" : "detail"})
        for detail in details:
          title = detail.find("span", {"class" : "rcrtTitle"}).find("a").get_text()
    
        for heading in soup.find_all("h3"):
          anchor = heading.find("a")
          # Headings without an anchor yield None; keep the "None" output.
          if anchor is None:
            company = str(None)
          else:
            company = str(anchor.string)
          # Was `print({company + ':' + title})` — the braces built a
          # one-element SET, so the output looked like {'company:title'}.
          print(company + ':' + title)
      return jobs
    (main.py)
    from incruit import extract_incruit_pages, extract_incruit_jobs
    
    # Detect the last search-results page number once...
    last_incruit_page = extract_incruit_pages()
    
    # ...then scrape every page up to that bound.
    extract_incruit_jobs(last_incruit_page)
    総評:このウェブページは他のページと異なり構造(liでまとめられている等)が整理されていないため、学んだ内容より実装できた部分が少ない(役職と社名のみ).途中に社名のない項目が混ざっていてもif文で解決できたのは良い経験だった.またstringのほかにget_text()を使って文字列を取得することも学んだ.
    返信リンク