Scraping teachers' personal information from a school website with Scrapy

With the Scrapy and BeautifulSoup frameworks, we can easily pull whatever information we need out of a web page, whether it is Chinese text, numbers, or English, by searching for it in whatever way we like.
This time we want to grab the personal information of every teacher on our school's faculty page: name, job title, phone number, office address, and email.
The pipeline is as follows:
from teacher1 import settings
import os
import urllib
from bs4 import BeautifulSoup

class TeacherPipeline(object):
    def process_item(self, item, spider):
        dir_path = '%s/%s' % (settings.PAGES_STORE, spider.name)  # output directory for this spider
        print 'dir_path', dir_path
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

        file_path = '%s/teacherList.doc' % dir_path  # single file collecting every teacher's record
        with open(file_path, 'wb') as file_writer:
            for page_url in item['page_urls']:
                print 'http://me.sjtu.edu.cn/sz_minglu/' + page_url
                # Fetch the profile page; the site is GBK-encoded, so
                # transcode it to UTF-8 before handing it to BeautifulSoup.
                html = urllib.urlopen('http://me.sjtu.edu.cn/sz_minglu/' + page_url).read()
                soup1 = BeautifulSoup(html.decode('GBK', 'ignore').encode('utf-8', 'ignore'))
                # Skip pages that lack a name field.
                if soup1.find("span", attrs={"id": "lblName"}) is None:
                    continue
                headitems = soup1.find("span", attrs={"id": "lblName"}).getText().encode('utf-8', 'ignore')
                print headitems.decode('utf-8', 'ignore')
                title = soup1.find("span", attrs={"id": "lblzhicheng"}).getText().encode('utf-8', 'ignore')
                phonenum = soup1.find("span", attrs={"id": "lbldianhua"}).getText().encode('utf-8', 'ignore')
                addr = soup1.find("span", attrs={"id": "lbladdress"}).getText().encode('utf-8', 'ignore')
                email = soup1.find("span", attrs={"id": "lblemail"}).getText().encode('utf-8', 'ignore')
                file_writer.write(headitems + '\t' + title + '\n'
                                  + 'Phone: ' + phonenum + '\n'
                                  + 'Office: ' + addr + '\n'
                                  + 'Email: ' + email + '\n\n')
        return item
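For the pipeline to fire, the project settings must define PAGES_STORE and register the pipeline class. The original post doesn't show settings.py; a minimal sketch, assuming the project is named teacher1 (the PAGES_STORE path here is an arbitrary example):

# teacher1/settings.py (sketch; only the parts the pipeline relies on)
BOT_NAME = 'teacher1'
SPIDER_MODULES = ['teacher1.spiders']

# Register the pipeline so Scrapy calls process_item for each yielded item.
ITEM_PIPELINES = {'teacher1.pipelines.TeacherPipeline': 300}

# Base directory for scraped output; the actual path is an assumption.
PAGES_STORE = './pages'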
The spider is as follows:
import scrapy
from teacher1.items import TeacherItem


#from scrapy.crawler import CrawlerProcess
from bs4 import BeautifulSoup

class teacherSpider(scrapy.Spider):
    name = 'teacher'
    allowed_domains = []
    #start_urls = ["http://jandan.net/ooxx"]
    start_urls = ["http://me.sjtu.edu.cn/sz_minglu/Default.aspx?cid=4"]
    def parse(self, response):
        item = TeacherItem()
        # Collect every teacher's profile URL from the faculty index page.
        item['page_urls'] = response.xpath('//a[@class="amingluCss"]//@href').extract()
        print 'teacher_urls', item['page_urls']
        yield item
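The spider imports TeacherItem from teacher1.items, which the post never shows. Given that only a page_urls field is used, a minimal definition would be:

# teacher1/items.py -- minimal sketch inferred from the spider's usage
import scrapy

class TeacherItem(scrapy.Item):
    # List of relative URLs of individual teacher profile pages.
    page_urls = scrapy.Field()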
Running the spider produces a Word document named teacherList.doc that lists every teacher's personal information.
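The usual way to launch it is scrapy crawl teacher from the project root. The commented-out CrawlerProcess import in the spider file suggests a programmatic launch was also considered; a minimal sketch of that approach (assuming Scrapy >= 1.0, and that the spider lives at teacher1/spiders/teacher.py):

# run.py -- sketch of launching the spider from a plain Python script,
# placed in the Scrapy project root (file name and location are assumptions)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from teacher1.spiders.teacher import teacherSpider

process = CrawlerProcess(get_project_settings())
process.crawl(teacherSpider)
process.start()  # blocks until the crawl finishes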