A Web Crawler in Ruby


A crawler I wrote a while ago: it scrapes company data, stores it in a MySQL database, and supports multithreading.
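
The exporter classes (MysqlExport, CsvExport) are not included in this post; the crawler only expects an object that responds to doExport(company). A minimal sketch of what MysqlExport might look like, assuming the mysql gem and a hypothetical companies table with name/phone accessors on the company object:

require 'mysql'

#exporter sketch: the connection details, the companies table and the
#name/phone accessors are assumptions, not the original code
class MysqlExport
	def initialize
		@db = Mysql.real_connect('localhost', 'user', 'password', 'crawler')
	end
	
	def doExport(company)
		#escape values before interpolating them into the SQL statement
		name = Mysql.escape_string(company.name)
		phone = Mysql.escape_string(company.phone)
		@db.query("INSERT INTO companies (name, phone) VALUES ('#{name}', '#{phone}')")
	end
end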

#! /usr/local/bin/ruby 

require 'hpricot'
require "open-uri"
require 'net/http'
require 'CompanyLink'
require 'parser/CompanyParser'
require 'parser/CompanyParser2'
require 'export/CsvExport'
require 'export/MysqlExport'

class AlibabaWebCatcher
	
	def initialize
		@exporter = nil
	end
	
	#the exporter must respond to doExport(company)
	attr_writer :exporter
	
	#collect by given company site
	def collectByCompanyLink(companySite)
		companyLink = CompanyLink.new(companySite)
		companyHostUri = companyLink.hostUri
		companyIntroduceUri = companyLink.introduceUri
		companyContactUri = companyLink.contactUri
		
		begin
			puts 'Collect company : ' + companySite
			http = Net::HTTP.new(companyHostUri, 80)
			
			puts 'Opening Page: http://' + companyHostUri + companyIntroduceUri
			resp, data = http.get(companyIntroduceUri, nil)
			#puts data
			if resp.message == "OK" then
				puts 'Opened Page: http://' + companyHostUri + companyIntroduceUri
				companyLink.parser.parseProfile(data)
			else
				puts 'Opening Page: http://' + companyHostUri + companyIntroduceUri + ' failed'
			end
			
			puts 'Opening Page: http://' + companyHostUri + companyContactUri
			resp2, data2 = http.get(companyContactUri, nil)
			#puts data2
			if resp2.message == 'OK' then
				puts 'Opened Page: http://' + companyHostUri + companyContactUri
				companyLink.parser.parseContact(data2)
			else
				puts 'Opening Page: http://' + companyHostUri + companyContactUri + ' failed'
			end
			
			if @exporter.nil? then
				puts 'Please provide an exporter for the export function.'
			else
				@exporter.doExport(companyLink.parser.getCompany)
			end
		rescue Exception => err
			puts 'Exception happened, caused by: ' + err.to_s
			#raise err
		ensure
			puts 'Collect company : ' + companySite + ' finished.'
		end
	end
	
	#collect by company list
	def collectByCompanyList(companyListUrl)
		companySearchHostUri = getCompanySearchHost
		companySearchUri = companyListUrl.split(/alibaba\.com/)[1]
		#puts companySearchUri
		begin
			http = Net::HTTP.new(companySearchHostUri, 80)
			puts 'Opening Page: http://' + companySearchHostUri + companySearchUri
			resp, data = http.get(companySearchUri, nil)
			#puts data
			if resp.message == "OK"
				puts 'Opened Page: http://' + companySearchHostUri + companySearchUri
				#parse company list
				companySites = parseCompanySites(data)
				for companySite in companySites
					collectByCompanyLink(companySite)
				end
				nextPage = getNextSearchPage(data)
				if !nextPage.nil? then
					collectByCompanyList(nextPage)
				end
			else
				puts 'Opening Page: http://' + companySearchHostUri + companySearchUri + ' failed'
			end
			
		rescue Exception => err
			puts 'Exception happened, caused by: ' + err.to_s
			raise err
		end
	end
	
	#collect companies by search keywords
	def collectByKeywords(searchKeywords)
		puts 'Searching ... ' + searchKeywords
		
		companySearchHostUri = getCompanySearchHost()
		companySearchUri = getCompanySearchUri(searchKeywords)
		
		collectByCompanyList(companySearchHostUri + companySearchUri)
	end
	
	#collect companies by search keywords, one thread per result page
	def collectByKeywordsByThread(searchKeywords)
		puts 'Searching ... ' + searchKeywords
		
		companySearchHostUri = getCompanySearchHost()
		companySearchUri = getCompanySearchUri(searchKeywords)
		
		http = Net::HTTP.new(companySearchHostUri, 80)
		puts 'Opening Page: http://' + companySearchHostUri + companySearchUri
		resp, data = http.get(companySearchUri, nil)
		#puts data
		if resp.message == "OK" then
			puts 'Opened Page: http://' + companySearchHostUri + companySearchUri
			totalPage = getTotalPage(data)
			secondPage = getNextSearchPage(data)
			if secondPage.nil? then
				#only one result page, so collect its companies directly
				for companySite in parseCompanySites(data)
					collectByCompanyLink(companySite)
				end
				return
			end
			uri = '/company' + secondPage.split(/\/company/)[1]
			#puts uri
			threads = []
			for i in 1..totalPage
				#pass the page number into the thread; sharing the loop variable
				#across threads would be a race condition
				threads << Thread.new(companySearchHostUri, i) do |page, pageNum|
					begin
						h = Net::HTTP.new(page, 80)
						#build a thread-local uri instead of mutating the shared one
						pageUri = uri.gsub(/\d+\.html/, pageNum.to_s + '.html')
						#puts pageUri
						puts "Opening Page: http://#{page}#{pageUri}"
						resp_t, data_t = h.get(pageUri, nil)
						#puts data_t
						#puts resp_t.message
						if resp_t.message == "OK" then
							puts "Opened Page: http://#{page}#{pageUri}"
							#parse company list
							companySites = parseCompanySites(data_t)
							for companySite in companySites
								collectByCompanyLink(companySite)
							end
						else
							puts "Opening Page: http://#{page}#{pageUri} failed."
						end
					rescue Exception => err
						puts 'Exception thrown while fetching page ' + pageNum.to_s + ': ' + err.to_s
						#raise
					end
				end
			end
			
			threads.each { |t| t.join }

		else
			puts 'Opening Page: http://' + companySearchHostUri + companySearchUri + ' failed'
		end
			
	end
	
	protected
		#parse the company site list
		def parseCompanySites(companyListHtml)
			companySites = Array.new
			doc = Hpricot(companyListHtml)
			
			doc.search('//div[@class="content"]/div[@class="info"]/span[@class="m undline"]/a').each do |item|
				companyHref = item.attributes['href'] 
				puts 'Company site: ' + companyHref
				companySites << companyHref
			end
			return companySites
		end
		
	private
		#get company search host
		def getCompanySearchHost
			return 'search.china.alibaba.com'
		end
		#get company search uri according to keywords
		def getCompanySearchUri(keywords)
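			#province filter is the GBK percent-encoding of "江苏,浙江,上海" (Jiangsu, Zhejiang, Shanghai)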
			return '/search/company_search.htm?filt=y&categoryId=0&maxCatId=&isNoRepost=false&descend_order=&show_fid=&cat_field=' + 
				'&tradeType=&searchType=&pageSize=30&sm=&seType=&townId=0&onlineStatus=all&memberlevel=' + 
				'&province=%BD%AD%CB%D5%2C%D5%E3%BD%AD%2C%C9%CF%BA%A3&city=&biztype=&established_year=' + 
				'&keywords=' + URI.escape(keywords)
			#return '/company/' + URI.escape(keywords, 'utf-8') + '/1.html?province=%BD%AD%CB%D5%2C%D5%E3%BD%AD%2C%C9%CF%BA%A3'
		end
		#get company search next page
		def getNextSearchPage(data)
			doc = Hpricot(data)
			
			#match the anchor whose text is the site's Chinese "next page" label
			nextPageHref = doc.at('//div[@class="pages"]/div[@class="list_offer_pages"]/h1/b/a[text()="下一页"]')
			if !nextPageHref.nil? then
				#puts 'Next page: ' + nextPageHref.attributes['href']
				return nextPageHref.attributes['href']
			end
		end
		#get company search total page
		def getTotalPage(data)
			#grab the result count from the highlighted span; 30 results per page
			totalItems = data.scan(/<span class="red sm">(\d+)<\/span>/)[0]
			if !totalItems.nil? then
				totalCount = totalItems.last.to_i
				#round the page count up for a partial last page
				totalPage = (totalCount % 30 == 0) ? totalCount / 30 : (totalCount / 30 + 1)
				puts 'Found ' + totalCount.to_s + ' items in ' + totalPage.to_s + ' pages.'
				return totalPage
			else
				return 0
			end
		end
		
end		
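
CompanyLink and the parser classes are not shown either; the crawler only relies on a small interface: hostUri, introduceUri, contactUri, and a parser responding to parseProfile, parseContact and getCompany. A sketch of that surface, with hypothetical page paths:

require 'uri'

#interface sketch only: the two page paths and the parser choice are
#placeholders, not the original implementation
class CompanyLink
	attr_reader :hostUri, :introduceUri, :contactUri, :parser
	
	def initialize(companySite)
		#companySite is an href taken from the search-result list
		@hostUri = URI.parse(companySite).host
		@introduceUri = '/introduce.html'	#hypothetical profile page path
		@contactUri = '/contact.html'	#hypothetical contact page path
		@parser = CompanyParser.new	#CompanyParser2 would handle an alternate layout
	end
end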

Usage is simple: just pass in your search keywords.

#! /usr/local/bin/ruby 

require 'test/unit'
require 'AlibabaWebCatcher'
require 'export/CsvExport'
require 'export/MysqlExport'

class AlibabaWebCatcherTest < Test::Unit::TestCase
	
	def setup
		@webCatcher = AlibabaWebCatcher.new
		#@webCatcher.exporter = CsvExport.new('test.csv')
		@webCatcher.exporter = MysqlExport.new
	end
	
	def teardown
		@webCatcher = nil
	end
	
	def test_download
		assert_not_nil(@webCatcher)
		#@webCatcher.collectByCompanyShortName('qianbusha')
		#replace with your own search keywords
		@webCatcher.collectByKeywords('your keywords')
		assert(true, 'Should reach this point')
	end
	
end
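
Assuming the hpricot and mysql gems are installed and the current directory is on Ruby's load path, the test runs directly:

ruby AlibabaWebCatcherTest.rb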