RCurlを利用して映画の団体購入情報を取得する

2915 ワード

1. 対象サイトは「360団体購入」です。
http://tuan.360.cn/bei_jing/c_0.html?kw=映画&pageno=1#tuanFilter
2. FirefoxのFirebugプラグインを使用してページのソースコードを分析し、取得したい要素を以下のXPathで指定します。
(ここに画像の説明を入力)
"//*/h3[@class='desc']"        
"//*/span [@class='discount']"     
"//*/span [@class='price']"      
"//*/div [@class='other-info clearfix']"       
"//*/div[@class='source clearfix']"         

3. ソースコード
##   RCurl        

library(RCurl)
library(XML)
library("plyr")

# Result-page numbers to request; each one is substituted into `pageno=`.
page <- 1:5
# Build one search URL per page. Plain assignment is required here: `urlist`
# does not exist yet, so the indexed form `urlist[page] <- ...` stops with
# "object 'urlist' not found".
# NOTE(review): the `kw=` value is blank in this script; the search keyword
# appears to have been lost from the text -- confirm the intended keyword.
urlist <- paste0(
  "http://tuan.360.cn/bei_jing/c_0.html?kw=  &pageno=", page, "#tuanFilter"
)
# HTTP request headers passed to getURL() for every page fetch.
# Stored as a named character vector: names are header fields, values are
# the header contents.
myheader <- setNames(
  c(
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.6) ",
    "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "en-us",
    "keep-alive",
    "GB2312,utf-8;q=0.7,*;q=0.7"
  ),
  c(
    "User-Agent",
    "Accept",
    "Accept-Language",
    "Connection",
    "Accept-Charset"
  )
)
# Scrape every URL in `urlist` and collect five text fields across all pages.
#
# XPath expression for each output vector. The names are the variables the
# rest of the script reads: dyy_name, last_price, now_price, others, tg_source.
field_xpaths <- c(
  dyy_name   = "//*/h3[@class='desc']",
  last_price = "//*/span [@class='discount']",
  now_price  = "//*/span [@class='price']",
  others     = "//*/div [@class='other-info clearfix']",
  tg_source  = "//*/div[@class='source clearfix']"
)

# Download one page and return a named list with one character vector per
# entry of `field_xpaths`.
scrape_page <- function(url) {
  webpage <- getURL(url, httpheader = myheader, .encoding = "utf-8")
  # Problems reported by the HTML parser go to a no-op handler, i.e. they are
  # deliberately ignored, exactly as in the original loop.
  pagetree <- htmlTreeParse(
    webpage,
    encoding = "utf-8",
    error = function(...) {},
    useInternalNodes = TRUE,
    trim = TRUE
  )
  lapply(field_xpaths, function(xpath) {
    # xpathSApply() yields an empty list when nothing matches; flattening
    # keeps every field a plain character vector instead of a list.
    as.character(unlist(xpathSApply(pagetree, xpath, xmlValue)))
  })
}

# One result per URL, built in a single pass instead of growing five vectors
# with c() on every iteration.
pages <- lapply(urlist, scrape_page)

# Concatenate a single field over all pages, preserving page order.
collect_field <- function(field) {
  as.character(unlist(lapply(pages, `[[`, field), use.names = FALSE))
}

# The vectors start empty rather than with a "" seed, so the exported table
# no longer begins with a blank row.
dyy_name   <- collect_field("dyy_name")
last_price <- collect_field("last_price")
now_price  <- collect_field("now_price")
others     <- collect_field("others")
tg_source  <- collect_field("tg_source")
# Clean the multi-line text fields, assemble the result table and export it.
#
# In the previous text these statements were fused into trailing `#`
# comments, so `others` was never cleaned, `content` was never created,
# nothing was written, and the file did not even parse.

# Return the second line of each string; NA when a value has fewer than two
# lines.
second_line <- function(values) {
  vapply(
    values,
    function(x) unlist(strsplit(x, "\n"))[2],
    character(1),
    USE.NAMES = FALSE
  )
}

tg_source <- second_line(tg_source)
others <- second_line(others)

# data.frame() requires the five vectors to have compatible lengths.
content <- data.frame(dyy_name, last_price, now_price, others, tg_source)
# NOTE(review): all five column labels are a single space; the original
# label text appears to have been lost -- confirm the intended headers.
names(content) <- c(' ', ' ', ' ', ' ', ' ')
# NOTE(review): the output file name is " .csv" (leading space, no stem);
# the original name appears to have been lost -- confirm before relying on it.
write.csv(content, file = " .csv")