Sparkプログラミング事例:Tomcatのアクセスログを分析し、アクセス数が最も多い上位2ページを求める

15655 ワード

要件はタイトルのとおりです。Tomcatのアクセスログは以下のとおりです。
192.168.88.1 - - [30/Jul/2017:12:53:43 +0800] "GET /MyDemoWeb/ HTTP/1.1" 200 259
192.168.88.1 - - [30/Jul/2017:12:53:43 +0800] "GET /MyDemoWeb/head.jsp HTTP/1.1" 200 713
192.168.88.1 - - [30/Jul/2017:12:53:43 +0800] "GET /MyDemoWeb/body.jsp HTTP/1.1" 200 240
192.168.88.1 - - [30/Jul/2017:12:54:37 +0800] "GET /MyDemoWeb/oracle.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:38 +0800] "GET /MyDemoWeb/hadoop.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:38 +0800] "GET /MyDemoWeb/java.jsp HTTP/1.1" 200 240
192.168.88.1 - - [30/Jul/2017:12:54:40 +0800] "GET /MyDemoWeb/oracle.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:40 +0800] "GET /MyDemoWeb/hadoop.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:41 +0800] "GET /MyDemoWeb/mysql.jsp HTTP/1.1" 200 241
192.168.88.1 - - [30/Jul/2017:12:54:41 +0800] "GET /MyDemoWeb/hadoop.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:42 +0800] "GET /MyDemoWeb/web.jsp HTTP/1.1" 200 239
192.168.88.1 - - [30/Jul/2017:12:54:42 +0800] "GET /MyDemoWeb/oracle.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:52 +0800] "GET /MyDemoWeb/oracle.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:52 +0800] "GET /MyDemoWeb/hadoop.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:53 +0800] "GET /MyDemoWeb/oracle.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:54 +0800] "GET /MyDemoWeb/mysql.jsp HTTP/1.1" 200 241
192.168.88.1 - - [30/Jul/2017:12:54:54 +0800] "GET /MyDemoWeb/hadoop.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:54 +0800] "GET /MyDemoWeb/hadoop.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:56 +0800] "GET /MyDemoWeb/web.jsp HTTP/1.1" 200 239
192.168.88.1 - - [30/Jul/2017:12:54:56 +0800] "GET /MyDemoWeb/java.jsp HTTP/1.1" 200 240
192.168.88.1 - - [30/Jul/2017:12:54:57 +0800] "GET /MyDemoWeb/oracle.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:57 +0800] "GET /MyDemoWeb/java.jsp HTTP/1.1" 200 240
192.168.88.1 - - [30/Jul/2017:12:54:58 +0800] "GET /MyDemoWeb/oracle.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:58 +0800] "GET /MyDemoWeb/hadoop.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:59 +0800] "GET /MyDemoWeb/oracle.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:59 +0800] "GET /MyDemoWeb/hadoop.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:55:00 +0800] "GET /MyDemoWeb/mysql.jsp HTTP/1.1" 200 241
192.168.88.1 - - [30/Jul/2017:12:55:00 +0800] "GET /MyDemoWeb/oracle.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:55:02 +0800] "GET /MyDemoWeb/web.jsp HTTP/1.1" 200 239
192.168.88.1 - - [30/Jul/2017:12:55:02 +0800] "GET /MyDemoWeb/hadoop.jsp HTTP/1.1" 200 242

考え方:
  • ログから各ページのアクセス数を集計する
  • アクセス数の降順にソートする
  • 上位2件のレコードを取得する
    scalaコードは次のとおりです.
    import org.apache.spark.SparkConf
    import org.apache.spark.SparkContext
    import java.util.regex.Pattern
    import java.util.regex.Matcher
    
    /**
     * Finds the two most requested pages in a Tomcat access log.
     *
     * Each log line is expected to contain a double-quoted request line such as
     * `"GET /MyDemoWeb/head.jsp HTTP/1.1"`. The page name is the last path segment
     * of the request URI (`head.jsp`). Requests are counted per page name, sorted
     * by count in descending order, and the first two entries are printed.
     */
    object TomcatLogDemo {

      /** Log file that is read when no path is passed on the command line. */
      private val DefaultLogPath = "F:\\localhost_access_log.2017-07-30.txt"

      /** Number of top entries to report. */
      private val TopN = 2

      // Compiled once per JVM instead of once per log line.
      /** Matches a double-quoted section of a log line; group 1 is its content. */
      private val QuotedSection = "\"(.*?)\"".r

      /** Matches the token between the first two spaces of the request line (the URI). */
      private val SpaceDelimitedToken = " (.*?) ".r

      /**
       * Extracts the requested page name from one access-log line.
       *
       * Only the FIRST quoted section of the line is inspected, so additional quoted
       * fields later in the line (e.g. referer or user agent) cannot overwrite the result.
       * A query string, if present, is stripped before the last path segment is taken.
       *
       * @param line one raw line of the access log
       * @return the page name, or `None` when the line has no quoted request line,
       *         the request line has no space-delimited URI, or the URI ends in `/`
       *         (a directory request that names no page)
       */
      def extractPage(line: String): Option[String] =
        for {
          request <- QuotedSection.findFirstMatchIn(line)
          uri     <- SpaceDelimitedToken.findFirstMatchIn(request.group(1))
          path     = uri.group(1).takeWhile(_ != '?')
          // lastIndexOf yields -1 when there is no '/', so the whole path is kept.
          page     = path.substring(path.lastIndexOf('/') + 1)
          if page.nonEmpty
        } yield page

      /**
       * Program entry point.
       *
       * @param args optional: `args(0)` is the path of the access log to analyse;
       *             when absent, the built-in default path is used
       */
      def main(args: Array[String]): Unit = {
        val logPath = args.headOption.getOrElse(DefaultLogPath)

        val conf = new SparkConf()
          .setAppName("MyWebLogDemo")
          .setMaster("local")
        val sc = new SparkContext(conf)

        // Always release the SparkContext, even when the job throws.
        try {
          val topPages = sc.textFile(logPath)
            // Lines without a recognisable page are dropped instead of being
            // counted under an empty placeholder key.
            .flatMap(line => extractPage(line))
            .map(page => (page, 1))
            .reduceByKey(_ + _)
            .sortBy(_._2, ascending = false)
            .take(TopN)

          topPages.foreach(println)
        } finally {
          sc.stop()
        }
      }
    }
    

    結果:
    (oracle.jsp,9)
    (hadoop.jsp,9)
    

    私のWeChat公式アカウント(曲健磊の個人エッセイ)をフォローすると、さらに多くの記事をご覧いただけます。(画像:Sparkプログラミング事例:Tomcatアクセスログを分析してアクセス数上位2ページを求める_図1)