Window Functions in Spark SQL using HiveQL


A sample of using a window function in Spark through HiveContext. In the Spark 1.5 line, window functions are only supported via HiveContext (not a plain SQLContext), which is why spark-hive is included in the dependencies.

build.sbt

...
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % "1.5.2" % "provided",
  "org.apache.spark" %% "spark-sql" % "1.5.2" % "provided",
  "org.apache.spark" %% "spark-hive" % "1.5.2" % "provided"
)
...
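
The %% operator resolves the artifacts against the project's Scala version, and the pre-built Spark 1.5.x artifacts default to Scala 2.10 (2.11 builds also exist). A minimal complete build.sbt sketch for reference; the project name and version are placeholders, not taken from the original:

name := "spark-window-sample"   // placeholder

version := "0.1.0"              // placeholder

scalaVersion := "2.10.5"        // any 2.10.x matches the default Spark 1.5.x artifacts

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % "1.5.2" % "provided",
  "org.apache.spark" %% "spark-sql" % "1.5.2" % "provided",
  "org.apache.spark" %% "spark-hive" % "1.5.2" % "provided"
)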

Scala sample

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.hive.HiveContext
...

val conf = new SparkConf()
val sc = new SparkContext(conf)

// HiveContext is required here for the window function used below
val sqlContext = new HiveContext(sc)
import sqlContext.implicits._

// test_data holds the path to the JSON input (defined elsewhere)
val src = sqlContext.read.json(test_data)
src.registerTempTable("test")

// number rows within each id partition, ordered by time
val r = sqlContext.sql("select id, time, row_number() over(partition by id order by time) num from test")
...
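
The same numbering can also be written with the DataFrame window API (org.apache.spark.sql.expressions.Window), available since Spark 1.4. The sketch below is only an illustration: it continues the session above (so sqlContext.implicits._ is already in scope) and builds a small in-memory dataset instead of reading test_data, with made-up id/time values. Note that in the 1.5 line the function is named rowNumber; it became row_number in later releases.

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.rowNumber

// hypothetical input standing in for the JSON read from test_data
val df = Seq(
  ("a", 1L), ("a", 3L), ("a", 2L),
  ("b", 5L), ("b", 4L)
).toDF("id", "time")

// same partitioning and ordering as the SQL version above
val w = Window.partitionBy("id").orderBy("time")

val numbered = df.select($"id", $"time", rowNumber().over(w).as("num"))
numbered.show()
// expected output (row order may vary):
// +---+----+---+
// | id|time|num|
// +---+----+---+
// |  a|   1|  1|
// |  a|   2|  2|
// |  a|   3|  3|
// |  b|   4|  1|
// |  b|   5|  2|
// +---+----+---+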