Spark dataframeとlistの変換(複数行が1行になる)

3412 ワード

1.dataframeの列をlistに変更
import spark.implicits._
// Build a DataFrame with a single column, CST_NO, whose rows are comma-separated strings.
var data_csv = Seq("ke,sun", "tian,sun").toDF("CST_NO")
    
+--------+
|  CST_NO|
+--------+
|  ke,sun|
|tian,sun|
+--------+

CST_NO列をlistに変換
// Collect the CST_NO column to the driver and keep the first field of every Row.
// NOTE(review): data_tmp is not defined in this snippet; presumably it is the
// DataFrame built above (data_csv) — confirm before running.
var neg_tmp = data_tmp.select("CST_NO").collect().map(_(0)).toList
println(neg_tmp.length)

// Split the first collected value, neg_tmp(0), on commas.
var neg_list = neg_tmp(0).toString.split(",")
// Printing an Array directly shows only its JVM identity (e.g. [Ljava.lang.String;@1b2c3d),
// so join the elements into a readable string first.
println(neg_list.mkString(", "))

出力:
neg_tmp: List[Any] = List(ke,sun,tian,sun)
1
neg_list: Array[String] = Array(ke, sun, tian, sun)

参考ブログ:クリックして移動
Listの重複除去
1. distinctを使う

scala> val l = List(1,2,3,3,4,4,5,5,6,6,6,8,9)
l: List[Int] = List(1, 2, 3, 3, 4, 4, 5, 5, 6, 6, 6, 8, 9)

scala> l.distinct
res32: List[Int] = List(1, 2, 3, 4, 5, 6, 8, 9)


2. toSetを使う(要素の順序は保持されない)

scala> l.toSet.toList
res33: List[Int] = List(5, 1, 6, 9, 2, 3, 8, 4)

参考ブログ:クリックして移動
2.listをdataframeの列に変更
// The element type is spelled out as String; note that the list contains null entries.
// NOTE(review): the original comment here was garbled; presumably toDF needs a
// concrete element type when nulls are present — confirm.
var lst = List[String]("57.54", "trusfortMeans", null, "20190720", "5852.00", null, null, "25.77", null)
var name_list = List("idm", "CO", "distrn","dayId", "Ant", "CLP", "CAC", "PE_num","CE")
import org.apache.spark.sql.functions._
import org.apache.spark.ml._

// Wrap the whole list as one array value, giving a single row in one column named "features".
var df = Seq(lst.toArray).toDF("features")
// df: org.apache.spark.sql.DataFrame = [features: array<string>]

df.show()
+--------------------+
|            features|
+--------------------+
|[57.54, trusfortM...|
+--------------------+

3.listをdataframeの行に変更
// Goal: use name_list as the column names and lst as the values of a single row.

// The element type is spelled out as String; note that the list contains null entries.
// NOTE(review): the original comment here was garbled; presumably toDF needs a
// concrete element type when nulls are present — confirm.
var lst = List[String]("57.54", "trusfortMeans", null, "20190720", "5852.00", null, null, "25.77", null)
var name_list = List("idm", "CO", "distrn","dayId", "Ant", "CLP", "CAC", "PE_num","CE")
import org.apache.spark.sql.functions._
import org.apache.spark.ml._

// Wrap the whole list as one array value, giving a single row in one column named "features".
var df = Seq(lst.toArray).toDF("features")
// df: org.apache.spark.sql.DataFrame = [features: array<string>]

df.show()
// +--------------------+
// |            features|
// +--------------------+
// |[57.54, trusfortM...|
// +--------------------+


// `elements` supplies one column name per entry of the `features` array,
// so its size should equal the number of entries in that array.
val elements = name_list.toArray

// Build one Column expression per name: the idx-th item of `features`, aliased to that name.
val sqlExpr = for ((alias, idx) <- elements.zipWithIndex) yield col("features").getItem(idx).as(alias)

// Replace df with a DataFrame that has one column per array element.
df = df.select(sqlExpr: _*)
df.show()

df: org.apache.spark.sql.DataFrame = [features: array<string>]
+--------------------+
|            features|
+--------------------+
|[57.54, trusfortM...|
+--------------------+
df: org.apache.spark.sql.DataFrame = [idm: string, CO: string ... 7 more fields]
+-----+-------------+------+--------+-------+----+----+------+----+
|  idm|           CO|distrn|   dayId|    Ant| CLP| CAC|PE_num|  CE|
+-----+-------------+------+--------+-------+----+----+------+----+
|57.54|trusfortMeans|  null|20190720|5852.00|null|null| 25.77|null|
+-----+-------------+------+--------+-------+----+----+------+----+

参考リンク:クリックして移動