Basic Kudu operations (viewing cluster status, API operations, Impala operations, Spark integration)


1. Viewing Kudu status from the command line
1.1 Switch from the root user to the kudu user and display information for the whole cluster (the local hostname is hadoop002, i.e. the Kudu master is hadoop002)
-bash-4.2$ kudu cluster ksck hadoop002
Connected to the Master
Fetched info from all 1 Tablet Servers
Table wc is HEALTHY (3 tablet(s) checked)

Table Summary
 Name | Status  | Total Tablets | Healthy | Recovering | Under-replicated | Unavailable
------+---------+---------------+---------+------------+------------------+-------------
 wc   | HEALTHY | 3             | 3       | 0          | 0                | 0
The metadata for 1 table(s) is HEALTHY
OK
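ksck can also be scoped to individual tables via its -tables flag (the flag is assumed to be available in this Kudu build):
-bash-4.2$ kudu cluster ksck hadoop002 -tables=wc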

1.2 Viewing the master status
-bash-4.2$ kudu master status localhost
node_instance {
  permanent_uuid: "2431dd2c03a54ff0be6f14fec9cb4ab7"
  instance_seqno: 1586275391374513
}
bound_rpc_addresses {
  host: "hadoop002"
  port: 7051
}
bound_http_addresses {
  host: "hadoop002"
  port: 8051
}
version_info {
  git_hash: "70babd7c7391a980df09d8b7bc5b42ed35a26f62"
  build_hostname: "impala-ec2-pkg-centos-7-1c83.vpc.cloudera.com"
  build_timestamp: "09 Aug 2018 09:46:59 PST"
  build_username: "jenkins"
  build_clean_repo: true
  build_id: "2018-08-09_08-50-19"
  build_type: "RELEASE"
  version_string: "1.7.0-cdh5.15.1"
}

1.3 Viewing the tserver status
-bash-4.2$  kudu tserver status localhost
node_instance {
  permanent_uuid: "8cd8c704667747d98afb3ef342f6f1b5"
  instance_seqno: 1586275398491656
}
bound_rpc_addresses {
  host: "hadoop002"
  port: 7050
}
bound_http_addresses {
  host: "hadoop002"
  port: 8050
}
version_info {
  git_hash: "70babd7c7391a980df09d8b7bc5b42ed35a26f62"
  build_hostname: "impala-ec2-pkg-centos-7-1c83.vpc.cloudera.com"
  build_timestamp: "09 Aug 2018 09:46:59 PST"
  build_username: "jenkins"
  build_clean_repo: true
  build_id: "2018-08-09_08-50-19"
  build_type: "RELEASE"
  version_string: "1.7.0-cdh5.15.1"
}

2. Kudu integration with Impala
Impala installation tutorial:
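As a minimal sketch of what Kudu operations look like from impala-shell (table and column names mirror the test table from section 3; the Kudu master address is taken from Impala's configuration):

-- create an Impala table backed by Kudu
CREATE TABLE test (
  id STRING,
  name INT,
  PRIMARY KEY (id)
)
PARTITION BY HASH (id) PARTITIONS 3
STORED AS KUDU;

-- ordinary DML is then routed through to Kudu
INSERT INTO test VALUES ('pk-1', 100);
SELECT * FROM test;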
3. Kudu API operations
3.1 Initializing the Kudu client (the imports below are shared by all snippets in this section)
    import java.util

    import org.apache.kudu.{ColumnSchema, Schema, Type}
    import org.apache.kudu.client._

    val KUDU_MASTERS = "hadoop002"
    val client: KuduClient = new KuduClient.KuduClientBuilder(KUDU_MASTERS).build()
    val tableName = "test"

3.2 Creating a table
  /**
    * Create a table with a STRING key column "id" and an INT32 column "name",
    * hash-partitioned on the key into 3 buckets, with a single replica.
    */
  def createTable(client: KuduClient, tableName: String): Unit = {
    import scala.collection.JavaConverters._
    val columns = List(
      new ColumnSchema.ColumnSchemaBuilder("id", Type.STRING).key(true).build(),
      new ColumnSchema.ColumnSchemaBuilder("name", Type.INT32).build()
    ).asJava

    val schema = new Schema(columns)

    val options: CreateTableOptions = new CreateTableOptions()
    options.setNumReplicas(1)

    // hash-partition columns must belong to the primary key,
    // so partition on the "id" key column defined above
    val parcols: util.LinkedList[String] = new util.LinkedList[String]()
    parcols.add("id")
    options.addHashPartitions(parcols, 3)

    client.createTable(tableName,schema,options)
  }

3.3 Inserting data
  def insertRows(client: KuduClient, tableName: String): Unit = {
    val table: KuduTable = client.openTable(tableName)  // open the target Kudu table
    val session: KuduSession = client.newSession()      // a session, similar in spirit to a JPA/Hibernate session

    // insert a few sample rows matching the id/name schema created above
    for (i <- 1 to 10) {
      val insert: Insert = table.newInsert()
      val row: PartialRow = insert.getRow
      row.addString("id", s"id-$i")
      row.addInt("name", i)
      session.apply(insert)
    }
    session.close()
  }

3.4 Altering a table (renaming it)
  def renameTable(client: KuduClient, tableName: String, newTableName: String) = {

    val options: AlterTableOptions = new AlterTableOptions()
    options.renameTable(newTableName)
    client.alterTable(tableName, options)
  }
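Besides renaming, AlterTableOptions can also add or drop columns; a minimal sketch (the age column is illustrative, not part of the original schema):

  def addColumn(client: KuduClient, tableName: String): Unit = {
    val options: AlterTableOptions = new AlterTableOptions()
    options.addNullableColumn("age", Type.INT32) // add a new nullable INT32 column
    client.alterTable(tableName, options)
  }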

3.5 Querying data
  def query(client: KuduClient, tableName: String) = {
    val table: KuduTable = client.openTable(tableName)

    val scanner: KuduScanner = client.newScannerBuilder(table).build()

    while(scanner.hasMoreRows) {
      val iterator: RowResultIterator = scanner.nextRows()

      while(iterator.hasNext) {
        val result: RowResult = iterator.next()
        println(result.getString("id") + " => " + result.getInt("name"))
      }
    }

  }
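Filters can also be pushed down to the tablet servers instead of being applied client-side; a minimal sketch against the same id/name schema:

  def queryWithPredicate(client: KuduClient, tableName: String): Unit = {
    val table: KuduTable = client.openTable(tableName)
    // only rows with name > 100 are returned by the servers
    val predicate = KuduPredicate.newComparisonPredicate(
      table.getSchema.getColumn("name"), KuduPredicate.ComparisonOp.GREATER, 100)
    val scanner: KuduScanner = client.newScannerBuilder(table).addPredicate(predicate).build()
    while (scanner.hasMoreRows) {
      val iterator: RowResultIterator = scanner.nextRows()
      while (iterator.hasNext) {
        val result: RowResult = iterator.next()
        println(result.getString("id") + " => " + result.getInt("name"))
      }
    }
  }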

3.6 Updating data
  def upsertRow(client: KuduClient, tableName: String) = {
    val table: KuduTable = client.openTable(tableName)
    val session: KuduSession = client.newSession()

    val update: Update = table.newUpdate()
    val row: PartialRow = update.getRow
    row.addString("id", "pk-10")  // key of the row to update (column names must match the id/name schema above)
    row.addInt("name", 8888)      // new value for the non-key column
    session.apply(update)
    session.close()
  }
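Note that an Update fails when the key does not exist yet; for true insert-or-update semantics the client also provides newUpsert(). A minimal sketch against the same schema:

  def trueUpsertRow(client: KuduClient, tableName: String): Unit = {
    val table: KuduTable = client.openTable(tableName)
    val session: KuduSession = client.newSession()

    val upsert: Upsert = table.newUpsert() // inserts if the key is absent, updates it otherwise
    val row: PartialRow = upsert.getRow
    row.addString("id", "pk-10")
    row.addInt("name", 9999)
    session.apply(upsert)
    session.close()
  }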

3.7 Dropping a table
  def deleteTable(client: KuduClient, tableName: String) = {
    client.deleteTable(tableName)
  }

Example invocation
  def main(args: Array[String]): Unit = {
    val KUDU_MASTERS = "hadoop002"
    val client: KuduClient = new KuduClient.KuduClientBuilder(KUDU_MASTERS).build()
    val tableName = "test"
    createTable(client, tableName)
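    // the other helpers above can be called the same way, e.g.:
    // insertRows(client, tableName)
    // query(client, tableName)
    // renameTable(client, tableName, "test_new")
    // deleteTable(client, tableName)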
    client.close()
  }

4. Kudu integration with Spark
Official documentation for the Kudu-Spark integration: https://kudu.apache.org/docs/developing.html#_kudu_integration_with_spark
4.1 Reading from a MySQL data source with Spark, writing the data into Kudu, then reading it back from Kudu
package com.wxx.bigdata.kudu

import java.util.Properties

import com.typesafe.config.ConfigFactory
import org.apache.spark.sql.{SaveMode, SparkSession}

object SparkKuduApp {
  def main(args: Array[String]): Unit = {

    val spark = SparkSession.builder().master("local").getOrCreate()

    val config = ConfigFactory.load()
    val url = config.getString("db.default.url")
    val user = config.getString("db.default.user")
    val password = config.getString("db.default.password")
    val driver = config.getString("db.default.driver")
    val database = config.getString("db.default.database")
    val table = config.getString("db.default.table")

    // MySQL connection properties
    val connectionProperties = new Properties()
    connectionProperties.put("user", user)
    connectionProperties.put("password", password)
    // read the source table from MySQL over JDBC
    val jdbcDF = spark.read.jdbc(url, s"$database.$table", connectionProperties)
    jdbcDF.show()

    // write the DataFrame into Kudu (the target table must already exist)
    val kuduMaster = "hadoop002"
    jdbcDF.write.format("org.apache.kudu.spark.kudu")
        .mode(SaveMode.Append)
        .option("kudu.master",kuduMaster)
        .option("kudu.table", "test")
        .save()

    // read the data back from Kudu
    val df = spark.read.format("org.apache.kudu.spark.kudu")
      .option("kudu.master", kuduMaster)
      .option("kudu.table", "test")
      .load()
    df.show(false)
    spark.stop()
  }
}

Configuration file for the MySQL data source (application.conf, read by ConfigFactory.load()):
db.default.driver="com.mysql.jdbc.Driver"
db.default.url="jdbc:mysql://hostname:13306"
db.default.user="user"
db.default.password="password"
db.default.database=test
db.default.table=wc
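Both Spark apps need the Kudu artifacts (plus the MySQL JDBC driver and Typesafe Config) on the classpath; a minimal sbt sketch, assuming Spark 2 on Scala 2.11 with the Kudu 1.7.0 / CDH 5.15 versions used above (adjust versions to your cluster):

libraryDependencies ++= Seq(
  "org.apache.kudu" % "kudu-spark2_2.11"     % "1.7.0",  // Kudu data source + KuduContext
  "org.apache.kudu" % "kudu-client"          % "1.7.0",  // Java client used in section 3
  "mysql"           % "mysql-connector-java" % "5.1.47", // JDBC driver for the MySQL source
  "com.typesafe"    % "config"               % "1.3.3"   // ConfigFactory.load()
)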

4.2 Operating on Kudu with KuduContext
package com.imooc.bigdata.chapter07

import org.apache.kudu.client.CreateTableOptions
import org.apache.kudu.spark.kudu.KuduContext
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

import collection.JavaConverters._

object SparkKuduApp2 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("SparkKuduApp2").getOrCreate()
//    val df = spark.read.format("org.apache.kudu.spark.kudu")
//      .option("kudu.master", "hadoop002")
//      .option("kudu.table", "test")
//      .load()
//    df.select("word","cnt").filter("cnt > 100").show(false)

    // Use KuduContext to create, delete, or write to Kudu tables
    val kuduContext = new KuduContext("hadoop002:7051", spark.sparkContext)

    val schema =StructType(
      List(
        // key columns must be non-nullable, otherwise table creation fails with:
        // Bad schema: Nullable key columns are not supported: id
        StructField("id", IntegerType, false),
        StructField("name", StringType, true),
        StructField("age", IntegerType, true)
      ))
    kuduContext.createTable(
      "user", schema, Seq("id"),
      new CreateTableOptions()
        .setNumReplicas(1)
        .addHashPartitions(List("id").asJava, 3))

    // Check for the existence of a Kudu table
    val isExisted = kuduContext.tableExists("user")
    println(isExisted)

    // Insert data
    import spark.implicits._
    val userDf = Seq((1, "zhangsan", 22), (2, "lisi", 33), (3, "wangwu", 18)).toDF("id", "name", "age")
    kuduContext.insertRows(userDf, "user")
    val userReturnDf = spark.read.format("org.apache.kudu.spark.kudu")
      .option("kudu.master", "hadoop002")
      .option("kudu.table", "user")
      .load()
    userReturnDf.show()

    // Delete data (a delete only needs the key column(s))
    kuduContext.deleteRows(userDf.select("id"), "user")

    // Upsert data
    val userUpsertDf = Seq((1, "zhangsan2", 22), (2, "lisi", 44), (4, "zhaoliu", 33)).toDF("id", "name", "age")
    kuduContext.upsertRows(userUpsertDf, "user")
    val userUpsertReturnDf = spark.read.format("org.apache.kudu.spark.kudu")
      .option("kudu.master", "hadoop002")
      .option("kudu.table", "user")
      .load()
    userUpsertReturnDf.show()

    // Update data
    val userUpdateDF = Seq((1, "tianqi", 20)).toDF("id", "name", "age")
    kuduContext.updateRows(userUpdateDF, "user")
    val userUpdateReturnDf = spark.read.format("org.apache.kudu.spark.kudu")
      .option("kudu.master", "hadoop002")
      .option("kudu.table", "user")
      .load()
    userUpdateReturnDf.show()

    // Delete a Kudu table
    kuduContext.deleteTable("user")

    spark.stop()


  }
}
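As a convenience, importing org.apache.kudu.spark.kudu._ also adds a .kudu shorthand on the DataFrame reader, equivalent to the format("org.apache.kudu.spark.kudu")...load() calls above; a minimal sketch, assuming the same SparkSession as above:

import org.apache.kudu.spark.kudu._

val df = spark.read
  .options(Map("kudu.master" -> "hadoop002", "kudu.table" -> "user"))
  .kudu
df.show()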