Hive Step 4

Hive field delimiters:
Default in Hive: \001, typed as Ctrl+V Ctrl+A (^A); this is the SOH control character, \u0001 in Java. Other commonly used delimiters:
tab
,
" "
|

Default delimiters by level: \001 ^A (\u0001; also written \0001 or \01) separates fields, \002 ^B separates collection items, \003 ^C separates map keys from values.
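
As a sketch (the table name is invented for illustration), a table that spells out all three default delimiters explicitly:

CREATE TABLE t_delim (
  id INT,
  hobbies ARRAY<STRING>,
  scores MAP<STRING, INT>
)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY '\001'
  COLLECTION ITEMS TERMINATED BY '\002'
  MAP KEYS TERMINATED BY '\003';
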
Hive file storage formats:
  • File Formats and Compression (https://cwiki.apache.org/confluence/display/Hive/FileFormats): RCFile, Avro, ORC, Parquet; compression: LZO
  • Note that the non-text formats above cannot be populated directly with LOAD DATA; load into a textfile table first, then INSERT ... SELECT.
  • Properties of each storage format:
    Hive's default format: textfile

    textfile: row-oriented plain text. Files are comparatively large and scans are slow, but data can be loaded directly with LOAD DATA. (This is the default format.)

    sequencefile:
    A binary key-value format from Hadoop that hive supports; splittable and compressible. It cannot be populated with LOAD DATA; use INSERT ... SELECT instead.

    rcfile:
    A row-column hybrid format designed for hive: rows are grouped and each group is stored column by column, which improves compression and lets queries read only the columns they need. Files are smaller and reads faster than textfile.

    orc:
    An optimized version of rcfile.

    parquet:
    A general-purpose columnar format. Good compression, and broad ecosystem support (engines outside hive can read it).

    hive.default.fileformat
        Default Value: TextFile

          Expects one of [textfile, sequencefile, rcfile, orc].
          Default file format for CREATE TABLE statement. Users can explicitly override it by CREATE TABLE ... STORED AS [FORMAT]
          
          
    Compression settings (relevant when writing textfile output):
    Map output compression:
    mapreduce.map.output.compress=false
    mapreduce.map.output.compress.codec=org.apache.hadoop.io.compress.DefaultCodec

    Reduce output compression (final job output):
    Available codecs: snappy, bzip2, gzip, DefaultCodec
    mapreduce.output.fileoutputformat.compress=false
    mapreduce.output.fileoutputformat.compress.type=NONE/RECORD/BLOCK (default RECORD)
    mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.DefaultCodec

    Hive-side compression settings:
    set hive.exec.compress.output=false;
    set hive.exec.compress.intermediate=false;
    set hive.intermediate.compression.codec=
    set hive.intermediate.compression.type=
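
    A minimal sketch of turning compression on for one session (assumes the Snappy codec is installed on the cluster):

    set hive.exec.compress.intermediate=true;
    set hive.intermediate.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;
    set hive.exec.compress.output=true;
    set mapreduce.output.fileoutputformat.compress=true;
    set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;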
    
    
    1:
    textfile:
    CREATE TABLE `u4`(
      `id` int,
      `name` string)
    ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
    stored as textfile;
    
    set mapreduce.output.fileoutputformat.compress=true;
    set hive.exec.compress.output=true;
    insert into table u4
    select * from u2;
    
    2:
    sequencefile:
    CREATE TABLE `u4`(
      `id` int,
      `name` string)
    ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
    stored as sequencefile;
    
    3:
    rcfile : 
    CREATE TABLE `u5`(
      `id` int,
      `name` string)
    ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
    stored as rcfile;
    
    
    4:
    orc : 
    CREATE TABLE `u6`(
      `id` int,
      `name` string)
    ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
    stored as orc;
    
    5:
    parquet:
    CREATE TABLE `u7`(
      `id` int,
      `name` string)
    ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
    stored as PARQUET;
    insert into table u7
    select * from u2;
    
    Custom file formats:
    Requirement: have hive read files whose contents are Base64-encoded.
    Contents of the file seq_yd:
    aGVsbG8gemhhbmdoYW8=
    aGVsbG8gZmVpZmVpLGdvb2QgZ29vZCBzdHVkeSxkYXkgZGF5IHVw
    The seq_yd file is Base64-encoded, so it must be decoded when read:
    
    create table cus(str STRING)  
    stored as  
    inputformat 'org.apache.hadoop.hive.contrib.fileformat.base64.Base64TextInputFormat'  
    outputformat 'org.apache.hadoop.hive.contrib.fileformat.base64.Base64TextOutputFormat'; 
    
    LOAD DATA LOCAL INPATH '/home/hivedata/cus' INTO TABLE cus;
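
    Querying the table should return the decoded text (the two Base64 lines above decode to these strings):

    select * from cus;
    -- hello zhanghao
    -- hello feifei,good good study,day day up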
    
    
    Hive views
    Note: hive only has logical views; it does not materialize them.

    A view's defining query runs only when the view itself is queried.
    A view stores no data; only its definition is kept in the metastore.

    Uses of views in hive:
    1、Simplify complex, deeply nested queries (see the sketch after the notes below).
    2、Restrict data access (security).

    Creating a view (cvas):
    
    create view if not exists tab_v1 
    as 
    select id from u2;
    
    Inspecting views:
    
    show tables;
    
    show create table tab_v1;
    
    -----------------------------------------------------------------------------
    show create table tab2;
    OK
    CREATE VIEW `tab2` AS select `lg`.`user_id` from `ali`.`lg`
    Time taken: 0.14 seconds, Fetched: 1 row(s)
    -----------------------------------------------------------------------------
    
    desc tab2;
    
    -----------------------------------------------------------------------------
    OK
    user_id             	string 
    -----------------------------------------------------------------------------
    Copying a view definition (not supported as of hive-1.2.1):
    create view tab_v2 like tab_v1; --- fails with an error

    Renaming a view:
    alter view tab1 rename to tab2;

    A view's defining query can also be replaced with alter view view_name as select ... .
    
    Dropping a view:
    drop view if exists tab_v2;    (the correct way)
    drop table if exists tab_v1;   (wrong: a view cannot be dropped with drop table)

    Notes:
    1、If a table the view depends on is dropped or restructured, queries against the view fail.
    2、Views do not support loading data with insert into or load.
    3、Views are read-only; only view metadata (e.g. tblproperties) can be changed, never the data.
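
    A sketch of use 1 (reusing the u2 and u4 tables from above): wrap a join in a view so later queries stay short:

    create view if not exists v_u2_u4
    as
    select t2.id, t4.name
    from u2 t2
    join u4 t4 on t2.id = t4.id;

    select * from v_u2_u4 limit 10;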
    
    Hive logs:
    Hive log location:
    Default: /tmp/{user.name}
    e.g. when hive runs as user root the log is /tmp/root/hive.log; each OS user gets its own directory.
    hive.log.dir={java.io.tmpdir}/{user.name}
    hive.log.file=hive.log
    Changing the log configuration:
    Edit hive-log4j2.properties under hive's conf directory.
    hive.querylog.location
    ${system:java.io.tmpdir}/${system:user.name}
    Location of Hive run time structured log file
    
    How hive is run:
    1、cli: the command line (hive/beeline). Note that beeline requires hiveserver2 to be started first:

    hive --service hiveserver2 &
    hiveserver2 &

    Note: connecting as root may fail with an impersonation error ("User: root is not allowed to impersonate ..."). The fix is to add proxy-user entries to hadoop's core-site.xml:
    <property>
      <name>hadoop.proxyuser.root.hosts</name>
      <value>*</value>
    </property>
    <property>
      <name>hadoop.proxyuser.root.groups</name>
      <value>*</value>
    </property>

    This lets user root proxy connections from any host and any group.
    ------------------------------------------------------

    To restrict it, list hosts instead of *:
         hadoop.proxyuser.root.hosts
         192.168.80.10/16

    With this setting only clients in that IP range may connect as the proxied user.
    -----------------------------------------------------
    
    
    beeline prompts for a username and password when it connects; use credentials hiveserver2 accepts (here the proxied root user).
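
    A hedged example of connecting (host and port are assumptions for this cluster; 10000 is hiveserver2's default port):

    beeline -u jdbc:hive2://hadoop01:10000 -n root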
    
    2、java jdbc access

    (also goes through hiveserver2; see the hive jdbc notes below)

    3、hive -f hql-file

    Runs the HQL statements in a file; convenient in shell scripts and scheduled jobs.

    4、hive -e 'statement'

    hive -e 'select current_date'    (hive directly executes the statement following -e)
    
    #!/bin/bash
    
    u5_query="
    select 
    * 
    from 
    qf24.u5
    "
    
    hive -e "$u5_query"    # quote the variable so the query passes as one argument
    
    Setting properties:
    1、hive-site.xml    (global: applies to every session of this installation; changes require a service restart)
    2、at startup: hive --hiveconf a=10 -e ''    (applies to that invocation only)
    3、set inside the cli:
    set ...
    select ...;

    Precedence rises with narrower scope: set overrides --hiveconf, which overrides hive-site.xml.
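
    A small sketch of option 3, reusing the u4 table from earlier:

    set hive.exec.parallel=true;   -- takes effect immediately, for this session only
    select id, count(id) from u4 group by id;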
    
    
    hive jdbc:
    1、Open conn, ps, rs in that order and close them in reverse (rs, ps, conn); watch for SASL errors when connecting.
    2、If the connection is rejected, try the username/password hiveserver2 is set up with (root/root in this course), or leave them empty.
    3、hiveserver2 must be started first.
    
    kylin: accelerates hive queries (executes them in advance and stores the results in hbase)
    Optimization
    1、At the architecture level (partitions, buckets)
    2、At the HQL statement level (keep statements simple and targeted)
    3、Via hive parameters (the defaults are listed in hive-default.xml)
    
    1、Viewing the execution plan:
    explain extended
    select 
    id id,
    count(id) cnt
    from u4 
    group by id;
    
    
    
    
    ABSTRACT SYNTAX TREE:
    
    TOK_QUERY
       TOK_FROM
          TOK_TABREF
             TOK_TABNAME
                u4
       TOK_INSERT
          TOK_DESTINATION
             TOK_DIR
                TOK_TMP_FILE
          TOK_SELECT
             TOK_SELEXPR
                TOK_TABLE_OR_COL
                   id
                id
             TOK_SELEXPR
                TOK_FUNCTION
                   count
                   TOK_TABLE_OR_COL
                      id
                cnt
          TOK_GROUPBY
             TOK_TABLE_OR_COL
                id
    
    
    STAGE DEPENDENCIES:
      Stage-1 is a root stage
      Stage-0 depends on stages: Stage-1
    
    STAGE PLANS:
      Stage: Stage-1
        Map Reduce
          Map Operator Tree:
              TableScan
                alias: u4
                Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE
                GatherStats: false
                Select Operator
                  expressions: id (type: int)
                  outputColumnNames: id
                  Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE
                  Group By Operator
                    aggregations: count(id)
                    keys: id (type: int)
                    mode: hash
                    outputColumnNames: _col0, _col1
                    Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE
                    Reduce Output Operator
                      key expressions: _col0 (type: int)
                      sort order: +
                      Map-reduce partition columns: _col0 (type: int)
                      Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE
                      tag: -1
                      value expressions: _col1 (type: bigint)
                      auto parallelism: false
          Path -> Alias:
            hdfs://hadoop01:9000/user/hive/warehouse/qf24.db/u4 [u4]
          Path -> Partition:
            hdfs://hadoop01:9000/user/hive/warehouse/qf24.db/u4
              Partition
                base file name: u4
                input format: org.apache.hadoop.mapred.TextInputFormat
                output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                properties:
                  COLUMN_STATS_ACCURATE true
                  bucket_count -1
                  columns id,name
                  columns.comments
                  columns.types int:string
                  field.delim ,
                  file.inputformat org.apache.hadoop.mapred.TextInputFormat
                  file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                  location hdfs://hadoop01:9000/user/hive/warehouse/qf24.db/u4
                  name qf24.u4
                  numFiles 1
                  numRows 4
                  rawDataSize 16
                  serialization.ddl struct u4 { i32 id, string name}
                  serialization.format ,
                  serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
                  totalSize 28
                  transient_lastDdlTime 1568602270
                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
    
                  input format: org.apache.hadoop.mapred.TextInputFormat
                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                  properties:
                    COLUMN_STATS_ACCURATE true
                    bucket_count -1
                    columns id,name
                    columns.comments
                    columns.types int:string
                    field.delim ,
                    file.inputformat org.apache.hadoop.mapred.TextInputFormat
                    file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                    location hdfs://hadoop01:9000/user/hive/warehouse/qf24.db/u4
                    name qf24.u4
                    numFiles 1
                    numRows 4
                    rawDataSize 16
                    serialization.ddl struct u4 { i32 id, string name}
                    serialization.format ,
                    serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
                    totalSize 28
                    transient_lastDdlTime 1568602270
                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
                  name: qf24.u4
                name: qf24.u4
          Truncated Path -> Alias:
            /qf24.db/u4 [u4]
          Needs Tagging: false
          Reduce Operator Tree:
            Group By Operator
              aggregations: count(VALUE._col0)
              keys: KEY._col0 (type: int)
              mode: mergepartial
              outputColumnNames: _col0, _col1
              Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE
              File Output Operator
                compressed: false
                GlobalTableId: 0
                directory: hdfs://hadoop01:9000/tmp/hive/root/5cc41b9b-d7a2-4416-8945-a17f6b462de7/hive_2019-09-16_15-26-40_893_5149303040930924164-1/-mr-10000/.hive-staging_hive_2019-09-16_15-26-40_893_5149303040930924164-1/-ext-10001
                NumFilesPerFileSink: 1
                Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE
                Stats Publishing Key Prefix: hdfs://hadoop01:9000/tmp/hive/root/5cc41b9b-d7a2-4416-8945-a17f6b462de7/hive_2019-09-16_15-26-40_893_5149303040930924164-1/-mr-10000/.hive-staging_hive_2019-09-16_15-26-40_893_5149303040930924164-1/-ext-10001/
                table:
                    input format: org.apache.hadoop.mapred.TextInputFormat
                    output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                    properties:
                      columns _col0,_col1
                      columns.types int:bigint
                      escape.delim \
                      hive.serialization.extend.additional.nesting.levels true
                      serialization.format 1
                      serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
                    serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
                TotalFiles: 1
                GatherStats: false
                MultiFileSpray: false
    
      Stage: Stage-0
        Fetch Operator
          limit: -1
          Processor Tree:
            ListSink
            
            
    Reading the output: the stage list shows how the query decomposes; the more stages and the deeper their dependencies, the more complex the job and, usually, the longer the hql runs.
    Not every stage is a MapReduce job: some stages run MR (Stage-1 above), others do not (the Fetch stage, Stage-0).
    
    
    Data skew:
    Definition: keys are unevenly distributed, so the amount of data reaching each reducer is badly unbalanced.
    Skew-prone operations:
    join on a skewed key
    count(distinct col) over many identical values
    group by on a skewed column

    Symptom:
    Individual reduce tasks hang near completion (stuck at 99%) while the others finish quickly.
    
    
    Solutions:
    1、Avoid skew-prone keys by rewriting the hql: split one task into several (turn group by / count(distinct col) into subqueries, pull the hot keys out and process them separately), then merge the partial results with union all.

    2、Handle meaningless keys such as NULL (filter them out in advance, or give them random values so they scatter across reducers; see the sketch after the join example below).
    
     select 
     t2.*
     from t_user2 t2
     join t_user2 t1
     on t2.id = t1.id
     ;
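
    A sketch of solution 2 (assumes a string join key uid, which is invented here): NULL keys all hash to one reducer, so replace them with distinct random values that can never match:

     select
     t1.*
     from t_user1 t1
     left join t_user2 t2
     on (case when t1.uid is null
              then concat('null_', rand())
              else t1.uid end) = t2.uid;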
    
    3、Parameters:
    hive.map.aggr=true;
    hive.groupby.skewindata=false;  (default false)
    hive.optimize.skewjoin=false;
    With skewjoin enabled, keys whose row count exceeds hive.skewjoin.key are pulled out of the
    reduce-side join and handled by a separate map-join job.

    4、Use map join where possible, so small tables are joined on the map side and the skewed shuffle never happens.
    
    
    
    2、join
    Put the small table on the left of the join so it drives the large table on the right.
    The on clause supports only equality conditions combined with and (no or).
    Prefer map join; note the small-table size threshold that controls automatic mapjoin conversion (see the sketch below).
    hive skew join parameters:
    hive.optimize.skewjoin=false
    hive.skewjoin.key=100000
    hive.skewjoin.mapjoin.map.tasks=10000
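
    A sketch of enabling map join (the parameter names are real hive settings; tables reused from above):

    set hive.auto.convert.join=true;                 -- auto-convert when the small table fits
    set hive.mapjoin.smalltable.filesize=25000000;   -- ~25MB threshold (the default)

    select /*+ mapjoin(t2) */ t1.*                   -- or hint it explicitly (older style)
    from t_user1 t1
    join t_user2 t2 on t1.id = t2.id;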
    
    3、limit tuning:
    hive.limit.row.max.size=100000
    hive.limit.optimize.limit.file=10
    hive.limit.optimize.enable=false  (when enabled, limit queries sample the input: faster, but possibly inaccurate)
    hive.limit.optimize.fetch.max=50000
    
    4、Local mode
    hive.exec.mode.local.auto=false  (default off; enable to let small jobs run locally)
    hive.exec.mode.local.auto.inputbytes.max=134217728  (128M)
    hive.exec.mode.local.auto.input.files.max=4
    
    5、Parallel execution:
    hive.exec.parallel=false   (default off; enable to run independent stages in parallel)
    hive.exec.parallel.thread.number=8
    
    6、Strict mode
    hive.mapred.mode=nonstrict
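
    In strict mode hive rejects a few risky query shapes; a sketch (reusing the partitioned dw_employee_hive table from the analyze section below):

    set hive.mapred.mode=strict;
    -- now rejected: scanning a partitioned table without a partition filter,
    -- order by without limit, and cartesian-product joins
    select * from dw_employee_hive where bdp_day=20190701 order by snum;            -- fails: order by needs limit
    select * from dw_employee_hive where bdp_day=20190701 order by snum limit 10;   -- ok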
    
    7、Number of mappers and reducers:
    More mappers/reducers is not automatically better, and neither is fewer; the right count depends on the data volume, so tune rather than guess.

    Reducing the mapper count (merge small input files; cf. CombineTextInputFormat):
    Split-size parameters:
    mapred.max.split.size=256000000
    mapred.min.split.size.per.node=1
    mapred.min.split.size.per.rack=1
    hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat

    Increasing the mapper count:
    set mapred.map.tasks=2;

    Reducer count (the effective number is capped as follows):
    mapred.reduce.tasks=-1
    hive.exec.reducers.max=1009
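
    A small sketch of pinning the reducer count for one query:

    set mapred.reduce.tasks=4;      -- force 4 reducers for the following jobs
    select id, count(id) from u4 group by id;
    set mapred.reduce.tasks=-1;     -- back to automatic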
    
    8、JVM reuse:
    mapreduce.job.jvm.numtasks=1   ### tasks per JVM; raise when a job has many small tasks

    mapred.job.reuse.jvm.num.tasks=1;   (older name for the same setting)
    
    
    
    10、… hive …: (…)

    11、… hive …:
    
    12、Number of jobs:
    Each query normally launches at least one job, and every extra stage of work adds more; try to keep the job count down. Watch subqueries, multiple joins, multiple group bys, and limit inside subqueries.
    
    Runs as 1 job:
    select
    t1.*
    from t_user1 t1
    left join t_user2 t2
    on t1.id = t2.id
    where t2.id is null
    ;
    
    Runs as 3 jobs:
    select
    t1.*
    from t_user1 t1
    where id in (
    select
    t2.id
    from t_user2 t2
    limit 1
    )
    ;
    
    13、analyze:
    Reference: https://cwiki.apache.org/confluence/display/Hive/StatsDev

    Analyze collects statistics about a table or partition (row counts, file counts, sizes, and optionally per-column figures) and stores them in the Hive metastore. The optimizer uses these statistics to choose better execution plans, and some queries can then be answered from the metadata alone, without scanning the data, which is far cheaper.
    
    Analyze syntax:
    ANALYZE TABLE [db_name.]tablename [PARTITION(partcol1[=val1], partcol2[=val2], ...)]  -- (Note: Fully support qualified table name since Hive 1.2.0, see HIVE-10007.)
      COMPUTE STATISTICS 
      [FOR COLUMNS]          -- (Note: Hive 0.10.0 and later.)
      [CACHE METADATA]       -- (Note: Hive 2.1.0 and later.)
      [NOSCAN];
    
    Example 1 (basic statistics):
    ANALYZE table dw_employee_hive partition(bdp_day=20190701) COMPUTE STATISTICS;
    Collects basic statistics for partition bdp_day=20190701 (number of files, number of rows, raw data size, total size) and writes them to the Hive Metastore. Column-level figures such as min, max, NULL count and distinct count are not gathered here; they come from the FOR COLUMNS variants below.
    
    Example 2 (statistics for all columns):
    ANALYZE table dw_employee_hive partition(bdp_day=20190701) COMPUTE STATISTICS FOR COLUMNS;
    Collects column-level statistics for every column of partition bdp_day=20190701.
    
    Example 3 (statistics for specific columns):
    ANALYZE table dw_employee_hive partition(bdp_day=20190701) COMPUTE STATISTICS FOR COLUMNS snum,dept;
    
    Example 4:
    ANALYZE TABLE dw_employee_hive partition(bdp_day=20190701) COMPUTE STATISTICS NOSCAN;
    With NOSCAN the data is not scanned; only the number of files and their physical size are collected, which makes it fast.
    
    Viewing the collected statistics:
    Method 1:
    DESCRIBE EXTENDED dw_employee_hive partition(bdp_day=20190701);
    
    Sample output:
    ...parameters:{totalSize=10202043, numRows=33102, rawDataSize=430326, ...
    
    Method 2:
    desc formatted dw_employee_hive partition(bdp_day=20190701) name;
    
    Sample output:
    # col_name  data_type  min  max  num_nulls  distinct_count  avg_col_len  max_col_len  num_trues  num_falses  comment
    name        string               0          37199           4.0          4                                   from deserializer
    
    
    Notes:
    For a non-partitioned table the statements are the same, just without the partition(bdp_day) clause; see the sketch below.
    A Chinese write-up with more examples: https://www.cnblogs.com/lunatic-cto/p/10988342.html
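
    For instance, against the non-partitioned u4 table used earlier:

    ANALYZE TABLE u4 COMPUTE STATISTICS;
    DESCRIBE EXTENDED u4;    -- numRows/totalSize now appear under parameters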
    
    MySQL stored procedures (overview only; not covered in depth):
    Requirement 1: generate and insert test data of this shape:
    Id, name(""+i), age(random)
    
    CREATE TABLE IF NOT EXISTS USER(
    id BIGINT(11) NOT NULL AUTO_INCREMENT,
    NAME VARCHAR(45) DEFAULT NULL,
    age INT(1) DEFAULT 1,
    PRIMARY KEY(id)
    )
    ENGINE=INNODB AUTO_INCREMENT=0 DEFAULT CHARSET=utf8;
    Requirement 2: split one wide user table into two:
    User
    Id	name	age


    User-info
    Uid	birthday	sex	.....
    
    
    Declaring variables in MySQL procedures:
    Keyword: declare
    Syntax:
    DECLARE variable_name type(length) [DEFAULT default_value];
    Example: declare name varchar(45) default '';
    
    Assigning variables:
    Keyword: set
    Example: SET i = 100;
    
    MySQL supports the forms if...end if, if...else... and if...else if...else... .
    if/else syntax (conditions may combine and / or):
    IF condition THEN
       statements;   -- runs when the condition holds
    END IF;

    IF condition THEN
       statements;   -- condition true
    ELSE
       statements;   -- condition false
    END IF;

    IF condition THEN
       statements;
    ELSEIF condition THEN   -- note: ELSEIF is one word
       statements;
    ELSE
       statements;
    END IF;
    
    
    
    
    
    MySQL offers three loop statements: WHILE, REPEAT and LOOP, plus a non-standard GOTO; GOTO is best avoided because it makes procedures hard to follow. The common ones are WHILE and REPEAT (see the sketch after the skeletons):
    WHILE condition DO
       statements;   -- body runs while the condition holds
    END WHILE;

    REPEAT
       statements;   -- body runs at least once
    UNTIL condition  -- note: no semicolon after the UNTIL condition
    END REPEAT;
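
    A minimal sketch of requirement 1 using WHILE (the procedure name is invented; USER is the table created above):

    DELIMITER $$
    CREATE PROCEDURE insert_users(IN cnt INT)
    BEGIN
      DECLARE i INT DEFAULT 0;
      WHILE i < cnt DO
        -- name is "" + i, age is random in [1, 100]
        INSERT INTO test.`USER`(`NAME`, age)
        VALUES (CONCAT('', i), FLOOR(1 + RAND() * 100));
        SET i = i + 1;
      END WHILE;
    END $$
    DELIMITER ;

    CALL insert_users(100);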
    
    One way to load requirement 1's data from an existing table:
    INSERT into test.`USER`(`name`,age) 
    SELECT
    `name`,
    age
    FROM stu
    ;
    
    Cursor example:
    BEGIN

    DECLARE _id INT(11) DEFAULT 0;
    DECLARE _nm VARCHAR(22) DEFAULT '';

    # end-of-loop flag
    DECLARE _done INT DEFAULT 0;

    DECLARE stu_set CURSOR FOR
    SELECT
    s.id id,
    s.`name` nm
    FROM stu1 s
    ;

    # when the cursor moves past the last row, set the flag
    DECLARE CONTINUE HANDLER FOR SQLSTATE '02000' SET _done = 1;

    # open the cursor
    OPEN stu_set;
        /* fetch rows in a loop */
        REPEAT
            FETCH stu_set INTO _id, _nm;
            IF NOT _done THEN
                INSERT INTO test.`USER`(`NAME`, age) VALUES (_nm, _id);
            END IF;
        UNTIL _done END REPEAT;  # the loop ends once _done = 1
    CLOSE stu_set;

    END
    
    Hive stored procedures (via HPL/SQL)
    CREATE PROCEDURE set_message(IN name STRING, OUT result STRING)
    BEGIN
     SET result = 'Hello, ' || name || '!';
    END;
     
    -- Now call the procedure and print the results
    DECLARE str STRING;
    CALL set_message('world', str);
    PRINT str;
     
    Result:
    --
    Hello, world!
    
    
    Examples:
    use ali;
    create procedure select_u5()
    begin
    select * from ali.lg;
    end;
    
    
    create function hello(text string)
    RETURNS string
    BEGIN
    RETURN 'Hello,' || text || '!';
    END;
    
    create procedure select_u53()
    begin
    FOR item IN(
    SELECT user_id,ds FROM ali.read limit 2
    )
    loop
        print item.user_id || '|' || item.ds || '|' || hello(item.ds);
    end loop;
    end;
    
    
    create procedure pc()
    begin
    DECLARE tabname VARCHAR DEFAULT 'ali.pay';
    DECLARE user_id INT;
    DECLARE cur CURSOR FOR 'SELECT user_id FROM ' || tabname;
    OPEN cur;
    FETCH cur INTO user_id;
    WHILE SQLCODE=0 THEN
      PRINT user_id;
      FETCH cur INTO user_id;
    END WHILE;
    CLOSE cur;
    end;
    
    
    Invoking them:
    include /usr/local/sc/fp.hql  -- load the procedure definitions from the file first

    call select_u5();  -- then the procedures and functions can be called

    call select_u53();

    call hello("text");

    call pc();