hiveステップ4
hiveステップ4
hiveのファイル保存フォーマット: File Formats and Compression (https://cwiki.apache.org/confluence/display/Hive/FileFormats): RCFile, Avro, ORC, Parquet; Compression: LZO。上記のすべてのファイルフォーマットは直接ロードできないことに注意してください。各保存フォーマットの属性:
最適化
hiveのフィールド区切り:
hive :\001 、ctrl+V ctrl+A(^A) 、SOH 、 \u0001( java ), tab
:
tab
,
" "
|
\001 ^A (\u0001, \0001 \01)
\002 ^B
\003 ^C
hiveのファイル保存フォーマット:hive :textfile
textfile: , 。 , 。( )
sequencefile:
hive , 。 load
rcfile:
hive ,hive , 。 , 。
orc :
rcfile。
parquet :
。 , ( )
hive.default.fileformat
TextFile
Expects one of [textfile, sequencefile, rcfile, orc].
Default file format for CREATE TABLE statement. Users can explicitly override it by CREATE TABLE ... STORED AS [FORMAT]
textfile: 。
map :
mapreduce.map.output.compress=false
mapreduce.map.output.compress.codec=org.apache.hadoop.io.compress.DefaultCodec
reduce (reduce ):
snappy、bzip2、gzip、DefaultCompress
mapreduce.output.fileoutputformat.compress=false
mapreduce.output.fileoutputformat.compress.type=NONE/RECORD/BLOCK( RECORD)
mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.DefaultCodec
hive :
set hive.exec.compress.output=false;
set hive.exec.compress.intermediate=false;
set hive.intermediate.compression.codec=
set hive.intermediate.compression.type=
-- Example 1: TEXTFILE storage (Hive's default file format).
-- Comma-delimited two-column table.
CREATE TABLE `u4`(
`id` int,
`name` string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
stored as textfile;
-- Turn on compression for the MR job output and for Hive query output;
-- these settings apply to the INSERT below.
set mapreduce.output.fileoutputformat.compress=true;
set hive.exec.compress.output=true;
-- Populate via INSERT...SELECT from u2.
insert into table u4
select * from u2;
2:
sequence :
-- Example 2: same schema stored as SEQUENCEFILE.
CREATE TABLE `u4`(
`id` int,
`name` string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
stored as sequencefile;
3:
rcfile :
-- Example 3: RCFILE (Record Columnar File) storage format.
CREATE TABLE `u5`(
`id` int,
`name` string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
stored as rcfile;
4:
orc :
-- Example 4: ORC storage format.
CREATE TABLE `u6`(
`id` int,
`name` string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
stored as orc;
5:
parquet:
-- Example 5: PARQUET storage format.
CREATE TABLE `u7`(
`id` int,
`name` string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
stored as PARQUET;
-- Populate via INSERT...SELECT (these formats are not loaded from plain
-- text with LOAD DATA -- see the file-format note near the top of the file).
insert into table u7
select * from u2;
:
:
seq_yd :
aGVsbG8gemhhbmdoYW8=
aGVsbG8gZmVpZmVpLGdvb2QgZ29vZCBzdHVkeSxkYXkgZGF5IHVw
seq_yd base64 ,decode :
-- Custom file format: table backed by the contrib Base64 input/output
-- formats (presumably base64-decodes on read / encodes on write -- confirm
-- against the contrib class docs).
create table cus(str STRING)
stored as
inputformat 'org.apache.hadoop.hive.contrib.fileformat.base64.Base64TextInputFormat'
outputformat 'org.apache.hadoop.hive.contrib.fileformat.base64.Base64TextOutputFormat';
-- Load the base64-encoded local file into the table.
LOAD DATA LOCAL INPATH '/home/hivedata/cus' INTO TABLE cus;
Hiveビュー :
hive
hive , 。
hive :
1、 ( )。
2、 。
(cvas):
-- CVAS (CREATE VIEW ... AS SELECT): view projecting id from u2.
create view if not exists tab_v1
as
select id from u2;
:
show tables;
show create table tab_v1;
-----------------------------------------------------------------------------
show create table tab2;
OK
CREATE VIEW `tab2` AS select `lg`.`user_id` from `ali`.`lg`
Time taken: 0.14 seconds, Fetched: 1 row(s)
-----------------------------------------------------------------------------
desc tab2;
-----------------------------------------------------------------------------
OK
user_id string
-----------------------------------------------------------------------------
:(hive-1.2.1 )
create view tab_v2 like tab_v1; ---
alter view tab1 rename to tab2;
:
drop view if exists tab_v2; ( )
drop table if exists tab_v1; ( )
:
1、 。
2、 insert into load 。
3、 , 、 。
hiveのログ:hive :
:/tmp/{user.name}
hive root :/tmp/root/hive.log,root 。
hive.log.dir={java.io.tmpdir}/{user.name}
hive.log.file=hive.log
hive :
hive conf hive-log4j2.properties
hive.querylog.location
{system:java.io.tmpdir}/${system:user.name}
Location of Hive run time structured log file
hiveの運転方式:1、cli : (hive/beeline) beeline hiveserver2
hive --service hiveserver2 &
hiveserver2 &
: root ... 。 hadoop core-site.xml
:
hadoop.proxyuser.root.hosts
*
hadoop.proxyuser.root.groups
*
root root
------------------------------------------------------
hadoop.proxyuser.root.hosts
192.168.80.10/16
ip
-----------------------------------------------------
beeline , ?
2、java jdbc
3、hive -f hql
,
4、hive -e
hive -e 'current_date+1' hive -e
#!/bin/bash
# Run a saved HQL query through `hive -e`.
u5_query="
select
*
from
qf24.u5
"
# Quote the variable so the multi-line query is passed to hive as a single
# argument (unquoted, word-splitting would break it into many arguments).
hive -e "$u5_query"
# NOTE(review): u6_query is never defined in this script -- confirm whether a
# second query was meant to be declared above, or drop this call.
hive -e "$u6_query"
属性の設定:1、hive-site.xml ( , 、 )
2、hive hive --hiveconf a=10 -e ''
3、hive cli set
set ...
select ...;
。
hiveのjdbc:1、conn、ps\rs rs\ps\conn, sasl
2、 , root、root, 。
3、 hiveserver2
kylin:hiveのクエリを加速する(事前に調べて実行し、結果をhbaseに保存する)最適化
1、 ( 、 )
2、 ( )
3、 (hive-default.xml )
1、
explain extended
select
id id,
count(id) cnt
from u4
group by id;
> explain extended
> select
> id id,
> count(id) cnt
> from u4
> group by id;
ABSTRACT SYNTAX TREE:
TOK_QUERY
TOK_FROM
TOK_TABREF
TOK_TABNAME
u4
TOK_INSERT
TOK_DESTINATION
TOK_DIR
TOK_TMP_FILE
TOK_SELECT
TOK_SELEXPR
TOK_TABLE_OR_COL
id
id
TOK_SELEXPR
TOK_FUNCTION
count
TOK_TABLE_OR_COL
id
cnt
TOK_GROUPBY
TOK_TABLE_OR_COL
id
STAGE DEPENDENCIES:
Stage-1 is a root stage
Stage-0 depends on stages: Stage-1
STAGE PLANS:
Stage: Stage-1
Map Reduce
Map Operator Tree:
TableScan
alias: u4
Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Select Operator
expressions: id (type: int)
outputColumnNames: id
Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE
Group By Operator
aggregations: count(id)
keys: id (type: int)
mode: hash
outputColumnNames: _col0, _col1
Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE
Reduce Output Operator
key expressions: _col0 (type: int)
sort order: +
Map-reduce partition columns: _col0 (type: int)
Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE
tag: -1
value expressions: _col1 (type: bigint)
auto parallelism: false
Path -> Alias:
hdfs://hadoop01:9000/user/hive/warehouse/qf24.db/u4 [u4]
Path -> Partition:
hdfs://hadoop01:9000/user/hive/warehouse/qf24.db/u4
Partition
base file name: u4
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
COLUMN_STATS_ACCURATE true
bucket_count -1
columns id,name
columns.comments
columns.types int:string
field.delim ,
file.inputformat org.apache.hadoop.mapred.TextInputFormat
file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
location hdfs://hadoop01:9000/user/hive/warehouse/qf24.db/u4
name qf24.u4
numFiles 1
numRows 4
rawDataSize 16
serialization.ddl struct u4 { i32 id, string name}
serialization.format ,
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 28
transient_lastDdlTime 1568602270
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
COLUMN_STATS_ACCURATE true
bucket_count -1
columns id,name
columns.comments
columns.types int:string
field.delim ,
file.inputformat org.apache.hadoop.mapred.TextInputFormat
file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
location hdfs://hadoop01:9000/user/hive/warehouse/qf24.db/u4
name qf24.u4
numFiles 1
numRows 4
rawDataSize 16
serialization.ddl struct u4 { i32 id, string name}
serialization.format ,
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 28
transient_lastDdlTime 1568602270
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
name: qf24.u4
name: qf24.u4
Truncated Path -> Alias:
/qf24.db/u4 [u4]
Needs Tagging: false
Reduce Operator Tree:
Group By Operator
aggregations: count(VALUE._col0)
keys: KEY._col0 (type: int)
mode: mergepartial
outputColumnNames: _col0, _col1
Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
GlobalTableId: 0
directory: hdfs://hadoop01:9000/tmp/hive/root/5cc41b9b-d7a2-4416-8945-a17f6b462de7/hive_2019-09-16_15-26-40_893_5149303040930924164-1/-mr-10000/.hive-staging_hive_2019-09-16_15-26-40_893_5149303040930924164-1/-ext-10001
NumFilesPerFileSink: 1
Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE
Stats Publishing Key Prefix: hdfs://hadoop01:9000/tmp/hive/root/5cc41b9b-d7a2-4416-8945-a17f6b462de7/hive_2019-09-16_15-26-40_893_5149303040930924164-1/-mr-10000/.hive-staging_hive_2019-09-16_15-26-40_893_5149303040930924164-1/-ext-10001/
table:
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
columns _col0,_col1
columns.types int:bigint
escape.delim \
hive.serialization.extend.additional.nesting.levels true
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
TotalFiles: 1
GatherStats: false
MultiFileSpray: false
Stage: Stage-0
Fetch Operator
limit: -1
Processor Tree:
ListSink
stage ,stage , , hql 。
stage , , stage mr mr 。
:
: key 。
join
count(distinct col)
group by
:
reduce 。
:
1、 key, hql ( task ---> task 、group by\count(distinct col) ---> ---> key )。 , union all。
2、 key ( 、 )。
-- Skew-handling example: self-join of t_user2 on id.
select
t2.*
from t_user2 t2
join t_user2 t1
on t2.id = t1.id
;
3、
hive.map.aggr=true;
hive.groupby.skewindata=false; ( )
hive.optimize.skewjoin=false;
skewjoin :
skew :
4、 , , 。
2、join
hive ( ) ( )
hive on and
hive join map join、 mapjoin
hive join:
hive.optimize.skewjoin=false
hive.skewjoin.key=100000
hive.skewjoin.mapjoin.map.tasks=10000
3、limit :
hive.limit.row.max.size=100000
hive.limit.optimize.limit.file=10
hive.limit.optimize.enable=false ( limit )
hive.limit.optimize.fetch.max=50000
4、
hive.exec.mode.local.auto=false ( )
hive.exec.mode.local.auto.inputbytes.max=134217728 (128M)
hive.exec.mode.local.auto.input.files.max=4
5、 :
hive.exec.parallel=false ( )
hive.exec.parallel.thread.number=8
6、
hive.mapred.mode=nonstrict
7、mapper reducer :
mapper reducer , 。 。
( :CombineTextInputFormat)
:
mapred.max.split.size=256000000
mapred.min.split.size.per.node=1
mapred.min.split.size.per.rack=1
hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat
:
set mapred.map.tasks=2;
reducer ( ):
mapred.reduce.tasks=-1
hive.exec.reducers.max=1009
8、 jvm :
mapreduce.job.jvm.numtasks=1 ###
mapred.job.reuse.jvm.num.tasks=1;
10、 hive :( )
11、 hive :
12、job :
job, job, 、 join、 group by 、 limit 。
1 job:
-- Anti-join written as LEFT JOIN + IS NULL: rows of t_user1 with no matching
-- id in t_user2 (compiles to 1 MR job, per the note above).
select
t1.*
from t_user1 t1
left join t_user2 t2
on t1.id = t2.id
where t2.id is null
;
3 job:
-- IN-subquery form: filter t_user1 by ids drawn from t_user2 (LIMIT 1);
-- compiles to 3 MR jobs, per the note above.
select
t1.*
from t_user1 t1
where id in (
select
t2.id
from t_user2 t2
limit 1
)
;
13、analyze:
:https://cwiki.apache.org/confluence/display/Hive/StatsDev
Analyze, ( ) Hive , 。 , , ( ), 。
Analyze :
ANALYZE TABLE [db_name.]tablename [PARTITION(partcol1[=val1], partcol2[=val2], ...)] -- (Note: Fully support qualified table name since Hive 1.2.0, see HIVE-10007.)
COMPUTE STATISTICS
[FOR COLUMNS] -- (Note: Hive 0.10.0 and later.)
[CACHE METADATA] -- (Note: Hive 2.1.0 and later.)
[NOSCAN];
1( )、
ANALYZE table dw_employee_hive partition(bdp_day=20190701) COMPUTE STATISTICS;
bdp_day=20190701 。 。 , Hive Metastore 。 , ,NULL , , ( ) 。
2( )、
ANALYZE table dw_employee_hive partition(bdp_day=20190701) COMPUTE STATISTICS FOR COLUMNS;
bdp_day=20190701 。
3( )、
ANALYZE table dw_employee_hive partition(bdp_day=20190701) COMPUTE STATISTICS FOR COLUMNS snum,dept;
4、
ANALYZE TABLE dw_employee_hive partition(bdp_day=20190701) COMPUTE STATISTICS NOSCAN;
, 。
。
1、
DESCRIBE EXTENDED dw_employee_hive partition(bdp_day=20190701);
:
...parameters:{totalSize=10202043, numRows=33102, rawDataSize=430326, ...
2、
desc formatted dw_employee_hive partition(bdp_day=20190701) name;
:
# col_name data_type min max num_nulls distinct_count avg_col_len max_col_len num_trues num_falses comment
name string 0 37199 4.0 4 from deserializer
:
, , , partition(bdp_day).
:https://www.cnblogs.com/lunatic-cto/p/10988342.html
mysqlの格納過程(詳しくは説明できません): 1: :
Id name(“”+i) age( )
-- MySQL demo table: auto-increment id, nullable name, age defaulting to 1.
-- NOTE(review): USER is a MySQL keyword; later statements refer to it
-- backtick-quoted as `USER` -- confirm quoting is consistent in practice.
CREATE TABLE IF NOT EXISTS USER(
id BIGINT(11) NOT NULL AUTO_INCREMENT,
NAME VARCHAR(45) DEFAULT NULL,
age INT(1) DEFAULT 1,
PRIMARY KEY(id)
)
ENGINE=INNODB AUTO_INCREMENT=0 DEFAULT CHARSET=utf8;
2:
User
Id name age
User-info
Uid birthday sex .....
mysql :
:declare
:
Declare ( ) [default ];
:declare name varchar(45) default '';
mysql :
:set
: Set i =100;
mysql if...end if 、if...else... if...else if...else... 。
if else :( and or)
If then
; // ,
End if;
If then
; //
Else
; //
End if;
If then
; //
Elseif // Elseif
; //
Else
; //
End if;
MySQL :WHILE ,REPEAT LOOP , :GOTO, , 。 while repeat
:
WHILE DO
; //
END WHILE;
REPEAT
; //
UNTIL //
END REPEAT;
1 :
-- Set-based copy of name/age from stu into test.USER (no cursor needed).
INSERT into test.`USER`(`name`,age)
SELECT
`name`,
age
FROM stu
;
:
BEGIN
-- Cursor-based copy of (id, name) rows from stu1 into test.USER.
DECLARE _id INT(11) DEFAULT 0;
DECLARE _nm VARCHAR(22) DEFAULT '';
# loop-termination flag, flipped by the CONTINUE handler below
DECLARE _done int default 0;
DECLARE stu_set cursor for
SELECT
s.id id,
s.`name` nm
FROM stu1 s
;
DECLARE CONTINUE HANDLER FOR SQLSTATE '02000' SET _done = 1;# SQLSTATE 02000 = no more rows: set the flag and keep running
# open the cursor before fetching
OPEN stu_set;
/* fetch one row per iteration until the handler fires */
REPEAT
FETCH stu_set INTO _id,_nm;
IF NOT _done THEN
-- NOTE(review): the age column receives _id here -- confirm intended.
INSERT INTO test.`USER`(`NAME`,age) VALUES (_nm,_id);
END IF;
UNTIL _done END REPEAT; # exits once _done=1
CLOSE stu_set;
END
hiveの保存プロセス (HPL/SQL):
CREATE PROCEDURE set_message(IN name STRING, OUT result STRING)
BEGIN
SET result = 'Hello, ' || name || '!';
END;
-- Now call the procedure and print the results
DECLARE str STRING;
CALL set_message('world', str);
PRINT str;
Result:
--
Hello, world!
:
use ali;
-- HPL/SQL procedure: select all rows from ali.lg.
create procedure select_u5()
begin
select * from ali.lg;
end;
-- HPL/SQL scalar function: returns 'Hello,<text>!' built with the '||'
-- string-concatenation operator.
-- Fix: the original body used RETRUEN, a typo for RETURN, which would fail
-- to parse; RETURNS keyword casing normalized.
create function hello(text string)
returns string
BEGIN
RETURN 'Hello,' || text || '!';
END;
-- HPL/SQL procedure: FOR...LOOP over 2 rows of ali.read, printing
-- user_id|ds|hello(ds) for each row (hello is defined elsewhere in file).
create procedure select_u53()
begin
FOR item IN(
SELECT user_id,ds FROM ali.read limit 2
)
loop
println item.user_id || '|' || item.ds || '|' || hello(item.ds);
end loop;
end;
-- HPL/SQL procedure: cursor over a dynamically-built query
-- ('SELECT user_id FROM ' || tabname); fetch-and-print while SQLCODE=0
-- (HPL/SQL sets SQLCODE nonzero when the fetch finds no more rows).
create procedure pc()
begin
DECLARE tabname VARCHAR DEFAULT 'ali.pay';
DECLARE user_id INT;
DECLARE cur CURSOR FOR 'SELECT user_id FROM ' || tabname;
OPEN cur;
FETCH cur INTO user_id;
WHILE SQLCODE=0 THEN
PRINT user_id;
FETCH cur INTO user_id;
END WHILE;
CLOSE cur;
end;
:
include /usr/local/sc/fp.hql --
call select_u5(); --
call select_u53();
call hello("text");
call pc();