HadoopのHive
8547 ワード
/*
Hive sql map-reduce
Hive , hadoop , DML 。
Hive , , , 。
*/
NoSQL,NOT Only SQL。 , 。 sql 。
NewSQL,SQL
" "
Hive
, Hadoop Hive 。
SQL HiveQL。 , , SQL
。
SQL Map-Reduce 。
shell,JDBC/ODBC,Thrift,web 。
Hive :
Hive :shell,thrift,web .
Thrift (Java )
"Derby,Mysql"
Hadoop
UDF = USER DEFINITION FUNCTION
Hive
: Derby , 。
: Mysql, Mysql 。
: Mysql 。
Hive Sql
-- sql
create table Loc(
CDRID STRING,
IMSI INT,
....
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|'
STORED AS TEXTFILE;
--
DESCRIBE EXTENDED mydb.employees;
-- sql
LOAD DATA LOCAL INPATH '/home/james/location_20120316.txt'
OVERWRITE INTO TABLE LOC;
INSERT OVERWRITE TABLE RESULT
SELECT
IMSI,IMEI,SUBSTR(CGI,8),STARTTIME,NULL,UPDATETYPE,3......
FROM LOC
WHERE IMSI IS NOT NULL;
-- SQL
SELECT IMSI,CGI,TIME FROM RESULT;
SELECT BAR FROM POKES LIMIT 5;
--
SELECT RESULT.IMSI,LOC.INSTIME
FROM RESULT JOIN LOC ON (RESULT.IMSI = LOC.IMSI);
--
show tables;
--
drop table abc;
JDBC/ODBC
JDBC ODBC Hive
jdbc Hive, hive Thrift Server, hive connection refused 。
:
hive -- service hiveserver
Hive ?
HDFS warehouse , 。
reduce
/tmp
struct('John','Doe')
map('first','john','last','Doe')
array('john','Doe')
create table employees(
name STRING,
salary FLOAT,
subordinates ARRAY(<TRING>,
deductions MAP<STRING,FLOAT>,
address STRUCT<street:STRING,city:STRING,state:STRING,zip:INT>);
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\001'
COLLECTION ITEMS TERMINATED BY '\002'
MAP KEYS TERMINATED BY '\003'
LINES TERMINATED BY '
'
STORED AS TEXTFILE;
DDL
--
CREATE DATABASE financials;
CREATE DATABASE IF NOT EXISTS financials;
SHOW DATABASES;
SHOW DATABASES LIKE 'h.*'
--
CREATE DATABASE financials
LOCATION '/my/preferred/directory'
--
USE financials;
--
set hive.cli.print.current.db=true;
--
DROP DATABASE IF EXISTS financials;
DROP DATABASE IF EXISTS financials CASCADE;
--
ALTER DATABASE financials SET DBPROPERTIES('edited-by','Joe Dba');
--
-- , oracle
Hive , ; , ,
。 , ,
, 。 , , 。
CREATE EXTERNAL TABLE IF NOT EXISTS stocks
(
exchange STIRNG,
...
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION '/data/stocks'
--
-- , sql 。
create table employees(
name STRING,
salary FLOAT,
...
)
PARTITIONED BY (country STRING,state STRING);
。 。
: 。
--Strict
set hive.mapred.mode=strict;
strict , , where 。
nostrict 。
--
SHOW PARTITIONS employees PARTITION(country='US');
SHOW PARTITIONS employees PARTITION(country='US','state=AK');
--
alter table xx rename to xx;
--
alter table xx add if not exists
partition(year=2011,month=1,day=1) location '/logs/2011/01/01'
--
--
alter table xx change column xx int;
--
alter table xx add columns (xx string comment '');
--replace
alter table log_messages replace columns
(
xx int comment ''
);
DML
DML 。 。
--
Load data local input 'path'
overwrite into table employees
partition(country='US',state='CA')
--insert overwrite
insert overwrite table employees
partition(country='US',state='OR')
select * from staged_employees se
where se.cnty='US' and se.st='OR';
-- 。Hadoop , 。
--
FROM staged_employees se
INSERT OVERWRITE table employees
partition(country='US',state='OR')
select * where se.cnty='US' and se.st='OR'
INSERT OVERWRITE table employees
partition(country='US',state='CA')
select * where se.cnty='US' and se.st='CA'
INSERT OVERWRITE table employees
partition(country='US',state='IL')
select * where se.cnty='US' and se.st='IL';
-- , 。
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nostrict;
set hive.exec.max.dynamic.partitions.pernode=1000;
insert overwrite table employees
partition(country,state)
select ..,se.cty,se.st
from staged_employees se;
--
create table ... as select ...
--
-- , hdfs
--
INSERT OVERWRITE LOCAL DIRECTORY 'tmp/ca_employees'
select name,salary,address
from employees
where se.state = 'CA';
--
-- prick
SELECT sysmbol,price.* FROM stocks;
--% 。_
SELECT NAME,ADDRESS.STREET FROM EMPLOYEES WHERE ADDRESS.STREET
LIKE '%Ave.';
--RLIKE
select name,address.street
from employees where address.street RLIKE '.*(Chicago|Ontario).*';
--explode
select array(1,2,3) from dual;
[1,2,3]
select explode(array(1,2,3)) as elements from src;
1,
2,
3
--
FROM
(
select upper(name),salary,deductions["Federal Taxes"] as fed_taxes,
round(salary*(1-deductions["Federal Taxes"])) as salary_minus_fed_taxes
from employees
)e
select e.name,e.salary_minus_fed_taxes
where e.salary_minus_fed_taxes >70000;
--
-- , map-reduce combine。
-- , reduce
set hive.map.aggr=true;
select count(*),avg(salary) from employees;
-- , ( , , )
-- !
-- reduce , "map-side joins", ,
-- reduce 。
-- ,
map-side joins .
select /*+ MAPJOIN(d) */
s.ymd,s.symbol,s.price_close,d.dividend
FROM stocks s JOIN dividends d on s.ymd = d.ymd
and s.symbol=d.symbol
where s.symbol='AAPL';
--hive.auto.convert.join=true
--hive mapjoin.
-- 。
--hive.mapjoin.smalltable.filesize=25000000
--
--
order by --
sort by -- reduce , . reducer
distribute by-- reduce , sort by
select s.ymd,s.symbol,s.price_close
from stocks s
distribute by s.symbol
sort by s.symbol asc,s.ymd asc;
cluster by -- sort by + distribute by
--bucket
select * from number tablesample(BUCKET 3 OUT OF 10 ON rand())s;
--
select * from numbersflat TABLESAMPLE(0.1 PERCENT) S;
--
--Hive
Hive , B+
,Hive 。