HadoopのHive

8547 ワード
hadoop
/*
Hive   sql   map-reduce    
Hive         ，    hadoop         ，        DML  。
Hive       ，   ，     ，          。

*/

NoSQL,NOT Only SQL。           ，            。    sql   。
NewSQL,SQL   
  "    "   
Hive            

      ，   Hadoop           Hive   。
     SQL         HiveQL。       ，     ，  SQL   
      。
     SQL Map-Reduce    。
  shell,JDBC/ODBC,Thrift,web   。


Hive   ：
Hive    ：shell,thrift,web .
Thrift   (Java   )
     "Derby,Mysql" 
   
Hadoop


UDF = USER DEFINITION FUNCTION

Hive  

    :          Derby  ，         。
      ：     Mysql，      Mysql 。
    ：         Mysql   。

Hive Sql  
--   sql
create table Loc(
CDRID STRING,
IMSI INT,
....
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|'
STORED AS TEXTFILE;

--      

DESCRIBE EXTENDED mydb.employees;

--    sql
LOAD DATA LOCAL INPATH '/home/james/location_20120316.txt'
OVERWRITE INTO TABLE LOC;

INSERT OVERWRITE TABLE RESULT
SELECT 
IMSI,IMEI,SUBSTR(CGI,8),STARTTIME,NULL,UPDATETYPE,3......
FROM LOC
WHERE IMSI IS NOT NULL;

--  SQL
SELECT IMSI,CGI,TIME FROM RESULT;
SELECT BAR FROM POKES LIMIT 5;

--   
SELECT RESULT.IMSI,LOC.INSTIME
FROM RESULT JOIN LOC ON (RESULT.IMSI = LOC.IMSI);


--       
show tables;

--   
drop table abc;



JDBC/ODBC  

                JDBC ODBC  Hive
      
  jdbc     Hive，            hive Thrift Server,    hive     connection refused   。
    ：
hive -- service hiveserver

Hive       ？
   HDFS warehouse   ，          。
  reduce
   /tmp           

      
struct('John','Doe')
map('first','john','last','Doe')
array('john','Doe')

    
create table employees(
name STRING,
salary FLOAT,
subordinates ARRAY(<TRING>,
deductions MAP<STRING,FLOAT>,
address STRUCT<street:STRING,city:STRING,state:STRING,zip:INT>);
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\001'
COLLECTION ITEMS TERMINATED BY '\002'
MAP KEYS TERMINATED BY '\003'
LINES TERMINATED BY '
'
STORED AS TEXTFILE;

DDL  
     
   

--        
CREATE DATABASE financials;
CREATE DATABASE IF NOT EXISTS financials;
SHOW DATABASES;
SHOW DATABASES LIKE 'h.*'

--         
CREATE DATABASE financials
LOCATION '/my/preferred/directory'

--     
USE financials;
--       
set hive.cli.print.current.db=true;
--        
DROP DATABASE IF EXISTS financials;
DROP DATABASE IF EXISTS financials CASCADE;
--          
ALTER DATABASE financials SET DBPROPERTIES('edited-by','Joe Dba');

--   
--            ，          oracle
Hive       ，                ；      ，          ， 
            。       ，                ， 
          ，     。              ，         ，       。


CREATE EXTERNAL TABLE IF NOT EXISTS stocks
(
exchange STIRNG,
...
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION '/data/stocks'

--   
--        ，     sql   。
create table employees(
name STRING,
salary FLOAT,
...
)
PARTITIONED BY (country STRING,state STRING);

                   。            。
      ：                。

--Strict          
set hive.mapred.mode=strict;

    strict  ，           ， where          。
     nostrict  。


--    

SHOW PARTITIONS employees PARTITION(country='US');
SHOW PARTITIONS employees PARTITION(country='US','state=AK');

--   
alter table xx rename to xx;
--    
alter table xx add if not exists
partition(year=2011,month=1,day=1) location '/logs/2011/01/01'


--   
--      
alter table xx change column xx int;

--   
alter table xx add columns (xx string comment '');
--replace 
alter table log_messages replace columns
(
xx int comment ''
);

DML  

       DML  。                  。

--        
Load data local input 'path'
overwrite into table employees
partition(country='US',state='CA')

--insert overwrite  

insert overwrite table employees
partition(country='US',state='OR')
select * from staged_employees se
where se.cnty='US' and se.st='OR';


--       。Hadoop              ，         。
--              
FROM staged_employees se
INSERT OVERWRITE table employees
partition(country='US',state='OR')
select * where se.cnty='US' and se.st='OR'
INSERT OVERWRITE table employees
partition(country='US',state='CA')
select * where se.cnty='US' and se.st='CA'
INSERT OVERWRITE table employees
partition(country='US',state='IL')
select * where se.cnty='US' and se.st='IL';

--      ，         。
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nostrict;
set hive.exec.max.dynamic.partitions.pernode=1000;

insert overwrite table employees
partition(country,state)
select ..,se.cty,se.st
from staged_employees se;
--   
create table ... as select ...

--    
--             ，      hdfs       
--
INSERT OVERWRITE LOCAL DIRECTORY 'tmp/ca_employees'
select name,salary,address
from employees
where se.state = 'CA';

--       
--   prick      
SELECT sysmbol,price.* FROM stocks;

--%         。_        
SELECT NAME,ADDRESS.STREET FROM EMPLOYEES WHERE ADDRESS.STREET
LIKE '%Ave.';

--RLIKE       
select name,address.street
from employees where address.street RLIKE '.*(Chicago|Ontario).*';

--explode   

select array(1,2,3) from dual;
[1,2,3]

select explode(array(1,2,3)) as elements from src;
1,
2,
3

--    
FROM
(
select upper(name),salary,deductions["Federal Taxes"] as fed_taxes,
round(salary*(1-deductions["Federal Taxes"])) as salary_minus_fed_taxes
from employees
)e
select e.name,e.salary_minus_fed_taxes
where e.salary_minus_fed_taxes >70000;

--       
--         ，     map-reduce  combine。
--          ，         reduce  
set hive.map.aggr=true;
select count(*),avg(salary) from employees;


--    ,              (     ,   ,   )
--        !
--        reduce      ,  "map-side joins",                           ,
--        reduce  。
--    ，       
                
        
  map-side joins    .

select /*+ MAPJOIN(d) */
s.ymd,s.symbol,s.price_close,d.dividend
FROM stocks s JOIN dividends d on s.ymd = d.ymd
and s.symbol=d.symbol
where s.symbol='AAPL';

--hive.auto.convert.join=true
--hive            mapjoin.
--                    。
--hive.mapjoin.smalltable.filesize=25000000
--                


--  

order by --    
sort by --  reduce  ,       .  reducer     
distribute by--            reduce  ,   sort by    
select s.ymd,s.symbol,s.price_close
from stocks s
distribute by s.symbol
sort by s.symbol asc,s.ymd asc;


cluster by --    sort by + distribute by 

--bucket     
               
select * from number tablesample(BUCKET 3 OUT OF 10 ON rand())s;
--    
select * from numbersflat TABLESAMPLE(0.1 PERCENT) S;

--     
--Hive                  
Hive         ，          B+   
   ，Hive                     。
redis突然の停電によるaofファイル破損修復の一例
プログラマー面接金典-3.6ダブルスタックソート