pigとmysqlスクリプトの比較
139456 ワード
テストデータ:/home/hadoop/luogankun/workspace/sync_data/pig
deptとempテーブルはoracleデータベースから持参したテーブルに由来します
dept.txt
emp.txt
HDFSシステムへのデータのアップロード
インポート
テーブル構造の表示
テーブル全体を問い合わせる
照会前N条
クエリー・テーブルの一部の列
列に別名を付ける
ツールバーの
条件クエリー
内部接続Inner Join
左接続Left Join
右接続Right Join
フルコネクションFull Join
同時に複数のテーブルをクロスクエリする
グループGROUP BY
グループ化と統計
脱重DISTINCT
pig判空
deptとempテーブルはoracleデータベースから持参したテーブルに由来します
dept.txt
10 ACCOUNTING NEW YORK
20 RESEARCH DALLAS
30 SALES CHICAGO
40 OPERATIONS BOSTON
emp.txt
7369 SMITH CLERK 7902 1980-12-17 800.00 20
7499 ALLEN SALESMAN 7698 1981-2-20 1600.00 300.00 30
7521 WARD SALESMAN 7698 1981-2-22 1250.00 500.00 30
7566 JONES MANAGER 7839 1981-4-2 2975.00 20
7654 MARTIN SALESMAN 7698 1981-9-28 1250.00 1400.00 30
7698 BLAKE MANAGER 7839 1981-5-1 2850.00 30
7782 CLARK MANAGER 7839 1981-6-9 2450.00 10
7788 SCOTT ANALYST 7566 1987-4-19 3000.00 20
7839 KING PRESIDENT 1981-11-17 5000.00 10
7844 TURNER SALESMAN 7698 1981-9-8 1500.00 0.00 30
7876 ADAMS CLERK 7788 1987-5-23 1100.00 20
7900 JAMES CLERK 7698 1981-12-3 950.00 30
7902 FORD ANALYST 7566 1981-12-3 3000.00 20
7934 MILLER CLERK 7782 1982-1-23 1300.00 10
HDFSシステムへのデータのアップロード
cd /home/hadoop/luogankun/workspace/sync_data/pig
hadoop fs -put dept.txt input/pig/dept.txt
hadoop fs -put emp.txt input/pig/emp.txt
インポート
CREATE TABLE TMP_TABLE(USER VARCHAR(32),AGE INT,IS_MALE BOOLEAN);
dept= LOAD 'input/pig/dept.txt' USING PigStorage('\t') AS (deptno:int,dname:chararray,loc:chararray);
emp = LOAD 'input/pig/emp.txt' USING PigStorage('\t') AS (empno:int,ename:chararray, job:chararray, mgr:int, hiredate:chararray,sal:double,comm:double,dept:int);
テーブル構造の表示
desc TMP_TABLE;
describe dept
dept: {deptno: int,dname: chararray,loc: chararray}
describe emp
emp: {empno: int,ename: chararray,job: chararray,mgr: int,hiredate: chararray,sal: double,comm: double,dept: int}
テーブル全体を問い合わせる
SELECT * FROM TMP_TABLE;
DUMP dept
(10,ACCOUNTING,NEW YORK)
(20,RESEARCH,DALLAS)
(30,SALES,CHICAGO)
(40,OPERATIONS,BOSTON)
DUMP emp
(7369,SMITH,CLERK,7902,1980-12-17,800.0,,20)
(7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30)
(7521,WARD,SALESMAN,7698,1981-2-22,1250.0,500.0,30)
(7566,JONES,MANAGER,7839,1981-4-2,2975.0,,20)
(7654,MARTIN,SALESMAN,7698,1981-9-28,1250.0,1400.0,30)
(7698,BLAKE,MANAGER,7839,1981-5-1,2850.0,,30)
(7782,CLARK,MANAGER,7839,1981-6-9,2450.0,,10)
(7788,SCOTT,ANALYST,7566,1987-4-19,3000.0,,20)
(7839,KING,PRESIDENT,,1981-11-17,5000.0,,10)
(7844,TURNER,SALESMAN,7698,1981-9-8,1500.0,0.0,30)
(7876,ADAMS,CLERK,7788,1987-5-23,1100.0,,20)
(7900,JAMES,CLERK,7698,1981-12-3,950.0,,30)
(7902,FORD,ANALYST,7566,1981-12-3,3000.0,,20)
(7934,MILLER,CLERK,7782,1982-1-23,1300.0,,10)
照会前N条
SELECT * FROM TMP_TABLE LIMIT 10;
emp_table_limit = LIMIT emp 10;
DUMP emp_table_limit;
(7369,SMITH,CLERK,7902,1980-12-17,800.0,,20)
(7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30)
(7521,WARD,SALESMAN,7698,1981-2-22,1250.0,500.0,30)
(7566,JONES,MANAGER,7839,1981-4-2,2975.0,,20)
(7654,MARTIN,SALESMAN,7698,1981-9-28,1250.0,1400.0,30)
(7698,BLAKE,MANAGER,7839,1981-5-1,2850.0,,30)
(7782,CLARK,MANAGER,7839,1981-6-9,2450.0,,10)
(7788,SCOTT,ANALYST,7566,1987-4-19,3000.0,,20)
(7839,KING,PRESIDENT,,1981-11-17,5000.0,,10)
(7844,TURNER,SALESMAN,7698,1981-9-8,1500.0,0.0,30)
クエリー・テーブルの一部の列
SELECT USER FROM TMP_TABLE;
emp_table_empno_ename = FOREACH emp GENERATE empno, ename;
describe emp_table_empno_ename
emp_table_empno_ename: {empno: int,ename: chararray}
DUMP emp_table_empno_ename;
(7369,SMITH)
(7499,ALLEN)
(7521,WARD)
(7566,JONES)
(7654,MARTIN)
(7698,BLAKE)
(7782,CLARK)
(7788,SCOTT)
(7839,KING)
(7844,TURNER)
(7876,ADAMS)
(7900,JAMES)
(7902,FORD)
(7934,MILLER)
列に別名を付ける
SELECT USER AS USER_NAME,AGE AS USER_AGE FROM TMP_TABLE;
emp_table_column_alias = FOREACH emp GENERATE empno AS id,ename AS name;
describe emp_table_column_alias
emp_table_column_alias: {id: int,name: chararray}
DUMP emp_table_column_alias
(7369,SMITH)
(7499,ALLEN)
(7521,WARD)
(7566,JONES)
(7654,MARTIN)
(7698,BLAKE)
(7782,CLARK)
(7788,SCOTT)
(7839,KING)
(7844,TURNER)
(7876,ADAMS)
(7900,JAMES)
(7902,FORD)
(7934,MILLER)
ツールバーの
SELECT * FROM TMP_TABLE ORDER BY AGE;
emp_table_order = ORDER emp BY empno ASC;
DUMP emp_table_order;
(7369,SMITH,CLERK,7902,1980-12-17,800.0,,20)
(7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30)
(7521,WARD,SALESMAN,7698,1981-2-22,1250.0,500.0,30)
(7566,JONES,MANAGER,7839,1981-4-2,2975.0,,20)
(7654,MARTIN,SALESMAN,7698,1981-9-28,1250.0,1400.0,30)
(7698,BLAKE,MANAGER,7839,1981-5-1,2850.0,,30)
(7782,CLARK,MANAGER,7839,1981-6-9,2450.0,,10)
(7788,SCOTT,ANALYST,7566,1987-4-19,3000.0,,20)
(7839,KING,PRESIDENT,,1981-11-17,5000.0,,10)
(7844,TURNER,SALESMAN,7698,1981-9-8,1500.0,0.0,30)
(7876,ADAMS,CLERK,7788,1987-5-23,1100.0,,20)
(7900,JAMES,CLERK,7698,1981-12-3,950.0,,30)
(7902,FORD,ANALYST,7566,1981-12-3,3000.0,,20)
(7934,MILLER,CLERK,7782,1982-1-23,1300.0,,10)
条件クエリー
SELECT * FROM TMP_TABLE WHERE AGE>20;
emp_table_where = FILTER emp by sal > 1500;
DUMP emp_table_where;
(7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30)
(7566,JONES,MANAGER,7839,1981-4-2,2975.0,,20)
(7698,BLAKE,MANAGER,7839,1981-5-1,2850.0,,30)
(7782,CLARK,MANAGER,7839,1981-6-9,2450.0,,10)
(7788,SCOTT,ANALYST,7566,1987-4-19,3000.0,,20)
(7839,KING,PRESIDENT,,1981-11-17,5000.0,,10)
(7902,FORD,ANALYST,7566,1981-12-3,3000.0,,20)
内部接続Inner Join
SELECT * FROM TMP_TABLE A JOIN TMP_TABLE_2 B ON A.AGE=B.AGE;
emp_table_inner_join = JOIN emp BY dept,dept BY deptno;
describe emp_table_inner_join
emp_table_inner_join:
{emp::empno: int,emp::ename: chararray,emp::job: chararray,emp::mgr: int,
emp::hiredate: chararray,emp::sal: double,emp::comm: double,emp::dept: int,
dept::deptno: int,dept::dname: chararray,dept::loc: chararray}
DUMP emp_table_inner_join;
(7839,KING,PRESIDENT,,1981-11-17,5000.0,,10,10,ACCOUNTING,NEW YORK)
(7782,CLARK,MANAGER,7839,1981-6-9,2450.0,,10,10,ACCOUNTING,NEW YORK)
(7934,MILLER,CLERK,7782,1982-1-23,1300.0,,10,10,ACCOUNTING,NEW YORK)
(7369,SMITH,CLERK,7902,1980-12-17,800.0,,20,20,RESEARCH,DALLAS)
(7566,JONES,MANAGER,7839,1981-4-2,2975.0,,20,20,RESEARCH,DALLAS)
(7876,ADAMS,CLERK,7788,1987-5-23,1100.0,,20,20,RESEARCH,DALLAS)
(7902,FORD,ANALYST,7566,1981-12-3,3000.0,,20,20,RESEARCH,DALLAS)
(7788,SCOTT,ANALYST,7566,1987-4-19,3000.0,,20,20,RESEARCH,DALLAS)
(7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30,30,SALES,CHICAGO)
(7844,TURNER,SALESMAN,7698,1981-9-8,1500.0,0.0,30,30,SALES,CHICAGO)
(7900,JAMES,CLERK,7698,1981-12-3,950.0,,30,30,SALES,CHICAGO)
(7698,BLAKE,MANAGER,7839,1981-5-1,2850.0,,30,30,SALES,CHICAGO)
(7654,MARTIN,SALESMAN,7698,1981-9-28,1250.0,1400.0,30,30,SALES,CHICAGO)
(7521,WARD,SALESMAN,7698,1981-2-22,1250.0,500.0,30,30,SALES,CHICAGO)
左接続Left Join
SELECT * FROM TMP_TABLE A LEFT JOIN TMP_TABLE_2 B ON A.AGE=B.AGE;
emp_table_left_join = JOIN emp BY dept LEFT OUTER,dept BY deptno;
describe emp_table_left_join
emp_table_left_join: {emp::empno: int,emp::ename: chararray,emp::job: chararray,emp::mgr: int,
emp::hiredate: chararray,emp::sal: double,emp::comm: double,emp::dept: int,
dept::deptno: int,dept::dname: chararray,dept::loc: chararray}
DUMP emp_table_left_join;
(7839,KING,PRESIDENT,,1981-11-17,5000.0,,10,10,ACCOUNTING,NEW YORK)
(7782,CLARK,MANAGER,7839,1981-6-9,2450.0,,10,10,ACCOUNTING,NEW YORK)
(7934,MILLER,CLERK,7782,1982-1-23,1300.0,,10,10,ACCOUNTING,NEW YORK)
(7369,SMITH,CLERK,7902,1980-12-17,800.0,,20,20,RESEARCH,DALLAS)
(7566,JONES,MANAGER,7839,1981-4-2,2975.0,,20,20,RESEARCH,DALLAS)
(7876,ADAMS,CLERK,7788,1987-5-23,1100.0,,20,20,RESEARCH,DALLAS)
(7902,FORD,ANALYST,7566,1981-12-3,3000.0,,20,20,RESEARCH,DALLAS)
(7788,SCOTT,ANALYST,7566,1987-4-19,3000.0,,20,20,RESEARCH,DALLAS)
(7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30,30,SALES,CHICAGO)
(7844,TURNER,SALESMAN,7698,1981-9-8,1500.0,0.0,30,30,SALES,CHICAGO)
(7900,JAMES,CLERK,7698,1981-12-3,950.0,,30,30,SALES,CHICAGO)
(7698,BLAKE,MANAGER,7839,1981-5-1,2850.0,,30,30,SALES,CHICAGO)
(7654,MARTIN,SALESMAN,7698,1981-9-28,1250.0,1400.0,30,30,SALES,CHICAGO)
(7521,WARD,SALESMAN,7698,1981-2-22,1250.0,500.0,30,30,SALES,CHICAGO)
右接続Right Join
SELECT * FROM TMP_TABLE A RIGHT JOIN TMP_TABLE_2 B ON A.AGE=B.AGE;
emp_table_right_join = JOIN emp BY dept RIGHT OUTER,dept BY deptno;
describe emp_table_right_join
emp_table_right_join: {emp::empno: int,emp::ename: chararray,emp::job: chararray,emp::mgr: int,
emp::hiredate: chararray,emp::sal: double,emp::comm: double,emp::dept: int,
dept::deptno: int,dept::dname: chararray,dept::loc: chararray}
DUMP emp_table_right_join;
(7839,KING,PRESIDENT,,1981-11-17,5000.0,,10,10,ACCOUNTING,NEW YORK)
(7782,CLARK,MANAGER,7839,1981-6-9,2450.0,,10,10,ACCOUNTING,NEW YORK)
(7934,MILLER,CLERK,7782,1982-1-23,1300.0,,10,10,ACCOUNTING,NEW YORK)
(7369,SMITH,CLERK,7902,1980-12-17,800.0,,20,20,RESEARCH,DALLAS)
(7566,JONES,MANAGER,7839,1981-4-2,2975.0,,20,20,RESEARCH,DALLAS)
(7876,ADAMS,CLERK,7788,1987-5-23,1100.0,,20,20,RESEARCH,DALLAS)
(7902,FORD,ANALYST,7566,1981-12-3,3000.0,,20,20,RESEARCH,DALLAS)
(7788,SCOTT,ANALYST,7566,1987-4-19,3000.0,,20,20,RESEARCH,DALLAS)
(7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30,30,SALES,CHICAGO)
(7844,TURNER,SALESMAN,7698,1981-9-8,1500.0,0.0,30,30,SALES,CHICAGO)
(7900,JAMES,CLERK,7698,1981-12-3,950.0,,30,30,SALES,CHICAGO)
(7698,BLAKE,MANAGER,7839,1981-5-1,2850.0,,30,30,SALES,CHICAGO)
(7654,MARTIN,SALESMAN,7698,1981-9-28,1250.0,1400.0,30,30,SALES,CHICAGO)
(7521,WARD,SALESMAN,7698,1981-2-22,1250.0,500.0,30,30,SALES,CHICAGO)
(,,,,,,,,40,OPERATIONS,BOSTON)
フルコネクションFull Join
SELECT * FROM TMP_TABLE A JOIN TMP_TABLE_2 B ON A.AGE=B.AGE
emp_table_full_join = JOIN emp BY dept FULL OUTER,dept BY deptno;
describe emp_table_full_join
emp_table_full_join: {emp::empno: int,emp::ename: chararray,emp::job: chararray,emp::mgr: int,
emp::hiredate: chararray,emp::sal: double,emp::comm: double,emp::dept: int,
dept::deptno: int,dept::dname: chararray,dept::loc: chararray}
DUMP emp_table_full_join;
(7839,KING,PRESIDENT,,1981-11-17,5000.0,,10,10,ACCOUNTING,NEW YORK)
(7782,CLARK,MANAGER,7839,1981-6-9,2450.0,,10,10,ACCOUNTING,NEW YORK)
(7934,MILLER,CLERK,7782,1982-1-23,1300.0,,10,10,ACCOUNTING,NEW YORK)
(7369,SMITH,CLERK,7902,1980-12-17,800.0,,20,20,RESEARCH,DALLAS)
(7566,JONES,MANAGER,7839,1981-4-2,2975.0,,20,20,RESEARCH,DALLAS)
(7876,ADAMS,CLERK,7788,1987-5-23,1100.0,,20,20,RESEARCH,DALLAS)
(7902,FORD,ANALYST,7566,1981-12-3,3000.0,,20,20,RESEARCH,DALLAS)
(7788,SCOTT,ANALYST,7566,1987-4-19,3000.0,,20,20,RESEARCH,DALLAS)
(7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30,30,SALES,CHICAGO)
(7844,TURNER,SALESMAN,7698,1981-9-8,1500.0,0.0,30,30,SALES,CHICAGO)
(7900,JAMES,CLERK,7698,1981-12-3,950.0,,30,30,SALES,CHICAGO)
(7698,BLAKE,MANAGER,7839,1981-5-1,2850.0,,30,30,SALES,CHICAGO)
(7654,MARTIN,SALESMAN,7698,1981-9-28,1250.0,1400.0,30,30,SALES,CHICAGO)
(7521,WARD,SALESMAN,7698,1981-2-22,1250.0,500.0,30,30,SALES,CHICAGO)
(,,,,,,,,40,OPERATIONS,BOSTON)
同時に複数のテーブルをクロスクエリする
SELECT * FROM TMP_TABLE,TMP_TABLE_2;
emp_table_cross = CROSS emp,dept;
describe emp_table_cross
emp_table_cross: {emp::empno: int,emp::ename: chararray,emp::job: chararray,emp::mgr: int,
emp::hiredate: chararray,emp::sal: double,emp::comm: double,emp::dept: int,
dept::deptno: int,dept::dname: chararray,dept::loc: chararray}
DUMP emp_table_cross;
(7369,SMITH,CLERK,7902,1980-12-17,800.0,,20,10,ACCOUNTING,NEW YORK)
(7369,SMITH,CLERK,7902,1980-12-17,800.0,,20,20,RESEARCH,DALLAS)
(7369,SMITH,CLERK,7902,1980-12-17,800.0,,20,30,SALES,CHICAGO)
(7369,SMITH,CLERK,7902,1980-12-17,800.0,,20,40,OPERATIONS,BOSTON)
(7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30,10,ACCOUNTING,NEW YORK)
(7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30,20,RESEARCH,DALLAS)
(7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30,30,SALES,CHICAGO)
(7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30,40,OPERATIONS,BOSTON)
(7521,WARD,SALESMAN,7698,1981-2-22,1250.0,500.0,30,10,ACCOUNTING,NEW YORK)
(7521,WARD,SALESMAN,7698,1981-2-22,1250.0,500.0,30,20,RESEARCH,DALLAS)
(7521,WARD,SALESMAN,7698,1981-2-22,1250.0,500.0,30,30,SALES,CHICAGO)
(7521,WARD,SALESMAN,7698,1981-2-22,1250.0,500.0,30,40,OPERATIONS,BOSTON)
(7566,JONES,MANAGER,7839,1981-4-2,2975.0,,20,10,ACCOUNTING,NEW YORK)
(7566,JONES,MANAGER,7839,1981-4-2,2975.0,,20,20,RESEARCH,DALLAS)
(7566,JONES,MANAGER,7839,1981-4-2,2975.0,,20,30,SALES,CHICAGO)
(7566,JONES,MANAGER,7839,1981-4-2,2975.0,,20,40,OPERATIONS,BOSTON)
(7654,MARTIN,SALESMAN,7698,1981-9-28,1250.0,1400.0,30,10,ACCOUNTING,NEW YORK)
(7654,MARTIN,SALESMAN,7698,1981-9-28,1250.0,1400.0,30,20,RESEARCH,DALLAS)
(7654,MARTIN,SALESMAN,7698,1981-9-28,1250.0,1400.0,30,30,SALES,CHICAGO)
(7654,MARTIN,SALESMAN,7698,1981-9-28,1250.0,1400.0,30,40,OPERATIONS,BOSTON)
(7698,BLAKE,MANAGER,7839,1981-5-1,2850.0,,30,10,ACCOUNTING,NEW YORK)
(7698,BLAKE,MANAGER,7839,1981-5-1,2850.0,,30,20,RESEARCH,DALLAS)
(7698,BLAKE,MANAGER,7839,1981-5-1,2850.0,,30,30,SALES,CHICAGO)
(7698,BLAKE,MANAGER,7839,1981-5-1,2850.0,,30,40,OPERATIONS,BOSTON)
(7782,CLARK,MANAGER,7839,1981-6-9,2450.0,,10,10,ACCOUNTING,NEW YORK)
(7782,CLARK,MANAGER,7839,1981-6-9,2450.0,,10,20,RESEARCH,DALLAS)
(7782,CLARK,MANAGER,7839,1981-6-9,2450.0,,10,30,SALES,CHICAGO)
(7782,CLARK,MANAGER,7839,1981-6-9,2450.0,,10,40,OPERATIONS,BOSTON)
(7788,SCOTT,ANALYST,7566,1987-4-19,3000.0,,20,10,ACCOUNTING,NEW YORK)
(7788,SCOTT,ANALYST,7566,1987-4-19,3000.0,,20,20,RESEARCH,DALLAS)
(7788,SCOTT,ANALYST,7566,1987-4-19,3000.0,,20,30,SALES,CHICAGO)
(7788,SCOTT,ANALYST,7566,1987-4-19,3000.0,,20,40,OPERATIONS,BOSTON)
(7839,KING,PRESIDENT,,1981-11-17,5000.0,,10,10,ACCOUNTING,NEW YORK)
(7839,KING,PRESIDENT,,1981-11-17,5000.0,,10,20,RESEARCH,DALLAS)
(7839,KING,PRESIDENT,,1981-11-17,5000.0,,10,30,SALES,CHICAGO)
(7839,KING,PRESIDENT,,1981-11-17,5000.0,,10,40,OPERATIONS,BOSTON)
(7844,TURNER,SALESMAN,7698,1981-9-8,1500.0,0.0,30,10,ACCOUNTING,NEW YORK)
(7844,TURNER,SALESMAN,7698,1981-9-8,1500.0,0.0,30,20,RESEARCH,DALLAS)
(7844,TURNER,SALESMAN,7698,1981-9-8,1500.0,0.0,30,30,SALES,CHICAGO)
(7844,TURNER,SALESMAN,7698,1981-9-8,1500.0,0.0,30,40,OPERATIONS,BOSTON)
(7876,ADAMS,CLERK,7788,1987-5-23,1100.0,,20,10,ACCOUNTING,NEW YORK)
(7876,ADAMS,CLERK,7788,1987-5-23,1100.0,,20,20,RESEARCH,DALLAS)
(7876,ADAMS,CLERK,7788,1987-5-23,1100.0,,20,30,SALES,CHICAGO)
(7876,ADAMS,CLERK,7788,1987-5-23,1100.0,,20,40,OPERATIONS,BOSTON)
(7900,JAMES,CLERK,7698,1981-12-3,950.0,,30,10,ACCOUNTING,NEW YORK)
(7900,JAMES,CLERK,7698,1981-12-3,950.0,,30,20,RESEARCH,DALLAS)
(7900,JAMES,CLERK,7698,1981-12-3,950.0,,30,30,SALES,CHICAGO)
(7900,JAMES,CLERK,7698,1981-12-3,950.0,,30,40,OPERATIONS,BOSTON)
(7902,FORD,ANALYST,7566,1981-12-3,3000.0,,20,10,ACCOUNTING,NEW YORK)
(7902,FORD,ANALYST,7566,1981-12-3,3000.0,,20,20,RESEARCH,DALLAS)
(7902,FORD,ANALYST,7566,1981-12-3,3000.0,,20,30,SALES,CHICAGO)
(7902,FORD,ANALYST,7566,1981-12-3,3000.0,,20,40,OPERATIONS,BOSTON)
(7934,MILLER,CLERK,7782,1982-1-23,1300.0,,10,10,ACCOUNTING,NEW YORK)
(7934,MILLER,CLERK,7782,1982-1-23,1300.0,,10,20,RESEARCH,DALLAS)
(7934,MILLER,CLERK,7782,1982-1-23,1300.0,,10,30,SALES,CHICAGO)
(7934,MILLER,CLERK,7782,1982-1-23,1300.0,,10,40,OPERATIONS,BOSTON)
グループGROUP BY
SELECT * FROM TMP_TABLE GROUP BY IS_MALE;
emp_table_group = GROUP emp BY dept;
describe emp_table_group
emp_table_group: {
group: int,
emp: {
(empno: int,ename: chararray,job: chararray,mgr: int,hiredate: chararray,sal: double, comm: double,dept: int)
}
}
DUMP emp_table_group;
(10,{
(7839,KING,PRESIDENT,,1981-11-17,5000.0,,10),
(7782,CLARK,MANAGER,7839,1981-6-9,2450.0,,10),
(7934,MILLER,CLERK,7782,1982-1-23,1300.0,,10)})
(20,{
(7369,SMITH,CLERK,7902,1980-12-17,800.0,,20),
(7566,JONES,MANAGER,7839,1981-4-2,2975.0,,20),
(7876,ADAMS,CLERK,7788,1987-5-23,1100.0,,20),
(7902,FORD,ANALYST,7566,1981-12-3,3000.0,,20),
(7788,SCOTT,ANALYST,7566,1987-4-19,3000.0,,20)})
(30,{
(7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30),
(7844,TURNER,SALESMAN,7698,1981-9-8,1500.0,0.0,30),
(7900,JAMES,CLERK,7698,1981-12-3,950.0,,30),
(7698,BLAKE,MANAGER,7839,1981-5-1,2850.0,,30),
(7654,MARTIN,SALESMAN,7698,1981-9-28,1250.0,1400.0,30),
(7521,WARD,SALESMAN,7698,1981-2-22,1250.0,500.0,30)})
グループ化と統計
SELECT IS_MALE,COUNT(*) FROM TMP_TABLE GROUP BY IS_MALE;
emp_table_group_count = GROUP emp BY dept;
describe emp_table_group_count
emp_table_group_count: {
group: int,
emp: {
(empno: int,ename: chararray,job: chararray,mgr: int,hiredate: chararray,sal: double,comm: double,dept: int
)
}
}
emp_table_group_count = FOREACH emp_table_group_count GENERATE group,COUNT($1);
describe emp_table_group_count
DUMP emp_table_group_count;
(10,3)
(20,5)
(30,6)
emp_table_group_count = FOREACH emp_table_group_count GENERATE emp.dept,COUNT($1);
describe emp_table_group_count
emp_table_group_count: {{(dept: int)},long}
DUMP emp_table_group_count;
({(10),(10),(10)},3)
({(20),(20),(20),(20),(20)},5)
({(30),(30),(30),(30),(30),(30)},6)
脱重DISTINCT
SELECT DISTINCT IS_MALE FROM TMP_TABLE;
emp_table_distinct = FOREACH emp GENERATE dept;
describe emp_table_distinct
emp_table_distinct: {dept: int}
emp_table_distinct = DISTINCT emp_table_distinct;
describe emp_table_distinct
emp_table_distinct: {dept: int}
DUMP emp_table_distinct;
(10)
(20)
(30)
pig判空
select * from emp where comm is not null;
emp_table_where_null = FILTER emp by comm is not null;
DUMP emp_table_where_null;
(7499,ALLEN,SALESMAN,7698,1981-2-20,1600.0,300.0,30)
(7521,WARD,SALESMAN,7698,1981-2-22,1250.0,500.0,30)
(7654,MARTIN,SALESMAN,7698,1981-9-28,1250.0,1400.0,30)
(7844,TURNER,SALESMAN,7698,1981-9-8,1500.0,0.0,30)