HiveTip&Tech

Contents

1 Monitoring
2 ssh commands
3 Uploading files via webhdfs
4 How to fetch strings longer than 8000 characters over ODBC on Windows OS?
5 debug
6 alter column
7 Dates
8 Dropping a partitioned table
9 Compression
10 string to array
11 The set command
12 Out Of Memory
13 beeline: a prettier Hive CLI
14 Showing column names in select output
15 hive server (hive thrift)
16 hive web interface
17 .hiverc
18 Parameters
19 Emulating Oracle's dual table
20 repair
21 Inserting into partitions
22 Caveat when adding columns to a partitioned table
23 Error: org.apache.hadoop.mapreduce.task.reduce.Shuffle$ShuffleError: error in shuffle in fetcher#15, Caused by: java.lang.OutOfMemoryError: Java heap space


1 Monitoring #

List running YARN applications, refreshed every 5 seconds:
watch -n5 "yarn application -appStates RUNNING -list|sed '1d'|cut -f1,2,4,8|column -ts$'\t'"


Fetch the logs of an application by its ID:
yarn logs -applicationId application_1552564637687_16592

2 ssh commands #

Run a remote command from Windows with PuTTY's plink:
plink.exe -ssh -batch -l <login-id> -pw <password> <host-or-IP> <command>

3 Uploading files via webhdfs #

c:\curl\bin\curl.exe -i -X PUT -L -T "d:\log\test.log" "http://1.1.1.10:50070/webhdfs/v1/tmp/100000.log?op=CREATE&user.name=hdfs&overwrite=true"

(-L is needed because the namenode answers CREATE with a redirect to a datanode.)

To run it asynchronously on Windows, use start:
start /d "c:\curl\bin\" curl.exe -i -X PUT -L -T "d:\log\test.log" "http://1.1.1.10:50070/webhdfs/v1/tmp/100000.log?op=CREATE&user.name=hdfs&overwrite=true"

4 How to fetch strings longer than 8000 characters over ODBC on Windows OS? #

--split the string into chunks on the Hive side, then concatenate on the SQL Server side
select concat(convert(nvarchar(max), s1), s2, s3) s
from openquery(hive_odbc, 'select substring(s, 1, 4000) s1, substring(s, 4001, 4000) s2, substring(s, 8001, 4000) s3 from tbl')

5 debug #

Start the CLI with debug logging sent to the console:
hive -hiveconf hive.root.logger=DEBUG,console

6 alter column #

ALTER TABLE table_name CHANGE old_col_name new_col_name new_data_type;

To also move the column so it comes right after id:
ALTER TABLE table_name CHANGE old_col_name new_col_name new_data_type AFTER id;
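
For instance, with a hypothetical table emp that has columns nm and id, renaming nm to name and repositioning it would look like:

ALTER TABLE emp CHANGE nm name STRING AFTER id;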

7 Dates #

select
    from_unixtime(unix_timestamp(), 'yyyy-MM-dd') today
,   from_unixtime(unix_timestamp()-1*60*60, 'yyyy-MM-dd hh:mm:ss.SSS') as `1시간전`  -- one hour ago; hh = 12-hour clock
,   from_unixtime(unix_timestamp()-1*60*60, 'yyyy-MM-dd HH:mm:ss.SSS') as `1시간전`  -- one hour ago; HH = 24-hour clock
,   from_unixtime(unix_timestamp()-1*24*60*60, 'yyyy-MM-dd') yesterday
,   from_unixtime(unix_timestamp()+1*24*60*60, 'yyyy-MM-dd') tomorrow
,   cast(from_unixtime(unix_timestamp(), 'yyyyMMdd') as int) today_int
,   cast(cast(current_timestamp as decimal(38,6)) + 0.000001 as timestamp) `+1마이크로세컨드`  -- plus one microsecond
from dual
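
On Hive 1.2+ (a version assumption), the day arithmetic can also be written with the built-in date functions instead of epoch math:

select
    current_date               as today
,   date_sub(current_date, 1)  as yesterday
,   date_add(current_date, 1)  as tomorrow
from dual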

8 Dropping a partitioned table #

Dropping a partitioned Hive table that has many partitions is painfully slow.
It is slow because of the time spent cleaning up the partition metadata.
To drop it quickly, clean up the metadata yourself first. Assuming the metastore database is MySQL:

1. Clean up the metadata
--confirm the TBL_ID
select * from hive.TBLS where TBL_ID = 27992
 
 
--delete the partition metadata, child tables first.
delete from hive.PARTITION_KEY_VALS where PART_ID in (select part_id from hive.PARTITIONS where TBL_ID = 27992);
delete from hive.PARTITION_PARAMS   where PART_ID in (select part_id from hive.PARTITIONS where TBL_ID = 27992);
delete from hive.PART_COL_PRIVS     where PART_ID in (select part_id from hive.PARTITIONS where TBL_ID = 27992);
delete from hive.PART_COL_STATS     where PART_ID in (select part_id from hive.PARTITIONS where TBL_ID = 27992);
delete from hive.PART_PRIVS         where PART_ID in (select part_id from hive.PARTITIONS where TBL_ID = 27992);
delete from hive.PARTITIONS                                                               where TBL_ID = 27992 ;

--verify: should return no rows
select * from hive.PARTITIONS where TBL_ID = 27992

2. drop table xxxxx

9 Compression #

--ORC with LZ4, declared at create time (fragment of a CREATE TABLE statement)
STORED AS ORC
LOCATION 'maprfs:/mapr/demo.jbates.mapr/data/hive/orc2/trips'
TBLPROPERTIES ( "orc.compress"="LZ4" );



CREATE TABLE student_text_lz4 (id STRING, name STRING) 
ROW FORMAT DELIMITED 
    FIELDS TERMINATED BY ',' 
    LINES TERMINATED BY '\n' 
STORED AS TEXTFILE; 

SET hive.exec.compress.output=true; 
SET mapred.output.compress=true; 
SET mapred.output.compression.codec=org.apache.hadoop.io.compress.Lz4Codec; 

INSERT OVERWRITE TABLE student_text_lz4 SELECT * FROM student; 
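
On Hadoop 2+ (a version assumption), the mapred.* keys above are deprecated aliases; the same settings under the newer names are:

SET hive.exec.compress.output=true;
SET mapreduce.output.fileoutputformat.compress=true;
SET mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.Lz4Codec;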

10 string to array #

--https://stackoverflow.com/questions/45965889/how-do-we-convert-a-string-into-array-in-hive
hive> with releases as (select '["us","ca","fr"]' as country)
    > select  split(regexp_extract(country,'^\\["(.*)\\"]$',1),'","')
    > from    releases
    > ;
OK
_c0
["us","ca","fr"]

11 The set command #

set hive.execution.engine=mr;
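
A few related uses (standard set semantics in the Hive CLI):

set;                          -- print settings that have been overridden
set -v;                       -- print all settings, including Hadoop ones
set hive.execution.engine;    -- print the current value of a single key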

13 beeline: a prettier Hive CLI #

Example session:
[root@sandbox ~]# beeline
Beeline version 1.2.1000.2.6.1.0-129 by Apache Hive
beeline> !connect jdbc:hive2://localhost:10000 scott tiger
Connecting to jdbc:hive2://localhost:10000
Connected to: Apache Hive (version 1.2.1000.2.6.1.0-129)
Driver: Hive JDBC (version 1.2.1000.2.6.1.0-129)
Transaction isolation: TRANSACTION_REPEATABLE_READ
0: jdbc:hive2://localhost:10000> show tables;
+------------+--+
|  tab_name  |
+------------+--+
| sample_07  |
| sample_08  |
+------------+--+
2 rows selected (1.895 seconds)
0: jdbc:hive2://localhost:10000> select * from sample_07 limit 10;
+-----------------+--------------------------------------+----------------------+-------------------+--+
| sample_07.code  |        sample_07.description         | sample_07.total_emp  | sample_07.salary  |
+-----------------+--------------------------------------+----------------------+-------------------+--+
| 00-0000         | All Occupations                      | 134354250            | 40690             |
| 11-0000         | Management occupations               | 6003930              | 96150             |
| 11-1011         | Chief executives                     | 299160               | 151370            |
| 11-1021         | General and operations managers      | 1655410              | 103780            |
| 11-1031         | Legislators                          | 61110                | 33880             |
| 11-2011         | Advertising and promotions managers  | 36300                | 91100             |
| 11-2021         | Marketing managers                   | 165240               | 113400            |
| 11-2022         | Sales managers                       | 322170               | 106790            |
| 11-2031         | Public relations managers            | 47210                | 97170             |
| 11-3011         | Administrative services managers     | 239360               | 76370             |
+-----------------+--------------------------------------+----------------------+-------------------+--+
10 rows selected (0.607 seconds)
0: jdbc:hive2://localhost:10000>

14 Showing column names in select output #

set hive.cli.print.header=true;

15 hive server (hive thrift) #

Command:
hive --service hiveserver

It listens on port 10000 by default.
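
On newer releases (Hive 0.11+, a version assumption about your install), HiveServer2 replaces the original Thrift server:

hive --service hiveserver2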

16 hive web interface #

Command:
hive --service hwi

It uses port 9999 by default; otherwise, the port is whatever hive-site.xml specifies.

Connect with http://192.168.136.100:9999/hwi/ (I wasted time repeatedly trying http://192.168.136.100:9999 without the /hwi/ path).

17 .hiverc #

$HOME/.hiverc
set hive.cli.print.current.db=true;
set hive.exec.mode.local.auto=true;
set hive.cli.print.header=true;

The Hive CLI sources $HOME/.hiverc automatically at startup; an init file can also be passed explicitly:
hive -i $HOME/.hiverc

18 Parameters #

set hivevar:p1 = 'SCOTT';
select * from emp where ename = ${hivevar:p1};

Example output:
hive (default)> set hivevar:p1 = 'SCOTT';
hive (default)> select * from emp where ename = ${hivevar:p1};
Automatically selecting local only mode for query
Total MapReduce jobs = 1
Launching Job 1 out of 1
Number of reduce tasks is set to 0 since there's no reduce operator
Execution log at: /tmp/huser/.log
Job running in-process (local Hadoop)
Hadoop job information for null: number of mappers: 0; number of reducers: 0
2013-08-06 23:20:35,671 null map = 0%,  reduce = 0%
Ended Job = job_local419394988_0001
Execution completed successfully
Mapred Local Task Succeeded . Convert the Join into MapJoin
OK
empno   ename   job     mgr     hiredate        sal     comm    deptno
7788    SCOTT   ANALYST 7566    1982-12-09      3000    NULL    20
Time taken: 2.894 seconds, Fetched: 1 row(s)
hive (default)>
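
The same variable can also be supplied on the command line (a sketch; the extra quoting keeps the string literal intact through the shell):

hive --hivevar p1="'SCOTT'" -e 'select * from emp where ename = ${hivevar:p1}'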

19 Emulating Oracle's dual table #

echo 1 > dual.txt
create table dual (dummy STRING);
load data local inpath 'dual.txt' overwrite into table dual;
select date_add('2008-12-31', 1) from dual;
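
On Hive 0.13+ (a version assumption), the FROM clause is optional, so dual is often unnecessary:

select date_add('2008-12-31', 1);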


20 repair #

--https://stackoverflow.com/questions/44931305/partition-in-metastore-but-path-doesnt-exist-in-hdfs
hive> create table mytable (i int) partitioned by (p int);
OK
Time taken: 0.539 seconds

hive> !mkdir mytable/p=1;
hive> !mkdir mytable/p=2;
hive> !mkdir mytable/p=3;

hive> msck repair table mytable;
OK
Partitions not in metastore:    mytable:p=1 mytable:p=2 mytable:p=3
Repair: Added partition to metastore mytable:p=1
Repair: Added partition to metastore mytable:p=2
Repair: Added partition to metastore mytable:p=3
Time taken: 0.918 seconds, Fetched: 4 row(s)

hive> show partitions mytable;
OK
p=1
p=2
p=3
Time taken: 0.331 seconds, Fetched: 3 row(s)

hive> !rmdir mytable/p=1;
hive> !rmdir mytable/p=2;
hive> !rmdir mytable/p=3;

hive> msck repair table mytable;
OK
Partitions missing from filesystem: mytable:p=1 mytable:p=2 mytable:p=3
Time taken: 0.425 seconds, Fetched: 1 row(s)

hive> show partitions mytable;
OK
p=1
p=2
p=3
Time taken: 0.56 seconds, Fetched: 3 row(s)
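
On Hive 3.0+ (a version assumption), msck can also drop the partitions whose directories are gone instead of just reporting them:

msck repair table mytable drop partitions;
-- or add and drop in one pass:
msck repair table mytable sync partitions;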

21 Inserting into partitions #

set hive.exec.compress.output=true;
set mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;
set mapred.output.compression.type=BLOCK;
set io.seqfile.compression.type=BLOCK;
set hive.exec.parallel=true;
set hive.exec.max.dynamic.partitions.pernode=100000;
set hive.exec.max.dynamic.partitions=1000000;
set hive.exec.max.created.files=10000000;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.dynamic.partition=true;
set hive.execution.engine=spark;
  
insert overwrite table gamelog partition(date_key, channel_id)
select log, date_key, channel_id  --partition key columns go last, in the declared order
from gamelog
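
For reference, a sketch mixing a static and a dynamic partition key (with the leading key fixed, strict mode would also accept the insert):

insert overwrite table gamelog partition(date_key=20190101, channel_id)
select log, channel_id
from gamelog
where date_key = 20190101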

22 Caveat when adding columns to a partitioned table #

set hive.cli.print.header=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.dynamic.partition=true;

drop table test3;
create external table test3
(
    a string
)
partitioned by (date_key int, hh int)
stored as sequencefile
;

insert overwrite table test3 partition(date_key, hh)
select 'a', 20190101, 1
from (select 1) t;

select * from test3;

hive> 
    > select * from test3;
OK
test3.a test3.date_key  test3.hh
a       20190101        1
Time taken: 0.064 seconds, Fetched: 1 row(s)
hive> 

If you just run add columns...
alter table test3 add columns(b string, c string);

insert overwrite table test3 partition(date_key, hh)
select 'a', 'b', 'c', 20190101, 1
from (select 1) t;

insert overwrite table test3 partition(date_key, hh)
select 'a', 'b', 'c', 20190102, 1
from (select 1) t;

select * from test3;

As shown below, the new columns do not apply to partitions written before the alter.
hive> 
    > select * from test3;
OK
test3.a test3.b test3.c test3.date_key  test3.hh
a       NULL    NULL    20190101        1
a       b       c       20190102        1
Time taken: 0.082 seconds, Fetched: 2 row(s)
hive> 

In that case, the columns must be added with the cascade option (re-running this walkthrough from the create table, since b and c were already added above):
alter table test3 add columns(b string, c string) cascade;

insert overwrite table test3 partition(date_key, hh)
select 'a', 'b', 'c', 20190101, 1
from (select 1) t;

select * from test3;

Result:
hive> 
    > select * from test3;
OK
test3.a test3.b test3.c test3.date_key  test3.hh
a       b       c       20190101        1
a       b       c       20190102        1
Time taken: 0.083 seconds, Fetched: 2 row(s)

23 Error: org.apache.hadoop.mapreduce.task.reduce.Shuffle$ShuffleError: error in shuffle in fetcher#15, Caused by: java.lang.OutOfMemoryError: Java heap space #

The error looks roughly like this:
task_1616538534439_302311_r_000000      2-Sep-2021 04:25:08     2-Sep-2021 04:30:03 (4mins, 54sec)      , Error: org.apache.hadoop.mapreduce.task.reduce.Shuffle$ShuffleError: error in shuffle in fetcher#6
        at org.apache.hadoop.mapreduce.task.reduce.Shuffle.run(Shuffle.java:134)
        at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:376)
        at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:164)
        at java.security.AccessController.doPrivileged(Native Method)
        at javax.security.auth.Subject.doAs(Subject.java:415)
        at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1917)
        at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)
Caused by: java.lang.OutOfMemoryError: Java heap space
        at org.apache.hadoop.io.BoundedByteArrayOutputStream.<init>(BoundedByteArrayOutputStream.java:56)
        at org.apache.hadoop.io.BoundedByteArrayOutputStream.<init>(BoundedByteArrayOutputStream.java:46)
        at org.apache.hadoop.mapreduce.task.reduce.InMemoryMapOutput.<init>(InMemoryMapOutput.java:63)
        at org.apache.hadoop.mapreduce.task.reduce.MergeManagerImpl.unconditionalReserve(MergeManagerImpl.java:309)
        at org.apache.hadoop.mapreduce.task.reduce.MergeManagerImpl.reserve(MergeManagerImpl.java:299)
        at org.apache.hadoop.mapreduce.task.reduce.Fetcher.copyMapOutput(Fetcher.java:539)
        at org.apache.hadoop.mapreduce.task.reduce.Fetcher.copyFromHost(Fetcher.java:348)
        at org.apache.hadoop.mapreduce.task.reduce.Fetcher.run(Fetcher.java:198)

Resolved by adding the following. The shuffle fetcher buffers map output in the reducer heap, and lowering mapreduce.reduce.shuffle.input.buffer.percent caps that buffer at a smaller fraction of the heap:
set mapreduce.reduce.memory.total.bytes=2048MB;
set mapreduce.reduce.shuffle.input.buffer.percent=0.5;
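
An alternative (an assumption, not the fix used above) is to give the reducers a larger heap instead:

set mapreduce.reduce.memory.mb=4096;
set mapreduce.reduce.java.opts=-Xmx3276m;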
