tonglin0325的个人主页

Hive学习笔记——在Hive中使用AvroSerde

Hive支持使用avro serde作为序列化的方式,参考:

1
2
3
4
https://cwiki.apache.org/confluence/display/hive/avroserde
https://www.docs4dev.com/docs/zh/apache-hive/3.1.1/reference/AvroSerDe.html
https://github.com/jghoman/haivvreo

以及CDH官方的文档

1
2
https://docs.cloudera.com/documentation/enterprise/6/6.3/topics/cdh_ig_avro_usage.html

  

1.定义avro schema,kst.avsc

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
{
"namespace": "com.linkedin.haivvreo",
"name": "test_serializer",
"type": "record",
"fields": [
{ "name":"string1", "type":"string" },
{ "name":"int1", "type":"int" },
{ "name":"tinyint1", "type":"int" },
{ "name":"smallint1", "type":"int" },
{ "name":"bigint1", "type":"long" },
{ "name":"boolean1", "type":"boolean" },
{ "name":"float1", "type":"float" },
{ "name":"double1", "type":"double" },
{ "name":"list1", "type":{"type":"array", "items":"string"} },
{ "name":"map1", "type":{"type":"map", "values":"int"} },
{ "name":"struct1", "type":{"type":"record", "name":"struct1_name", "fields": [
{ "name":"sInt", "type":"int" }, { "name":"sBoolean", "type":"boolean" }, { "name":"sString", "type":"string" } ] } },
{ "name":"union1", "type":["float", "boolean", "string"] },
{ "name":"enum1", "type":{"type":"enum", "name":"enum1_values", "symbols":["BLUE","RED", "GREEN"]} },
{ "name":"nullableint", "type":["int", "null"] },
{ "name":"bytes1", "type":"bytes" },
{ "name":"fixed1", "type":{"type":"fixed", "name":"threebytes", "size":3} }
] }

将schema文件其放到HDFS上

1
2
3
4
hadoop fs -ls /user/hive/schema
Found 1 items
-rw-r--r-- 3 hive hive 1131 2021-12-04 13:53 /user/hive/schema/kst.avsc

2.建Hive表

1
2
3
4
5
6
7
8
9
10
11
12
CREATE TABLE default.kst
PARTITIONED BY (ds string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
TBLPROPERTIES (
'avro.schema.url'='hdfs:///user/hive/schema/kst.avsc');


或者直接在TBLPROPERTIES中指定schema

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
CREATE TABLE default.kst
PARTITIONED BY (ds string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
TBLPROPERTIES (
'avro.schema.literal'='{
"namespace": "com.linkedin.haivvreo",
"name": "test_serializer",
"type": "record",
"fields": [
{ "name":"string1", "type":"string" },
{ "name":"int1", "type":"int" },
{ "name":"tinyint1", "type":"int" },
{ "name":"smallint1", "type":"int" },
{ "name":"bigint1", "type":"long" },
{ "name":"boolean1", "type":"boolean" },
{ "name":"float1", "type":"float" },
{ "name":"double1", "type":"double" },
{ "name":"list1", "type":{"type":"array", "items":"string"} },
{ "name":"map1", "type":{"type":"map", "values":"int"} },
{ "name":"struct1", "type":{"type":"record", "name":"struct1_name", "fields": [
{ "name":"sInt", "type":"int" }, { "name":"sBoolean", "type":"boolean" }, { "name":"sString", "type":"string" } ] } },
{ "name":"union1", "type":["float", "boolean", "string"] },
{ "name":"enum1", "type":{"type":"enum", "name":"enum1_values", "symbols":["BLUE","RED", "GREEN"]} },
{ "name":"nullableint", "type":["int", "null"] },
{ "name":"bytes1", "type":"bytes" },
{ "name":"fixed1", "type":{"type":"fixed", "name":"threebytes", "size":3} }
] }');

3.查看Hive表的schema

1
2
describe kst;

 

使用show create table查看表结构

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
CREATE TABLE `default.kst`(
`string1` string COMMENT '',
`int1` int COMMENT '',
`tinyint1` int COMMENT '',
`smallint1` int COMMENT '',
`bigint1` bigint COMMENT '',
`boolean1` boolean COMMENT '',
`float1` float COMMENT '',
`double1` double COMMENT '',
`list1` array<string> COMMENT '',
`map1` map<string,int> COMMENT '',
`struct1` struct<sint:int,sboolean:boolean,sstring:string> COMMENT '',
`union1` uniontype<float,boolean,string> COMMENT '',
`enum1` string COMMENT '',
`nullableint` int COMMENT '',
`bytes1` binary COMMENT '',
`fixed1` binary COMMENT '')
PARTITIONED BY (
`ds` string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
LOCATION
'hdfs://master:8020/user/hive/warehouse/kst'
TBLPROPERTIES (
'avro.schema.url'='hdfs:///user/hive/schema/kst.avsc',

4.插入数据

1
2
3
insert into table default.kst partition(ds="2019-08-20") select "test",2,1Y,2S,1000L,true,1.111,2.22222,array("test1","test2","test3"),map("test123",2222,"test321",4444),
named_struct('sInt',123,'sBoolean',true, 'sString','London') as struct1,create_union(0,cast(0.2 as float),false,"test3"),"BLUE",12345,"00008DAC",'111';

5.查看刚刚insert的数据