hive> CREATE TABLE psn4 (
  id int,
  name string,
  -- age int,
  likes array<string>,
  address map<string, string>
)
PARTITIONED BY (age int)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
COLLECTION ITEMS TERMINATED BY '-'
MAP KEYS TERMINATED BY ':'
LINES TERMINATED BY '\n';
Next, try creating a Hive table with two partition columns.
hive> CREATE TABLE psn5 (
  id int,
  name string,
  likes array<string>,
  address map<string, string>
)
PARTITIONED BY (age int, sex string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
COLLECTION ITEMS TERMINATED BY '-'
MAP KEYS TERMINATED BY ':'
LINES TERMINATED BY '\n';
Importing data
First, load the data into the table that has a single partition column.
hive> LOAD DATA LOCAL INPATH '/root/data' INTO TABLE psn4 PARTITION (age=10);
Load the data into the table with two partition columns; remember to specify all partition columns when loading.
hive> LOAD DATA LOCAL INPATH '/root/data' INTO TABLE psn5 PARTITION (age=10, sex='boy');
Adding partitions
The partition columns being added must have been defined when the table was created.
hive> ALTER TABLE psn4 ADD PARTITION (age=10);
hive> ALTER TABLE psn5 ADD PARTITION (age=20, sex='boy');
hive> ALTER TABLE psn5 ADD PARTITION (age=30, sex='girl');
The following statement will fail, because it adds an extra partition column that was never defined:
hive> ALTER TABLE psn4 ADD PARTITION (age=10, height=112);
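To verify which partitions were created (and to remove one if needed), SHOW PARTITIONS and ALTER TABLE ... DROP PARTITION can be used; for example:
hive> show partitions psn5;
hive> alter table psn5 drop partition (age=30, sex='girl');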
Step1. Create the source table
CREATE TABLE psn(
  id int,
  name string,
  age int,
  sex string,
  likes array<string>,
  address map<string, string>
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
COLLECTION ITEMS TERMINATED BY '-'
MAP KEYS TERMINATED BY ':'
LINES TERMINATED BY '\n';
Step2. Create the partitioned table
CREATE TABLE partipsn(
  id int,
  name string,
  likes array<string>,
  address map<string, string>
)
PARTITIONED BY (age int, sex string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
COLLECTION ITEMS TERMINATED BY '-'
MAP KEYS TERMINATED BY ':'
LINES TERMINATED BY '\n';
Step3. Enable dynamic partitioning
Set the mode to nonstrict; in strict mode, at least one partition column must be static.
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
Step4. Load data
# With static partitions, the partition values must be specified explicitly
insert into table partipsn partition (age=12, sex='man')
select id, name, likes, address from psn where age=12 and sex='man';
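With dynamic partitioning enabled in Step3, Hive can instead derive the partition values from the query itself. A minimal sketch, assuming the psn and partipsn schemas above (the dynamic partition columns must be the last columns in the select list):
# With dynamic partitions, no values are hard-coded; age and sex are taken from each row
insert into table partipsn partition (age, sex)
select id, name, likes, address, age, sex from psn;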
# Enforce bucketing when writing into bucketed tables
hive> set hive.enforce.bucketing=true;
# step5: sample from the bucketed table
hive> select * from psnbucket tablesample(bucket 1 out of 4 on age);
# Syntax of tablesample(bucket x out of y)
# Assume the table has 32 buckets in total.
# tablesample(bucket 2 out of 4)
#   x=2 means sampling starts from the 2nd bucket; y=4 must be a factor or multiple of the bucket count.
#   The number of buckets sampled is 32/y = 8, i.e. data is taken from buckets 2, 6, 10, 14, 18, 22, 26, 30.
# tablesample(bucket 3 out of 256)
#   Sampling starts from the 3rd bucket.
#   The sampled amount is 32/256 = 1/8 of a bucket, so 1/8 of the data in bucket 3 is taken (here, 1 row).
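Applied to the 4-bucket psnbucket table used in this section, the same rule gives, for example (a sketch; the actual rows depend on the data):
# 4 buckets, y=2: 4/2 = 2 buckets are sampled, starting at bucket 2, i.e. buckets 2 and 4
hive> select * from psnbucket tablesample(bucket 2 out of 2 on age);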
step2. Create the source table
hive> create table psn(id int, name string, age int) row format delimited fields terminated by ',';
# Load the data into the source table
hive> load data local inpath '/root/data' into table psn;
Step3. Create the bucketed table and specify the bucketing column
hive> create table psnbucket(id int, name string, age int) clustered by (age) into 4 buckets row format delimited fields terminated by ',';
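The step that actually fills the bucketed table is not shown here; a minimal sketch, assuming the psn source table from step2:
# step4: load the source table into the bucketed table
# (hive.enforce.bucketing=true makes Hive write one file per bucket)
hive> insert into table psnbucket select id, name, age from psn;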
step1. Write the UDF class (masking: keep the first 3 characters, mask the rest)

import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

public class TuoMin extends UDF {
    public Text evaluate(final Text s) {
        if (s == null) {
            return null;
        }
        // Keep the first 3 characters and append "***"
        String str = s.toString().substring(0, 3) + "***";
        return new Text(str);
    }
}
step2. Package the program into a jar and copy it to the server
step3. Add the jar
hive> add jar /root/tumin.jar;
step4. Create a temporary function
# The AS clause takes the fully qualified class name of the UDF (here assumed to be hive.udf.TuoMin)
hive> CREATE TEMPORARY FUNCTION tm AS 'hive.udf.TuoMin';
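Once registered, the function can be called like any built-in function (a quick check, assuming the psn table defined earlier):
hive> select tm(name) from psn;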
hive> select count(distinct(col1)), count(distinct(col2)) from psn
      lateral view explode(likes) t1 as col1
      lateral view explode(address) t2 as col2, col3;
where the person table psn contains:
psn.id  psn.name  psn.age  psn.sex  psn.likes                psn.address
1       小明1     10       man      ["lol","book","movie"]   {"beijing":"\"shangxuetang\"","shanghai":"pudong"}
2       小明2     20       boy      ["lol","book","movie"]   {"beijing":"\"shangxuetang\"","shanghai":"pudong"}
3       小明3     10       man      ["lol","book","movie"]   {"beijing":"\"shangxuetang\"","shanghai":"pudong"}
4       小明4     20       man      ["lol","book","movie"]   {"beijing":"\"shangxuetang\"","shanghai":"pudong"}
5       小明5     10       boy      ["lol","movie"]          {"beijing":"\"shangxuetang\"","shanghai":"pudong"}
6       小明6     20       boy      ["lol","book","movie"]   {"beijing":"\"shangxuetang\"","shanghai":"pudong"}
7       小明7     10       man      ["lol","book"]           {"beijing":"\"shangxuetang\"","shanghai":"pudong"}
8       小明8     10       man      ["lol","book"]           {"beijing":"\"shangxuetang\"","shanghai":"pudong"}
9       小明9     20       boy      ["lol","book","movie"]   {"beijing":"\"shangxuetang\"","shanghai":"pudong"}
Views
Like a relational database, Hive also supports views, with the following three characteristics:
Materialized views are not supported, i.e. a view cannot be persisted.
Only query operations are allowed; you cannot load data into a view.
A view is also a table, so its metadata is kept in MySQL (the metastore), but no data is stored in Hive.
# Create a view
# A view is also a table: it can be found in the MySQL metastore, but there is no data for it on HDFS
hive> create view v_psn as select id, name from psn;
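Querying and dropping the view then work the same way as for an ordinary table:
hive> select * from v_psn;
hive> drop view v_psn;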
Hive Exercise
Call drop rate statistics
Find the top 10 base stations with the highest call drop rate.
Step1. Create the source table
hive> create table call_monitor(
  imei string,      -- base station ID
  drop_num int,     -- number of dropped calls
  duration int      -- total call duration
)
row format delimited fields terminated by ',';
# Load the data
hive> load data local inpath '/root/calldata' into table call_monitor;
Step2. Create the result table
hive> create table call_result(
  imei string,
  drop_num int,
  duration int,
  drop_rate double
)
row format delimited fields terminated by ',';
Step3. Load the aggregated source data into the result table
hive> from call_monitor cm
      insert into table call_result
      select cm.imei, sum(cm.drop_num) sdrop, sum(cm.duration) sdura,
             sum(cm.drop_num)/sum(cm.duration) s_rate
      group by cm.imei
      order by s_rate desc
      limit 10;
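A quick check of the result written in Step3 (assuming the tables above):
hive> select * from call_result;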