1. 小文件合并位置 :
1. map输入端 对小文件合并
2. map输出端 对小文件合并
3. reduce输出端 对小文件合并
2. map输入时,对小文件合并
参数设置 :
-- CombineHiveInputFormat splits by split size, so several small files can be
-- packed into a single split (one split -> one map task).
-- It is the default implementation class for hive.input.format.
set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
说明 : CombineHiveInputFormat 是 HiveInputFormat 的子类,对切片规则进行了优化
当小文件不满足切片(filesize < splitsize)时,会根据切片大小,将多个小文件划分成一个切片
测试(map输入时,对小文件合并)
-- 测试(map输入时,对小文件合并)
-- 测试1 : 文件个数3、文件大小34.8、inputformat类HiveInputFormat、切片大小128M
-- 预期 : 切片个数=文件个数=mapTask=3个
34.8 M 104.4 M /user/hive/warehouse/home.db/gulivideo_user_ori/user.txt
34.8 M 104.4 M /user/hive/warehouse/home.db/gulivideo_user_ori/user1.txt
34.8 M 104.4 M /user/hive/warehouse/home.db/gulivideo_user_ori/user2.txt
-- Use the non-combining input format: one split per file,
-- so 3 small files -> 3 splits -> 3 map tasks.
SET hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;

SELECT
    substr(uploader, 0, 1),
    count(1)
FROM gulivideo_user_ori
GROUP BY substr(uploader, 0, 1);
Hadoop job information for Stage-1: number of mappers: 3; number of reducers: 1
Time taken: 14.557 seconds, Fetched: 62 row(s)
-- 测试2 : 文件个数3、文件大小34.8、inputformat类CombineHiveInputFormat、切片大小128M
-- 预期 : 3个小文件合并为1个切片, 切片个数=mapTask=1个
34.8 M 104.4 M /user/hive/warehouse/home.db/gulivideo_user_ori/user.txt
34.8 M 104.4 M /user/hive/warehouse/home.db/gulivideo_user_ori/user1.txt
34.8 M 104.4 M /user/hive/warehouse/home.db/gulivideo_user_ori/user2.txt
-- Use the combining input format: the 3 small files (34.8 M each) fit into
-- one 128 M split, so the job runs with a single map task.
SET hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;

SELECT
    substr(uploader, 0, 1),
    count(1)
FROM gulivideo_user_ori
GROUP BY substr(uploader, 0, 1);
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 1
Time taken: 12.913 seconds, Fetched: 62 row(s)
View Code
3. 在MapTask、ReduceTask任务结束时,对小文件合并
参数设置 :
-- 在 map-only 任务结束时合并小文件,默认 true
-- Merge small files at the end of a map-only job (default: true).
SET hive.merge.mapfiles=true;
-- Merge small files at the end of a map-reduce job (default: false).
SET hive.merge.mapredfiles=true;
-- Target size of the merged files, in bytes (default: 256000000, ~256 MB).
SET hive.merge.size.per.task=256000000;
-- When the average size of the job's output files is below this threshold,
-- launch an extra map-reduce job to merge them (default: 16000000, ~16 MB).
SET hive.merge.smallfiles.avgsize=16000000;
测试(在Mr任务结束时,对小文件合并)
-- 测试(在Mr任务结束时,对小文件合并)
-- 测试1 : 文件个数1、文件大小34.8、不开启任务结束后文件合并、reduce个数3
SET mapreduce.job.reduces=3;
-- Keep post-job merging off so each of the 3 reducers leaves its own output file.
SET hive.merge.mapredfiles=false;
SET yarn.scheduler.maximum-allocation-mb=118784;
SET mapreduce.map.memory.mb=4096;
SET mapreduce.reduce.memory.mb=4096;
SET yarn.nodemanager.vmem-pmem-ratio=4.2;

CREATE TABLE mergeTab AS
SELECT
    substr(uploader, 0, 1),
    count(1)
FROM gulivideo_user_ori
GROUP BY substr(uploader, 0, 1);
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 3
Time taken: 32.277 seconds
162 486 /user/hive/warehouse/home.db/mergetab/000000_0
157 471 /user/hive/warehouse/home.db/mergetab/000001_0
163 489 /user/hive/warehouse/home.db/mergetab/000002_0
-- 测试2 : 文件个数1、文件大小34.8、开启任务结束后文件合并、reduce个数3、合并文件大小256M
SET mapreduce.job.reduces=3;
-- Merge the small per-reducer files when the job finishes.
SET hive.merge.mapredfiles=true;
SET hive.merge.size.per.task=256000000;
SET yarn.scheduler.maximum-allocation-mb=118784;
SET mapreduce.map.memory.mb=4096;
SET mapreduce.reduce.memory.mb=4096;
SET yarn.nodemanager.vmem-pmem-ratio=4.2;

CREATE TABLE mergeTab2 AS
SELECT
    substr(uploader, 0, 1),
    count(1)
FROM gulivideo_user_ori
GROUP BY substr(uploader, 0, 1);
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 3
Time taken: 47.983 seconds
482 1.4 K /user/hive/warehouse/home.db/mergetab2/000000_0
-- 测试3 : 文件个数1、文件大小34.8、开启任务结束后文件合并、reduce个数3、合并文件大小100bytes
SET mapreduce.job.reduces=3;
SET hive.merge.mapfiles=true;
-- Merge small files when the job finishes.
SET hive.merge.mapredfiles=true;
-- 100 bytes: a deliberately tiny merge target, so each reducer output
-- (a few hundred bytes, see the listings above) already exceeds it and
-- the merge step keeps the files separate.
SET hive.merge.size.per.task=100;
SET yarn.scheduler.maximum-allocation-mb=118784;
SET mapreduce.map.memory.mb=4096;
SET mapreduce.reduce.memory.mb=4096;
SET yarn.nodemanager.vmem-pmem-ratio=4.2;

-- Fixed: test 2 already created mergeTab2, and CTAS fails when the target
-- table exists, so test 3 must write to a fresh table.
CREATE TABLE mergeTab3 AS
SELECT
    substr(uploader, 0, 1),
    count(1)
FROM gulivideo_user_ori
GROUP BY substr(uploader, 0, 1);
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 3
Time taken: 53.033 seconds
View Code