/* ORCMapper.java */
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.hive.ql.io.orc.*;
import org.apache.hadoop.io.*;

/**
 * Mapper for a map-only text-to-ORC conversion job.
 *
 * <p>Reads each input line as {@link Text} and hands it to {@link OrcSerde}
 * so that {@code OrcOutputFormat} can write it as an ORC row. The output key
 * is always {@code NullWritable} — only the serialized row matters.
 */
public class ORCMapper extends MapReduceBase
        implements Mapper<LongWritable, Text, NullWritable, Writable> {

    private OrcSerde serde;

    @Override
    public void configure(JobConf job) {
        serde = new OrcSerde();
    }

    @Override
    public void map(LongWritable key, Text value,
                    OutputCollector<NullWritable, Writable> output,
                    Reporter reporter) throws IOException {
        // NOTE(review): a null ObjectInspector is passed to serialize();
        // OrcSerde normally needs an inspector describing the row schema —
        // confirm this works with the Hive version on the classpath.
        output.collect(NullWritable.get(), serde.serialize(value, null));
    }
}

/* ORCReducer.java */
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

/**
 * Identity reducer: forwards every serialized ORC row unchanged.
 *
 * <p>Note: the driver sets {@code setNumReduceTasks(0)}, so this reducer is
 * not invoked in the current job configuration; it is kept for completeness.
 */
public class ORCReducer extends MapReduceBase
        implements Reducer<NullWritable, Writable, NullWritable, Writable> {

    @Override
    public void reduce(NullWritable key, Iterator<Writable> values,
                       OutputCollector<NullWritable, Writable> output,
                       Reporter reporter) throws IOException {
        // FIX: the original emitted only values.next() once. Since every map
        // output key is NullWritable, all rows share ONE key, so a single
        // collect() would have dropped every row but the first. Drain the
        // iterator so all rows pass through.
        while (values.hasNext()) {
            output.collect(key, values.next());
        }
    }
}

/* ORCDriver.java */
import java.io.*;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.hive.ql.io.orc.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;

/**
 * Driver for the text-to-ORC conversion job (old {@code mapred} API).
 *
 * <p>Configures a map-only job: {@code TextInputFormat} in,
 * {@code OrcOutputFormat} out, {@link ORCMapper} as the mapper, zero reduce
 * tasks. Paths are hard-coded to a local-pseudo-cluster HDFS.
 */
public class ORCDriver {

    public static void main(String[] args)
            throws IOException, InterruptedException, ClassNotFoundException {
        JobClient client = new JobClient();
        JobConf conf = new JobConf("ORC_Generator");

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputKeyClass(NullWritable.class);
        // NOTE(review): Writable is an interface; verify OrcOutputFormat
        // accepts it as the declared output value class.
        conf.setOutputValueClass(Writable.class);
        conf.setOutputFormat(OrcOutputFormat.class);

        FileInputFormat.addInputPath(conf,
                new Path("hdfs://localhost:9000/path/to/ipdir/textfile"));
        OrcOutputFormat.setOutputPath(conf,
                new Path("hdfs://localhost:9000/path/to/opdir/orcfile"));

        conf.setMapperClass(ORCMapper.class);
        // NOTE(review): queried before the job runs — likely prints null;
        // kept for parity with the original diagnostic output.
        System.out.println(OrcOutputFormat.getWorkOutputPath(conf));
        conf.setNumReduceTasks(0); // map-only job: ORCReducer never runs
        client.setConf(conf);
        try {
            JobClient.runJob(conf);
        } catch (Exception e) {
            // FIX: the original swallowed the failure and exited with status 0.
            // Still print the trace, but make the process exit nonzero so
            // callers/schedulers can detect the failed job.
            e.printStackTrace();
            System.exit(1);
        }
    }
}
Hive: converting text files to ORC format
精选 转载文章标签 一行数据写入hive 表 文章分类 Hive 大数据
-
Apache Hive
Apache Hive的相关知识,包括简单介绍,环境配置,和使用简介
mysql Hive SQL -
Sqoop将MySQL表结构同步到hive(text、orc)
Sqoop将MySQL表结构同步到hive orc格式的
sqoop mysql hive -
hive RCFILE 和orc区别 hive orc parquet
Parquet和ORC对比：1. 存储文件的压缩比总结：ORC > Parquet；2. 存储文件的查询速度总结：查询速度相近，ORC好一点点；3. 可兼容的平台：ORC常用于Hive、Presto；
hive RCFILE 和orc区别 大数据 Hive Hadoop sql