1. WordCount

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

// WordCount: counts how many times each word occurs in the input files
public class WordCountExample {
// Mapper: splits each line on single spaces and emits (word, 1)
private static class WordCountMapper extends Mapper<Object, Text, Text, IntWritable>{

@Override
protected void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
String str=value.toString();
String []strArray=str.split(" ");
for(String s:strArray){
context.write(new Text(s), new IntWritable(1));
}
}

}

// Reducer: sums the counts emitted for each word
private static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable>{

@Override
protected void reduce(Text key, Iterable<IntWritable> values,
Context context)
throws IOException, InterruptedException {
int sum=0;
for(IntWritable count:values){
sum+=count.get();
}
context.write(key, new IntWritable(sum));
}

}

/**
* @param args
*/
public static void main(String[] args) throws Exception{
Configuration conf=new Configuration();
String []argArray=new GenericOptionsParser(conf,args).getRemainingArgs();
if(argArray.length!=2){
System.out.println("Two arguments are required");
System.exit(1);
}
Job job=new Job(conf,"wordcount");
job.setJarByClass(WordCountExample.class);
job.setMapperClass(WordCountMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setReducerClass(WordCountReducer.class);
FileInputFormat.addInputPath(job, new Path(argArray[0]));
FileOutputFormat.setOutputPath(job, new Path(argArray[1]));
System.exit(job.waitForCompletion(true)?0:1);
}

}
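
The map and reduce logic above can be sanity-checked without a cluster by simulating the shuffle in plain Java. The sketch below is illustrative only: the sample lines and the TreeMap standing in for the shuffle-and-sort phase are assumptions, not part of the job.

import java.util.Map;
import java.util.TreeMap;

public class WordCountLocalCheck {
    public static void main(String[] args) {
        String[] lines = {"hello hadoop", "hello world"}; // illustrative input lines
        // The TreeMap plays the role of shuffle + reduce: it groups identical words and keeps a running sum
        Map<String, Integer> counts = new TreeMap<String, Integer>();
        for (String line : lines) {
            for (String word : line.split(" ")) { // same tokenization as WordCountMapper
                Integer old = counts.get(word);
                counts.put(word, old == null ? 1 : old + 1); // same summation as WordCountReducer
            }
        }
        for (Map.Entry<String, Integer> e : counts.entrySet()) {
            System.out.println(e.getKey() + "\t" + e.getValue()); // key<TAB>count, like TextOutputFormat
        }
    }
}

Separately, because WordCountReducer only sums its inputs, job.setCombinerClass(WordCountReducer.class) could be added to the driver to pre-aggregate counts on the map side; that call is an optional addition and is not in the original main.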

2. Deduplication

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

// Deduplication: outputs each distinct input line exactly once
public class DeleteRepeatExample {
// Mapper: the whole input line becomes the key; the IntWritable value is only a placeholder
private static class DeleteRepeatMapper extends Mapper<Object, Text, Text, IntWritable>{

@Override
protected void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
context.write(value, new IntWritable(0));
}

}

// Reducer: identical lines are grouped under one key, so writing each key once removes duplicates
private static class DeleteRepeatReducer extends Reducer<Text, IntWritable, Text, NullWritable>{

@Override
protected void reduce(Text key, Iterable<IntWritable> values,
Context context)
throws IOException, InterruptedException {
context.write(key, NullWritable.get());
}

}

/**
* @param args
*/
public static void main(String[] args) throws Exception{
Configuration conf=new Configuration();
String[]argArray=new GenericOptionsParser(conf, args).getRemainingArgs();
if(argArray.length!=2){
System.out.println("Please provide two arguments");
System.exit(1);
}
Job job=new Job(conf,"delete repeat");
job.setJarByClass(DeleteRepeatExample.class);
job.setMapperClass(DeleteRepeatMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setReducerClass(DeleteRepeatReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
FileInputFormat.addInputPath(job, new Path(argArray[0]));
FileOutputFormat.setOutputPath(job,new Path(argArray[1]));
System.exit(job.waitForCompletion(true)?0:1);

}

}
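
Within a single map task, duplicate lines already collapse to the same key, so a combiner can discard the redundant records before the shuffle. The class below is an optional addition and is not part of the original code; it would be registered in the driver with job.setCombinerClass(DeleteRepeatCombiner.class). Note that a combiner has to keep the map output types (Text, IntWritable), which is why DeleteRepeatReducer itself cannot be reused here.

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Combiner: writes each key once per map task, keeping the (Text, IntWritable) map output types
public class DeleteRepeatCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // One record per distinct line is enough; the value remains a placeholder
        context.write(key, new IntWritable(0));
    }
}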

3. Sorting

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

// Sort: outputs the input integers in ascending order, each preceded by its rank
public class SortExample {
// Mapper: parses each line as an integer key; the value is only a placeholder
private static class SortMapper extends Mapper<Object, Text, IntWritable, IntWritable>{

@Override
protected void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
context.write(new IntWritable(Integer.parseInt(value.toString())), new IntWritable(0));
}

}

// Reducer: keys arrive in ascending order; a running index assigns each value its rank (duplicates each get their own rank)
private static class SortReducer extends Reducer<IntWritable, IntWritable, Text,Text>{
private int index=0;
@Override
protected void reduce(IntWritable key, Iterable<IntWritable> values,
Context context)
throws IOException, InterruptedException {
for(IntWritable i:values){
index++;
context.write(new Text(index+""),new Text(key.get()+""));
}
}

}

/**
* @param args
*/
public static void main(String[] args) throws Exception{
Configuration conf=new Configuration();
String[]argArray=new GenericOptionsParser(conf, args).getRemainingArgs();
if(argArray.length!=2){
System.out.println("Please enter two arguments");
System.exit(1);
}
Job job=new Job(conf,"sort");
job.setJarByClass(SortExample.class);
job.setMapperClass(SortMapper.class);
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(IntWritable.class);
job.setReducerClass(SortReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(argArray[0]));
FileOutputFormat.setOutputPath(job, new Path(argArray[1]));
System.exit(job.waitForCompletion(true)?0:1);

}

}
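
SortExample leaves the actual sorting to the framework: IntWritable keys are sorted in ascending order during the shuffle, and SortReducer only attaches a running rank. That rank is globally correct only if every key reaches the same reducer, so the driver should pin the job to a single reduce task. In the excerpt below, only the setNumReduceTasks call is new; it is an assumption of mine and does not appear in the original main.

job.setReducerClass(SortReducer.class);
job.setNumReduceTasks(1); // one reducer sees all keys in sorted order, so the running index is a global rank
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);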

4. Table self-join

package demo;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.vod.Ejob;

public class SelfJoin {
// Mapper: each child-parent line is written twice: keyed by the parent with tag 1 (carrying the child) and keyed by the child with tag 2 (carrying the parent)
private static class SelfJoinMapper extends Mapper<Object, Text, Text, Text>{

@Override
protected void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
String str=value.toString();
String[] nameArray=str.split(" ");
context.write(new Text(nameArray[1]), new Text("1-"+nameArray[0]+"-"+nameArray[1]));
context.write(new Text(nameArray[0]), new Text("2-"+nameArray[0]+"-"+nameArray[1]));

}

}
// Reducer: for each person, tag-1 values yield that person's children (grandchild candidates) and tag-2 values yield that person's parents (grandparent candidates); their cross product is the output
private static class SelfJoinReducer extends Reducer<Text, Text, Text, Text>{

@Override
protected void reduce(Text key, Iterable<Text> values,
Context context)
throws IOException, InterruptedException {
List<String> outKey=new ArrayList<String>();
List<String> outValue=new ArrayList<String>();
for(Text value:values){
String[] relationArray=value.toString().split("-");
if(relationArray[0].equals("1")){
outKey.add(relationArray[1]);
}else if(relationArray[0].equals("2")){
outValue.add(relationArray[2]);
}
}
for(String k:outKey){
for(int i=0;i<outValue.size();i++){
context.write(new Text(k), new Text(outValue.get(i)));
}
}
}

}
public static void main(String[] args) throws Exception{
File jarFile = Ejob.createTempJar("bin");
//Ejob.addClasspath("/opt/hadoop/conf");
ClassLoader classLoader = Ejob.getClassLoader();
Thread.currentThread().setContextClassLoader(classLoader);

Configuration conf=new Configuration();
String [] argArray=new GenericOptionsParser(conf, args).getRemainingArgs();
if(argArray.length!=2){
System.out.println("Invalid arguments");
System.exit(1);
}
JobConf jobConf=new JobConf(conf);
jobConf.setJar(jarFile.toString());
Job job=new Job(jobConf,"self join");
job.setJarByClass(SelfJoin.class);
job.setMapperClass(SelfJoinMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setReducerClass(SelfJoinReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(argArray[0]));
FileOutputFormat.setOutputPath(job, new Path(argArray[1]));
System.exit(job.waitForCompletion(true)?0:1);

}

}

Data:

Tom Lucy
Tom Jack
Jone Lucy
Jone Jack
Lucy Mary
Lucy Ben
Jack Alice
Jack Jesse
Terry Alice
Terry Jesse
Philip Terry
Philip Alma
Mark Terry
Mark Alma

Result:

Tom Alice
Tom Jesse
Jone Alice
Jone Jesse
Tom Mary
Tom Ben
Jone Mary
Jone Ben
Philip Alice
Philip Jesse
Mark Alice
Mark Jesse
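
Each child-parent line is emitted twice by SelfJoinMapper: keyed by the parent with tag 1 (carrying the child) and keyed by the child with tag 2 (carrying the parent). For a person who shows up on both sides, the reducer's cross product of the two lists produces grandchild/grandparent pairs; for example, "Tom Alice" appears because Tom's parent is Jack and Jack's parent is Alice. The plain-Java sketch below reproduces the same join in memory on the data above; it is only an illustration, not part of the MapReduce job.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class SelfJoinLocalCheck {
    public static void main(String[] args) {
        // The child-parent list from the example data above
        String[] lines = {"Tom Lucy", "Tom Jack", "Jone Lucy", "Jone Jack", "Lucy Mary", "Lucy Ben",
                "Jack Alice", "Jack Jesse", "Terry Alice", "Terry Jesse",
                "Philip Terry", "Philip Alma", "Mark Terry", "Mark Alma"};
        // childrenOf.get(p): people whose parent is p (what the tag-1 records collect)
        // parentsOf.get(c): parents of c (what the tag-2 records collect)
        Map<String, List<String>> childrenOf = new HashMap<String, List<String>>();
        Map<String, List<String>> parentsOf = new HashMap<String, List<String>>();
        for (String line : lines) {
            String[] f = line.split(" ");
            if (!childrenOf.containsKey(f[1])) childrenOf.put(f[1], new ArrayList<String>());
            childrenOf.get(f[1]).add(f[0]);
            if (!parentsOf.containsKey(f[0])) parentsOf.put(f[0], new ArrayList<String>());
            parentsOf.get(f[0]).add(f[1]);
        }
        // A person who is both a parent and a child links grandchildren to grandparents
        for (String person : childrenOf.keySet()) {
            if (!parentsOf.containsKey(person)) continue;
            for (String grandchild : childrenOf.get(person)) {
                for (String grandparent : parentsOf.get(person)) {
                    System.out.println(grandchild + "\t" + grandparent);
                }
            }
        }
    }
}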


 

5. Multi-table join

package demo;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.vod.Ejob;

public class MultiTableJoin {
// Mapper: rows from table2 start with a digit id and are tagged 2 (city name); rows from table1 end with an id and are tagged 1 (factory name); both are keyed by the id
private static class MultiTableMapper extends Mapper<Object, Text, Text, Text>{

@Override
protected void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
String str=value.toString();
if(Character.isDigit(str.charAt(0))){
context.write(new Text(str.charAt(0)+""), new Text("2-"+str.substring(1).trim()));
}else{
context.write(new Text(str.substring(str.length()-1)), new Text("1-"+str.substring(0, str.length()-1).trim()));
}
}

}

// Reducer: for each id, pairs every factory name (tag 1) with every city name (tag 2)
private static class MultiTableReducer extends Reducer<Text, Text, Text, Text>{

@Override
protected void reduce(Text key, Iterable<Text> values,
Context context)
throws IOException, InterruptedException {
List<String>keyList=new ArrayList<String>();
List<String>valueList=new ArrayList<String>();
for(Text value:values){
String str=value.toString();
String []strArray=str.split("-");
if(strArray[0].equals("1")){
keyList.add(strArray[1]);
}else if(strArray[0].equals("2")){
valueList.add(strArray[1]);
}
}
for(String skey:keyList){
for(String svalue:valueList){
context.write(new Text(skey), new Text(svalue));
}
}
}

}


public static void main(String[] args) throws Exception{
File jarFile=Ejob.createTempJar("bin");
ClassLoader classLoader=Ejob.getClassLoader();
Thread.currentThread().setContextClassLoader(classLoader);

Configuration conf=new Configuration();
String [] argArray=new GenericOptionsParser(conf, args).getRemainingArgs();
if(argArray.length!=2){
System.out.println("Invalid arguments");
System.exit(1);
}
JobConf jobConf=new JobConf(conf);
jobConf.setJar(jarFile.toString());
Job job=new Job(jobConf,"multiTable join");
job.setMapperClass(MultiTableMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setReducerClass(MultiTableReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(argArray[0]));
FileOutputFormat.setOutputPath(job, new Path(argArray[1]));
System.exit(job.waitForCompletion(true)?0:1);

}
}

Data: table1.txt


Beijing Red Star 1
Shenzhen Thunder 3
Guangzhou Honda 2
Beijing Rising 1
Guangzhou Development Bank 2
Tencent 3
Bank of Beijing 1

table2.txt

1 Beijing
2 Guangzhou
3 Shenzhen
4 Xian

Result:

Beijing Red Star Beijing
Beijing Rising Beijing
Bank of Beijing Beijing
Guangzhou Honda Guangzhou
Guangzhou Development Bank Guangzhou
Shenzhen Thunder Shenzhen
Tencent Shenzhen
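
MultiTableMapper has to guess which table a record comes from by checking whether the first character is a digit. An alternative is to give each input file its own mapper with MultipleInputs, so no guessing is needed. The sketch below shows that variant; the class names (MultiTableJoinWithMultipleInputs, FactoryMapper, AddressMapper, JoinReducer) are illustrative and not part of the original code, and it assumes three arguments: the table1 path, the table2 path, and the output path.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MultiTableJoinWithMultipleInputs {
    // Mapper for table1.txt lines like "Beijing Red Star 1": key = id, value = "1-" + factory name
    private static class FactoryMapper extends Mapper<Object, Text, Text, Text> {
        @Override
        protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String str = value.toString();
            context.write(new Text(str.substring(str.length() - 1)), new Text("1-" + str.substring(0, str.length() - 1).trim()));
        }
    }

    // Mapper for table2.txt lines like "1 Beijing": key = id, value = "2-" + city name
    private static class AddressMapper extends Mapper<Object, Text, Text, Text> {
        @Override
        protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String str = value.toString();
            context.write(new Text(str.substring(0, 1)), new Text("2-" + str.substring(1).trim()));
        }
    }

    // Same tag-based join as MultiTableReducer above
    private static class JoinReducer extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            List<String> factories = new ArrayList<String>();
            List<String> cities = new ArrayList<String>();
            for (Text value : values) {
                String[] strArray = value.toString().split("-");
                if (strArray[0].equals("1")) {
                    factories.add(strArray[1]);
                } else if (strArray[0].equals("2")) {
                    cities.add(strArray[1]);
                }
            }
            for (String factory : factories) {
                for (String city : cities) {
                    context.write(new Text(factory), new Text(city));
                }
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "multiTable join with MultipleInputs");
        job.setJarByClass(MultiTableJoinWithMultipleInputs.class);
        job.setReducerClass(JoinReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // Each table gets its own mapper and input path; args: table1 path, table2 path, output path
        MultipleInputs.addInputPath(job, new Path(args[0]), TextInputFormat.class, FactoryMapper.class);
        MultipleInputs.addInputPath(job, new Path(args[1]), TextInputFormat.class, AddressMapper.class);
        FileOutputFormat.setOutputPath(job, new Path(args[2]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}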