Code:

package com.hadoop.reduce.model;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Order/product record for the reduce-side join
 * @author linhaiy
 * @date 2019.05.18
 */
public class OrderInfo implements Writable, Cloneable {
	// Order ID
	private Integer orderId;
	// Order date
	private String orderDate;
	// Product ID
	private String pid;
	// Quantity
	private Integer amount;
	// Product name
	private String pname;
	// Category ID
	private Integer categoryId;
	// Price
	private Double price;
	/**
	 * This field needs a little explanation.<br>
	 * The object carries fields from two files, orders and products. When one file is loaded,
	 * only part of the fields can be filled in; the rest are filled in during the join.
	 * The flag records which kind of data the object currently holds:
	 * "0" means order data, "1" means product data.
	 */
	private String flag;

	public OrderInfo() {
	}

	@Override
	public Object clone() throws CloneNotSupportedException {
		return super.clone();
	}

	@Override
	public void write(DataOutput output) throws IOException {
		output.writeInt(orderId);
		output.writeUTF(orderDate);
		output.writeUTF(pid);
		output.writeInt(amount);
		output.writeUTF(pname);
		output.writeInt(categoryId);
		output.writeDouble(price);
		output.writeUTF(flag);
	}

	@Override
	public void readFields(DataInput input) throws IOException {
		orderId = input.readInt();
		orderDate = input.readUTF();
		pid = input.readUTF();
		amount = input.readInt();
		pname = input.readUTF();
		categoryId = input.readInt();
		price = input.readDouble();
		flag = input.readUTF();
	}

	public void set(Integer orderId, String orderDate, String pid, Integer amount, String pname, Integer categoryId,
			Double price, String flag) {
		this.orderId = orderId;
		this.orderDate = orderDate;
		this.pid = pid;
		this.amount = amount;
		this.pname = pname;
		this.categoryId = categoryId;
		this.price = price;
		this.flag = flag;
	}

	public Integer getOrderId() {
		return orderId;
	}

	public void setOrderId(Integer orderId) {
		this.orderId = orderId;
	}

	public String getOrderDate() {
		return orderDate;
	}

	public void setOrderDate(String orderDate) {
		this.orderDate = orderDate;
	}

	public String getPid() {
		return pid;
	}

	public void setPid(String pid) {
		this.pid = pid;
	}

	public Integer getAmount() {
		return amount;
	}

	public void setAmount(Integer amount) {
		this.amount = amount;
	}

	public String getPname() {
		return pname;
	}

	public void setPname(String pname) {
		this.pname = pname;
	}

	public Integer getCategoryId() {
		return categoryId;
	}

	public void setCategoryId(Integer categoryId) {
		this.categoryId = categoryId;
	}

	public Double getPrice() {
		return price;
	}

	public void setPrice(Double price) {
		this.price = price;
	}

	public String getFlag() {
		return flag;
	}

	public void setFlag(String flag) {
		this.flag = flag;
	}

	@Override
	public String toString() {
		final StringBuilder sb = new StringBuilder("{");
		sb.append("\"orderId\":").append(orderId);
		sb.append(",\"orderDate\":\"").append(orderDate).append('\"');
		sb.append(",\"pid\":\"").append(pid).append('\"');
		sb.append(",\"amount\":").append(amount);
		sb.append(",\"pname\":\"").append(pname).append('\"');
		sb.append(",\"categoryId\":").append(categoryId);
		sb.append(",\"price\":").append(price);
		sb.append(",\"flag\":\"").append(flag).append('\"');
		sb.append('}');
		return sb.toString();
	}
}
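The write and readFields methods above define how Hadoop serializes OrderInfo between the map and reduce phases. The following is a minimal, self-contained sketch (the class name OrderInfoRoundTrip is made up for illustration) that round-trips one order record through the same pair of methods using plain Java streams, so the Writable contract can be seen in isolation:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

import com.hadoop.reduce.model.OrderInfo;

public class OrderInfoRoundTrip {
	public static void main(String[] args) throws Exception {
		OrderInfo in = new OrderInfo();
		// An order record: pname/categoryId/price stay as placeholders until the join fills them in
		in.set(1001, "20170822", "p1", 3, "", 0, 0.0, "0");

		// Serialize exactly as Hadoop would when shuffling the value
		ByteArrayOutputStream bytes = new ByteArrayOutputStream();
		in.write(new DataOutputStream(bytes));

		// Deserialize into a fresh object, as the reducer side would
		OrderInfo out = new OrderInfo();
		out.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

		// Prints the JSON-style toString of the restored object
		System.out.println(out);
	}
}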
package com.hadoop.reduce.mapper;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import com.hadoop.reduce.model.OrderInfo;

import java.io.IOException;

/**
 * Mapper for the MapReduce table-join example
 * @author linhaiy
 * @date 2019.05.18
 */
public class JoinMapper extends Mapper<LongWritable, Text, Text, OrderInfo> {
	private Text text = new Text();
	private OrderInfo orderInfo = new OrderInfo();
	private final static String ORDER_FILE_NAME = "order";
	private final static String PRODUCT_FILE_NAME = "product";
	private final static String ORDER_FLAG = "0";
	private final static String PRODUCT_FLAG = "1";

	/**
	 * Reads order.txt lines in the format 1001,20170822,p1,3 and
	 * product.txt lines in the format p1,防空火箭,1,20.2
	 * @param key
	 * @param value
	 * @param context
	 * @throws IOException
	 * @throws InterruptedException
	 */
	@Override
	protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
		String line = new String(value.getBytes(), 0, value.getLength(), "GBK");
		// Skip header lines, which start with '#'
		if (line.startsWith("#")) {
			return;
		}

		// Get the input split of the current task. InputSplit is the top-level abstraction;
		// since this job reads plain files it can be cast to FileSplit.
		InputSplit inputSplit = context.getInputSplit();
		FileSplit fileSplit = (FileSplit) inputSplit;
		// The file name ("order..." or "product...") tells us which kind of record this split contains
		String fileName = fileSplit.getPath().getName();

		// Decide which record type to parse based on the file name
		String pid = "";
		String[] split = line.split(",");
		if (fileName.startsWith(ORDER_FILE_NAME)) {
			// Order record: order ID, date, product ID, quantity
			Integer orderId = Integer.parseInt(split[0]);
			String orderDate = split[1];
			pid = split[2];
			Integer amount = Integer.parseInt(split[3]);
			// set(Integer orderId, String orderDate, String pid, Integer amount, String pname, Integer categoryId, Double price, String flag)
			orderInfo.set(orderId, orderDate, pid, amount, "", 0, 0.0, ORDER_FLAG);
		} else {
			// Product record: product ID, product name, category ID, price
			pid = split[0];
			String pname = split[1];
			Integer categoryId = Integer.parseInt(split[2]);
			Double price = Double.valueOf(split[3]);
			orderInfo.set(0, "", pid, 0, pname, categoryId, price, PRODUCT_FLAG);
		}
		text.set(pid);
		context.write(text, orderInfo);
	}
}
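To make the mapper's branching concrete, here is a small stand-alone sketch (the class name JoinMapperSketch is hypothetical, and no Hadoop runtime is involved) that parses the two sample lines from the javadoc above and prints the key/value pair the mapper would emit for each:

import com.hadoop.reduce.model.OrderInfo;

public class JoinMapperSketch {
	public static void main(String[] args) {
		OrderInfo info = new OrderInfo();

		// An order.txt line: orderId, date, pid, amount -> key is the pid, flag "0"
		String[] order = "1001,20170822,p1,3".split(",");
		info.set(Integer.parseInt(order[0]), order[1], order[2], Integer.parseInt(order[3]), "", 0, 0.0, "0");
		System.out.println("key=" + order[2] + " value=" + info);

		// A product.txt line: pid, pname, categoryId, price -> key is the pid, flag "1"
		String[] product = "p1,防空火箭,1,20.2".split(",");
		info.set(0, "", product[0], 0, product[1], Integer.parseInt(product[2]), Double.valueOf(product[3]), "1");
		System.out.println("key=" + product[0] + " value=" + info);
	}
}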
package com.hadoop.reduce.reducer;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import com.hadoop.reduce.model.OrderInfo;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Reducer for the MapReduce table-join example
 * @author linhaiy
 * @date 2019.05.18
 */
public class JoinReduce extends Reducer<Text, OrderInfo, OrderInfo, NullWritable> {
	private final static String ORDER_FLAG = "0";
	private final static String PRODUCT_FLAG = "1";

	/**
	 * Processes the mapper output: the key is the product pid, the values are OrderInfo objects
	 * @param key
	 * @param values
	 * @param context
	 * @throws IOException
	 * @throws InterruptedException
	 */
	@Override
	protected void reduce(Text key, Iterable<OrderInfo> values, Context context)
			throws IOException, InterruptedException {
		// Holds the product record; each pid maps to exactly one product, so a single object is enough
		OrderInfo product = new OrderInfo();
		// Holds the order records; one product usually appears in many orders
		List<OrderInfo> list = new ArrayList<>();

		// Iterate over the mapper output for this pid
		for (OrderInfo info : values) {
			// Distinguish order records from the product record by the flag field
			if (ORDER_FLAG.equals(info.getFlag())) {
				// Order record: Hadoop reuses the value object, so clone it before storing
				OrderInfo tmp = new OrderInfo();
				try {
					tmp = (OrderInfo) info.clone();
				} catch (Exception e) {
					e.printStackTrace();
				}
				list.add(tmp);
			} else {
				// Product record
				try {
					product = (OrderInfo) info.clone();
				} catch (Exception e) {
					e.printStackTrace();
				}
			}
		}

		// At this point orders and the product are separated: the orders are in the list,
		// the product is in its own object. Copy the product fields into every order.
		for (OrderInfo tmp : list) {
			tmp.setPname(product.getPname());
			tmp.setCategoryId(product.getCategoryId());
			tmp.setPrice(product.getPrice());
			// Emit the joined record
			context.write(tmp, NullWritable.get());
		}

	}
}
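The effect of the reducer for a single pid can be reproduced without Hadoop. The sketch below (the class name JoinReduceSketch is made up, and the second order 1002,20170823,p1,5 is assumed extra sample data) fills the product fields into each order exactly as the loop above does:

import java.util.ArrayList;
import java.util.List;

import com.hadoop.reduce.model.OrderInfo;

public class JoinReduceSketch {
	public static void main(String[] args) {
		// The single product record for pid "p1"
		OrderInfo product = new OrderInfo();
		product.set(0, "", "p1", 0, "防空火箭", 1, 20.2, "1");

		// Two order records that reference the same pid
		List<OrderInfo> orders = new ArrayList<>();
		OrderInfo o1 = new OrderInfo();
		o1.set(1001, "20170822", "p1", 3, "", 0, 0.0, "0");
		orders.add(o1);
		OrderInfo o2 = new OrderInfo();
		o2.set(1002, "20170823", "p1", 5, "", 0, 0.0, "0");
		orders.add(o2);

		// Copy the product fields into each order, exactly as the reducer does
		for (OrderInfo order : orders) {
			order.setPname(product.getPname());
			order.setCategoryId(product.getCategoryId());
			order.setPrice(product.getPrice());
			System.out.println(order);
		}
	}
}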
package com.hadoop.reduce.service;

import java.io.IOException;

import javax.annotation.PostConstruct;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;

import com.hadoop.reduce.mapper.JoinMapper;
import com.hadoop.reduce.model.OrderInfo;
import com.hadoop.reduce.reducer.JoinReduce;

/**
 * MapReduce job utility class
 * @author linhaiy
 * @date 2019.05.18
 */
@Component
public class ReduceJobsUtils {

	@Value("${hdfs.path}")
	private String path;

	private static String hdfsPath;

	/**
	 * Build the Hadoop configuration for submitting jobs
	 * @return
	 */
	public static Configuration getConfiguration() {
		Configuration configuration = new Configuration();
		configuration.set("fs.defaultFS", hdfsPath);
		configuration.set("mapred.job.tracker", hdfsPath);
		// Uncomment to run against a YARN cluster
		// configuration.set("mapreduce.framework.name", "yarn");
		// This setting tells the client where to find the ResourceManager
		// configuration.set("yarn.resourcemanager.hostname", "node1");
		return configuration;
	}

	/**
	 * MapReduce table-join job
	 * @param jobName
	 * @param inputPath
	 * @param outputPath
	 * @throws IOException
	 * @throws ClassNotFoundException
	 * @throws InterruptedException
	 */
	public static void join(String jobName, String inputPath, String outputPath)
			throws IOException, ClassNotFoundException, InterruptedException {
		Configuration config = getConfiguration();
		Job job = Job.getInstance(config, jobName);
		// Set a class contained in the job jar so Hadoop can locate and ship the jar
		job.setJarByClass(OrderInfo.class);

		job.setMapperClass(JoinMapper.class);
		job.setReducerClass(JoinReduce.class);

		// Mapper output types
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(OrderInfo.class);

		// Reducer output types
		job.setOutputKeyClass(OrderInfo.class);
		job.setOutputValueClass(NullWritable.class);

		// Input and output paths
		FileInputFormat.setInputPaths(job, new Path(inputPath));
		FileOutputFormat.setOutputPath(job, new Path(outputPath));

		job.waitForCompletion(true);
	}

	@PostConstruct
	public void getPath() {
		hdfsPath = this.path;
	}

	public static String getHdfsPath() {
		return hdfsPath;
	}
}
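ReduceJobsUtils relies on Spring to inject hdfs.path before join() can be called. To run the same job as a plain Hadoop driver, a sketch might look like the following (JoinDriver and its command-line arguments are assumptions, not part of the original project); it wires up the identical mapper, reducer and key/value classes:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.hadoop.reduce.mapper.JoinMapper;
import com.hadoop.reduce.model.OrderInfo;
import com.hadoop.reduce.reducer.JoinReduce;

public class JoinDriver {
	// Usage: hadoop jar <your-jar> JoinDriver <fs.defaultFS> <input-dir> <output-dir>
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		conf.set("fs.defaultFS", args[0]);

		Job job = Job.getInstance(conf, "join");
		job.setJarByClass(JoinDriver.class);
		job.setMapperClass(JoinMapper.class);
		job.setReducerClass(JoinReduce.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(OrderInfo.class);
		job.setOutputKeyClass(OrderInfo.class);
		job.setOutputValueClass(NullWritable.class);

		FileInputFormat.setInputPaths(job, new Path(args[1]));
		FileOutputFormat.setOutputPath(job, new Path(args[2]));

		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}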
package com.hadoop.reduce.service;

import org.apache.commons.lang.StringUtils;
import org.springframework.stereotype.Service;
import com.hadoop.hdfs.service.HdfsService;

/**
 * MapReduce job service layer
 * @author linhaiy
 * @date 2019.05.18
 */
@Service
public class MapReduceService {

	// Default reduce output directory
	private static final String OUTPUT_PATH = "/output";

	/**
	 * MapReduce table-join job
	 * @param jobName
	 * @param inputPath
	 * @throws Exception
	 */
	public void join(String jobName, String inputPath) throws Exception {
		if (StringUtils.isEmpty(jobName) || StringUtils.isEmpty(inputPath)) {
			return;
		}
		// Output directory = /output/<jobName>
		String outputPath = OUTPUT_PATH + "/" + jobName;
		if (HdfsService.existFile(outputPath)) {
			HdfsService.deleteFile(outputPath);
		}
		ReduceJobsUtils.join(jobName, inputPath, outputPath);
	}
}
package com.hadoop.reduce.controller;

import org.apache.commons.lang.StringUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestMethod;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.ResponseBody;
import org.springframework.web.bind.annotation.RestController;
import com.hadoop.reduce.service.MapReduceService;
import com.hadoop.util.Result;

/**
 * MapReduce controller layer
 * @author linhaiy
 * @date 2019.05.18
 */
@RestController
@RequestMapping("/hadoop/reduce")
public class MapReduceAction {

	@Autowired
	MapReduceService mapReduceService;

	/**
	 * MapReduce table-join endpoint
	 * @param jobName
	 * @param inputPath
	 * @return
	 * @throws Exception
	 */
	@RequestMapping(value = "join", method = RequestMethod.POST)
	@ResponseBody
	public Result join(@RequestParam("jobName") String jobName, @RequestParam("inputPath") String inputPath)
			throws Exception {
		if (StringUtils.isEmpty(jobName) || StringUtils.isEmpty(inputPath)) {
			return new Result(Result.FAILURE, "Request parameters are empty");
		}
		mapReduceService.join(jobName, inputPath);
		return new Result(Result.SUCCESS, "Table join job completed successfully");
	}
}
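Assuming the Spring Boot application listens on localhost:8080 and the order/product files have been uploaded to an HDFS directory such as /input/join (both are assumptions about your deployment), the endpoint could be exercised from Java 11+ like this:

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

public class JoinRequestExample {
	public static void main(String[] args) throws Exception {
		// jobName names the output subdirectory; inputPath is the assumed HDFS directory holding order.txt and product.txt
		String form = "jobName=join&inputPath=/input/join";

		HttpRequest request = HttpRequest.newBuilder()
				.uri(URI.create("http://localhost:8080/hadoop/reduce/join"))
				.header("Content-Type", "application/x-www-form-urlencoded")
				.POST(HttpRequest.BodyPublishers.ofString(form))
				.build();

		HttpResponse<String> response = HttpClient.newHttpClient()
				.send(request, HttpResponse.BodyHandlers.ofString());
		System.out.println(response.body());
	}
}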