Preface
A quick walkthrough of how I do Hadoop MapReduce development in IDEA.
- Big Data: Basic Concepts
- Big Data: CentOS Basics
- Big Data: Shell Basics
- Big Data: ZooKeeper
- Big Data: Hadoop Introduction, Configuration and Usage
- Big Data: Hadoop HDFS
- Big Data: MapReduce
- Big Data: Hive
- Big Data: YARN
- Big Data: Using MapReduce
- Big Data: Hadoop High Availability (HA)
Development Environment
- IDEA
- Hadoop
Creating the IDEA Project
Configuration
Create a Maven project, then configure its pom.xml:
<properties>
  <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
  <java.version>1.8</java.version>
</properties>

<url>http://maven.apache.org</url>

<dependencies>
  <dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.7.3</version>
  </dependency>
  <dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-common</artifactId>
    <version>2.7.3</version>
  </dependency>
  <dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-hdfs</artifactId>
    <version>2.7.3</version>
  </dependency>
  <dependency>
    <groupId>junit</groupId>
    <artifactId>junit</artifactId>
    <version>4.12</version>
  </dependency>
</dependencies>
Configure Log4j. Here the configuration is done programmatically (equivalent to a log4j.properties file) in a Constant class, which also defines the local input/output paths and clears the output directory before each run:
import java.io.File;
import java.util.Properties;

import org.apache.log4j.PropertyConfigurator;

public class Constant {
    // Hadoop input/output directories used for local testing.
    public static final String output = "/tmp/output";
    public static final String input = "/tmp/input";

    /**
     * Delete a local folder and all files under it.
     */
    private static void deleteFileAndFolder(String path) {
        File f = new File(path);
        if (f.isDirectory()) { // For a directory, delete its contents recursively first.
            String[] list = f.list();
            assert list != null;
            for (String aList : list) {
                deleteFileAndFolder(path + "/" + aList);
            }
        }
        if (f.delete()) {
            System.out.println("Delete successful.");
        }
    }

    public static void init() {
        // Configure Log4j programmatically (equivalent to a log4j.properties file).
        Properties properties = new Properties();
        properties.setProperty("log4j.rootLogger", "WARN, stdout");
        properties.setProperty("log4j.appender.stdout", "org.apache.log4j.ConsoleAppender");
        properties.setProperty("log4j.appender.stdout.layout", "org.apache.log4j.PatternLayout");
        properties.setProperty("log4j.appender.stdout.layout.ConversionPattern", "%d %p [%c] - %m%n");
        PropertyConfigurator.configure(properties);
        // Clear the local output folder and the files under it.
        deleteFileAndFolder(output);
    }
}
With that, the IDEA project is set up and we can start writing code.
Writing the Code
- The code processes the BX-Book-Ratings dataset. Since the first line of the file is a header row, it needs to be removed.
- Run the following command to strip the header row:
cat BX-Book-Ratings.csv | tail -n +2 > data.csv
- The head command displays the first lines of a file; running
head BX-Book-Ratings.csv
shows the data format (header row not yet removed):
"User-ID";"ISBN";"Book-Rating"
"276725";"034545104X";"0"
"276726";"0155061224";"5"
"276727";"0446520802";"0"
"276729";"052165615X";"3"
"276729";"0521795028";"6"
"276733";"2080674722";"0"
"276736";"3257224281";"8"
"276737";"0600570967";"6"
"276744";"038550120X";"7"
To run the job locally, first set up a Hadoop environment on your machine and add the Hadoop directories to the system PATH so that the commands under the bin and sbin directories can be called directly, as sketched below.
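For example, a minimal sketch of adding Hadoop to the PATH in ~/.bashrc; the installation path /opt/hadoop-2.7.3 below is only an assumption, so adjust it to wherever Hadoop is installed on your machine:
export HADOOP_HOME=/opt/hadoop-2.7.3   # assumed install location, adjust as needed
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin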
For a local run, two configuration properties need to be set:
// Configure the job (Mapper, Reducer).
Configuration conf = new Configuration();
// Local test run (no Hadoop cluster, but a local Hadoop installation is still
// required because the Hadoop commands are used).
// Use this together with the path settings below (input and output are defined in Constant.java, see the configuration section).
conf.set("mapreduce.framework.name", "local");
conf.set("fs.defaultFS", "file:///");
FileInputFormat.setInputPaths(job, new Path(input));
FileOutputFormat.setOutputPath(job, new Path(output));
If you are not running locally, change input and output to args[0] and args[1] respectively.
The complete code is shown below (it computes the average rating of each book):
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MainDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Initialize Log4j. When running on a Hadoop cluster the deleteFileAndFolder
        // call inside init() is not needed; comment it out yourself.
        Constant.init();
        // Configure the job (Mapper, Reducer).
        Configuration conf = new Configuration();
        // Configuration for a local test run (no Hadoop cluster, but a local Hadoop
        // installation is still required because the Hadoop commands are used).
        // Use it together with the path settings below (input and output are defined in Constant.java).
        // conf.set("mapreduce.framework.name", "local");
        // conf.set("fs.defaultFS", "file:///");
        // FileInputFormat.setInputPaths(job, new Path(Constant.input));
        // FileOutputFormat.setOutputPath(job, new Path(Constant.output));
        Job job = Job.getInstance(conf);
        // The class containing the main method (used to locate the Jar).
        job.setJarByClass(MainDriver.class);
        // Set the custom Mapper and Reducer.
        job.setMapperClass(MapTask.class);
        job.setReducerClass(ReduceTask.class);
        // Output types of the Mapper.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(BookBean.class);
        // Output types of the Reducer.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // Optional: custom grouping comparator.
        // job.setGroupingComparatorClass(MyComparator.class);
        // Input and output paths are taken from the command line.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.waitForCompletion(true);
    }

    /**
     * Mapper: override the map method to implement the custom processing.
     */
    static class MapTask extends Mapper<LongWritable, Text, Text, BookBean> {
        @Override
        protected void map(LongWritable key, Text text, Context context) throws IOException, InterruptedException {
            // "User-ID";"ISBN";"Book-Rating"
            // "276725";"034545104X";"0"
            String[] values = text.toString().trim().split(";");
            if (values.length == 3) {
                context.write(new Text(values[1]), new BookBean(values[0], values[1], values[2]));
            }
        }
    }

    /**
     * Reducer: override the reduce method to implement the custom output.
     */
    static class ReduceTask extends Reducer<Text, BookBean, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<BookBean> values, Context context) throws IOException, InterruptedException {
            int size = 0;
            int count = 0;
            for (BookBean bookBean : values) {
                if (!bookBean.getBookRating().equals("")) {
                    ++size;
                    count += Integer.parseInt(bookBean.getBookRating());
                }
            }
            if (size != 0) {
                // Integer average rating per ISBN.
                context.write(key, new Text(String.valueOf(count / size)));
            }
        }
    }
}
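To sanity-check the job's logic, the same per-ISBN integer average can be computed on the local data.csv (the header-stripped file from earlier) with a rough awk one-liner. This is only a sketch: it strips the surrounding quotes for readability, whereas the MapReduce output keeps the quotes around the ISBN key.
# Rough local cross-check of the per-ISBN integer average (quotes removed, unlike the job output)
awk -F';' '{gsub(/"/, "", $2); gsub(/"/, "", $3); sum[$2] += $3; cnt[$2]++}
           END {for (isbn in cnt) print isbn, int(sum[isbn] / cnt[isbn])}' data.csv | head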
Modify Maven's ./conf/settings.xml configuration file and add the following inside <profiles></profiles>:
<profile>
  <id>jdk18</id>
  <activation>
    <activeByDefault>true</activeByDefault>
    <jdk>1.8</jdk>
  </activation>
  <properties>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
    <maven.compiler.compilerVersion>1.8</maven.compiler.compilerVersion>
  </properties>
</profile>
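To confirm that the profile is actually picked up by your build, the maven-help-plugin can list the active profiles (assuming the plugin is resolvable, which it normally is):
mvn help:active-profiles    # the jdk18 profile should show up in the list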
Note that the Bean class must implement the WritableComparable interface and override its compareTo, write, and readFields methods.
import java.io.DataInput;
import java.io.DataOutput;
import java.io.EOFException;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class BookBean implements WritableComparable<BookBean> {
    // "User-ID";"ISBN";"Book-Rating"
    private String userId;
    private String ISBN;
    private String bookRating;

    // The no-arg constructor is required: Hadoop creates instances via reflection.
    public BookBean() {
    }

    BookBean(String userId, String ISBN, String bookRating) {
        this.userId = userId.replaceAll("[^0-9a-zA-Z]", "");
        this.ISBN = ISBN;
        this.bookRating = bookRating.replaceAll("[^0-9a-zA-Z]", "");
    }

    public String getUserId() {
        return userId;
    }

    public void setUserId(String userId) {
        this.userId = userId;
    }

    public String getISBN() {
        return ISBN;
    }

    public void setISBN(String ISBN) {
        this.ISBN = ISBN;
    }

    public String getBookRating() {
        return bookRating;
    }

    public void setBookRating(String bookRating) {
        this.bookRating = bookRating;
    }

    @Override
    public String toString() {
        return "BookBean [userId=" + userId +
                " ISBN=" + ISBN +
                " bookRating=" + bookRating + "]";
    }

    @Override
    public int compareTo(BookBean o) {
        return ISBN.compareTo(o.ISBN);
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(userId);
        out.writeUTF(ISBN);
        out.writeUTF(bookRating);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        try {
            this.userId = in.readUTF();
            this.ISBN = in.readUTF();
            this.bookRating = in.readUTF();
        } catch (EOFException e) {
            // Fall back to empty fields if the record cannot be read completely.
            this.userId = "";
            this.ISBN = "";
            this.bookRating = "";
        }
    }
}
One-Click Deployment and Execution on the Server
To deploy and run with one click from IDEA, I wrote Linux shell scripts on my local machine to automate the workflow, combining Maven packaging, scp, and Hadoop commands.
There are two scripts, quick.sh and deal.sh. quick.sh stays on the local machine, while deal.sh must be placed in the root user's home directory on the NameNode. Passwordless SSH login needs to be set up from the local machine; see the "Big Data: ZooKeeper" article and search for the passwordless login section for a tutorial, and a minimal sketch is shown below.
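A minimal sketch of setting up passwordless SSH to the NameNode (the host name node0 matches the scripts below; adjust it to your cluster):
ssh-keygen -t rsa            # generate a key pair if you do not already have one
ssh-copy-id root@node0       # install the public key on the NameNode
ssh root@node0 "hostname"    # should log in without asking for a password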
- The quick.sh script, executed locally:
- You need to set src_path, the path to the Hadoop project directory,
- as well as target_path, the path on the NameNode where the script and the Jar are uploaded and executed.
#!/bin/sh
# Path to the Hadoop project (set this to your own project directory).
src_path=/path/to/your/project
jar_name=hdfs-1.0-SNAPSHOT.jar
target_path=/root
data_path=$1
file_name=$2
main_class=$3
echo ">>> data_path->$1"
echo ">>> file_name->$2"
echo ">>> main_class->$3"
# Build the Jar with Maven (tests skipped).
cd $src_path
mvn clean install -Dmaven.test.skip=true
cd -
# Copy the Jar and the data file to the NameNode.
scp $src_path/target/$jar_name root@node0:$target_path
scp $data_path/$file_name root@node0:$target_path
# Pass the arguments to deal.sh via arg.txt (overwrite, do not append).
echo $jar_name,$target_path,$data_path,$file_name,$main_class > arg.txt
scp arg.txt root@node0:$target_path
# Run the remote script on the NameNode.
ssh root@node0 "sh deal.sh"
rm arg.txt
unset src_path
unset jar_name
unset target_path
unset data_path
unset file_name
unset main_class
unset hdfs_input
unset hdfs_output
echo "Finish..."
- The deal.sh script placed in the root user's home directory on the NameNode:
#!/bin/sh
hdfs_input=/data/input
hdfs_output=/data/output
# Read the arguments written into arg.txt by quick.sh.
jar_name=`cat arg.txt | cut -d ',' -f 1`
target_path=`cat arg.txt | cut -d ',' -f 2`
data_path=`cat arg.txt | cut -d ',' -f 3`
file_name=`cat arg.txt | cut -d ',' -f 4`
main_class=`cat arg.txt | cut -d ',' -f 5`
echo ">>> hadoop...."
echo ">>> remove $hdfs_output data"
hadoop fs -rm -r -f $hdfs_output
echo ">>> remove $hdfs_input"
hadoop fs -rm -r -f $hdfs_input
echo ">>> create $hdfs_input"
hadoop fs -mkdir -p $hdfs_input
echo ">>> put file $target_path/$file_name"
hadoop fs -put $target_path/$file_name $hdfs_input
echo ">>> dealing..."
hadoop jar $target_path/$jar_name $main_class $hdfs_input $hdfs_output
echo "<<done."
echo ">>> show result..."
# "hadoop fs -ls" prints "Found N items" (3 words) followed by 8 fields per entry,
# so the first file path is word 11 and each subsequent path is 8 words later.
temp=`hadoop fs -ls $hdfs_output`
echo $temp
num=`echo $temp | wc -w`
count=11
while [ $count -le $num ]; do
file=`echo $temp | cut -d ' ' -f $count`
echo ">>> show ---- $file"
hadoop fs -cat $file
count=$(($count+8))
echo "<<< done."
done
echo "<<< hadoop done."
# Clean up the uploaded files on the NameNode.
rm $target_path/$jar_name
rm $target_path/arg.txt
rm $target_path/$file_name
unset count
unset file
unset temp
unset num
unset src_path
unset jar_name
unset target_path
unset data_path
unset file_name
unset main_class
unset hdfs_input
unset hdfs_output
- Running the script:
source quick.sh /tmp/input BX-Book-Ratings.csv MR.AnalysisBook.MainDriver
- The first argument, /tmp/input, is the path to the input data.
- The second argument, BX-Book-Ratings.csv, is the name of the data file.
- The third argument, MR.AnalysisBook.MainDriver, is the fully qualified main class.
Note: right after startup, the NameNode needs about 30 seconds to leave safe mode, so wait before running the job; you can check its state as shown below.
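To check whether the NameNode is still in safe mode (or to block until it leaves), the standard dfsadmin commands can be used:
hdfs dfsadmin -safemode get     # prints "Safe mode is ON" / "Safe mode is OFF"
hdfs dfsadmin -safemode wait    # blocks until the NameNode has left safe mode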
The execution output looks like this:
cat BX-Book-Ratings.csv | tail -n +2 > data.csv
source quick.sh /tmp/input data.csv MR.AnalysisBook.MainDriver
>>> data_path->/tmp/input
>>> file_name->data.csv
>>> main_class->MR.AnalysisBook.MainDriver
[INFO] Scanning for projects...
[INFO]
[INFO] ------------------------------------------------------------------------
[INFO] Building hdfs 1.0-SNAPSHOT
[INFO] ------------------------------------------------------------------------
[INFO]
[INFO] --- maven-clean-plugin:2.5:clean (default-clean) @ hdfs ---
[INFO] Deleting /home/Notzuonotdied/IdeaProjects/BBS/hadoop-test/target
[INFO]
[INFO] --- maven-resources-plugin:2.6:resources (default-resources) @ hdfs ---
[INFO] Using 'UTF-8' encoding to copy filtered resources.
[INFO] Copying 0 resource
[INFO]
[INFO] --- maven-compiler-plugin:3.1:compile (default-compile) @ hdfs ---
[INFO] Changes detected - recompiling the module!
[INFO] Compiling 26 source files to /home/Notzuonotdied/IdeaProjects/BBS/hadoop-test/target/classes
[WARNING] /home/Notzuonotdied/IdeaProjects/BBS/hadoop-test/src/main/java/MR/MapOrder/OrderDriver.java: /home/Notzuonotdied/IdeaProjects/BBS/hadoop-test/src/main/java/MR/MapOrder/OrderDriver.java uses or overrides a deprecated API.
[WARNING] /home/Notzuonotdied/IdeaProjects/BBS/hadoop-test/src/main/java/MR/MapOrder/OrderDriver.java: Recompile with -Xlint:deprecation for details.
[INFO]
[INFO] --- maven-resources-plugin:2.6:testResources (default-testResources) @ hdfs ---
[INFO] Not copying test resources
[INFO]
[INFO] --- maven-compiler-plugin:3.1:testCompile (default-testCompile) @ hdfs ---
[INFO] Not compiling test sources
[INFO]
[INFO] --- maven-surefire-plugin:2.12.4:test (default-test) @ hdfs ---
[INFO] Tests are skipped.
[INFO]
[INFO] --- maven-jar-plugin:2.4:jar (default-jar) @ hdfs ---
[INFO] Building jar: /home/Notzuonotdied/IdeaProjects/BBS/hadoop-test/target/hdfs-1.0-SNAPSHOT.jar
[INFO]
[INFO] --- maven-install-plugin:2.4:install (default-install) @ hdfs ---
[INFO] Installing /home/Notzuonotdied/IdeaProjects/BBS/hadoop-test/target/hdfs-1.0-SNAPSHOT.jar to /home/Notzuonotdied/.m2/repository/hadoop/hdfs/1.0-SNAPSHOT/hdfs-1.0-SNAPSHOT.jar
[INFO] Installing /home/Notzuonotdied/IdeaProjects/BBS/hadoop-test/pom.xml to /home/Notzuonotdied/.m2/repository/hadoop/hdfs/1.0-SNAPSHOT/hdfs-1.0-SNAPSHOT.pom
[INFO] ------------------------------------------------------------------------
[INFO] BUILD SUCCESS
[INFO] ------------------------------------------------------------------------
[INFO] Total time: 5.992 s
[INFO] Finished at: 2018-09-25T14:38:24+08:00
[INFO] Final Memory: 26M/331M
[INFO] ------------------------------------------------------------------------
hdfs-1.0-SNAPSHOT.jar 100% 52KB 22.9MB/s 00:00
data.csv 100% 29MB 42.1MB/s 00:00
arg.txt 100% 75 85.7KB/s 00:00
>>> hadoop....
>>> remove /data/output data
18/09/25 02:38:29 INFO fs.TrashPolicyDefault: Namenode trash configuration: Deletion interval = 0 minutes, Emptier interval = 0 minutes.
Deleted /data/output
>>> remove /data/input
18/09/25 02:38:33 INFO fs.TrashPolicyDefault: Namenode trash configuration: Deletion interval = 0 minutes, Emptier interval = 0 minutes.
Deleted /data/input
>>> create /data/input
>>> put file /root/data.csv
>>> dealing...
2018-09-25 02:38:51,057 WARN [org.apache.hadoop.mapreduce.JobResourceUploader] - Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
<<done.
>>> show result...
Found 2 items -rw-r--r-- 1 root supergroup 0 2018-09-25 02:40 /data/output/_SUCCESS -rw-r--r-- 1 root supergroup 5126955 2018-09-25 02:40 /data/output/part-r-00000
>>> show ---- /data/output/_SUCCESS
<<< done.
>>> show ---- /data/output/part-r-00000
"b00005wz75" 0
"cn108465" 0
……
"cn113107" 0
"ooo7156103" 7
"Խcrosoft" 7
<<< done.
<<< hadoop done.
Finish...
Appendix
- Machine Learning: A Personal Collection of Recommender System Test Datasets
- Packaging an executable Java Jar in IDEA
- Setting up a Hadoop application development environment with Maven