Software Preparation
Setting Up a Standalone Development Environment on Windows
- Install the JDK (make sure the install directory contains no Chinese characters or spaces; note that the default install directory, Program Files, contains a space), and configure JAVA_HOME
- Download the Hadoop release you need: open a browser on Windows and download the Hadoop binary package from http://hadoop.apache.org/releases.html
- Extract it directly (make sure the extraction directory contains no Chinese characters or spaces)
- Configure the HADOOP_HOME environment variable
- Add %HADOOP_HOME%\bin to the system Path variable
- At this point the standalone Hadoop development environment is complete, and you can develop simple MapReduce programs (a quick sanity check follows this list)
- Write a simple WordCount to test it; if it runs, the installation succeeded
- Detailed code: create a new Maven project, import the dependencies matching your installed Hadoop version, and create three Java files: WordcountMapper, WordCountReducer, and WordCountDriver
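Before diving into the code, a quick sanity check: open a new cmd window and confirm that the environment variables are picked up. These are standard commands; if hadoop version prints the version banner, the standalone environment is wired up correctly:

echo %JAVA_HOME%
echo %HADOOP_HOME%
hadoop version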
1. pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.zys</groupId>
    <artifactId>hadoop2-init</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <!-- A fixed version; the RELEASE meta-version is deprecated in Maven 3 -->
            <version>4.12</version>
        </dependency>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-core</artifactId>
            <version>2.8.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.7.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.7.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.7.2</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>2.3.2</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <plugin>
                <!-- Note: no stray whitespace in the artifactId, or Maven cannot resolve the plugin -->
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                    <archive>
                        <manifest>
                            <mainClass>com.zys.wordcount.WordCountDriver</mainClass>
                        </manifest>
                    </archive>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
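With this pom in place, packaging the project from the project root:

mvn clean package

should produce both a plain jar and an assembled hadoop2-init-1.0-SNAPSHOT-jar-with-dependencies.jar under target/ (the jar name is inferred from the artifactId and version above), with WordCountDriver as its main class. Since the driver below hardcodes its input and output paths, the assembled jar can be run with hadoop jar and no arguments, or the driver can simply be launched from the IDE.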
2. WordcountMapper.java
package com.zys.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Map phase
 * KEYIN    type of the input key (the line's byte offset)
 * VALUEIN  type of the input value (the line text)
 * KEYOUT   type of the output key, e.g. aa,1 bb,1
 * VALUEOUT type of the output value
 *
 * @author zhengyunshuo
 * @date 2020/10/19 - 12:26
 */
public class WordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    // Reuse these objects instead of allocating new ones on every loop iteration
    Text k = new Text();
    IntWritable v = new IntWritable(1);

    /**
     * Called once per line read from the input file
     * @param key
     * @param value
     * @param context
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1. Get one line of input
        String line = value.toString();
        // 2. Split it into words
        String[] words = line.split(" ");
        // 3. Write out one (word, 1) pair per word
        for (String word : words) {
            k.set(word);
            // v was initialized to 1 above, so no v.set(1) is needed here
            context.write(k, v);
        }
    }
}
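A concrete illustration of what this mapper emits: for an input line aa bb aa, the map method writes (aa,1), (bb,1), (aa,1) — one pair per word, always with value 1. The summation happens later, in the reducer.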
3. WordCountReducer.java
package com.zys.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Reduce phase: sums the counts for each word
 *
 * @author zhengyunshuo
 * @date 2020/10/19 - 12:42
 */
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    // Reused output value object
    IntWritable v = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int sum = 0;
        // Accumulate the counts for this word
        for (IntWritable value : values) {
            sum += value.get();
        }
        v.set(sum);
        // Write out (word, total)
        context.write(key, v);
    }
}
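Continuing the example from the mapper: the shuffle phase groups the map output by key, so this reducer is invoked once per distinct word — e.g. with (aa, [1,1]) and (bb, [1]) — and writes (aa, 2) and (bb, 1).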
4. WordCountDriver.java
package com.zys.wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Driver: configures and submits the WordCount job
 *
 * @author zhengyunshuo
 * @date 2020/10/19 - 13:59
 */
public class WordCountDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        // 1. Get the Job object
        Job job = Job.getInstance(conf);
        // 2. Tell Hadoop which jar contains the job classes
        job.setJarByClass(WordCountDriver.class);
        // 3. Associate the Mapper and Reducer classes
        job.setMapperClass(WordcountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        // 4. Set the key and value types of the map output
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // 5. Set the key and value types of the final output
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // 6. Set the input and output paths
        String input = "E:\\Workspace\\ideaworksapce\\wordcount\\src\\data";
        String output = "E:\\Workspace\\ideaworksapce\\wordcount\\src\\data\\output";
        FileInputFormat.setInputPaths(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));
        // 7. Submit the job
        // job.submit(); // submit only
        boolean res = job.waitForCompletion(true); // submits and also prints progress to the console
        System.exit(res ? 0 : 1);
    }
}
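To try the job end to end, put a small text file under the hardcoded input directory (the file name words.txt below is just an example) and make sure the output directory does not exist yet — FileOutputFormat refuses to run if the output path already exists. From a cmd window:

mkdir E:\Workspace\ideaworksapce\wordcount\src\data
echo aa bb aa > E:\Workspace\ideaworksapce\wordcount\src\data\words.txt

Then run WordCountDriver from the IDE and inspect the result, which lands in the standard part-r-00000 file:

type E:\Workspace\ideaworksapce\wordcount\src\data\output\part-r-00000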
Setting Up a Pseudo-Distributed Environment on Windows
With the setup above you can write MapReduce programs, but you still cannot test the HDFS API — uploading and downloading files, for example. The next step is to edit the HDFS-related files and configure a pseudo-distributed cluster. (Note: a pseudo-distributed cluster is still a distributed cluster and gives you the effect of distributed computation.)
- Modify the configuration files
Go to the Hadoop folder extracted earlier, open the etc/hadoop folder, and modify the following configuration files.
1. Edit hadoop-env.cmd and append the following at the end of the file
set HADOOP_PREFIX=%HADOOP_HOME%
set HADOOP_CONF_DIR=%HADOOP_PREFIX%\etc\hadoop
set YARN_CONF_DIR=%HADOOP_CONF_DIR%
set PATH=%PATH%;%HADOOP_PREFIX%\bin
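One pitfall here: hadoop-env.cmd also sets JAVA_HOME, and that path must not contain spaces either. If your JDK lives under C:\Program Files, the 8.3 short name PROGRA~1 sidesteps the space (the JDK folder name below is only an example):

set JAVA_HOME=C:\PROGRA~1\Java\jdk1.8.0_201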
2. Edit core-site.xml
<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://0.0.0.0:9000</value>
    </property>
</configuration>
3. Edit hdfs-site.xml
<configuration>
    <property>
        <name>dfs.replication</name>
        <value>1</value>
    </property>
    <property>
        <!-- dfs.name.dir is deprecated; this is the current property name -->
        <name>dfs.namenode.name.dir</name>
        <value>file:///D:/hadoop/hadoop-3.1.4/local/dfs/name</value>
    </property>
    <property>
        <!-- dfs.data.dir is deprecated; this is the current property name -->
        <name>dfs.datanode.data.dir</name>
        <value>file:///D:/hadoop/hadoop-3.1.4/local/dfs/data</value>
    </property>
</configuration>
4. Edit mapred-site.xml (replace %USERNAME% with your own user name)
<configuration>
    <property>
        <name>mapreduce.job.user.name</name>
        <value>%USERNAME%</value>
    </property>
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
    <property>
        <name>yarn.apps.stagingDir</name>
        <value>/user/%USERNAME%/staging</value>
    </property>
    <property>
        <name>mapreduce.jobtracker.address</name>
        <value>local</value>
    </property>
</configuration>
5. Edit yarn-site.xml
<configuration>
    <property>
        <name>yarn.server.resourcemanager.address</name>
        <value>0.0.0.0:8020</value>
    </property>
    <property>
        <name>yarn.server.resourcemanager.application.expiry.interval</name>
        <value>60000</value>
    </property>
    <property>
        <name>yarn.server.nodemanager.address</name>
        <value>0.0.0.0:45454</value>
    </property>
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
    <property>
        <name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
        <value>org.apache.hadoop.mapred.ShuffleHandler</value>
    </property>
    <property>
        <name>yarn.server.nodemanager.remote-app-log-dir</name>
        <value>/app-logs</value>
    </property>
    <property>
        <name>yarn.nodemanager.log-dirs</name>
        <value>/dep/logs/userlogs</value>
    </property>
    <property>
        <name>yarn.server.mapreduce-appmanager.attempt-listener.bindAddress</name>
        <value>0.0.0.0</value>
    </property>
    <property>
        <name>yarn.server.mapreduce-appmanager.client-service.bindAddress</name>
        <value>0.0.0.0</value>
    </property>
    <property>
        <name>yarn.log-aggregation-enable</name>
        <value>true</value>
    </property>
    <property>
        <name>yarn.log-aggregation.retain-seconds</name>
        <value>-1</value>
    </property>
    <property>
        <name>yarn.application.classpath</name>
        <value>%HADOOP_CONF_DIR%,%HADOOP_COMMON_HOME%/share/hadoop/common/*,%HADOOP_COMMON_HOME%/share/hadoop/common/lib/*,%HADOOP_HDFS_HOME%/share/hadoop/hdfs/*,%HADOOP_HDFS_HOME%/share/hadoop/hdfs/lib/*,%HADOOP_MAPRED_HOME%/share/hadoop/mapreduce/*,%HADOOP_MAPRED_HOME%/share/hadoop/mapreduce/lib/*,%HADOOP_YARN_HOME%/share/hadoop/yarn/*,%HADOOP_YARN_HOME%/share/hadoop/yarn/lib/*</value>
    </property>
</configuration>
- Install winutils
Starting a cluster on Windows hits a few known bugs, so go to https://github.com/steveloughran/winutils and download the winutils build matching your Hadoop version, then copy the contents of that version's bin directory into the bin directory of your extracted Hadoop installation.
- Configure hadoop.dll
Copy the hadoop.dll from the previous step into C:\Windows\System32, then restart the computer.
- Initialize the environment variables
Press Win+R, type cmd, and run the following command in the console
%HADOOP_HOME%\etc\hadoop\hadoop-env.cmd
- Format the NameNode with the following command
hadoop namenode -format
- Start the cluster by running %HADOOP_HOME%\sbin\start-all.cmd in the console
- Check the cluster with jps; if NameNode, DataNode, ResourceManager, and NodeManager are all listed, the cluster is running normally
- Verify in a browser
Open localhost:50070 in a browser (on Hadoop 3.x the NameNode web UI moved to localhost:9870); if the page loads, Hadoop has started successfully:
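Besides the web UI, you can also check HDFS from the command line with the standard hdfs dfs commands, for example:

hdfs dfs -mkdir /tmp
hdfs dfs -ls /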
- Write some code and try creating a directory on HDFS
Add a new HDFSClient.java file to the project above, with the following code:
package com.zys.hdfs;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.IOException;

/**
 * @author zhengyunshuo
 * @date 2021/1/4 - 10:47
 */
public class HDFSClient {
    public static void main(String[] args) throws IOException {
        // 1. Get an HDFS client object
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://127.0.0.1:9000");
        FileSystem fs = FileSystem.get(conf);
        // 2. Create a directory on HDFS
        fs.mkdirs(new Path("/zys/test"));
        // 3. Release the resource
        fs.close();
        System.out.println("Compile Over");
    }
}
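After running HDFSClient, the new directory should be visible both in the web UI's file browser and from the shell:

hdfs dfs -ls /zys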
You should see that the directory was created successfully, which confirms the setup works; from here you can get on with development.
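Since the point of the pseudo-distributed setup was to exercise the HDFS API beyond mkdirs — uploads and downloads in particular — here is a minimal sketch using the same FileSystem handle. The class name HDFSCopyClient and the local paths are hypothetical placeholders; copyFromLocalFile and copyToLocalFile are the standard FileSystem calls.

package com.zys.hdfs;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.IOException;

public class HDFSCopyClient {
    public static void main(String[] args) throws IOException {
        // Same client setup as HDFSClient above
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://127.0.0.1:9000");
        FileSystem fs = FileSystem.get(conf);

        // Upload: copy a local file into the directory created earlier
        // (E:/tmp/words.txt is a placeholder; point it at any local file)
        fs.copyFromLocalFile(new Path("E:/tmp/words.txt"), new Path("/zys/test"));

        // Download: copy the file back out of HDFS to a local target
        fs.copyToLocalFile(new Path("/zys/test/words.txt"), new Path("E:/tmp/words-copy.txt"));

        fs.close();
    }
}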