1. Planning
2. Configure ssh to suppress the fingerprint prompt
Later we will write shell scripts to start and stop the HDFS HA cluster. Those scripts run commands such as ssh nodeX, and each connection prints an annoying fingerprint confirmation prompt. How do we stop ssh from asking about the fingerprint? (A sketch of such a start/stop script is given at the end of this document.)
/etc/ssh/ssh_config is the ssh client configuration file, as opposed to sshd_config, the server configuration file.
[root@node1 ~]# vim /etc/ssh/ssh_config
# StrictHostKeyChecking ask
StrictHostKeyChecking no
# Copy the modified file to node2, node3, and node4
[root@node1 ~]# scp /etc/ssh/ssh_config node2:/etc/ssh/
ssh_config 100% 2301 894.3KB/s 00:00
[root@node1 ~]# scp /etc/ssh/ssh_config node3:/etc/ssh/
ssh_config 100% 2301 579.9KB/s 00:00
[root@node1 ~]# scp /etc/ssh/ssh_config node4:/etc/ssh/
ssh_config 100% 2301 298.3KB/s 00:00
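As a quick check (my addition, not part of the original transcript), run a non-interactive command over ssh; with StrictHostKeyChecking no there should be no "Are you sure you want to continue connecting" prompt:
ssh node2 date    # should print node2's date immediately, with no fingerprint prompt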
3. HDFS Configuration
After shutting down the HDFS cluster, delete everything under /var/itbaizhan/hadoop/full and /opt/hadoop-3.1.3/logs on all four nodes:
rm -rf /var/itbaizhan/hadoop/full
rm -rf /opt/hadoop-3.1.3/logs
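Since passwordless ssh between the nodes is already set up, the cleanup can also be driven from node1 with a small loop (a sketch under that assumption; paths as above):
# Run on node1; cleans all four nodes in one pass
for n in node1 node2 node3 node4; do
  ssh $n "rm -rf /var/itbaizhan/hadoop/full /opt/hadoop-3.1.3/logs"
done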
Do all of the following on node1; when finished, scp the files to node2, node3, and node4.
- Configure the JDK in hadoop-env.sh
[root@node1 hadoop]# cd /opt/hadoop-3.1.3/etc/hadoop
[root@node1 hadoop]# vim hadoop-env.sh
export JAVA_HOME=/usr/java/default
- Edit workers to specify where the DataNodes run
[root@node1 hadoop]# vim workers
node2
node3
node4
- Edit core-site.xml
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://mycluster</value>
</property>
<!-- Data storage directory -->
<property>
<name>hadoop.tmp.dir</name>
<value>/var/itbaizhan/hadoop/ha</value>
</property>
<!-- Location and client port of each ZooKeeper server -->
<property>
<name>ha.zookeeper.quorum</name>
<value>node2:2181,node3:2181,node4:2181</value>
</property>
<!-- Avoid permission errors when creating/deleting files from the HDFS web UI -->
<property>
<name>hadoop.http.staticuser.user</name>
<value>root</value>
</property>
</configuration>
- Edit hdfs-site.xml
<configuration>
<!-- JournalNode data storage directory -->
<property>
<name>dfs.journalnode.edits.dir</name>
<value>${hadoop.tmp.dir}/dfs/journalnode/</value>
</property>
<!-- Nameservice (cluster) name -->
<property>
<name>dfs.nameservices</name>
<value>mycluster</value>
</property>
<!-- The NameNodes that make up the nameservice -->
<property>
<name>dfs.ha.namenodes.mycluster</name>
<value>nn1,nn2</value>
</property>
<!-- RPC addresses of the NameNodes -->
<property>
<name>dfs.namenode.rpc-address.mycluster.nn1</name>
<value>node1:9820</value>
</property>
<property>
<name>dfs.namenode.rpc-address.mycluster.nn2</name>
<value>node2:9820</value>
</property>
<!-- HTTP addresses of the NameNodes -->
<property>
<name>dfs.namenode.http-address.mycluster.nn1</name>
<value>node1:9870</value>
</property>
<property>
<name>dfs.namenode.http-address.mycluster.nn2</name>
<value>node2:9870</value>
</property>
<!-- Where NameNode edits are stored on the JournalNodes -->
<property>
<name>dfs.namenode.shared.edits.dir</name>
<value>qjournal://node1:8485;node2:8485;node3:8485/mycluster</value>
</property>
<!-- Proxy provider the client uses to determine which NameNode is Active -->
<property>
<name>dfs.client.failover.proxy.provider.mycluster</name>
<value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
</property>
<!-- Fencing method: ensures only one NameNode serves clients at a time -->
<property>
<name>dfs.ha.fencing.methods</name>
<value>sshfence</value>
</property>
<!-- sshfence requires passwordless ssh using this private key -->
<property>
<name>dfs.ha.fencing.ssh.private-key-files</name>
<value>/root/.ssh/id_dsa</value>
</property>
<!-- Enable automatic NameNode failover -->
<property>
<name>dfs.ha.automatic-failover.enabled</name>
<value>true</value>
</property>
</configuration>
- First, sync the configuration files to node2, node3, and node4
# Run on node1:
[root@node1 hadoop]# scp hadoop-env.sh core-site.xml hdfs-site.xml node2:`pwd`
[root@node1 hadoop]# scp hadoop-env.sh core-site.xml hdfs-site.xml node3:`pwd`
[root@node1 hadoop]# scp hadoop-env.sh core-site.xml hdfs-site.xml node4:`pwd`
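To confirm each node picked up the new files (an optional check, my addition; it assumes hdfs is on the PATH of non-interactive ssh shells, otherwise use its full path), hdfs getconf can print the effective value of a key:
# Each node should report the HA nameservice as the default filesystem
for n in node2 node3 node4; do
  ssh $n "hdfs getconf -confKey fs.defaultFS"   # expected: hdfs://mycluster
done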
4. First startup of the HDFS HA cluster
a) Start the ZooKeeper cluster; run on each of node2, node3, and node4:
zkServer.sh start
b) Start the three JournalNodes on node1, node2, and node3:
hdfs --daemon start journalnode
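Thanks to the ssh configuration from section 2, step b) can also be driven entirely from node1 (a sketch; it assumes hdfs is on the PATH of non-interactive ssh shells):
for n in node1 node2 node3; do
  ssh $n "hdfs --daemon start journalnode"
done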
c) Pick node1 and format HDFS:
[root@node1 hadoop]# hdfs namenode -format
# The following message indicates the format succeeded
2021-10-15 13:21:33,318 INFO common.Storage: Storage directory /var/itbaizhan/hadoop/ha/dfs/name has been successfully formatted.
fsimage files have been generated under /var/itbaizhan/hadoop/ha/dfs/name/current/:
[root@node1 hadoop]# ll /var/itbaizhan/hadoop/ha/dfs/name/current/
total 16
-rw-r--r-- 1 root root 391 Oct 15 13:21 fsimage_0000000000000000000
-rw-r--r-- 1 root root  62 Oct 15 13:21 fsimage_0000000000000000000.md5
-rw-r--r-- 1 root root   2 Oct 15 13:21 seen_txid
-rw-r--r-- 1 root root 218 Oct 15 13:21 VERSION
After formatting, start the NameNode process:
[root@node1 hadoop]# hdfs --daemon start namenode
[root@node1 hadoop]# jps
7347 JournalNode
7689 NameNode
7737 Jps
d) On the other NameNode host, node2, sync the metadata and then start the NameNode there.
[root@node2 ~]# hdfs namenode -bootstrapStandby
# The following output appears:
2021-10-15 13:26:36,101 INFO ha.BootstrapStandby: Found nn: nn1, ipc: node1/192.168.20.101:9820
=====================================================
About to bootstrap Standby ID nn2 from:
Nameservice ID: mycluster
Other Namenode ID: nn1
Other NN's HTTP address: http://node1:9870
Other NN's IPC address: node1/192.168.20.101:9820
Namespace ID: 1743499963
Block pool ID: BP-166908272-192.168.20.101-1634275293276
Cluster ID: CID-38fac5df-ed87-46c5-a4e0-f92ce7008c07
Layout version: -64
isUpgradeFinalized: true
=====================================================
# Start the NameNode
[root@node2 ~]# hdfs --daemon start namenode
[root@node2 ~]# jps
7249 QuorumPeerMain
8019 Jps
7466 JournalNode
7980 NameNode # The NameNode process here means the NameNode started correctly.
e) Initialize the HA state in ZooKeeper. This must be done on a NameNode host (node1 or node2). First, open zkCli.sh on node4 to watch what ZooKeeper stores:
[root@node4 hadoop]# zkCli.sh
[zk: localhost:2181(CONNECTED) 1] ls /
[itbaizhan, registry, wzyy, zk001, zookeeper]
Next, run on node1:
[root@node1 ~]# hdfs zkfc -formatZK
2021-10-15 13:30:20,048 INFO ha.ActiveStandbyElector: Successfully created /hadoop-ha/mycluster in ZK.
Then continue on node4:
[zk: localhost:2181(CONNECTED) 1] ls /
[zookeeper, hadoop-ha]
[zk: localhost:2181(CONNECTED) 2] ls /hadoop-ha
[mycluster]
[zk: localhost:2181(CONNECTED) 3] ls /hadoop-ha/mycluster
[]
At this point, the 3 DataNodes and the 2 ZKFC processes have not yet been started.
f) Start the Hadoop cluster; run on node1:
[root@node1 ~]# start-dfs.sh
# The following errors appear
ERROR: Attempting to operate on hdfs journalnode as root
ERROR: but there is no HDFS_JOURNALNODE_USER defined. Aborting operation.
Starting ZK Failover Controllers on NN hosts [node1 node2]
ERROR: Attempting to operate on hdfs zkfc as root
ERROR: but there is no HDFS_ZKFC_USER defined. Aborting operation.
# Fix: edit the start-dfs.sh file
[root@node1 ~]# vim /opt/hadoop-3.1.3/sbin/start-dfs.sh
# Add:
HDFS_JOURNALNODE_USER=root
HDFS_ZKFC_USER=root
# To avoid the same errors on shutdown, also edit stop-dfs.sh
[root@node1 ~]# vim /opt/hadoop-3.1.3/sbin/stop-dfs.sh
# Add:
HDFS_JOURNALNODE_USER=root
HDFS_ZKFC_USER=root
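An alternative (not used in this walkthrough) is to declare these users once in hadoop-env.sh, which Hadoop 3 also reads, instead of patching the two scripts. A sketch:
# In /opt/hadoop-3.1.3/etc/hadoop/hadoop-env.sh, then scp it to node2, node3, node4
export HDFS_NAMENODE_USER=root
export HDFS_DATANODE_USER=root
export HDFS_JOURNALNODE_USER=root
export HDFS_ZKFC_USER=root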
# Start again
[root@node1 hadoop]# start-dfs.sh
Watch from node4, where zkCli.sh is still running:
[zk: localhost:2181(CONNECTED) 5] ls /hadoop-ha/mycluster
[ActiveBreadCrumb, ActiveStandbyElectorLock]
[zk: localhost:2181(CONNECTED) 6] get -s /hadoop-ha/mycluster/ActiveStandbyElectorLock
myclusternn1node1 ... (binary data; the lock holder is nn1 on node1)
cZxid = 0x600000008
ctime = Fri Oct 15 13:40:10 CST 2021
mZxid = 0x600000008
mtime = Fri Oct 15 13:40:10 CST 2021
pZxid = 0x600000008
cversion = 0
dataVersion = 0
aclVersion = 0
ephemeralOwner = 0x300006fd40a0002
dataLength = 29
numChildren = 0
node1 holds the lock, so its state is active. Visit http://node1:9870 in a browser.
node2 is standby; enter http://node2:9870 in the browser's address bar.
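Besides the web UI, the NameNode states can be checked from the command line with hdfs haadmin (serviceIds nn1/nn2 as defined in hdfs-site.xml above):
hdfs haadmin -getServiceState nn1   # expected: active
hdfs haadmin -getServiceState nn2   # expected: standby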
Kill the NameNode process on node1, the node hosting the Active NameNode:
[root@node1 hadoop]# jps
10337 Jps
7347 JournalNode
9701 DFSZKFailoverController
7689 NameNode
[root@node1 hadoop]# kill -9 7689
# or
[root@node1 hadoop]# hdfs --daemon stop namenode
[root@node1 hadoop]# jps
7347 JournalNode
9701 DFSZKFailoverController
10381 Jps
Continue checking on node4:
[zk: localhost:2181(CONNECTED) 12] get -s /hadoop-ha/mycluster/ActiveStandbyElectorLock
myclusternn2node2 ... (binary data; the lock holder is now nn2 on node2)
cZxid = 0x60000006c
......
However, a check in the browser shows that the Active NameNode does not fail over automatically. The reason is a missing RPM package, psmisc: the sshfence fencing method relies on the fuser command that psmisc provides. Install psmisc on all four nodes:
yum install -y psmisc
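To confirm the fencing dependency is now in place (optional check, my addition):
command -v fuser   # psmisc provides fuser, typically at /usr/sbin/fuser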
Now node1's page can no longer be reached, and node2 has changed from Standby to Active.
Start the NameNode on node1 again:
[root@node1 hadoop]# hdfs --daemon start namenode
node1 comes back as standby, i.e. it is now the backup.
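With everything working, the start/stop scripts promised in section 2 can now be written. Below is a minimal sketch; the file names and the assumption that zkServer.sh and the hadoop scripts are on each node's non-interactive PATH are mine, not from the original:
#!/bin/bash
# starthdfs.sh - run on node1: start ZooKeeper on node2-4, then the HDFS HA cluster
for n in node2 node3 node4; do
  ssh $n "zkServer.sh start"
done
start-dfs.sh   # NameNodes, DataNodes, JournalNodes and ZKFCs, per the config above

#!/bin/bash
# stophdfs.sh - run on node1: stop HDFS first, then ZooKeeper
stop-dfs.sh
for n in node2 node3 node4; do
  ssh $n "zkServer.sh stop"
done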