Corosync用于实现集群中多台机器之间的互相通讯(维持心跳),而pacemaker运行在corosync的上层,统一管理整个集群的运行。Corosync是未来的发展方向,在以后的新项目里一般采用Corosync。hb_gui可以提供很好的HA管理功能,实现图形化的管理;另外相关的图形化工具还有RHCS套件中的luci+ricci。

高可用群集的一致性:硬件、软件以及时间的一致性

ip地址规划:

node1.a.com   192.168.101.10

node2.a.com   192.168.101.20

VIP   192.168.101.100

拓扑:

corosync/openais+pacemaker实现web的高可用性_图形 

一:环境准备:

node1.a.com配置:

1:ip地址设置:

corosync/openais+pacemaker实现web的高可用性_图形_02

2:修改主机名:

[root@lyt ~]# vim /etc/sysconfig/network

corosync/openais+pacemaker实现web的高可用性_图形_03 

[root@lyt ~]# init 6      #重启使主机名生效

[root@node1 ~]# hostname     #查看主机名

corosync/openais+pacemaker实现web的高可用性_图形_04

3:编辑hosts文件(本地主机名解析):

[root@node1 ~]# vim /etc/hosts

corosync/openais+pacemaker实现web的高可用性_的_05

4:同步时间:

[root@node1 ~]# hwclock -s

5:编辑本地yum:

[root@node1 ~]# vim /etc/yum.repos.d/rhel-debuginfo.repo

corosync/openais+pacemaker实现web的高可用性_图形_06

 

node2.a.com配置:

1:ip地址配置

corosync/openais+pacemaker实现web的高可用性_图形_07

2:修改主机名:

[root@lyt ~]# vim /etc/sysconfig/network

corosync/openais+pacemaker实现web的高可用性_图形_08

[root@lyt ~]# init  6     #重启使主机名生效

[root@node2 ~]# hostname     #查看主机名

corosync/openais+pacemaker实现web的高可用性_图形_09

3:同步时间:

[root@node2 ~]# hwclock -s

在node1.a.com和node2.a.com之间设置ssh免密码(公钥认证)通讯:

[root@node1 ~]# ssh-keygen -t rsa     #使用rsa算法生成一个密钥对

corosync/openais+pacemaker实现web的高可用性_的_10

[root@node1 ~]# cd .ssh/

corosync/openais+pacemaker实现web的高可用性_的_11

[root@node1 .ssh]# ssh-copy-id -i id_rsa.pub node2     #将公钥拷贝到node2.a.com中,此处使用它的别名node2;ssh-copy-id会自动把公钥追加到对方的~/.ssh/authorized_keys,不用指明存放位置

[root@node1 .ssh]# scp /etc/hosts node2:/etc/     #将hosts文件拷贝到node2中

[root@node2 ~]# ssh-keygen -t rsa       #使用rsa算法生成一个密钥对

[root@node2 ~]# cd .ssh/

corosync/openais+pacemaker实现web的高可用性_的_12

[root@node2 .ssh]# ssh-copy-id -i id_rsa.pub node1      #将公钥拷贝到node1

[root@node2 .ssh]# scp node1:/etc/yum.repos.d/rhel-debuginfo.repo /etc/yum.repos.d/     #将node1上的yum文件拷贝到node2上

二:安装相关软件包:

node1上的配置:

[root@node1 ~]# ll
total 3376
-rw-r--r-- 1 root root 271360 Jul 15 23:25  cluster-glue-1.0.6-1.6.el5.i386.rpm
-rw-r--r-- 1 root root 133254 Jul 15 23:25  cluster-glue-libs-1.0.6-1.6.el5.i386.rpm
-rw-r--r-- 1 root root 170052 Jul 15 23:25  corosync-1.2.7-1.1.el5.i386.rpm
-rw-r--r-- 1 root root 158502 Jul 15 23:25  corosynclib-1.2.7-1.1.el5.i386.rpm
-rw-r--r-- 1 root root 165591 Jul 15 23:25  heartbeat-3.0.3-2.3.el5.i386.rpm
-rw-r--r-- 1 root root 289600 Jul 15 23:25  heartbeat-libs-3.0.3-2.3.el5.i386.rpm
-rw-r--r-- 1 root root   60458 Jul 15 23:25  libesmtp-1.0.4-5.el5.i386.rpm
-rw-r--r-- 1 root root 207085 Jul 15 23:25  openais-1.1.3-1.6.el5.i386.rpm
-rw-r--r-- 1 root root   94614 Jul 15 23:25  openaislib-1.1.3-1.6.el5.i386.rpm
-rw-r--r-- 1 root root 796813 Jul 15 23:25  pacemaker-1.1.5-1.1.el5.i386.rpm
-rw-r--r-- 1 root root 207925 Jul 15 23:25  pacemaker-cts-1.1.5-1.1.el5.i386.rpm
-rw-r--r-- 1 root root 332026 Jul 15 23:25  pacemaker-libs-1.1.5-1.1.el5.i386.rpm
-rw-r--r-- 1 root root   32818 Jul 15 23:25  perl-TimeDate-1.16-5.el5.noarch.rpm
-rw-r--r-- 1 root root 388632 Jul 15 23:25  resource-agents-1.0.4-1.1.el5.i386.rpm

[root@node1 ~]# mkdir /mnt/cdrom

[root@node1 ~]# mount /dev/cdrom /mnt/cdrom/      #挂载本地光盘

[root@node1 ~]# yum localinstall *.rpm -y  --nogpgcheck        #安装该目录所有的rpm包

[root@node1 ~]# scp *.rpm node2:/root       #将所有的rpm软件包拷贝到node2的/root下

[root@node1 ~]# yum install httpd -y #安装httpd服务器

[root@node1 ~]# echo "node1.a.com" >/var/www/html/index.html        #编辑网页

node2上的配置:

[root@node2 ~]# mkdir /mnt/cdrom

[root@node2 ~]# mount /dev/cdrom /mnt/cdrom/        #创建挂载点用于挂载本地光盘

[root@node2 ~]# yum localinstall *.rpm -y  --nogpgcheck          #localinstall用于安装非系统自带的本地rpm包,并自动从已配置的yum源(此处为光盘)中解决它们的依赖关系

[root@node2 ~]# yum install httpd -y #安装httpd服务器

[root@node2 ~]# echo "node2.a.com" >/var/www/html/index.html       #编辑网页内容

三:修改配置文件

node1.a.com配置

[root@node1 ~]# cd /etc/corosync/

[root@node1 corosync]# cp -p corosync.conf.example corosync.conf     #生成corosync的配置文件

[root@node1 corosync]# vim corosync.conf     #编辑该文件

1 # Please read the corosync.conf.5 manual page
2 compatibility: whitetank
3
4 totem {
5         version: 2         #版本号
6         secauth: off
#是否启用安全认证
7         threads: 0
#使用多少个线程进行认证,0表示无限制
8         interface {
9                 ringnumber: 0
10                 bindnetaddr: 192.168.101.0
#通过哪个网络地址进行通讯,填写心跳网卡所在的网段地址即可(本例为192.168.101.0)
11                 mcastaddr: 226.94.1.1
12                 mcastport: 5405
13         }
14 }
15
16 logging {
17         fileline: off
18         to_stderr: no     
#是否发送标准出错
19         to_logfile: yes    
#日志
20         to_syslog: yes   #是否记录到系统日志(to_logfile与to_syslog建议关掉一个,两者同时开启会降低性能)
21         logfile: /var/log/cluster/corosync.log
#日志文件路径(/var/log/cluster目录需要手动创建)
22         debug: off
23         timestamp: on    
#日志中是否记录时间
24         logger_subsys {
25                 subsys: AMF
26                 debug: off
27         }
28 }
29
30 amf {
31         mode: disabled
32 }
33 service {
34         ver: 0
35         name: pacemaker     
#使用到了pacemaker
36 }
37 aisexec {                   
#使用到openais的一些子选项
38         user: root
39         group: root
40 }

[root@node1 corosync]# corosync-keygen          #产生认证文件

corosync/openais+pacemaker实现web的高可用性_的_13

[root@node1 corosync]# scp -p authkey corosync.conf node2:/etc/corosync/       #将文件拷贝到node2节点(-p表示带上文件属性)

[root@node1 corosync]# mkdir /var/log/cluster      #创建目录cluster

[root@node1 corosync]# ssh node2 'mkdir /var/log/cluster'        #在node1上为node2创建目录cluster

[root@node1 corosync]# service corosync start       #启动corosync服务

[root@node1 corosync]# ssh node2 'service corosync start'       #在node1上将node2上的corosync服务启动

[root@node1 corosync]# grep -i  -e "corosync cluster engine" -e "configuration file" /var/log/messages      #验证corosync引擎是否正常启动了

Jul 15 13:24:50 lyt smartd[3205]: Opened configuration file /etc/smartd.conf
Jul 15 13:24:50 lyt smartd[3205]: Configuration file /etc/smartd.conf was parsed, found DEVICESCAN, scanning devices
Jul 15 13:31:09 lyt smartd[3030]: Opened configuration file /etc/smartd.conf
Jul 15 13:31:09 lyt smartd[3030]: Configuration file /etc/smartd.conf was parsed, found DEVICESCAN, scanning devices
Jul 15 20:04:48 lyt smartd[3060]: Opened configuration file /etc/smartd.conf
Jul 15 20:04:48 lyt smartd[3060]: Configuration file /etc/smartd.conf was parsed, found DEVICESCAN, scanning devices
Jul 15 21:37:01 lyt smartd[3286]: Opened configuration file /etc/smartd.conf
Jul 15 21:37:01 lyt smartd[3286]: Configuration file /etc/smartd.conf was parsed, found DEVICESCAN, scanning devices
Jul 15 22:25:02 lyt smartd[2994]: Opened configuration file /etc/smartd.conf
Jul 15 22:25:02 lyt smartd[2994]: Configuration file /etc/smartd.conf was parsed, found DEVICESCAN, scanning devices
Jul 16 00:41:47 node1 smartd[3000]: Opened configuration file /etc/smartd.conf
Jul 16 00:41:47 node1 smartd[3000]: Configuration file /etc/smartd.conf was parsed, found DEVICESCAN, scanning devices
Jul 16 01:08:14 node1 corosync[3190]:   [MAIN  ] Corosync Cluster Engine ('1.2.7'): started and ready to provide service.
Jul 16 01:08:14 node1 corosync[3190]:   [MAIN  ] Successfully read main configuration file '/etc/corosync/corosync.conf'.

[root@node1 corosync]# grep -i totem /var/log/messages    #查看初始化成员节点通知是否发出

Jul 16 01:08:14 node1 corosync[3190]:   [TOTEM ] Initializing transport (UDP/IP).
Jul 16 01:08:14 node1 corosync[3190]:   [TOTEM ] Initializing transmit/receive security: libtomcrypt SOBER128/SHA1HMAC (mode 0).
Jul 16 01:08:14 node1 corosync[3190]:   [TOTEM ] The network interface is down.
Jul 16 01:08:15 node1 corosync[3190]:   [TOTEM ] Process pause detected for 649 ms, flushing membership messages.
Jul 16 01:08:15 node1 corosync[3190]:   [TOTEM ] A processor joined or left the membership and a new membership was formed.
Jul 16 01:22:04 node1 corosync[3279]:   [TOTEM ] Initializing transport (UDP/IP).
Jul 16 01:22:04 node1 corosync[3279]:   [TOTEM ] Initializing transmit/receive security: libtomcrypt SOBER128/SHA1HMAC (mode 0).
Jul 16 01:22:04 node1 corosync[3279]:   [TOTEM ] The network interface [192.168.101.10] is now up.
Jul 16 01:22:07 node1 corosync[3279]:   [TOTEM ] Process pause detected for 536 ms, flushing membership messages.
Jul 16 01:22:07 node1 corosync[3279]:   [TOTEM ] A processor joined or left the membership and a new membership was formed.

[root@node1 corosync]# grep -i pcmk_startup /var/log/messages       #检查pacemaker是否已经启动了

Jul 16 01:08:15 node1 corosync[3190]:   [pcmk  ] info: pcmk_startup: CRM: Initialized
Jul 16 01:08:15 node1 corosync[3190]:   [pcmk  ] Logging: Initialized pcmk_startup
Jul 16 01:08:15 node1 corosync[3190]:   [pcmk  ] info: pcmk_startup: Maximum core file size is: 4294967295
Jul 16 01:08:15 node1 corosync[3190]:   [pcmk  ] info: pcmk_startup: Service: 9
Jul 16 01:08:15 node1 corosync[3190]:   [pcmk  ] info: pcmk_startup: Local hostname: node1.a.com
Jul 16 01:22:06 node1 corosync[3279]:   [pcmk  ] info: pcmk_startup: CRM: Initialized
Jul 16 01:22:06 node1 corosync[3279]:   [pcmk  ] Logging: Initialized pcmk_startup
Jul 16 01:22:06 node1 corosync[3279]:   [pcmk  ] info: pcmk_startup: Maximum core file size is: 4294967295
Jul 16 01:22:06 node1 corosync[3279]:   [pcmk  ] info: pcmk_startup: Service: 9
Jul 16 01:22:06 node1 corosync[3279]:   [pcmk  ] info: pcmk_startup: Local hostname: node1.a.com

[root@node1 corosync]#  grep -i error:  /var/log/messages  |grep -v unpack_resources       #检查启动过程中是否有错误产生;grep -v unpack_resources用于过滤掉stonith相关的报错(下面列出的是未过滤时看到的stonith报错,需要在后面关闭stonith来解决)

Jul 16 01:09:18 node1 pengine: [3200]: ERROR: unpack_resources: Resource start-up disabled since no STONITH resources have been defined
Jul 16 01:09:18 node1 pengine: [3200]: ERROR: unpack_resources: Either configure some or disable STONITH with the stonith-enabled option
Jul 16 01:09:18 node1 pengine: [3200]: ERROR: unpack_resources: NOTE: Clusters with shared data need STONITH to ensure data integrity
Jul 16 01:21:59 node1 pengine: [3200]: ERROR: unpack_resources: Resource start-up disabled since no STONITH resources have been defined
Jul 16 01:21:59 node1 pengine: [3200]: ERROR: unpack_resources: Either configure some or disable STONITH with the stonith-enabled option
Jul 16 01:21:59 node1 pengine: [3200]: ERROR: unpack_resources: NOTE: Clusters with shared data need STONITH to ensure data integrity
Jul 16 01:23:11 node1 pengine: [3289]: ERROR: unpack_resources: Resource start-up disabled since no STONITH resources have been defined
Jul 16 01:23:11 node1 pengine: [3289]: ERROR: unpack_resources: Either configure some or disable STONITH with the stonith-enabled option
Jul 16 01:23:11 node1 pengine: [3289]: ERROR: unpack_resources: NOTE: Clusters with shared data need STONITH to ensure data integrity

在node2.a.com上查看:

[root@node2 ~]# grep -i  -e "corosync cluster engine" -e "configuration file" /var/log/messages        #验证corosync引擎是否正常启动了

Jul 15 13:24:50 lyt smartd[3205]: Opened configuration file /etc/smartd.conf
Jul 15 13:24:50 lyt smartd[3205]: Configuration file /etc/smartd.conf was parsed, found DEVICESCAN, scanning devices
Jul 15 13:31:09 lyt smartd[3030]: Opened configuration file /etc/smartd.conf
Jul 15 13:31:09 lyt smartd[3030]: Configuration file /etc/smartd.conf was parsed, found DEVICESCAN, scanning devices
Jul 15 20:04:48 lyt smartd[3060]: Opened configuration file /etc/smartd.conf
Jul 15 20:04:48 lyt smartd[3060]: Configuration file /etc/smartd.conf was parsed, found DEVICESCAN, scanning devices
Jul 15 21:37:01 lyt smartd[3286]: Opened configuration file /etc/smartd.conf
Jul 15 21:37:01 lyt smartd[3286]: Configuration file /etc/smartd.conf was parsed, found DEVICESCAN, scanning devices
Jul 16 01:07:12 lyt smartd[3361]: Opened configuration file /etc/smartd.conf
Jul 16 01:07:12 lyt smartd[3361]: Configuration file /etc/smartd.conf was parsed, found DEVICESCAN, scanning devices
Jul 16 01:10:47 lyt smartd[3364]: Opened configuration file /etc/smartd.conf
Jul 16 01:10:47 lyt smartd[3364]: Configuration file /etc/smartd.conf was parsed, found DEVICESCAN, scanning devices
Jul 16 03:26:50 node2 smartd[3033]: Opened configuration file /etc/smartd.conf
Jul 16 03:26:50 node2 smartd[3033]: Configuration file /etc/smartd.conf was parsed, found DEVICESCAN, scanning devices
Jul 16 03:53:49 node2 corosync[3267]:   [MAIN  ] Corosync Cluster Engine ('1.2.7'): started and ready to provide service.
Jul 16 03:53:49 node2 corosync[3267]:   [MAIN  ] Successfully read main configuration file '/etc/corosync/corosync.conf'.

[root@node2 ~]# grep -i totem /var/log/messages         #查看初始化成员节点通知是否发出

Jul 16 03:53:49 node2 corosync[3267]:   [TOTEM ] Initializing transport (UDP/IP).
Jul 16 03:53:49 node2 corosync[3267]:   [TOTEM ] Initializing transmit/receive security: libtomcrypt SOBER128/SHA1HMAC (mode 0).
Jul 16 03:53:49 node2 corosync[3267]:   [TOTEM ] The network interface is down.
Jul 16 03:53:51 node2 corosync[3267]:   [TOTEM ] Process pause detected for 744 ms, flushing membership messages.
Jul 16 03:53:51 node2 corosync[3267]:   [TOTEM ] A processor joined or left the membership and a new membership was formed.
Jul 16 04:06:48 node2 corosync[29324]:   [TOTEM ] Initializing transport (UDP/IP).
Jul 16 04:06:48 node2 corosync[29324]:   [TOTEM ] Initializing transmit/receive security: libtomcrypt SOBER128/SHA1HMAC (mode 0).
Jul 16 04:06:48 node2 corosync[29324]:   [TOTEM ] The network interface [192.168.101.20] is now up.
Jul 16 04:06:57 node2 corosync[29324]:   [TOTEM ] Process pause detected for 2825 ms, flushing membership messages.
Jul 16 04:06:58 node2 corosync[29324]:   [TOTEM ] A processor joined or left the membership and a new membership was formed.
Jul 16 04:07:00 node2 corosync[29324]:   [TOTEM ] A processor joined or left the membership and a new membership was formed.

[root@node2 ~]# grep -i pcmk_startup /var/log/messages        #检查pacemaker是否已经启动了

Jul 16 03:53:49 node2 corosync[3267]:   [pcmk  ] info: pcmk_startup: CRM: Initialized
Jul 16 03:53:50 node2 corosync[3267]:   [pcmk  ] Logging: Initialized pcmk_startup
Jul 16 03:53:50 node2 corosync[3267]:   [pcmk  ] info: pcmk_startup: Maximum core file size is: 4294967295
Jul 16 03:53:50 node2 corosync[3267]:   [pcmk  ] info: pcmk_startup: Service: 9
Jul 16 03:53:50 node2 corosync[3267]:   [pcmk  ] info: pcmk_startup: Local hostname: node2.a.com
Jul 16 04:06:50 node2 corosync[29324]:   [pcmk  ] info: pcmk_startup: CRM: Initialized
Jul 16 04:06:50 node2 corosync[29324]:   [pcmk  ] Logging: Initialized pcmk_startup
Jul 16 04:06:50 node2 corosync[29324]:   [pcmk  ] info: pcmk_startup: Maximum core file size is: 4294967295
Jul 16 04:06:51 node2 corosync[29324]:   [pcmk  ] info: pcmk_startup: Service: 9
Jul 16 04:06:51 node2 corosync[29324]:   [pcmk  ] info: pcmk_startup: Local hostname: node2.a.com

[root@node2 ~]# grep -i error:  /var/log/messages  |grep -v unpack_resources         #检查启动过程中是否有错误产生;grep -v unpack_resources用于过滤掉stonith相关的报错(下面列出的是未过滤时看到的stonith报错,需要在后面关闭stonith来解决)

Jul 16 03:54:53 node2 pengine: [3277]: ERROR: unpack_resources: Resource start-up disabled since no STONITH resources have been defined
Jul 16 03:54:53 node2 pengine: [3277]: ERROR: unpack_resources: Either configure some or disable STONITH with the stonith-enabled option
Jul 16 03:54:53 node2 pengine: [3277]: ERROR: unpack_resources: NOTE: Clusters with shared data need STONITH to ensure data integrity
Jul 16 04:06:45 node2 pengine: [3277]: ERROR: unpack_resources: Resource start-up disabled since no STONITH resources have been defined
Jul 16 04:06:45 node2 pengine: [3277]: ERROR: unpack_resources: Either configure some or disable STONITH with the stonith-enabled option
Jul 16 04:06:45 node2 pengine: [3277]: ERROR: unpack_resources: NOTE: Clusters with shared data need STONITH to ensure data integrity

四:定义群集资源:

node1.a.com的配置:

[root@node1 ~]# crm

crm(live)# configure

crm(live)configure# property stonith-enabled=false       #在上一步骤中,stonith报错,所以将stonith关闭

crm(live)configure# commit        #提交

crm(live)configure# primitive webip ocf:heartbeat:IPaddr params ip=192.168.101.100     #资源名称是webip,后边的ip地址是VIP

crm(live)configure# commit        #提交

crm(live)configure# primitive webserver lsb:httpd        #定义资源名称webserver,资源是httpd服务

crm(live)configure# commit       #提交

crm(live)configure# group web webip webserver      #定义组名web,包含了webip和webserver

crm(live)configure# show         #查看定义的资源

corosync/openais+pacemaker实现web的高可用性_图形_14

crm(live)configure# commit #提交

crm(live)configure# end       #结束

crm(live)# status        #查看状态

corosync/openais+pacemaker实现web的高可用性_的_15

[root@node1 ~]# service httpd status       #查看httpd的运行状态

corosync/openais+pacemaker实现web的高可用性_的_16

在node2.a.com上查看:

corosync/openais+pacemaker实现web的高可用性_图形_17

corosync/openais+pacemaker实现web的高可用性_的_18

[root@node2 ~]# service httpd status       #查看httpd的运行状态

corosync/openais+pacemaker实现web的高可用性_的_19

[root@node2 ~]# crm configure

crm(live)configure# property no-quorum-policy=ignore       #在节点node2上关闭票数(quorum)检查的功能,使两节点集群在只剩一个节点时仍能运行资源

crm(live)configure# commit

测试:

corosync/openais+pacemaker实现web的高可用性_的_20

模拟node1节点停止工作:

[root@node1 ~]# service corosync stop       #关闭node1节点的corosync服务

在node2上查看:

corosync/openais+pacemaker实现web的高可用性_的_21

[root@node2 ~]# service httpd status        #查看httpd状态

corosync/openais+pacemaker实现web的高可用性_图形_22

corosync/openais+pacemaker实现web的高可用性_图形_23

注:在node1.a.com上启用corosync服务后,节点node1并不会将资源抢夺过来,这是为了保证群集的稳定性!!!