Lab environment:

Two Linux (CentOS 5.4) virtual machines

Lab topology:

corosync + openais + pacemaker + httpd (web) + DRBD, two-node cluster

The lab configuration is as follows:

Basic configuration on the servers

[root@love ~]# hostname a.abc.com

[root@love ~]# logout

[root@a ~]# vim /etc/sysconfig/network

NETWORKING=yes
NETWORKING_IPV6=no
HOSTNAME=a.abc.com

[root@a ~]# vim /etc/hosts

1 # Do not remove the following line, or various programs
2 # that require network functionality will fail.
3 127.0.0.1               localhost.localdomain localhost
4 ::1             localhost6.localdomain6 localhost6
5 192.168.10.99   a.abc.com
6 192.168.10.100  b.abc.com

[root@a ~]# ssh-keygen -t rsa
Generating public/private rsa key pair.
Enter file in which to save the key (/root/.ssh/id_rsa):
Created directory '/root/.ssh'.
Enter passphrase (empty for no passphrase):
Enter same passphrase again:
Your identification has been saved in /root/.ssh/id_rsa.
Your public key has been saved in /root/.ssh/id_rsa.pub.
The key fingerprint is:
dd:b2:26:28:b5:fb:e5:84:3b:6b:c8:3d:29:b4:e3:a5
root@a.abc.com 

[root@a ~]# cd .ssh/
[root@a .ssh]# ssh-copy-id -i id_rsa.pub b.abc.com
The authenticity of host 'b.abc.com (192.168.10.100)' can't be established.
RSA key fingerprint is 0a:78:89:da:1e:1d:97:95:0b:8b:03:22:e7:af:22:5c.
Are you sure you want to continue connecting (yes/no)? yes 
Warning: Permanently added 'b.abc.com,192.168.10.100' (RSA) to the list of known hosts.
root@b.abc.com's password:
Now try logging into the machine, with "ssh 'b.abc.com'", and check in:

  .ssh/authorized_keys

to make sure we haven't added extra keys that you weren't expecting.
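
To confirm that key-based login works before going further, a quick optional check is to run a command on the peer; it should complete without asking for a password:

[root@a .ssh]# ssh b.abc.com 'hostname'    # should print the peer's hostname with no password prompt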

Install the required packages on both nodes, a.abc.com and b.abc.com

[root@a ~]# ll |grep rpm
-rw-r--r-- 1 root root 271360 10-06 19:32 cluster-glue-1.0.6-1.6.el5.i386.rpm

# cluster glue components (local resource manager, STONITH plugins and shared libraries)
-rw-r--r-- 1 root root 133254 10-06 19:32 cluster-glue-libs-1.0.6-1.6.el5.i386.rpm
-rw-r--r-- 1 root root 170052 10-06 19:32 corosync-1.2.7-1.1.el5.i386.rpm

# main package of the corosync cluster engine
-rw-r--r-- 1 root root 158502 10-06 19:32 corosynclib-1.2.7-1.1.el5.i386.rpm
-rw-r--r-- 1 root root 221868 10-06 19:31 drbd83-8.3.8-1.el5.centos.i386.rpm

# user-space management tools for the DRBD network storage
-rw-r--r-- 1 root root 165591 10-06 19:32 heartbeat-3.0.3-2.3.el5.i386.rpm
# heartbeat components used for node heartbeat detection

-rw-r--r-- 1 root root 289600 10-06 19:32 heartbeat-libs-3.0.3-2.3.el5.i386.rpm

# kernel module package for the DRBD storage
-rw-r--r-- 1 root root 125974 10-06 19:31 kmod-drbd83-8.3.8-1.el5.centos.i686.rpm
-rw-r--r-- 1 root root  60458 10-06 19:32 libesmtp-1.0.4-5.el5.i386.rpm

-rw-r--r-- 1 root root 207085 10-06 19:32 openais-1.1.3-1.6.el5.i386.rpm
# OpenAIS AIS service layer that runs on top of corosync

-rw-r--r-- 1 root root  94614 10-06 19:32 openaislib-1.1.3-1.6.el5.i386.rpm
-rw-r--r-- 1 root root 796813 10-06 19:32 pacemaker-1.1.5-1.1.el5.i386.rpm

# Pacemaker, the cluster resource manager (fills a role similar to heartbeat's CRM)
-rw-r--r-- 1 root root 207925 10-06 19:32 pacemaker-cts-1.1.5-1.1.el5.i386.rpm
-rw-r--r-- 1 root root 332026 10-06 19:32 pacemaker-libs-1.1.5-1.1.el5.i386.rpm
-rw-r--r-- 1 root root  32818 10-06 19:32 perl-TimeDate-1.16-5.el5.noarch.rpm
-rw-r--r-- 1 root root 388632 10-06 19:32 resource-agents-1.0.4-1.1.el5.i386.rpm

[root@a ~]# yum  localinstall *.rpm -y --nogpgcheck

[root@a ~]# cd /etc/corosync/
[root@a corosync]# ll
total 20
-rw-r--r-- 1 root root 5384 2010-07-28 amf.conf.example
-rw-r--r-- 1 root root  436 2010-07-28 corosync.conf.example
drwxr-xr-x 2 root root 4096 2010-07-28 service.d
drwxr-xr-x 2 root root 4096 2010-07-28 uidgid.d
[root@a corosync]# cp corosync.conf.example corosync.conf

[root@a corosync]# vim corosync.conf   # modify/add the following lines

10                 bindnetaddr: 192.168.10.0

33 service {
34         ver: 0
35         name: pacemaker
36 }      
37 aisexec {
38         user: root
39         group: root
40 }
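
For reference, the bindnetaddr line shown above (line 10 of corosync.conf) sits inside the totem { interface { ... } } block of the example file. A rough sketch of that block is shown below; every value other than bindnetaddr is the shipped default and may differ slightly in your copy, and secauth is often turned on so that the authkey generated in the next step is actually used:

totem {
        version: 2
        secauth: on
        threads: 0
        interface {
                ringnumber: 0
                bindnetaddr: 192.168.10.0
                mcastaddr: 226.94.1.1
                mcastport: 5405
        }
}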

Generate the authentication key

[root@a corosync]# corosync-keygen
Corosync Cluster Engine Authentication key generator.
Gathering 1024 bits for key from /dev/random.
Press keys on your keyboard to generate entropy.
Writing corosync key to /etc/corosync/authkey.
[root@a corosync]# ll
total 28
-rw-r--r-- 1 root root 5384 2010-07-28 amf.conf.example
-r-------- 1 root root  128 10-06 20:37 authkey
-rw-r--r-- 1 root root  539 10-06 20:29 corosync.conf
-rw-r--r-- 1 root root  436 2010-07-28 corosync.conf.example
drwxr-xr-x 2 root root 4096 2010-07-28 service.d
drwxr-xr-x 2 root root 4096 2010-07-28 uidgid.d

Make sure the configuration files are identical on both nodes

[root@a corosync]# scp -p authkey corosync.conf b.abc.com:/etc/corosync/
authkey                          100%  128     0.1KB/s   00:00   
corosync.conf                    100%  539     0.5KB/s   00:00   
Manually create the log directory on both nodes

[root@a corosync]# mkdir /var/log/cluster
[root@a corosync]# ssh b.abc.com 'mkdir /var/log/cluster'

[root@a corosync]# service corosync start
Starting Corosync Cluster Engine (corosync):               [ OK ]
[root@a corosync]# ssh b.abc.com 'service corosync start'
Starting Corosync Cluster Engine (corosync): [ OK ]

Verify that the corosync engine started properly

[root@a corosync]# grep -i  -e "corosync cluster engine" -e "configuration file" /var/log/messages
Oct  5 19:21:28 love smartd[3114]: Opened configuration file /etc/smartd.conf
Oct  5 19:21:28 love smartd[3114]: Configuration file /etc/smartd.conf was parsed, found DEVICESCAN, scanning devices
Oct  6 19:01:36 love smartd[3238]: Opened configuration file /etc/smartd.conf
Oct  6 19:01:36 love smartd[3238]: Configuration file /etc/smartd.conf was parsed, found DEVICESCAN, scanning devices
Oct  6 21:01:08 love corosync[6871]:   [MAIN  ] Corosync Cluster Engine ('1.2.7'): started and ready to provide service.
Oct  6 21:01:08 love corosync[6871]:   [MAIN  ] Successfully read main configuration file '/etc/corosync/corosync.conf'.

Check whether the initial membership notifications were sent out

[root@a corosync]# grep -i totem /var/log/messages
Oct  6 21:01:08 love corosync[6871]:   [TOTEM ] Initializing transport (UDP/IP).
Oct  6 21:01:08 love corosync[6871]:   [TOTEM ] Initializing transmit/receive security: libtomcrypt SOBER128/SHA1HMAC (mode 0).
Oct  6 21:01:09 love corosync[6871]:   [TOTEM ] The network interface [192.168.10.99] is now up.
Oct  6 21:01:10 love corosync[6871]:   [TOTEM ] A processor joined or left the membership and a new membership was formed.

Check whether any errors were produced during the process (the errors below are from an earlier attempt, before the 21:01 restart shown above)

[root@a corosync]# grep -i error:  /var/log/messages  |grep -v unpack_resources
Oct  6 20:43:34 love crmd: [6346]: ERROR: crm_timer_popped: Election Timeout (I_ELECTION_DC) just popped! (120000ms)
Oct  6 20:46:34 love crmd: [6346]: ERROR: crm_timer_popped: Integration Timer (I_INTEGRATED) just popped! (180000ms)
Oct  6 20:49:34 love crmd: [6346]: ERROR: crm_timer_popped: Integration Timer (I_INTEGRATED) just popped! (180000ms)
Oct  6 20:52:35 love corosync[6333]:   [pcmk  ] ERROR: pcmk_peer_update: Something strange happened: 1
Oct  6 20:52:40 love corosync[6333]:   [pcmk  ] ERROR: pcmk_peer_update: Something strange happened: 1
Oct  6 20:53:54 love crmd: [6346]: ERROR: crmd_ha_msg_filter: Another DC detected: b.abc.com (op=noop)

Check whether pacemaker has started

[root@a corosync]# grep -i pcmk_startup /var/log/messages
Oct  6 21:01:09 love corosync[6871]:   [pcmk  ] info: pcmk_startup: CRM: Initialized
Oct  6 21:01:09 love corosync[6871]:   [pcmk  ] Logging: Initialized pcmk_startup
Oct  6 21:01:09 love corosync[6871]:   [pcmk  ] info: pcmk_startup: Maximum core file size is: 4294967295
Oct  6 21:01:09 love corosync[6871]:   [pcmk  ] info: pcmk_startup: Service: 9
Oct  6 21:01:09 love corosync[6871]:   [pcmk  ] info: pcmk_startup: Local hostname: a.abc.com

View the cluster membership status

[root@a corosync]# crm status
============
Last updated: Sat Oct  6 21:06:03 2012
Stack: openais
Current DC: a.abc.com - partition with quorum
Version: 1.1.5-1.1.el5-01e86afaaa6d4a8c4836f68df80ababd6ca3902f
2 Nodes configured, 2 expected votes
0 Resources configured.
============

Online: [ a.abc.com b.abc.com ]

crm   # a management shell provided by pacemaker

[root@a corosync]# crm configure show xml   # display the cluster configuration in XML format
<?xml version="1.0" ?>
<cib admin_epoch="0" crm_feature_set="3.0.5" dc-uuid="a.abc.com" epoch="13" have-quorum="1" num_updates="102" validate-with="pacemaker-1.2">
  <configuration>
    <crm_config>
      <cluster_property_set id="cib-bootstrap-options">
        <nvpair id="cib-bootstrap-options-dc-version" name="dc-version" value="1.1.5-1.1.el5-01e86afaaa6d4a8c4836f68df80ababd6ca3902f"/>
        <nvpair id="cib-bootstrap-options-cluster-infrastructure" name="cluster-infrastructure" value="openais"/>
        <nvpair id="cib-bootstrap-options-expected-quorum-votes" name="expected-quorum-votes" value="2"/>
      </cluster_property_set>
    </crm_config>
    <nodes>
      <node id="b.abc.com" type="normal" uname="b.abc.com"/>
      <node id="a.abc.com" type="normal" uname="a.abc.com"/>
    </nodes>
    <resources/>
    <constraints/>
  </configuration>
</cib>

Set the default policies

crm(live)configure# property stonith-enabled=false   # disable STONITH (no fencing devices in this lab)
crm(live)configure# property no-quorum-policy=ignore  # keep resources running without quorum (needed for a two-node cluster)

crm(live)configure# show
node a.abc.com
node b.abc.com
property $id="cib-bootstrap-options" \
    dc-version="1.1.5-1.1.el5-01e86afaaa6d4a8c4836f68df80ababd6ca3902f" \
    cluster-infrastructure="openais" \
    expected-quorum-votes="2" \
    stonith-enabled="false"  
crm(live)configure# commit  # changes must be committed before they take effect

Add the virtual IP resource.

crm(live)configure# primitive web_IP ocf:heartbeat:IPaddr params ip=192.168.10.101
crm(live)configure# commit
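
If you also want the cluster to periodically check the address, the primitive could instead be declared with a monitor operation; the interval and timeout values below are only illustrative:

crm(live)configure# primitive web_IP ocf:heartbeat:IPaddr params ip=192.168.10.101 op monitor interval=30s timeout=20s
crm(live)configure# commit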


Now check the cluster resource status
[root@a corosync]# crm status
============
Last updated: Sat Oct  6 22:13:01 2012
Stack: openais
Current DC: a.abc.com - partition with quorum
Version: 1.1.5-1.1.el5-01e86afaaa6d4a8c4836f68df80ababd6ca3902f
2 Nodes configured, 2 expected votes
1 Resources configured.
============

Online: [ a.abc.com b.abc.com ]

web_IP    (ocf::heartbeat:IPaddr):    Started a.abc.com  # the IP resource is running on node a.abc.com

[root@a corosync]# ifconfig eth0:0  # check on the node

eth0:0    Link encap:Ethernet  HWaddr 00:0C:29:2E:BB:4E 
          inet addr:192.168.10.101  Bcast:192.168.10.255  Mask:255.255.255.0
          UP BROADCAST RUNNING MULTICAST  MTU:1500  Metric:1
          Interrupt:67 Base address:0x2000

Install the httpd service on both nodes

yum install httpd
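
Because httpd will be controlled by the cluster through the lsb:httpd agent defined next, it is usually best not to let the init system start it on its own; something like the following on both nodes should suffice (mirroring how drbd is disabled later on):

[root@a ~]# chkconfig httpd off
[root@a ~]# ssh b.abc.com 'chkconfig httpd off'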

crm(live)configure# primitive httpd lsb:httpd   # define the httpd resource

Keep the VIP and the httpd service on the same node

crm(live)configure# group web web_IP httpd  # put the VIP and the httpd resource in one group

crm(live)configure# commit

Check on node a.abc.com

[root@a corosync]# service httpd status
httpd (pid  7592) is running...
 

Check on node b.abc.com

[root@b ~]# service httpd status
httpd is stopped

No virtual IP is present on b.abc.com either.

[root@a corosync]# crm status
============
Last updated: Sat Oct  6 22:33:28 2012
Stack: openais
Current DC: a.abc.com - partition with quorum
Version: 1.1.5-1.1.el5-01e86afaaa6d4a8c4836f68df80ababd6ca3902f
2 Nodes configured, 2 expected votes
1 Resources configured.
============

Online: [ a.abc.com b.abc.com ]

Resource Group: web
     web_IP    (ocf::heartbeat:IPaddr):    Started a.abc.com 
     httpd    (lsb:httpd):    Started a.abc.com

Setting up the DRBD network storage

[root@a ~]# modprobe drbd  # load the DRBD kernel module
[root@a ~]# ssh b.abc.com  'modprobe drbd'

Perform the same partitioning steps on both a.abc.com and b.abc.com

[root@a ~]# fdisk /dev/sda  # create a new partition for the DRBD storage

The number of cylinders for this disk is set to 2610.
There is nothing wrong with that, but this is larger than 1024,
and could in certain setups cause problems with:
1) software that runs at boot time (e.g., old versions of LILO)
2) booting and partitioning software from other OSs
   (e.g., DOS FDISK, OS/2 FDISK)

Command (m for help): n
Command action
   e   extended
   p   primary partition (1-4)
e
Selected partition 4
First cylinder (1420-2610, default 1420):
Using default value 1420
Last cylinder or +size or +sizeM or +sizeK (1420-2610, default 2610):
Using default value 2610

Command (m for help): n
First cylinder (1420-2610, default 1420):
Using default value 1420
Last cylinder or +size or +sizeM or +sizeK (1420-2610, default 2610):
Using default value 2610

Command (m for help): w
The partition table has been altered!

Calling ioctl() to re-read partition table.

WARNING: Re-reading the partition table failed with error 16: Device or resource busy.
The kernel still uses the old table.
The new table will be used at the next reboot.
Syncing disks.
[root@a ~]# partprobe /dev/sda  # make the kernel re-read the partition table

[root@a ~]# fdisk -l  # list the current disk partitions

Disk /dev/sda: 21.4 GB, 21474836480 bytes
255 heads, 63 sectors/track, 2610 cylinders
Units = cylinders of 16065 * 512 = 8225280 bytes

   Device Boot      Start         End      Blocks   Id  System
/dev/sda1   *           1          13      104391   83  Linux
/dev/sda2              14        1288    10241437+  83  Linux
/dev/sda3            1289        1419     1052257+  82  Linux swap / Solaris
/dev/sda4            1420        2610     9566707+   5  Extended
/dev/sda5            1420        2610     9566676   83  Linux

Copy and modify the configuration files

[root@a ~]# cd /usr/share/doc/drbd83-8.3.8/
[root@a drbd83-8.3.8]# ll
total 64
-rw-r--r-- 1 root root 31183 2010-06-02 ChangeLog
-rw-r--r-- 1 root root 17990 2008-11-24 COPYING
-rw-r--r-- 1 root root   133 2010-06-02 drbd.conf
-rw-r--r-- 1 root root    22 2010-06-04 file.list
-rw-r--r-- 1 root root   425 2010-03-02 README
[root@a drbd83-8.3.8]# cp drbd.conf /etc/
cp: overwrite `/etc/drbd.conf'? y

[root@a ~]# cd /etc/drbd.d/

[root@a drbd.d]# vim global_common.conf

1 global {
2 usage-count no;
3 }
4 common {
5 protocol C;
6 handlers {
7 pri-on-incon-degr "/usr/lib/drbd/notify-pri-on-incon-degr.sh; /usr/lib/drbd/notify-emergency-reboot.sh; echo b > /proc/sysrq-trigger ; reboot -f";
8 pri-lost-after-sb "/usr/lib/drbd/notify-pri-lost-after-sb.sh; /usr/lib/drbd/notify-emergency-reboot.sh; echo b > /proc/sysrq-trigger ; reboot -f";
9 local-io-error "/usr/lib/drbd/notify-io-error.sh; /usr/lib/drbd/notify-emergency-shutdown.sh; echo o > /proc/sysrq-trigger ; halt -f";
10 fence-peer "/usr/lib/drbd/crm-fence-peer.sh";
11 split-brain "/usr/lib/drbd/notify-split-brain.sh root";
12 out-of-sync "/usr/lib/drbd/notify-out-of-sync.sh root";
13 before-resync-target "/usr/lib/drbd/snapshot-resync-target-lvm.sh -p 15 -- -c 16k";
14 after-resync-target /usr/lib/drbd/unsnapshot-resync-target-lvm.sh;
15 }
16 startup {
17 wfc-timeout 120;
18 degr-wfc-timeout 120;
19 }
20 disk {
21 on-io-error detach;
22 fencing resource-only;
23 }
24 net {
25 cram-hmac-alg "sha1";
26 shared-secret "mydrdblab";
27 }
28 syncer {
29 rate 100M;
30 }
31 }

[root@a drbd.d]# vim web.res

  1 resource web {
  2        on a.abc.com {
  3        device /dev/drbd0;
  4       disk /dev/sda5;
  5        address 192.168.10.99:7789;
  6        meta-disk  internal;
  7        }
  8        on b.abc.com {
  9        device /dev/drbd0;
10       disk /dev/sda5;
11        address 192.168.10.100:7789;
12        meta-disk  internal;
13        }
14 }

[root@a drbd.d]# scp * b.abc.com:/etc/drbd.d/
global_common.conf                                                     100%  417     0.4KB/s   00:00   
global_common.conf.bak                                                 100% 1418     1.4KB/s   00:00   
web.res                                                                100%  315     0.3KB/s   00:00   

[root@a drbd.d]# drbdadm create-md web   # initialize the resource metadata
Writing meta data...
initializing activity log
NOT initialized bitmap
New drbd meta data block successfully created.
[root@a drbd.d]# ssh b.abc.com 'drbdadm create-md web'
NOT initialized bitmap
Writing meta data...
initializing activity log
New drbd meta data block successfully created.
Start the DRBD service and format the device

[root@a drbd.d]# service drbd start
Starting DRBD resources: [
web
Found valid meta data in the expected location, 9796272128 bytes into /dev/sda5.
d(web) s(web) n(web) ]..........
***************************************************************
DRBD's startup script waits for the peer node(s) to appear.
- In case this node was already a degraded cluster before the
   reboot the timeout is 120 seconds. [degr-wfc-timeout]
- If the peer was available before the reboot the timeout will
   expire after 120 seconds. [wfc-timeout]
   (These values are for resource 'web'; 0 sec -> wait forever)
To abort waiting enter 'yes' [  17]:

[root@b ~]# service drbd start  # start the DRBD service on node b.abc.com at the same time
Starting DRBD resources: [
web
Found valid meta data in the expected location, 9796272128 bytes into /dev/sda5.
d(web) s(web) n(web) ].

[root@a drbd.d]# drbdadm -- --overwrite-data-of-peer primary web

# Only needed the first time, to force this node to become Primary and start the initial sync.
# Afterwards switch roles with: drbdadm primary web  /  drbdadm secondary web

# This sets the current node to Primary or Secondary; in the primary/secondary model only one
# node can be Primary at a time.

# Use the following command to watch the DRBD status, and wait for the initial sync to finish
# before continuing:

[root@a drbd.d]# watch -n 1 'service drbd status'

# The final state looks like this:

[root@a drbd.d]# service drbd status
drbd driver loaded OK; device status:
version: 8.3.8 (api:88/proto:86-94)
GIT-hash: d78846e52224fd00562f7c225bcc25b2d422321d build by mockbuild@builder10.centos.org, 2010-06-04 08:04:16
m:res  cs         ro                 ds                 p  mounted        fstype
0:web  Connected  Primary/Secondary  UpToDate/UpToDate  C  /var/www/html  ext3
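
The roles and sync progress can also be checked directly; a quick optional sketch (output will vary while the initial sync is still running):

[root@a drbd.d]# drbdadm role web      # prints the local/peer roles, e.g. Primary/Secondary
[root@a drbd.d]# cat /proc/drbd        # shows connection state, roles and sync percentage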

[root@a ~]# mkfs -t ext3 /dev/drbd0
mke2fs 1.39 (29-May-2006)
Filesystem label=
OS type: Linux
Block size=4096 (log=2)
Fragment size=4096 (log=2)
1196032 inodes, 2391587 blocks
119579 blocks (5.00%) reserved for the super user
First data block=0
Maximum filesystem blocks=2449473536
73 block groups
32768 blocks per group, 32768 fragments per group
16384 inodes per group
Superblock backups stored on blocks:
    32768, 98304, 163840, 229376, 294912, 819200, 884736, 1605632

Writing inode tables: done                           
Creating journal (32768 blocks):
done
Writing superblocks and filesystem accounting information:
done

This filesystem will be automatically checked every 31 mounts or
180 days, whichever comes first.  Use tune2fs -c or -i to override.
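
Optionally, while this node is still Primary, the new filesystem can be test-mounted and seeded with a page to verify later through the web service; the mount point and file content here are just an example:

[root@a ~]# mount /dev/drbd0 /mnt
[root@a ~]# echo "drbd + pacemaker test page" > /mnt/index.html
[root@a ~]# umount /mnt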

Switch the Primary role to node b.abc.com and verify the DRBD device there (no re-formatting is needed; DRBD has already replicated the filesystem)

[root@a ~]# drbdadm secondary web
[root@a ~]# ssh b.abc.com 'drbdadm primary web'
[root@a ~]# service drbd status
drbd driver loaded OK; device status:
version: 8.3.8 (api:88/proto:86-94)
GIT-hash: d78846e52224fd00562f7c225bcc25b2d422321d build by mockbuild@builder10.centos.org, 2010-06-04 08:04:16
m:res  cs         ro                 ds                 p  mounted  fstype
0:web  Connected  Secondary/Primary  UpToDate/UpToDate  C

Finally, stop the DRBD service and disable it at boot, since from now on the cluster will manage it

[root@a ~]# service drbd stop 
Stopping all DRBD resources: .
[root@a ~]# ssh b.abc.com 'service drbd stop'
Stopping all DRBD resources: .
[root@a ~]# service drbd status
drbd not loaded
[root@a ~]# chkconfig drbd off
[root@a ~]# ssh b.abc.com 'chkconfig drbd off'

[root@a ~]# crm status
============
Last updated: Sun Oct  7 20:39:57 2012
Stack: openais
Current DC: b.abc.com - partition with quorum
Version: 1.1.5-1.1.el5-01e86afaaa6d4a8c4836f68df80ababd6ca3902f
2 Nodes configured, 2 expected votes
2 Resources configured.
============

Online: [ a.abc.com b.abc.com ]

webip    (ocf::heartbeat:IPaddr):    Started b.abc.com
webserver    (lsb:httpd):    Started a.abc.com

Configure resource constraints for the web service

The two web-service resources must run on the same node for the web service to work properly, so they are tied together (here by putting them in one group).

[root@a ~]# crm configure group drbdweb http_IP httpd

# place httpd and the IPaddr resource in one group

Configure the DRBD service for high availability

Define the already configured DRBD device /dev/drbd0 as a cluster service;

Configure DRBD as a cluster resource:
DRBD must run on both nodes at the same time, but only one node can be Master (primary/secondary model) while the other is Slave. It is therefore a special kind of cluster resource: a multi-state (Master/Slave) clone, whose instances are divided into Master and Slave roles, and when the service first starts both nodes are expected to be in the Slave state.

crm(live)configure# primitive httpd_drbd_web ocf:heartbeat:drbd params drbd_resource=web op monitor role=Master interval=60s timeout=40s op monitor role=Slave interval=70s timeout=40s

# Some warning messages may appear here; they are not fatal and can be ignored.

Define the DRBD primitive as a master/slave (multi-state) resource and limit the number of Master instances

crm(live)configure# master MS_Webdrbd httpd_drbd_web meta master-max="1" master-node-max="1" clone-max="2" clone-node-max="1" notify="true"

Define the filesystem resource that mounts the DRBD device on the Master node

crm configure primitive drbd_web_FS ocf:heartbeat:Filesystem params device="/dev/drbd0" directory="/var/www/html" fstype="ext3"

# the filesystem resource

crm configure colocation drbd_web_FS_on_MS_Webdrbd inf: drbd_web_FS MS_Webdrbd:Master

# the filesystem resource must run on the node where MS_Webdrbd is Master

crm  configure  order drbd_web_FS_after_MS_Webdrbd inf: MS_Webdrbd:promote drbd_web_FS:start

# ordering: promote DRBD to Master first, then start the filesystem resource

[root@a ~]# crm status
============
Last updated: Mon Oct  8 09:11:37 2012
Stack: openais
Current DC: a.abc.com - partition with quorum
Version: 1.1.5-1.1.el5-01e86afaaa6d4a8c4836f68df80ababd6ca3902f
2 Nodes configured, 2 expected votes
4 Resources configured.
============

Online: [ a.abc.com b.abc.com ]

Master/Slave Set: MS_Webdrbd [httpd_drbd_web]
     Masters: [ a.abc.com ]
     Slaves: [ b.abc.com ]
WebFS    (ocf::heartbeat:Filesystem):    Started a.abc.com
http_ip    (ocf::heartbeat:IPaddr):    Started a.abc.com
httpd    (lsb:httpd):    Started a.abc.com

[root@a ~]# ssh b.abc.com 'crm status'
============
Last updated: Mon Oct  8 09:12:25 2012
Stack: openais
Current DC: a.abc.com - partition with quorum
Version: 1.1.5-1.1.el5-01e86afaaa6d4a8c4836f68df80ababd6ca3902f
2 Nodes configured, 2 expected votes
4 Resources configured.
============

Online: [ a.abc.com b.abc.com ]

Master/Slave Set: MS_Webdrbd [httpd_drbd_web]
     Masters: [ a.abc.com ]
     Slaves: [ b.abc.com ]
WebFS    (ocf::heartbeat:Filesystem):    Started a.abc.com
http_ip    (ocf::heartbeat:IPaddr):    Started a.abc.com
httpd    (lsb:httpd):    Started a.abc.com

[root@a ~]# mount
/dev/sda2 on / type ext3 (rw)
proc on /proc type proc (rw)
sysfs on /sys type sysfs (rw)
devpts on /dev/pts type devpts (rw,gid=5,mode=620)
/dev/sda1 on /boot type ext3 (rw)
tmpfs on /dev/shm type tmpfs (rw)
none on /proc/sys/fs/binfmt_misc type binfmt_misc (rw)
sunrpc on /var/lib/nfs/rpc_pipefs type rpc_pipefs (rw)
/dev/drbd0 on /var/www/html type ext3 (rw)

[root@a ~]# service httpd status
httpd (pid  14682) is running...
[root@a ~]# ssh b.abc.com 'service httpd status'
httpd is stopped

# Use the command "crm configure edit" to delete the automatically generated constraint that prevents the secondary node from being promoted to Master, shown below:

location drbd-fence-by-handler-MS_Webdrbd MS_Webdrbd \
rule $id="drbd-fence-by-handler-rule-MS_Webdrbd" $role="Master" -inf: #uname ne b.abc.com
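
Instead of editing the CIB by hand, the same constraint can usually be removed from the configure shell by its id (assuming the id matches what crm configure show displays on your cluster):

crm(live)configure# delete drbd-fence-by-handler-MS_Webdrbd
crm(live)configure# commit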

Test resource migration (failover)

[root@a ~]# crm
crm(live)# node standby a.abc.com  # put the active node into standby

[root@b ~]# crm status
============
Last updated: Tue Oct  9 16:22:39 2012
Stack: openais
Current DC: a.abc.com - partition with quorum
Version: 1.1.5-1.1.el5-01e86afaaa6d4a8c4836f68df80ababd6ca3902f
2 Nodes configured, 2 expected votes
3 Resources configured.
============

Node a.abc.com: standby
Online: [ b.abc.com ]

Master/Slave Set: MS_Webdrbd [httpd_drbd_web]
     Masters: [ b.abc.com ]
     Stopped: [ httpd_drbd_web:0 ]
drbd_web_FS    (ocf::heartbeat:Filesystem):    Started b.abc.com
Resource Group: drbdweb
     http_IP    (ocf::heartbeat:IPaddr):    Started b.abc.com
     httpd    (lsb:httpd):    Started b.abc.com

Delete the following two lines

Because resource fencing and split-brain handling were enabled in the /etc/drbd.d/global_common.conf configuration file, a location constraint automatically appears in the crm configuration (CIB): after the Master node fails, the Slave node is forbidden from being promoted to Master, so that no split brain or resource contention occurs when the old Master comes back. Since at this point we only want to verify that the resources can fail over, delete this location constraint:

location drbd-fence-by-handler-MS_Webdrbd MS_Webdrbd \
    rule $id="drbd-fence-by-handler-rule-MS_Webdrbd" $role="Master" -inf: #uname ne b.abc.com

The web service must run on the DRBD Primary node so that DRBD can store the web service's data.

Use a colocation constraint to keep the web-service resources on the DRBD Master node:

crm(live)configure# colocation httpd_on_drbd inf: drbdweb MS_Webdrbd:Master

The web-service resources may start only after the DRBD resources are up, so an ordering constraint is configured between the web service and the DRBD service (the final configuration below comments this order out as unnecessary, since the group already keeps the web resources together).

crm(live)configure# order http_after_drbd inf: drbd_web_FS_after_MS_Webdrbd:promote http_IP_on_httpd:start


[root@a ~]# crm configure edit  # adjust the cluster configuration by hand

node a.abc.com \ 
        attributes standby="off"
node b.abc.com \
        attributes standby="on"  # put b.abc.com into standby

[root@a ~]# crm status
============
Last updated: Mon Oct  8 12:31:00 2012
Stack: openais
Current DC: a.abc.com - partition with quorum
Version: 1.1.5-1.1.el5-01e86afaaa6d4a8c4836f68df80ababd6ca3902f
2 Nodes configured, 2 expected votes
4 Resources configured.
============

Node b.abc.com: standby
Online: [ a.abc.com ]

Master/Slave Set: MS_Webdrbd [httpd_drbd_web]
     Masters: [ a.abc.com ] 
     Stopped: [ httpd_drbd_web:1 ]  # node b.abc.com now shows as Stopped, i.e. it is in standby
drbd_web_FS    (ocf::heartbeat:Filesystem):    Started a.abc.com
http_IP    (ocf::heartbeat:IPaddr):    Started a.abc.com
httpd    (lsb:httpd):    Started a.abc.com

The switchover completes normally.
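
After the test, a standby node can be brought back into the cluster, for example:

[root@a ~]# crm node online b.abc.com    # likewise 'crm node online a.abc.com' after the earlier test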

Some syntax keywords used in crm management

location: which node a resource prefers to stay on

help location    # view the help

Example: location Web_on_node1 Web 500: node1.a.org

order: define the order in which resources start and stop

help order    # view the help

Example: order WebServer_after_WebIP mandatory: WebIP WebServer:start

colocation: whether resources must run together on the same node

help colocation    # view the help
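
In the same style as the location and order examples above (the resource names here are only placeholders):

Example: colocation WebServer_with_WebIP inf: WebServer WebIP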

[root@a drbd.d]# service corosync restart

[root@a drbd.d]# ssh b.abc.com 'service corosync restart'


Finally, here is my full configuration:

[root@a drbd.d]# crm configure show
node a.abc.com \
    attributes standby="off"
node b.abc.com \
    attributes standby="off"
primitive drbd_web_FS ocf:heartbeat:Filesystem \  # the filesystem resource
    params device="/dev/drbd0" directory="/var/www/html" fstype="ext3" \
    meta target-role="Started"
primitive http_IP ocf:heartbeat:IPaddr \   # the IPaddr (VIP) resource
    params ip="192.168.10.101" \
    meta target-role="Started"
primitive httpd lsb:httpd \    # the httpd service resource
    meta target-role="Started"
primitive httpd_drbd_web ocf:heartbeat:drbd \  # the DRBD service resource
    params drbd_resource="web" \
    op monitor interval="60s" role="Master" timeout="40s" \
    op monitor interval="70s" role="Slave" timeout="40s"
group drbdweb http_IP httpd   # put httpd and the IP resource in one group
ms MS_Webdrbd httpd_drbd_web \  # DRBD configured as a multi-state (master/slave) cluster resource
    meta master-max="1" master-node-max="1" clone-max="2" clone-node-max="1" notify="true" target-role="Started"
location drbd-fence-by-handler-MS_Webdrbd MS_Webdrbd \ 

    rule $id="drbd-fence-by-handler-rule-MS_Webdrbd" $role="Master" -inf: #uname ne a.abc.com

# Because resource fencing and split-brain handling are enabled in the DRBD configuration file /etc/drbd.d/global_common.conf, this location constraint appears automatically in the crm configuration (CIB): after the Master node fails, the Slave node is prevented from being promoted to Master, to avoid split brain and resource contention when the old Master returns
colocation drbd_web_FS_on_MS_Webdrbd inf: drbd_web_FS MS_Webdrbd:Master

# The Master node of MS_Webdrbd is the Primary node of the DRBD resource 'web'; only on that node can /dev/drbd0 be mounted and used, and the cluster service also needs to mount it there automatically
colocation httpd_on_drbd inf: drbdweb MS_Webdrbd:Master

# colocation constraint that keeps the web-service resources on the DRBD Master node
order drbd_web_FS_after_MS_Webdrbd inf: MS_Webdrbd:promote drbd_web_FS:start

# this auto-mounted filesystem resource must run on the DRBD Master node and can start only after DRBD has promoted that node to Primary

#order httpd_after_drbd inf: drbd_web_FS_after_MS_Webdrbd:promote http_IP_on_httpd:start 

# this line can be omitted; the group defined earlier already constrains the web resources to the same node

# the web-service resources can start only after all DRBD resources are up, hence the ordering constraint between the web service and the DRBD service
property $id="cib-bootstrap-options" \
    dc-version="1.1.5-1.1.el5-01e86afaaa6d4a8c4836f68df80ababd6ca3902f" \
    cluster-infrastructure="openais" \
    expected-quorum-votes="2" \
    stonith-enabled="false" \  # STONITH disabled (no fencing devices in this lab)
    no-quorum-policy="ignore"  # ignore loss of quorum in this two-node cluster
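
As a final optional sanity check, the live configuration and the resulting resource placement can be inspected with the standard Pacemaker tools:

[root@a drbd.d]# crm_verify -L -V    # check the live CIB for configuration errors
[root@a drbd.d]# crm_mon -1          # one-shot snapshot of node and resource status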