MongoDB复制集
优/特点
支持大数据量、高扩展性、高性能、灵活数据模型、高可用性。
同步机制
数据复制的目的是使数据得到最大的可用性,避免单点故障引起的整站不能访问的情况发生。MongoDB的副本集在同一时刻只有一台服务器是可以写的,副本集的主从复制也是一个异步同步的过程:slave端从primary端获取日志,然后在自己身上完全按顺序地执行日志所记录的各种操作(该日志不记录查询操作)。这个日志就是local数据库中的oplog.rs表,默认在64位机器上这个表是比较大的,占磁盘大小的5%。oplog.rs的大小可以在启动参数中设定:--oplogSize 1000,单位是M。

鉴于双机房容灾意外情况可能发生,本方案选择故障时人工介入转移或恢复,不加入仲裁节点。其中A机房为主机房,含1个Primary+2个Secondary节点;B机房作为灾备机房,含2个Secondary节点。最坏情况下当主机房挂掉时,通过权重来启动B机房某节点为Primary,继续提供服务。
环境规划
A机房 角色 B机房 角色
192.168.70.214 Primary 192.168.71.214 Secondary 3 复制集节点 3
192.168.70.215 Secondary 1 复制集节点 1 192.168.71.215 Secondary 4 复制集节点 4
192.168.70.216 Secondary 2 复制集节点 2

架构示意图
其中下面是主机房断电断网时的故障转移示意图。

安装配置

这里所有节点目录创建一致,方便管理维护,从配置文件来判断各节点的角色。
创建目录
--为MongoDB创建软件、数据、日志目录,默认情况下它将数据存储在/mgdata
[root@test153 /]# mkdir -p /mgdb/mongodbtest/replset/data
[root@test153 /]# mkdir /mgdata
[root@test153 /]# mkdir /mglog
上传介质
sftp> cd /mgdb
sftp> put mongodb-linux-x86_64-2.2.3.tgz.tar

解压

[root@test153 /]# cd /mgdb
$ tar -xvf mongodb-linux-x86_64-2.2.3.tgz.tar
[root@test153 mgdb]# mv mongodb-linux-x86_64-2.2.3 mongodb

服务启动

每个节点都要执行
cd /root/mongodb/bin
192.168.70.214
/root/mongodb/bin/mongod --replSet repset --port 27017 --dbpath /root/data27011 --oplogSize 2048 --logpath /root/log27011/log27011.log &
./mongo 192.168.70.214:27017
192.168.70.215
/root/mongodb/bin/mongod --replSet repset --port 27017 --dbpath /root/data27012 --oplogSize 2048 --logpath /root/log27012/log27012.log &
./mongo 192.168.70.215:27017
192.168.70.216
/root/mongodb/bin/mongod --replSet repset --port 27017 --dbpath /root/data27013 --oplogSize 2048 --logpath /root/log27013/log27013.log &
./mongo 192.168.70.216:27017
192.168.71.214
/root/mongodb/bin/mongod --replSet repset --port 27017 --dbpath /root/data27017 --oplogSize 2048 --logpath /root/log27017/log27017.log &
./mongo 192.168.71.214:27017
192.168.71.215
/root/mongodb/bin/mongod --replSet repset --port 27017 --dbpath /root/data27018 --oplogSize 2048 --logpath /root/log27018/log27018.log &
./mongo 192.168.71.215:27017
分别通过 tail -f /root/log27011/log27011.log 来观察分析各节点运行情况
复制集配置
在任何一台mongodb实例上登录,进入admin库,执行config命令,配置相应权重
[root@localhost bin]# pwd
/root/mongodb/bin
[root@localhost bin]# ./mongo 192.168.70.214:27017
MongoDB shell version: 2.2.3
connecting to: test
Welcome to the MongoDB shell.
For interactive help, type "help".
For more comprehensive documentation, see
http://docs.mongodb.org/
Questions? Try the support group
 http://groups.google.com/group/mongodb-user
> use admin
switched to db admin
> config = { _id:"repset", members:[
... {_id:0,host:"192.168.70.214:27017",priority:10},
... {_id:1,host:"192.168.70.215:27017",priority:7},
... {_id:2,host:"192.168.70.216:27017",priority:6},
... {_id:3,host:"192.168.71.214:27017",priority:9},
... {_id:4,host:"192.168.71.215:27017",priority:8}]
... }
{
 "_id" : "repset",
 "members" : [
 {
 "_id" : 0,
 "host" : "192.168.70.214:27017",
 "priority" : 10
 },
 {
 "_id" : 1,
 "host" : "192.168.70.215:27017",
 "priority" : 7
 },
 {
 "_id" : 2,
 "host" : "192.168.70.216:27017",
 "priority" : 6
 },
 {
 "_id" : 3,
 "host" : "192.168.71.214:27017",
 "priority" : 9
 },
 {
 "_id" : 4,
 "host" : "192.168.71.215:27017",
 "priority" : 8
 }
 ]
}
--查看
repset:PRIMARY> rs.conf()
{
 "_id" : "repset",
 "version" : 38349,
 "members" : [
 {
 "_id" : 4,
 "host" : "192.168.71.214:27017",
 "priority" : 9
 },
 {
 "_id" : 5,
 "host" : "192.168.71.215:27017",
 "priority" : 8
 },
 {
 "_id" : 6,
 "host" : "192.168.70.214:27017",
 "priority" : 10
 },
 {
 "_id" : 7,
 "host" : "192.168.70.215:27017",
 "priority" : 7
 },
 {
 "_id" : 8,
 "host" : "192.168.70.216:27017",
 "priority" : 6
 }
 ]
}
初始化副本集配置
> rs.initiate(config);
{
 "info" : "Config now saved locally. Should come online in about a minute.",
 "ok" : 1
}
初始需要一点时间同步
查看集群节点状态
repset:PRIMARY> rs.status()
{
 "set" : "repset",
 "date" : ISODate("2018-11-09T07:55:04Z"),
 "myState" : 1,
 "members" : [
 {
 "_id" : 4,
 "name" : "192.168.71.214:27017",
 "health" : 1,
 "state" : 2,
 "stateStr" : "SECONDARY",
 "uptime" : 1104,
 "optime" : Timestamp(1541749003000, 1),
 "optimeDate" : ISODate("2018-11-09T07:36:43Z"),
 "lastHeartbeat" : ISODate("2018-11-09T07:55:03Z"),
 "pingMs" : 0
 },
 {
 "_id" : 5,
 "name" : "192.168.71.215:27017",
 "health" : 1,
 "state" : 2,
 "stateStr" : "SECONDARY",
 "uptime" : 1104,
 "optime" : Timestamp(1541749003000, 1),
 "optimeDate" : ISODate("2018-11-09T07:36:43Z"),
 "lastHeartbeat" : ISODate("2018-11-09T07:55:03Z"),
 "pingMs" : 0
 },
 {
 "_id" : 6,
 "name" : "192.168.70.214:27017",
 "health" : 1,
 "state" : 1,
 "stateStr" : "PRIMARY",
 "uptime" : 1680,
 "optime" : Timestamp(1541749003000, 1),
 "optimeDate" : ISODate("2018-11-09T07:36:43Z"),
 "self" : true
 },
 {
 "_id" : 7,
 "name" : "192.168.70.215:27017",
 "health" : 1,
 "state" : 2,
 "stateStr" : "SECONDARY",
 "uptime" : 1104,
 "optime" : Timestamp(1541749003000, 1),
 "optimeDate" : ISODate("2018-11-09T07:36:43Z"),
 "lastHeartbeat" : ISODate("2018-11-09T07:55:03Z"),
 "pingMs" : 0
 },
 {
 "_id" : 8,
 "name" : "192.168.70.216:27017",
 "health" : 1,
 "state" : 2,
 "stateStr" : "SECONDARY",
 "uptime" : 1104,
 "optime" : Timestamp(1541749003000, 1),
 "optimeDate" : ISODate("2018-11-09T07:36:43Z"),
 "lastHeartbeat" : ISODate("2018-11-09T07:55:03Z"),
 "pingMs" : 0
 }
 ],
 "ok" : 1
}
repset:PRIMARY>
查看后台日志
[root@oracle_master ~]# tail -f /mgdata/mongodb/log27017/mongod.log
验证复制集数据一致性
先进去主库primary的mongodb上,录入数据
如
repset:PRIMARY> use dinpay
switched to db dinpay
repset:PRIMARY> db.dinpay.insert({"test1108":"xiawu1"})
repset:PRIMARY> db.getMongo().setSlaveOk();
去另一个备库上验证数据
repset:SECONDARY> db.dinpay.find()
{ "_id" : ObjectId("5bd676e97e238f7b0dddfb0d"), "MongoDB TEST" : "dinpay" }
{ "_id" : ObjectId("5bd823b65b237ec32e664db2"), "mdbtest" : "zgy20181030" }
{ "_id" : ObjectId("5be53e2c60074628c8509830"), "test1108" : "xiawu1" }
断电断网模拟
断电:直接kill mongodb进程
断网:开启某一机房的防火墙限制机房间通讯
B机房断电断网
192.168.71.214、192.168.71.215断电断网后各节点状态
repset:PRIMARY> rs.status()
{
 "set" : "repset",
 "date" : ISODate("2018-11-09T08:02:36Z"),
 "myState" : 1,
 "members" : [
 {
 "_id" : 4,
 "name" : "192.168.71.214:27017",
 "health" : 0,
 "state" : 8,
 "stateStr" : "(not reachable/healthy)",
 "uptime" : 0,
 "optime" : Timestamp(1541750316000, 1),
 "optimeDate" : ISODate("2018-11-09T07:58:36Z"),
 "lastHeartbeat" : ISODate("2018-11-09T08:01:59Z"),
 "pingMs" : 0,
 "errmsg" : "socket exception [CONNECT_ERROR] for 192.168.71.214:27017"
 },
 {
 "_id" : 5,
 "name" : "192.168.71.215:27017",
 "health" : 0,
 "state" : 8,
 "stateStr" : "(not reachable/healthy)",
 "uptime" : 0,
 "optime" : Timestamp(1541750316000, 1),
 "optimeDate" : ISODate("2018-11-09T07:58:36Z"),
 "lastHeartbeat" : ISODate("2018-11-09T08:01:57Z"),
 "pingMs" : 0,
 "errmsg" : "socket exception [CONNECT_ERROR] for 192.168.71.215:27017"
 },
 {
 "_id" : 6,
 "name" : "192.168.70.214:27017",
 "health" : 1,
 "state" : 1,
 "stateStr" : "PRIMARY",
 "uptime" : 2132,
 "optime" : Timestamp(1541750316000, 1),
 "optimeDate" : ISODate("2018-11-09T07:58:36Z"),
 "self" : true
 },
 {
 "_id" : 7,
 "name" : "192.168.70.215:27017",
 "health" : 1,
 "state" : 2,
 "stateStr" : "SECONDARY",
 "uptime" : 1556,
 "optime" : Timestamp(1541750316000, 1),
 "optimeDate" : ISODate("2018-11-09T07:58:36Z"),
 "lastHeartbeat" : ISODate("2018-11-09T08:02:35Z"),
 "pingMs" : 0
 },
 {
 "_id" : 8,
 "name" : "192.168.70.216:27017",
 "health" : 1,
 "state" : 2,
 "stateStr" : "SECONDARY",
 "uptime" : 1556,
 "optime" : Timestamp(1541750316000, 1),
 "optimeDate" : ISODate("2018-11-09T07:58:36Z"),
 "lastHeartbeat" : ISODate("2018-11-09T08:02:35Z"),
 "pingMs" : 0
 }
 ],
 "ok" : 1
}
repset:PRIMARY>
结论:A机房运行正常。
A机房断电断网
192.168.70.214(PRI)、192.168.70.215、192.168.70.216 
登录B机房任一台节点强制reconfig恢复副本集,仅保留活动着的节点
repset:SECONDARY> use admin
switched to db admin
--查看现有配置,其中70网段3个节点都已死掉了
repset:SECONDARY> cfg=rs.conf()
{
 "_id" : "repset",
 "version" : 79,
 "members" : [
 {
 "_id" : 4,
 "host" : "192.168.71.214:27017",
 "priority" : 10
 },
 {
 "_id" : 5,
 "host" : "192.168.71.215:27017",
 "priority" : 9
 },
 {
 "_id" : 7,
 "host" : "192.168.70.214:27017",
 "priority" : 11
 },
 {
 "_id" : 8,
 "host" : "192.168.70.215:27017",
 "priority" : 6
 },
 {
 "_id" : 13,
 "host" : "192.168.70.216:27017",
 "priority" : 5
 }
 ]
}
--只保留活着的节点
repset:SECONDARY> cfg.members = [cfg.members[0], cfg.members[1]]
[
 {
 "_id" : 4,
 "host" : "192.168.71.214:27017",
 "priority" : 10
 },
 {
 "_id" : 5,
 "host" : "192.168.71.215:27017",
 "priority" : 9
 }
]
--强制启动并新产生一个PRIMARY组成2节点的备份集
repset:SECONDARY> rs.reconfig(cfg, {force :true })
{ "ok" : 1 }
repset:SECONDARY> rs.status()
{
 "set" : "repset",
 "date" : ISODate("2018-11-09T03:45:29Z"),
 "myState" : 1,
 "members" : [
 {
 "_id" : 4,
 "name" : "192.168.71.214:27017",
 "health" : 1,
 "state" : 1,
 "stateStr" : "PRIMARY",
 "uptime" : 69133,
 "optime" : Timestamp(1541663971000, 1),
 "optimeDate" : ISODate("2018-11-08T07:59:31Z"),
 "self" : true
 },
 {
 "_id" : 5,
 "name" : "192.168.71.215:27017",
 "health" : 1,
 "state" : 2,
 "stateStr" : "SECONDARY",
 "uptime" : 8,
 "optime" : Timestamp(1541663971000, 1),
 "optimeDate" : ISODate("2018-11-08T07:59:31Z"),
 "lastHeartbeat" : ISODate("2018-11-09T03:45:29Z"),
 "pingMs" : 0
 }
 ],
 "ok" : 1
}
repset:PRIMARY> 
--检查数据,先前数据仍存在
repset:PRIMARY> use test
switched to db test
repset:PRIMARY> show collections
system.indexes
test
Testdb
--状态查询
repset:PRIMARY> rs.conf()
{
 "_id" : "repset",
 "version" : 38342,
 "members" : [
 {
 "_id" : 4,
 "host" : "192.168.71.214:27017",
 "priority" : 10
 },
 {
 "_id" : 5,
 "host" : "192.168.71.215:27017",
 "priority" : 9
 }
 ]
}
结论:B机房强制启动,变成新的备份集
尽管已产生新的备份集,但不能排除断电断网瞬间旧PRIMARY上存在尚未同步至各从节点的数据而导致数据丢失的可能性。
恢复初始状态
Kill并重启A机房各节点mongodb
192.168.70.214
/root/mongodb/bin/mongod --replSet repset --port 27017 --dbpath /root/data27011 --oplogSize 2048 --logpath /root/log27011/log27011.log &
192.168.70.215
/root/mongodb/bin/mongod --replSet repset --port 27017 --dbpath /root/data27012 --oplogSize 2048 --logpath /root/log27012/log27012.log &
192.168.70.216
/root/mongodb/bin/mongod --replSet repset --port 27017 --dbpath /root/data27013 --oplogSize 2048 --logpath /root/log27013/log27013.log &
将A机房各节点加入新备份集(B机房),并对A机房某节点提权升为新的PRIMARY,恢复至断电断网前的状态
repset:PRIMARY> use admin
switched to db admin
repset:PRIMARY> cfg=rs.conf()
repset:PRIMARY> cfg.members[XX].priority = 8
8
repset:PRIMARY> rs.reconfig(cfg)
主机房挂了再恢复测试(比上面的恢复步骤更详细)
--A机房(主机房) 192.168.70.214(主)/192.168.70.215/192.168.70.216断电断网,B机房强制重启后成为了新的集群,现在将2机房重新回到初始状态,首先要确认之前各节点都是什么角色
--加节点
--设权重
repset:PRIMARY> use admin
switched to db admin
repset:PRIMARY> rs.add("192.168.70.214:27017")
{ "ok" : 1 }
repset:PRIMARY> rs.add("192.168.70.215:27017")
{ "ok" : 1 }
repset:PRIMARY> rs.add("192.168.70.216:27017")
{ "ok" : 1 }
repset:PRIMARY> 
repset:PRIMARY> cfg=rs.conf()
{
 "_id" : "repset",
 "version" : 63044,
 "members" : [
 {
 "_id" : 4,
 "host" : "192.168.71.214:27017",
 "priority" : 9
 },
 {
 "_id" : 5,
 "host" : "192.168.71.215:27017",
 "priority" : 8
 },
 {
 "_id" : 6,
 "host" : "192.168.70.214:27017"
 },
 {
 "_id" : 7,
 "host" : "192.168.70.215:27017"
 },
 {
 "_id" : 8,
 "host" : "192.168.70.216:27017"
 }
 ]
}
repset:PRIMARY> cfg.members[2].priority = 11
11
repset:PRIMARY> cfg.members[3].priority = 6
6
repset:PRIMARY> cfg.members[4].priority = 5
5
repset:PRIMARY> rs.reconfig(cfg)
Mon Nov 12 16:04:10 DBClientCursor::init call() failed
Mon Nov 12 16:04:10 query failed : admin.$cmd { replSetReconfig: { _id: "repset", version: 63045, members: [ { _id: 4, host: "192.168.71.214:27017", priority: 9.0 }, { _id: 5, host: "192.168.71.215:27017", priority: 8.0 }, { _id: 6, host: "192.168.70.214:27017", priority: 11.0 }, { _id: 7, host: "192.168.70.215:27017", priority: 6.0 }, { _id: 8, host: "192.168.70.216:27017", priority: 5.0 } ] } } to: 192.168.71.214:27017
Mon Nov 12 16:04:10 trying reconnect to 192.168.71.214:27017
Mon Nov 12 16:04:10 reconnect 192.168.71.214:27017 ok
reconnected to server after rs command (which is normal)repset:PRIMARY> 
Mon Nov 12 16:04:29 Socket recv() errno:104 Connection reset by peer 192.168.71.214:27017
Mon Nov 12 16:04:29 SocketException: remote: 192.168.71.214:27017 error: 9001 socket exception [1] server [192.168.71.214:27017] 
Mon Nov 12 16:04:29 DBClientCursor::init call() failed
Mon Nov 12 16:04:29 query failed : admin.$cmd { replSetGetStatus: 1.0, forShell: 1.0 } to: 192.168.71.214:27017
> 
Mon Nov 12 16:04:37 trying reconnect to 192.168.71.214:27017
Mon Nov 12 16:04:37 reconnect 192.168.71.214:27017 ok
repset:SECONDARY> 
repset:SECONDARY> rs.conf()
{
 "_id" : "repset",
 "version" : 63045,
 "members" : [
 {
 "_id" : 4,
 "host" : "192.168.71.214:27017",
 "priority" : 9
 },
 {
 "_id" : 5,
 "host" : "192.168.71.215:27017",
 "priority" : 8
 },
 {
 "_id" : 6,
 "host" : "192.168.70.214:27017",
 "priority" : 11
 },
 {
 "_id" : 7,
 "host" : "192.168.70.215:27017",
 "priority" : 6
 },
 {
 "_id" : 8,
 "host" : "192.168.70.216:27017",
 "priority" : 5
 }
 ]
}
repset:SECONDARY> 
repset:SECONDARY> rs.status()
{
 "set" : "repset",
 "date" : ISODate("2018-11-12T08:05:16Z"),
 "myState" : 2,
 "syncingTo" : "192.168.70.214:27017",
 "members" : [
 {
 "_id" : 4,
 "name" : "192.168.71.214:27017",
 "health" : 1,
 "state" : 2,
 "stateStr" : "SECONDARY",
 "uptime" : 19426,
 "optime" : Timestamp(1542009850000, 1),
 "optimeDate" : ISODate("2018-11-12T08:04:10Z"),
 "errmsg" : "syncing to: 192.168.70.214:27017",
 "self" : true
 },
 {
 "_id" : 5,
 "name" : "192.168.71.215:27017",
 "health" : 1,
 "state" : 2,
 "stateStr" : "SECONDARY",
 "uptime" : 50,
 "optime" : Timestamp(1542009850000, 1),
 "optimeDate" : ISODate("2018-11-12T08:04:10Z"),
 "lastHeartbeat" : ISODate("2018-11-12T08:05:14Z"),
 "pingMs" : 0,
 "errmsg" : "syncing to: 192.168.70.214:27017"
 },
 {
 "_id" : 6,
 "name" : "192.168.70.214:27017",
 "health" : 1,
 "state" : 1,
 "stateStr" : "PRIMARY",
 "uptime" : 64,
 "optime" : Timestamp(1542009850000, 1),
 "optimeDate" : ISODate("2018-11-12T08:04:10Z"),
 "lastHeartbeat" : ISODate("2018-11-12T08:05:14Z"),
 "pingMs" : 1
 },
 {
 "_id" : 7,
 "name" : "192.168.70.215:27017",
 "health" : 1,
 "state" : 2,
 "stateStr" : "SECONDARY",
 "uptime" : 64,
 "optime" : Timestamp(1542009850000, 1),
 "optimeDate" : ISODate("2018-11-12T08:04:10Z"),
 "lastHeartbeat" : ISODate("2018-11-12T08:05:14Z"),
 "pingMs" : 0,
 "errmsg" : "syncing to: 192.168.70.214:27017"
 },
 {
 "_id" : 8,
 "name" : "192.168.70.216:27017",
 "health" : 1,
 "state" : 2,
 "stateStr" : "SECONDARY",
 "uptime" : 64,
 "optime" : Timestamp(1542009850000, 1),
 "optimeDate" : ISODate("2018-11-12T08:04:10Z"),
 "lastHeartbeat" : ISODate("2018-11-12T08:05:14Z"),
 "pingMs" : 1,
 "errmsg" : "syncing to: 192.168.70.214:27017"
 }
 ],
 "ok" : 1
}
repset:SECONDARY>