一、环境
$ cat /etc/redhat-release
CentOS Linux release 7.0.1406 (Core)
$ uname -a
Linux zhaopin-2-201 3.10.0-123.el7.x86_64 #1 SMP Mon Jun 30 12:09:22 UTC 2014 x86_64 x86_64 x86_64 GNU/Linux
$ mongo
MongoDB shell version: 3.0.6
connecting to: test
rs0:PRIMARY> rs.status();
{
"set" : "rs0",
"date" : ISODate("2015-09-28T07:00:05.507Z"),
"myState" : 1,
"members" : [
{
"_id" : 0,
"name" : "172.30.2.201:27017",
"health" : 1,
"state" : 1,
"stateStr" : "PRIMARY",
"uptime" : 83,
"optime" : Timestamp(1443423600, 1),
"optimeDate" : ISODate("2015-09-28T07:00:00Z"),
"electionTime" : Timestamp(1443423535, 2),
"electionDate" : ISODate("2015-09-28T06:58:55Z"),
"configVersion" : 3,
"self" : true
},
{
"_id" : 1,
"name" : "172.30.2.203:27017",
"health" : 1,
"state" : 2,
"stateStr" : "SECONDARY",
"uptime" : 44,
"optime" : Timestamp(1443423600, 1),
"optimeDate" : ISODate("2015-09-28T07:00:00Z"),
"lastHeartbeat" : ISODate("2015-09-28T07:00:04.918Z"),
"lastHeartbeatRecv" : ISODate("2015-09-28T07:00:05.042Z"),
"pingMs" : 0,
"syncingTo" : "172.30.2.201:27017",
"configVersion" : 3
},
{
"_id" : 2,
"name" : "172.30.2.202:27017",
"health" : 1,
"state" : 5,
"stateStr" : "STARTUP2",
"uptime" : 4,
"optime" : Timestamp(0, 0),
"optimeDate" : ISODate("1970-01-01T00:00:00Z"),
"lastHeartbeat" : ISODate("2015-09-28T07:00:04.918Z"),
"lastHeartbeatRecv" : ISODate("2015-09-28T07:00:04.926Z"),
"pingMs" : 0,
"configVersion" : 3
}
],
"ok" : 1
}
二、单节点故障
1.primary节点故障
1)关闭primary节点
rs0:PRIMARY> use admin;
switched to db admin
rs0:PRIMARY> db.shutdownServer();
2015-09-28T15:00:51.828+0800 I NETWORK DBClientCursor::init call() failed
server should be down...
2015-09-28T15:00:51.830+0800 I NETWORK trying reconnect to 127.0.0.1:27017 (127.0.0.1) failed
2015-09-28T15:00:51.831+0800 I NETWORK reconnect 127.0.0.1:27017 (127.0.0.1) ok
2015-09-28T15:00:51.831+0800 I NETWORK DBClientCursor::init call() failed
>
bye
2)查看集群状态
$ mongo
MongoDB shell version: 3.0.6
connecting to: test
rs0:SECONDARY> rs.status();
{
"set" : "rs0",
"date" : ISODate("2015-09-28T07:01:28.818Z"),
"myState" : 2,
"members" : [
{
"_id" : 0,
"name" : "172.30.2.201:27017",
"health" : 0,
"state" : 8,
"stateStr" : "(not reachable/healthy)",
"uptime" : 0,
"optime" : Timestamp(0, 0),
"optimeDate" : ISODate("1970-01-01T00:00:00Z"),
"lastHeartbeat" : ISODate("2015-09-28T07:01:27.006Z"),
"lastHeartbeatRecv" : ISODate("2015-09-28T07:00:50.935Z"),
"pingMs" : 0,
"lastHeartbeatMessage" : "Failed attempt to connect to 172.30.2.201:27017; couldn't connect to server 172.30.2.201:27017 (172.30.2.201), connection attempt failed",
"configVersion" : -1
},
{
"_id" : 1,
"name" : "172.30.2.203:27017",
"health" : 1,
"state" : 1,
"stateStr" : "PRIMARY",
"uptime" : 87,
"optime" : Timestamp(1443423600, 1),
"optimeDate" : ISODate("2015-09-28T07:00:00Z"),
"lastHeartbeat" : ISODate("2015-09-28T07:01:26.963Z"),
"lastHeartbeatRecv" : ISODate("2015-09-28T07:01:27.078Z"),
"pingMs" : 0,
"electionTime" : Timestamp(1443423653, 1),
"electionDate" : ISODate("2015-09-28T07:00:53Z"),
"configVersion" : 3
},
{
"_id" : 2,
"name" : "172.30.2.202:27017",
"health" : 1,
"state" : 2,
"stateStr" : "SECONDARY",
"uptime" : 90,
"optime" : Timestamp(1443423600, 1),
"optimeDate" : ISODate("2015-09-28T07:00:00Z"),
"configVersion" : 3,
"self" : true
}
],
"ok" : 1
}
发现集群进行了自动切换,把172.30.2.203:27017选举为了新的primary(当前连接的172.30.2.202:27017仍为secondary)
3)启动原来的primary
$ sudo /opt/mongodb/bin/mongod --config /data/mongodb/conf/db0/mongodb.conf
about to fork child process, waiting until server is ready for connections.
forked process: 25738
child process started successfully, parent exiting
$ mongo
MongoDB shell version: 3.0.6
connecting to: test
rs0:PRIMARY> rs.status();
{
"set" : "rs0",
"date" : ISODate("2015-09-28T07:02:24.312Z"),
"myState" : 1,
"members" : [
{
"_id" : 0,
"name" : "172.30.2.201:27017",
"health" : 1,
"state" : 2,
"stateStr" : "SECONDARY",
"uptime" : 13,
"optime" : Timestamp(1443423600, 1),
"optimeDate" : ISODate("2015-09-28T07:00:00Z"),
"lastHeartbeat" : ISODate("2015-09-28T07:02:23.189Z"),
"lastHeartbeatRecv" : ISODate("2015-09-28T07:02:22.873Z"),
"pingMs" : 0,
"configVersion" : 3
},
{
"_id" : 1,
"name" : "172.30.2.203:27017",
"health" : 1,
"state" : 1,
"stateStr" : "PRIMARY",
"uptime" : 185,
"optime" : Timestamp(1443423600, 1),
"optimeDate" : ISODate("2015-09-28T07:00:00Z"),
"electionTime" : Timestamp(1443423653, 1),
"electionDate" : ISODate("2015-09-28T07:00:53Z"),
"configVersion" : 3,
"self" : true
},
{
"_id" : 2,
"name" : "172.30.2.202:27017",
"health" : 1,
"state" : 2,
"stateStr" : "SECONDARY",
"uptime" : 143,
"optime" : Timestamp(1443423600, 1),
"optimeDate" : ISODate("2015-09-28T07:00:00Z"),
"lastHeartbeat" : ISODate("2015-09-28T07:02:23.103Z"),
"lastHeartbeatRecv" : ISODate("2015-09-28T07:02:22.990Z"),
"pingMs" : 0,
"configVersion" : 3
}
],
"ok" : 1
}
发现原来的primary自动切为了secondary
2.secondary节点故障
1)关闭secondary节点
$ mongo
MongoDB shell version: 3.0.6
connecting to: test
rs0:SECONDARY> use admin;
switched to db admin
rs0:SECONDARY> db.shutdownServer();
2015-09-28T15:04:39.064+0800 I NETWORK DBClientCursor::init call() failed
server should be down...
2015-09-28T15:04:39.066+0800 I NETWORK trying reconnect to 127.0.0.1:27017 (127.0.0.1) failed
2015-09-28T15:04:39.067+0800 W NETWORK Failed to connect to 127.0.0.1:27017, reason: errno:111 Connection refused
2015-09-28T15:04:39.067+0800 I NETWORK reconnect 127.0.0.1:27017 (127.0.0.1) failed failed couldn't connect to server 127.0.0.1:27017 (127.0.0.1), connection attempt failed
2015-09-28T15:04:39.070+0800 I NETWORK trying reconnect to 127.0.0.1:27017 (127.0.0.1) failed
2015-09-28T15:04:39.070+0800 W NETWORK Failed to connect to 127.0.0.1:27017, reason: errno:111 Connection refused
2015-09-28T15:04:39.070+0800 I NETWORK reconnect 127.0.0.1:27017 (127.0.0.1) failed failed couldn't connect to server 127.0.0.1:27017 (127.0.0.1), connection attempt failed
>
bye
2)查看集群状态
$ mongo
MongoDB shell version: 3.0.6
connecting to: test
rs0:PRIMARY> rs.status();
{
"set" : "rs0",
"date" : ISODate("2015-09-28T07:05:12.140Z"),
"myState" : 1,
"members" : [
{
"_id" : 0,
"name" : "172.30.2.201:27017",
"health" : 1,
"state" : 2,
"stateStr" : "SECONDARY",
"uptime" : 180,
"optime" : Timestamp(1443423600, 1),
"optimeDate" : ISODate("2015-09-28T07:00:00Z"),
"lastHeartbeat" : ISODate("2015-09-28T07:05:11.265Z"),
"lastHeartbeatRecv" : ISODate("2015-09-28T07:05:10.951Z"),
"pingMs" : 0,
"configVersion" : 3
},
{
"_id" : 1,
"name" : "172.30.2.203:27017",
"health" : 1,
"state" : 1,
"stateStr" : "PRIMARY",
"uptime" : 353,
"optime" : Timestamp(1443423600, 1),
"optimeDate" : ISODate("2015-09-28T07:00:00Z"),
"electionTime" : Timestamp(1443423653, 1),
"electionDate" : ISODate("2015-09-28T07:00:53Z"),
"configVersion" : 3,
"self" : true
},
{
"_id" : 2,
"name" : "172.30.2.202:27017",
"health" : 0,
"state" : 8,
"stateStr" : "(not reachable/healthy)",
"uptime" : 0,
"optime" : Timestamp(0, 0),
"optimeDate" : ISODate("1970-01-01T00:00:00Z"),
"lastHeartbeat" : ISODate("2015-09-28T07:05:11.226Z"),
"lastHeartbeatRecv" : ISODate("2015-09-28T07:04:37.055Z"),
"pingMs" : 0,
"lastHeartbeatMessage" : "Failed attempt to connect to 172.30.2.202:27017; couldn't connect to server 172.30.2.202:27017 (172.30.2.202), connection attempt failed",
"configVersion" : -1
}
],
"ok" : 1
}
可见单个secondary节点故障对集群没有影响
3)再启动secondary
$ sudo /opt/mongodb/bin/mongod --config /data/mongodb/conf/db0/mongodb.conf
about to fork child process, waiting until server is ready for connections.
forked process: 49507
child process started successfully, parent exiting
$ mongo
MongoDB shell version: 3.0.6
connecting to: test
rs0:SECONDARY> rs.status();
{
"set" : "rs0",
"date" : ISODate("2015-09-28T07:06:41.733Z"),
"myState" : 2,
"members" : [
{
"_id" : 0,
"name" : "172.30.2.201:27017",
"health" : 1,
"state" : 2,
"stateStr" : "SECONDARY",
"uptime" : 12,
"optime" : Timestamp(1443423600, 1),
"optimeDate" : ISODate("2015-09-28T07:00:00Z"),
"lastHeartbeat" : ISODate("2015-09-28T07:06:40.999Z"),
"lastHeartbeatRecv" : ISODate("2015-09-28T07:06:41.233Z"),
"pingMs" : 0,
"lastHeartbeatMessage" : "could not find member to sync from",
"configVersion" : 3
},
{
"_id" : 1,
"name" : "172.30.2.203:27017",
"health" : 1,
"state" : 1,
"stateStr" : "PRIMARY",
"uptime" : 12,
"optime" : Timestamp(1443423600, 1),
"optimeDate" : ISODate("2015-09-28T07:00:00Z"),
"lastHeartbeat" : ISODate("2015-09-28T07:06:40.999Z"),
"lastHeartbeatRecv" : ISODate("2015-09-28T07:06:41.360Z"),
"pingMs" : 0,
"electionTime" : Timestamp(1443423653, 1),
"electionDate" : ISODate("2015-09-28T07:00:53Z"),
"configVersion" : 3
},
{
"_id" : 2,
"name" : "172.30.2.202:27017",
"health" : 1,
"state" : 2,
"stateStr" : "SECONDARY",
"uptime" : 13,
"optime" : Timestamp(1443423600, 1),
"optimeDate" : ISODate("2015-09-28T07:00:00Z"),
"configVersion" : 3,
"self" : true
}
],
"ok" : 1
}
重新启动后又重新连上了集群
三、多节点故障
1.primary和secondary节点同时故障
1)停掉一个secondary节点
$ mongo
MongoDB shell version: 3.0.6
connecting to: test
rs0:SECONDARY> use admin;
switched to db admin
rs0:SECONDARY> db.shutdownServer();
2015-09-28T15:10:43.049+0800 I NETWORK DBClientCursor::init call() failed
server should be down...
2015-09-28T15:10:43.051+0800 I NETWORK trying reconnect to 127.0.0.1:27017 (127.0.0.1) failed
2015-09-28T15:10:43.052+0800 W NETWORK Failed to connect to 127.0.0.1:27017, reason: errno:111 Connection refused
2015-09-28T15:10:43.052+0800 I NETWORK reconnect 127.0.0.1:27017 (127.0.0.1) failed failed couldn't connect to server 127.0.0.1:27017 (127.0.0.1), connection attempt failed
2015-09-28T15:10:43.055+0800 I NETWORK trying reconnect to 127.0.0.1:27017 (127.0.0.1) failed
2015-09-28T15:10:43.055+0800 W NETWORK Failed to connect to 127.0.0.1:27017, reason: errno:111 Connection refused
2015-09-28T15:10:43.055+0800 I NETWORK reconnect 127.0.0.1:27017 (127.0.0.1) failed failed couldn't connect to server 127.0.0.1:27017 (127.0.0.1), connection attempt failed
2)停掉primary节点
$ mongo
MongoDB shell version: 3.0.6
connecting to: test
rs0:PRIMARY> use admin;
switched to db admin
rs0:PRIMARY> db.shutdownServer();
2015-09-28T15:10:53.069+0800 I NETWORK DBClientCursor::init call() failed
server should be down...
2015-09-28T15:10:53.072+0800 I NETWORK trying reconnect to 127.0.0.1:27017 (127.0.0.1) failed
2015-09-28T15:10:53.073+0800 I NETWORK reconnect 127.0.0.1:27017 (127.0.0.1) ok
2015-09-28T15:10:53.073+0800 I NETWORK DBClientCursor::init call() failed
2015-09-28T15:10:53.076+0800 I NETWORK trying reconnect to 127.0.0.1:27017 (127.0.0.1) failed
2015-09-28T15:10:53.076+0800 I NETWORK reconnect 127.0.0.1:27017 (127.0.0.1) ok
2015-09-28T15:10:53.888+0800 I NETWORK Socket recv() errno:104 Connection reset by peer 127.0.0.1:27017
2015-09-28T15:10:53.888+0800 I NETWORK SocketException: remote: 127.0.0.1:27017 error: 9001 socket exception [RECV_ERROR] server [127.0.0.1:27017]
2015-09-28T15:10:53.888+0800 I NETWORK DBClientCursor::init call() failed
3)查看集群状态
$ mongo
MongoDB shell version: 3.0.6
connecting to: test
rs0:SECONDARY> rs.status();
{
"set" : "rs0",
"date" : ISODate("2015-09-28T07:12:10.946Z"),
"myState" : 2,
"members" : [
{
"_id" : 0,
"name" : "172.30.2.201:27017",
"health" : 1,
"state" : 2,
"stateStr" : "SECONDARY",
"uptime" : 600,
"optime" : Timestamp(1443423600, 1),
"optimeDate" : ISODate("2015-09-28T07:00:00Z"),
"configVersion" : 3,
"self" : true
},
{
"_id" : 1,
"name" : "172.30.2.203:27017",
"health" : 0,
"state" : 8,
"stateStr" : "(not reachable/healthy)",
"uptime" : 0,
"optime" : Timestamp(0, 0),
"optimeDate" : ISODate("1970-01-01T00:00:00Z"),
"lastHeartbeat" : ISODate("2015-09-28T07:12:10.008Z"),
"lastHeartbeatRecv" : ISODate("2015-09-28T07:10:51.422Z"),
"pingMs" : 0,
"lastHeartbeatMessage" : "Failed attempt to connect to 172.30.2.203:27017; couldn't connect to server 172.30.2.203:27017 (172.30.2.203), connection attempt failed",
"configVersion" : -1
},
{
"_id" : 2,
"name" : "172.30.2.202:27017",
"health" : 0,
"state" : 8,
"stateStr" : "(not reachable/healthy)",
"uptime" : 0,
"optime" : Timestamp(0, 0),
"optimeDate" : ISODate("1970-01-01T00:00:00Z"),
"lastHeartbeat" : ISODate("2015-09-28T07:12:09.477Z"),
"lastHeartbeatRecv" : ISODate("2015-09-28T07:10:41.112Z"),
"pingMs" : 0,
"lastHeartbeatMessage" : "Failed attempt to connect to 172.30.2.202:27017; couldn't connect to server 172.30.2.202:27017 (172.30.2.202), connection attempt failed",
"configVersion" : -1
}
],
"ok" : 1
}
只剩下一个secondary节点,存活成员(1/3)不足半数以上,无法选举出新的primary,集群变得不可用了(无法写入)
4)解决方案
重新配置:
rs0:SECONDARY> cfg={_id:"rs0", members:[ {_id:0,host:"172.30.2.201:27017"}] }
{
"_id" : "rs0",
"members" : [
{
"_id" : 0,
"host" : "172.30.2.201:27017"
}
]
}
rs0:SECONDARY> rs.reconfig(cfg, {force:true});
{ "ok" : 1 }
rs0:PRIMARY> rs.status();
{
"set" : "rs0",
"date" : ISODate("2015-09-28T07:14:09.350Z"),
"myState" : 1,
"members" : [
{
"_id" : 0,
"name" : "172.30.2.201:27017",
"health" : 1,
"state" : 1,
"stateStr" : "PRIMARY",
"uptime" : 719,
"optime" : Timestamp(1443423600, 1),
"optimeDate" : ISODate("2015-09-28T07:00:00Z"),
"electionTime" : Timestamp(1443424428, 1),
"electionDate" : ISODate("2015-09-28T07:13:48Z"),
"configVersion" : 71840,
"self" : true
}
],
"ok" : 1
}
此时就变成了单primary节点,可以提供读写服务,之后再把secondary节点重新添加进副本集
2.两个secondary节点故障
1)故障前状态
rs0:PRIMARY> rs.status();
{
"set" : "rs0",
"date" : ISODate("2015-09-28T07:16:06.571Z"),
"myState" : 1,
"members" : [
{
"_id" : 0,
"name" : "172.30.2.201:27017",
"health" : 1,
"state" : 1,
"stateStr" : "PRIMARY",
"uptime" : 836,
"optime" : Timestamp(1443424538, 1),
"optimeDate" : ISODate("2015-09-28T07:15:38Z"),
"electionTime" : Timestamp(1443424534, 1),
"electionDate" : ISODate("2015-09-28T07:15:34Z"),
"configVersion" : 71842,
"self" : true
},
{
"_id" : 1,
"name" : "172.30.2.202:27017",
"health" : 1,
"state" : 2,
"stateStr" : "SECONDARY",
"uptime" : 31,
"optime" : Timestamp(1443424538, 1),
"optimeDate" : ISODate("2015-09-28T07:15:38Z"),
"lastHeartbeat" : ISODate("2015-09-28T07:16:06.230Z"),
"lastHeartbeatRecv" : ISODate("2015-09-28T07:16:06.089Z"),
"pingMs" : 0,
"configVersion" : 71842
},
{
"_id" : 2,
"name" : "172.30.2.203:27017",
"health" : 1,
"state" : 2,
"stateStr" : "SECONDARY",
"uptime" : 26,
"optime" : Timestamp(1443424538, 1),
"optimeDate" : ISODate("2015-09-28T07:15:38Z"),
"lastHeartbeat" : ISODate("2015-09-28T07:16:06.229Z"),
"lastHeartbeatRecv" : ISODate("2015-09-28T07:16:06.233Z"),
"pingMs" : 0,
"configVersion" : 71842
}
],
"ok" : 1
}
2)停掉两个secondary节点
在两个secondary节点分别执行:
$ mongo
MongoDB shell version: 3.0.6
connecting to: test
rs0:SECONDARY> use admin;
switched to db admin
rs0:SECONDARY> db.shutdownServer();
2015-09-28T15:18:11.114+0800 I NETWORK DBClientCursor::init call() failed
server should be down...
2015-09-28T15:18:11.117+0800 I NETWORK trying reconnect to 127.0.0.1:27017 (127.0.0.1) failed
2015-09-28T15:18:11.118+0800 W NETWORK Failed to connect to 127.0.0.1:27017, reason: errno:111 Connection refused
2015-09-28T15:18:11.118+0800 I NETWORK reconnect 127.0.0.1:27017 (127.0.0.1) failed failed couldn't connect to server 127.0.0.1:27017 (127.0.0.1), connection attempt failed
2015-09-28T15:18:11.121+0800 I NETWORK trying reconnect to 127.0.0.1:27017 (127.0.0.1) failed
2015-09-28T15:18:11.121+0800 W NETWORK Failed to connect to 127.0.0.1:27017, reason: errno:111 Connection refused
2015-09-28T15:18:11.121+0800 I NETWORK reconnect 127.0.0.1:27017 (127.0.0.1) failed failed couldn't connect to server 127.0.0.1:27017 (127.0.0.1), connection attempt failed
>
bye
3)查看集群状态
rs0:SECONDARY> rs.status();
{
"set" : "rs0",
"date" : ISODate("2015-09-28T07:19:09.196Z"),
"myState" : 2,
"members" : [
{
"_id" : 0,
"name" : "172.30.2.201:27017",
"health" : 1,
"state" : 2,
"stateStr" : "SECONDARY",
"uptime" : 1019,
"optime" : Timestamp(1443424538, 1),
"optimeDate" : ISODate("2015-09-28T07:15:38Z"),
"configVersion" : 71842,
"self" : true
},
{
"_id" : 1,
"name" : "172.30.2.202:27017",
"health" : 0,
"state" : 8,
"stateStr" : "(not reachable/healthy)",
"uptime" : 0,
"optime" : Timestamp(0, 0),
"optimeDate" : ISODate("1970-01-01T00:00:00Z"),
"lastHeartbeat" : ISODate("2015-09-28T07:19:08.371Z"),
"lastHeartbeatRecv" : ISODate("2015-09-28T07:18:10.147Z"),
"pingMs" : 0,
"lastHeartbeatMessage" : "Failed attempt to connect to 172.30.2.202:27017; couldn't connect to server 172.30.2.202:27017 (172.30.2.202), connection attempt failed",
"configVersion" : -1
},
{
"_id" : 2,
"name" : "172.30.2.203:27017",
"health" : 0,
"state" : 8,
"stateStr" : "(not reachable/healthy)",
"uptime" : 0,
"optime" : Timestamp(0, 0),
"optimeDate" : ISODate("1970-01-01T00:00:00Z"),
"lastHeartbeat" : ISODate("2015-09-28T07:19:08.350Z"),
"lastHeartbeatRecv" : ISODate("2015-09-28T07:18:34.298Z"),
"pingMs" : 0,
"lastHeartbeatMessage" : "Failed attempt to connect to 172.30.2.203:27017; couldn't connect to server 172.30.2.203:27017 (172.30.2.203), connection attempt failed",
"configVersion" : -1
}
],
"ok" : 1
}
可见剩下的primary节点因为联系不到半数以上的成员,自动降级为了secondary节点,集群变得不可用了
4)解决方案
rs0:SECONDARY> cfg={_id:"rs0", members:[ {_id:0,host:"172.30.2.201:27017"}] }
{
"_id" : "rs0",
"members" : [
{
"_id" : 0,
"host" : "172.30.2.201:27017"
}
]
}
rs0:SECONDARY> rs.reconfig(cfg, {force:true});
{ "ok" : 1 }
rs0:PRIMARY> rs.status();
{
"set" : "rs0",
"date" : ISODate("2015-09-28T07:20:08.099Z"),
"myState" : 1,
"members" : [
{
"_id" : 0,
"name" : "172.30.2.201:27017",
"health" : 1,
"state" : 1,
"stateStr" : "PRIMARY",
"uptime" : 1078,
"optime" : Timestamp(1443424538, 1),
"optimeDate" : ISODate("2015-09-28T07:15:38Z"),
"electionTime" : Timestamp(1443424795, 1),
"electionDate" : ISODate("2015-09-28T07:19:55Z"),
"configVersion" : 127342,
"self" : true
}
],
"ok" : 1
}
处理方法和上面的相同,也是强制将剩下的secondary节点配置为单primary节点