一、环境
$ cat /etc/redhat-release
CentOS Linux release 7.0.1406 (Core)
$ uname -a
Linux zhaopin-2-201 3.10.0-123.el7.x86_64 #1 SMP Mon Jun 30 12:09:22 UTC 2014 x86_64 x86_64 x86_64 GNU/Linux
$ mongo --port 30000
MongoDB shell version: 3.0.6
connecting to: 127.0.0.1:30000/test
mongos> sh.status();
--- Sharding Status ---
sharding version: {
"_id" : 1,
"minCompatibleVersion" : 5,
"currentVersion" : 6,
"clusterId" : ObjectId("561728b4030ea038bcb57fa0")
}
shards:
{ "_id" : "sh0", "host" : "sh0/172.30.2.201:27010" }
{ "_id" : "sh1", "host" : "sh1/172.30.2.201:27011" }
balancer:
Currently enabled: yes
Currently running: no
Failed balancer rounds in last 5 attempts: 0
Migration Results for the last 24 hours:
No recent migrations
databases:
{ "_id" : "admin", "partitioned" : false, "primary" : "config" }
{ "_id" : "mydb", "partitioned" : true, "primary" : "sh0" }
二、准备
插入测试数据
$ mongo --port 30000
MongoDB shell version: 3.0.6
connecting to: 127.0.0.1:30000/test
mongos> use mydb
switched to db mydb
mongos> sh.shardCollection("mydb.test",{"Uid":1});
{ "collectionsharded" : "mydb.test", "ok" : 1 }
mongos> for (var i = 1; i <= 100000; i++) db.test.save({"Uid":i,"Name":"zhanjindong","Age":13,"Date":new Date()});
WriteResult({ "nInserted" : 1 })
mongos> use config;
switched to db config
mongos> db.chunks.find().limit(3);
{ "_id" : "mydb.test-Uid_MinKey", "lastmod" : Timestamp(10, 1), "lastmodEpoch" : ObjectId("56178dad030ea038bcb592c2"), "ns" : "mydb.test", "min" : { "Uid" : { "$minKey" : 1 } }, "max" : { "Uid" : 2 }, "shard" : "sh0" }
{ "_id" : "mydb.test-Uid_2.0", "lastmod" : Timestamp(1, 2), "lastmodEpoch" : ObjectId("56178dad030ea038bcb592c2"), "ns" : "mydb.test", "min" : { "Uid" : 2 }, "max" : { "Uid" : 10 }, "shard" : "sh0" }
{ "_id" : "mydb.test-Uid_10.0", "lastmod" : Timestamp(11, 1), "lastmodEpoch" : ObjectId("56178dad030ea038bcb592c2"), "ns" : "mydb.test", "min" : { "Uid" : 10 }, "max" : { "Uid" : 4691 }, "shard" : "sh1" }
三、Shard Server节点故障
1.分片中副本集的节点故障
参考《MongoDB副本集故障测试和解决方案》。Query Routers(mongos)会自动感知副本集配置的变化,因此只需要在副本集内部解决故障即可
2.分片故障
分片故障是指分片中的节点全部不可用了(如果分片中配置了副本集的话,基本是不会出现这种情况的)
1)将sh0停机
$ ps aux | grep mongo
wenhang+ 1493 0.0 0.0 112640 976 pts/0 S+ 10:42 0:00 grep --color=auto mongo
root 41492 0.5 0.1 1045696 118336 ? Sl Oct09 8:12 /opt/mongodb/bin/mongod --config /data/mongodb/conf/sh0/mongodb.conf
root 41509 0.5 0.1 1028196 108720 ? Sl Oct09 7:30 /opt/mongodb/bin/mongod --config /data/mongodb/conf/sh1/mongodb.conf
root 41855 0.6 0.0 566248 55208 ? Sl Oct09 9:14 /opt/mongodb/bin/mongod --config /data/mongodb/conf/cf0/config.conf
root 41870 0.5 0.0 565220 56180 ? Sl Oct09 8:29 /opt/mongodb/bin/mongod --config /data/mongodb/conf/cf1/config.conf
root 42170 0.5 0.0 565216 56872 ? Sl Oct09 8:25 /opt/mongodb/bin/mongod --config /data/mongodb/conf/cf2/config.conf
root 42233 0.3 0.0 260180 13200 ? Sl Oct09 5:40 /opt/mongodb/bin/mongos --config /data/mongodb/conf/ms0/mongos.conf
root 42286 0.3 0.0 259148 11564 ? Sl Oct09 4:31 /opt/mongodb/bin/mongos --config /data/mongodb/conf/ms1/mongos.conf
$ ps aux | grep mongo | grep -v grep
root 41492 0.5 0.1 1045696 118336 ? Sl Oct09 8:12 /opt/mongodb/bin/mongod --config /data/mongodb/conf/sh0/mongodb.conf
root 41509 0.5 0.1 1028196 108720 ? Sl Oct09 7:31 /opt/mongodb/bin/mongod --config /data/mongodb/conf/sh1/mongodb.conf
root 41855 0.6 0.0 566248 54988 ? Sl Oct09 9:14 /opt/mongodb/bin/mongod --config /data/mongodb/conf/cf0/config.conf
root 41870 0.5 0.1 565220 72392 ? Sl Oct09 8:29 /opt/mongodb/bin/mongod --config /data/mongodb/conf/cf1/config.conf
root 42170 0.5 0.0 565216 56776 ? Sl Oct09 8:25 /opt/mongodb/bin/mongod --config /data/mongodb/conf/cf2/config.conf
root 42233 0.3 0.0 260180 13200 ? Sl Oct09 5:40 /opt/mongodb/bin/mongos --config /data/mongodb/conf/ms0/mongos.conf
root 42286 0.3 0.0 259148 11564 ? Sl Oct09 4:31 /opt/mongodb/bin/mongos --config /data/mongodb/conf/ms1/mongos.conf
$ sudo kill 41492
$ ps aux | grep mongo | grep -v grep
root 41509 0.5 0.1 1028196 108736 ? Sl Oct09 7:31 /opt/mongodb/bin/mongod --config /data/mongodb/conf/sh1/mongodb.conf
root 41855 0.6 0.1 566248 87832 ? Sl Oct09 9:14 /opt/mongodb/bin/mongod --config /data/mongodb/conf/cf0/config.conf
root 41870 0.5 0.1 565220 89116 ? Sl Oct09 8:29 /opt/mongodb/bin/mongod --config /data/mongodb/conf/cf1/config.conf
root 42170 0.5 0.0 565216 56852 ? Sl Oct09 8:26 /opt/mongodb/bin/mongod --config /data/mongodb/conf/cf2/config.conf
root 42233 0.3 0.0 260180 13200 ? Sl Oct09 5:40 /opt/mongodb/bin/mongos --config /data/mongodb/conf/ms0/mongos.conf
root 42286 0.3 0.0 259148 11564 ? Sl Oct09 4:31 /opt/mongodb/bin/mongos --config /data/mongodb/conf/ms1/mongos.conf
2)查看集群状态
$ mongo --port 30000
MongoDB shell version: 3.0.6
connecting to: 127.0.0.1:30000/test
mongos> sh.status();
--- Sharding Status ---
sharding version: {
"_id" : 1,
"minCompatibleVersion" : 5,
"currentVersion" : 6,
"clusterId" : ObjectId("561728b4030ea038bcb57fa0")
}
shards:
{ "_id" : "sh0", "host" : "sh0/172.30.2.201:27010" }
{ "_id" : "sh1", "host" : "sh1/172.30.2.201:27011" }
balancer:
Currently enabled: yes
Currently running: no
Failed balancer rounds in last 5 attempts: 5
Last reported error: socket exception [CONNECT_ERROR] for sh0/172.30.2.201:27010
Time of Reported error: Sat Oct 10 2015 10:44:56 GMT+0800 (CST)
Migration Results for the last 24 hours:
19 : Success
2 : Failed with error 'migration already in progress', from sh0 to sh1
databases:
{ "_id" : "admin", "partitioned" : false, "primary" : "config" }
{ "_id" : "mydb", "partitioned" : true, "primary" : "sh0" }
mydb.test
shard key: { "Uid" : 1 }
chunks:
sh0 11
sh1 10
too many chunks to print, use verbose if you want to force print
3)测试可用性
mongos> db.test.find({Uid:12});
{ "_id" : ObjectId("56178e473af16d338338d3bf"), "Uid" : 12, "Name" : "zhanjindong", "Age" : 13, "Date" : ISODate("2015-10-09T09:52:07.205Z") }
mongos> db.test.find({Uid:8});
Error: error: {
"$err" : "socket exception [CONNECT_ERROR] for sh0/172.30.2.201:27010",
"code" : 11002,
"shard" : "sh0"
}
mongos> db.test.insert({"Uid":12,"Name":"zhanjindong","Age":13,"Date":new Date()});
WriteResult({ "nInserted" : 1 })
mongos> db.test.insert({"Uid":8,"Name":"zhanjindong","Age":13,"Date":new Date()});
WriteResult({
"nInserted" : 0,
"writeError" : {
"code" : 7,
"errmsg" : "could not contact primary for replica set sh0"
}
})
mongos>
bye
涉及到sh0的操作都会报错,其他分片的操作可以正常执行
重新启动sh0
$ sudo /opt/mongodb/bin/mongod --config /data/mongodb/conf/sh0/mongodb.conf
about to fork child process, waiting until server is ready for connections.
forked process: 2534
child process started successfully, parent exiting
$ ps aux | grep mongo | grep -v grep
root 2534 1.1 0.0 971800 64060 ? Sl 10:53 0:00 /opt/mongodb/bin/mongod --config /data/mongodb/conf/sh0/mongodb.conf
root 41509 0.5 0.1 1028196 108804 ? Sl Oct09 7:33 /opt/mongodb/bin/mongod --config /data/mongodb/conf/sh1/mongodb.conf
root 41855 0.6 0.0 566248 55572 ? Sl Oct09 9:17 /opt/mongodb/bin/mongod --config /data/mongodb/conf/cf0/config.conf
root 41870 0.5 0.0 565220 56480 ? Sl Oct09 8:33 /opt/mongodb/bin/mongod --config /data/mongodb/conf/cf1/config.conf
root 42170 0.5 0.0 565216 57240 ? Sl Oct09 8:29 /opt/mongodb/bin/mongod --config /data/mongodb/conf/cf2/config.conf
root 42233 0.3 0.0 260180 13388 ? Sl Oct09 5:42 /opt/mongodb/bin/mongos --config /data/mongodb/conf/ms0/mongos.conf
root 42286 0.3 0.0 259148 11572 ? Sl Oct09 4:32 /opt/mongodb/bin/mongos --config /data/mongodb/conf/ms1/mongos.conf
$ mongo --port 30000
MongoDB shell version: 3.0.6
connecting to: 127.0.0.1:30000/test
mongos> use mydb;
switched to db mydb
mongos> db.test.find({Uid:8});
{ "_id" : ObjectId("56178e453af16d338338d3bb"), "Uid" : 8, "Name" : "zhanjindong", "Age" : 13, "Date" : ISODate("2015-10-09T09:52:05.539Z") }
mongos> db.test.find({Uid:12});
{ "_id" : ObjectId("56178e473af16d338338d3bf"), "Uid" : 12, "Name" : "zhanjindong", "Age" : 13, "Date" : ISODate("2015-10-09T09:52:07.205Z") }
{ "_id" : ObjectId("56187d6743c2e9badb0ca53b"), "Uid" : 12, "Name" : "zhanjindong", "Age" : 13, "Date" : ISODate("2015-10-10T02:52:23.497Z") }
mongos> db.test.insert({"Uid":8,"Name":"zhanjindong","Age":13,"Date":new Date()});
WriteResult({ "nInserted" : 1 })
两个分片都可读写
四、Config Server故障
1.单节点故障
1)停掉cf0
$ ps axu | grep mongo | grep -v grep
root 2534 0.4 0.1 982052 67608 ? Sl 10:53 0:03 /opt/mongodb/bin/mongod --config /data/mongodb/conf/sh0/mongodb.conf
root 41509 0.5 0.1 1028196 108808 ? Sl Oct09 7:37 /opt/mongodb/bin/mongod --config /data/mongodb/conf/sh1/mongodb.conf
root 41855 0.6 0.0 566248 55444 ? Sl Oct09 9:23 /opt/mongodb/bin/mongod --config /data/mongodb/conf/cf0/config.conf
root 41870 0.5 0.0 565220 56456 ? Sl Oct09 8:38 /opt/mongodb/bin/mongod --config /data/mongodb/conf/cf1/config.conf
root 42170 0.5 0.0 565216 56820 ? Sl Oct09 8:34 /opt/mongodb/bin/mongod --config /data/mongodb/conf/cf2/config.conf
root 42233 0.3 0.0 261204 13408 ? Sl Oct09 5:45 /opt/mongodb/bin/mongos --config /data/mongodb/conf/ms0/mongos.conf
root 42286 0.3 0.0 259148 11596 ? Sl Oct09 4:35 /opt/mongodb/bin/mongos --config /data/mongodb/conf/ms1/mongos.conf
[wenhang.pan@zhaopin-2-201 ~]$ sudo kill 41855
[wenhang.pan@zhaopin-2-201 ~]$ ps axu | grep mongo | grep -v grep
root 2534 0.4 0.1 982052 67636 ? Sl 10:53 0:03 /opt/mongodb/bin/mongod --config /data/mongodb/conf/sh0/mongodb.conf
root 41509 0.5 0.1 1028196 108808 ? Sl Oct09 7:37 /opt/mongodb/bin/mongod --config /data/mongodb/conf/sh1/mongodb.conf
root 41870 0.5 0.0 565220 56324 ? Sl Oct09 8:38 /opt/mongodb/bin/mongod --config /data/mongodb/conf/cf1/config.conf
root 42170 0.5 0.0 565216 56600 ? Sl Oct09 8:34 /opt/mongodb/bin/mongod --config /data/mongodb/conf/cf2/config.conf
root 42233 0.3 0.0 261204 13408 ? Sl Oct09 5:45 /opt/mongodb/bin/mongos --config /data/mongodb/conf/ms0/mongos.conf
root 42286 0.3 0.0 259148 11596 ? Sl Oct09 4:35 /opt/mongodb/bin/mongos --config /data/mongodb/conf/ms1/mongos.conf
2)查看状态
$ mongo --port 30000
MongoDB shell version: 3.0.6
connecting to: 127.0.0.1:30000/test
mongos> sh.status();
--- Sharding Status ---
sharding version: {
"_id" : 1,
"minCompatibleVersion" : 5,
"currentVersion" : 6,
"clusterId" : ObjectId("561728b4030ea038bcb57fa0")
}
shards:
{ "_id" : "sh0", "host" : "sh0/172.30.2.201:27010" }
{ "_id" : "sh1", "host" : "sh1/172.30.2.201:27011" }
balancer:
Currently enabled: yes
Currently running: yes
Balancer lock taken at Sat Oct 10 2015 11:08:14 GMT+0800 (CST) by zhaopin-2-201:30000:1444358323:1804289383:Balancer:1681692777
Failed balancer rounds in last 5 attempts: 0
Migration Results for the last 24 hours:
19 : Success
2 : Failed with error 'migration already in progress', from sh0 to sh1
databases:
{ "_id" : "admin", "partitioned" : false, "primary" : "config" }
{ "_id" : "mydb", "partitioned" : true, "primary" : "sh0" }
mydb.test
shard key: { "Uid" : 1 }
chunks:
sh0 11
sh1 10
too many chunks to print, use verbose if you want to force print
3)测试可用性
mongos> use mydb;
switched to db mydb
mongos> db.test.find({Uid:12});
{ "_id" : ObjectId("56178e473af16d338338d3bf"), "Uid" : 12, "Name" : "zhanjindong", "Age" : 13, "Date" : ISODate("2015-10-09T09:52:07.205Z") }
{ "_id" : ObjectId("56187d6743c2e9badb0ca53b"), "Uid" : 12, "Name" : "zhanjindong", "Age" : 13, "Date" : ISODate("2015-10-10T02:52:23.497Z") }
mongos> db.test.find({Uid:8});
{ "_id" : ObjectId("56178e453af16d338338d3bb"), "Uid" : 8, "Name" : "zhanjindong", "Age" : 13, "Date" : ISODate("2015-10-09T09:52:05.539Z") }
{ "_id" : ObjectId("56187e117ec37f9bb7244496"), "Uid" : 8, "Name" : "zhanjindong", "Age" : 13, "Date" : ISODate("2015-10-10T02:55:13.529Z") }
mongos> db.test.insert({"Uid":8,"Name":"zhanjindong","Age":13,"Date":new Date()});
WriteResult({ "nInserted" : 1 })
mongos> db.test.insert({"Uid":12,"Name":"zhanjindong","Age":13,"Date":new Date()});
WriteResult({ "nInserted" : 1 })
mongos>
bye
集群可正常使用
2.双节点故障
1)在cf0已停止的基础上,再停掉cf1
$ ps axu | grep mongo | grep -v grep
root 2534 0.4 0.1 982052 68128 ? Sl 10:53 0:05 /opt/mongodb/bin/mongod --config /data/mongodb/conf/sh0/mongodb.conf
root 41509 0.5 0.1 1028196 108892 ? Sl Oct09 7:39 /opt/mongodb/bin/mongod --config /data/mongodb/conf/sh1/mongodb.conf
root 41870 0.5 0.1 565220 92328 ? Sl Oct09 8:40 /opt/mongodb/bin/mongod --config /data/mongodb/conf/cf1/config.conf
root 42170 0.5 0.1 565216 90392 ? Sl Oct09 8:36 /opt/mongodb/bin/mongod --config /data/mongodb/conf/cf2/config.conf
root 42233 0.3 0.0 261204 13508 ? Sl Oct09 5:46 /opt/mongodb/bin/mongos --config /data/mongodb/conf/ms0/mongos.conf
root 42286 0.3 0.0 260172 11716 ? Sl Oct09 4:37 /opt/mongodb/bin/mongos --config /data/mongodb/conf/ms1/mongos.conf
$ sudo kill 41870
$ ps axu | grep mongo | grep -v grep
root 2534 0.4 0.1 982052 68128 ? Sl 10:53 0:05 /opt/mongodb/bin/mongod --config /data/mongodb/conf/sh0/mongodb.conf
root 41509 0.5 0.1 1028196 108892 ? Sl Oct09 7:39 /opt/mongodb/bin/mongod --config /data/mongodb/conf/sh1/mongodb.conf
root 42170 0.5 0.1 565216 90408 ? Sl Oct09 8:36 /opt/mongodb/bin/mongod --config /data/mongodb/conf/cf2/config.conf
root 42233 0.3 0.0 261204 13508 ? Sl Oct09 5:46 /opt/mongodb/bin/mongos --config /data/mongodb/conf/ms0/mongos.conf
root 42286 0.3 0.0 260172 11716 ? Sl Oct09 4:37 /opt/mongodb/bin/mongos --config /data/mongodb/conf/ms1/mongos.conf
2)查看集群状态
$ mongo --port 30000
MongoDB shell version: 3.0.6
connecting to: 127.0.0.1:30000/test
mongos> sh.status();
--- Sharding Status ---
sharding version: {
"_id" : 1,
"minCompatibleVersion" : 5,
"currentVersion" : 6,
"clusterId" : ObjectId("561728b4030ea038bcb57fa0")
}
shards:
{ "_id" : "sh0", "host" : "sh0/172.30.2.201:27010" }
{ "_id" : "sh1", "host" : "sh1/172.30.2.201:27011" }
balancer:
Currently enabled: yes
Currently running: yes
Balancer lock taken at Sat Oct 10 2015 11:08:14 GMT+0800 (CST) by zhaopin-2-201:30000:1444358323:1804289383:Balancer:1681692777
Failed balancer rounds in last 5 attempts: 0
Migration Results for the last 24 hours:
19 : Success
2 : Failed with error 'migration already in progress', from sh0 to sh1
databases:
{ "_id" : "admin", "partitioned" : false, "primary" : "config" }
{ "_id" : "mydb", "partitioned" : true, "primary" : "sh0" }
mydb.test
shard key: { "Uid" : 1 }
chunks:
sh0 11
sh1 10
too many chunks to print, use verbose if you want to force print
3)测试可用性
mongos> use mydb;
switched to db mydb
mongos> db.test.find({Uid:8});
{ "_id" : ObjectId("56178e453af16d338338d3bb"), "Uid" : 8, "Name" : "zhanjindong", "Age" : 13, "Date" : ISODate("2015-10-09T09:52:05.539Z") }
{ "_id" : ObjectId("56187e117ec37f9bb7244496"), "Uid" : 8, "Name" : "zhanjindong", "Age" : 13, "Date" : ISODate("2015-10-10T02:55:13.529Z") }
{ "_id" : ObjectId("5618824d7bd66d65a90fb01c"), "Uid" : 8, "Name" : "zhanjindong", "Age" : 13, "Date" : ISODate("2015-10-10T03:13:17.564Z") }
mongos> db.test.find({Uid:12});
{ "_id" : ObjectId("56178e473af16d338338d3bf"), "Uid" : 12, "Name" : "zhanjindong", "Age" : 13, "Date" : ISODate("2015-10-09T09:52:07.205Z") }
{ "_id" : ObjectId("56187d6743c2e9badb0ca53b"), "Uid" : 12, "Name" : "zhanjindong", "Age" : 13, "Date" : ISODate("2015-10-10T02:52:23.497Z") }
{ "_id" : ObjectId("561882527bd66d65a90fb01d"), "Uid" : 12, "Name" : "zhanjindong", "Age" : 13, "Date" : ISODate("2015-10-10T03:13:22.594Z") }
mongos> db.test.insert({"Uid":12,"Name":"zhanjindong","Age":13,"Date":new Date()});
WriteResult({ "nInserted" : 1 })
mongos> db.test.insert({"Uid":8,"Name":"zhanjindong","Age":13,"Date":new Date()});
WriteResult({ "nInserted" : 1 })
mongos>
bye
集群可以正常使用
4)重新启动cf0和cf1
$ ps axu | grep mongo | grep -v grep
root 2534 0.4 0.1 982052 68228 ? Sl 10:53 0:05 /opt/mongodb/bin/mongod --config /data/mongodb/conf/sh0/mongodb.conf
root 41509 0.5 0.1 1028196 107976 ? Sl Oct09 7:39 /opt/mongodb/bin/mongod --config /data/mongodb/conf/sh1/mongodb.conf
root 42170 0.5 0.1 565216 93140 ? Sl Oct09 8:37 /opt/mongodb/bin/mongod --config /data/mongodb/conf/cf2/config.conf
root 42233 0.3 0.0 261204 13540 ? Sl Oct09 5:47 /opt/mongodb/bin/mongos --config /data/mongodb/conf/ms0/mongos.conf
root 42286 0.3 0.0 260172 11752 ? Sl Oct09 4:37 /opt/mongodb/bin/mongos --config /data/mongodb/conf/ms1/mongos.conf
$ sudo /opt/mongodb/bin/mongod --config /data/mongodb/conf/cf0/config.conf
about to fork child process, waiting until server is ready for connections.
forked process: 7309
child process started successfully, parent exiting
$ sudo /opt/mongodb/bin/mongod --config /data/mongodb/conf/cf1/config.conf
about to fork child process, waiting until server is ready for connections.
forked process: 7355
child process started successfully, parent exiting
$ ps axu | grep mongo | grep -v grep
root 2534 0.4 0.1 982052 68228 ? Sl 10:53 0:06 /opt/mongodb/bin/mongod --config /data/mongodb/conf/sh0/mongodb.conf
root 7309 0.6 0.0 545696 37504 ? Sl 11:17 0:00 /opt/mongodb/bin/mongod --config /data/mongodb/conf/cf0/config.conf
root 7355 0.5 0.0 542616 33748 ? Sl 11:17 0:00 /opt/mongodb/bin/mongod --config /data/mongodb/conf/cf1/config.conf
root 41509 0.5 0.1 1028196 108188 ? Sl Oct09 7:40 /opt/mongodb/bin/mongod --config /data/mongodb/conf/sh1/mongodb.conf
root 42170 0.5 0.0 565216 57992 ? Sl Oct09 8:37 /opt/mongodb/bin/mongod --config /data/mongodb/conf/cf2/config.conf
root 42233 0.3 0.0 261204 13540 ? Sl Oct09 5:47 /opt/mongodb/bin/mongos --config /data/mongodb/conf/ms0/mongos.conf
root 42286 0.3 0.0 260172 11768 ? Sl Oct09 4:38 /opt/mongodb/bin/mongos --config /data/mongodb/conf/ms1/mongos.conf
5)解决方案
参考:
http://docs.mongodb.org/manual/tutorial/replace-config-server/
http://docs.mongodb.org/manual/tutorial/migrate-config-servers-with-different-hostnames/
五、Query Routers故障
1)将ms0停机
$ ps axu | grep mongo | grep -v grep
root 2534 0.4 0.1 982052 68228 ? Sl 10:53 0:06 /opt/mongodb/bin/mongod --config /data/mongodb/conf/sh0/mongodb.conf
root 7309 0.6 0.0 545696 37504 ? Sl 11:17 0:00 /opt/mongodb/bin/mongod --config /data/mongodb/conf/cf0/config.conf
root 7355 0.5 0.0 542616 33748 ? Sl 11:17 0:00 /opt/mongodb/bin/mongod --config /data/mongodb/conf/cf1/config.conf
root 41509 0.5 0.1 1028196 108188 ? Sl Oct09 7:40 /opt/mongodb/bin/mongod --config /data/mongodb/conf/sh1/mongodb.conf
root 42170 0.5 0.0 565216 57992 ? Sl Oct09 8:37 /opt/mongodb/bin/mongod --config /data/mongodb/conf/cf2/config.conf
root 42233 0.3 0.0 261204 13540 ? Sl Oct09 5:47 /opt/mongodb/bin/mongos --config /data/mongodb/conf/ms0/mongos.conf
root 42286 0.3 0.0 260172 11768 ? Sl Oct09 4:38 /opt/mongodb/bin/mongos --config /data/mongodb/conf/ms1/mongos.conf
$ sudo kill 42233
$ ps axu | grep mongo | grep -v grep
root 2534 0.4 0.1 982052 68408 ? Sl 10:53 0:06 /opt/mongodb/bin/mongod --config /data/mongodb/conf/sh0/mongodb.conf
root 7309 0.6 0.0 547752 39096 ? Sl 11:17 0:01 /opt/mongodb/bin/mongod --config /data/mongodb/conf/cf0/config.conf
root 7355 0.6 0.0 544672 34256 ? Sl 11:17 0:01 /opt/mongodb/bin/mongod --config /data/mongodb/conf/cf1/config.conf
root 41509 0.5 0.1 1028196 108164 ? Sl Oct09 7:40 /opt/mongodb/bin/mongod --config /data/mongodb/conf/sh1/mongodb.conf
root 42170 0.5 0.0 565216 57796 ? Sl Oct09 8:38 /opt/mongodb/bin/mongod --config /data/mongodb/conf/cf2/config.conf
root 42286 0.3 0.0 260172 11768 ? Sl Oct09 4:38 /opt/mongodb/bin/mongos --config /data/mongodb/conf/ms1/mongos.conf
2)查看集群状态
$ mongo --port 30000
MongoDB shell version: 3.0.6
connecting to: 127.0.0.1:30000/test
2015-10-10T11:21:04.681+0800 W NETWORK Failed to connect to 127.0.0.1:30000, reason: errno:111 Connection refused
2015-10-10T11:21:04.683+0800 E QUERY Error: couldn't connect to server 127.0.0.1:30000 (127.0.0.1), connection attempt failed
at connect (src/mongo/shell/mongo.js:181:14)
at (connect):1:6 at src/mongo/shell/mongo.js:181
exception: connect failed
$ mongo --port 30001
MongoDB shell version: 3.0.6
connecting to: 127.0.0.1:30001/test
mongos> sh.status();
--- Sharding Status ---
sharding version: {
"_id" : 1,
"minCompatibleVersion" : 5,
"currentVersion" : 6,
"clusterId" : ObjectId("561728b4030ea038bcb57fa0")
}
shards:
{ "_id" : "sh0", "host" : "sh0/172.30.2.201:27010" }
{ "_id" : "sh1", "host" : "sh1/172.30.2.201:27011" }
balancer:
Currently enabled: yes
Currently running: no
Failed balancer rounds in last 5 attempts: 0
Migration Results for the last 24 hours:
19 : Success
2 : Failed with error 'migration already in progress', from sh0 to sh1
databases:
{ "_id" : "admin", "partitioned" : false, "primary" : "config" }
{ "_id" : "mydb", "partitioned" : true, "primary" : "sh0" }
mydb.test
shard key: { "Uid" : 1 }
chunks:
sh0 11
sh1 10
too many chunks to print, use verbose if you want to force print
mongos>
bye
可见30000端口的mongos已经宕机,不能提供服务了,但30001端口的mongos可以使用
3)测试可用性
mongos> use mydb;
switched to db mydb
mongos> db.test.find({Uid:12});
{ "_id" : ObjectId("56178e473af16d338338d3bf"), "Uid" : 12, "Name" : "zhanjindong", "Age" : 13, "Date" : ISODate("2015-10-09T09:52:07.205Z") }
{ "_id" : ObjectId("56187d6743c2e9badb0ca53b"), "Uid" : 12, "Name" : "zhanjindong", "Age" : 13, "Date" : ISODate("2015-10-10T02:52:23.497Z") }
{ "_id" : ObjectId("561882527bd66d65a90fb01d"), "Uid" : 12, "Name" : "zhanjindong", "Age" : 13, "Date" : ISODate("2015-10-10T03:13:22.594Z") }
{ "_id" : ObjectId("561882f9ff80495f50bec62f"), "Uid" : 12, "Name" : "zhanjindong", "Age" : 13, "Date" : ISODate("2015-10-10T03:16:09.930Z") }
mongos> db.test.find({Uid:8});
{ "_id" : ObjectId("56178e453af16d338338d3bb"), "Uid" : 8, "Name" : "zhanjindong", "Age" : 13, "Date" : ISODate("2015-10-09T09:52:05.539Z") }
{ "_id" : ObjectId("56187e117ec37f9bb7244496"), "Uid" : 8, "Name" : "zhanjindong", "Age" : 13, "Date" : ISODate("2015-10-10T02:55:13.529Z") }
{ "_id" : ObjectId("5618824d7bd66d65a90fb01c"), "Uid" : 8, "Name" : "zhanjindong", "Age" : 13, "Date" : ISODate("2015-10-10T03:13:17.564Z") }
{ "_id" : ObjectId("561882fdff80495f50bec630"), "Uid" : 8, "Name" : "zhanjindong", "Age" : 13, "Date" : ISODate("2015-10-10T03:16:13.788Z") }
mongos> db.test.insert({"Uid":8,"Name":"zhanjindong","Age":13,"Date":new Date()});
WriteResult({ "nInserted" : 1 })
mongos> db.test.insert({"Uid":12,"Name":"zhanjindong","Age":13,"Date":new Date()});
WriteResult({ "nInserted" : 1 })
可见集群可以正常使用
4)解决方案
可以在代码中配置连接多台mongos来实现failover,Java代码如下:
List<ServerAddress> mongoHostList = new ArrayList<ServerAddress>();
mongoHostList.add(new ServerAddress("ip",port));
mongoHostList.add(new ServerAddress("ip",port));
mongoHostList.add(new ServerAddress("ip",port));
MongoClient mg = new MongoClient(mongoHostList);
六、结论
1.Shard Server故障
1)分片中的节点故障,可以按照《MongoDB副本集故障测试和解决方案》中的方法解决,分片仍然可用
2)整个分片故障,如果可以重新启动的话,仍然可以使用;否则只能从备份中恢复,备份之后写入的数据会丢失
2.Config Server故障
只要还有可用的配置服务器,整个集群的读写都可以正常进行;但在部分配置服务器不可用期间,集群元数据为只读,无法进行chunk的拆分和迁移,应尽快恢复故障的配置服务器
3.Query Routers故障
只要有可用的mongos服务器,整个集群就可以正常使用;若要客户端也实现failover,需要在代码中配置连接多台mongos