Nagios监控mongodb分片集群服务实战

1，监控插件下载

Mongodb插件下载地址为：git clone git://github.com/mzupan/nagios-plugin-mongodb.git。刚开始本人这里没有安装git环境，是找网友草根帮忙下载的，之后上传到了csdn资源页面，新的下载地址为：http://download.csdn.net/detail/mchdba/8019077

2，加入新的mongodb监控命令

由于mongodb服务和mysql从库共用一台物理机，之前已经做了基础nagios以及mysql服务监控，所以这里只需要在原来的基础上加入mongodb命令和服务即可。Nagios监控mysql请参考：http://blog.itpub.net/26230597/viewspace-760141/ 以及 http://blog.itpub.net/26230597/viewspace-1217246/ 。这里需要加入的mongodb监控命令如下所示：

[root@wgq objects]# cd /usr/local/nagios/etc/objects

[root@wgq objects]# vim commands.cfg

define command {

    command_name check_mongodb

    command_line $USER1$/nagios-plugin-mongodb/check_mongodb.py -H $HOSTADDRESS$ -A $ARG1$ -P $ARG2$ -W $ARG3$ -C $ARG4$

}

define command {

    command_name check_mongodb_database

    command_line $USER1$/nagios-plugin-mongodb/check_mongodb.py -H $HOSTADDRESS$ -A $ARG1$ -P $ARG2$ -W $ARG3$ -C $ARG4$ -d $ARG5$

}

define command {

    command_name check_mongodb_collection

    command_line $USER1$/nagios-plugin-mongodb/check_mongodb.py -H $HOSTADDRESS$ -A $ARG1$ -P $ARG2$ -W $ARG3$ -C $ARG4$ -d $ARG5$ -c $ARG6$

}

define command {

    command_name check_mongodb_replicaset

    command_line $USER1$/nagios-plugin-mongodb/check_mongodb.py -H $HOSTADDRESS$ -A $ARG1$ -P $ARG2$ -W $ARG3$ -C $ARG4$ -r $ARG5$

}

define command {

    command_name check_mongodb_query

    command_line $USER1$/nagios-plugin-mongodb/check_mongodb.py -H $HOSTADDRESS$ -A $ARG1$ -P $ARG2$ -W $ARG3$ -C $ARG4$ -q $ARG5$

}

3，加入mongodb监控服务

mongodb的服务也需要单独重新加入，如下所示：

#检测mongodb服务的连接时间，超过2秒就普通报警，超过5秒就严重报警

define service{

        host_name dbm1slave1

        service_description Mongo Connect Check

        check_command check_mongodb!connect!30000!2!5

        max_check_attempts 5

        normal_check_interval 3

        retry_check_interval 2

        check_period 24x7

        notification_interval 10

        notification_period 24x7

        notification_options w,u,c,r

        contact_groups ops

        }

#检查mongodb的连接数使用百分比，连接池使用超过70%普通报警，超过80%严重报警

define service{

        host_name dbm1slave1

        service_description Mongo Free Connections

        check_command check_mongodb!connections!27017!70!80

        max_check_attempts 5

        normal_check_interval 3

        retry_check_interval 2

        check_period 24x7

        notification_interval 10

        notification_period 24x7

        notification_options w,u,c,r

        contact_groups ops

        }

#检查mongodb的复制延迟（replication lag），延迟超过15秒普通报警，超过30秒严重报警，确保primary和secondary的时间是一致的。

define service{

        host_name dbm1slave1

        service_description Mongo Replication Lag

        check_command check_mongodb!replication_lag!27017!15!30

        max_check_attempts 5

        normal_check_interval 3

        retry_check_interval 2

        check_period 24x7

        notification_interval 10

        notification_period 24x7

        notification_options w,u,c,r

        contact_groups ops

        }

#检查mongodb内存使用量（单位为GB）。阈值与mongodb所在机器的总内存数相关

define service{

        host_name dbm1slave1

        service_description Mongo Memory Usage

        check_command check_mongodb!memory!27017!20!28

        max_check_attempts 5

        normal_check_interval 3

        retry_check_interval 2

        check_period 24x7

        notification_interval 10

        notification_period 24x7

        notification_options w,u,c,r

        contact_groups ops

        }

#检查mongodb Mapped的内存使用量（单位为GB）。阈值与mongodb所在机器的总内存数相关

define service{

        host_name dbm1slave1

        service_description Mongo Mapped Memory Usage

        check_command check_mongodb!memory_mapped!27017!20!28

        max_check_attempts 5

        normal_check_interval 3

        retry_check_interval 2

        check_period 24x7

        notification_interval 10

        notification_period 24x7

        notification_options w,u,c,r

        contact_groups ops

        }

#检查Lock Time的百分率。如果lock time占用mongo运行时间的5%就普通报警，如果超过10%就严重报警

define service{

        host_name dbm1slave1

        service_description Mongo Lock Percentage

        check_command check_mongodb!lock!27017!5!10

        max_check_attempts 5

        normal_check_interval 3

        retry_check_interval 2

        check_period 24x7

        notification_interval 10

        notification_period 24x7

        notification_options w,u,c,r

        contact_groups ops

        }

# Check Average Flush Time，检查mongo服务器的平均flush时间，超过100ms就普通报警，超过200ms就严重报警

define service{

        host_name dbm1slave1

        service_description Mongo Flush Average

        check_command check_mongodb!flushing!27017!100!200

        max_check_attempts 5

        normal_check_interval 3

        retry_check_interval 2

        check_period 24x7

        notification_interval 10

        notification_period 24x7

        notification_options w,u,c,r

        contact_groups ops

        }

# Check Last Flush Time，检查最近一次的flush时间，如果超过200ms就普通报警，超过400ms就严重报警

define service{

        host_name dbm1slave1

        service_description Mongo Last Flush Time

        check_command check_mongodb!last_flush_time!27017!200!400

        max_check_attempts 5

        normal_check_interval 3

        retry_check_interval 2

        check_period 24x7

        notification_interval 10

        notification_period 24x7

        notification_options w,u,c,r

        contact_groups ops

        }

# Check status of mongodb replicaset，检查mongo复制的状态

define service{

        host_name dbm1slave1

        service_description MongoDB state

        check_command check_mongodb!replset_state!27017!0!0

        max_check_attempts 5

        normal_check_interval 3

        retry_check_interval 2

        check_period 24x7

        notification_interval 10

        notification_period 24x7

        notification_options w,u,c,r

        contact_groups ops

        }

# Check status of index miss ratio，检查索引未命中率（index miss ratio）。

define service{

        host_name dbm1slave1

        service_description MongoDB Index Miss Ratio

        check_command check_mongodb!index_miss_ratio!27017!.005!.01

        max_check_attempts 5

        normal_check_interval 3

        retry_check_interval 2

        check_period 24x7

        notification_interval 10

        notification_period 24x7

        notification_options w,u,c,r

        contact_groups ops

        }

# Check number of databases and number of collections

define service{

        host_name dbm1slave1

        service_description MongoDB Number of databases

        check_command check_mongodb!databases!27017!300!500

        max_check_attempts 5

        normal_check_interval 3

        retry_check_interval 2

        check_period 24x7

        notification_interval 10

        notification_period 24x7

        notification_options w,u,c,r

        contact_groups ops

        }

define service{

        host_name dbm1slave1

        service_description MongoDB Number of collections

        check_command check_mongodb!collections!27017!300!500

        max_check_attempts 5

        normal_check_interval 3

        retry_check_interval 2

        check_period 24x7

        notification_interval 10

        notification_period 24x7

        notification_options w,u,c,r

        contact_groups ops

        }        

# Check size of a database，检查库的大小

define service{

        host_name dbm1slave1

        service_description MongoDB Database size your-database

        check_command check_mongodb_database!database_size!27017!300!500!your-database

        max_check_attempts 5

        normal_check_interval 3

        retry_check_interval 2

        check_period 24x7

        notification_interval 10

        notification_period 24x7

        notification_options w,u,c,r

        contact_groups ops

        }                

# Check index size of a database，检查库索引的大小

define service{

        host_name dbm1slave1

        service_description MongoDB Database index size your-database

        check_command check_mongodb_database!database_indexes!27017!50!100!your-database

        max_check_attempts 5

        normal_check_interval 3

        retry_check_interval 2

        check_period 24x7

        notification_interval 10

        notification_period 24x7

        notification_options w,u,c,r

        contact_groups ops

        }            

# Check index size of a collection，检查集合collection的索引大小

define service{

        host_name dbm1slave1

        service_description MongoDB Database index size your-database

        check_command check_mongodb_collection!collection_indexes!27017!50!100!your-database!your-collection

        max_check_attempts 5

        normal_check_interval 3

        retry_check_interval 2

        check_period 24x7

        notification_interval 10

        notification_period 24x7

        notification_options w,u,c,r

        contact_groups ops

        }

# Check the primary server of replicaset。检查复制的primary服务

define service{

        host_name dbm1slave1

        service_description MongoDB Replicaset Master Monitor: your-replicaset

        check_command check_mongodb_replicaset!replica_primary!27017!0!1!your-replicaset

        #示例：check_command check_mongodb_replicaset!replica_primary!27017!0!1!shard2

        max_check_attempts 5

        normal_check_interval 3

        retry_check_interval 2

        check_period 24x7

        notification_interval 10

        notification_period 24x7

        notification_options w,u,c,r

        contact_groups ops

        }

# Check the number of queries per second，检查每一秒的查询数量

define service{

        host_name dbm1slave1

        service_description MongoDB Updates per Second

        check_command check_mongodb_query!queries_per_second!27017!200!150!update

        max_check_attempts 5

        normal_check_interval 3

        retry_check_interval 2

        check_period 24x7

        notification_interval 10

        notification_period 24x7

        notification_options w,u,c,r

        contact_groups ops

        }

# Check Primary Connection，检查复制中与primary库的连接时间，超过2秒就普通报警，超过4秒就严重报警

define service{

        host_name dbm1slave1

        service_description Mongo Connect Check

        check_command check_mongodb!connect_primary!27017!2!4

        max_check_attempts 5

        normal_check_interval 3

        retry_check_interval 2

        check_period 24x7

        notification_interval 10

        notification_period 24x7

        notification_options w,u,c,r

        contact_groups ops

        }

# Check Collection State，检查collection状态：检查mongo服务组列表中的每个主机，可以用来检查重要collection的高可用性（锁、超时、config server的可用性）。如果发现一个查询失败就会报警。

define service{

        host_name dbm1slave1

        service_description Mongo Collection State

        check_command check_mongodb!collection_state!27017!your-database!your-collection

        max_check_attempts 5

        normal_check_interval 3

        retry_check_interval 2

        check_period 24x7

        notification_interval 10

        notification_period 24x7

        notification_options w,u,c,r

        contact_groups ops

        }

4，查看部分监控项效果

配置完nagios端服务后，运行service nagios restart重新启动nagios；等上几分钟，nagios监控界面就会出现完整的mongo服务信息，如下所示：

5，从ps中确定mongodb的架构

[root@db-m1-slave-1 ~]# ps -eaf|grep mongo

mongodb 2457 1 0 2013 ? 2-03:39:08 ./mongod --configsvr --dbpath /home/data/mongodb/config --port 20000 --logpath /home/data/mongodb/config.log --logappend --fork

mongodb 2804 1 0 2013 ? 1-10:02:33 mongos --configdb 192.168.12.62:20000,192.168.12.63:20000,192.168.12.72:20000 --port 30000 --chunkSize 64 --logpath /home/data/mongodb/mongos.log --logappend --fork

mongodb 3072 1 0 2013 ? 1-10:17:20 mongod --shardsvr --replSet shard1 --port 27017 --dbpath /home/data/mongodb/shard11 --oplogSize 2048 --logpath /home/data/mongodb/shard11.log --logappend --fork

root 11179 9391 0 11:14 pts/1 00:00:00 grep mongo

mongodb 30414 1 0 Feb14 ? 1-06:20:50 mongod --shardsvr --replSet shard2 --port 27018 --dbpath /home/data/mongodb/shard21 --oplogSize 2048 --logpath /home/data/mongodb/shard21.log --logappend --fork

[root@db-m1-slave-1 ~]#

看到有4个mongo进程，

a) 启动参数有“--configdb”的就是集群入口进程；

b) Shard Server：启动参数带“--shardsvr --replSet”的是集群分片的一个片组启动进程，用于存储实际的数据块，也就是27017端口和27018端口的mongodb服务实例。至于如何判断27017端口的实例是primary还是secondary，需要登录27017端口运行rs.status();去查看一下。

c) Config Server：启动参数带“--configsvr”的进程，存储了整个Cluster Metadata，其中包括chunk信息，也就是20000端口的mongodb服务实例。

d) Route Server：启动参数带“mongos --configdb”的进程，是前端路由，客户端由此接入，且让整个集群看上去像单一数据库，前端应用可以透明使用。也就是30000端口的mongodb实例。

6，调试中出现过的错误

错误1：

[root@wgq nagios ~]# tail -f /usr/local/nagios/var/nagios.log

[1412819956] Warning: Return code of 13 for check of service 'Mongo Memory Usage' on host 'dbm1slave1' was out of bounds.

[1412819956] SERVICE ALERT: dbm1slave1;Mongo Memory Usage;CRITICAL;SOFT;1;(Return code of 13 is out of bounds)

[1412819975] Warning: Return code of 13 for check of service 'Mongodb Connect Check' on host 'dbm1slave1' was out of bounds.

[1412819975] SERVICE ALERT: dbm1slave1;Mongodb Connect Check;CRITICAL;SOFT;1;(Return code of 13 is out of bounds)

[1412820058] Warning: Return code of 13 for check of service 'Mongo Free Connections' on host 'dbm1slave1' was out of bounds.

需要赋予nagios用户对插件文件的所有权以及读、写、执行权限：

chmod 770 /usr/lib/nagios/plugins/check_mongodb.py

chown -R nagios.nagios /usr/lib/nagios/plugins/check_mongodb.py

错误2：

监控界面Status Information一栏出现 No module named pymongo报错提示信息：

出现这个提示是由于需要安装pymongo模块，运行easy_install pymongo命令安装即可，如下所示：

[root@wgq objects]# easy_install pymongo

Searching for pymongo

Reading http://pypi.python.org/simple/pymongo/

Best match: pymongo 2.7.2

......

zip_safe flag not set; analyzing archive contents...

Adding pymongo 2.7.2 to easy-install.pth file

Installed /usr/lib/python2.6/site-packages/pymongo-2.7.2-py2.6-linux-x86_64.egg

Processing dependencies for pymongo

Finished processing dependencies for pymongo

----------------------------------------------------------------------------------------------------------------

<版权所有，文章允许转载，但必须以链接方式注明源地址，否则追究法律责任!>
原博客地址：http://blog.itpub.net/26230597/viewspace-1293589/
原作者：黄杉 (mchdba)

----------------------------------------------------------------------------------------------------------------

参考文章：https://github.com/mzupan/nagios-plugin-mongodb/blob/master/README.md

Nagios监控mongodb分片集群服务实战的更多相关文章

网易云MongoDB分片集群（Sharding）服务已上线
此文已由作者温正湖授权网易云社区发布. 欢迎访问网易云社区,了解更多网易技术产品运营经验. MongoDB sharding cluster(分片集群)是MongoDB提供的数据在线水平扩展方案,包括 ...
MongoDB 分片集群实战
背景在如今的互联网环境下,海量数据已随处可见并且还在不断增长,对于如何存储处理海量数据,比较常见的方法有两种: 垂直扩展:通过增加单台服务器的配置,例如使用更强悍的 CPU.更大的内存.更大容量的磁 ...
TiDB和MongoDB分片集群架构比较
此文已由作者温正湖授权网易云社区发布. 欢迎访问网易云社区,了解更多网易技术产品运营经验. 最近阅读了TiDB源码的说明文档,跟MongoDB的分片集群做了下简单对比. 首先展示TiDB的整体架构 M ...
CentOS7+Docker+MangoDB下部署简单的MongoDB分片集群
简单的在Docker上快速部署MongoDB分片集群前言文中使用的环境如下 OS:CentOS Linux release 7.5.1804 (Core) Docker:Docker versio ...
Windows 搭建MongoDB分片集群（二）
在本篇博客中我们主要讲描述分片集群的搭建过程.配置分片集群主要有两个步骤,第一启动所有需要的mongod和mongos进程.第二步就是启动一个mongos与集群通信.下面我们一步步来描述集群的搭建过程 ...
分布式文档存储数据库之MongoDB分片集群
前文我们聊到了mongodb的副本集以及配置副本集,回顾请参考https://www.cnblogs.com/qiuhom-1874/p/13953598.html:今天我们来聊下mongodb的分片 ...
MongoDB分片集群原理、搭建及测试详解
随着技术的发展,目前数据库系统对于海量数据的存储和高效访问海量数据要求越来越高,MongoDB分片机制就是为了解决海量数据的存储和高效海量数据访问而生. MongoDB分片集群由mongos路由进程( ...
mongodb分片集群
第一章 1.mongodb 分片集群解释和目的一组Mongodb复制集,就是一组mongod进程,这些进程维护同一个数据集合.复制集提供了数据冗余和高等级的可靠性,这是生产部署的基础. 第二章 1. ...
搭建MongoDB分片集群
在部门服务器搭建MongoDB分片集群,记录整个操作过程,朋友们也可以参考. 计划如下: 用5台机器搭建,IP分别为:192.168.58.5.192.168.58.6.192.168.58.8.19 ...

随机推荐

tensorboard简单使用
代码写的再好,没有图别人也不知道好在哪. 我们在使用tensorflow的时候,使用tensorboard可以直观的看到我们的网络结构,甚至它可以计算卷积和池化的维度(我不知道是不是因为我已经运行了一 ...
HTML 转义字符对照表
http://tool.oschina.net/commons 字符十进制转义字符 " " " & & & < < < &g ...
JAVA-JSP内置对象之out对象求得缓冲区使用大小
相关资料:<21天学通Java Web开发> out对象 out对象求得缓冲区使用大小1.通过out对象的getBufferSize()方法可以获得缓冲区的大小.2.通过getRemain ...
Eigen教程(11)
整理下Eigen库的教程,参考:http://eigen.tuxfamily.org/dox/index.html 存储顺序对于矩阵和二维数组有两种存储方式,列优先和行优先. 假设矩阵: 按行优先存 ...
ios label 简单的长按复制文本信息
在iOS开发过程中,有时候会用到UILabel展示的内容,那么就设计到点击UILabel复制它上面展示的内容的功能,也就是Label长按复制功能.网上有很多种给Label添加长按复制功能的方法,这里我 ...
[转]mybatis if test非空判断数字0为什么是false
原文地址:http://blog.51cto.com/wangguangshuo/1944531 今天工作中发现一个Long类型的参数没有传到sql中去,在sql xml配置文件中是使用if test ...
regsvr32.exe是什么东西
Regsvr32命令修复系统故障实例使用过activex的人都知道,activex不注册是不能够被系统识别和使用的,一般安装程序都会自动地把它所使用的activex控件注册,但如果你拿到的一个控件需要 ...
Jenkins+git
https://www.cnblogs.com/Csir/category/1100433.html
JPA和Spring-Data-JPA简介
什么是JPA JPA(Java Persistence API)是Sun官方提出的Java持久化规范.它为Java开发人员提供了一种对象/关联映射工具来管理Java应用中的关系数据.它的出现主要是为了 ...
关闭R语言载入包时候的警告
options(warn =-1)

Nagios监控mongodb分片集群服务实战

Nagios监控mongodb分片集群服务实战的更多相关文章

随机推荐

热门专题