File: //usr/lib/nagios/plugins/pmp-check-mongo.py
#!/usr/bin/env python2.7
"""MongoDB Nagios check script
This program is part of $PROJECT_NAME$
License: GPL License (see COPYING)
Author David Murphy
Copyright 2014-2015 Percona LLC and/or its affiliates
"""
import sys
import time
import optparse
import os
import stat
import pickle
import traceback
import pprint
from types import FunctionType
# Not yet implemented
# import DeepDiff
# pymongo is the only hard third-party dependency of this plugin.
try:
    import pymongo
except ImportError as e:
    # Without the driver no check can run.  Report UNKNOWN (exit code 3) on
    # stdout, which is where Nagios reads the plugin status line from.
    print("UNKNOWN - Unable to import pymongo: %s" % e)
    sys.exit(3)
# As of pymongo 1.9 the SON API is part of the bson package.  Try that
# location first and fall back to the pre-1.9 one; this avoids comparing
# version strings lexicographically (where "1.10" sorts before "1.9").
try:
    import bson.son as son
except ImportError:
    import pymongo.son as son
# Adding special behavior for optparse
class OptionParsingError(RuntimeError):
    """Error raised when the command line cannot be parsed.

    The parser's error text is kept in ``msg`` so callers can turn it into a
    Nagios status line.
    """

    def __init__(self, msg):
        # Initialise the base class as well so str(exc) and tracebacks carry
        # the message instead of being empty.
        RuntimeError.__init__(self, msg)
        self.msg = msg
class ModifiedOptionParser(optparse.OptionParser):
    """OptionParser variant that raises on usage errors."""

    def error(self, msg):
        """Raise OptionParsingError carrying ``msg`` for the caller to handle."""
        failure = OptionParsingError(msg)
        raise failure
def unicode_truncate(s, length, encoding='utf-8'):
    """Return ``s`` cut to at most ``length`` encoded bytes, as text.

    The cut is made on the encoded form; a multi-byte character split by the
    cut is dropped rather than producing invalid output.

    :param s: text or an already-encoded byte string.
    :param length: maximum number of encoded bytes to keep.
    :param encoding: codec used for encoding and decoding.
    :returns: the truncated value decoded back to text.
    """
    # Byte strings are already encoded; calling .encode() on them would
    # trigger an implicit ASCII decode on Python 2 (and fails on Python 3).
    if isinstance(s, bytes):
        encoded = s
    else:
        encoded = s.encode(encoding)
    return encoded[:length].decode(encoding, 'ignore')
def parse_options(args):
    """Parse the plugin's command line.

    :param args: list of argument strings (normally ``sys.argv[1:]``).
    :returns: the ``(options, positional_arguments)`` tuple from optparse.

    On a parsing error an ``UNKNOWN - ...`` line is printed and the process
    exits with status 3.
    """
    # Every check_* method of NagiosMongoChecks is a selectable action, apart
    # from the check_levels helper.  Sorted so the help text is stable.
    funcList = sorted(
        item_name for item_name, item_type in NagiosMongoChecks.__dict__.items()
        if type(item_type) is FunctionType and item_name.startswith("check_") and item_name != 'check_levels'
    )
    p = ModifiedOptionParser()
    p.add_option('-H', '--host', action='store', type='string', dest='host', default='127.0.0.1', help='The hostname you want to connect to')
    p.add_option('-P', '--port', action='store', type='int', dest='port', default=27017, help='The port mongodb is running on')
    p.add_option('-u', '--user', action='store', type='string', dest='user', default=None, help='The username you want to login as')
    p.add_option('-p', '--password', action='store', type='string', dest='passwd', default=None, help='The password you want to use for that user')
    p.add_option('-W', '--warning', action='store', dest='warning', default=None, help='The warning threshold you want to set')
    p.add_option('-C', '--critical', action='store', dest='critical', default=None, help='The critical threshold you want to set')
    p.add_option('-A', '--action', action='store', type='choice', dest='action', default='check_connect',
                 choices=funcList, help="The action you want to take. Valid choices are (%s) Default: %s" % (", ".join(funcList), 'check_connect'))
    p.add_option('-s', '--ssl', dest='ssl', default=False, help='Connect using SSL')
    p.add_option('-r', '--replicaset', dest='replicaset', default=None, help='Connect to replicaset')
    p.add_option('-c', '--collection', action='store', dest='collection', default='foo', help='Specify the collection in check_cannary_test')
    p.add_option('-d', '--database', action='store', dest='database', default='tmp', help='Specify the database in check_cannary_test')
    p.add_option('-q', '--query', action='store', dest='query', default='{"_id":1}', help='Specify the query in check_cannary_test')
    p.add_option('--statusfile', action='store', dest='status_filename', default='status.dat', help='File to current store state data in for delta checks')
    p.add_option('--backup-statusfile', action='store', dest='status_filename_backup', default='status_backup.dat',
                 help='File to previous store state data in for delta checks')
    p.add_option('--max-stale', action='store', dest='max_stale', type='int', default=60, help='Age of status file to make new checks (seconds)')
    # Add options for output stat file
    try:
        # Parse the arguments we were given instead of silently re-reading
        # sys.argv (optparse still falls back to sys.argv[1:] for None).
        return p.parse_args(args)
    except OptionParsingError as e:
        if 'no such option' in e.msg:
            message = "No such options of %s" % e.msg.split(":")[1]
        elif 'invalid choice' in e.msg:
            error_item = e.msg.split(":")[2].split("'")[1]
            message = 'No such action of %s found!' % error_item
        else:
            # Any other usage error (e.g. a non-integer port) is reported
            # verbatim rather than falling through without a result.
            message = e.msg
        # Status line on stdout plus exit code 3 is what Nagios treats as
        # UNKNOWN; sys.exit("text") would exit 1 (WARNING) and use stderr.
        print("UNKNOWN - " + message)
        sys.exit(3)
def return_result(result_type, message):
    """Print a Nagios status line and exit with the matching status code.

    :param result_type: ``"ok"``, ``"warning"`` or ``"critical"``; any other
        value is reported as UNKNOWN.
    :param message: text placed after the status label.

    Exit codes follow the Nagios plugin convention: 0 OK, 1 WARNING,
    2 CRITICAL, 3 UNKNOWN.  This function never returns.
    """
    statuses = {
        "ok": (0, "OK"),
        "warning": (1, "WARNING"),
        "critical": (2, "CRITICAL"),
    }
    # UNKNOWN must exit with 3; exiting with 2 would be shown as CRITICAL.
    exit_code, label = statuses.get(result_type, (3, "UNKNOWN"))
    print("%s - %s" % (label, message))
    sys.exit(exit_code)
def main(argv):
    """Parse ``argv`` and run the action selected on the command line."""
    # Positional arguments are parsed but not used by any check.
    options, _unused_positional = parse_options(argv)
    selected_action = options.action
    check(options, selected_action)
def check(args, check_name):
    """Run one named check and report its result to Nagios.

    :param args: parsed options object; passed to NagiosMongoChecks and to the
        check method together with ``args.warning`` and ``args.critical``.
    :param check_name: name of the NagiosMongoChecks method to run.

    Always ends the process through ``return_result``.  Any exception raised
    while connecting or running the check is reported as CRITICAL.
    """
    try:
        checksObj = NagiosMongoChecks(args)
        run_check = getattr(checksObj, check_name)
        result_type, message = run_check(args, args.warning, args.critical)
    except Exception as e:
        # Keep stdout clean so its first line is the status line Nagios
        # parses; the traceback goes to stderr for whoever debugs the plugin.
        traceback.print_exc(limit=1, file=sys.stderr)
        return_result("critical", str(e))
    return_result(result_type, message)
class NagiosMongoChecks:
    """Nagios checks run against a single MongoDB server.

    Every ``check_*`` method except the ``check_levels`` helper is offered as
    a command-line action by ``parse_options``.  Each check takes
    ``(args, warning_level, critical_level)`` and returns a
    ``(result_type, message)`` tuple where ``result_type`` is ``"ok"``,
    ``"warning"``, ``"critical"`` or ``"unknown"``.
    """

    # need to initialize variables and such still
    def __init__(self, args):
        """Copy the options from ``args``, connect, and load status data.

        :param args: object whose attributes (see ``parse_options``) override
            the defaults set below.
        :raises pymongo.errors.ConnectionFailure: when no connection could be
            established.
        """
        # setup initial values from optParse
        self.host = '127.0.0.1'
        self.port = 27017
        self.user = None
        self.password = None
        # The command-line option is stored as ``passwd``; give it a default
        # too so connect() works with option objects that lack it.
        self.passwd = None
        self.warning = None
        self.critical = None
        self.action = 'check_connect'
        self.ssl = False
        self.replicaset = None
        self.collection = 'foo'
        self.database = 'tmp'
        self.query = '{"_id":1}'
        self.status_filename = 'status.dat'
        self.status_filename_backup = 'status_backup.dat'
        self.max_stale = 60
        for option in vars(args):
            setattr(self, option, getattr(args, option))
        # Anchor bare file names at the current directory; absolute paths and
        # paths starting with ".." are used exactly as given.
        self.status_filename_backup = self._relative_path(self.status_filename_backup)
        self.status_filename = self._relative_path(self.status_filename)
        # amend known internal values we will need
        self.current_status = {}
        self.last_status = {}
        self.connection = None
        self.connection_time = None
        self.mongo_version = None
        self.pyMongoError = None
        self.connect()
        # Every check needs a live connection, so fail early whichever status
        # source is used below.
        if self.connection is None:
            raise pymongo.errors.ConnectionFailure(self.pyMongoError or "No connection Found, did connect fail?")
        if self.file_age(self.status_filename) <= self.max_stale:
            # Save status_file contents status as current_status
            self.get_last_status(True)
            # Save status_filename_backup contents as last_status
            self.get_last_status(False, self.status_filename_backup)
        else:
            # Get fresh current_status from server
            self.current_status = self.sanatize(self.get_server_status())
            # use last status_filename contents as last_status
            self.get_last_status(False, self.status_filename)
        # Not yet implemented
        # self.compute_deltas()
        # get last status
        # check if needs refresh, refresh if needed
        # set last/current to self.current_status

    def _relative_path(self, path):
        """Prefix ``path`` with the current directory unless it starts with "/" or ".."."""
        if path.startswith("/") or path.startswith(".."):
            return path
        return "%s/%s" % (os.curdir, path)

    def _exit_unknown(self, message):
        """Print an UNKNOWN status line on stdout and exit with status 3.

        ``sys.exit("text")`` would write to stderr and exit with status 1,
        which Nagios shows as WARNING.
        """
        print("UNKNOWN - %s" % message)
        sys.exit(3)

    def _parse_version(self, version_string):
        """Turn a version such as "2.6.0-rc1" into the int tuple (2, 6, 0).

        Only the leading digits of each of the first three dot-separated
        fields are used; missing or non-numeric fields count as 0.  Integer
        tuples compare numerically, unlike tuples of strings.
        """
        parts = []
        for piece in str(version_string).split('.')[:3]:
            digits = ''
            for char in piece:
                if not char.isdigit():
                    break
                digits += char
            parts.append(int(digits) if digits else 0)
        while len(parts) < 3:
            parts.append(0)
        return tuple(parts)

    def _is_arbiter_error(self, error):
        """Return True when ``error`` is pymongo reporting that the host is an arbiter."""
        return isinstance(error, pymongo.errors.AutoReconnect) and " is an arbiter" in str(error)

    def _exit_ok_arbiter(self):
        """Report OK for an arbiter and exit.

        An arbiter that is reachable and answers requests/votes is all we
        need to know about it.
        """
        print("OK - State: 7 (Arbiter)")
        sys.exit(0)

    def get_last_status(self, returnAsCurrent, forceFile=None):
        """Load a pickled status file into ``current_status`` or ``last_status``.

        :param returnAsCurrent: ``None`` or ``False`` stores the data as
            ``last_status``; anything else stores it as ``current_status``.
        :param forceFile: file to read instead of ``self.status_filename``.
        :returns: True on success, False when the file could not be loaded.
        """
        file_name = forceFile if forceFile is not None else self.status_filename
        try:
            # NOTE(security): pickle.load can execute arbitrary code; the
            # status files must only be writable by the user running the check.
            with open(file_name, 'rb') as status_file:
                status = pickle.load(status_file)
        except Exception:
            # Best effort: a missing or unreadable file means no saved status.
            return False
        if returnAsCurrent is None or returnAsCurrent is False:
            self.last_status = status
        else:
            self.current_status = status
        return True

    def get_server_status(self):
        """Run ``serverStatus`` on the admin database and return its output.

        Exits with UNKNOWN when the command cannot be run.
        """
        try:
            data = self.connection['admin'].command('serverStatus')
        except Exception as e:
            if type(e).__name__ == "OperationFailure":
                self._exit_unknown("Not authorized!")
            self._exit_unknown("Unable to run serverStatus: %s::%s" % (type(e).__name__, unicode_truncate(str(e), 45)))
        if not self.current_status:
            self.current_status = data
        return data

    # figure out how to use this one later
    def rotate_files(self):
        """Write the backup and current status files from the in-memory data.

        When there is no previous status, the current data seeds the backup
        file, an empty current file is written to start the aging clock, and
        the process exits with UNKNOWN.
        """
        # 1) this needs to rename self.status_filename to status_filename_backup
        # 2) Save current_status to self.status_filename ( new file )
        if self.last_status == {}:
            # Build the last status file for future deltas from current data
            self.save_file(self.status_filename_backup, self.current_status)
            # Set the current status file to empty to set the aging clock
            self.save_file(self.status_filename, {})
            self._exit_unknown("No status data present, please try again in %s seconds" % self.max_stale)
        else:
            self.save_file(self.status_filename_backup, self.last_status)
            self.save_file(self.status_filename, self.current_status)

    def save_file(self, filename, contents):
        """Pickle ``contents`` into ``filename``; exit with UNKNOWN on failure."""
        try:
            with open(filename, "wb") as status_file:
                pickle.dump(contents, status_file)
        except Exception as e:
            self._exit_unknown("Error saving stat file %s: %s" % (filename, e))

    # TODO - Fill in all check defaults
    def get_default(self, key, level):
        """Return the built-in threshold for check ``key``.

        :param key: check name, e.g. ``'check_queues'``.
        :param level: ``'warning'`` or ``'critical'``.

        Exits with UNKNOWN when the check has no built-in thresholds.
        """
        defaults = {
            # NOTE(review): check_connections compares a percentage against
            # these values, so the defaults can never trigger - confirm the
            # intended unit.
            'check_connections': {'warning': 15000, 'critical': 19000},
            'check_connect': {'warning': 50, 'critical': 100},
            'check_queues': {'warning': 30, 'critical': 100},
            'check_lock_pct': {'warning': 30, 'critical': 50},
            # NOTE(review): check_repl_lag reports hours; 200/500 look like
            # seconds - confirm the intended unit.
            'check_repl_lag': {'warning': 200, 'critical': 500},
            # 'check_flushing': {'warning':XX, 'critical': XX},
            'check_total_indexes': {'warning': 100, 'critical': 300},
            'check_cannary_test': {'warning': 30, 'critical': 50},
            # warning > critical: lower values are worse (see check_levels)
            'check_oplog': {'warning': 36, 'critical': 24},
            'check_index_ratio': {'warning': .9, 'critical': .8},
        }
        try:
            return defaults[key][level]
        except KeyError:
            self._exit_unknown("Missing defaults found for %s please use -W and -C" % key)

    # Not yet implemented
    # def compute_deltas(self):
    # deltas = []
    # for item in DeepDiff(self.last_status, self.current_status)['values_changed']:
    # name = item.split(":")[0].split("root")[1].replace("['", "").replace("']", ".")[:-1]
    # if 'time' not in item.lower():
    # values = item.split(":")[1]
    # print(values)
    # old, new = values.split("===>")
    # print("%s: %s - %s = %s" % (name, new, old, float(new)-float(old)))
    # deltas[name] = float(new) - float(old)
    # self.delta_data = deltas
    # return True

    def file_age(self, filename):
        """Return the seconds since ``filename``'s ctime, or 999999 if it cannot be stat'ed."""
        try:
            age = time.time() - os.stat(filename)[stat.ST_CTIME]
        except OSError:
            age = 999999
        return age

    # TODO - Add meat to this if needed, here for future planning
    def sanatize(self, status_output):
        """Return ``status_output`` unchanged (placeholder for future filtering)."""
        return status_output

    def connect(self):
        """Connect to MongoDB and record connection time and server version.

        On success ``self.connection``, ``self.connection_time``
        (milliseconds) and ``self.mongo_version`` (tuple of ints) are set.
        On a connection failure ``self.connection`` stays None and the error
        text is kept in ``self.pyMongoError``.  Exits with OK when the host
        turns out to be an arbiter, CRITICAL on rejected credentials, and
        UNKNOWN when the ping command fails for a non-connection reason.
        """
        start_time = time.time()
        con = None
        try:
            # ssl connection for pymongo > 2.3
            client_options = {'ssl': self.ssl, 'serverSelectionTimeoutMS': 2500}
            if self.replicaset is not None:
                client_options['replicaSet'] = self.replicaset
            con = pymongo.MongoClient(self.host, self.port, **client_options)
            if (self.user and self.passwd) and not con['admin'].authenticate(self.user, self.passwd):
                print("CRITICAL - Username and password incorrect")
                sys.exit(2)
        except Exception as e:
            if self._is_arbiter_error(e):
                self._exit_ok_arbiter()
            # Remember why we failed; __init__ turns this into ConnectionFailure.
            con = None
            self.pyMongoError = str(e)
        if con is None:
            return
        try:
            con['admin'].command('ping')
        except pymongo.errors.ConnectionFailure as e:
            # The client may only contact the server on the first command, so
            # an unreachable host can surface here rather than above.
            if self._is_arbiter_error(e):
                self._exit_ok_arbiter()
            self.pyMongoError = str(e)
            return
        except Exception as e:
            self._exit_unknown("Unable to run commands, possible auth issue: %s" % e)
        # check_connect labels this value "ms" and its default thresholds are
        # 50/100, so store milliseconds rather than seconds.
        self.connection_time = round((time.time() - start_time) * 1000, 2)
        self.mongo_version = self._parse_version(con.server_info()['version'])
        self.connection = con

    def check_levels(self, check_result, warning_level, critical_level, message):
        """Classify ``check_result`` against the two thresholds.

        All three values are converted to float, so thresholds may be the
        strings that come from the command line.

        * ``warning_level <= critical_level`` (higher is worse): "ok" below
          warning, "critical" above critical, "warning" otherwise.
        * ``warning_level > critical_level`` (lower is worse): "ok" above
          warning, "critical" below critical, "warning" otherwise.

        :returns: ``(result_type, message)``; "unknown" with an explanatory
            message when a value is not numeric.
        """
        try:
            result = float(check_result)
            warning = float(warning_level)
            critical = float(critical_level)
        except (TypeError, ValueError):
            return "unknown", "Unable to parse %s into a result" % (check_result,)
        if warning > critical:
            # Inverted thresholds, e.g. hours of oplog: less is worse.
            if result > warning:
                return "ok", message
            if result < critical:
                return "critical", message
            return "warning", message
        if result < warning:
            return "ok", message
        if result > critical:
            return "critical", message
        return "warning", message

    def check_connect(self, args, warning_level, critical_level):
        """Check the time connect() needed, in milliseconds."""
        warning_level = warning_level or self.get_default('check_connect', 'warning')
        critical_level = critical_level or self.get_default('check_connect', 'critical')
        con_time = self.connection_time
        message = "Connection time %.2f ms" % con_time
        return self.check_levels(con_time, warning_level, critical_level, message)

    def check_connections(self, args, warning_level, critical_level):
        """Check the percentage of connections in use (current / (current + available))."""
        warning_level = warning_level or self.get_default('check_connections', 'warning')
        critical_level = critical_level or self.get_default('check_connections', 'critical')
        connections = self.current_status['connections']
        current = connections['current']
        total = connections['available'] + current
        # float() avoids Python 2 integer division, which always gave 0 here.
        used_percent = int(float(current) / total * 100) if total else 0
        message = "%i%% connections used ( %d of %d )" % (used_percent, current, total)
        return self.check_levels(used_percent, warning_level, critical_level, message)

    def check_lock_pct(self, args, warning_level, critical_level):
        """Check the global lock percentage between the last and current status."""
        warning_level = warning_level or self.get_default('check_lock_pct', 'warning')
        critical_level = critical_level or self.get_default('check_lock_pct', 'critical')
        if self.mongo_version >= (2, 7, 0):
            return "ok", "Mongo 3.0 and above do not have lock %"
        try:
            current_lock = self.current_status['globalLock']
            last_lock = self.last_status['globalLock']
            lockTime = current_lock['lockTime'] - last_lock['lockTime']
            totalTime = current_lock['totalTime'] - last_lock['totalTime']
        except (KeyError, TypeError):
            # A delta needs two samples; without a previous one we cannot tell.
            return "unknown", "No previous lock statistics available to compute lock %"
        if totalTime <= 0:
            return "unknown", "No time elapsed between lock samples"
        lock_percent = int(float(lockTime) / totalTime * 100)
        message = "%i%% locking found (over 100%% is possible)" % (lock_percent)
        return self.check_levels(lock_percent, warning_level, critical_level, message)

    def check_flushing(self, args, warning_level, critical_level):
        """Check the last (or, when ``args.average`` is set, average) background flush time."""
        warning_level = warning_level or self.get_default('check_flushing', 'warning')
        critical_level = critical_level or self.get_default('check_flushing', 'critical')
        flushData = self.current_status['backgroundFlushing']
        # parse_options defines no "average" option, so treat it as optional.
        if getattr(args, 'average', False):
            flush_time = flushData['average_ms']
            stat_type = "Average"
        else:
            flush_time = flushData['last_ms']
            stat_type = "Last"
        message = "%s Flush Time: %.2fms" % (stat_type, flush_time)
        return self.check_levels(flush_time, warning_level, critical_level, message)

    def check_index_ratio(self, args, warning_level, critical_level):
        """Check index efficiency derived from ``indexCounters`` miss ratio.

        NOTE(review): the default thresholds (.9/.8) and the 1.0 fallback for
        unsupported servers only make sense for a hit ratio, so the value
        compared is ``1 - missRatio`` - confirm this matches the intent.
        """
        warning_level = warning_level or self.get_default('check_index_ratio', 'warning')
        critical_level = critical_level or self.get_default('check_index_ratio', 'critical')
        indexCounters = self.current_status.get('indexCounters')
        if indexCounters is None or 'note' in indexCounters:
            ratio = 1.0
            message = "not supported defaulting to 1.0 ratio"
        else:
            if self.mongo_version >= (2, 4, 0):
                miss_ratio = indexCounters['missRatio']
            else:
                miss_ratio = indexCounters['btree']['missRatio']
            message = "Miss Ratio: %.2f" % miss_ratio
            ratio = 1.0 - miss_ratio
        return self.check_levels(ratio, warning_level, critical_level, message)

    def check_have_primary(self, args, warning_level, critical_level):
        """Return "ok" when any replica set member is in state 1 (primary)."""
        replset_status = self.connection['admin'].command("replSetGetStatus")
        for member in replset_status['members']:
            if member['state'] == 1:
                return "ok", "Cluster has primary"
        return "critical", "Cluster has no primary!"

    def check_total_indexes(self, args, warning_level, critical_level):
        """Check the number of ``system.indexes`` entries outside admin and local."""
        warning_level = warning_level or self.get_default('check_total_indexes', 'warning')
        critical_level = critical_level or self.get_default('check_total_indexes', 'critical')
        index_count = 0
        database_count = 0
        for database in self.connection.database_names():
            if database not in ["admin", "local"]:
                database_count += 1
                index_count += self.connection[database]['system.indexes'].count()
        message = "Found %d indexes in %d databases" % (index_count, database_count)
        return self.check_levels(index_count, warning_level, critical_level, message)

    def check_queues(self, args, warning_level, critical_level):
        """Check the combined length of the global lock reader and writer queues."""
        warning_level = warning_level or self.get_default('check_queues', 'warning')
        critical_level = critical_level or self.get_default('check_queues', 'critical')
        currentQueue = self.current_status['globalLock']['currentQueue']
        readers = currentQueue['readers']
        writers = currentQueue['writers']
        total = readers + writers
        message = "Queue Sizes: read (%d) write(%d) total (%d)" % (readers, writers, total)
        return self.check_levels(total, warning_level, critical_level, message)

    def check_oplog(self, args, warning_level, critical_level):
        """Check how many hours the oplog spans (a shorter window is worse)."""
        warning_level = warning_level or self.get_default('check_oplog', 'warning')
        critical_level = critical_level or self.get_default('check_oplog', 'critical')
        if 'local' not in self.connection.database_names() or 'oplog.rs' not in self.connection['local'].collection_names():
            return "critical", "We do not seem to be in a replset!"
        oplog = self.connection['local']['oplog.rs']
        first_ts = oplog.find().sort("$natural", pymongo.ASCENDING).limit(1)[0]['ts']
        last_ts = oplog.find().sort("$natural", pymongo.DESCENDING).limit(1)[0]['ts']
        oplog_range = (last_ts.as_datetime() - first_ts.as_datetime())
        oplog_range_hours = oplog_range.total_seconds() / 3600.0
        message = "Oplog Time is %d hours" % (oplog_range_hours)
        return self.check_levels(int(oplog_range_hours), warning_level, critical_level, message)

    def check_election(self, args, warning_level, critical_level):
        """Report whether the primary differs from the previously seen one."""
        replset_status = self.connection['admin'].command("replSetGetStatus")
        current_primary = None
        for member in replset_status['members']:
            if member['stateStr'] == "PRIMARY":
                current_primary = member['name']
        if current_primary is None:
            return "critical", "No PRIMARY found in replica set"
        # NOTE(review): presumably the saved serverStatus carries the earlier
        # primary under repl.primary in the same host:port form - verify.
        # Without saved data this falls back to the current primary, which is
        # what the check always compared against before.
        last_primary = current_primary
        if isinstance(self.last_status, dict):
            repl_info = self.last_status.get('repl')
            if isinstance(repl_info, dict) and repl_info.get('primary'):
                last_primary = repl_info['primary']
        message = "Old PRI: %s New PRI: %s" % (last_primary, current_primary)
        if current_primary == last_primary:
            return "ok", message
        else:
            return "critical", message

    def is_balanced(self):
        """Return True when every namespace's chunks are spread evenly.

        A namespace is balanced when each shard holding its chunks has a
        count strictly within ``threshold`` of the per-shard average.
        """
        chunks = {}
        # Loop through each of the chunks, tallying things up
        for chunk in self.connection["config"]["chunks"].find():
            stats = chunks.setdefault(chunk['ns'], {'shards': {}, 'total': 0})
            stats['shards'][chunk['shard']] = stats['shards'].get(chunk['shard'], 0) + 1
            stats['total'] += 1
        shardsCount = self.connection["config"]["shards"].count()
        chunksCount = sum(stats['total'] for stats in chunks.values())
        if not shardsCount:
            # No shards to average over: balanced only if there are no chunks.
            return not chunks
        # Different migration thresholds depending on cluster size
        # http://docs.mongodb.org/manual/core/sharding-internals/#sharding-migration-thresholds
        if chunksCount < 20:
            threshold = 2
        elif chunksCount < 80:
            threshold = 4
        else:
            threshold = 8
        # Loop through each ns and determine if it's balanced or not
        for stats in chunks.values():
            balanced = float(stats['total']) / shardsCount
            # Compare each shard's chunk COUNT (not its name) to the average.
            for shard_chunks in stats['shards'].values():
                if not (balanced - threshold < shard_chunks < balanced + threshold):
                    return False
        return True

    def check_balance(self, args, warning_level, critical_level):
        """Return "ok" when is_balanced() reports an even spread, else "critical"."""
        if self.is_balanced() is True:
            return "ok", "Shards are balanced by chunk counts"
        else:
            return "critical", "Shards are not balanced by chunk and need review"

    def check_cannary_test(self, args, warning_level, critical_level):
        """Time a ``find_one`` of ``self.query`` on the configured collection."""
        import json  # local import: only this check needs it
        warning_level = warning_level or self.get_default('check_cannary_test', 'warning')
        critical_level = critical_level or self.get_default('check_cannary_test', 'critical')
        # this does not check for a timeout, we assume NRPE or Nagios will alert on that timeout.
        try:
            query = self.query
            if not isinstance(query, dict):
                # The option arrives as a JSON string; handing the raw string
                # to find_one would search for a document with that _id.
                query = json.loads(query)
            start = time.time()
            self.connection[self.database][self.collection].find_one(query)
            time_range = time.time() - start
            message = "Collection %s.%s query took: %d s" % (self.database, self.collection, time_range)
            return self.check_levels(time_range, warning_level, critical_level, message)
        except Exception as e:
            message = "Collection %s.%s query FAILED: %s" % (self.database, self.collection, e)
            return "critical", message

    def check_repl_lag(self, args, warning_level, critical_level):
        """Check how far this member's optime trails the primary's, in hours.

        When there is no primary, the member with the newest optime is used
        as the reference.
        """
        warning_level = warning_level or self.get_default('check_repl_lag', 'warning')
        critical_level = critical_level or self.get_default('check_repl_lag', 'critical')
        # make a write in case the client is not writing, but use an update to avoid wasting space
        try:
            self.connection['test']['lag_check'].update({"_id": 1}, {"_id": 1, "x": 1})
        except Exception:
            # Best effort: the write fails when connected to a member that
            # does not accept writes, which must not stop the lag check.
            pass
        # get a fresh status for the replset
        try:
            replset_status = self.connection['admin'].command("replSetGetStatus")
        except Exception as e:
            return "critical", "Are your running with --replset? - %s" % (e)
        members = replset_status['members']
        primary = None
        host_member = None
        for member in members:
            if member['stateStr'] == "PRIMARY":
                primary = member
            if member.get('self') is True:
                host_member = member
        if host_member is None or 'optimeDate' not in host_member:
            return "unknown", "Unable to find this member's optime in replSetGetStatus"
        if primary is not None and 'optimeDate' in primary:
            reference = primary
        else:
            # find the most current member as there is no primary
            candidates = [member for member in members if 'optimeDate' in member]
            reference = max(candidates, key=lambda member: member['optimeDate'])
        highest_name = reference['name']
        lag = reference['optimeDate'] - host_member['optimeDate']
        # total_seconds() includes whole days; clamp so a member that is
        # ahead of the reference reports zero lag.
        rep_lag_seconds = max(lag.total_seconds(), 0)
        rep_lag_hours = round(rep_lag_seconds / 3600.0, 4)
        message = "Lagging %s by %.4f hours" % (highest_name, rep_lag_hours)
        return self.check_levels(rep_lag_hours, warning_level, critical_level, message)
#
# main app
#
if __name__ == "__main__":
    # Run the selected check with the command-line arguments; main() returns
    # nothing, so the process status comes from the check's own exit call.
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)
# ############################################################################
# Documentation
# ############################################################################
"""
=pod
=head1 NAME
pmp-check-mongo.py - MongoDB Nagios check script.
=head1 SYNOPSIS
Usage: pmp-check-mongo.py [options]
Options:
-h, --help show this help message and exit
-H HOST, --host=HOST The hostname you want to connect to
-P PORT, --port=PORT The port mongodb is running on
-u USER, --user=USER The username you want to login as
-p PASSWD, --password=PASSWD
The password you want to use for that user
-W WARNING, --warning=WARNING
The warning threshold you want to set
-C CRITICAL, --critical=CRITICAL
The critical threshold you want to set
-A ACTION, --action=ACTION
The action you want to take. Valid choices are
(check_connections, check_election, check_lock_pct,
check_repl_lag, check_flushing, check_total_indexes,
check_balance, check_queues, check_cannary_test,
check_have_primary, check_oplog, check_index_ratio,
check_connect) Default: check_connect
-s SSL, --ssl=SSL Connect using SSL
-r REPLICASET, --replicaset=REPLICASET
Connect to replicaset
-c COLLECTION, --collection=COLLECTION
Specify the collection in check_cannary_test
-d DATABASE, --database=DATABASE
Specify the database in check_cannary_test
-q QUERY, --query=QUERY
Specify the query in check_cannary_test
--statusfile=STATUS_FILENAME
File to current store state data in for delta checks
--backup-statusfile=STATUS_FILENAME_BACKUP
File to previous store state data in for delta checks
--max-stale=MAX_STALE
Age of status file to make new checks (seconds)
=head1 COPYRIGHT, LICENSE, AND WARRANTY
This program is copyright 2014 Percona LLC and/or its affiliates.
Feedback and improvements are welcome.
THIS PROGRAM IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation, version 2. You should have received a copy of the GNU General
Public License along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
=head1 VERSION
$PROJECT_NAME$ pmp-check-mongo.py $VERSION$
=cut
"""