initial commit
This commit is contained in:
parent
27957d4418
commit
85029a0f01
20
defaults/main.yml
Normal file
20
defaults/main.yml
Normal file
@ -0,0 +1,20 @@
|
||||
---
|
||||
nrpe_allowed_hosts: '127.0.0.1,212.85.154.82,51.158.69.165'
|
||||
|
||||
nrpe_load_warning: '`cat /proc/cpuinfo |grep -c processor`'
|
||||
nrpe_load_critical: '`echo "$(($(cat /proc/cpuinfo |grep -c processor) * 2 ))"`'
|
||||
|
||||
nrpe_memory_warning: 80
|
||||
nrpe_memory_critical: 90
|
||||
|
||||
nrpe_swap_warning: 40
|
||||
nrpe_swap_critical: 60
|
||||
|
||||
nrpe_exim_warning: 10
|
||||
nrpe_exim_critical: 20
|
||||
|
||||
nrpe_postfix_warning: 10
|
||||
nrpe_postfix_critical: 20
|
||||
|
||||
nrpe_eth_warning: '12M'
|
||||
nrpe_eth_critical: '15M'
|
344
files/nrpe/check_3ware
Executable file
344
files/nrpe/check_3ware
Executable file
@ -0,0 +1,344 @@
|
||||
#!/usr/bin/perl
|
||||
|
||||
# -------------------------------------------------------
|
||||
# -=- <check_3ware-raid.pl> -=-
|
||||
# -------------------------------------------------------
|
||||
#
|
||||
# Description : yet another plugin to check your 3ware RAID
|
||||
# controller
|
||||
#
|
||||
# Version : 0.1
|
||||
# -------------------------------------------------------
|
||||
# In :
|
||||
# - see the How to use section
|
||||
#
|
||||
# Out :
|
||||
# - only print on the standard output
|
||||
#
|
||||
# Features :
|
||||
# - perfdata output
|
||||
#
|
||||
# Fix Me/Todo :
|
||||
# - too many things ;) but let me know what you think about it
|
||||
#
|
||||
# ####################################################################
|
||||
|
||||
# ####################################################################
|
||||
# GPL v3
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
# ####################################################################
|
||||
|
||||
# ####################################################################
|
||||
# How to use :
|
||||
# ------------
|
||||
#
|
||||
# 1 to use this script you first have to install tw_cli. You can find
|
||||
# the source here : http://www.3ware.com/support/download.asp
|
||||
# just follow the instructions to compile and deploy it
|
||||
#
|
||||
# 2 then you just have to run the following command :
|
||||
# $ ./check_3ware-raid.pl --help
|
||||
#
|
||||
# If you need to use this script with NRPE you just have to do the
|
||||
# following steps :
|
||||
#
|
||||
# 1 allow your user to run the script with the sudo rights. Just add
|
||||
# something like that in your /etc/sudoers (use visudo) :
|
||||
# nagios ALL=(ALL) NOPASSWD: /<path-to>/check_3ware-raid.pl
|
||||
#
|
||||
# 2 then just add this kind of line in your NRPE config file :
|
||||
# command[check_3ware]=/usr/bin/sudo /<path-to>/check_3ware-raid.pl
|
||||
#
|
||||
# 3 don't forget to restart your NRPE daemon
|
||||
#
|
||||
# ####################################################################
|
||||
|
||||
# ####################################################################
|
||||
# Changelog :
|
||||
# -----------
|
||||
#
|
||||
# --------------------------------------------------------------------
|
||||
# Date:28/11/2009 Version:0.1 Author:Erwan Ben Souiden
|
||||
# >> creation
|
||||
# ####################################################################
|
||||
|
||||
# ####################################################################
|
||||
# Don't touch anything under this line!
|
||||
# You shall not pass - Gandalf is watching you
|
||||
# ####################################################################
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
use Getopt::Long qw(:config no_ignore_case);
|
||||
|
||||
# Generic variables
|
||||
# -----------------
|
||||
# Script identity, interpolated into the --help and --version texts
my $version     = '0.1';
my $author      = 'Erwan Labynocle Ben Souiden';
my $a_mail      = 'erwan@aleikoum.net';
my $script_name = 'check_3ware-raid.pl';

# Command-line switches; all start disabled and are set by GetOptions
my ($verbose_value, $version_value, $more_value, $help_value, $perfdata_value)
    = (0, 0, 0, 0, 0);

# Exit code for each plugin state name
my %ERRORS = (
    'OK'        => 0,
    'WARNING'   => 1,
    'CRITICAL'  => 2,
    'UNKNOWN'   => 3,
    'DEPENDENT' => 4,
);

# Plugin default variables
# ------------------------
my $display       = 'CHECK 3ware RAID - ';   # prefix of every output line
my $warning       = 1;                       # disk_check warning threshold
my $critical      = 2;                       # disk_check critical threshold
my $tw_cli_path   = '/usr/sbin/tw_cli';
my $id_controller = "";                      # empty means "every controller"
my $action        = 'disk_check';
|
||||
|
||||
# Parse the command line. Every option is declared once, with its short and
# long spellings as aliases of the same target variable.
# GetOptions returns false when it meets an unknown option or a malformed
# value (for instance a non-integer threshold). In that case stop with UNKNOWN
# rather than silently running the check with default settings.
my $options_ok = GetOptions(
    'P|path-tw_cli=s' => \$tw_cli_path,
    'w|warning=i'     => \$warning,
    'c|critical=i'    => \$critical,
    'a|action=s'      => \$action,
    'C|controller=s'  => \$id_controller,
    'm|more'          => \$more_value,
    'V|version'       => \$version_value,
    'h|H|help'        => \$help_value,
    'D|display=s'     => \$display,
    'p|perfdata'      => \$perfdata_value,
    'v|verbose'       => \$verbose_value,
);
if (! $options_ok) {
    print $display.'UNKNOWN - invalid command line, run with --help to list the options'."\n";
    exit $ERRORS{"UNKNOWN"};
}
|
||||
|
||||
# --help and --version print their text and exit on their own.
print_usage()   if ($help_value);
print_version() if ($version_value);


# Syntax check of your specified options
# --------------------------------------

print "DEBUG : action : $action, path-tw_cli : $tw_cli_path\n" if ($verbose_value);
if (($action eq "") or ($tw_cli_path eq "")) {
    print $display.'one or more following arguments are missing :action/path-tw_cli'."\n";
    exit $ERRORS{"UNKNOWN"};
}

print "DEBUG : check if $tw_cli_path exists and is executable\n" if ($verbose_value);
if (! -x $tw_cli_path) {
    print $display."$tw_cli_path".' is not executable by you'."\n";
    exit $ERRORS{"UNKNOWN"};
}

print "DEBUG : warning threshold : $warning, critical threshold : $critical\n" if ($verbose_value);
if (($critical < 0) or ($warning < 0) or ($critical < $warning)) {
    print $display.'the thresholds must be integers and the critical threshold higher or equal than the warning threshold'."\n";
    exit $ERRORS{"UNKNOWN"};
}

print "DEBUG : controller : $id_controller\n" if ($verbose_value);
if ($id_controller ne "") {
    # The tw_cli commands are built as "/<id> show", so a user-supplied
    # leading slash would be doubled. Accept both "c1" and "/c1".
    $id_controller =~ s{\A/+}{};

    # The id is later interpolated into a command line; only the cN form
    # (the same shape list_all_controller extracts) is let through.
    if ($id_controller !~ /\Ac\d+\z/) {
        print $display.'UNKNOWN - invalid controller id, expected something like c0'."\n";
        exit $ERRORS{"UNKNOWN"};
    }

    if (check_controller("$tw_cli_path",$id_controller) != 0) {
        print $display.'UNKNOWN - problem with the controller '."$id_controller ".'may be it does not exist'."\n";
        exit $ERRORS{"UNKNOWN"};
    }
}
|
||||
|
||||
# Core script
|
||||
# -----------
|
||||
# $return      : summary text of the output line
# $return_more : per-disk / per-unit details, appended only with --more
# $plugstate   : key of %ERRORS used as the exit status
my ($return,$return_more,$plugstate) = ("","","OK");

# Either the single controller requested with -C, or every controller
# reported by tw_cli.
my @controller_list;
if (! $id_controller) {
    @controller_list = list_all_controller("$tw_cli_path");
    if (! @controller_list) {
        print $display.'UNKNOWN - problem to have the controllers list'."\n";
        exit $ERRORS{"UNKNOWN"};
    }
}
else {
    push(@controller_list,$id_controller);
}

print "DEBUG : action = $action\n" if ($verbose_value);

my @show_return;

# disk_check action
# -----------------
if ($action eq 'disk_check') {
    my ($c_ok,$c_other) = (0,0);
    foreach my $controller (@controller_list) {
        @show_return = `$tw_cli_path /$controller show`;
        # A failed query yields no port lines; without this test the plugin
        # would report "0 disk(s) ..." with an OK state.
        if ($? != 0) {
            print $display.'UNKNOWN - problem while querying the controller '."$controller\n";
            exit $ERRORS{"UNKNOWN"};
        }
        foreach my $line (@show_return) {
            # port lines look like "p0  OK  ..."
            if ($line =~ /^(p\d+)\s+(\S+)\s/ ) {
                my ($port,$status) = ($1,$2);
                print "DEBUG : disk $port/status $status\n" if ($verbose_value);
                $c_ok++ if ($status eq "OK");
                # empty slots are not counted as faulty disks
                $c_other++ if (($status ne "OK") and ($status ne "NOT-PRESENT"));
                $return_more .= " ($port,$status)";
            }
        }
    }
    # The summary is built once, after every controller has been counted, so
    # the text and perfdata are not repeated per controller.
    $return .= "$c_ok disk(s) detected as OK";
    $return .= " and $c_other with potential problem" if ($c_other);
    $return .= " -$return_more" if ($more_value);
    $return .= " | disksOK=$c_ok disksNOK=$c_other" if ($perfdata_value);
    $plugstate = "WARNING" if ($c_other >= $warning);
    $plugstate = "CRITICAL" if ($c_other >= $critical);
}

# unit action
# -----------
elsif ($action eq 'unit_check') {
    my ($c_ok,$c_rebuild,$c_other) = (0,0,0);
    foreach my $controller (@controller_list) {
        @show_return = `$tw_cli_path /$controller show`;
        if ($? != 0) {
            print $display.'UNKNOWN - problem while querying the controller '."$controller\n";
            exit $ERRORS{"UNKNOWN"};
        }
        foreach my $line (@show_return) {
            # unit lines look like "u0  RAID-1  OK  ..."
            if ($line =~ /^(u\d+)\s+(\S+)\s+(\S+)/) {
                my ($unit,$type,$status) = ($1,$2,$3);
                print "DEBUG : disk $unit/type $type/status $status\n" if ($verbose_value);
                $c_ok++ if ($status eq "OK");
                $c_rebuild++ if ($status eq "REBUILD");
                $c_other++ if (($status ne "OK") and ($status ne "REBUILD"));
                $return_more .= " ($unit,$type,$status)";
            }
        }
    }
    $return .= "$c_ok unit(s) detected as OK";
    $return .= " and $c_rebuild as REBUILD" if ($c_rebuild);
    # leading space so the sentence does not run into the previous part
    $return .= " and $c_other with potential problem" if ($c_other);
    $return .= " -$return_more" if ($more_value);
    $return .= " | unitOK=$c_ok unitREBUILD=$c_rebuild unitNOK=$c_other" if ($perfdata_value);
    # a rebuilding unit is a warning, any other non-OK unit is critical
    $plugstate = "WARNING" if ($c_rebuild);
    $plugstate = "CRITICAL" if ($c_other);
}

else {
    $return .= "action must be unit_check|disk_check";
    $action = "";
    $plugstate = "UNKNOWN";
}

# Single result line, newline-terminated like every other message printed
# by this script.
print $display.$action." - ".$plugstate." - ".$return."\n";
exit $ERRORS{$plugstate};
|
||||
|
||||
# ####################################################################
|
||||
# function 1 : display the help
|
||||
# ------------------------------
|
||||
# Print the help screen, then exit with the UNKNOWN status.
# Reads the file-level $script_name, $version, $author, $display and $a_mail.
# The -C example uses the bare controller id because the script itself
# prepends the "/" when it builds the tw_cli command.
sub print_usage {
    print <<EOT;
$script_name version $version by $author

This plugin checks state of your physical disks and logical units of a 3ware RAID card.

Usage: /<path-to>/$script_name [-a unit_check|disk_check] [-p] [-D "$display"] [-v] [-m] [-c 2] [-w 1] [-C c1]

Options:
 -h, --help
    Print detailed help screen
 -V, --version
    Print version information
 -D, --display=STRING
    to modify the output display...
    default is "CHECK 3ware RAID - "
 -P, --path-tw_cli=STRING
    specify the path to the tw_cli binary
    default value is /usr/sbin/tw_cli
 -a, --action=STRING
    specify the action : unit_check|disk_check
    default is disk_check
    disk_check : display state of all physical disks
    unit_check : display state of all logical unit
 -C, --controller=STRING
    allow you to specify only one controller to check
    the default behavior is to check each time every controller
 -c, --critical=INT
    specify a critical threshold for the number of disks in a non-OK state.
    default is 2
    only for the disk_check action
 -w, --warning=INT
    specify a warning threshold for the number of disks in a non-OK state.
    default is 1
    only for the disk_check action
 -m, --more
    Print a longer output. By default, the output is not complete because
    Nagios may truncate it. This option is just for you
 -p, --perfdata
    If you want to activate the perfdata output
 -v, --verbose
    Show details for command-line debugging (Nagios may truncate the output)

Send email to $a_mail if you have questions
regarding use of this software. To submit patches or suggest improvements,
send email to $a_mail
This plugin has been created by $author

Hope you will enjoy it ;)

Remember :
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.


EOT
    exit $ERRORS{"UNKNOWN"};
}
|
||||
|
||||
# function 2 : display version information
|
||||
# -----------------------------------------
|
||||
sub print_version {
    # One line "<script name> version <version>", then leave with the
    # UNKNOWN status.
    print join(' ', $script_name, 'version', $version), "\n";
    exit $ERRORS{"UNKNOWN"};
}
|
||||
|
||||
# function 3 : check if controller exists
|
||||
# ---------------------------------------
|
||||
# Run "<tw_cli> /<controller> show" and report whether it succeeded.
#   $tw_cli_path   : path of the tw_cli binary
#   $id_controller : controller id without the leading slash (e.g. c0)
# Returns the wait status of the command (0 on success), or -1 when the
# command could not be started.
# The command is started with a list-form pipe open, so neither argument is
# ever interpreted by a shell.
sub check_controller {
    my ($tw_cli_path,$id_controller) = @_;

    # The child inherits our file descriptors: point STDERR at /dev/null
    # while it is spawned, then restore it, so the probe stays silent.
    open(my $saved_stderr, '>&', \*STDERR) or return -1;
    if (! open(STDERR, '>', '/dev/null')) {
        open(STDERR, '>&', $saved_stderr);
        close($saved_stderr);
        return -1;
    }

    my $pid = open(my $tw_cli_out, '-|', $tw_cli_path, "/$id_controller", 'show');

    open(STDERR, '>&', $saved_stderr);
    close($saved_stderr);

    return -1 if (! $pid);

    # Only the exit status matters: drain and discard the output.
    1 while (<$tw_cli_out>);
    close($tw_cli_out);    # waits for the child and sets $?
    return $?;
}
|
||||
|
||||
# function 4 : return the controllers list
|
||||
# ----------------------------------------
|
||||
# Return the ids (c0, c1, ...) of the controllers listed by "<tw_cli> show".
#   $tw_cli_path : path of the tw_cli binary
# Returns an empty list when the command cannot be started or exits with a
# non-zero status.
# The command is started with a list-form pipe open, so the path is never
# interpreted by a shell.
sub list_all_controller {
    my ($tw_cli_path) = @_;
    my @controller_list;

    my $pid = open(my $tw_cli_out, '-|', $tw_cli_path, 'show');
    return @controller_list if (! $pid);

    my @cmd_output = <$tw_cli_out>;
    close($tw_cli_out);    # waits for the child and sets $?

    if ($? == 0) {
        foreach my $line (@cmd_output) {
            # controller lines start with the id followed by whitespace
            push(@controller_list,$1) if ($line =~ /^(c\d+)\s/);
        }
    }
    return @controller_list;
}
|
BIN
files/nrpe/check_disk_advanced
Executable file
BIN
files/nrpe/check_disk_advanced
Executable file
Binary file not shown.
BIN
files/nrpe/check_dns
Executable file
BIN
files/nrpe/check_dns
Executable file
Binary file not shown.
985
files/nrpe/check_docker
Executable file
985
files/nrpe/check_docker
Executable file
@ -0,0 +1,985 @@
|
||||
#!/usr/bin/env python3
|
||||
# logging.basicConfig(level=logging.DEBUG)
|
||||
import math
|
||||
from collections import deque, namedtuple, UserDict, defaultdict
|
||||
from sys import argv
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import socket
|
||||
import stat
|
||||
import traceback
|
||||
from concurrent import futures
|
||||
from datetime import datetime, timezone
|
||||
from functools import lru_cache
|
||||
from http.client import HTTPConnection
|
||||
from urllib import request
|
||||
from urllib.error import HTTPError, URLError
|
||||
from urllib.request import AbstractHTTPHandler, HTTPHandler, HTTPSHandler, OpenerDirector, HTTPRedirectHandler, \
|
||||
Request, HTTPBasicAuthHandler
|
||||
|
||||
logger = logging.getLogger()
|
||||
__author__ = 'Tim Laurence'
|
||||
__copyright__ = "Copyright 2018"
|
||||
__credits__ = ['Tim Laurence']
|
||||
__license__ = "GPL"
|
||||
__version__ = "2.1.0"
|
||||
|
||||
'''
|
||||
nrpe compatible check for docker containers.
|
||||
|
||||
Requires Python 3
|
||||
|
||||
Note: I really would have preferred to have used requests for all the network connections but that would have added a
|
||||
dependency.
|
||||
'''
|
||||
|
||||
# Connection and reporting defaults
DEFAULT_SOCKET = '/var/run/docker.sock'
DEFAULT_TIMEOUT = 10.0
DEFAULT_PORT = 2375
DEFAULT_MEMORY_UNITS = 'B'
DEFAULT_HEADERS = [('Accept', 'application/vnd.docker.distribution.manifest.v2+json')]
DEFAULT_PUBLIC_REGISTRY = 'registry-1.docker.io'

# Each unit maps to the power the base is raised to.
UNIT_ADJUSTMENTS_TEMPLATE = dict(zip(
    ('%', 'B', 'KB', 'MB', 'GB', 'TB'),
    (0, 0, 1, 2, 3, 4),
))
unit_adjustments = None

# Reduce the message to a single OK unless a check fails.
no_ok = False

# Suppress performance data reporting
no_performance = False

# Plugin return codes
OK_RC, WARNING_RC, CRITICAL_RC, UNKNOWN_RC = 0, 1, 2, 3

# These hold the final results
rc = -1
messages = []
performance_data = []

ImageName = namedtuple('ImageName', ['registry', 'name', 'tag', 'full_name'])
|
||||
|
||||
|
||||
class ThresholdSpec(UserDict):
    """
    Warning/critical thresholds plus their units.

    The three values are stored as the dict keys 'warn', 'crit' and 'units' and are also readable as
    attributes (``spec.warn``).
    """

    def __init__(self, warn, crit, units=''):
        super().__init__(warn=warn, crit=crit, units=units)

    def __getattr__(self, item):
        # Only called when normal attribute lookup fails. 'data' is UserDict's own storage: if it is
        # missing, looking it up through self[...] would recurse forever.
        if item == 'data':
            raise AttributeError(item)
        try:
            return self[item]
        except KeyError:
            # hasattr(), copy and pickle rely on AttributeError, not KeyError, for missing names.
            raise AttributeError(item) from None
|
||||
|
||||
|
||||
# Upper bound on parallelism. The checks are generally not CPU bound, so this is a worst-case cap.
DEFAULT_PARALLELISM = 10

# Every submitted thread is collected here
threads = list()

# Switched on during testing
DISABLE_THREADING = False
|
||||
|
||||
|
||||
# Hacked up urllib to handle sockets
|
||||
#############################################################################################
|
||||
# Docker runs an http connection over a socket. http.client knows how to deal with these
|
||||
# but lacks some niceties. Urllib wraps that and makes up for some of the deficiencies but
|
||||
# cannot fix the fact http.client can't read from socket files. In order to take advantage of
|
||||
# urllib and http.client's capabilities the class below tweaks HttpConnection and passes it
|
||||
# to urllib registering for socket:// connections
|
||||
|
||||
class SocketFileHandler(AbstractHTTPHandler):
    """urllib handler that serves ``socket:`` URLs by speaking HTTP over a UNIX socket file."""

    class SocketFileToHttpConnectionAdaptor(HTTPConnection):
        """HTTPConnection whose transport is a UNIX domain socket instead of TCP."""

        def __init__(self, socket_file, timeout=DEFAULT_TIMEOUT):
            # host and port are placeholders; connect() uses the socket file instead
            HTTPConnection.__init__(self, host='', port=0, timeout=timeout)
            self.socket_file = socket_file

        def connect(self):
            unix_socket = socket.socket(family=socket.AF_UNIX, type=socket.SOCK_STREAM, proto=0, fileno=None)
            self.sock = unix_socket
            unix_socket.settimeout(self.timeout)
            unix_socket.connect(self.socket_file)

    def socket_open(self, req):
        # The selector holds "<socket file>:<http path>". The socket file is stored as the host (it is
        # what the adaptor receives as its first argument) and the remainder becomes the request path.
        req.host, req.selector = req.selector.split(':', 1)
        return self.do_open(self.SocketFileToHttpConnectionAdaptor, req)
|
||||
|
||||
|
||||
# Tokens are not cached because I expect the callers to cache the responses
|
||||
class Oauth2TokenAuthHandler(HTTPBasicAuthHandler):
    """
    Response handler that answers a 401 "Bearer" challenge by fetching a token from the advertised realm
    and replaying the request with an Authorization header.
    """

    # Class-level, so shared by all instances: number of bearer challenges seen per URL.
    auth_failure_tracker = defaultdict(int)

    def http_response(self, request, response):
        """Return ``response`` untouched unless it is a 401 carrying a Bearer challenge."""
        www_authenticate_header = response.headers.get('www-authenticate', None)
        if response.code == 401 and www_authenticate_header:
            scheme = www_authenticate_header.split()[0]
            if scheme.lower() == 'bearer':
                return self.process_oauth2(request, response, www_authenticate_header)

        return response

    https_response = http_response

    @staticmethod
    def _get_outh2_token(www_authenticate_header, timeout=None):
        """
        Fetch a bearer token from the auth server described by the challenge header.

        :param www_authenticate_header: Value of the WWW-Authenticate header; must carry realm, scope and
                                        service as key="value" pairs
        :param timeout: Timeout for the token request; None keeps urlopen's default
        :return: The 'token' field of the auth server's JSON answer
        """
        auth_fields = dict(re.findall(r"""(?:(?P<key>[^ ,=]+)="([^"]+)")""", www_authenticate_header))

        auth_url = "{realm}?scope={scope}&service={service}".format(
            realm=auth_fields['realm'],
            scope=auth_fields['scope'],
            service=auth_fields['service'],
        )
        token_request = Request(auth_url)
        token_request.add_header("Content-Type", "application/x-www-form-urlencoded; charset=utf-8")
        if timeout is None:
            token_response = request.urlopen(token_request)
        else:
            # Without a timeout an unresponsive auth server would hang the whole check.
            token_response = request.urlopen(token_request, timeout=timeout)
        return process_urllib_response(token_response)['token']

    def process_oauth2(self, request, response, www_authenticate_header):
        """
        Obtain a token and replay ``request`` with it.

        :raises HTTPError: 401, when the same URL is challenged a second time
        """
        # This keeps infinite auth loops from happening
        full_url = request.full_url
        self.auth_failure_tracker[full_url] += 1
        if self.auth_failure_tracker[full_url] > 1:
            raise HTTPError(full_url, 401, "Stopping Oauth2 failure loop for {}".format(full_url),
                            response.headers, response)

        # The token fetch is bound by the same timeout as the request it serves.
        auth_token = self._get_outh2_token(www_authenticate_header, timeout=request.timeout)

        request.add_unredirected_header('Authorization', 'Bearer ' + auth_token)
        return self.parent.open(request, timeout=request.timeout)
|
||||
|
||||
|
||||
# Got some help from this example https://gist.github.com/FiloSottile/2077115
|
||||
class HeadRequest(Request):
    """A urllib Request that is always sent with the HEAD method."""

    def get_method(self):
        http_verb = "HEAD"
        return http_verb
|
||||
|
||||
|
||||
# Opener used for every request: plain http/https, redirects, the socket-file handler and the bearer
# token handler, registered in this order.
better_urllib_get = OpenerDirector()
better_urllib_get.addheaders = list(DEFAULT_HEADERS)
for _handler_class in (HTTPHandler, HTTPSHandler, HTTPRedirectHandler, SocketFileHandler, Oauth2TokenAuthHandler):
    better_urllib_get.add_handler(_handler_class())
del _handler_class
|
||||
|
||||
|
||||
class RegistryError(Exception):
    """Raised when a registry response cannot be used; the response is kept in ``response_obj``."""

    def __init__(self, response):
        # Hand the response to Exception as well, so args and str() of the error are not empty.
        super().__init__(response)
        self.response_obj = response
|
||||
|
||||
|
||||
# Util functions
|
||||
#############################################################################################
|
||||
def parse_thresholds(spec, include_units=True, units_required=True):
    """
    Given a spec string break it up into ':' separated chunks. Convert strings to ints as it makes sense

    :param spec: The threshold specification being parsed, WARN:CRIT[:UNITS]
    :param include_units: Specifies that units should be processed and returned if present
    :param units_required: Mark spec as invalid if the units are missing.
    :return: A ThresholdSpec holding warn, crit and units (units is '' when not included or not present)
    :raises ValueError: If a chunk is blank, a threshold is missing or not an integer, required units are
                        missing, or there are too many chunks
    """
    parts = deque(spec.split(':'))
    if not all(parts):
        raise ValueError("Blanks are not allowed in a threshold specification: {}".format(spec))

    # Both thresholds are mandatory; report a short spec as a ValueError like every other malformed
    # spec instead of letting popleft() fail on an empty deque.
    if len(parts) < 2:
        raise ValueError("Both warning and critical thresholds are required in {}".format(spec))

    # Warn
    warn = int(parts.popleft())
    # Crit
    crit = int(parts.popleft())

    units = ''
    if include_units:
        if len(parts):
            # units
            units = parts.popleft()
        elif units_required:
            raise ValueError("Missing units in {}".format(spec))

    if len(parts) != 0:
        raise ValueError("Too many threshold specifiers in {}".format(spec))

    return ThresholdSpec(warn=warn, crit=crit, units=units)
|
||||
|
||||
|
||||
def pretty_time(seconds):
    """
    Break a duration into day/hour/minute/second components.

    :param seconds: Duration in seconds
    :return: List of strings such as ['1d', '2h', '3min', '4s']; larger units are left out when the
             duration is shorter than one of them, the seconds entry is always present
    """
    remainder = seconds
    result = []
    # '>=' so that an exact unit boundary is reported in that unit (60 -> '1min', not '60s')
    if remainder >= 24 * 60 * 60:
        days, remainder = divmod(remainder, 24 * 60 * 60)
        result.append("{}d".format(int(days)))
    if remainder >= 60 * 60:
        hours, remainder = divmod(remainder, 60 * 60)
        result.append("{}h".format(int(hours)))
    if remainder >= 60:
        minutes, remainder = divmod(remainder, 60)
        result.append("{}min".format(int(minutes)))
    result.append("{}s".format(int(remainder)))
    return result
|
||||
|
||||
|
||||
def evaluate_numeric_thresholds(container, value, thresholds, name, short_name,
                                min=None, max=None, greater_than=True):
    """
    Compare a measured value with its thresholds, record the outcome and the matching performance data.

    :param container: Container name, used as prefix of the performance label and of the message
    :param value: Measured value
    :param thresholds: Object exposing warn, crit and units, both as attributes and as a mapping
    :param name: Name of the measurement as used in the message
    :param short_name: Name of the measurement as used in the performance label
    :param min: Optional lower bound reported in the performance data
    :param max: Optional upper bound reported in the performance data
    :param greater_than: When True values at or above a threshold trip it, otherwise values at or below
    """
    def rounder(x):
        return round(x, 2)

    INTEGER_UNITS = ['B', '%', '']

    # Some units don't have decimal places
    integer_units = thresholds.units in INTEGER_UNITS
    rounded_value = int(value) if integer_units else rounder(value)

    perf_string = "{container}_{short_name}={value}{units};{warn};{crit}".format(
        container=container,
        short_name=short_name,
        value=rounded_value,
        **thresholds)
    if min is not None:
        rounded_min = math.floor(min) if integer_units else rounder(min)
        perf_string += ';{}'.format(rounded_min)
    elif max is not None:
        # Fields are positional: keep the min slot (empty) so max does not slide into it.
        perf_string += ';'
    if max is not None:
        rounded_max = math.ceil(max) if integer_units else rounder(max)
        perf_string += ';{}'.format(rounded_max)

    performance_data.append(perf_string)

    if thresholds.units == 's':
        # Only the two most significant time components are shown
        nice_time = ' '.join(pretty_time(rounded_value)[:2])
        results_str = "{} {} is {}".format(container, name, nice_time)
    else:
        results_str = "{} {} is {}{}".format(container, name, rounded_value, thresholds.units)

    def tripped(threshold):
        return value >= threshold if greater_than else value <= threshold

    # The unrounded value is what gets compared
    if tripped(thresholds.crit):
        critical(results_str)
    elif tripped(thresholds.warn):
        warning(results_str)
    else:
        ok(results_str)
|
||||
|
||||
|
||||
@lru_cache(maxsize=None)
def get_url(url):
    """
    GET ``url`` and decode the JSON body. Results are cached per URL.

    :return: Tuple of (decoded body, HTTP status)
    """
    logger.debug("get_url: {}".format(url))
    response = better_urllib_get.open(url, timeout=timeout)
    status = response.status
    logger.debug("get_url: {} {}".format(url, status))
    payload = process_urllib_response(response)
    return payload, status
|
||||
|
||||
|
||||
@lru_cache(maxsize=None)
def head_url(url):
    """Send a HEAD request for ``url`` (redirects are followed) and return the response. Cached per URL."""
    head_request = HeadRequest(url)
    response = better_urllib_get.open(head_request, timeout=timeout)
    logger.debug("{} {}".format(url, response.status))
    return response
|
||||
|
||||
|
||||
def process_urllib_response(response):
    """Read the whole response body, decode it as UTF-8 and return the parsed JSON."""
    return json.loads(response.read().decode('utf-8'))
|
||||
|
||||
|
||||
def get_container_info(name):
    """Return the decoded answer of the daemon's /containers/<name>/json endpoint."""
    endpoint = '/containers/{container}/json'.format(container=name)
    info, _status = get_url(daemon + endpoint)
    return info
|
||||
|
||||
|
||||
def get_image_info(name):
    """Return the decoded answer of the daemon's /images/<name>/json endpoint."""
    endpoint = '/images/{image}/json'.format(image=name)
    info, _status = get_url(daemon + endpoint)
    return info
|
||||
|
||||
|
||||
def get_state(container):
    """Return the 'State' entry of the container's inspection data."""
    inspection = get_container_info(container)
    return inspection['State']
|
||||
|
||||
|
||||
def get_stats(container):
    """Return the decoded answer of the daemon's /containers/<container>/stats?stream=0 endpoint."""
    endpoint = '/containers/{container}/stats?stream=0'.format(container=container)
    stats, _status = get_url(daemon + endpoint)
    return stats
|
||||
|
||||
|
||||
def get_ps_name(name_list):
    """
    Pick the first name that starts with a '/' but contains no other '/' and return it without that '/'.

    :param name_list: Candidate names
    :return: The selected name, leading '/' removed
    :raises NameError: If no name in the list has that shape
    """
    for name in name_list:
        # startswith() also copes with an empty string, which name[0] would not
        if name.startswith('/') and '/' not in name[1:]:
            return name[1:]

    raise NameError("Error when trying to identify 'ps' name in {}".format(name_list))
|
||||
|
||||
|
||||
def get_containers(names, require_present):
    """
    Resolve the requested container name patterns against the containers known to the daemon.

    :param names: Regular expressions that must match a whole container name; 'all' selects everything
    :param require_present: Report a critical result for every pattern that matches no container
    :return: Set of matching container names
    """
    containers_list, _ = get_url(daemon + '/containers/json?all=1')

    all_container_names = set(get_ps_name(x['Names']) for x in containers_list)

    if 'all' in names:
        return all_container_names

    filtered = set()
    for matcher in names:
        # fullmatch() anchors the pattern as a whole. Wrapping it in '^...$' does not: with an
        # alternation such as 'web|db' the anchors bind to the branches, so 'website' would match.
        pattern = re.compile(matcher)
        matches = {candidate for candidate in all_container_names if pattern.fullmatch(candidate)}
        filtered |= matches
        # If we don't find a container that matches our regex
        if require_present and not matches:
            critical("No containers match {}".format(matcher))

    return filtered
|
||||
|
||||
|
||||
def get_container_digest(container):
    """
    Return the digest part (after the '@') of the first RepoDigests entry of the container's image,
    or None when that entry, or its digest part, is not there.
    """
    image_id = get_container_info(container)['Image']
    repo_digests = get_image_info(image_id)['RepoDigests']
    try:
        first_digest = repo_digests[0]
        return first_digest.split('@')[1]
    except IndexError:
        return None
|
||||
|
||||
|
||||
def get_container_image_urls(container):
    """Return the 'RepoTags' entry of the image the container was created from."""
    image_id = get_container_info(container)['Image']
    return get_image_info(image_id)['RepoTags']
|
||||
|
||||
|
||||
def normalize_image_name_to_manifest_url(image_name, insecure_registries):
    """
    Build the registry manifest URL of an image.

    :param image_name: Image name as understood by parse_image_name
    :param insecure_registries: Registries to query over http instead of https (compared case-insensitively)
    :return: Tuple of (manifest url, registry)
    """
    parsed = parse_image_name(image_name)
    insecure = [entry.lower() for entry in insecure_registries]

    # Registry query url
    if parsed.registry.lower() in insecure:
        scheme = 'http'
    else:
        scheme = 'https'
    manifest_url = '{scheme}://{registry}/v2/{image_name}/manifests/{image_tag}'.format(
        scheme=scheme,
        registry=parsed.registry,
        image_name=parsed.name,
        image_tag=parsed.tag)
    return manifest_url, parsed.registry
|
||||
|
||||
|
||||
# Auth servers seem picky about being hit too hard. Can't figure out why. ;)
|
||||
# As result it is best to single thread this check
|
||||
# This is based on https://docs.docker.com/registry/spec/auth/token/#requesting-a-token
|
||||
def get_digest_from_registry(url):
    """
    Return the Docker-Content-Digest header of a HEAD request to ``url``.

    :raises RegistryError: If the response carries no such header
    """
    logger.debug("get_digest_from_registry")
    # query registry
    # TODO: Handle logging in if needed
    registry_info = head_url(url=url)

    digest = registry_info.getheader('Docker-Content-Digest', None)
    if digest is not None:
        return digest
    raise RegistryError(response=registry_info)
|
||||
|
||||
|
||||
def set_rc(new_rc):
    """Raise the global return code to ``new_rc``; it is never lowered."""
    global rc
    if new_rc > rc:
        rc = new_rc
|
||||
|
||||
|
||||
def ok(message):
    """Record an OK result: raise the return code to OK_RC and store the prefixed message."""
    prefix = 'OK: '
    set_rc(OK_RC)
    messages.append(prefix + message)
|
||||
|
||||
|
||||
def warning(message):
    """Record a WARNING result: raise the return code to WARNING_RC and store the prefixed message."""
    prefix = 'WARNING: '
    set_rc(WARNING_RC)
    messages.append(prefix + message)
|
||||
|
||||
|
||||
def critical(message):
    """Record a CRITICAL result: raise the return code to CRITICAL_RC and store the prefixed message."""
    prefix = 'CRITICAL: '
    set_rc(CRITICAL_RC)
    messages.append(prefix + message)
|
||||
|
||||
|
||||
def unknown(message):
    """Record an UNKNOWN result: raise the return code to UNKNOWN_RC and store the prefixed message."""
    prefix = 'UNKNOWN: '
    set_rc(UNKNOWN_RC)
    messages.append(prefix + message)
|
||||
|
||||
|
||||
def require_running(name):
    """
    Decorator factory for checks that only make sense on a running container.

    The wrapped check is called only when the container's normalized state is "running"; otherwise a
    critical result naming the skipped check is recorded.

    :param name: Name of the check, used in the critical message
    """
    def inner_decorator(func):
        def wrapper(container, *args, **kwargs):
            container_state = get_state(container)
            state = normalize_state(container_state)
            if state.lower() == "running":
                func(container, *args, **kwargs)
            else:
                # container is not running, can't perform check
                critical('{container} is not "running", cannot check {check}'.format(container=container,
                                                                                      check=name))

        return wrapper

    return inner_decorator
|
||||
|
||||
|
||||
def multithread_execution(disable_threading=DISABLE_THREADING):
    """
    Decorator factory that submits the decorated check to ``parallel_executor``.

    :param disable_threading: Run the check inline instead; the module-level DISABLE_THREADING flag,
                              read at call time, has the same effect
    """
    def inner_decorator(func):
        def wrapper(container, *args, **kwargs):
            # The global is consulted on every call so it can still be flipped after decoration.
            if disable_threading or DISABLE_THREADING:
                func(container, *args, **kwargs)
            else:
                threads.append(parallel_executor.submit(func, container, *args, **kwargs))

        return wrapper

    return inner_decorator
|
||||
|
||||
|
||||
def singlethread_execution(disable_threading=DISABLE_THREADING):
    """
    Decorator factory that submits the decorated check to ``serial_executor``.

    :param disable_threading: Run the check inline instead; the module-level DISABLE_THREADING flag,
                              read at call time, has the same effect
    """
    def inner_decorator(func):
        def wrapper(container, *args, **kwargs):
            # The global is consulted on every call so it can still be flipped after decoration.
            if disable_threading or DISABLE_THREADING:
                func(container, *args, **kwargs)
            else:
                threads.append(serial_executor.submit(func, container, *args, **kwargs))

        return wrapper

    return inner_decorator
|
||||
|
||||
|
||||
def parse_image_name(image_name):
    """
    Parses image names into their constituent parts.

    A missing registry defaults to DEFAULT_PUBLIC_REGISTRY, a missing tag to 'latest', and single-component
    names on the default registry get the 'library/' prefix.

    :param image_name: Image reference of the form [registry/]name[:tag]
    :return: ImageName
    :raises ValueError: If image_name does not have that form
    """

    # These are based on information found here
    #   https://docs.docker.com/engine/reference/commandline/tag/#extended-description
    #   https://github.com/docker/distribution/blob/master/reference/regexp.go
    host_segment_re = '[a-zA-Z0-9]([a-zA-Z0-9-]*[a-zA-Z0-9])?'
    hostname_re = r'({host_segment}\.)+{host_segment}'.format(host_segment=host_segment_re)
    registry_re = r'((?P<registry>({hostname_re}(:\d+)?|{host_segment_re}:\d+))/)'.format(
        host_segment_re=host_segment_re, hostname_re=hostname_re)
    name_component_ends_re = '[a-z0-9]'
    name_component_middle_re = '[a-z0-9._-]'  # Ignoring spec limit of two _
    name_component_re = '({end}{middle}*{end}|{end})'.format(end=name_component_ends_re,
                                                             middle=name_component_middle_re)
    image_name_re = "(?P<image_name>({name_component}/)*{name_component})".format(name_component=name_component_re)
    image_tag_re = '(?P<image_tag>[a-zA-Z0-9_][a-zA-Z0-9_.-]*)'
    full_re = '^{registry}?{image_name}(:{image_tag})?$'.format(registry=registry_re, image_name=image_name_re,
                                                                image_tag=image_tag_re)
    parsed = re.match(full_re, image_name)
    if parsed is None:
        # Name the offending input instead of failing later on None.group()
        raise ValueError("Unable to parse image name: {}".format(image_name))

    registry = parsed.group('registry') if parsed.group('registry') else DEFAULT_PUBLIC_REGISTRY

    image_name = parsed.group('image_name')
    image_name = image_name if '/' in image_name or registry != DEFAULT_PUBLIC_REGISTRY else 'library/' + image_name

    image_tag = parsed.group('image_tag')
    image_tag = image_tag if image_tag else 'latest'

    full_image_name = "{registry}/{image_name}:{image_tag}".format(
        registry=registry,
        image_name=image_name,
        image_tag=image_tag)

    return ImageName(registry=registry, name=image_name, tag=image_tag, full_name=full_image_name)
|
||||
|
||||
|
||||
def normalize_state(status_info):
    """
    Reduce a container state dict to a single status string.

    :param status_info: state dict; either carries a "Status" key or the boolean flags
                        "Restarting", "Paused", "Dead" and "Running"
    :return: the "Status" value when present, otherwise the name of the first flag that is
             set (checked in the order above), otherwise 'Exited'
    """
    # Newer docker engines report the current state directly (running, stopped, paused, ...)
    if "Status" in status_info:
        return status_info['Status']

    # Older engines only expose one boolean per state; the first one set wins
    for flag in ('Restarting', 'Paused', 'Dead', 'Running'):
        if status_info[flag]:
            return flag

    return 'Exited'
|
||||
|
||||
|
||||
# Checks
|
||||
#############################################################################################
|
||||
|
||||
@multithread_execution()
@require_running(name='memory')
def check_memory(container, thresholds):
    """
    Evaluate a container's memory usage against the given thresholds.

    :param container: name of the container to check
    :param thresholds: parsed thresholds; thresholds.units must be a key of unit_adjustments
    :return: None, the outcome is reported through unknown() / evaluate_numeric_thresholds()
    """
    if thresholds.units not in unit_adjustments:
        unknown("Memory units must be one of {}".format(list(unit_adjustments.keys())))
        return

    memory_stats = get_stats(container)['memory_stats']

    # Subtracting cache to match what `docker stats` does.
    # 'total_cache' is the cgroup v1 counter; cgroup v2 hosts do not report it and
    # `docker stats` subtracts 'inactive_file' there instead. Fall back to 0 when neither exists.
    detail = memory_stats.get('stats', {})
    if 'total_cache' in detail:
        cache = detail['total_cache']
    else:
        cache = detail.get('inactive_file', 0)
    adjusted_usage = memory_stats['usage'] - cache

    if thresholds.units == '%':
        max_value = 100
        usage = int(100 * adjusted_usage / memory_stats['limit'])
    else:
        max_value = memory_stats['limit'] / unit_adjustments[thresholds.units]
        usage = adjusted_usage / unit_adjustments[thresholds.units]

    evaluate_numeric_thresholds(container=container, value=usage, thresholds=thresholds, name='memory',
                                short_name='mem', min=0, max=max_value)
|
||||
|
||||
|
||||
@multithread_execution()
def check_status(container, desired_state):
    """
    Report OK when the container's state equals desired_state (case-insensitive), CRITICAL otherwise.

    :param container: name of the container to check
    :param desired_state: expected state, e.g. 'running'
    """
    wanted = desired_state.lower()
    actual = normalize_state(get_state(container)).lower()

    if wanted == actual:
        ok("{} status is {}".format(container, desired_state))
    else:
        critical("{} state is not {}".format(container, desired_state))
|
||||
|
||||
|
||||
@multithread_execution()
@require_running('health')
def check_health(container):
    """
    Report the container's health check status.

    'healthy' is OK, 'unhealthy' is CRITICAL, any other value or missing health data is UNKNOWN.

    :param container: name of the container to check
    """
    state = get_state(container)

    if "Health" not in state or "Status" not in state["Health"]:
        unknown('{} has no health check data'.format(container))
        return

    health = state["Health"]["Status"]
    message = "{} is {}".format(container, health)

    # Pick the reporting function from the health value; anything unexpected is UNKNOWN
    report = {'healthy': ok, 'unhealthy': critical}.get(health, unknown)
    report(message)
|
||||
|
||||
|
||||
@multithread_execution()
@require_running('uptime')
def check_uptime(container, thresholds):
    """
    Evaluate how long the container has been up, in seconds, against minimum thresholds.

    :param container: name of the container to check
    :param thresholds: parsed thresholds; its units attribute is overwritten with 's'
    """
    started_at = get_container_info(container)['State']['StartedAt']

    # Only the first 19 characters (YYYY-MM-DDTHH:MM:SS) are parsed; the result is treated as UTC
    start = datetime.strptime(started_at[0:19], "%Y-%m-%dT%H:%M:%S").replace(tzinfo=timezone.utc)
    uptime = (datetime.now(timezone.utc) - start).total_seconds()

    thresholds.units = 's'
    padding_for_graph = 2
    evaluate_numeric_thresholds(container=container, value=uptime, thresholds=thresholds, name='uptime',
                                short_name='up', min=0, max=padding_for_graph, greater_than=False)
|
||||
|
||||
|
||||
@multithread_execution()
@require_running('restarts')
def check_restarts(container, thresholds):
    """
    Evaluate the container's restart count against the given thresholds.

    :param container: name of the container to check
    :param thresholds: parsed warning/critical thresholds
    """
    restart_count = int(get_container_info(container)['RestartCount'])

    padding_for_graph = 2
    evaluate_numeric_thresholds(container=container, value=restart_count, thresholds=thresholds,
                                name='restarts', short_name='re', min=0, max=padding_for_graph)
|
||||
|
||||
|
||||
@singlethread_execution()
def check_version(container, insecure_registries):
    """
    Compare the digest of the container's image with the digest published by its registry.

    OK when they match, CRITICAL when they differ, UNKNOWN when the comparison cannot be made.

    :param container: name of the container to check
    :param insecure_registries: registries to contact over plain http
    """
    local_digest = get_container_digest(container)
    if local_digest is None:
        unknown('Checksum missing for "{}", try doing a pull'.format(container))
        return

    image_urls = get_container_image_urls(container=container)
    url_count = len(image_urls)
    if url_count > 1:
        unknown('"{}" has multiple tags/names. Unsure which one to use to check the version.'.format(container))
        return
    if url_count == 0:
        unknown('"{}" has last no repository tag. Is this anywhere else?'.format(container))
        return

    url, registry = normalize_image_name_to_manifest_url(image_urls[0], insecure_registries)

    try:
        remote_digest = get_digest_from_registry(url)
    except URLError as e:
        cause = e.reason
        if getattr(cause, 'reason', None) == 'UNKNOWN_PROTOCOL':
            unknown(
                "TLS error connecting to registry {} for {}, should you use the '--insecure-registry' flag?"
                .format(registry, container))
            return
        if getattr(cause, 'strerror', None) == 'nodename nor servname provided, or not known':
            unknown(
                "Cannot reach registry for {} at {}".format(container, url))
            return
        # Any other URL error is unexpected here; let the caller deal with it
        raise
    except RegistryError:
        unknown("Cannot check version, couldn't retrieve digest for {} while checking {}.".format(container, url))
        return

    if remote_digest == local_digest:
        ok("{}'s version matches registry".format(container))
    else:
        critical("{}'s version does not match registry".format(container))
|
||||
|
||||
|
||||
def calculate_cpu_capacity_precentage(info, stats):
    """
    Compute CPU usage as a percentage of the capacity the container is allowed to use.

    :param info: container inspection dict; reads info['HostConfig'] (NanoCpus, CpuQuota, CpuPeriod)
    :param stats: stats dict; reads 'cpu_stats' and 'precpu_stats'
    :return: usage percentage rounded to a whole number (float)
    """
    limits = info['HostConfig']
    current = stats['cpu_stats']
    previous = stats['precpu_stats']

    if 'online_cpus' in current:
        cpu_count = current['online_cpus']
    else:
        cpu_count = len(current['cpu_usage']['percpu_usage'])

    # Identify limit system being used
    if limits.get('NanoCpus', 0) != 0:
        # --cpus
        period, quota = 1000000000, limits['NanoCpus']
    elif limits.get('CpuQuota', 0) != 0:
        # --cpu-quota
        period = limits['CpuPeriod'] if limits['CpuPeriod'] != 0 else 100000
        quota = limits['CpuQuota']
    else:
        # unlimited
        period, quota = 1, cpu_count

    total_capacity = period * cpu_count
    # A quota bigger than what all the cpus can provide is capped at a ratio of 1
    capacity_ratio = 1 if total_capacity < quota else total_capacity / quota

    container_delta = current['cpu_usage']['total_usage'] - previous['cpu_usage']['total_usage']
    host_delta = current['system_cpu_usage'] - previous['system_cpu_usage']

    return round((container_delta / host_delta) * capacity_ratio * 100, 0)
|
||||
|
||||
|
||||
@multithread_execution()
@require_running('cpu')
def check_cpu(container, thresholds):
    """
    Evaluate the container's CPU usage percentage against the given thresholds.

    :param container: name of the container to check
    :param thresholds: parsed thresholds; its units attribute is overwritten with '%'
    """
    container_info = get_container_info(container)
    container_stats = get_stats(container=container)
    cpu_percent = calculate_cpu_capacity_precentage(info=container_info, stats=container_stats)

    thresholds.units = '%'
    evaluate_numeric_thresholds(container=container, value=cpu_percent, thresholds=thresholds, name='cpu',
                                short_name='cpu', min=0, max=100)
|
||||
|
||||
|
||||
def process_args(args):
    """
    Parse the command line and publish the connection settings as module globals.

    Sets the globals `timeout`, `daemon` and `connection_type` from the parsed options.
    Prints the help text when no arguments were given.

    :param args: list of command line arguments (without the program name)
    :return: the argparse namespace with all parsed options
    """
    global timeout, daemon, connection_type

    parser = argparse.ArgumentParser(description='Check docker containers.')

    # Connect to local socket or ip address
    endpoint_opts = parser.add_mutually_exclusive_group()
    endpoint_opts.add_argument(
        '--connection',
        dest='connection',
        default=DEFAULT_SOCKET,
        type=str,
        metavar='[/<path to>/docker.socket|<ip/host address>:<port>]',
        help='Where to find docker daemon socket. (default: %(default)s)')
    endpoint_opts.add_argument(
        '--secure-connection',
        dest='secure_connection',
        type=str,
        metavar='[<ip/host address>:<port>]',
        help='Where to find TLS protected docker daemon socket.')

    # Base used for size units
    unit_opts = parser.add_mutually_exclusive_group()
    unit_opts.add_argument(
        '--binary_units',
        dest='units_base',
        action='store_const',
        const=1024,
        help='Use a base of 1024 when doing calculations of KB, MB, GB, & TB (This is default)')
    unit_opts.add_argument(
        '--decimal_units',
        dest='units_base',
        action='store_const',
        const=1000,
        help='Use a base of 1000 when doing calculations of KB, MB, GB, & TB')
    parser.set_defaults(units_base=1024)

    # Connection timeout
    parser.add_argument(
        '--timeout',
        dest='timeout',
        type=float,
        default=DEFAULT_TIMEOUT,
        help='Connection timeout in seconds. (default: %(default)s)')

    # Container name
    parser.add_argument(
        '--containers',
        dest='containers',
        nargs='+',
        type=str,
        default=['all'],
        help='One or more RegEx that match the names of the container(s) to check. If omitted all containers are checked. (default: %(default)s)')

    # Require every container pattern to match something
    parser.add_argument(
        '--present',
        dest='present',
        default=False,
        action='store_true',
        help='Modifies --containers so that each RegEx must match at least one container.')

    # Threads
    parser.add_argument(
        '--threads',
        dest='threads',
        default=DEFAULT_PARALLELISM,
        type=int,
        help='This + 1 is the maximum number of concurent threads/network connections. (default: %(default)s)')

    # CPU
    parser.add_argument(
        '--cpu',
        dest='cpu',
        type=str,
        metavar='WARN:CRIT',
        help='Check cpu usage percentage taking into account any limits. Valid values are 0 - 100.')

    # Memory
    parser.add_argument(
        '--memory',
        dest='memory',
        type=str,
        metavar='WARN:CRIT:UNITS',
        help='Check memory usage taking into account any limits. Valid values for units are %%,B,KB,MB,GB.')

    # State
    parser.add_argument(
        '--status',
        dest='status',
        type=str,
        help='Desired container status (running, exited, etc).')

    # Health
    parser.add_argument(
        '--health',
        dest='health',
        default=None,
        action='store_true',
        help="Check container's health check status")

    # Age
    parser.add_argument(
        '--uptime',
        dest='uptime',
        type=str,
        metavar='WARN:CRIT',
        help='Minimum container uptime in seconds. Use when infrequent crashes are tolerated.')

    # Version
    parser.add_argument(
        '--version',
        dest='version',
        default=None,
        action='store_true',
        help='Check if the running images are the same version as those in the registry. Useful for finding stale images. Does not support login.')

    # Registries reachable without TLS (used by the version check)
    parser.add_argument(
        '--insecure-registries',
        dest='insecure_registries',
        nargs='+',
        type=str,
        default=[],
        help='List of registries to connect to with http(no TLS). Useful when using "--version" with images from insecure registries.')

    # Restart
    parser.add_argument(
        '--restarts',
        dest='restarts',
        type=str,
        metavar='WARN:CRIT',
        help='Container restart thresholds.')

    # no-ok
    parser.add_argument(
        '--no-ok',
        dest='no_ok',
        action='store_true',
        help='Make output terse suppressing OK messages. If all checks are OK return a single OK.')

    # no-performance
    parser.add_argument(
        '--no-performance',
        dest='no_performance',
        action='store_true',
        help='Suppress performance data. Reduces output when performance data is not being used.')

    parser.add_argument('-V', action='version', version='%(prog)s {}'.format(__version__))

    if len(args) == 0:
        parser.print_help()

    options = parser.parse_args(args=args)

    timeout = options.timeout

    if options.secure_connection:
        daemon = 'https://' + options.secure_connection
        connection_type = 'https'
    elif options.connection:
        # A leading '/' means a unix socket path, anything else is a host:port
        if options.connection[0] == '/':
            daemon = 'socket://' + options.connection + ':'
            connection_type = 'socket'
        else:
            daemon = 'http://' + options.connection
            connection_type = 'http'

    return options
|
||||
|
||||
|
||||
def no_checks_present(parsed_args):
    """
    Tell whether the parsed arguments request no check at all.

    :param parsed_args: namespace with one attribute per check plus `present`
    :return: True when every check attribute is None and `present` is falsy
    """
    # Every module-level name starting with 'check_' is treated as a check; drop the prefix
    prefix = 'check_'
    check_names = [name[len(prefix):] for name in globals().keys() if name.startswith(prefix)]

    any_requested = any(getattr(parsed_args, name) is not None for name in check_names)
    # Act like --present is a check though it is not implemented like one
    return not any_requested and not parsed_args.present
|
||||
|
||||
|
||||
def socketfile_permissions_failure(parsed_args):
    """
    Tell whether the configured docker socket file is unusable.

    :param parsed_args: namespace whose `connection` attribute holds the socket path
    :return: False for non-socket connections; otherwise True unless the path exists, is a
             socket, and is both readable and writable
    """
    if connection_type != 'socket':
        return False

    path = parsed_args.connection
    usable = (os.path.exists(path)
              and stat.S_ISSOCK(os.stat(path).st_mode)
              and os.access(path, os.R_OK)
              and os.access(path, os.W_OK))
    return not usable
|
||||
|
||||
|
||||
def print_results():
    """
    Print the collected messages, followed by '|' and the performance data when there is any.

    With the no_ok global set, messages starting with 'OK: ' are dropped and a lone 'OK' is
    printed if nothing remains. With no_performance set, performance data is left out.
    """
    if no_ok:
        # Remove all the "OK"s
        kept = [message for message in messages if not message.startswith('OK: ')]
        summary = '; '.join(kept) if len(kept) != 0 else 'OK'
    else:
        summary = '; '.join(messages)

    if not no_performance and len(performance_data) != 0:
        summary = summary + '|' + ' '.join(performance_data)

    print(summary)
|
||||
|
||||
|
||||
def perform_checks(raw_args):
    """
    Parse the arguments, set up the module globals and schedule every requested check.

    Sets the globals parallel_executor, serial_executor, unit_adjustments, no_ok and
    no_performance. Problems with the arguments or the socket are reported through unknown().

    :param raw_args: list of command line arguments (without the program name)
    """
    args = process_args(raw_args)

    global parallel_executor
    parallel_executor = futures.ThreadPoolExecutor(max_workers=args.threads)
    global serial_executor
    serial_executor = futures.ThreadPoolExecutor(max_workers=1)

    global unit_adjustments
    unit_adjustments = {key: args.units_base ** value for key, value in UNIT_ADJUSTMENTS_TEMPLATE.items()}

    global no_ok
    no_ok = args.no_ok

    # Was assigned from args.no_ok, which made --no-performance a no-op and tied
    # performance data suppression to --no-ok instead
    global no_performance
    no_performance = args.no_performance

    if socketfile_permissions_failure(args):
        unknown("Cannot access docker socket file. User ID={}, socket file={}".format(os.getuid(), args.connection))
        return

    if args.containers == ["all"] and args.present:
        unknown("You can not use --present without --containers")
        return

    if no_checks_present(args):
        unknown("No checks specified.")
        return

    # Here is where all the work happens
    #############################################################################################
    containers = get_containers(args.containers, args.present)

    if len(containers) == 0 and not args.present:
        unknown("No containers names found matching criteria")
        return

    for container in containers:

        # Check status
        if args.status:
            check_status(container, args.status)

        # Check version
        if args.version:
            check_version(container, args.insecure_registries)

        # below are checks that require a 'running' status

        # Check health
        if args.health:
            check_health(container)

        # Check cpu usage
        if args.cpu:
            check_cpu(container, parse_thresholds(args.cpu, units_required=False))

        # Check memory usage
        if args.memory:
            check_memory(container, parse_thresholds(args.memory, units_required=False))

        # Check uptime
        if args.uptime:
            check_uptime(container, parse_thresholds(args.uptime, include_units=False))

        # Check restart count
        if args.restarts:
            check_restarts(container, parse_thresholds(args.restarts, include_units=False))
|
||||
|
||||
|
||||
def main():
    """
    Run all checks, print the combined result and exit with the accumulated return code.

    Any exception raised while checking is printed as a traceback and reported as UNKNOWN.
    """
    try:
        perform_checks(argv[1:])

        # get results to let exceptions in threads bubble out
        for finished in futures.as_completed(threads):
            finished.result()

    except Exception as e:
        traceback.print_exc()
        unknown("Exception raised during check': {}".format(repr(e)))

    print_results()
    exit(rc)


if __name__ == '__main__':
    main()
|
181
files/nrpe/check_eth
Executable file
181
files/nrpe/check_eth
Executable file
@ -0,0 +1,181 @@
|
||||
#!/usr/bin/perl -w
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
use Getopt::Long;
|
||||
|
||||
use constant BITS => 8;
|
||||
use constant BYTES => 1;
|
||||
|
||||
# Command line options
my $iface     = "";
my $bandwidth = "";
my $warning   = "";
my $critical  = "";
my $percent   = "";

GetOptions(
    "i|interface=s" => \$iface,
    "w|warning=s"   => \$warning,
    "c|critical=s"  => \$critical,
    "b|bandwidth=s" => \$bandwidth,
    "p|percent"     => \$percent
);

# Multiplier applied to speeds for display and perfdata (BYTES = report bytes per second)
my $bitmod = BYTES;

# Prefix of the per-interface state file holding the previous sample ("time:rx:tx")
my $tmpfile = "/tmp/traffic";
my $output  = "";

my %status = ( 'OK'       => 0,
               'WARNING'  => 1,
               'CRITICAL' => 2,
               'UNKNOWN'  => 3
);
my $exit_status = $status{OK};

my %data = ( 'time'    => 0, 'last_time'    => 0,
             'rxbytes' => 0, 'last_rxbytes' => 0,
             'txbytes' => 0, 'last_txbytes' => 0
);

my %speed = ( 'tx'       => 0,
              'rx'       => 0,
              'interval' => 1
);

# Validate the options; usage() prints the help text and exits UNKNOWN
usage() if ( !$iface || !$warning || !$critical );
if ( $percent ) {
    usage() if ( !$bandwidth || $bandwidth !~ /^\d+[kKmMgG]$/ );
    usage() if ( $warning !~ /^\d{1,3}$/ || $warning>100 || $critical !~ /^\d{1,3}$/ || $critical>100 );
    $bandwidth = human2bytes($bandwidth);
} else {
    $warning = human2bytes($warning);
    $critical = human2bytes($critical);
    usage() if ( !$warning || !$critical )
}
usage() if ( $warning > $critical );

# Read the current byte counters of the interface
my $net_fh;
if ( !open( $net_fh, '<', '/proc/net/dev' ) ) {
    print "UNKNOWN - Can't open /proc/net/dev: $!\n";
    exit $status{UNKNOWN};
}
while ( my $row = <$net_fh> ) {
    chomp $row;
    # \Q..\E: the interface name is matched literally, never as a regex
    if ( $row =~ /^\s*\Q$iface\E\:\s*(\d+)(?:\s*(?:\d+)){7}\s*(\d+)(?:\s*(?:\d+)){7}\s*$/ ) {
        $data{time} = time - 1;
        $data{rxbytes} = $1;
        $data{txbytes} = $2;
        last;
    }
}
close( $net_fh );

# Nothing matched: the interface is not listed in /proc/net/dev
if ( $data{time} == 0 && $data{rxbytes} == 0 && $data{txbytes} == 0 ) {
    print "UNKNOWN - interface $iface not found in /proc/net/dev\n";
    exit $status{UNKNOWN};
}

# Load the previous sample, if any; a missing or malformed file leaves the zero defaults
my $statefile = "$tmpfile-$iface";
if ( open( my $state_in, '<', $statefile ) ) {
    my $previous = <$state_in>;
    close( $state_in );
    if ( defined $previous ) {
        chomp $previous;
        my @fields = split( /:/, $previous );
        if ( @fields == 3 && !grep { !/^\d+$/ } @fields ) {
            ( $data{last_time}, $data{last_rxbytes}, $data{last_txbytes} ) = @fields;
        }
    }
}

# Store the current sample for the next run (best effort, as before)
if ( open( my $state_out, '>', $statefile ) ) {
    print {$state_out} "$data{time}:$data{rxbytes}:$data{txbytes}\n";
    close( $state_out );
}

# No usable previous sample, or counters went backwards: measure against the current sample
$data{last_time} = $data{time} if ( !$data{last_time} || $data{last_time} > $data{time} );
$data{last_rxbytes} = $data{rxbytes} if ( !$data{last_rxbytes} || $data{last_rxbytes} > $data{rxbytes} );
$data{last_txbytes} = $data{txbytes} if ( !$data{last_txbytes} || $data{last_txbytes} > $data{txbytes} );

# The +1 keeps the interval non-zero when both samples carry the same timestamp
$speed{interval} = $data{time} - $data{last_time} + 1;
$speed{rx} = ( $data{rxbytes} - $data{last_rxbytes} ) / $speed{interval};
$speed{tx} = ( $data{txbytes} - $data{last_txbytes} ) / $speed{interval};

$output = "RX Bytes: ". bytes2human($data{rxbytes}) ."B, TX Bytes: ". bytes2human($data{txbytes}) ."B; ";
$output .= sprintf( "RX Speed: %s%sps, TX Speed: %s%sps; ",
    bytes2human($speed{rx}*$bitmod), ($bitmod==BITS)?"b":"B", bytes2human($speed{tx}*$bitmod), ($bitmod==BITS)?"b":"B" );

if ( $percent ) {
    # Thresholds are percentages of the declared bandwidth
    if ( ( $speed{rx} / $bandwidth ) * 100 > $critical || ( $speed{tx} / $bandwidth ) * 100 > $critical ) {
        $exit_status = $status{CRITICAL};
        $output .= "CRITICAL";
    } elsif ( ( $speed{rx} / $bandwidth ) * 100 > $warning || ( $speed{tx} / $bandwidth ) * 100 > $warning ) {
        $exit_status = $status{WARNING};
        $output .= "WARNING";
    } else {
        $output .= "OK";
    }
} else {
    # Thresholds are absolute speeds in bytes per second
    if ( ( $speed{rx} > $critical ) or ( $speed{tx} > $critical ) ) {
        $exit_status = $status{CRITICAL};
        $output .= "CRITICAL";
    } elsif ( ( $speed{rx} > $warning ) or ( $speed{tx} > $warning ) ) {
        $exit_status = $status{WARNING};
        $output .= "WARNING";
    } else {
        $output .= "OK";
    }
}

$output .= " bandwidth utilization";
$output .= sprintf( " | rx=%.0f;%2.0f;%2.0f tx=%.0f;%2.0f;%2.0f",
    $speed{rx}*$bitmod, ($percent)?$warning*$bandwidth/100:$warning, ($percent)?$critical*$bandwidth/100:$critical,
    $speed{tx}*$bitmod, ($percent)?$warning*$bandwidth/100:$warning, ($percent)?$critical*$bandwidth/100:$critical );

print "$output\n";
exit( $exit_status );
|
||||
|
||||
|
||||
# helper function
|
||||
# Format a byte count with a 1024-based unit suffix ('', K, M, G, T).
# Returns 0 for a false value, otherwise the rounded number followed by the suffix.
sub bytes2human {
    my ($amount) = @_;
    return 0 unless $amount;

    my @suffixes = ( '', 'K', 'M', 'G', 'T' );
    my $power    = 0;

    # Scale down until the value no longer exceeds 1024
    until ( $amount <= 1024 ) {
        $amount /= 1024;
        $power++;
    }

    return sprintf( "%2.0f%s", $amount, $suffixes[$power] );
}
|
||||
|
||||
# Convert a human readable size such as "12M" or "100k" into bytes.
#
# The value must be digits followed by exactly one unit letter out of K, M, G, T
# (either case); units are powers of 1024. A lowercase k, m or g denotes bits, so the
# result is divided by BITS; every other accepted letter denotes bytes.
# Returns 0 for empty input or anything that does not match this format; callers
# treat 0 as invalid.
sub human2bytes {
    my $value = shift;
    # Only real unit letters are accepted: the former \w also matched digits and
    # unknown letters, which were then scaled by 1024 six times
    return 0 if ( !$value || $value !~ /\A(\d+)([kKmMgGtT])\z/ );
    my ($number, $scale) = ($1,$2);

    my $bitmod = ( $scale =~ /[kmg]/ ) ? BITS : BYTES;
    my %power_of = ( 'K' => 1, 'M' => 2, 'G' => 3, 'T' => 4 );

    return $number * ( 1024 ** $power_of{ uc $scale } ) / $bitmod;
}
|
||||
|
||||
# Print the help text and exit with the UNKNOWN status.
sub usage {
    print <<EOU;

Usage: $0 -i <interface> -w <warn> -c <critical> [-p -b <bandwidth>]

    -i, --interface STRING
        Network interface name (example: eth0)
    -w, --warning STRING
        Warning interface speed level (K/M/G Bps, k/m/g bps)
        If using with -p value should be in percentage (1-100)
    -c, --critical STRING
        Critical interface speed level (K/M/G Bps, k/m/g bps)
        If using with -p value should be in percentage (1-100)
    -p
        Calculate warning and critical levels in percentage based on interface bandwidth
    -b, --bandwidth STRING
        Interface bandwidth value (K/M/G Bps, k/m/g bps)

EOU
    # NOTE(review): this removes the bare prefix path, not the per-interface state
    # file "$tmpfile-$iface" - confirm whether that is intended
    unlink($tmpfile);
    exit $status{UNKNOWN};
}
|
139
files/nrpe/check_exim_mailqueue
Executable file
139
files/nrpe/check_exim_mailqueue
Executable file
@ -0,0 +1,139 @@
|
||||
#!/bin/sh
|
||||
###############################################
|
||||
#
|
||||
# Nagios script to check Exim mail queue status
|
||||
#
|
||||
# Copyright 2007, 2008 Ian Yates
|
||||
#
|
||||
# NOTE: Depending on your config, the nagios user will probably be
|
||||
# needed to be added to the exim group for this script to function correctly
|
||||
#
|
||||
# See usage for command line switches
|
||||
#
|
||||
# You need to add the following to /etc/sudoers:
|
||||
# nagios ALL=NOPASSWD:/usr/local/exim/bin/exim
|
||||
#
|
||||
# Created: 2006-07-31 (i.yates@uea.ac.uk)
|
||||
# Updated: 2007-04-30 (i.yates@uea.ac.uk) - Linux/sudo tweaks
|
||||
# Updated: 2008-03-26 (i.yates@uea.ac.uk) - Fixed bug in critical/warning level checking which could result in erroneous results.
|
||||
# Updated: 2008-11-27 (i.yates@uea.ac.uk) - Added GPLv3 licence
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
#
|
||||
###############################################
|
||||
|
||||
. /usr/lib/nagios/plugins/utils.sh
|
||||
|
||||
# Script version shown in the usage text
VERSION='1.3'

# Thresholds and verbosity, filled in by doopts
LEVEL_WARN=
LEVEL_CRIT=
FLAG_VERBOSE='FALSE'

# Result line and exit status, printed/returned by theend
RESULT=
EXIT_STATUS="$STATE_OK"

# External commands
SUDO='/usr/bin/sudo'
EXIM='/usr/sbin/exim'
|
||||
|
||||
|
||||
###############################################
|
||||
#
|
||||
## FUNCTIONS
|
||||
#
|
||||
|
||||
## Print usage
|
||||
usage() {
    # Help text; $VERSION is expanded, everything else is literal
    cat <<EOF
 check_eximailqueue $VERSION - Nagios Exim mail queue check script

 Usage: check_eximailqueue -w <warning queue size> -c <critical queue size> [ -v ] [ -h ]

 -w Queue size at which a warning is triggered
 -c Queue size at which a critical is triggered
 -v Verbose output (ignored for now)
 -h Show this page

EOF
}
|
||||
|
||||
## Process command line options
|
||||
# Parse the command line into LEVEL_WARN, LEVEL_CRIT and FLAG_VERBOSE.
# Prints the usage text and exits UNKNOWN when called without arguments, with -h,
# or with an invalid option, so that a misconfigured check never reports OK.
doopts() {
    if [ "$#" -eq 0 ]
    then
        usage
        exit "${STATE_UNKNOWN:-3}"
    fi

    while getopts w:c:vh myarg "$@"
    do
        case $myarg in
            w)
                LEVEL_WARN=$OPTARG;;
            c)
                LEVEL_CRIT=$OPTARG;;
            v)
                FLAG_VERBOSE=TRUE;;
            *) # -h, invalid option or missing option argument
                usage
                exit "${STATE_UNKNOWN:-3}";;
        esac
    done
}
|
||||
|
||||
|
||||
# Write output and return result
|
||||
# Print the result line and exit with the collected status.
theend() {
    # Quoted so the text is printed verbatim (no globbing, no word splitting)
    echo "$RESULT"
    exit $EXIT_STATUS
}
|
||||
|
||||
|
||||
#
|
||||
## END FUNCTIONS
|
||||
#
|
||||
|
||||
#############################################
|
||||
#
|
||||
## MAIN
|
||||
#
|
||||
|
||||
|
||||
# Handle command line options
doopts "$@"

# Both thresholds are mandatory and must be non-negative integers; otherwise the
# comparisons below fail and the check would end as OK with an empty message
case "$LEVEL_WARN" in
    ''|*[!0-9]*)
        usage
        exit "${STATE_UNKNOWN:-3}";;
esac
case "$LEVEL_CRIT" in
    ''|*[!0-9]*)
        usage
        exit "${STATE_UNKNOWN:-3}";;
esac

# Do the do: ask exim for the number of messages on the queue
OUTPUT=`$SUDO -u root $EXIM -bpc`
if test -z "$OUTPUT" ; then
    RESULT="Mailqueue WARNING - query returned no output!"
    EXIT_STATUS=$STATE_WARNING
else
    case "$OUTPUT" in
        *[!0-9]*)
            # Anything but a plain number cannot be compared against the thresholds
            RESULT="Mailqueue UNKNOWN - unexpected output from exim: $OUTPUT"
            EXIT_STATUS=${STATE_UNKNOWN:-3}
            ;;
        *)
            if test "$OUTPUT" -lt "$LEVEL_WARN" ; then
                RESULT="Mailqueue OK - $OUTPUT messages on queue"
                EXIT_STATUS=$STATE_OK
            elif test "$OUTPUT" -ge "$LEVEL_CRIT" ; then
                RESULT="Mailqueue CRITICAL - $OUTPUT messages on queue"
                EXIT_STATUS=$STATE_CRITICAL
            else
                RESULT="Mailqueue WARNING - $OUTPUT messages on queue"
                EXIT_STATUS=$STATE_WARNING
            fi
            ;;
    esac
fi

# Quit and return information and exit status
theend
|
42
files/nrpe/check_mdadm
Executable file
42
files/nrpe/check_mdadm
Executable file
@ -0,0 +1,42 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Created by Sebastian Grewe, Jammicron Technology
|
||||
#
|
||||
|
||||
MDSTAT=/proc/mdstat

# Without a readable mdstat nothing below can be evaluated
if [[ ! -r $MDSTAT ]]; then
    echo "UNKNOWN - cannot read $MDSTAT"
    exit 3
fi

# Get count of raid arrays
RAID_DEVICES=$(grep -c '^md' "$MDSTAT")

# Count lines with a bracketed section containing '=', '>' or '.'
# NOTE(review): '_' is not part of this pattern, although the detail query in the
# last branch includes it - confirm that leaving it out here is intended
#RAID_STATUS=`grep "\[.*_.*\]" /proc/mdstat -c`
RAID_STATUS=$(grep -E -c '\[.*(=|>|\.).*\]' "$MDSTAT")

# Is an array currently recovering, get percentage of recovery
RAID_RECOVER=$(grep recovery "$MDSTAT" | awk '{print $4}')
RAID_RESYNC=$(grep resync "$MDSTAT" | awk '{print $4}')
RAID_CHECK=$(grep check "$MDSTAT" | awk '{print $4}')

# Check raid status
# RAID recovers --> Warning
if [[ $RAID_RECOVER ]]; then
    STATUS="WARNING - Checked $RAID_DEVICES arrays, recovering : $RAID_RECOVER"
    EXIT=1
elif [[ $RAID_RESYNC ]]; then
    STATUS="WARNING - Checked $RAID_DEVICES arrays, resync : $RAID_RESYNC"
    EXIT=1
elif [[ $RAID_CHECK ]]; then
    STATUS="OK - Checked $RAID_DEVICES arrays, check : $RAID_CHECK"
    EXIT=0
# RAID ok
elif [[ $RAID_STATUS == "0" ]]; then
    STATUS="OK - Checked $RAID_DEVICES arrays."
    EXIT=0
# Everything else is reported as a warning, together with the affected lines
else
    EXTEND_RAID_STATUS=$(grep -E '\[.*(=|>|\.|_).*\]' "$MDSTAT" | awk '{print $2}' | uniq -c | xargs echo)
    STATUS="WARNING- Checked $RAID_DEVICES arrays, $RAID_STATUS have failed check: $EXTEND_RAID_STATUS "
    EXIT=1
fi

# Status and quit
echo "$STATUS"
exit $EXIT
|
124
files/nrpe/check_memory
Executable file
124
files/nrpe/check_memory
Executable file
@ -0,0 +1,124 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
#Set script name (quoted so that paths containing spaces survive)
SCRIPT=$(basename "${BASH_SOURCE[0]}")

#Set default values: warning / critical percentages for memory (M) and swap (S)
optMW=95
optMC=98
optSW=95
optSC=98
|
||||
|
||||
# help function
|
||||
function printHelp {
    # Help text; only $SCRIPT is expanded. Always ends the script with exit code 1.
    cat <<EOF

Help for $SCRIPT

Basic usage: $SCRIPT -w {warning} -c {critical} -W {warning} -C {critical}

Command switches are optional, default values for warning is 95% and critical is 98%
-w - Sets warning value for Memory Usage. Default is 95%
-c - Sets critical value for Memory Usage. Default is 98%
-W - Sets warning value for Swap Usage. Default is 95%
-C - Sets critical value for Swap Usage. Default is 98%
-h - Displays this help message

Example: $SCRIPT -w 80 -c 90 -W 40 -C 60



Author: Lukasz Gogolin, lukasz.gogolin@gmail.com
Git: http://bitbucket.org/lgogolin/nagios_plugins
EOF
    exit 1
}
|
||||
|
||||
# regex to check is OPTARG an integer
re='^[0-9]+$'

# Abort with exit code 1 unless the given option argument is an unsigned integer
function requireInteger {
    if [[ $1 =~ $re ]] ; then
        return 0
    fi
    echo "error: Not a number" >&2
    exit 1
}

while getopts :w:c:W:C:h FLAG; do
    case $FLAG in
        w)
            requireInteger "$OPTARG"
            optMW=$OPTARG
            ;;
        c)
            requireInteger "$OPTARG"
            optMC=$OPTARG
            ;;
        W)
            requireInteger "$OPTARG"
            optSW=$OPTARG
            ;;
        C)
            requireInteger "$OPTARG"
            optSC=$OPTARG
            ;;
        h)
            printHelp
            ;;
        \?)
            echo -e \\n"Option - $OPTARG not allowed."
            # printHelp ends the script itself; the exit below is kept as a safety net
            printHelp
            exit 2
            ;;
    esac
done

shift $((OPTIND-1))
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# Read the needed counters (in kB) from /proc/meminfo by exact field name, so that
# extra or reordered fields cannot shift the values
read -r memTotal_k memFree_k memBuffer_k memCache_k swapTotal_k swapFree_k < <(
    awk '
        $1 == "MemTotal:"  { total = $2 }
        $1 == "MemFree:"   { free = $2 }
        $1 == "Buffers:"   { buffers = $2 }
        $1 == "Cached:"    { cached = $2 }
        $1 == "SwapTotal:" { swap_total = $2 }
        $1 == "SwapFree:"  { swap_free = $2 }
        END { printf "%d %d %d %d %d %d\n", total, free, buffers, cached, swap_total, swap_free }
    ' /proc/meminfo
)

# Without a memory total the percentages below cannot be computed
if [ -z "$memTotal_k" ] || [ "$memTotal_k" -eq 0 ]; then
    echo "[MEMORY] UNKNOWN - could not read memory information from /proc/meminfo"
    exit 3
fi

# Memory: used = total - free - buffers - cache
memTotal_b=$((memTotal_k*1024))
memFree_b=$((memFree_k*1024))
memBuffer_b=$((memBuffer_k*1024))
memCache_b=$((memCache_k*1024))
memTotal_m=$((memTotal_k/1024))
memFree_m=$((memFree_k/1024))
memBuffer_m=$((memBuffer_k/1024))
memCache_m=$((memCache_k/1024))
memUsed_b=$((memTotal_b-memFree_b-memBuffer_b-memCache_b))
memUsed_m=$((memTotal_m-memFree_m-memBuffer_m-memCache_m))
memUsedPrc=$(((memUsed_b*100)/memTotal_b))

# Swap
swapTotal_b=$((swapTotal_k*1024))
swapUsed_k=$((swapTotal_k-swapFree_k))
swapUsed_b=$((swapUsed_k*1024))
swapTotal_m=$((swapTotal_k/1024))
swapFree_m=$((swapFree_k/1024))
swapUsed_m=$((swapTotal_m-swapFree_m))

# A host without swap reports 0% instead of dividing by zero
if [ "$swapTotal_k" -eq 0 ]; then
    swapUsedPrc=0
else
    swapUsedPrc=$(((swapUsed_k*100)/swapTotal_k))
fi

message="[MEMORY] Total: $memTotal_m MB - Used: $memUsed_m MB - $memUsedPrc% [SWAP] Total: $swapTotal_m MB - Used: $swapUsed_m MB - $swapUsedPrc% | MTOTAL=$memTotal_b;;;; MUSED=$memUsed_b;;;; MCACHE=$memCache_b;;;; MBUFFER=$memBuffer_b;;;; STOTAL=$swapTotal_b;;;; SUSED=$swapUsed_b;;;;"

# Exit explicitly: the former $(exit N) only set the status of a subshell and
# worked solely because it happened to be the last command of the script
echo "$message"
if [ "$memUsedPrc" -ge "$optMC" ] || [ "$swapUsedPrc" -ge "$optSC" ]; then
    exit 2
elif [ "$memUsedPrc" -ge "$optMW" ] || [ "$swapUsedPrc" -ge "$optSW" ]; then
    exit 1
else
    exit 0
fi
|
237
files/nrpe/check_mysql_longqueries
Executable file
237
files/nrpe/check_mysql_longqueries
Executable file
@ -0,0 +1,237 @@
|
||||
#!/usr/bin/perl
|
||||
# $Id$
|
||||
#
|
||||
# check_mysql_longqueries plugin for Nagios
|
||||
#
|
||||
# Copyright (C) 2009 Vincent Rivellino <vrivellino@paybycash.com>
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU General Public License
|
||||
# as published by the Free Software Foundation; either version 2
|
||||
# of the License, or (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
#
|
||||
#
|
||||
# Checks MySQL's processlist to see if there are queries running longer than
|
||||
# defined thresholds.
|
||||
#
|
||||
# Requires the following modules:
|
||||
# DBI
|
||||
# Monitoring::Plugin
|
||||
#
|
||||
# Copyright Notice: GPLv2
|
||||
#
|
||||
# CHANGES
|
||||
#
|
||||
# 30 Jan 2009 - Vincent Rivellino <vrivellino@paybycash.com>
|
||||
# Initial version released.
|
||||
#
|
||||
# 02 Mar 2020 - Ludovic Cartier <ludovic.cartier@brainsys.io>
|
||||
# Replace Nagios::Plugin by Monitoring::Plugin
|
||||
# need debian package libmonitoring-plugin-perl
|
||||
#
|
||||
|
||||
use warnings;
|
||||
use strict;
|
||||
use DBI;
|
||||
use Monitoring::Plugin;
|
||||
|
||||
|
||||
## Build the plugin object: usage line, version and license banner.
my $np = Monitoring::Plugin->new(
    usage   => "Usage: %s [-v|--verbose] [-H <host>] [-P <port>] [-S <socket>] [-u <user>] [-p <password>] -w <warn time> -c <crit time>",
    version => "1.0",
    license => join( "\n",
        "Copyright (C) 2009 Vincent Rivellino <vrivellino\@paybycash.com>",
        "This plugin comes with ABSOLUTELY NO WARRANTY. This is free software, and you",
        "are welcome to redistribute it under the conditions of version 2 of the GPL." ),
);

## Command line arguments, registered in this order.
## Each entry is [ Getopt spec, help text, required flag (optional) ].
my @arg_specs = (
    [ 'host|H=s',     "-H, --host\n MySQL server host" ],
    [ 'port|P=i',     "-P, --port\n MySQL server port" ],
    [ 'socket|S=s',   "-S, --socket\n MySQL server socket" ],
    [ 'user|u=s',     "-u, --user\n database user (must have privilege to SHOW PROCESSLIST)" ],
    [ 'password|p=s', "-p, --password\n database password" ],
    [ 'warn|w=i',     "-w, --warn\n Query time in seconds to generate a WARNING", 1 ],
    [ 'crit|c=i',     "-c, --crit\n Query time in seconds to generate a CRITICAL", 1 ],
    [ 'db=s',         "--db\n Only check queries running on this database\n To specify more than one, separate with commas." ],
    [ 'skip_db=s',    "--skip_db\n Don't check queries running on this database\n To specify more than one, separate with commas." ],
    [ 'clientuser=s', "--clientuser\n Only check queries running by this MySQL user\n To specify more than one, separate with commas." ],
    [ 'skip_clientuser=s', "--skip_clientuser\n Don't check queries running by this MySQL user\n To specify more than one, separate with commas." ],
    [ 'clienthost=s', "--clienthost\n Only check queries running from this client host\n To specify more than one, separate with commas." ],
    [ 'skip_clienthost=s', "--skip_clienthost\n Don't check queries running from this client host\n To specify more than one, separate with commas." ],
);

for my $arg_spec (@arg_specs) {
    my ( $spec, $help, $required ) = @{$arg_spec};
    my %arg = ( spec => $spec, help => $help );
    $arg{required} = 1 if $required;
    $np->add_arg(%arg);
}
|
||||
|
||||
|
||||
## Parse the command line; --verbose may be repeated to raise the level.
$np->getopts;
my $verbose = $np->opts->verbose || 0;

## At verbosity 2 and above, dump every option as it was understood.
if ( $verbose >= 2 ) {
    my $opts = $np->opts;

    print "Plugin options:\n";
    printf " %-23s %d\n", "verbose:", $verbose;

    for my $name (qw(host port socket user password)) {
        printf " %-23s %s\n", "$name:", $opts->$name || '';
    }

    # warn and crit are mandatory integers, so they are printed with %d
    for my $name (qw(warn crit)) {
        printf " %-23s %d\n", "$name:", $opts->$name;
    }

    for my $name (qw(db skip_db clientuser skip_clientuser clienthost skip_clienthost)) {
        printf " %-23s %s\n", "$name:", $opts->$name || '';
    }
}
|
||||
|
||||
# Turn the comma-separated restriction options into lists that the
# processlist loop can search.
#
# The pattern must be a real regex (/,/). The previous quoted string '/,/'
# was used as the pattern text itself, i.e. it only split on the literal
# three characters "/,/", so "db1,db2" stayed a single element.
# An unset option yields an empty list (split of '' returns nothing).
my @db             = split /,/, ( $np->opts->db              || '' );
my @skipdb         = split /,/, ( $np->opts->skip_db         || '' );
my @clientuser     = split /,/, ( $np->opts->clientuser      || '' );
my @skipclientuser = split /,/, ( $np->opts->skip_clientuser || '' );
my @clienthost     = split /,/, ( $np->opts->clienthost      || '' );
my @skipclienthost = split /,/, ( $np->opts->skip_clienthost || '' );
|
||||
|
||||
# Arm the plugin timeout before doing any network I/O.
alarm $np->opts->timeout;

## Assemble the DSN; no database name is needed for SHOW PROCESSLIST.
my $dsn         = 'DBI:mysql:';
my $target_host = $np->opts->host;

if ( $target_host && $target_host ne 'localhost' ) {
    ## remote server: connect by host, plus port when one was given
    $dsn .= ';host=' . $target_host;
    $dsn .= ';port=' . $np->opts->port if $np->opts->port;
}
elsif ( $np->opts->socket ) {
    ## no host, or localhost by name: use the local socket when one was given
    $dsn .= ';mysql_socket=' . $np->opts->socket;
}

## Show the connection parameters when really verbose.
if ( $verbose >= 2 ) {
    print "DSN: '$dsn' USER: '", $np->opts->user || '', "' PASS: '", $np->opts->password || '', "'\n";
}

## Connect; errors are reported through the plugin rather than raised by DBI.
my %connect_attrs = ( RaiseError => 0, PrintError => 0, AutoCommit => 1 );
my $dbh = DBI->connect( $dsn, $np->opts->user || '', $np->opts->password || '', \%connect_attrs );
$np->nagios_exit( UNKNOWN, "Could not connect to database: $DBI::errstr" ) unless $dbh;
|
||||
|
||||
## Fetch the list of running queries.
## RaiseError is off, so prepare() can return undef; check it before use.
my $sth = $dbh->prepare('SHOW FULL PROCESSLIST')
    or $np->nagios_exit( UNKNOWN, $dbh->errstr );
$sth->execute();
$np->nagios_exit( UNKNOWN, $sth->errstr ) if $sth->err;

## Bind every result column to a hash slot keyed by its lower-cased name.
my %row;
$sth->bind_columns( \( @row{ @{ $sth->{NAME_lc} } } ) );

## Details and duration of the longest-running query seen so far.
my $longquery_info = '';
my $longquery_time = 0;

## True when $value is exactly equal to one of the remaining arguments.
## Plain string equality is used on purpose: database, user and host names
## come from the server and must not be interpreted as regex syntax.
my $in_list = sub {
    my ( $value, @candidates ) = @_;
    return scalar grep { $_ eq $value } @candidates;
};

## Walk the processlist.
my $count = 0;
while ( $sth->fetch ) {
    $count++;

    # skip if time is zero or NULL
    next unless $row{'time'};

    # skip ignorable results (replication threads, idle connections, ...)
    next if ( $row{'user'}    || '' ) eq 'system user';
    next if ( $row{'command'} || '' ) =~ m/(?:Sleep|Binlog Dump|Ping|Processlist)/i;

    # extract connection info; the host column is "host:port", keep the host
    my $db   = $row{'db'}   || '';
    my $user = $row{'user'} || '';
    my $host = $row{'host'} || '';
    $host =~ s/:\d+$//;

    # "only" filters: skip the row unless its value is in the list.
    # (The previous test skipped the row as soon as ANY list entry differed,
    # which rejected every row once a list had more than one entry.)
    # "skip" filters: skip the row when its value is in the list.
    next if $np->opts->db              and not $in_list->( $db,   @db );
    next if $np->opts->skip_db         and     $in_list->( $db,   @skipdb );

    next if $np->opts->clientuser      and not $in_list->( $user, @clientuser );
    next if $np->opts->skip_clientuser and     $in_list->( $user, @skipclientuser );

    next if $np->opts->clienthost      and not $in_list->( $host, @clienthost );
    next if $np->opts->skip_clienthost and     $in_list->( $host, @skipclienthost );

    # only remember the longest running query
    if ( $row{'time'} > $longquery_time ) {
        $longquery_time = $row{'time'};
        $longquery_info = "TIME: $row{'time'}";
        foreach my $k ( sort keys %row ) {
            next if $k eq 'time' or $k eq 'info';
            $longquery_info .= " $k=" . ( $row{$k} || 'NULL' );
        }
        # the statement text goes last because it can be long
        $longquery_info .= " INFO=" . ( $row{'info'} || 'NULL' );
    }
}
|
||||
|
||||
# The database handle is no longer needed.
$dbh->disconnect;

# Map the longest query time onto a plugin state. nagios_exit() terminates
# the program, so at most one of the calls below is ever reached.
my $all_clear = "No long running queries found ($count threads checked)";

if ( !$longquery_info ) {
    # nothing survived the filters
    $np->nagios_exit( OK, $all_clear );
}
elsif ( $longquery_time >= $np->opts->crit ) {
    $np->nagios_exit( CRITICAL, $longquery_info );
}
elsif ( $longquery_time >= $np->opts->warn ) {
    $np->nagios_exit( WARNING, $longquery_info );
}

# The longest query stayed below both thresholds.
$np->nagios_exit( OK, $all_clear );
|
140
files/nrpe/check_postfix_mailqueue
Executable file
140
files/nrpe/check_postfix_mailqueue
Executable file
@ -0,0 +1,140 @@
|
||||
#!/bin/bash
|
||||
###################################################################
|
||||
# check_postfix_mailqueue is developped with GPL Licence 2.0
|
||||
#
|
||||
# GPL License: http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt
|
||||
#
|
||||
# Developped by : Bjoern Bongermino
|
||||
#
|
||||
###################################################################
|
||||
# This program is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU General Public License
|
||||
# as published by the Free Software Foundation; either version 2
|
||||
# of the License, or (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
####################################################################
|
||||
|
||||
# Uncomment to enable debugging
|
||||
# set -x
|
||||
|
||||
# Identification strings shown by -v and -h.
PROGNAME=$(basename $0)
VERSION="Version 1.0"
AUTHOR="Bjoern Bongermino (http://www.bongermino.de)"

# Nagios plugin exit codes.
STATE_OK=0
STATE_WARNING=1
STATE_CRITICAL=2
STATE_UNKNOWN=3

# Thresholds for deferred mails, set by -w and -c.
# The thresholds are only evaluated when both are greater than 0.
warning=0
critical=0
|
||||
|
||||
# Print program name, version and author on a single line.
print_version() {
    printf '%s\n' "$PROGNAME $VERSION $AUTHOR"
}
|
||||
|
||||
# Print the version line followed by the usage text, then exit with STATE_OK.
print_help() {
    print_version
    cat <<EOF

$PROGNAME - Checks postfix mailqueue statistic

$PROGNAME is a Nagios plugin which generates statistics
for the postfix mailqueue and checks for corrupt messages.
The following values will be checked:
maildrop: Localy posted mail
incoming: Processed local mail and received from network
active: Mails being delivered (should be small)
deferred: Stuck mails (that will be retried later)
corrupt: Messages found to not be in correct format (shold be 0)
hold: Recent addition, messages put on hold indefinitly - delete of free

Usage: $PROGNAME -w WARN-Level -c CRIT-Level

Options:
 -w)
 Warning level for deferred mails
 -c)
 Critical level for deferred mail
 -h)
 This help
 -v)
 Version
EOF
    exit $STATE_OK
}
|
||||
|
||||
# Parse the command line.
#   -h        print help and exit
#   -v        print version and exit
#   -w N      warning threshold for deferred mails
#   -c N      critical threshold for deferred mails
# Anything else is a usage error and ends the plugin with STATE_UNKNOWN.
# (The catch-all branch used to call check_postfix_mailqueue, which is not
# defined yet at this point of the script, so bash only reported
# "command not found" and carried on.)
while test -n "$1"; do
    case "$1" in
        -h)
            print_help
            exit $STATE_OK
            ;;
        -v)
            print_version
            exit $STATE_OK
            ;;
        -w|-c)
            # The value is compared with -gt later on, so it has to be a
            # plain non-negative integer; reject a missing or malformed one.
            case "$2" in
                ''|*[!0-9]*)
                    echo "$PROGNAME: option $1 requires a non-negative integer argument"
                    exit $STATE_UNKNOWN
                    ;;
            esac
            if [ "$1" = "-w" ]; then
                warning=$2
            else
                critical=$2
            fi
            shift
            ;;
        *)
            echo "$PROGNAME: unknown argument: $1"
            echo "Usage: $PROGNAME -w WARN-Level -c CRIT-Level"
            exit $STATE_UNKNOWN
            ;;
    esac
    shift
done
|
||||
|
||||
# Print the number of regular files below queue directory $1
# (0 when the directory does not exist).
_count_queue_files() {
    if [ -d "$1" ]; then
        find "$1" -type f | wc -l
    else
        echo 0
    fi
}

# Count the messages in every postfix queue.
# Sets the globals: deferred active maildrop incoming corrupt hold.
# Exits with STATE_CRITICAL when the spool directory cannot be entered.
check_postfix_mailqueue() {
    # The spool directory can be set via the "spooldir" environment variable;
    # otherwise it is asked from postconf (if available), falling back to
    # /var/spool/postfix.
    if command -v postconf > /dev/null 2>&1; then
        SPOOLDIR=${spooldir:-$(postconf -h queue_directory)}
    else
        SPOOLDIR=${spooldir:-/var/spool/postfix}
    fi

    # An empty value must be treated as an error: a bare "cd" would silently
    # switch to $HOME and every queue would then be reported as empty.
    if [ -z "$SPOOLDIR" ] || ! cd "$SPOOLDIR" >/dev/null 2>&1; then
        echo -n "Cannot cd to $SPOOLDIR"
        exit $STATE_CRITICAL
    fi

    # Get values
    deferred=$(_count_queue_files deferred)
    active=$(_count_queue_files active)
    maildrop=$(_count_queue_files maildrop)
    incoming=$(_count_queue_files incoming)
    corrupt=$(_count_queue_files corrupt)
    hold=$(_count_queue_files hold)
}
|
||||
|
||||
# Collect the queue counters, then build the status text and the perfdata.
check_postfix_mailqueue
values="Deferred mails=$deferred Active deliveries=$active Locally posted mails=$maildrop Incoming mails=$incoming Corrupt mails=$corrupt Mails on hold=$hold"
perfdata="deferred=$deferred;; active=$active;; maildrop=$maildrop;; incoming=$incoming;; corrupt=$corrupt;; hold=$hold;;"

# Any corrupt message is critical, whatever the thresholds are.
if [ $corrupt -gt 0 ]; then
    echo -n "Postfix Mailqueue CRITICAL - $corrupt corrupt messages found! | $perfdata"
    exit $STATE_CRITICAL
fi

# The deferred queue is only compared against the thresholds when both of
# them were given (greater than 0); otherwise the result stays OK.
state_label="OK"
state_code=$STATE_OK
if [ $warning -gt 0 ] && [ $critical -gt 0 ]; then
    if [ $deferred -gt $critical ]; then
        state_label="CRITICAL"
        state_code=$STATE_CRITICAL
    elif [ $deferred -gt $warning ]; then
        state_label="WARNING"
        state_code=$STATE_WARNING
    fi
fi

echo -n "Postfix Mailqueue $state_label - $values | $perfdata"
exit $state_code
|
11848
files/nrpe/check_postgresql
Executable file
11848
files/nrpe/check_postgresql
Executable file
File diff suppressed because it is too large
Load Diff
101
files/nrpe/check_proc_age
Executable file
101
files/nrpe/check_proc_age
Executable file
@ -0,0 +1,101 @@
|
||||
#! /bin/bash
|
||||
|
||||
# Nagios plugin
|
||||
# created 09.01.2011 by symphonic.mushroom@gmail.com
|
||||
# modified 04.24.2012 by symphonic.mushroom@gmail.com with the advices from formwandler
|
||||
# modified 07.22.2017 by symphonic.mushroom@gmail.com with the help from Toby Wahlers toby@100.rpm.com
|
||||
# check if processes matching to a pattern are exceeding a given elapsed time
|
||||
# return a Nagios exit code depending on the result
|
||||
# 0 = OK
|
||||
# 1 = WARNING
|
||||
# 2 = CRITICAL
|
||||
# 3 = UNKNOWN
|
||||
|
||||
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
|
||||
# Print the usage text, then exit with status 3 (UNKNOWN).
print_help() {
    printf '%s\n' \
        "This Nagios plugin check if processes matching to a pattern are exceeding a given elapsed time" \
        "Usage : $0 -p <process_name> -w <seconds> -c <seconds> " \
        " -p parameter : name of the monitoring process. For granularity, quote commands with spaces." \
        " -w parameter : minimal elapsed time for status WARNING on NAGIOS, in seconds." \
        " -c parameter : minimal elapsed time for status CRITICAL on NAGIOS, in seconds." \
        "returned performance data : number of process; oldest time in minutes; warning time in minutes; critical time in minutes; 0;"
    exit 3
}
|
||||
|
||||
# Print message $1 and a pointer to --help, then exit with status 3 (UNKNOWN).
# The hint used to be written with \' inside double quotes, which printed
# the backslashes literally.
_usage_error() {
    echo "$1"
    echo "try '$0 --help' for help"
    exit 3
}

# check if there is at least one argument
# ("$1" is quoted so that an argument containing spaces does not break the test)
if [ -z "$1" ]
then
    _usage_error "Missing arguments"
fi

# print help
if [[ $1 = "--help" || $1 = "-h" ]]
then
    print_help
    exit 3
fi

# assign value to arguments
# print an error in case of unknown argument
while getopts ":w:c:p:" options
do
    case $options in
        w ) warning=$OPTARG ;;
        c ) critical=$OPTARG ;;
        p ) proc=$OPTARG ;;
        * ) _usage_error "Unknown argument" ;;
    esac
done

# check if all arguments are present
if [[ -z $warning || -z $critical || -z $proc ]]
then
    _usage_error "Missing argument"
fi

# both thresholds are used in integer arithmetic and comparisons later on,
# so anything that is not a plain number of seconds is a usage error
if [[ ! $warning =~ ^[0-9]+$ || ! $critical =~ ^[0-9]+$ ]]
then
    _usage_error "Warning and critical thresholds must be integers (seconds)"
fi
|
||||
|
||||
# Count the processes whose command line contains "$proc" as a whole word,
# leaving out this plugin itself and the grep helpers of the pipeline.
nbproc=$(ps -A -o args | grep -w -- "$proc" | grep -vF -- "$0" | grep -v grep | wc -l)
if [ "$nbproc" -gt 0 ]
then

    # Age in seconds of the oldest matching process.
    # The same whole-word filter as above is applied so that the age always
    # belongs to one of the counted processes.
    # ps prints etime as [[dd-]hh:]mm:ss. The layout is decided from the
    # NUMBER of fields returned by split(): testing the field values instead
    # treats "00" as false, which dropped the day part of "1-00:05:10" and
    # read "01:30:00" as if it were mm:ss.
    ageproc=$(ps -A -o etime,comm,args | grep -w -- "$proc" | grep -vF -- "$0" | grep -v grep | awk '
        {
            nparts = split($1, t, ":")
            days = 0
            if (split(t[1], d, "-") == 2) { days = d[1]; t[1] = d[2] }
            if (nparts == 3)
                secs = (t[1] * 60 + t[2]) * 60 + t[3]
            else
                secs = t[1] * 60 + t[2]
            secs += days * 86400
            if (NR == 1 || secs > maxi) maxi = secs
        }
        END { print maxi + 0 }')
    # every process may have ended between the two ps calls
    ageproc=${ageproc:-0}

    # human readable age: seconds below one minute, minutes below one hour,
    # hours and minutes otherwise
    if [ "$ageproc" -lt 60 ]
    then maxage=$ageproc" Seconds"
    elif [ "$ageproc" -lt 3600 ]
    then maxage=$(($ageproc/60))" Minutes"
    else maxage=$(($ageproc/3600))" Hours "$(($ageproc % 3600 / 60))" minutes"
    fi

    msg="there are $nbproc process $proc, oldest has got $maxage age"
    # perfdata is expressed in minutes, thresholds included
    perfmaxage=$(($ageproc/60))
    perfdata="Processes=${nbproc:-0} MaxAge=${perfmaxage:-0}Minutes;$(($warning/60));$(($critical/60));0;"

    if [ "$ageproc" -gt "$critical" ]
    then echo "CRITICAL: $msg | $perfdata"
        exit 2
    elif [ "$ageproc" -gt "$warning" ]
    then echo "WARNING: $msg | $perfdata"
        exit 1
    else echo "OK: $msg | $perfdata"
        exit 0
    fi
else
    echo "OK: there is no process matching $proc"
    exit 0
fi
|
7
files/nrpe/check_process
Executable file
7
files/nrpe/check_process
Executable file
@ -0,0 +1,7 @@
|
||||
#!/bin/bash
# Verify that every essential daemon has at least one running process.
# check_procs prints one status line per daemon; the plugin exits with the
# numerically highest status returned by any of those checks.
# NOTE: check_procs is run through sudo, so the monitoring user needs a
# matching NOPASSWD sudoers rule.

rc=0
for proc in cron rsyslogd ntpd munin-node; do
    sudo /usr/lib/nagios/plugins/check_procs -C "$proc" -c 1:
    status=$?
    # keep the worst result seen so far
    if [ "$status" -gt "$rc" ]; then
        rc=$status
    fi
done

# Without this the script ended with the status of the loop (always 0),
# so a missing daemon was never reported.
exit $rc
|
33
files/nrpe/check_rofs
Executable file
33
files/nrpe/check_rofs
Executable file
@ -0,0 +1,33 @@
|
||||
#!/bin/bash
# checks for read_only fs
# @Author Joerg 'johe' Stephan <johe.stephan@googlemail.com>
#
# Usage: check_rofs.sh <mountpoint>
# Exit status: 0 mounted read-write, 2 mounted read-only,
#              1 neither rw nor ro found in the mount options,
#              3 usage error or nothing mounted on <mountpoint>.

E_SUCCESS="0"
E_WARNING="1"
E_CRITICAL="2"
E_UNKNOWN="3"

# Without a mount point there is nothing to check: stop here instead of
# scanning /proc/mounts for an empty name.
if [ -z "$1" ]; then
    echo "Usage: check_rofs.sh <mountpoint>"
    exit ${E_UNKNOWN}
fi
tfs=$1

# Find the mount options of the target. When the same mount point is listed
# more than once, the last entry wins.
# NOTE(review): this assumes the last entry is the mount that is actually
# visible (later mounts shadow earlier ones) - confirm for your kernels.
# The loop reads the file directly instead of through a pipe so that it runs
# in the current shell and its variables survive.
options=""
while read -r diskid mountpoint fs mntopts rub1 rub2; do
    if [ "$mountpoint" = "$tfs" ]; then
        options=$mntopts
    fi
done < /proc/mounts

if [ -z "$options" ]; then
    echo "No filesystem is mounted on $tfs"
    exit ${E_UNKNOWN}
fi

# Match rw / ro as complete comma-separated options, so that values such as
# "errors=remount-ro" or a path containing "rw" cannot be mistaken for them.
case ",$options," in
    *,rw,*)
        echo "The Filesystem mounted on $tfs is writeable"
        exit ${E_SUCCESS}
        ;;
    *,ro,*)
        echo "The Filesystem mounted on $tfs is NOT writeable"
        exit ${E_CRITICAL}
        ;;
    *)
        echo "Test result empty (For any reason)"
        exit ${E_WARNING}
        ;;
esac
|
50
files/nrpe/check_systemd_service
Executable file
50
files/nrpe/check_systemd_service
Executable file
@ -0,0 +1,50 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Copyright © 2016, 2017 Mohamed El Morabity <melmorabity@fedoraproject.com>
|
||||
#
|
||||
# This module is free software: you can redistribute it and/or modify it under
|
||||
# the terms of the GNU General Public License as published by the Free Software
|
||||
# Foundation, either version 3 of the License, or (at your option) any later
|
||||
# version.
|
||||
#
|
||||
# This software is distributed in the hope that it will be useful, but WITHOUT
|
||||
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License along with
|
||||
# this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
|
||||
# Load the STATE_* exit codes from utils.sh, which sits next to this script.
PLUGINDIR=$(dirname $0)
. $PLUGINDIR/utils.sh

# Exactly one argument is expected: the name of the unit to check.
if [[ $# -ne 1 ]]; then
    echo "Usage: ${0##*/} <service name>"
    exit $STATE_UNKNOWN
fi
unit=$1

# "systemctl is-enabled" prints the enablement state of the unit.
# No output at all is treated as "the unit does not exist".
enabled_state=$(systemctl is-enabled $unit 2>/dev/null)
enabled_rc=$?

if [[ -z "$enabled_state" ]]; then
    echo "ERROR: service $unit doesn't exist"
    exit $STATE_CRITICAL
elif [[ $enabled_rc -ne 0 ]]; then
    # a state was printed, but systemctl reported failure for it
    echo "ERROR: service $unit is $enabled_state"
    exit $STATE_CRITICAL
fi

# The unit exists and is enabled: it also has to be active right now.
if ! systemctl --quiet is-active $unit; then
    echo "ERROR: service $unit is not running"
    exit $STATE_CRITICAL
fi

echo "OK: service $unit is running"
exit $STATE_OK
|
3
tasks/main.yml
Normal file
3
tasks/main.yml
Normal file
@ -0,0 +1,3 @@
|
||||
---
# Entry point of the role: everything NRPE-related lives in nrpe.yml.
# import_tasks replaces the bare "include" action, which is no longer
# available in current ansible-core releases.
- name: "monitoring | install nrpe"
  import_tasks: nrpe.yml
|
42
tasks/nrpe.yml
Normal file
42
tasks/nrpe.yml
Normal file
@ -0,0 +1,42 @@
|
||||
---
# Install and configure the NRPE agent together with the local check plugins.

- name: nrpe | apt update cache
  apt:
    update_cache: yes
    cache_valid_time: 86400 # One day

# The package list is handed to apt in one call instead of looping over the
# module once per package. The cache was refreshed by the previous task, so
# it is not forced again here.
- name: nrpe | install nrpe packages
  apt:
    name:
      - nagios-nrpe-server
      - libmonitoring-plugin-perl
      - monitoring-plugins-standard
      - libdbd-mysql-perl
    state: present

- name: nrpe | copy nrpe configuration
  template:
    src: "nrpe.j2"
    dest: "/etc/nagios/nrpe.d/brainsys.cfg"
    mode: "0644"
    force: yes
    backup: yes

# Modes are written as quoted strings everywhere, like the task above.
- name: nrpe | copy nrpe plugins
  copy:
    src: nrpe/
    dest: /usr/lib/nagios/plugins
    mode: "0755"

- name: nrpe | restart nagios-nrpe-server
  systemd:
    state: restarted
    name: nagios-nrpe-server

# visudo validates the rendered file before it is put in place; owner and
# group are set explicitly to root.
- name: nrpe | allow nagios user to specific sudo
  template:
    src: nrpe.sudoers.j2
    dest: /etc/sudoers.d/nrpe
    owner: root
    group: root
    validate: 'visudo -cf %s'
    mode: "0440"
|
51
templates/nrpe.j2
Normal file
51
templates/nrpe.j2
Normal file
@ -0,0 +1,51 @@
|
||||
allowed_hosts={{ nrpe_allowed_hosts }}
dont_blame_nrpe=1

command[check_load]=/usr/lib/nagios/plugins/check_load -w {{ nrpe_load_warning }} -c {{ nrpe_load_critical }}
command[check_memory]=/usr/lib/nagios/plugins/check_memory -w {{ nrpe_memory_warning }} -c {{ nrpe_memory_critical }} -W {{ nrpe_swap_warning }} -C {{ nrpe_swap_critical }}
command[check_mailq]=/usr/bin/sudo /usr/lib/nagios/plugins/check_postfix_mailqueue -w {{ nrpe_postfix_warning }} -c {{ nrpe_postfix_critical }}
command[check_smtp]=/usr/lib/nagios/plugins/check_tcp -p 25
command[check_zombie_procs]=/usr/lib/nagios/plugins/check_procs -w 5 -c 10 -s Z
command[check_total_procs]=/usr/lib/nagios/plugins/check_procs -w 500 -c 800
command[check_process]=/usr/lib/nagios/plugins/check_process
command[check_dns]=/usr/lib/nagios/plugins/check_dns -H google.com
command[check_ssl]=/usr/lib/nagios/plugins/check_http --sni 'www.brainsys.io' -C 14,3
command[check_eth]=/usr/lib/nagios/plugins/check_eth -i {{ ansible_default_ipv4.interface }} -w {{ nrpe_eth_warning }} -c {{ nrpe_eth_critical }}
command[check_proc_fail2ban]=/usr/lib/nagios/plugins/check_procs -a fail2ban -w 1: -c 1:
{% if nrpe_proc_age_process is defined %}
# Only rendered when nrpe_proc_age_process names the process to watch.
# The former literal "<proc>" placeholder was parsed by the shell as
# redirections, so the command could never work as written.
command[check_proc_age]=/usr/lib/nagios/plugins/check_proc_age -p '{{ nrpe_proc_age_process }}' -w 400 -c 600
{% endif %}

# disk
# -w space warning / -c space critical / -W inode warning / -K inode critical / -C reset after
command[check_disk_advanced]=/usr/lib/nagios/plugins/check_disk_advanced -x /lib/init/rw -x /sys -x /dev/shm -X tmpfs -X nsfs -X proc -X sysfs -X devtmpfs -X overlay -X tracefs -w 10% -c 3% -W 10% -K 3% -H
command[check_disk_root]=/usr/lib/nagios/plugins/check_disk -w 30% -W 30% -c 10% -K 10% -p /
command[check_rw_root]=/usr/lib/nagios/plugins/check_rofs /
command[check_disk_data]=/usr/lib/nagios/plugins/check_disk -w 30% -W 30% -c 10% -K 10% -p /data
command[check_rw_data]=/usr/lib/nagios/plugins/check_rofs /data

# mysql
# The password can be overridden with nrpe_mysql_password (ideally from a
# vault); the default keeps the previously hard-coded value.
command[check_mysql]=/usr/lib/nagios/plugins/check_mysql -u nagios -p{{ nrpe_mysql_password | default('Bu[VetFeifoipVithlok2odHabrAiltAjHavciUjRi') }} -d mysql -H 127.0.0.1
command[check_mysql_longqueries]=/usr/lib/nagios/plugins/check_mysql_longqueries -u nagios -p{{ nrpe_mysql_password | default('Bu[VetFeifoipVithlok2odHabrAiltAjHavciUjRi') }} -H 127.0.0.1 -w 600 -c 1200

# postgresql
# The password can be overridden with nrpe_pgsql_password (ideally from a
# vault); the default keeps the previously hard-coded value.
command[check_pgsql_port]=/usr/lib/nagios/plugins/check_tcp -p 5432
command[check_pgsql_connection]=/usr/lib/nagios/plugins/check_postgresql -H 127.0.0.1 -p 5432 --dbuser=nagios --dbpass={{ nrpe_pgsql_password | default('uDUTHt14FC3w4cE9vRk4XyZFD3KWlx') }} --action=connection
command[check_pgsql_backends]=/usr/lib/nagios/plugins/check_postgresql -H 127.0.0.1 -p 5432 --dbuser=nagios --dbpass={{ nrpe_pgsql_password | default('uDUTHt14FC3w4cE9vRk4XyZFD3KWlx') }} --action=backends -w 175 -c 190

# raid
command[check_mdadm]=/usr/lib/nagios/plugins/check_mdadm
command[check_3ware]=/usr/bin/sudo /usr/lib/nagios/plugins/check_3ware

# services
command[check_proc_docker]=/usr/lib/nagios/plugins/check_systemd_service docker
command[check_proc_haproxy]=/usr/lib/nagios/plugins/check_systemd_service haproxy
command[check_proc_nginx]=/usr/lib/nagios/plugins/check_systemd_service nginx
command[check_proc_php5.6]=/usr/lib/nagios/plugins/check_systemd_service php5.6-fpm
command[check_proc_php7.0]=/usr/lib/nagios/plugins/check_systemd_service php7.0-fpm
command[check_proc_php7.1]=/usr/lib/nagios/plugins/check_systemd_service php7.1-fpm
command[check_proc_php7.2]=/usr/lib/nagios/plugins/check_systemd_service php7.2-fpm
command[check_proc_php7.3]=/usr/lib/nagios/plugins/check_systemd_service php7.3-fpm
command[check_proc_php7.4]=/usr/lib/nagios/plugins/check_systemd_service php7.4-fpm
command[check_proc_php8.0]=/usr/lib/nagios/plugins/check_systemd_service php8.0-fpm
command[check_proc_php8.1]=/usr/lib/nagios/plugins/check_systemd_service php8.1-fpm
command[check_proc_mysql]=/usr/lib/nagios/plugins/check_systemd_service mysql
command[check_proc_postgresql]=/usr/lib/nagios/plugins/check_systemd_service postgresql
|
2
templates/nrpe.sudoers.j2
Normal file
2
templates/nrpe.sudoers.j2
Normal file
@ -0,0 +1,2 @@
|
||||
# Commands the nagios user may run through sudo without a password.
# Each rule has to match the exact command line that is executed.

# Mail queue checks (thresholds rendered from the role variables).
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_postfix_mailqueue -w {{ nrpe_postfix_warning }} -c {{ nrpe_postfix_critical }}
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_exim_mailqueue -w {{ nrpe_exim_warning }} -c {{ nrpe_exim_critical }}

# check_3ware is started through sudo by the NRPE command definition;
# "" means it may only be run without any argument.
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_3ware ""

# check_process runs "check_procs -C <daemon> -c 1:" through sudo.
# A colon has to be escaped inside sudoers command arguments.
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_procs -C * -c 1\:
|
Loading…
x
Reference in New Issue
Block a user