initial commit

This commit is contained in:
Ludovic Cartier 2024-12-16 19:24:54 +01:00
parent 27957d4418
commit 85029a0f01
20 changed files with 14349 additions and 0 deletions

20
defaults/main.yml Normal file
View File

@ -0,0 +1,20 @@
---
# Default values for the nrpe_* variables (file is defaults/main.yml).
# NOTE(review): presumably consumed by NRPE configuration templates elsewhere in the repository - confirm.

# Comma-separated list of addresses (presumably the hosts allowed to query NRPE - confirm).
nrpe_allowed_hosts: '127.0.0.1,212.85.154.82,51.158.69.165'
# Shell snippet (backticks): evaluates to the number of "processor" entries in /proc/cpuinfo.
nrpe_load_warning: '`cat /proc/cpuinfo |grep -c processor`'
# Shell snippet (backticks): evaluates to twice the number of "processor" entries in /proc/cpuinfo.
nrpe_load_critical: '`echo "$(($(cat /proc/cpuinfo |grep -c processor) * 2 ))"`'
# Memory thresholds (presumably percentages - confirm against the check that uses them).
nrpe_memory_warning: 80
nrpe_memory_critical: 90
# Swap thresholds (presumably percentages - confirm).
nrpe_swap_warning: 40
nrpe_swap_critical: 60
# Exim thresholds (presumably a mail queue size - confirm).
nrpe_exim_warning: 10
nrpe_exim_critical: 20
# Postfix thresholds (presumably a mail queue size - confirm).
nrpe_postfix_warning: 10
nrpe_postfix_critical: 20
# Network interface thresholds; the "M" suffix is passed through as a string (unit semantics not visible here - confirm).
nrpe_eth_warning: '12M'
nrpe_eth_critical: '15M'

344
files/nrpe/check_3ware Executable file
View File

@ -0,0 +1,344 @@
#!/usr/bin/perl
# -------------------------------------------------------
# -=- <check_3ware-raid.pl> -=-
# -------------------------------------------------------
#
# Description : yet another plugin to check your 3ware RAID
# controller
#
# Version : 0.1
# -------------------------------------------------------
# In :
# - see the How to use section
#
# Out :
# - only print on the standard output
#
# Features :
# - perfdata output
#
# Fix Me/Todo :
# - too many things ;) but let me know what do you think about it
#
# ####################################################################
# ####################################################################
# GPL v3
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# ####################################################################
# ####################################################################
# How to use :
# ------------
#
# 1 to use this script you have to install tw_cli first. You can find
# the source here : http://www.3ware.com/support/download.asp
# just follow the instructions to compile and deploy it
#
# 2 then you just have to run the following command :
# $ ./check_3ware-raid.pl --help
#
# If you need to use this script with NRPE you just have to do the
# following steps :
#
# 1 allow your user to run the script with the sudo rights. Just add
# something like that in your /etc/sudoers (use visudo) :
# nagios ALL=(ALL) NOPASSWD: /<path-to>/check_3ware-raid.pl
#
# 2 then just add this kind of line in your NRPE config file :
# command[check_3ware]=/usr/bin/sudo /<path-to>/check_3ware-raid.pl
#
# 3 don't forget to restart your NRPE daemon
#
# ####################################################################
# ####################################################################
# Changelog :
# -----------
#
# --------------------------------------------------------------------
# Date:28/11/2009 Version:0.1 Author:Erwan Ben Souiden
# >> creation
# ####################################################################
# ####################################################################
# Don't touch anything under this line!
# You shall not pass - Gandalf is watching you
# ####################################################################
use strict;
use warnings;
use Getopt::Long qw(:config no_ignore_case);
# Script identity, shown by the help and version screens
# ------------------------------------------------------
my ($version, $author, $a_mail, $script_name) = (
    '0.1',
    'Erwan Labynocle Ben Souiden',
    'erwan@aleikoum.net',
    'check_3ware-raid.pl',
);
# Command-line switches, all disabled until GetOptions enables them
my ($verbose_value, $version_value, $more_value, $help_value, $perfdata_value) = (0) x 5;
# Nagios exit codes indexed by state name
my %ERRORS = (
    OK        => 0,
    WARNING   => 1,
    CRITICAL  => 2,
    UNKNOWN   => 3,
    DEPENDENT => 4,
);
# Plugin defaults, each one can be overridden on the command line
# ---------------------------------------------------------------
my $display       = 'CHECK 3ware RAID - ';
my $critical      = 2;
my $warning       = 1;
my $tw_cli_path   = '/usr/sbin/tw_cli';
my $id_controller = "";
my $action        = 'disk_check';
# Command-line parsing. Every option has a short and a long spelling bound to
# the same variable. GetOptions returns false when it meets an unknown option
# or a value of the wrong type (e.g. a non-integer threshold); in that case
# the plugin stops with UNKNOWN instead of silently running with defaults.
my $options_ok = GetOptions (
    'P|path-tw_cli=s' => \ $tw_cli_path,
    'w|warning=i'     => \ $warning,
    'c|critical=i'    => \ $critical,
    'a|action=s'      => \ $action,
    'C|controller=s'  => \ $id_controller,
    'm|more'          => \ $more_value,
    'V|version'       => \ $version_value,
    'h|H|help'        => \ $help_value,
    'D|display=s'     => \ $display,
    'p|perfdata'      => \ $perfdata_value,
    'v|verbose'       => \ $verbose_value,
);
if (! $options_ok) {
    print $display.'UNKNOWN - invalid command line arguments, see --help'."\n";
    exit $ERRORS{"UNKNOWN"};
}
# Both helpers print their text and exit, so they never return here.
print_usage() if ($help_value);
print_version() if ($version_value);
# Sanity checks on the options received
# -------------------------------------
# Each failed check prints one line and exits with the UNKNOWN code.
if ($verbose_value) {
    print "DEBUG : action : $action, path-tw_cli : $tw_cli_path\n";
}
if ($action eq "" or $tw_cli_path eq "") {
    print "${display}one or more following arguments are missing :action/path-tw_cli\n";
    exit $ERRORS{"UNKNOWN"};
}
if ($verbose_value) {
    print "DEBUG : check if $tw_cli_path exists and is executable\n";
}
unless (-x $tw_cli_path) {
    print "${display}$tw_cli_path is not executable by you\n";
    exit $ERRORS{"UNKNOWN"};
}
if ($verbose_value) {
    print "DEBUG : warning threshold : $warning, critical threshold : $critical\n";
}
if ($critical < 0 or $warning < 0 or $critical < $warning) {
    print "${display}the thresholds must be integers and the critical threshold higher or equal than the warning threshold\n";
    exit $ERRORS{"UNKNOWN"};
}
if ($verbose_value) {
    print "DEBUG : controller : $id_controller\n";
}
# A controller given on the command line must answer to "tw_cli /<id> show".
if ($id_controller ne "" and check_controller("$tw_cli_path",$id_controller) != 0) {
    print "${display}UNKNOWN - problem with the controller $id_controller may be it does not exist\n";
    exit $ERRORS{"UNKNOWN"};
}
# Core script
# -----------
# Works out which controllers to inspect, runs "tw_cli /<controller> show" on
# each of them, counts the disk (or unit) states over ALL controllers and
# prints a single status line before exiting with the matching Nagios code.
my ($return,$return_more,$plugstate) = ("","","OK");
my @controller_list;
if ($id_controller eq "") {
    @controller_list = list_all_controller("$tw_cli_path");
    if (! @controller_list) {
        print $display.'UNKNOWN - problem to have the controllers list'."\n";
        exit $ERRORS{"UNKNOWN"};
    }
}
else {
    push(@controller_list,$id_controller);
}
print "DEBUG : action = $action\n" if ($verbose_value);
my @show_return;
# Runs "tw_cli /<controller> show" and returns its output lines.
# The list form of open() bypasses the shell, so neither the path nor the
# controller id can be interpreted as shell syntax. If the command cannot be
# started the plugin stops with UNKNOWN rather than reporting "0 ... OK".
my $tw_cli_show = sub {
    my ($controller) = @_;
    my $tw_cli_fh;
    if (! open($tw_cli_fh, '-|', $tw_cli_path, "/$controller", 'show')) {
        print $display.'UNKNOWN - unable to run '."$tw_cli_path : $!\n";
        exit $ERRORS{"UNKNOWN"};
    }
    my @lines = <$tw_cli_fh>;
    close($tw_cli_fh);
    return @lines;
};
# disk_check action
# -----------------
if ($action eq 'disk_check') {
    my ($c_ok,$c_other) = (0,0);
    foreach my $controller (@controller_list) {
        @show_return = $tw_cli_show->($controller);
        foreach my $line (@show_return) {
            # Port lines look like "p0  OK  ..." : port name then status
            if ($line =~ /^(p\d+)\s+(\S+)\s/) {
                my ($disk,$status) = ($1,$2);
                print "DEBUG : disk $disk/status $status\n" if ($verbose_value);
                $c_ok++ if ($status eq "OK");
                # An empty slot (NOT-PRESENT) is neither OK nor a problem
                $c_other++ if (($status ne "OK") and ($status ne "NOT-PRESENT"));
                $return_more .= " ($disk,$status)";
            }
        }
    }
    # The summary is built once, after every controller has been read, so the
    # text and the perfdata appear a single time with the overall totals.
    $return .= "$c_ok disk(s) detected as OK";
    $return .= " and $c_other with potential problem" if ($c_other);
    $return .= " -$return_more" if ($more_value);
    $return .= " | disksOK=$c_ok disksNOK=$c_other" if ($perfdata_value);
    $plugstate = "WARNING" if ($c_other >= $warning);
    $plugstate = "CRITICAL" if ($c_other >= $critical);
}
# unit action
# -----------
elsif ($action eq 'unit_check') {
    my ($c_ok,$c_rebuild,$c_other) = (0,0,0);
    foreach my $controller (@controller_list) {
        @show_return = $tw_cli_show->($controller);
        foreach my $line (@show_return) {
            # Unit lines look like "u0  RAID-1  OK ..." : unit, type, status
            if ($line =~ /^(u\d+)\s+(\S+)\s+(\S+)/) {
                my ($unit,$type,$status) = ($1,$2,$3);
                print "DEBUG : disk $unit/type $type/status $status\n" if ($verbose_value);
                $c_ok++ if ($status eq "OK");
                $c_rebuild++ if ($status eq "REBUILD");
                $c_other++ if (($status ne "OK") and ($status ne "REBUILD"));
                $return_more .= " ($unit,$type,$status)";
            }
        }
    }
    $return .= "$c_ok unit(s) detected as OK";
    $return .= " and $c_rebuild as REBUILD" if ($c_rebuild);
    $return .= " and $c_other with potential problem" if ($c_other);
    $return .= " -$return_more" if ($more_value);
    $return .= " | unitOK=$c_ok unitREBUILD=$c_rebuild unitNOK=$c_other" if ($perfdata_value);
    # The thresholds are not used here: any rebuild warns, any other state is critical
    $plugstate = "WARNING" if ($c_rebuild);
    $plugstate = "CRITICAL" if ($c_other);
}
else {
    $return .= "action must be unit_check|disk_check";
    $action = "";
    $plugstate = "UNKNOWN";
}
print $display.$action." - ".$plugstate." - ".$return."\n";
exit $ERRORS{$plugstate};
# ####################################################################
# function 1 : display the help
# ------------------------------
# Prints the help screen on the standard output, then exits with the UNKNOWN
# code. It never returns to the caller.
# The usage example shows "-C c1" (no leading slash) because the script adds
# the "/" itself when it calls "tw_cli /<controller> show".
sub print_usage {
    print <<EOT;
$script_name version $version by $author
This plugin checks state of your physical disks and logical units of a 3ware RAID card.
Usage: /<path-to>/$script_name [-a unit_check|disk_check] [-p] [-D "$display"] [-v] [-m] [-c 2] [-w 1] [-C c1]
Options:
-h, --help
Print detailed help screen
-V, --version
Print version information
-D, --display=STRING
to modify the output display...
default is "CHECK 3ware RAID - "
-P, --path-tw_cli=STRING
specify the path to the tw_cli binary
default value is /usr/sbin/tw_cli
-a, --action=STRING
specify the action : unit_check|disk_check
default is disk_check
disk_check : display state of all physical disks
unit_check : display state of all logical unit
-C, --controller=STRING
allow you to specify only one controller to check
the default behavior is to check each time every controller
-c, --critical=INT
specify a critical threshold for the number of disks in a non-OK state.
default is 2
only for the disk_check action
-w, --warning=INT
specify a warning threshold for the number of disks in a non-OK state.
default is 1
only for the disk_check action
-m, --more
Print a longer output. By default, the output is not complet because
Nagios may truncate it. This option is just for you
-p, --perfdata
If you want to activate the perfdata output
-v, --verbose
Show details for command-line debugging (Nagios may truncate the output)
Send email to $a_mail if you have questions
regarding use of this software. To submit patches or suggest improvements,
send email to $a_mail
This plugin has been created by $author
Hope you will enjoy it ;)
Remember :
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
EOT
    exit $ERRORS{"UNKNOWN"};
}
# function 2 : display version information
# -----------------------------------------
# Prints "<script name> version <version>" on one line, then exits with the
# UNKNOWN code. It never returns to the caller.
sub print_version {
    print "$script_name version $version\n";
    exit $ERRORS{"UNKNOWN"};
}
# function 3 : check if controller exists
# ---------------------------------------
# Runs "<tw_cli> /<controller> show" with its output discarded.
#   $tw_cli_path   : path to the tw_cli binary
#   $id_controller : controller identifier, without the leading "/"
# Returns the wait status of the command (0 when it succeeded), or -1 when
# the process could not be created.
# The command is started with fork/exec in list form instead of a shell
# string, so the controller id coming from the command line cannot inject
# shell syntax (the script is typically run through sudo).
sub check_controller {
    my ($tw_cli_path,$id_controller) = @_;
    require POSIX;
    my $pid = fork();
    return -1 if (! defined $pid);
    if ($pid == 0) {
        # Child: silence both streams, as the original ">> /dev/null 2>&1" did
        open(STDOUT, '>', '/dev/null');
        open(STDERR, '>', '/dev/null');
        # exec only returns on failure; 127 mirrors the shell's "not found"
        exec { $tw_cli_path } $tw_cli_path, "/$id_controller", 'show'
            or POSIX::_exit(127);
    }
    waitpid($pid, 0);
    return $?;
}
# function 4 : return the controllers list
# ----------------------------------------
# Runs "<tw_cli> show" and collects the identifier at the start of every
# output line that begins with "c<digits>" followed by whitespace.
#   $tw_cli_path : path to the tw_cli binary
# Returns the list of identifiers; the list is empty when the command exits
# with a non-zero status or when no line matches.
sub list_all_controller {
    my ($tw_cli_path) = @_;
    my @found;
    my @lines = `$tw_cli_path show`;
    return @found if ($? != 0);
    for my $line (@lines) {
        push(@found, $1) if ($line =~ /^(c\d+)\s/);
    }
    return @found;
}

BIN
files/nrpe/check_disk_advanced Executable file

Binary file not shown.

BIN
files/nrpe/check_dns Executable file

Binary file not shown.

985
files/nrpe/check_docker Executable file
View File

@ -0,0 +1,985 @@
#!/usr/bin/env python3
# logging.basicConfig(level=logging.DEBUG)
import math
from collections import deque, namedtuple, UserDict, defaultdict
from sys import argv
import argparse
import json
import logging
import os
import re
import socket
import stat
import traceback
from concurrent import futures
from datetime import datetime, timezone
from functools import lru_cache
from http.client import HTTPConnection
from urllib import request
from urllib.error import HTTPError, URLError
from urllib.request import AbstractHTTPHandler, HTTPHandler, HTTPSHandler, OpenerDirector, HTTPRedirectHandler, \
Request, HTTPBasicAuthHandler
# Root logger, shared by every helper in this file
logger = logging.getLogger()

__author__ = 'Tim Laurence'
__copyright__ = "Copyright 2018"
__credits__ = ['Tim Laurence']
__license__ = "GPL"
__version__ = "2.1.0"

'''
nrpe compatible check for docker containers.
Requires Python 3
Note: I really would have preferred to have used requests for all the network connections but that would have added a
dependency.
'''

# Connection and registry defaults
DEFAULT_SOCKET = '/var/run/docker.sock'
DEFAULT_TIMEOUT = 10.0
DEFAULT_PORT = 2375
DEFAULT_MEMORY_UNITS = 'B'
DEFAULT_HEADERS = [('Accept', 'application/vnd.docker.distribution.manifest.v2+json')]
DEFAULT_PUBLIC_REGISTRY = 'registry-1.docker.io'

# Maps each unit to the power the unit base is raised to.
UNIT_ADJUSTMENTS_TEMPLATE = dict([
    ('%', 0),
    ('B', 0),
    ('KB', 1),
    ('MB', 2),
    ('GB', 3),
    ('TB', 4),
])
unit_adjustments = None

# When set, the message is reduced to a single OK unless a check fails.
no_ok = False

# When set, performance data reporting is suppressed.
no_performance = False

# Nagios return codes
OK_RC, WARNING_RC, CRITICAL_RC, UNKNOWN_RC = range(4)

# Final results, filled in while the checks run
rc = -1
messages = []
performance_data = []

ImageName = namedtuple('ImageName', ['registry', 'name', 'tag', 'full_name'])
class ThresholdSpec(UserDict):
    """
    Dictionary with the keys 'warn', 'crit' and 'units' whose entries can also be read as attributes.

    :param warn: warning threshold
    :param crit: critical threshold
    :param units: unit string, empty by default
    """

    def __init__(self, warn, crit, units=''):
        super().__init__(warn=warn, crit=crit, units=units)

    def __getattr__(self, item):
        """
        Fall back to the dictionary entries when normal attribute lookup fails.

        :raises AttributeError: when no entry is stored under that name, so hasattr() and getattr() with a default
            behave as they do for any other object.
        """
        # 'data' is the UserDict storage itself; if it is not set yet (bare instance created by copy or pickle)
        # looking it up again here would recurse forever.
        if item == 'data':
            raise AttributeError(item)
        try:
            return self.data[item]
        except KeyError:
            raise AttributeError(item) from None
# Upper bound on the number of parallel workers; the checks are generally not CPU bound, so this is a worst-case cap
DEFAULT_PARALLELISM = 10

# Every submitted task is recorded here
threads = list()

# Switched on during testing to run the checks inline
DISABLE_THREADING = False
# urllib extension for unix socket files
#############################################################################################
# The docker daemon speaks http over a socket file. http.client can carry the protocol but
# cannot open a socket file by itself, and urllib only adds conveniences on top of it.
# The handler below supplies an HTTPConnection that connects to a socket file and exposes
# it to urllib through a "socket_open" method.
class SocketFileHandler(AbstractHTTPHandler):
    class SocketFileToHttpConnectionAdaptor(HTTPConnection):
        """HTTPConnection whose connect() opens a unix stream socket on a file path."""

        def __init__(self, socket_file, timeout=DEFAULT_TIMEOUT):
            # Host and port are placeholders, connect() never uses them
            super().__init__(host='', port=0, timeout=timeout)
            self.socket_file = socket_file

        def connect(self):
            self.sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
            self.sock.settimeout(self.timeout)
            self.sock.connect(self.socket_file)

    def socket_open(self, req):
        """
        Split the request selector at its first ':' into socket file and http path, then open the request through
        the socket adaptor.
        """
        pieces = req.selector.split(':', 1)
        socket_file, path = pieces
        req.host = socket_file
        req.selector = path
        return self.do_open(self.SocketFileToHttpConnectionAdaptor, req)
# Tokens are not cached because I expect the callers to cache the responses
class Oauth2TokenAuthHandler(HTTPBasicAuthHandler):
    """
    urllib handler that reacts to a 401 response carrying a 'Bearer' www-authenticate header by fetching a token
    from the advertised realm and replaying the request with it.
    """
    # Number of token attempts per url, shared by all instances
    auth_failure_tracker = defaultdict(int)

    def http_response(self, request, response):
        """Return the response untouched unless it is a 401 with a Bearer challenge."""
        code = response.code
        www_authenticate_header = response.headers.get('www-authenticate', None)
        if code == 401 and www_authenticate_header:
            scheme = www_authenticate_header.split()[0]
            if scheme.lower() == 'bearer':
                return self.process_oauth2(request, response, www_authenticate_header)
        return response

    https_response = http_response

    @staticmethod
    def _get_outh2_token(www_authenticate_header, timeout=None):
        """
        Request a token from the realm named in the challenge header.

        :param www_authenticate_header: header value holding key="value" pairs; 'realm', 'scope' and 'service'
            are required
        :param timeout: seconds to wait for the token server; None keeps urlopen's own default
        :return: the 'token' entry of the decoded JSON answer
        """
        auth_fields = dict(re.findall(r"""(?:(?P<key>[^ ,=]+)="([^"]+)")""", www_authenticate_header))
        auth_url = "{realm}?scope={scope}&service={service}".format(
            realm=auth_fields['realm'],
            scope=auth_fields['scope'],
            service=auth_fields['service'],
        )
        token_request = Request(auth_url)
        token_request.add_header("Content-Type", "application/x-www-form-urlencoded; charset=utf-8")
        if timeout is None:
            token_response = request.urlopen(token_request)
        else:
            # Without a timeout an unresponsive token server would block the whole check
            token_response = request.urlopen(token_request, timeout=timeout)
        return process_urllib_response(token_response)['token']

    def process_oauth2(self, request, response, www_authenticate_header):
        """
        Fetch a token and replay the request with an Authorization header.

        :raises HTTPError: when the same url has already been through this method once, which keeps infinite
            auth loops from happening
        """
        full_url = request.full_url
        self.auth_failure_tracker[full_url] += 1
        if self.auth_failure_tracker[full_url] > 1:
            raise HTTPError(full_url, 401, "Stopping Oauth2 failure loop for {}".format(full_url),
                            response.headers, response)
        # Reuse the timeout of the request being replayed for the token request
        auth_token = self._get_outh2_token(www_authenticate_header, timeout=getattr(request, 'timeout', None))
        request.add_unredirected_header('Authorization', 'Bearer ' + auth_token)
        return self.parent.open(request, timeout=request.timeout)
# Based on this example https://gist.github.com/FiloSottile/2077115
class HeadRequest(Request):
    """Request whose http method is always HEAD."""

    def get_method(self):
        method = "HEAD"
        return method
# Opener used for every request: default headers plus the http, https, redirect, socket file and
# oauth2 handlers, registered in that order.
better_urllib_get = OpenerDirector()
better_urllib_get.addheaders = DEFAULT_HEADERS.copy()
for handler_class in (HTTPHandler, HTTPSHandler, HTTPRedirectHandler, SocketFileHandler, Oauth2TokenAuthHandler):
    better_urllib_get.add_handler(handler_class())
class RegistryError(Exception):
    """Exception that keeps the offending response in its 'response_obj' attribute."""

    def __init__(self, response):
        self.response_obj = response
# Util functions
#############################################################################################
def parse_thresholds(spec, include_units=True, units_required=True):
    """
    Given a spec string break it up into ':' separated chunks. Convert strings to ints as it makes sense

    :param spec: The threshold specification being parsed
    :param include_units: Specifies that units should be processed and returned if present
    :param units_required: Mark spec as invalid if the units are missing.
    :return: A ThresholdSpec holding warn, crit, and units (empty string when not included or not present)
    :raises ValueError: on blank chunks, a missing warn or crit value, non-integer thresholds, missing required
        units, or leftover chunks
    """
    parts = deque(spec.split(':'))
    if not all(parts):
        raise ValueError("Blanks are not allowed in a threshold specification: {}".format(spec))

    # Both numbers are mandatory; checked up front so a short spec is reported as a ValueError like every other
    # malformed spec instead of an IndexError from the empty deque.
    if len(parts) < 2:
        raise ValueError("Both warn and crit thresholds are required in {}".format(spec))

    # Warn
    warn = int(parts.popleft())
    # Crit
    crit = int(parts.popleft())

    units = ''
    if include_units:
        if len(parts):
            # units
            units = parts.popleft()
        elif units_required:
            raise ValueError("Missing units in {}".format(spec))

    if len(parts) != 0:
        raise ValueError("Too many threshold specifiers in {}".format(spec))

    return ThresholdSpec(warn=warn, crit=crit, units=units)
def pretty_time(seconds):
    """
    Break a number of seconds into day, hour, minute and second chunks.

    :param seconds: duration in seconds
    :return: list of strings such as ['1d', '2h', '3min', '4s']; a larger unit is only listed when the remaining
        time reaches it, and the seconds entry is always present
    """
    remainder = seconds
    result = []
    # '>=' so that an exact minute, hour or day is shown in that unit (60 -> '1min 0s', not '60s')
    for label, size in (('d', 24 * 60 * 60), ('h', 60 * 60), ('min', 60)):
        if remainder >= size:
            count, remainder = divmod(remainder, size)
            result.append("{}{}".format(int(count), label))
    result.append("{}s".format(int(remainder)))
    return result
def evaluate_numeric_thresholds(container, value, thresholds, name, short_name,
                                min=None, max=None, greater_than=True):
    """
    Record performance data for a value and report ok, warning or critical for it.

    :param container: container name, used as prefix in the messages and the performance label
    :param value: measured number
    :param thresholds: object giving 'warn', 'crit' and 'units' both as attributes and as mapping entries
    :param name: long name of the measure, used in the message
    :param short_name: short name of the measure, used in the performance label
    :param min: optional lower bound appended to the performance data
    :param max: optional upper bound appended to the performance data
    :param greater_than: when True a value at or above a threshold trips it, otherwise a value at or below it
    """
    whole_number_units = ('B', '%', '')

    def two_decimals(number):
        return round(number, 2)

    # Some units don't have decimal places
    if thresholds.units in whole_number_units:
        rounded_value = int(value)
    else:
        rounded_value = two_decimals(value)

    perf_fields = ["{container}_{short_name}={value}{units};{warn};{crit}".format(
        container=container,
        short_name=short_name,
        value=rounded_value,
        **thresholds)]
    if min is not None:
        lower = math.floor(min) if thresholds.units in whole_number_units else two_decimals(min)
        perf_fields.append('{}'.format(lower))
    if max is not None:
        upper = math.ceil(max) if thresholds.units in whole_number_units else two_decimals(max)
        perf_fields.append('{}'.format(upper))

    global performance_data
    performance_data.append(';'.join(perf_fields))

    if thresholds.units == 's':
        # Only the two most significant time chunks are shown
        readable = ' '.join(pretty_time(rounded_value)[:2])
        results_str = "{} {} is {}".format(container, name, readable)
    else:
        results_str = "{} {} is {}{}".format(container, name, rounded_value, thresholds.units)

    def tripped(limit):
        return value >= limit if greater_than else value <= limit

    if tripped(thresholds.crit):
        critical(results_str)
    elif tripped(thresholds.warn):
        warning(results_str)
    else:
        ok(results_str)
@lru_cache(maxsize=None)
def get_url(url):
    """
    Fetch a url through the shared opener and decode its JSON body. Results are cached per url.

    :return: tuple of (decoded body, http status)
    """
    logger.debug("get_url: %s", url)
    response = better_urllib_get.open(url, timeout=timeout)
    status = response.status
    logger.debug("get_url: %s %s", url, status)
    content = process_urllib_response(response)
    return content, status
@lru_cache(maxsize=None)
def head_url(url):
    """
    Send a HEAD request for a url through the shared opener (redirects are followed). Results are cached per url.

    :return: the response object
    """
    head_request = HeadRequest(url)
    response = better_urllib_get.open(head_request, timeout=timeout)
    logger.debug("%s %s", url, response.status)
    return response
def process_urllib_response(response):
    """
    Read a response completely and decode it as UTF-8 encoded JSON.

    :param response: object with a read() method returning bytes
    :return: the decoded JSON document
    """
    return json.loads(response.read().decode('utf-8'))
def get_container_info(name):
    """Return the decoded answer of the daemon's /containers/<name>/json endpoint."""
    url = daemon + '/containers/{}/json'.format(name)
    return get_url(url)[0]
def get_image_info(name):
    """Return the decoded answer of the daemon's /images/<name>/json endpoint."""
    url = daemon + '/images/{}/json'.format(name)
    return get_url(url)[0]
def get_state(container):
    """Return the 'State' entry of the container's inspection data."""
    inspection = get_container_info(container)
    return inspection['State']
def get_stats(container):
    """Return the decoded answer of the daemon's /containers/<container>/stats endpoint, requested with stream=0."""
    url = daemon + '/containers/{}/stats?stream=0'.format(container)
    return get_url(url)[0]
def get_ps_name(name_list):
    """
    Pick the name that starts with a '/' but contains no other '/', and return it without the leading '/'.

    :param name_list: iterable of name strings
    :return: the first matching name, stripped of its leading '/'
    :raises NameError: when no name matches
    """
    for name in name_list:
        # startswith() is also safe on an empty string, where indexing name[0] would raise IndexError
        if name.startswith('/') and '/' not in name[1:]:
            return name[1:]
    raise NameError("Error when trying to identify 'ps' name in {}".format(name_list))
def get_containers(names, require_present):
    """
    Resolve the requested container patterns against the containers known to the daemon (running or not).

    :param names: list of regular expressions; the literal entry 'all' selects every container
    :param require_present: when True, a pattern matching no container is reported as critical
    :return: set of matching container names
    """
    containers_list, _ = get_url(daemon + '/containers/json?all=1')
    all_container_names = set(get_ps_name(x['Names']) for x in containers_list)

    if 'all' in names:
        return all_container_names

    filtered = set()
    for matcher in names:
        # The pattern is wrapped in a group so the anchors apply to the whole expression; with a bare
        # "^{}$" a pattern such as "web|db" would only anchor "web" at the start and "db" at the end.
        pattern = re.compile("^(?:{})$".format(matcher))
        matches = set(candidate for candidate in all_container_names if pattern.match(candidate))
        filtered |= matches
        # If we don't find a container that matches our regex
        if require_present and not matches:
            critical("No containers match {}".format(matcher))

    return filtered
def get_container_digest(container):
    """
    Look up the digest of the image a container was created from.

    :return: the part after '@' of the image's first 'RepoDigests' entry, or None when that lookup raises
        IndexError
    """
    image_id = get_container_info(container)['Image']
    image_info = get_image_info(image_id)
    try:
        first_digest = image_info['RepoDigests'][0]
        return first_digest.split('@')[1]
    except IndexError:
        return None
def get_container_image_urls(container):
    """Return the 'RepoTags' entry of the image a container was created from."""
    image_id = get_container_info(container)['Image']
    return get_image_info(image_id)['RepoTags']
def normalize_image_name_to_manifest_url(image_name, insecure_registries):
    """
    Build the registry manifest url for an image name.

    :param image_name: image name as accepted by parse_image_name
    :param insecure_registries: registries to reach over http instead of https (compared case-insensitively)
    :return: tuple of (manifest url, registry)
    """
    image = parse_image_name(image_name)
    insecure = set(entry.lower() for entry in insecure_registries)

    # Registry query url
    if image.registry.lower() in insecure:
        scheme = 'http'
    else:
        scheme = 'https'
    manifest_url = '{scheme}://{registry}/v2/{image_name}/manifests/{image_tag}'.format(
        scheme=scheme,
        registry=image.registry,
        image_name=image.name,
        image_tag=image.tag)
    return manifest_url, image.registry
# Auth servers seem picky about being hit too hard, so this check is best run single threaded.
# Based on https://docs.docker.com/registry/spec/auth/token/#requesting-a-token
def get_digest_from_registry(url):
    """
    Ask the registry for the digest of a manifest url with a HEAD request.

    :return: value of the 'Docker-Content-Digest' response header
    :raises RegistryError: when the header is absent; the response is attached to the exception
    """
    logger.debug("get_digest_from_registry")

    # query registry
    # TODO: Handle logging in if needed
    registry_info = head_url(url=url)

    digest = registry_info.getheader('Docker-Content-Digest', None)
    if digest is not None:
        return digest
    raise RegistryError(response=registry_info)
def set_rc(new_rc):
    """Raise the global return code to new_rc; a lower or equal value leaves it unchanged."""
    global rc
    rc = max(rc, new_rc)
def ok(message):
    """Register an OK result: update the return code and store the message with an 'OK: ' prefix."""
    set_rc(OK_RC)
    messages.append('OK: {}'.format(message) if not isinstance(message, str) else 'OK: ' + message)
def warning(message):
    """Register a WARNING result: update the return code and store the message with a 'WARNING: ' prefix."""
    prefixed = 'WARNING: ' + message
    set_rc(WARNING_RC)
    messages.append(prefixed)
def critical(message):
    """Register a CRITICAL result: update the return code and store the message with a 'CRITICAL: ' prefix."""
    prefixed = 'CRITICAL: ' + message
    set_rc(CRITICAL_RC)
    messages.append(prefixed)
def unknown(message):
    """Register an UNKNOWN result: update the return code and store the message with an 'UNKNOWN: ' prefix."""
    prefixed = 'UNKNOWN: ' + message
    set_rc(UNKNOWN_RC)
    messages.append(prefixed)
def require_running(name):
    """
    Decorator factory: run the decorated check only when the container's normalized state is "running",
    otherwise report critical.

    :param name: name of the check, used in the critical message
    """
    def inner_decorator(func):
        def wrapper(container, *args, **kwargs):
            container_state = get_state(container)
            state = normalize_state(container_state)
            if state.lower() == "running":
                func(container, *args, **kwargs)
            else:
                # container is not running, can't perform check
                # (the message used to end with a stray double quote)
                critical('{container} is not "running", cannot check {check}'.format(container=container,
                                                                                   check=name))

        return wrapper

    return inner_decorator
def multithread_execution(disable_threading=DISABLE_THREADING):
    """
    Decorator factory: submit the decorated check to the parallel executor and record the task in 'threads'.

    :param disable_threading: when True the check is called inline instead. The module-level DISABLE_THREADING
        flag is still honoured at call time, so switching it on after decoration keeps working.
    """
    def inner_decorator(func):
        def wrapper(container, *args, **kwargs):
            # The parameter used to be ignored; either switch now forces inline execution
            if disable_threading or DISABLE_THREADING:
                func(container, *args, **kwargs)
            else:
                threads.append(parallel_executor.submit(func, container, *args, **kwargs))

        return wrapper

    return inner_decorator
def singlethread_execution(disable_threading=DISABLE_THREADING):
    """
    Decorator factory: submit the decorated check to the serial executor and record the task in 'threads'.

    :param disable_threading: when True the check is called inline instead. The module-level DISABLE_THREADING
        flag is still honoured at call time, so switching it on after decoration keeps working.
    """
    def inner_decorator(func):
        def wrapper(container, *args, **kwargs):
            # The parameter used to be ignored; either switch now forces inline execution
            if disable_threading or DISABLE_THREADING:
                func(container, *args, **kwargs)
            else:
                threads.append(serial_executor.submit(func, container, *args, **kwargs))

        return wrapper

    return inner_decorator
def parse_image_name(image_name):
    """
    Split an image name into registry, repository name and tag.

    A missing registry becomes DEFAULT_PUBLIC_REGISTRY, a missing tag becomes 'latest', and a single-component
    name on the default registry gets the 'library/' prefix.

    :param image_name: image name, optionally with registry and tag
    :return: ImageName
    """
    # The expressions are based on information found here
    # https://docs.docker.com/engine/reference/commandline/tag/#extended-description
    # https://github.com/docker/distribution/blob/master/reference/regexp.go
    host_segment_re = '[a-zA-Z0-9]([a-zA-Z0-9-]*[a-zA-Z0-9])?'
    hostname_re = r'({host_segment}\.)+{host_segment}'.format(host_segment=host_segment_re)
    registry_re = r'((?P<registry>({hostname_re}(:\d+)?|{host_segment_re}:\d+))/)'.format(
        host_segment_re=host_segment_re, hostname_re=hostname_re)
    name_component_ends_re = '[a-z0-9]'
    name_component_middle_re = '[a-z0-9._-]'  # Ignoring spec limit of two _
    name_component_re = '({end}{middle}*{end}|{end})'.format(end=name_component_ends_re,
                                                             middle=name_component_middle_re)
    image_name_re = "(?P<image_name>({name_component}/)*{name_component})".format(name_component=name_component_re)
    image_tag_re = '(?P<image_tag>[a-zA-Z0-9_][a-zA-Z0-9_.-]*)'
    full_re = '^{registry}?{image_name}(:{image_tag})?$'.format(registry=registry_re, image_name=image_name_re,
                                                                 image_tag=image_tag_re)
    match = re.match(full_re, image_name)

    registry = match.group('registry') or DEFAULT_PUBLIC_REGISTRY

    repository = match.group('image_name')
    if registry == DEFAULT_PUBLIC_REGISTRY and '/' not in repository:
        repository = 'library/' + repository

    tag = match.group('image_tag') or 'latest'

    full_image_name = "{registry}/{image_name}:{image_tag}".format(
        registry=registry,
        image_name=repository,
        image_tag=tag)

    return ImageName(registry=registry, name=repository, tag=tag, full_name=full_image_name)
def normalize_state(status_info):
    """
    Reduce a container state mapping to a single status string.

    When the mapping has a 'Status' entry its value is returned as is. Otherwise the boolean entries
    'Restarting', 'Paused', 'Dead' and 'Running' are examined in that order and the first true one names the
    status; when none is true the status is 'Exited'.
    """
    if "Status" in status_info:
        return status_info['Status']

    # Older layout: one boolean per state
    for flag in ('Restarting', 'Paused', 'Dead', 'Running'):
        if status_info[flag]:
            return flag
    return 'Exited'
# Checks
#############################################################################################
@multithread_execution()
@require_running(name='memory')
def check_memory(container, thresholds):
    """
    Compare a container's memory usage, minus cache, with the thresholds.

    :param container: container name
    :param thresholds: thresholds whose units must be a key of unit_adjustments; with '%' the usage is
        expressed relative to the container's memory limit
    """
    if thresholds.units not in unit_adjustments:
        unknown("Memory units must be one of {}".format(list(unit_adjustments.keys())))
        return

    memory_stats = get_stats(container)['memory_stats']
    detail = memory_stats.get('stats', {})

    # Subtracting cache to match what `docker stats` does. 'total_cache' is used when present, as before;
    # hosts that do not report it (cgroup v2) expose 'inactive_file' instead, and 0 is the last resort so the
    # check no longer dies with a KeyError.
    cache = detail.get('total_cache', detail.get('inactive_file', 0))
    adjusted_usage = memory_stats['usage'] - cache
    limit = memory_stats['limit']

    if thresholds.units == '%':
        upper_bound = 100
        usage = int(100 * adjusted_usage / limit)
    else:
        divisor = unit_adjustments[thresholds.units]
        upper_bound = limit / divisor
        usage = adjusted_usage / divisor

    evaluate_numeric_thresholds(container=container, value=usage, thresholds=thresholds, name='memory',
                                short_name='mem', min=0, max=upper_bound)
@multithread_execution()
def check_status(container, desired_state):
    """Report ok when the container's normalized state equals desired_state (case-insensitive), else critical."""
    wanted = desired_state.lower()
    actual = normalize_state(get_state(container)).lower()
    if wanted == actual:
        ok("{} status is {}".format(container, desired_state))
    else:
        critical("{} state is not {}".format(container, desired_state))
@multithread_execution()
@require_running('health')
def check_health(container):
    """
    Report the container's health status: 'healthy' is ok, 'unhealthy' is critical, anything else or missing
    health data is unknown.
    """
    state = get_state(container)
    if not ("Health" in state and "Status" in state["Health"]):
        unknown('{} has no health check data'.format(container))
        return

    health = state["Health"]["Status"]
    message = "{} is {}".format(container, health)
    if health == 'healthy':
        reporter = ok
    elif health == 'unhealthy':
        reporter = critical
    else:
        reporter = unknown
    reporter(message)
@multithread_execution()
@require_running('uptime')
def check_uptime(container, thresholds):
    """
    Compare the seconds elapsed since the container's 'StartedAt' time with the thresholds; an uptime at or
    below a threshold trips it.
    """
    started_at = get_container_info(container)['State']['StartedAt']
    # Keep "YYYY-MM-DDTHH:MM:SS" only and read it as UTC
    parsed_start = datetime.strptime(started_at[0:19], "%Y-%m-%dT%H:%M:%S")
    start = parsed_start.replace(tzinfo=timezone.utc)
    elapsed = datetime.now(timezone.utc) - start
    uptime = elapsed.total_seconds()
    graph_padding = 2
    thresholds.units = 's'
    evaluate_numeric_thresholds(container=container, value=uptime, thresholds=thresholds, name='uptime',
                                short_name='up', min=0, max=graph_padding, greater_than=False)
@multithread_execution()
@require_running('restarts')
def check_restarts(container, thresholds):
    """Compare the container's 'RestartCount' with the thresholds."""
    restart_count = int(get_container_info(container)['RestartCount'])
    graph_padding = 2
    evaluate_numeric_thresholds(container=container, value=restart_count, thresholds=thresholds,
                                name='restarts', short_name='re', min=0, max=graph_padding)
@singlethread_execution()
def check_version(container, insecure_registries):
    """
    Compare the digest of the container's image with the digest the registry reports for the same tag.

    Reports ok on a match, critical on a mismatch, and unknown when the comparison cannot be made.

    :param container: container name
    :param insecure_registries: registries to query over http instead of https
    """
    image_digest = get_container_digest(container)
    if image_digest is None:
        unknown('Checksum missing for "{}", try doing a pull'.format(container))
        return

    image_urls = get_container_image_urls(container=container)
    tag_count = len(image_urls)
    if tag_count > 1:
        unknown('"{}" has multiple tags/names. Unsure which one to use to check the version.'.format(container))
        return
    if tag_count == 0:
        unknown('"{}" has last no repository tag. Is this anywhere else?'.format(container))
        return

    url, registry = normalize_image_name_to_manifest_url(image_urls[0], insecure_registries)

    try:
        registry_hash = get_digest_from_registry(url)
    except URLError as e:
        cause = e.reason
        if getattr(cause, 'reason', None) == 'UNKNOWN_PROTOCOL':
            message = "TLS error connecting to registry {} for {}, should you use the '--insecure-registry' flag?"
            unknown(message.format(registry, container))
            return
        if getattr(cause, 'strerror', None) == 'nodename nor servname provided, or not known':
            unknown("Cannot reach registry for {} at {}".format(container, url))
            return
        raise
    except RegistryError:
        unknown("Cannot check version, couldn't retrieve digest for {} while checking {}.".format(container, url))
        return

    if registry_hash == image_digest:
        ok("{}'s version matches registry".format(container))
    else:
        critical("{}'s version does not match registry".format(container))
def calculate_cpu_capacity_precentage(info, stats):
    """
    Compute the share of its cpu allowance a container used between the two samples held in stats.

    :param info: container inspection data; 'HostConfig' is read for 'NanoCpus', 'CpuQuota' and 'CpuPeriod'
    :param stats: stats data with 'cpu_stats' and 'precpu_stats' samples
    :return: usage as a percentage, rounded to a whole number (float)
    """
    host_config = info['HostConfig']
    current = stats['cpu_stats']

    if 'online_cpus' in current:
        num_cpus = current['online_cpus']
    else:
        num_cpus = len(current['cpu_usage']['percpu_usage'])

    # Identify limit system being used
    if host_config.get('NanoCpus', 0) != 0:
        # --cpus
        period, quota = 1000000000, host_config['NanoCpus']
    elif host_config.get('CpuQuota', 0) != 0:
        # --cpu-quota
        period = 100000 if host_config['CpuPeriod'] == 0 else host_config['CpuPeriod']
        quota = host_config['CpuQuota']
    else:
        # unlimited
        period, quota = 1, num_cpus

    capacity = period * num_cpus
    # A quota bigger than what all the cpus can provide is treated as no restriction
    available_limit_ratio = 1 if capacity < quota else capacity / quota

    previous = stats['precpu_stats']
    cpu_delta = current['cpu_usage']['total_usage'] - previous['cpu_usage']['total_usage']
    system_delta = current['system_cpu_usage'] - previous['system_cpu_usage']

    fraction = (cpu_delta / system_delta) * available_limit_ratio
    return round(fraction * 100, 0)
@multithread_execution()
@require_running('cpu')
def check_cpu(container, thresholds):
    """Evaluate a container's CPU usage percentage against the given thresholds.

    :param container: name of the container to check.
    :param thresholds: threshold object; its ``units`` attribute is set to '%'.
    """
    usage = calculate_cpu_capacity_precentage(info=get_container_info(container),
                                              stats=get_stats(container=container))
    thresholds.units = '%'
    evaluate_numeric_thresholds(container=container, value=usage, thresholds=thresholds,
                                name='cpu', short_name='cpu', min=0, max=100)
def process_args(args):
    """Parse the command line and set the module-level connection settings.

    Sets the globals ``timeout``, ``daemon`` and ``connection_type`` from the
    parsed options.

    :param args: list of command-line arguments (without the program name).
    :return: the ``argparse.Namespace`` holding the parsed options.
    """
    global timeout, daemon, connection_type

    parser = argparse.ArgumentParser(description='Check docker containers.')

    # Where to reach the daemon: local socket / plain host:port, or TLS host:port
    where = parser.add_mutually_exclusive_group()
    where.add_argument('--connection', dest='connection', default=DEFAULT_SOCKET,
                       metavar='[/<path to>/docker.socket|<ip/host address>:<port>]',
                       help='Where to find docker daemon socket. (default: %(default)s)')
    where.add_argument('--secure-connection', dest='secure_connection',
                       metavar='[<ip/host address>:<port>]',
                       help='Where to find TLS protected docker daemon socket.')

    # Base used for KB/MB/GB/TB conversions
    units = parser.add_mutually_exclusive_group()
    units.add_argument('--binary_units', dest='units_base', action='store_const', const=1024,
                       help='Use a base of 1024 when doing calculations of KB, MB, GB, & TB (This is default)')
    units.add_argument('--decimal_units', dest='units_base', action='store_const', const=1000,
                       help='Use a base of 1000 when doing calculations of KB, MB, GB, & TB')
    parser.set_defaults(units_base=1024)

    # Connection timeout
    parser.add_argument('--timeout', dest='timeout', type=float, default=DEFAULT_TIMEOUT,
                        help='Connection timeout in seconds. (default: %(default)s)')

    # Container name patterns
    parser.add_argument('--containers', dest='containers', nargs='+', default=['all'],
                        help='One or more RegEx that match the names of the container(s) to check. If omitted all containers are checked. (default: %(default)s)')

    # Require every pattern to match something
    parser.add_argument('--present', dest='present', action='store_true', default=False,
                        help='Modifies --containers so that each RegEx must match at least one container.')

    # Threads
    parser.add_argument('--threads', dest='threads', type=int, default=DEFAULT_PARALLELISM,
                        help='This + 1 is the maximum number of concurent threads/network connections. (default: %(default)s)')

    # CPU
    parser.add_argument('--cpu', dest='cpu', metavar='WARN:CRIT',
                        help='Check cpu usage percentage taking into account any limits. Valid values are 0 - 100.')

    # Memory
    parser.add_argument('--memory', dest='memory', metavar='WARN:CRIT:UNITS',
                        help='Check memory usage taking into account any limits. Valid values for units are %%,B,KB,MB,GB.')

    # State
    parser.add_argument('--status', dest='status',
                        help='Desired container status (running, exited, etc).')

    # Health (default stays None so "not requested" can be told apart)
    parser.add_argument('--health', dest='health', action='store_true', default=None,
                        help="Check container's health check status")

    # Age
    parser.add_argument('--uptime', dest='uptime', metavar='WARN:CRIT',
                        help='Minimum container uptime in seconds. Use when infrequent crashes are tolerated.')

    # Version (default stays None so "not requested" can be told apart)
    parser.add_argument('--version', dest='version', action='store_true', default=None,
                        help='Check if the running images are the same version as those in the registry. Useful for finding stale images. Does not support login.')

    # Registries reachable without TLS
    parser.add_argument('--insecure-registries', dest='insecure_registries', nargs='+', default=[],
                        help='List of registries to connect to with http(no TLS). Useful when using "--version" with images from insecure registries.')

    # Restart
    parser.add_argument('--restarts', dest='restarts', metavar='WARN:CRIT',
                        help='Container restart thresholds.')

    # no-ok
    parser.add_argument('--no-ok', dest='no_ok', action='store_true',
                        help='Make output terse suppressing OK messages. If all checks are OK return a single OK.')

    # no-performance
    parser.add_argument('--no-performance', dest='no_performance', action='store_true',
                        help='Suppress performance data. Reduces output when performance data is not being used.')

    parser.add_argument('-V', action='version', version='%(prog)s {}'.format(__version__))

    # With no arguments at all, show the help before parsing.
    if len(args) == 0:
        parser.print_help()

    opts = parser.parse_args(args=args)

    timeout = opts.timeout

    if opts.secure_connection:
        daemon = 'https://' + opts.secure_connection
        connection_type = 'https'
    elif opts.connection:
        # A leading slash means a socket path, anything else is host:port.
        if opts.connection.startswith('/'):
            daemon = 'socket://' + opts.connection + ':'
            connection_type = 'socket'
        else:
            daemon = 'http://' + opts.connection
            connection_type = 'http'

    return opts
def no_checks_present(parsed_args):
    """Tell whether the parsed options request no check at all.

    Every module-level name starting with ``check_`` is treated as a check whose
    option is the remainder of the name. ``--present`` counts as a check even
    though it is not implemented as one.

    :param parsed_args: the parsed options namespace.
    :return: True when no check option is set and ``present`` is false.
    """
    prefix = 'check_'
    for name in list(globals()):
        if not name.startswith(prefix):
            continue
        if getattr(parsed_args, name[len(prefix):]) is not None:
            return False
    return not parsed_args.present
def socketfile_permissions_failure(parsed_args):
    """Tell whether the configured docker socket file is unusable.

    Only applies when the connection type is 'socket'; for any other connection
    type the answer is always False.

    :param parsed_args: the parsed options; ``connection`` is the socket path.
    :return: True when the path is missing, is not a socket, or is not both
        readable and writable.
    """
    if connection_type != 'socket':
        return False

    path = parsed_args.connection
    usable = (os.path.exists(path)
              and stat.S_ISSOCK(os.stat(path).st_mode)
              and os.access(path, os.R_OK)
              and os.access(path, os.W_OK))
    return not usable
def print_results():
    """Print the collected messages, followed by performance data when enabled.

    Reads the module-level ``messages``, ``performance_data``, ``no_ok`` and
    ``no_performance``.
    """
    if no_ok:
        # Drop all the "OK"s; if nothing is left, report a single OK.
        shown = [entry for entry in messages if not entry.startswith('OK: ')]
        summary = '; '.join(shown) if len(shown) != 0 else 'OK'
    else:
        summary = '; '.join(messages)

    if no_performance or len(performance_data) == 0:
        print(summary)
    else:
        print(summary + '|' + ' '.join(performance_data))
def perform_checks(raw_args):
    """Parse the arguments and launch every requested check for each container.

    Sets the module-level executors, unit table and output flags, validates the
    argument combination, then calls the ``check_*`` functions selected on the
    command line. Problems with the arguments or the socket are reported through
    ``unknown()`` and end the function early.

    :param raw_args: command-line arguments without the program name.
    """
    args = process_args(raw_args)

    global parallel_executor
    parallel_executor = futures.ThreadPoolExecutor(max_workers=args.threads)
    global serial_executor
    serial_executor = futures.ThreadPoolExecutor(max_workers=1)

    global unit_adjustments
    unit_adjustments = {key: args.units_base ** value for key, value in UNIT_ADJUSTMENTS_TEMPLATE.items()}

    global no_ok
    no_ok = args.no_ok

    # Was assigned from args.no_ok, which made --no-performance a no-op and
    # made --no-ok silently drop the performance data.
    global no_performance
    no_performance = args.no_performance

    if socketfile_permissions_failure(args):
        unknown("Cannot access docker socket file. User ID={}, socket file={}".format(os.getuid(), args.connection))
        return

    if args.containers == ["all"] and args.present:
        unknown("You can not use --present without --containers")
        return

    if no_checks_present(args):
        unknown("No checks specified.")
        return

    # Here is where all the work happens
    #############################################################################################
    containers = get_containers(args.containers, args.present)

    if len(containers) == 0 and not args.present:
        unknown("No containers names found matching criteria")
        return

    for container in containers:

        # Check status
        if args.status:
            check_status(container, args.status)

        # Check version
        if args.version:
            check_version(container, args.insecure_registries)

        # below are checks that require a 'running' status

        # Check health
        if args.health:
            check_health(container)

        # Check cpu usage
        if args.cpu:
            check_cpu(container, parse_thresholds(args.cpu, units_required=False))

        # Check memory usage
        if args.memory:
            check_memory(container, parse_thresholds(args.memory, units_required=False))

        # Check uptime
        if args.uptime:
            check_uptime(container, parse_thresholds(args.uptime, include_units=False))

        # Check restart count
        if args.restarts:
            check_restarts(container, parse_thresholds(args.restarts, include_units=False))
def main():
    """Run the checks, print the results and exit with the accumulated return code."""
    try:
        perform_checks(argv[1:])

        # Fetch every result so that exceptions raised in threads bubble out here.
        for finished in futures.as_completed(threads):
            finished.result()
    except Exception as e:
        traceback.print_exc()
        unknown("Exception raised during check': {}".format(repr(e)))

    print_results()
    exit(rc)


if __name__ == '__main__':
    main()

181
files/nrpe/check_eth Executable file
View File

@ -0,0 +1,181 @@
#!/usr/bin/perl -w
#
# check_eth - Nagios/NRPE plugin reporting the RX/TX throughput of one network
# interface. The byte counters are read from /proc/net/dev and compared with the
# reading saved by the previous run in "$tmpfile-<interface>".
#
use strict;
use warnings;
use Getopt::Long;

use constant BITS  => 8;
use constant BYTES => 1;

my $iface     = "";
my $bandwidth = "";
my $warning   = "";
my $critical  = "";
my $percent   = "";

# Multiplier applied to the speeds for display and perfdata (BYTES = bytes/s).
my $bitmod = BYTES;

# Prefix of the state file holding the previous reading.
# NOTE(review): predictable name in a world-writable directory; kept unchanged
# so existing state files stay valid.
my $tmpfile = "/tmp/traffic";
my $output  = "";

my %status = (
    'OK'       => 0,
    'WARNING'  => 1,
    'CRITICAL' => 2,
    'UNKNOWN'  => 3
);
my $exit_status = $status{OK};

my %data = (
    'time'    => 0, 'last_time'    => 0,
    'rxbytes' => 0, 'last_rxbytes' => 0,
    'txbytes' => 0, 'last_txbytes' => 0
);

my %speed = (
    'tx'       => 0,
    'rx'       => 0,
    'interval' => 1
);

# Options are parsed only now because usage() reads %status and $tmpfile,
# which must already be initialised when it runs.
GetOptions(
    "i|interface=s" => \$iface,
    "w|warning=s"   => \$warning,
    "c|critical=s"  => \$critical,
    "b|bandwidth=s" => \$bandwidth,
    "p|percent"     => \$percent
) or usage();

usage() if ( !$iface || !$warning || !$critical );

# The interface name becomes part of a file name: refuse path separators and
# whitespace, which no real interface name contains.
usage() if ( $iface =~ m{[/\s]} );

if ( $percent ) {
    usage() if ( !$bandwidth || $bandwidth !~ /^\d+[kKmMgG]$/ );
    usage() if ( $warning !~ /^\d{1,3}$/ || $warning>100 || $critical !~ /^\d{1,3}$/ || $critical>100 );
    $bandwidth = human2bytes($bandwidth);
} else {
    $warning  = human2bytes($warning);
    $critical = human2bytes($critical);
    usage() if ( !$warning || !$critical );
}
usage() if ( $warning > $critical );

# Read the current RX/TX byte counters of the interface.
my $net;
if ( !open( $net, '<', '/proc/net/dev' ) ) {
    print "UNKNOWN - can't open /proc/net/dev: $!\n";
    exit $status{UNKNOWN};
}
my $found = 0;
while ( my $entry = <$net> ) {
    chomp $entry;
    # Layout: "<iface>: rxbytes + 7 more RX columns, txbytes + 7 more TX columns".
    # \Q..\E keeps characters such as "." in the interface name literal.
    if ( $entry =~ /^\s*\Q$iface\E:\s*(\d+)(?:\s+\d+){7}\s+(\d+)(?:\s+\d+){7}\s*$/ ) {
        $data{time}    = time - 1;
        $data{rxbytes} = $1;
        $data{txbytes} = $2;
        $found = 1;
        last;
    }
}
close( $net );

if ( !$found ) {
    print "UNKNOWN - interface $iface not found in /proc/net/dev\n";
    exit $status{UNKNOWN};
}

# Load the previous reading, ignoring a missing, empty or malformed state file.
my $statefile = "$tmpfile-$iface";
if ( open( my $in, '<', $statefile ) ) {
    my $previous = <$in>;
    close( $in );
    if ( defined $previous ) {
        chomp $previous;
        my @saved = split( ":", $previous );
        if ( @saved == 3 && !grep { !/^\d+$/ } @saved ) {
            ( $data{last_time}, $data{last_rxbytes}, $data{last_txbytes} ) = @saved;
        }
    }
}

# Save the current reading for the next run (best effort, as before).
if ( open( my $out, '>', $statefile ) ) {
    print {$out} "$data{time}:$data{rxbytes}:$data{txbytes}\n";
    close( $out );
}

# Without a usable previous reading, or after a counter reset, fall back to the
# current values so the computed speed is 0 instead of negative.
$data{last_time}    = $data{time}    if ( !$data{last_time}    || $data{last_time}    > $data{time} );
$data{last_rxbytes} = $data{rxbytes} if ( !$data{last_rxbytes} || $data{last_rxbytes} > $data{rxbytes} );
$data{last_txbytes} = $data{txbytes} if ( !$data{last_txbytes} || $data{last_txbytes} > $data{txbytes} );

# The "+ 1" keeps the interval from ever being zero.
$speed{interval} = $data{time} - $data{last_time} + 1;
$speed{rx} = ( $data{rxbytes} - $data{last_rxbytes} ) / $speed{interval};
$speed{tx} = ( $data{txbytes} - $data{last_txbytes} ) / $speed{interval};

$output = "RX Bytes: ". bytes2human($data{rxbytes}) ."B, TX Bytes: ". bytes2human($data{txbytes}) ."B; ";
$output .= sprintf( "RX Speed: %s%sps, TX Speed: %s%sps; ",
    bytes2human($speed{rx}*$bitmod), ($bitmod==BITS)?"b":"B", bytes2human($speed{tx}*$bitmod), ($bitmod==BITS)?"b":"B" );

if ( $percent ) {
    if ( ( $speed{rx} / $bandwidth ) * 100 > $critical || ( $speed{tx} / $bandwidth ) * 100 > $critical ) {
        $exit_status = $status{CRITICAL};
        $output .= "CRITICAL";
    } elsif ( ( $speed{rx} / $bandwidth ) * 100 > $warning || ( $speed{tx} / $bandwidth ) * 100 > $warning ) {
        $exit_status = $status{WARNING};
        $output .= "WARNING";
    } else {
        $output .= "OK";
    }
} else {
    if ( ( $speed{rx} > $critical ) or ( $speed{tx} > $critical ) ) {
        $exit_status = $status{CRITICAL};
        $output .= "CRITICAL";
    } elsif ( ( $speed{rx} > $warning ) or ( $speed{tx} > $warning ) ) {
        $exit_status = $status{WARNING};
        $output .= "WARNING";
    } else {
        $output .= "OK";
    }
}

$output .= " bandwidth utilization";
# Perfdata: value;warn;crit for each direction, thresholds converted to bytes/s.
$output .= sprintf( " | rx=%.0f;%2.0f;%2.0f tx=%.0f;%2.0f;%2.0f",
    $speed{rx}*$bitmod, ($percent)?$warning*$bandwidth/100:$warning, ($percent)?$critical*$bandwidth/100:$critical,
    $speed{tx}*$bitmod, ($percent)?$warning*$bandwidth/100:$warning, ($percent)?$critical*$bandwidth/100:$critical );

print "$output\n";
exit( $exit_status );
# helper function
# Format a count with a binary unit suffix ('', K, M, G or T).
# Takes the number to format; returns 0 for a false value, otherwise the
# scaled number rounded to an integer followed by its unit.
sub bytes2human {
    my $bytes = shift;
    return 0 if !$bytes;

    my @units  = ( '', 'K', 'M', 'G', 'T' );
    my $offset = 0;

    # Stop at the largest unit so the index never runs past the end of @units.
    while ( $bytes > 1024 && $offset < $#units ) {
        $bytes = $bytes / 1024;
        $offset++;
    }
    return sprintf( "%2.0f%s", $bytes, $units[$offset] );
}
# Convert a threshold such as "12M", "800k" or "1500" to bytes.
# An upper-case K/M/G/T unit means bytes; lower-case k/m/g means bits, so the
# result is divided by BITS. Lower-case t is treated as bytes, as it always was.
# A value without unit is a plain byte count.
# Returns 0 for an empty, malformed or unknown-unit value.
sub human2bytes {
    my $value = shift;
    return 0 if !defined $value;

    # The unit is optional and limited to known letters. The previous pattern
    # used \w, which took the last digit of a plain number as its unit.
    return 0 if $value !~ /^(\d+)([kKmMgGtT]?)$/;
    my ( $number, $scale ) = ( $1, $2 );

    my $bitmod = ( $scale =~ /[kmg]/ ) ? BITS : BYTES;
    my %power_of = ( '' => 0, 'K' => 1, 'M' => 2, 'G' => 3, 'T' => 4 );

    return $number * ( 1024**$power_of{ uc $scale } ) / $bitmod;
}
# Print the command-line help and exit with the UNKNOWN status.
sub usage {
    print <<EOU;
Usage: $0 -i <interface> -w <warn> -c <critical> [-p -b <bandwidth>]

    -i, --interface STRING
        Network interface name (example: eth0)
    -w, --warning STRING
        Warning interface speed level (K/M/G Bps, k/m/g bps)
        If using with -p value should be in percentage (1-100)
    -c, --critical STRING
        Critical interface speed level (K/M/G Bps, k/m/g bps)
        If using with -p value should be in percentage (1-100)
    -p
        Calculate warning and critical levels in percentage based on interface bandwidth
    -b, --bandwidth STRING
        Interface bandwidth value (K/M/G Bps, k/m/g bps)
EOU

    # NOTE(review): this removes "$tmpfile" itself, while the state files are
    # written as "$tmpfile-<interface>"; presumably a leftover, kept unchanged.
    unlink($tmpfile);
    exit $status{UNKNOWN};
}

139
files/nrpe/check_exim_mailqueue Executable file
View File

@ -0,0 +1,139 @@
#!/bin/sh
###############################################
#
# Nagios script to check Exim mail queue status
#
# Copyright 2007, 2008 Ian Yates
#
# NOTE: Depending on your config, the nagios user will probably be
# needed to be added to the exim group for this script to function correctly
#
# See usage for command line switches
#
# You need to add the following to /etc/sudoers:
# nagios ALL=NOPASSWD:/usr/local/exim/bin/exim
#
# Created: 2006-07-31 (i.yates@uea.ac.uk)
# Updated: 2007-04-30 (i.yates@uea.ac.uk) - Linux/sudo tweaks
# Updated: 2008-03-26 (i.yates@uea.ac.uk) - Fixed bug in critical/warning level checking which could result in erroneous results.
# Updated: 2008-11-27 (i.yates@uea.ac.uk) - Added GPLv3 licence
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
###############################################
# Shared Nagios shell helpers; the STATE_* exit codes used below are expected
# to come from this file, since they are not defined in this script.
. /usr/lib/nagios/plugins/utils.sh
# Version shown by usage()
VERSION="1.3"
# Binaries used to query the queue
EXIM=/usr/sbin/exim
SUDO=/usr/bin/sudo
# Set by -v; not used by the checks
FLAG_VERBOSE=FALSE
# Thresholds given with -w and -c
LEVEL_WARN=""
LEVEL_CRIT=""
# Message and exit status reported by theend()
RESULT=""
EXIT_STATUS=$STATE_OK
###############################################
#
## FUNCTIONS
#
## Print the help text on standard output
usage() {
    cat <<EOF
 check_eximailqueue $VERSION - Nagios Exim mail queue check script

 Usage: check_eximailqueue -w <warning queue size> -c <critical queue size> [ -v ] [ -h ]

 -w Queue size at which a warning is triggered
 -c Queue size at which a critical is triggered
 -v Verbose output (ignored for now)
 -h Show this page

EOF
}
## Process command line options
# Sets LEVEL_WARN, LEVEL_CRIT and FLAG_VERBOSE from the arguments.
# Help requests, bad options and missing or non-numeric thresholds print a
# message and exit with UNKNOWN, so that they are never reported as OK.
doopts() {
    if [ "$#" -eq 0 ]
    then
        usage
        exit "${STATE_UNKNOWN:-3}"
    fi

    while getopts w:c:vh myarg "$@"
    do
        case $myarg in
            w)
                LEVEL_WARN=$OPTARG;;
            c)
                LEVEL_CRIT=$OPTARG;;
            v)
                FLAG_VERBOSE=TRUE;;
            *) # -h, unknown option or missing option argument
                usage
                exit "${STATE_UNKNOWN:-3}";;
        esac
    done

    # The main section compares these numerically, so both must be integers.
    for level in "$LEVEL_WARN" "$LEVEL_CRIT"
    do
        case $level in
            ''|*[!0-9]*)
                echo "Mailqueue UNKNOWN - -w and -c are required and must be non-negative integers"
                exit "${STATE_UNKNOWN:-3}";;
        esac
    done
}
# Write the result message and exit with the matching status
theend() {
    # Quoted and printed with printf so the message is emitted exactly as built,
    # without word splitting or pathname expansion.
    printf '%s\n' "$RESULT"
    exit $EXIT_STATUS
}
#
## END FUNCTIONS
#
#############################################
#
## MAIN
#
# Handle command line options
doopts "$@"

# Ask exim for the number of messages on the queue
OUTPUT=`$SUDO -u root $EXIM -bpc`

case $OUTPUT in
    '')
        RESULT="Mailqueue WARNING - query returned no output!"
        EXIT_STATUS=$STATE_WARNING
        ;;
    *[!0-9]*)
        # Anything but a plain number used to fall through every comparison
        # and end as an empty message with status OK.
        RESULT="Mailqueue UNKNOWN - unexpected output from exim: $OUTPUT"
        EXIT_STATUS=${STATE_UNKNOWN:-3}
        ;;
    *)
        if test "$OUTPUT" -lt "$LEVEL_WARN" ; then
            RESULT="Mailqueue OK - $OUTPUT messages on queue"
            EXIT_STATUS=$STATE_OK
        elif test "$OUTPUT" -ge "$LEVEL_CRIT" ; then
            RESULT="Mailqueue CRITICAL - $OUTPUT messages on queue"
            EXIT_STATUS=$STATE_CRITICAL
        elif test "$OUTPUT" -ge "$LEVEL_WARN" ; then
            RESULT="Mailqueue WARNING - $OUTPUT messages on queue"
            EXIT_STATUS=$STATE_WARNING
        else
            # Only reached when a threshold is not a usable number.
            RESULT="Mailqueue UNKNOWN - cannot compare $OUTPUT with the given thresholds"
            EXIT_STATUS=${STATE_UNKNOWN:-3}
        fi
        ;;
esac

# Quit and return information and exit status
theend

42
files/nrpe/check_mdadm Executable file
View File

@ -0,0 +1,42 @@
#!/bin/bash
#
# Created by Sebastian Grewe, Jammicron Technology
#
# Nagios/NRPE check for Linux software RAID, based on /proc/mdstat.
# Exit codes used: 0 = OK, 1 = WARNING, 3 = UNKNOWN (mdstat not readable).
#

MDSTAT=/proc/mdstat

if [[ ! -r $MDSTAT ]]; then
    echo "UNKNOWN - cannot read $MDSTAT"
    exit 3
fi

# Lines that need attention: a member map with a missing device ("_", as in
# [U_]) or a progress bar such as [==>......]. The "_" case had been dropped
# from the count, so a degraded array with no rebuild running was reported OK.
ATTENTION_PATTERN='\[([U_]*_[U_]*|[=>.]+)\]'

# Get count of raid arrays
RAID_DEVICES=$(grep -c '^md' "$MDSTAT")

# Get count of degraded or busy arrays
RAID_STATUS=$(grep -Ec "$ATTENTION_PATTERN" "$MDSTAT")

# Is an array currently recovering, get percentage of recovery
RAID_RECOVER=$(grep recovery "$MDSTAT" | awk '{print $4}')
RAID_RESYNC=$(grep resync "$MDSTAT" | awk '{print $4}')
RAID_CHECK=$(grep check "$MDSTAT" | awk '{print $4}')

# Check raid status
# RAID recovers --> Warning
if [[ $RAID_RECOVER ]]; then
    STATUS="WARNING - Checked $RAID_DEVICES arrays, recovering : $RAID_RECOVER"
    EXIT=1
elif [[ $RAID_RESYNC ]]; then
    STATUS="WARNING - Checked $RAID_DEVICES arrays, resync : $RAID_RESYNC"
    EXIT=1
# A running consistency check is not a problem
elif [[ $RAID_CHECK ]]; then
    STATUS="OK - Checked $RAID_DEVICES arrays, check : $RAID_CHECK"
    EXIT=0
# RAID ok
elif [[ $RAID_STATUS == "0" ]]; then
    STATUS="OK - Checked $RAID_DEVICES arrays."
    EXIT=0
# Everything else (degraded array, other operation in progress) is a WARNING.
# NOTE(review): the earlier comment said "critical" but the exit code was 1;
# the severity is kept unchanged here, confirm which one is intended.
else
    EXTEND_RAID_STATUS=$(grep -E "$ATTENTION_PATTERN" "$MDSTAT" | awk '{print $2}' | uniq -c | xargs echo)
    STATUS="WARNING - Checked $RAID_DEVICES arrays, $RAID_STATUS have failed check: $EXTEND_RAID_STATUS"
    EXIT=1
fi

# Status and quit
echo "$STATUS"
exit $EXIT

124
files/nrpe/check_memory Executable file
View File

@ -0,0 +1,124 @@
#!/usr/bin/env bash
# Nagios/NRPE check reporting memory and swap usage read from /proc/meminfo.
#Set script name
SCRIPT=`basename ${BASH_SOURCE[0]}`
#Set default values
# Memory usage thresholds in percent (warning / critical), overridden by -w / -c
optMW=95
optMC=98
# Swap usage thresholds in percent (warning / critical), overridden by -W / -C
optSW=95
optSC=98
# help function
# Prints the usage text and exits with status 3 (UNKNOWN). The previous exit
# status of 1 made a help request or a usage error look like a WARNING.
function printHelp {
  echo -e \\n"Help for $SCRIPT"\\n
  echo -e "Basic usage: $SCRIPT -w {warning} -c {critical} -W {warning} -C {critical}"\\n
  echo "Command switches are optional, default values for warning is 95% and critical is 98%"
  echo "-w - Sets warning value for Memory Usage. Default is 95%"
  echo "-c - Sets critical value for Memory Usage. Default is 98%"
  echo "-W - Sets warning value for Swap Usage. Default is 95%"
  echo "-C - Sets critical value for Swap Usage. Default is 98%"
  echo -e "-h - Displays this help message"\\n
  echo -e "Example: $SCRIPT -w 80 -c 90 -W 40 -C 60"\\n
  echo -e \\n\\n"Author: Lukasz Gogolin, lukasz.gogolin@gmail.com"
  echo -e "Git: http://bitbucket.org/lgogolin/nagios_plugins"
  exit 3
}
# regex to check is OPTARG an integer
re='^[0-9]+$'

# Exit with UNKNOWN unless the value given for option -$1 is an integer.
# The message goes to standard output because that is what the monitoring
# server displays.
function requireInteger {
  if ! [[ $2 =~ $re ]] ; then
    echo "UNKNOWN - option -$1: '$2' is not a number"
    exit 3
  fi
}

# The leading ':' selects silent error reporting: a missing value arrives as
# FLAG=':' and an unknown option as FLAG='?', with the option letter in OPTARG.
while getopts :w:c:W:C:h FLAG; do
  case $FLAG in
    w)
      requireInteger "$FLAG" "$OPTARG"
      optMW=$OPTARG
      ;;
    c)
      requireInteger "$FLAG" "$OPTARG"
      optMC=$OPTARG
      ;;
    W)
      requireInteger "$FLAG" "$OPTARG"
      optSW=$OPTARG
      ;;
    C)
      requireInteger "$FLAG" "$OPTARG"
      optSC=$OPTARG
      ;;
    h)
      printHelp
      ;;
    :)
      echo "UNKNOWN - option -$OPTARG requires a value"
      exit 3
      ;;
    \?)
      echo -e \\n"Option - $OPTARG not allowed."
      # printHelp exits on its own; the subshell lets this branch choose the
      # final status.
      ( printHelp )
      exit 3
      ;;
  esac
done

shift $((OPTIND-1))
# Read the counters (in kB) by exact field name, so that additional or
# reordered lines in /proc/meminfo cannot shift the values.
read -r memTotal_k memFree_k memBuffer_k memCache_k swapTotal_k swapFree_k < <(
  awk '
    $1 == "MemTotal:"  { total = $2 }
    $1 == "MemFree:"   { free = $2 }
    $1 == "Buffers:"   { buffers = $2 }
    $1 == "Cached:"    { cached = $2 }
    $1 == "SwapTotal:" { swap_total = $2 }
    $1 == "SwapFree:"  { swap_free = $2 }
    END { printf "%d %d %d %d %d %d\n", total, free, buffers, cached, swap_total, swap_free }
  ' /proc/meminfo 2>/dev/null
)

# Without a memory total nothing can be computed (and the percentage below
# would divide by zero).
if ! [[ $memTotal_k =~ ^[0-9]+$ ]] || [ "$memTotal_k" -eq 0 ]; then
  echo "[MEMORY] UNKNOWN - could not read memory totals from /proc/meminfo"
  exit 3
fi

memTotal_b=$((memTotal_k*1024))
memFree_b=$((memFree_k*1024))
memBuffer_b=$((memBuffer_k*1024))
memCache_b=$((memCache_k*1024))

memTotal_m=$((memTotal_k/1024))
memFree_m=$((memFree_k/1024))
memBuffer_m=$((memBuffer_k/1024))
memCache_m=$((memCache_k/1024))

# Used memory excludes free memory, buffers and page cache.
memUsed_b=$((memTotal_b-memFree_b-memBuffer_b-memCache_b))
memUsed_m=$((memTotal_m-memFree_m-memBuffer_m-memCache_m))

memUsedPrc=$(((memUsed_b*100)/memTotal_b))

swapTotal_b=$((swapTotal_k*1024))
swapFree_b=$((swapFree_k*1024))
swapUsed_k=$((swapTotal_k-swapFree_k))
swapUsed_b=$((swapUsed_k*1024))

swapTotal_m=$((swapTotal_k/1024))
swapFree_m=$((swapFree_k/1024))
swapUsed_m=$((swapTotal_m-swapFree_m))

# A host without swap reports 0% instead of dividing by zero.
if [ "$swapTotal_k" -eq 0 ]; then
  swapUsedPrc=0
else
  swapUsedPrc=$(((swapUsed_k*100)/swapTotal_k))
fi

message="[MEMORY] Total: $memTotal_m MB - Used: $memUsed_m MB - $memUsedPrc% [SWAP] Total: $swapTotal_m MB - Used: $swapUsed_m MB - $swapUsedPrc% | MTOTAL=$memTotal_b;;;; MUSED=$memUsed_b;;;; MCACHE=$memCache_b;;;; MBUFFER=$memBuffer_b;;;; STOTAL=$swapTotal_b;;;; SUSED=$swapUsed_b;;;;"

echo "$message"

# 2 = CRITICAL, 1 = WARNING, 0 = OK
if [ "$memUsedPrc" -ge "$optMC" ] || [ "$swapUsedPrc" -ge "$optSC" ]; then
  exit 2
elif [ "$memUsedPrc" -ge "$optMW" ] || [ "$swapUsedPrc" -ge "$optSW" ]; then
  exit 1
else
  exit 0
fi

View File

@ -0,0 +1,237 @@
#!/usr/bin/perl
# $Id$
#
# check_mysql_longqueries plugin for Nagios
#
# Copyright (C) 2009 Vincent Rivellino <vrivellino@paybycash.com>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#
#
# Checks MySQL's processlist to see if there are queries running longer than
# defined thresholds.
#
# Requires the following modules:
# DBI
# Monitoring::Plugin
#
# Copyright Notice: GPLv2
#
# CHANGES
#
# 30 Jan 2009 - Vincent Rivellino <vrivellino@paybycash.com>
# Initial version released.
#
# 02 Mar 2020 - Ludovic Cartier <ludovic.cartier@brainsys.io>
# Replace Nagios::Plugin by Monitoring::Plugin
# need debian package libmonitoring-plugin-perl
#
use warnings;
use strict;
use DBI;
use Monitoring::Plugin;
## create the plugin object
my $np = Monitoring::Plugin->new(
    usage   => "Usage: %s [-v|--verbose] [-H <host>] [-P <port>] [-S <socket>] [-u <user>] [-p <password>] -w <warn time> -c <crit time>",
    version => "1.0",
    license => join( "\n",
        'Copyright (C) 2009 Vincent Rivellino <vrivellino@paybycash.com>',
        'This plugin comes with ABSOLUTELY NO WARRANTY. This is free software, and you',
        'are welcome to redistribute it under the conditions of version 2 of the GPL.' ),
);

## command line arguments, in registration order: [ spec, help text, required? ]
my @option_table = (
    [ 'host|H=s',          "-H, --host\n MySQL server host" ],
    [ 'port|P=i',          "-P, --port\n MySQL server port" ],
    [ 'socket|S=s',        "-S, --socket\n MySQL server socket" ],
    [ 'user|u=s',          "-u, --user\n database user (must have privilege to SHOW PROCESSLIST)" ],
    [ 'password|p=s',      "-p, --password\n database password" ],
    [ 'warn|w=i',          "-w, --warn\n Query time in seconds to generate a WARNING", 1 ],
    [ 'crit|c=i',          "-c, --crit\n Query time in seconds to generate a CRITICAL", 1 ],
    [ 'db=s',              "--db\n Only check queries running on this database\n To specify more than one, separate with commas." ],
    [ 'skip_db=s',         "--skip_db\n Don't check queries running on this database\n To specify more than one, separate with commas." ],
    [ 'clientuser=s',      "--clientuser\n Only check queries running by this MySQL user\n To specify more than one, separate with commas." ],
    [ 'skip_clientuser=s', "--skip_clientuser\n Don't check queries running by this MySQL user\n To specify more than one, separate with commas." ],
    [ 'clienthost=s',      "--clienthost\n Only check queries running from this client host\n To specify more than one, separate with commas." ],
    [ 'skip_clienthost=s', "--skip_clienthost\n Don't check queries running from this client host\n To specify more than one, separate with commas." ],
);

for my $entry (@option_table) {
    my ( $spec, $help, $required ) = @{$entry};
    my %arg = ( spec => $spec, help => $help );
    $arg{required} = 1 if $required;
    $np->add_arg(%arg);
}

## parse the command line arguments
$np->getopts;
my $verbose = $np->opts->verbose || 0;

if ( $verbose >= 2 ) {
    print "Plugin options:\n";
    printf " %-23s %d\n", "verbose:", $verbose;
    printf " %-23s %s\n", "host:", $np->opts->host || '';
    printf " %-23s %s\n", "port:", $np->opts->port || '';
    printf " %-23s %s\n", "socket:", $np->opts->socket || '';
    printf " %-23s %s\n", "user:", $np->opts->user || '';
    printf " %-23s %s\n", "password:", $np->opts->password || '';
    printf " %-23s %d\n", "warn:", $np->opts->warn;
    printf " %-23s %d\n", "crit:", $np->opts->crit;
    printf " %-23s %s\n", "db:", $np->opts->db || '';
    printf " %-23s %s\n", "skip_db:", $np->opts->skip_db || '';
    printf " %-23s %s\n", "clientuser:", $np->opts->clientuser || '';
    printf " %-23s %s\n", "skip_clientuser:", $np->opts->skip_clientuser || '';
    printf " %-23s %s\n", "clienthost:", $np->opts->clienthost || '';
    printf " %-23s %s\n", "skip_clienthost:", $np->opts->skip_clienthost || '';
}

# Turn a comma-separated option value into a lookup set (hash reference).
# An undefined or empty value gives an empty set.
sub csv_to_set {
    my ($csv) = @_;
    $csv = '' if !defined $csv;
    my %set = map { $_ => 1 } grep { length } split( /,/, $csv );
    return \%set;
}

# Extract restrictions from args. The lists used to be built with
# split( '/,/', ... ), whose pattern is the literal text "/,/", so
# comma-separated values were never split.
my $only_db   = csv_to_set( $np->opts->db );
my $skip_db   = csv_to_set( $np->opts->skip_db );
my $only_user = csv_to_set( $np->opts->clientuser );
my $skip_user = csv_to_set( $np->opts->skip_clientuser );
my $only_host = csv_to_set( $np->opts->clienthost );
my $skip_host = csv_to_set( $np->opts->skip_clienthost );

alarm $np->opts->timeout;

## setup the dsn - no need to specify a database
my $dsn = 'DBI:mysql:';

## if we're connecting to localhost (by name) or the host isn't defined ...
if ( ! $np->opts->host || $np->opts->host eq 'localhost' ) {
    # connect via a local socket (if it's defined)
    $dsn .= ';mysql_socket=' . $np->opts->socket
        if $np->opts->socket;

## otherwise, attempt to connect via host and/or port (if they're defined)
} else {
    $dsn .= ';host=' . $np->opts->host
        if $np->opts->host;
    $dsn .= ';port=' . $np->opts->port
        if $np->opts->port;
}

## print dsn if really verbose
print "DSN: '$dsn' USER: '", $np->opts->user || '', "' PASS: '", $np->opts->password || '', "'\n"
    if $verbose >= 2;

## connect to the database server
my $dbh = DBI->connect( $dsn, $np->opts->user || '', $np->opts->password || '',
    { RaiseError => 0, PrintError => 0, AutoCommit => 1 } )
    or $np->nagios_exit( UNKNOWN, "Could not connect to database: $DBI::errstr" );

## get the list of running queries
my $sth = $dbh->prepare( 'SHOW FULL PROCESSLIST' );
$sth->execute();
$np->nagios_exit( UNKNOWN, $sth->errstr ) if $sth->err;

## bind each row result to a hash
my %row;
$sth->bind_columns( \( @row{ @{$sth->{NAME_lc} } } ));

## use these to keep track of the longest-running query
my $longquery_info = '';
my $longquery_time = 0;

## process the results
my $count = 0;
while ( $sth->fetch ) {
    $count++;

    # skip if time is zero or NULL
    next unless $row{'time'};

    # skip ignorable results
    next if $row{'user'} eq 'system user';
    next if $row{'command'} =~ m/(Sleep|Binlog Dump|Ping|Processlist)/io;

    # extract connection info
    my $db   = $row{'db'}   || '';
    my $user = $row{'user'} || '';
    my $host = $row{'host'} || '';
    $host =~ s/:\d+$//o;

    # Skip rows outside the requested scope. "Only" lists keep a row when its
    # value is one of the entries; "skip" lists drop it when it is one of them.
    # Plain string comparison is used, so the values are never treated as
    # regular expressions.
    next if $np->opts->db              and !$only_db->{$db};
    next if $np->opts->skip_db         and $skip_db->{$db};
    next if $np->opts->clientuser      and !$only_user->{$user};
    next if $np->opts->skip_clientuser and $skip_user->{$user};
    next if $np->opts->clienthost      and !$only_host->{$host};
    next if $np->opts->skip_clienthost and $skip_host->{$host};

    # only save the longest running query
    if ( $row{'time'} > $longquery_time ) {
        $longquery_time = $row{'time'};
        $longquery_info = "TIME: $row{'time'}";
        foreach my $k ( sort keys %row ) {
            next if $k eq 'time' or $k eq 'info';
            $longquery_info .= " $k=" . ( $row{$k} || 'NULL' );
        }
        $longquery_info .= " INFO=" . ( $row{'info'} || 'NULL' );
    }
}

# we're done with the db handle
$dbh->disconnect;

# OK if no long queries were found
$np->nagios_exit( OK, "No long running queries found ($count threads checked)" ) unless $longquery_info;

# check for crit, then warn
$np->nagios_exit( CRITICAL, $longquery_info ) if $longquery_time >= $np->opts->crit;
$np->nagios_exit( WARNING, $longquery_info ) if $longquery_time >= $np->opts->warn;

# OK if the longest query didn't reach crit or warn
$np->nagios_exit( OK, "No long running queries found ($count threads checked)" );

View File

@ -0,0 +1,140 @@
#!/bin/bash
###################################################################
# check_postfix_mailqueue is developped with GPL Licence 2.0
#
# GPL License: http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt
#
# Developped by : Bjoern Bongermino
#
###################################################################
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
####################################################################
# Uncomment to enable debugging
# set -x
# Name, version and author shown by print_version / print_help
PROGNAME=`basename $0`
VERSION="Version 1.0"
AUTHOR="Bjoern Bongermino (http://www.bongermino.de)"
# Nagios exit codes
STATE_OK=0
STATE_WARNING=1
STATE_CRITICAL=2
STATE_UNKNOWN=3
# Thresholds for deferred mails, set with -w / -c; thresholds are only
# evaluated when both are greater than 0
warning=0
critical=0
# Print program name, version and author on one line
print_version() {
    printf '%s %s %s\n' "$PROGNAME" "$VERSION" "$AUTHOR"
}
# Print the version line followed by the help text, then exit with STATE_OK
print_help() {
    print_version $PROGNAME $VERSION
    cat <<EOF

$PROGNAME - Checks postfix mailqueue statistic

$PROGNAME is a Nagios plugin which generates statistics
for the postfix mailqueue and checks for corrupt messages.
The following values will be checked:
maildrop: Localy posted mail
incoming: Processed local mail and received from network
active: Mails being delivered (should be small)
deferred: Stuck mails (that will be retried later)
corrupt: Messages found to not be in correct format (shold be 0)
hold: Recent addition, messages put on hold indefinitly - delete of free

Usage: $PROGNAME -w WARN-Level -c CRIT-Level

Options:
 -w)
 Warning level for deferred mails
 -c)
 Critical level for deferred mail
 -h)
 This help
 -v)
 Version
EOF
    exit $STATE_OK
}
# Check for parameters
while test -n "$1"; do
    case "$1" in
        -h)
            print_help
            exit $STATE_OK;;
        -v)
            print_version
            exit $STATE_OK;;
        -w)
            warning=$2
            shift
            ;;
        -c)
            critical=$2
            shift
            ;;
        *)
            # This branch used to call check_postfix_mailqueue, which is not
            # defined yet at this point; report the bad argument instead.
            echo "Postfix Mailqueue UNKNOWN - unknown argument: $1 (usage: $PROGNAME -w WARN-Level -c CRIT-Level)"
            exit $STATE_UNKNOWN
            ;;
    esac
    shift
done

# The thresholds are compared numerically later on, so refuse anything that is
# not a non-negative integer (including a -w or -c given without a value).
for threshold in "$warning" "$critical"; do
    case "$threshold" in
        ''|*[!0-9]*)
            echo "Postfix Mailqueue UNKNOWN - warning and critical levels must be non-negative integers"
            exit $STATE_UNKNOWN
            ;;
    esac
done
# Count the messages in each postfix queue directory.
# Sets: deferred, active, maildrop, incoming, corrupt, hold.
# Exits with STATE_CRITICAL when the spool directory cannot be entered.
check_postfix_mailqueue() {
    # Can be set via environment, but default is fetched by postconf (if available,
    # else /var/spool/postfix)
    if command -v postconf > /dev/null 2>&1 ; then
        SPOOLDIR=${spooldir:-$(postconf -h queue_directory)}
    else
        SPOOLDIR=${spooldir:-/var/spool/postfix}
    fi

    # Quoted so that a path containing whitespace is not split into words.
    cd "$SPOOLDIR" >/dev/null 2>/dev/null || {
        echo -n "Cannot cd to $SPOOLDIR"
        exit $STATE_CRITICAL
    }

    # Get values: number of files below each queue directory, 0 when the
    # directory does not exist
    deferred=$( (test -d deferred && find deferred -type f) | wc -l)
    active=$( (test -d active && find active -type f) | wc -l)
    maildrop=$( (test -d maildrop && find maildrop -type f) | wc -l)
    incoming=$( (test -d incoming && find incoming -type f) | wc -l)
    corrupt=$( (test -d corrupt && find corrupt -type f) | wc -l)
    hold=$( (test -d hold && find hold -type f) | wc -l)
}
# Collect the queue counters
check_postfix_mailqueue

values="Deferred mails=$deferred Active deliveries=$active Locally posted mails=$maildrop Incoming mails=$incoming Corrupt mails=$corrupt Mails on hold=$hold"
perfdata="deferred=$deferred;; active=$active;; maildrop=$maildrop;; incoming=$incoming;; corrupt=$corrupt;; hold=$hold;;"

# Any corrupt message is critical, whatever the thresholds are
if [ $corrupt -gt 0 ]; then
    echo -n "Postfix Mailqueue CRITICAL - $corrupt corrupt messages found! | $perfdata"
    exit $STATE_CRITICAL
fi

# Default verdict; only overridden when both thresholds are set and exceeded
state_label="OK"
state_code=$STATE_OK

if [ $warning -gt 0 ] && [ $critical -gt 0 ]; then
    if [ $deferred -gt $critical ]; then
        state_label="CRITICAL"
        state_code=$STATE_CRITICAL
    elif [ $deferred -gt $warning ]; then
        state_label="WARNING"
        state_code=$STATE_WARNING
    fi
fi

echo -n "Postfix Mailqueue $state_label - $values | $perfdata"
exit $state_code

11848
files/nrpe/check_postgresql Executable file

File diff suppressed because it is too large Load Diff

101
files/nrpe/check_proc_age Executable file
View File

@ -0,0 +1,101 @@
#! /bin/bash
# Nagios plugin
# created 09.01.2011 by symphonic.mushroom@gmail.com
# modified 04.24.2012 by symphonic.mushroom@gmail.com with the advices from formwandler
# modified 07.22.2017 by symphonic.mushroom@gmail.com with the help from Toby Wahlers toby@100.rpm.com
# check if processes matching to a pattern are exceeding a given elapsed time
# return a Nagios exit code depending on the result
# 0 = OK
# 1 = WARNING
# 2 = CRITICAL
# 3 = UNKNOWN
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# for help printing
# Print the usage text (one line per message) and terminate the script
# with exit code 3 (UNKNOWN).
print_help() {
    printf '%s\n' \
        "This Nagios plugin check if processes matching to a pattern are exceeding a given elapsed time" \
        "Usage : $0 -p <process_name> -w <seconds> -c <seconds> " \
        " -p parameter : name of the monitoring process. For granularity, quote commands with spaces." \
        " -w parameter : minimal elapsed time for status WARNING on NAGIOS, in seconds." \
        " -c parameter : minimal elapsed time for status CRITICAL on NAGIOS, in seconds." \
        "returned performance data : number of process; oldest time in minutes; warning time in minutes; critical time in minutes; 0;"
    exit 3
}
# check if there is at least one argument
# ($1 is quoted so that an argument containing spaces does not break `[`)
if [ -z "$1" ]
then
    echo "Missing arguments"
    echo "try '$0 --help' for help"
    exit 3
fi

# print help
if [[ $1 = "--help" || $1 = "-h" ]]
then
    print_help
    exit 3
fi

# assign value to arguments
# print an error in case of unknown argument
# (the leading ':' puts getopts in silent mode: a missing option value is
#  reported as ':' and an unknown option as '?')
while getopts ":w:c:p:" options
do
    case $options in
        w ) warning=$OPTARG ;;
        c ) critical=$OPTARG ;;
        p ) proc=$OPTARG ;;
        : ) echo "Missing value for -$OPTARG"
            echo "try '$0 --help' for help"
            exit 3 ;;
        * ) echo "Unknown argument"
            echo "try '$0 --help' for help"
            exit 3 ;;
    esac
done

# check if all arguments are present
if [[ -z $warning || -z $critical || -z $proc ]]
then
    echo "Missing argument"
    echo "try '$0 --help' for help"
    exit 3
fi

# the thresholds are used in integer arithmetic and comparisons later on,
# so reject anything that is not a whole number of seconds
case "$warning$critical" in
    *[!0-9]* )
        echo "Warning and critical thresholds must be whole numbers of seconds"
        echo "try '$0 --help' for help"
        exit 3 ;;
esac
#calculate number of process
# (whole-word match on the command line; this script and grep are excluded)
nbproc=$(ps -A -o args | grep -w -- "$proc" | grep -vF -- "$0" | grep -v grep | wc -l)

ageproc=
if [ "$nbproc" -gt 0 ]
then
    #calculate age of oldest process, in seconds
    # ps prints elapsed time as [[dd-]hh:]mm:ss. The number of fields returned
    # by split() decides which parts are present; testing the field value
    # itself is wrong because "00" is false in awk, which turned 01:00:00
    # into 60 seconds and dropped the day of 1-00:05:10.
    # The same -w filter as for the count is used so both refer to the same
    # set of processes.
    ageproc=$(ps -A -o etime,comm,args | grep -w -- "$proc" | grep -vF -- "$0" | grep -v grep | awk '
        {
            nparts = split($1, t, ":")
            days = 0
            if (split(t[1], td, "-") == 2) { days = td[1]; t[1] = td[2] }
            if (nparts == 3) {
                secs = (t[1] * 60 + t[2]) * 60 + t[3]
            } else {
                secs = t[1] * 60 + t[2]
            }
            secs += days * 86400
            if (NR == 1 || secs > maxi) { maxi = secs }
        }
        END { if (NR > 0) printf "%d\n", maxi }')
fi

# nothing matched (or every match ended between the two ps calls)
if [ -z "$ageproc" ]
then
    echo "OK: there is no process matching $proc"
    exit 0
fi

# human readable age: seconds below one minute, minutes below one hour,
# hours and minutes otherwise
if [ "$ageproc" -lt 60 ]
then
    maxage="$ageproc Seconds"
elif [ "$ageproc" -lt 3600 ]
then
    maxage="$((ageproc / 60)) Minutes"
else
    maxage="$((ageproc / 3600)) Hours $((ageproc % 3600 / 60)) minutes"
fi

msg="there are $nbproc process $proc, oldest has got $maxage age"
perfmaxage=$((ageproc / 60))
perfdata="Processes=${nbproc:-0} MaxAge=${perfmaxage:-0}Minutes;$((warning / 60));$((critical / 60));0;"

if [ "$ageproc" -gt "$critical" ]
then
    echo "CRITICAL: $msg | $perfdata"
    exit 2
elif [ "$ageproc" -gt "$warning" ]
then
    echo "WARNING: $msg | $perfdata"
    exit 1
else
    echo "OK: $msg | $perfdata"
    exit 0
fi

7
files/nrpe/check_process Executable file
View File

@ -0,0 +1,7 @@
#!/bin/bash
# Run check_procs for every required daemon and exit with the worst status
# seen. The previous version accumulated a status in $rc but never used it,
# so the script always exited 0 (OK); it also OR-ed the codes together,
# which turns WARNING(1)|CRITICAL(2) into UNKNOWN(3).
# NOTE(review): check_procs is invoked through sudo, but the sudoers template
# of this role only lists the two mail queue plugins - confirm that the
# nagios user is actually allowed to run this command.
rc=0
for proc in cron rsyslogd ntpd munin-node; do
    sudo /usr/lib/nagios/plugins/check_procs -C "$proc" -c 1:
    status=$?
    if [ "$status" -gt "$rc" ]; then
        rc=$status
    fi
done
exit "$rc"

33
files/nrpe/check_rofs Executable file
View File

@ -0,0 +1,33 @@
#!/bin/bash
# checks for read_only fs
# @Author Joerg 'johe' Stephan <johe.stephan@googlemail.com>
#
# Nagios exit codes
E_SUCCESS="0"
E_WARNING="1"
E_CRITICAL="2"
E_UNKNOWN="3"

# The mount point to test is mandatory; without it there is nothing to check.
if [ -z "$1" ]; then
    echo "Usage: check_rofs.sh <mountpoint>"
    exit ${E_UNKNOWN}
fi
tfs=$1
# Accept "/data/" as well as "/data" (but leave "/" alone).
if [ "$tfs" != "/" ]; then
    tfs=${tfs%/}
fi

# Look the mount point up in /proc/mounts. When it is listed more than once
# the last entry is kept, since a later mount hides an earlier one.
found=0
options=""
while read -r diskid mountpoint fs mntopts rub1 rub2; do
    if [ "$mountpoint" = "$tfs" ]; then
        found=1
        options=$mntopts
    fi
done < /proc/mounts

if [ "$found" -eq 0 ]; then
    echo "No filesystem is mounted on $tfs"
    exit ${E_UNKNOWN}
fi

# Match "rw"/"ro" as complete options, not as substrings of other options.
case ",$options," in
    *,rw,*)
        echo "The Filesystem mounted on $tfs is writeable"
        exit ${E_SUCCESS}
        ;;
    *,ro,*)
        echo "The Filesystem mounted on $tfs is NOT writeable"
        exit ${E_CRITICAL}
        ;;
    *)
        echo "Test result empty (For any reason)"
        exit ${E_WARNING}
        ;;
esac

View File

@ -0,0 +1,50 @@
#!/bin/bash
# Copyright © 2016, 2017 Mohamed El Morabity <melmorabity@fedoraproject.com>
#
# This module is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This software is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.
# Load utils.sh from the directory this plugin lives in; the STATE_*
# variables used below are expected to come from it.
# All expansions are quoted so that paths or unit names containing spaces
# or glob characters are passed through unchanged.
PLUGINDIR=$(dirname "$0")
. "$PLUGINDIR/utils.sh"

# Exactly one argument: the systemd unit to check.
if [[ $# -ne 1 ]]; then
    echo "Usage: ${0##*/} <service name>"
    exit $STATE_UNKNOWN
fi

service=$1

# `systemctl is-enabled` prints the enablement state; no output at all is
# treated as "unit does not exist".
status=$(systemctl is-enabled "$service" 2>/dev/null)
r=$?

if [[ -z "$status" ]]; then
    echo "ERROR: service $service doesn't exist"
    exit $STATE_CRITICAL
fi

# A non-zero exit status means the unit is not enabled; report the state
# systemctl printed.
if [[ $r -ne 0 ]]; then
    echo "ERROR: service $service is $status"
    exit $STATE_CRITICAL
fi

# Enabled: it must also be running right now.
if ! systemctl --quiet is-active "$service"; then
    echo "ERROR: service $service is not running"
    exit $STATE_CRITICAL
fi

echo "OK: service $service is running"
exit $STATE_OK

3
tasks/main.yml Normal file
View File

@ -0,0 +1,3 @@
---
# Entry point of the role: statically pull in the NRPE tasks.
# `import_tasks` replaces the bare `include`, which is deprecated and no
# longer available in current ansible-core releases.
- name: "monitoring | install nrpe"
  import_tasks: nrpe.yml

42
tasks/nrpe.yml Normal file
View File

@ -0,0 +1,42 @@
---
# Refresh the apt cache at most once a day.
- name: nrpe | apt update cache
  apt:
    update_cache: yes
    cache_valid_time: 86400  # One day

# Install everything in a single apt transaction: the package list is passed
# straight to `name` instead of looping with with_items. The cache is not
# refreshed again here; the task above already handles that and a second
# unconditional update would defeat its cache_valid_time.
- name: nrpe | install nrpe packages
  apt:
    name:
      - nagios-nrpe-server
      - libmonitoring-plugin-perl
      - monitoring-plugins-standard
      - libdbd-mysql-perl
    state: present
# Render the NRPE command definitions; the result is registered so the
# service is only restarted when the configuration actually changed.
- name: nrpe | copy nrpe configuration
  template:
    src: "nrpe.j2"
    dest: "/etc/nagios/nrpe.d/brainsys.cfg"
    mode: "0644"
    force: yes
    backup: yes
  register: nrpe_config

# Ship the custom plugins next to the distribution ones.
- name: nrpe | copy nrpe plugins
  copy:
    src: nrpe/
    dest: /usr/lib/nagios/plugins
    mode: "0755"

# Restart only on a configuration change, otherwise just make sure the
# service is running, so that repeated runs of the role are idempotent.
- name: nrpe | restart nagios-nrpe-server
  systemd:
    name: nagios-nrpe-server
    state: "{{ 'restarted' if nrpe_config is changed else 'started' }}"
# Install the sudo rules for the nagios user. The rendered file is checked
# with visudo before it replaces the destination.
- name: nrpe | allow nagios user to specific sudo
  template:
    dest: /etc/sudoers.d/nrpe
    src: nrpe.sudoers.j2
    mode: "0440"
    validate: 'visudo -cf %s'

51
templates/nrpe.j2 Normal file
View File

@ -0,0 +1,51 @@
# Hosts allowed to query this NRPE daemon (rendered from nrpe_allowed_hosts).
allowed_hosts={{ nrpe_allowed_hosts }}
# NOTE(review): none of the commands defined in this file reference $ARGn$
# macros - confirm whether dont_blame_nrpe really needs to be enabled.
dont_blame_nrpe=1
# system: load, memory/swap, mail queue, SMTP port, process counts, DNS, TLS
# certificate, network interface, fail2ban. Thresholds in {{ '{{' }} ... {{ '}}' }} come
# from the role variables.
command[check_load]=/usr/lib/nagios/plugins/check_load -w {{ nrpe_load_warning }} -c {{ nrpe_load_critical }}
command[check_memory]=/usr/lib/nagios/plugins/check_memory -w {{ nrpe_memory_warning }} -c {{ nrpe_memory_critical }} -W {{ nrpe_swap_warning }} -C {{ nrpe_swap_critical }}
# check_mailq runs through sudo; the command line has to match the rule in
# the sudoers template exactly (same plugin path, same -w/-c values).
command[check_mailq]=/usr/bin/sudo /usr/lib/nagios/plugins/check_postfix_mailqueue -w {{ nrpe_postfix_warning }} -c {{ nrpe_postfix_critical }}
command[check_smtp]=/usr/lib/nagios/plugins/check_tcp -p 25
command[check_zombie_procs]=/usr/lib/nagios/plugins/check_procs -w 5 -c 10 -s Z
command[check_total_procs]=/usr/lib/nagios/plugins/check_procs -w 500 -c 800
command[check_process]=/usr/lib/nagios/plugins/check_process
command[check_dns]=/usr/lib/nagios/plugins/check_dns -H google.com
command[check_ssl]=/usr/lib/nagios/plugins/check_http --sni 'www.brainsys.io' -C 14,3
# The interface is the host's default IPv4 interface as gathered by Ansible.
command[check_eth]=/usr/lib/nagios/plugins/check_eth -i {{ ansible_default_ipv4.interface }} -w {{ nrpe_eth_warning }} -c {{ nrpe_eth_critical }}
command[check_proc_fail2ban]=/usr/lib/nagios/plugins/check_procs -a fail2ban -w 1: -c 1:
# check_proc_age is only defined when nrpe_proc_age_process names the process
# to watch. The previous line shipped the literal placeholder <proc>, which
# is not a process name (and, as the command line is handed to a shell, the
# angle brackets would be read as redirections).
{% if nrpe_proc_age_process is defined %}
command[check_proc_age]=/usr/lib/nagios/plugins/check_proc_age -p '{{ nrpe_proc_age_process }}' -w 400 -c 600
{% endif %}
# disk
# -w space warning / -c space critical / -W inode warning / -K inode critical / -C reset after
# Free space/inode thresholds per mount point, plus a read-only check
# (check_rofs) for / and /data.
command[check_disk_advanced]=/usr/lib/nagios/plugins/check_disk_advanced -x /lib/init/rw -x /sys -x /dev/shm -X tmpfs -X nsfs -X proc -X sysfs -X devtmpfs -X overlay -X tracefs -w 10% -c 3% -W 10% -K 3% -H
command[check_disk_root]=/usr/lib/nagios/plugins/check_disk -w 30% -W 30% -c 10% -K 10% -p /
command[check_rw_root]=/usr/lib/nagios/plugins/check_rofs /
command[check_disk_data]=/usr/lib/nagios/plugins/check_disk -w 30% -W 30% -c 10% -K 10% -p /data
command[check_rw_data]=/usr/lib/nagios/plugins/check_rofs /data
# mysql
# NOTE(review): the database passwords below are hard-coded in this template
# and therefore stored in the repository, and the rendered file is installed
# with mode 0644. Consider moving them to (vaulted) role variables and
# rotating the current values.
command[check_mysql]=/usr/lib/nagios/plugins/check_mysql -u nagios -pBu[VetFeifoipVithlok2odHabrAiltAjHavciUjRi -d mysql -H 127.0.0.1
command[check_mysql_longqueries]=/usr/lib/nagios/plugins/check_mysql_longqueries -u nagios -pBu[VetFeifoipVithlok2odHabrAiltAjHavciUjRi -H 127.0.0.1 -w 600 -c 1200
# postgresql
# Port check plus connection and backend-count checks as the nagios user
# (same review note as above applies to --dbpass).
command[check_pgsql_port]=/usr/lib/nagios/plugins/check_tcp -p 5432
command[check_pgsql_connection]=/usr/lib/nagios/plugins/check_postgresql -H 127.0.0.1 -p 5432 --dbuser=nagios --dbpass=uDUTHt14FC3w4cE9vRk4XyZFD3KWlx --action=connection
command[check_pgsql_backends]=/usr/lib/nagios/plugins/check_postgresql -H 127.0.0.1 -p 5432 --dbuser=nagios --dbpass=uDUTHt14FC3w4cE9vRk4XyZFD3KWlx --action=backends -w 175 -c 190
# raid
# Software RAID (mdadm) and 3ware hardware RAID.
# NOTE(review): check_3ware is run through sudo, but the sudoers template of
# this role only lists the two mail queue plugins - confirm a matching rule
# exists, otherwise this command cannot run unattended.
command[check_mdadm]=/usr/lib/nagios/plugins/check_mdadm
command[check_3ware]=/usr/bin/sudo /usr/lib/nagios/plugins/check_3ware
# services
# One check_systemd_service command per unit. The PHP-FPM entries differ
# only by version number, so they are generated from a list.
command[check_proc_docker]=/usr/lib/nagios/plugins/check_systemd_service docker
command[check_proc_haproxy]=/usr/lib/nagios/plugins/check_systemd_service haproxy
command[check_proc_nginx]=/usr/lib/nagios/plugins/check_systemd_service nginx
{% for php_version in ['5.6', '7.0', '7.1', '7.2', '7.3', '7.4', '8.0', '8.1'] %}
command[check_proc_php{{ php_version }}]=/usr/lib/nagios/plugins/check_systemd_service php{{ php_version }}-fpm
{% endfor %}
command[check_proc_mysql]=/usr/lib/nagios/plugins/check_systemd_service mysql
command[check_proc_postgresql]=/usr/lib/nagios/plugins/check_systemd_service postgresql

View File

@ -0,0 +1,2 @@
# Passwordless sudo for the nagios user, limited to the mail queue plugins
# with exactly the thresholds rendered into the NRPE configuration; any
# other argument list does not match these rules.
# NOTE(review): the rules allow running as any user, (ALL); if the plugins
# only ever need root, restricting the run-as list would be tighter - confirm.
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_postfix_mailqueue -w {{ nrpe_postfix_warning }} -c {{ nrpe_postfix_critical }}
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_exim_mailqueue -w {{ nrpe_exim_warning }} -c {{ nrpe_exim_critical }}