add check systemd failed

This commit is contained in:
Ludovic Cartier
2025-09-18 17:36:11 +02:00
parent 4bb3edc19f
commit c3248d2351

63
files/nrpe/check_systemd_failed Executable file
View File

@@ -0,0 +1,63 @@
#!/bin/bash
#
# =============================================================================
#
# Nagios plugin to check for failed systemd services.
#
# Author: GitHub Copilot
# Version: 1.1
#
# =============================================================================
#
# This script checks for any systemd services that have entered a "failed"
# state. It is designed to be used as a Nagios check command.
#
# It correctly parses the output from 'systemctl' to identify the service
# name, even if the output line starts with a special character like '●'.
#
# Nagios Exit Codes:
# 0 - OK: No failed services found.
# 1 - WARNING: (Not used by this script)
# 2 - CRITICAL: One or more services are in a failed state.
# 3 - UNKNOWN: The script could not determine the status of services.
#
# =============================================================================
# --- Nagios Exit Codes ---
# Exit statuses this plugin can hand back to Nagios/NRPE.
# They are declared readonly so that a later accidental reassignment cannot
# silently change the state that gets reported.
readonly STATE_OK=0
# STATE_WARNING is not used by this check; it is kept so that the complete set
# of plugin states is defined in one place.
# shellcheck disable=SC2034
readonly STATE_WARNING=1
readonly STATE_CRITICAL=2
readonly STATE_UNKNOWN=3
# --- Script ---
# Ask systemd for every service unit that is currently in the "failed" state.
#   --no-legend  suppresses the header and footer lines
#   --plain      removes special characters and formatting, so the unit name
#                is the first whitespace-separated column of each line
# Each output line then looks like:
#   prometheus-nginx-exporter.service loaded failed failed Prometheus Nginx Exporter
#
# The assignment is tested directly (instead of inspecting $? afterwards) so
# the status being checked is always the one returned by systemctl.
if ! failed_services_output=$(systemctl list-units --type=service --state=failed --no-legend --plain); then
  echo "UNKNOWN: Could not query systemd for failed services. Is systemctl available?"
  exit "$STATE_UNKNOWN"
fi

# Collect the first column (the unit name) of every non-blank line.
# 'read' splits on whitespace: the first word lands in unit_name and the rest
# of the line is discarded into '_'.
failed_service_names=()
while read -r unit_name _; do
  if [[ -n "$unit_name" ]]; then
    failed_service_names+=("$unit_name")
  fi
done <<< "$failed_services_output"

service_count=${#failed_service_names[@]}

# Nothing collected means no service is in the failed state.
if (( service_count == 0 )); then
  echo "OK: All systemd services are running correctly."
  exit "$STATE_OK"
fi

# Join the names with ", ": printf repeats the format once per array element,
# which leaves one trailing separator that is stripped afterwards.
printf -v service_list_formatted '%s, ' "${failed_service_names[@]}"
service_list_formatted=${service_list_formatted%, }

# Report the failure to Nagios on a single status line.
printf 'CRITICAL: Found %d failed systemd service(s): %s\n' \
  "$service_count" "$service_list_formatted"
exit "$STATE_CRITICAL"