add check systemd failed

2025-09-18 17:36:11 +02:00
parent 4bb3edc19f
commit c3248d2351
1 changed files with 63 additions and 0 deletions
--- a/files/nrpe/check_systemd_failed
+++ b/files/nrpe/check_systemd_failed
@@ -0,0 +1,63 @@
 #!/bin/bash
 #
 # =============================================================================
 #
 # Nagios plugin to check for failed systemd services.
 #
 # Author: GitHub Copilot
 # Version: 1.1
 #
 # =============================================================================
 #
 # This script checks for any systemd services that have entered a "failed"
 # state. It is designed to be used as a Nagios check command.
 #
 # It passes '--plain' to 'systemctl' so that output lines do not start with a
 # status glyph like '●', which keeps the service name in the first column.
 #
 # Nagios Exit Codes:
 # 0 - OK: No failed services found.
 # 1 - WARNING: (Not used by this script)
 # 2 - CRITICAL: One or more services are in a failed state.
 # 3 - UNKNOWN: The script could not determine the status of services.
 #
 # =============================================================================
 # --- Nagios Exit Codes ---
 # Standard Nagios plugin return codes. They are declared read-only so that no
 # later statement can silently change what a given state exits with.
 readonly STATE_OK=0
 # STATE_WARNING is never returned by this check; it is defined only to keep
 # the set of Nagios states complete.
 # shellcheck disable=SC2034
 readonly STATE_WARNING=1
 readonly STATE_CRITICAL=2
 readonly STATE_UNKNOWN=3
 # --- Script ---
 # Ask systemd for every service unit currently in the "failed" state.
 #   --no-legend  drops the column header and the footer hints, leaving one
 #                unit per line.
 #   --plain      prints a flat list without the status glyph or tree layout.
 # Each remaining line looks like:
 # prometheus-nginx-exporter.service loaded failed failed Prometheus Nginx Exporter
 #
 # The assignment is tested directly: its exit status is that of the command
 # substitution, so a failing or missing systemctl is detected without going
 # through $?. Only stdout is captured; systemctl's own diagnostics stay on
 # stderr.
 if ! failed_services_output=$(systemctl list-units --type=service --state=failed --no-legend --plain); then
    echo "UNKNOWN: Could not query systemd for failed services. Is systemctl available?"
    exit "$STATE_UNKNOWN"
 fi
 # Turn the captured systemctl listing into a Nagios result.
 # Reads:   failed_services_output (one failed unit per line, possibly empty)
 # Outputs: a single status line on stdout
 # Exits:   STATE_OK when no unit is listed, STATE_CRITICAL otherwise
 #
 # The unit name is normally the first column. A first column without a dot
 # cannot be a unit name (unit names end in a ".type" suffix), so it is taken
 # to be a status glyph such as '●' and the second column is used instead.
 failed_units=()
 while read -r first_col second_col _; do
    # Skip blank lines; an empty listing arrives here as exactly one of them.
    [[ -n "$first_col" ]] || continue
    if [[ "$first_col" != *.* && -n "$second_col" ]]; then
       first_col=$second_col
    fi
    failed_units+=("$first_col")
 done <<< "$failed_services_output"
 # Nothing listed means nothing has failed.
 if (( ${#failed_units[@]} == 0 )); then
    echo "OK: All systemd services are running correctly."
    exit "$STATE_OK"
 fi
 # Join the names with ", ": printf reuses its format once per argument, which
 # leaves one surplus separator at the end to strip.
 printf -v unit_list '%s, ' "${failed_units[@]}"
 unit_list=${unit_list%, }
 echo "CRITICAL: Found ${#failed_units[@]} failed systemd service(s): $unit_list"
 exit "$STATE_CRITICAL"