#!/bin/bash
#
# =============================================================================
#
# Nagios plugin to check for failed systemd services.
#
# Author: GitHub Copilot
# Version: 1.1
#
# Location in repository: files/nrpe/check_systemd_failed (mode 0755)
#
# =============================================================================
#
# This script reports every systemd service unit that is currently in the
# "failed" state. It is designed to be used as a Nagios/NRPE check command
# and takes no arguments.
#
# Unit names are taken from 'systemctl list-units --plain'. As a defensive
# measure, a leading status marker such as '●' is skipped if one is still
# present at the start of an output line, and blank lines are ignored.
#
# Nagios Exit Codes:
#   0 - OK:       No failed services found.
#   1 - WARNING:  (Not used by this script)
#   2 - CRITICAL: One or more services are in a failed state.
#   3 - UNKNOWN:  The script could not determine the status of services.
#
# =============================================================================

# --- Nagios Exit Codes ---
readonly STATE_OK=0
# shellcheck disable=SC2034  # kept so the full Nagios state set is documented
readonly STATE_WARNING=1
readonly STATE_CRITICAL=2
readonly STATE_UNKNOWN=3

#######################################
# Extract one unit name per non-blank input line.
# The name is normally the first column. If the first column does not look
# like a service unit (for example a leading '●' marker) and a second column
# exists, the second column is used instead.
# Arguments: none (reads 'systemctl list-units' lines from stdin)
# Outputs:   one unit name per line on stdout
#######################################
extract_unit_names() {
  awk 'NF {
    name = $1
    if (name !~ /\.service$/ && NF > 1) {
      name = $2
    }
    print name
  }'
}

#######################################
# Join all arguments into a single "a, b, c" string.
# Arguments: the items to join (at least one)
# Outputs:   the joined list on stdout
#######################################
join_with_comma() {
  local joined
  printf -v joined '%s, ' "$@"
  # Drop the separator that printf appended after the last item.
  printf '%s\n' "${joined%, }"
}

#######################################
# Query systemd and report the result in Nagios plugin format.
# Globals:   STATE_OK, STATE_CRITICAL, STATE_UNKNOWN (read)
# Outputs:   a single status line on stdout
# Exits:     0 (OK), 2 (CRITICAL) or 3 (UNKNOWN); never returns
#######################################
main() {
  local raw_output
  local -a failed_units=()

  # --plain drops the tree/marker decoration and --no-legend drops the header
  # and footer, so each remaining line looks like:
  #   prometheus-nginx-exporter.service loaded failed failed Prometheus Nginx Exporter
  # The assignment is tested directly so the status checked is systemctl's own.
  if ! raw_output=$(systemctl list-units --type=service --state=failed \
    --no-legend --plain); then
    echo "UNKNOWN: Could not query systemd for failed services. Is systemctl available?"
    exit "$STATE_UNKNOWN"
  fi

  # One array element per failed unit; blank lines never become elements, so
  # the array length is the true number of failed services.
  mapfile -t failed_units < <(extract_unit_names <<<"$raw_output")

  if ((${#failed_units[@]} == 0)); then
    echo "OK: All systemd services are running correctly."
    exit "$STATE_OK"
  fi

  echo "CRITICAL: Found ${#failed_units[@]} failed systemd service(s): $(join_with_comma "${failed_units[@]}")"
  exit "$STATE_CRITICAL"
}

main "$@"