add check systemd failed
This commit is contained in:
63
files/nrpe/check_systemd_failed
Executable file
63
files/nrpe/check_systemd_failed
Executable file
@@ -0,0 +1,63 @@
|
||||
#!/bin/bash
#
# =============================================================================
#
# Nagios plugin to check for failed systemd services.
#
# Author: GitHub Copilot
# Version: 1.1
#
# =============================================================================
#
# This script checks for any systemd services that have entered a "failed"
# state. It is designed to be used as a Nagios check command.
#
# It asks 'systemctl' for --plain output so that each line is expected to
# start with the unit name rather than a status marker like '●'. The first
# whitespace-separated field of every non-blank line is reported as the
# service name.
#
# Usage: check_systemd_failed   (takes no arguments)
#
# Output (stdout, single line):
#   OK: All systemd services are running correctly.
#   CRITICAL: Found <n> failed systemd service(s): <name>, <name>, ...
#   UNKNOWN: Could not query systemd for failed services. Is systemctl available?
#
# Nagios Exit Codes:
#   0 - OK: No failed services found.
#   1 - WARNING: (Not used by this script)
#   2 - CRITICAL: One or more services are in a failed state.
#   3 - UNKNOWN: The script could not determine the status of services.
#
# =============================================================================

# Deliberately no 'set -e': an unplanned abort would typically exit with
# status 1, which Nagios would read as WARNING instead of UNKNOWN. Every
# command whose failure matters is checked explicitly below.

# --- Nagios Exit Codes ---
readonly STATE_OK=0
# shellcheck disable=SC2034  # unused, kept so the full Nagios code table is defined
readonly STATE_WARNING=1
readonly STATE_CRITICAL=2
readonly STATE_UNKNOWN=3

# --- Script ---

#######################################
# Query systemd for failed services and report the result in Nagios format.
# Globals:   STATE_OK, STATE_CRITICAL, STATE_UNKNOWN (read)
# Arguments: none
# Outputs:   one status line on stdout
# Returns:   never returns; exits with the matching Nagios state code
#######################################
main() {
  local failed_services_output
  local unit
  local service_count=0
  local service_list=""

  # --no-legend drops the header/footer and --plain drops decorations, so the
  # expected output is one line per failed unit, e.g.:
  #   prometheus-nginx-exporter.service loaded failed failed Prometheus Nginx Exporter
  # Testing the command directly (instead of inspecting $? afterwards) keeps
  # the status check attached to the command it belongs to; a missing
  # systemctl binary (status 127) lands in this branch as well.
  if ! failed_services_output=$(
    systemctl list-units --type=service --state=failed --no-legend --plain
  ); then
    echo "UNKNOWN: Could not query systemd for failed services. Is systemctl available?"
    exit "$STATE_UNKNOWN"
  fi

  # Collect the first field (the unit name) of each line, counting entries and
  # building the ", "-separated list in one pass. Blank lines are skipped so
  # they can never be counted as a failed service; this also covers the single
  # empty line a here-string yields when the output is empty.
  while read -r unit _; do
    [[ -n "$unit" ]] || continue
    service_list+="${service_list:+, }${unit}"
    service_count=$((service_count + 1))
  done <<< "$failed_services_output"

  if (( service_count == 0 )); then
    echo "OK: All systemd services are running correctly."
    exit "$STATE_OK"
  fi

  echo "CRITICAL: Found $service_count failed systemd service(s): $service_list"
  exit "$STATE_CRITICAL"
}

main "$@"
|
||||
Reference in New Issue
Block a user