add check systemd failed
This commit is contained in:
63
files/nrpe/check_systemd_failed
Executable file
63
files/nrpe/check_systemd_failed
Executable file
@@ -0,0 +1,63 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
#
|
||||||
|
# =============================================================================
|
||||||
|
#
|
||||||
|
# Nagios plugin to check for failed systemd services.
|
||||||
|
#
|
||||||
|
# Author: GitHub Copilot
|
||||||
|
# Version: 1.1
|
||||||
|
#
|
||||||
|
# =============================================================================
|
||||||
|
#
|
||||||
|
# This script checks for any systemd services that have entered a "failed"
|
||||||
|
# state. It is designed to be used as a Nagios check command.
|
||||||
|
#
|
||||||
|
# It requests plain, legend-free output from 'systemctl' so that the service
# name can be read from each line without a leading status glyph such as '●'.
|
||||||
|
#
|
||||||
|
# Nagios Exit Codes:
|
||||||
|
# 0 - OK: No failed services found.
|
||||||
|
# 1 - WARNING: (Not used by this script)
|
||||||
|
# 2 - CRITICAL: One or more services are in a failed state.
|
||||||
|
# 3 - UNKNOWN: The script could not determine the status of services.
|
||||||
|
#
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
# --- Nagios Exit Codes ---
# Plugin return codes understood by Nagios/NRPE. They are declared readonly so
# that nothing later in the script can accidentally redefine what a status
# means. STATE_WARNING is kept for completeness even though this check never
# reports a warning.
readonly STATE_OK=0
readonly STATE_WARNING=1
readonly STATE_CRITICAL=2
readonly STATE_UNKNOWN=3
|
||||||
|
|
||||||
|
# --- Script ---

# Query systemd for service units currently in the "failed" state.
#   --no-legend  suppresses the column header and the summary footer
#   --plain      requests flat output without tree/status decoration
# Each line is then expected to look like:
#   prometheus-nginx-exporter.service loaded failed failed Prometheus Nginx Exporter
#
# The assignment is tested directly so that the exit status being checked is
# always systemctl's own (a missing binary and a failing query both end up
# here), rather than relying on "$?" surviving until a later statement.
if ! failed_services_output=$(systemctl list-units --type=service --state=failed --no-legend --plain); then
  echo "UNKNOWN: Could not query systemd for failed services. Is systemctl available?"
  exit "$STATE_UNKNOWN"
fi

# Collect the unit name (first whitespace-separated field) of every line.
# 'read' trims leading whitespace and splits the fields for us; the remaining
# columns are discarded into '_'.
failed_service_names=()
while read -r unit_name next_field _; do
  # Defensive: if a status marker ('●', or '*' which is presumably its ASCII
  # fallback) is still present despite --plain, the unit name is the second
  # field instead of the first.
  if [[ "$unit_name" == "●" || "$unit_name" == "*" ]]; then
    unit_name=$next_field
  fi

  # Skip blank lines so they are never counted as a failed service.
  [[ -n "$unit_name" ]] || continue

  failed_service_names+=("$unit_name")
done <<< "$failed_services_output"

service_count=${#failed_service_names[@]}

# No unit names collected means nothing is in the failed state.
if (( service_count == 0 )); then
  echo "OK: All systemd services are running correctly."
  exit "$STATE_OK"
fi

# Join the names as "a, b, c": printf repeats the format for every element,
# which leaves one trailing ", " that is stripped afterwards.
printf -v service_list_formatted '%s, ' "${failed_service_names[@]}"
service_list_formatted=${service_list_formatted%, }

# Output the critical message for Nagios.
echo "CRITICAL: Found $service_count failed systemd service(s): $service_list_formatted"
exit "$STATE_CRITICAL"
|
||||||
Reference in New Issue
Block a user