#!/bin/bash
#
# =============================================================================
#
# Nagios plugin to check for failed systemd services.
#
# Author: GitHub Copilot
# Version: 1.1
#
# =============================================================================
#
# This script checks for any systemd services that have entered a "failed"
# state. It is designed to be used as a Nagios check command.
#
# It runs 'systemctl' with --plain so that each output line starts with the
# unit name rather than a status marker such as '●', and reads the service
# name from the leading column of every line.
#
# Nagios Exit Codes:
# 0 - OK: No failed services found.
# 1 - WARNING: (Not used by this script)
# 2 - CRITICAL: One or more services are in a failed state.
# 3 - UNKNOWN: The script could not determine the status of services.
#
# =============================================================================

# --- Nagios Exit Codes ---
# Return codes this plugin hands back to Nagios. They are declared read-only
# so that no later statement can reassign them and change the reported status.
readonly STATE_OK=0        # no failed services were found
readonly STATE_WARNING=1   # kept for completeness; this script never exits with it
readonly STATE_CRITICAL=2  # at least one service is in a failed state
readonly STATE_UNKNOWN=3   # systemd could not be queried

# --- Script ---

# Ask systemd for every service unit that is currently in a failed state.
#   --no-legend  omits the column header and the summary footer
#   --plain      requests flat output without the leading status marker
# Each line of output is expected to look like:
#   prometheus-nginx-exporter.service loaded failed failed Prometheus Nginx Exporter
#
# The command is tested directly in the 'if' rather than through a later
# '$?' check, so nothing inserted between the two can overwrite the status.
if ! failed_services_output=$(systemctl list-units --type=service --state=failed --no-legend --plain); then
    echo "UNKNOWN: Could not query systemd for failed services. Is systemctl available?"
    exit "$STATE_UNKNOWN"
fi

# Collect one unit name per non-blank output line.
failed_service_names=()
while read -r first_field second_field _; do
    # Skip blank lines so they are never counted as a failed service.
    [[ -n "$first_field" ]] || continue

    # Normally the unit name is the first column. If a status marker such as
    # '●' is printed in front of it anyway, the name is the second column.
    # Units selected with --type=service end in '.service', which is used
    # here to tell the two layouts apart.
    if [[ "$first_field" != *.service && "$second_field" == *.service ]]; then
        failed_service_names+=("$second_field")
    else
        failed_service_names+=("$first_field")
    fi
done <<< "$failed_services_output"

service_count=${#failed_service_names[@]}

# No unit names collected means nothing is in a failed state.
if (( service_count == 0 )); then
    echo "OK: All systemd services are running correctly."
    exit "$STATE_OK"
fi

# Join the names with ", ": append the separator after every element, then
# strip the one left dangling after the last name.
printf -v service_list_formatted '%s, ' "${failed_service_names[@]}"
service_list_formatted=${service_list_formatted%, }

# Report the failure to Nagios on a single line.
printf '%s\n' "CRITICAL: Found $service_count failed systemd service(s): $service_list_formatted"
exit "$STATE_CRITICAL"
