add check disk usage (bash edition)
This commit is contained in:
148
files/nrpe/check_disk_usage
Executable file
148
files/nrpe/check_disk_usage
Executable file
@@ -0,0 +1,148 @@
#!/bin/bash
#
# =============================================================================
#
# Nagios plugin to check disk space usage.
#
# Author: GitHub Copilot
# Version: 1.0
#
# =============================================================================
#
# This script checks the disk space utilization of all mounted filesystems.
# It is designed to be used as a Nagios/Icinga check command.
#
# The script compares the percentage of used space against configurable
# warning and critical thresholds. If any filesystem reaches or exceeds one of
# these thresholds, the script exits with the appropriate status and displays
# a human-readable table of the problematic filesystems.
#
# Thresholds are managed via the -w (warning) and -c (critical) flags.
#
# Nagios Exit Codes:
#   0 - OK:       All filesystems are within thresholds.
#   1 - WARNING:  One or more filesystems have reached the warning threshold.
#   2 - CRITICAL: One or more filesystems have reached the critical threshold.
#   3 - UNKNOWN:  The script encountered an error (e.g., bad arguments or
#                 the 'df' command failed).
#
# =============================================================================

# Nagios exit codes. These are the plugin's contract with the monitoring
# system and are never reassigned, so they are marked readonly.
readonly OK=0
readonly WARNING=1
readonly CRITICAL=2
readonly UNKNOWN=3

# Default thresholds (percentage of used space).
# These stay writable: the -w / -c command-line options overwrite them.
WARN_THRESHOLD=80
CRIT_THRESHOLD=90

# --- Functions ---

#######################################
# Print the command-line help text and terminate the plugin.
# Globals:
#   WARN_THRESHOLD, CRIT_THRESHOLD (read) - printed as the "Default" values.
#     Note: these are the *current* values, so an earlier -w/-c on the same
#     command line is reflected in the help text.
#   UNKNOWN (read) - exit status; falls back to 3 if it is not defined.
# Arguments:
#   None
# Outputs:
#   Help text on stdout.
# Returns:
#   Never returns; exits with the UNKNOWN status (3).
#######################################
usage() {
  printf '%s\n' \
    "Usage: $0 [-w <warning_threshold>] [-c <critical_threshold>]" \
    " -w: Warning threshold percentage (e.g., 80). Default: ${WARN_THRESHOLD}%" \
    " -c: Critical threshold percentage (e.g., 90). Default: ${CRIT_THRESHOLD}%" \
    " -h: Display this help message"
  # A bare `exit` would reuse printf's status (0 = OK), so default explicitly.
  exit "${UNKNOWN:-3}"
}

# --- Argument Parsing ---

# -w / -c override the default thresholds. -h and any option getopts does not
# recognise both print the help text; usage() terminates the script.
while getopts "w:c:h" opt; do
  case "${opt}" in
    w)
      WARN_THRESHOLD=${OPTARG}
      ;;
    c)
      CRIT_THRESHOLD=${OPTARG}
      ;;
    h)
      usage
      ;;
    *)
      usage
      ;;
  esac
done

# Validate that thresholds are numbers.
# Only plain decimal digits are accepted, and at most 9 of them: a longer
# digit string overflows the integer comparisons done later with `[ -ge ]`,
# which then fail with "integer expression expected" for every filesystem and
# would let the check report OK no matter how full the disks are.
threshold_re='^[0-9]{1,9}$'
if ! [[ "$WARN_THRESHOLD" =~ $threshold_re && "$CRIT_THRESHOLD" =~ $threshold_re ]]; then
  echo "UNKNOWN: Warning and critical thresholds must be integers."
  exit $UNKNOWN
fi

# --- Main Logic ---

#######################################
# Classify every filesystem line of `df -P` style output read from stdin.
# Globals:
#   WARN_THRESHOLD, CRIT_THRESHOLD (read)  - percentage limits.
#   WARNING, CRITICAL (read)               - Nagios status codes.
#   final_status, summary_message (written) - raised to WARNING/CRITICAL;
#     a CRITICAL result is never downgraded by a later WARNING line.
#   critical_filesystems, warning_filesystems (appended) - the raw df lines
#     of the offending filesystems.
# Arguments:
#   None (input comes from stdin; the first line is treated as the header).
# Returns:
#   0
#######################################
classify_filesystems() {
  local line pct

  # The first line is df's column header, not a filesystem.
  read -r line

  while read -r line; do
    # Column 5 of POSIX-format df output is the capacity, e.g. "42%".
    # Splitting with the read builtin avoids forking echo/awk/sed per line.
    read -r _ _ _ _ pct _ <<<"$line"
    pct=${pct/\%/}

    # Skip non-numeric usage percentages (e.g., "-" for pseudo-filesystems)
    if ! [[ "$pct" =~ ^[0-9]+$ ]]; then
      continue
    fi

    # Thresholds are inclusive: usage equal to a threshold already trips it.
    if [ "$pct" -ge "$CRIT_THRESHOLD" ]; then
      critical_filesystems+=("$line")
      final_status=$CRITICAL
      summary_message="CRITICAL: Filesystem usage has exceeded critical threshold."
    elif [ "$pct" -ge "$WARN_THRESHOLD" ]; then
      warning_filesystems+=("$line")
      if [ "$final_status" -ne "$CRITICAL" ]; then
        final_status=$WARNING
        summary_message="WARNING: Filesystem usage has exceeded warning threshold."
      fi
    fi
  done
}

# Final exit status, assuming OK until a problem is found
final_status=$OK
# Summary message for the first line of output
summary_message="OK: All filesystems are within thresholds."

# Arrays to store problematic filesystems (one raw df line per element)
declare -a critical_filesystems=()
declare -a warning_filesystems=()

# Get the disk usage data. Exit if the command fails.
# -P forces the POSIX layout (one line per filesystem), -h gives readable sizes.
if ! df_output=$(df -hP); then
  echo "UNKNOWN: 'df -hP' command failed to execute."
  exit $UNKNOWN
fi

# A here-string keeps the function in the current shell (no pipeline
# subshell), so the status variables and arrays it updates remain visible.
classify_filesystems <<<"$df_output"

# --- Output Generation ---

#######################################
# Print one raw `df -P` line as an aligned row of the detail table.
# Arguments:
#   $1 - a single df output line (filesystem, size, used, avail, use%, mount).
# Outputs:
#   Exactly one formatted row on stdout.
#######################################
print_filesystem_row() {
  local fs size used avail pct mount
  # The last variable receives the remainder of the line, so a mount point
  # containing spaces stays in one column. Splitting with `read` (instead of
  # an unquoted command substitution) also prevents glob expansion and stops
  # printf from re-applying its format to surplus words as a bogus extra row.
  read -r fs size used avail pct mount <<<"$1"
  printf "%-30s %-10s %-10s %-10s %-10s %-30s\n" \
    "$fs" "$size" "$used" "$avail" "$pct" "$mount"
}

# Print the one-line summary for Nagios
echo "$summary_message"

# If there are any issues, print the detailed table
if [ "$final_status" -ne "$OK" ]; then
  echo "" # Add a newline for better formatting
  printf "%-30s %-10s %-10s %-10s %-10s %-30s\n" "Filesystem" "Size" "Used" "Avail" "Use%" "Mounted on"
  printf "%s\n" "---------------------------------------------------------------------------------------------------------------"

  # Critical filesystems are listed first, then warning ones.
  # Iterating an empty array simply prints nothing.
  for item in "${critical_filesystems[@]}"; do
    print_filesystem_row "$item"
  done

  for item in "${warning_filesystems[@]}"; do
    print_filesystem_row "$item"
  done
fi

# Report the aggregated result to Nagios through the process exit status.
exit $final_status
|
||||||
Reference in New Issue
Block a user