Files
nrpe/files/nrpe/check_nvme_smart
T
Ludovic Cartier dc4cbd13a8 add nvme check
2026-03-16 15:47:32 +01:00

68 lines
1.9 KiB
Bash

#!/bin/bash
#
# check_nvme_smart - Nagios/NRPE check for NVMe wear level and controller alerts.
#
# Runs `sudo smartctl -A <disk>` and reads two fields from its output:
#   "Percentage Used:"  - compared against the warning/critical thresholds
#   "Critical Warning:" - any value other than 0x00 is reported as CRITICAL
#
# Usage: check_nvme_smart -d <disk> [-w <warning_limit>] [-c <critical_limit>]
#   -d  device to query (e.g. /dev/nvme0); required
#   -w  warning threshold in percent (default 80)
#   -c  critical threshold in percent (default 90)
#
# Exit status: 0 OK, 1 WARNING, 2 CRITICAL, 3 UNKNOWN.
# The status line (with perfdata after the "|") is written to stdout.

# Nagios Exit Codes
readonly STATE_OK=0
readonly STATE_WARNING=1
readonly STATE_CRITICAL=2
readonly STATE_UNKNOWN=3

# Default values (in percentage)
warning=80
critical=90
# Start empty so a stray "disk" variable in the environment is never used.
disk=""

# Succeeds when $1 consists solely of decimal digits.
is_uint() {
  [[ $1 =~ ^[0-9]+$ ]]
}

# Retrieve arguments
while getopts "d:w:c:" opt; do
  case "$opt" in
    d) disk=$OPTARG ;;
    w) warning=$OPTARG ;;
    c) critical=$OPTARG ;;
    *)
      echo "Usage: $0 -d <disk> -w <warning_limit> -c <critical_limit>"
      exit "$STATE_UNKNOWN"
      ;;
  esac
done

# Check if disk is specified
if [[ -z "$disk" ]]; then
  echo "ERROR: Device (e.g., /dev/nvme0) not specified."
  exit "$STATE_UNKNOWN"
fi

# Reject non-numeric thresholds up front: a failing `-ge` test further down
# would otherwise be treated as "false" and the check would report OK.
if ! is_uint "$warning" || ! is_uint "$critical"; then
  echo "UNKNOWN - Thresholds must be non-negative integers (warning: $warning, critical: $critical)"
  exit "$STATE_UNKNOWN"
fi

# Check if smartctl is installed
if ! command -v smartctl >/dev/null 2>&1; then
  echo "ERROR: smartctl is not installed."
  exit "$STATE_UNKNOWN"
fi

# Retrieve SMART data (using sudo for permissions)
SMART_DATA=$(sudo smartctl -A "$disk" 2>/dev/null)
smart_rc=$?

# smartctl's exit status is a bitmask. Bits 0-2 (command line error, device
# open failure, failed SMART command) mean no usable data was produced; the
# higher bits describe health findings and accompany valid output, so they
# must not be mistaken for an execution failure.
if (( smart_rc & 7 )); then
  echo "UNKNOWN - Unable to run smartctl on $disk (check sudo permissions)"
  exit "$STATE_UNKNOWN"
fi

# Parse the data: third whitespace-separated field of the first matching line.
USED=$(awk '/Percentage Used:/ { gsub(/%/, "", $3); print $3; exit }' <<<"$SMART_DATA")
HW_WARNING=$(awk '/Critical Warning:/ { print $3; exit }' <<<"$SMART_DATA")

# Check if data was successfully parsed. USED must be an integer, otherwise
# the threshold comparisons below cannot be evaluated.
if ! is_uint "$USED" || [[ -z "$HW_WARNING" ]]; then
  echo "UNKNOWN - Could not parse SMART data for $disk"
  exit "$STATE_UNKNOWN"
fi

# Performance Data string for Nagios graphing
PERFDATA="wear_out=${USED}%;${warning};${critical};0;100"

# Evaluation Logic: a controller alert outranks any wear threshold.
# `[ ... -ge ... ]` is used on purpose: it reads operands as decimal, so a
# value with a leading zero (e.g. 08) is not rejected as invalid octal.
if [[ "$HW_WARNING" != "0x00" ]]; then
  echo "CRITICAL - Hardware Controller Alert on $disk (Code: $HW_WARNING) | $PERFDATA"
  exit "$STATE_CRITICAL"
elif [ "$USED" -ge "$critical" ]; then
  echo "CRITICAL - Wear level is too high: ${USED}% (Threshold: ${critical}%) | $PERFDATA"
  exit "$STATE_CRITICAL"
elif [ "$USED" -ge "$warning" ]; then
  echo "WARNING - Wear level is increasing: ${USED}% (Threshold: ${warning}%) | $PERFDATA"
  exit "$STATE_WARNING"
else
  echo "OK - $disk healthy (Wear: ${USED}%) | $PERFDATA"
  exit "$STATE_OK"
fi