You've already forked nrpe
add nvme check
This commit is contained in:
@@ -0,0 +1,68 @@
|
|||||||
|
#!/bin/bash
#
# Nagios/NRPE plugin: NVMe wear level and controller health.
#
# Runs `sudo smartctl -A <disk>` and evaluates two fields of its output:
#   - "Critical Warning:"  any value other than 0x00 is reported as CRITICAL
#   - "Percentage Used:"   compared against the warning/critical thresholds
#
# Usage: <script> -d <disk> [-w <warning_limit>] [-c <critical_limit>]
#   -d  device to query, e.g. /dev/nvme0 (required)
#   -w  wear percentage that triggers WARNING  (default: 80)
#   -c  wear percentage that triggers CRITICAL (default: 90)
#
# Exit status: 0 OK, 1 WARNING, 2 CRITICAL, 3 UNKNOWN.

# Nagios Exit Codes
readonly STATE_OK=0
readonly STATE_WARNING=1
readonly STATE_CRITICAL=2
readonly STATE_UNKNOWN=3

# Default values (in percentage)
warning=80
critical=90
# Start from an empty value so an exported $disk cannot leak in.
disk=""

# Retrieve arguments
while getopts "d:w:c:" opt; do
  case "$opt" in
    d) disk=$OPTARG ;;
    w) warning=$OPTARG ;;
    c) critical=$OPTARG ;;
    *)
      echo "Usage: $0 -d <disk> -w <warning_limit> -c <critical_limit>"
      exit "$STATE_UNKNOWN"
      ;;
  esac
done

# Check if disk is specified
if [ -z "$disk" ]; then
  echo "ERROR: Device (e.g., /dev/nvme0) not specified."
  exit "$STATE_UNKNOWN"
fi

# Thresholds must be plain integers. A non-numeric value makes `[ ... -ge ... ]`
# fail with an error status, which would otherwise fall through every branch
# of the evaluation below and end up reporting OK.
for limit in "$warning" "$critical"; do
  if ! [[ "$limit" =~ ^[0-9]+$ ]]; then
    echo "UNKNOWN - Thresholds must be non-negative integers (warning: $warning, critical: $critical)"
    exit "$STATE_UNKNOWN"
  fi
done

# Check if smartctl is installed
if ! command -v smartctl &> /dev/null; then
  echo "ERROR: smartctl is not installed."
  exit "$STATE_UNKNOWN"
fi

# Retrieve SMART data (using sudo for permissions)
SMART_DATA=$(sudo smartctl -A "$disk" 2>/dev/null)
smartctl_rc=$?

# smartctl's exit status is a bitmask. Bit 0 (bad command line) and bit 1
# (device could not be opened) mean no usable data was produced; a failing
# sudo also lands here. Higher bits flag device conditions while output is
# still printed, so they are left to the parsing and evaluation below.
if (( smartctl_rc & 3 )); then
  echo "UNKNOWN - Unable to run smartctl on $disk (check sudo permissions)"
  exit "$STATE_UNKNOWN"
fi

# Parse the data: third whitespace-separated field of each matching line,
# e.g. "Percentage Used:   2%" -> 2 and "Critical Warning:   0x00" -> 0x00.
USED=$(awk '/Percentage Used:/ {print $3; exit}' <<<"$SMART_DATA" | tr -d '%')
HW_WARNING=$(awk '/Critical Warning:/ {print $3; exit}' <<<"$SMART_DATA")

# Check if data was successfully parsed (USED must be numeric for -ge below)
if ! [[ "$USED" =~ ^[0-9]+$ ]] || [ -z "$HW_WARNING" ]; then
  echo "UNKNOWN - Could not parse SMART data for $disk"
  exit "$STATE_UNKNOWN"
fi

# Performance Data string for Nagios graphing
PERFDATA="wear_out=${USED}%;${warning};${critical};0;100"

# Evaluation Logic: a controller alert outranks any wear threshold.
if [ "$HW_WARNING" != "0x00" ]; then
  echo "CRITICAL - Hardware Controller Alert on $disk (Code: $HW_WARNING) | $PERFDATA"
  exit "$STATE_CRITICAL"
elif [ "$USED" -ge "$critical" ]; then
  echo "CRITICAL - Wear level is too high: ${USED}% (Threshold: ${critical}%) | $PERFDATA"
  exit "$STATE_CRITICAL"
elif [ "$USED" -ge "$warning" ]; then
  echo "WARNING - Wear level is increasing: ${USED}% (Threshold: ${warning}%) | $PERFDATA"
  exit "$STATE_WARNING"
else
  echo "OK - $disk healthy (Wear: ${USED}%) | $PERFDATA"
  exit "$STATE_OK"
fi
|
||||||
@@ -0,0 +1,54 @@
|
|||||||
|
#!/bin/bash
#
# Nagios/NRPE plugin: NVMe temperature.
#
# Reads the "Temperature:" line from `smartctl -A <disk>` and compares the
# value against the warning/critical thresholds.
#
# Usage: <script> -d <disk> [-w <warning>] [-c <critical>]
#   -d  device to query, e.g. /dev/nvme0 (required)
#   -w  temperature that triggers WARNING  (default: 70)
#   -c  temperature that triggers CRITICAL (default: 80)
#
# Exit status: 0 OK, 1 WARNING, 2 CRITICAL, 3 UNKNOWN.

# Nagios Exit Codes
readonly STATE_OK=0
readonly STATE_WARNING=1
readonly STATE_CRITICAL=2
readonly STATE_UNKNOWN=3

# Default values
warning=70
critical=80
# Start from an empty value so an exported $disk cannot leak in.
disk=""

# Retrieve arguments
while getopts "d:w:c:" opt; do
  case "$opt" in
    d) disk=$OPTARG ;;
    w) warning=$OPTARG ;;
    c) critical=$OPTARG ;;
    *)
      echo "Usage: $0 -d <disk> -w <warning> -c <critical>"
      exit "$STATE_UNKNOWN"
      ;;
  esac
done

# Check if disk is specified
if [ -z "$disk" ]; then
  echo "ERROR: Device (e.g., /dev/nvme0) not specified."
  exit "$STATE_UNKNOWN"
fi

# Thresholds must be plain integers. A non-numeric value makes `[ ... -ge ... ]`
# fail with an error status, which would otherwise fall through every branch
# of the output logic below and end up reporting OK.
for limit in "$warning" "$critical"; do
  if ! [[ "$limit" =~ ^[0-9]+$ ]]; then
    echo "UNKNOWN - Thresholds must be non-negative integers (warning: $warning, critical: $critical)"
    exit "$STATE_UNKNOWN"
  fi
done

# Check if smartctl is installed
if ! command -v smartctl &> /dev/null; then
  echo "ERROR: smartctl is not installed."
  exit "$STATE_UNKNOWN"
fi

# The error message below points at sudo permissions, so go through sudo
# unless the plugin is already running as root.
if [ "$EUID" -ne 0 ]; then
  smartctl_cmd=(sudo smartctl)
else
  smartctl_cmd=(smartctl)
fi

# Extract temperature (looking for 'Temperature:' in smartctl output).
# The match is case-insensitive; the second field of the first matching line
# is taken, e.g. "Temperature:   35 Celsius" -> 35.
TEMP=$("${smartctl_cmd[@]}" -A "$disk" \
  | awk 'tolower($0) ~ /temperature:/ {print $2; exit}')

# Check if a numeric value was retrieved
if ! [[ "$TEMP" =~ ^[0-9]+$ ]]; then
  echo "ERROR: Could not read temperature on $disk (check sudo permissions)."
  exit "$STATE_UNKNOWN"
fi

# Nagios output logic
if [ "$TEMP" -ge "$critical" ]; then
  echo "CRITICAL - NVMe Temperature: ${TEMP}°C (Threshold: ${critical}) | temp=${TEMP};${warning};${critical};0;100"
  exit "$STATE_CRITICAL"
elif [ "$TEMP" -ge "$warning" ]; then
  echo "WARNING - NVMe Temperature: ${TEMP}°C (Threshold: ${warning}) | temp=${TEMP};${warning};${critical};0;100"
  exit "$STATE_WARNING"
else
  echo "OK - NVMe Temperature: ${TEMP}°C | temp=${TEMP};${warning};${critical};0;100"
  exit "$STATE_OK"
fi
|
||||||
Reference in New Issue
Block a user