From dc4cbd13a838d724578b57653ceccf1465f6a0cb Mon Sep 17 00:00:00 2001
From: Ludovic Cartier
Date: Mon, 16 Mar 2026 15:47:32 +0100
Subject: [PATCH] add nvme check

---
 files/nrpe/check_nvme_smart       | 97 +++++++++++++++++++++++++++++++
 files/nrpe/check_nvme_temperature | 81 ++++++++++++++++++++++++++
 2 files changed, 178 insertions(+)
 create mode 100644 files/nrpe/check_nvme_smart
 create mode 100644 files/nrpe/check_nvme_temperature

diff --git a/files/nrpe/check_nvme_smart b/files/nrpe/check_nvme_smart
new file mode 100644
index 0000000..f5a78e8
--- /dev/null
+++ b/files/nrpe/check_nvme_smart
@@ -0,0 +1,97 @@
+#!/bin/bash
+#
+# Nagios/NRPE check: NVMe wear level ("Percentage Used") and controller
+# "Critical Warning" flag, both read from the output of `smartctl -A`.
+#
+# Usage: check_nvme_smart -d <device> [-w <warning>] [-c <critical>]
+#   -d  NVMe device to query (e.g. /dev/nvme0); required
+#   -w  wear percentage that raises WARNING (default 80)
+#   -c  wear percentage that raises CRITICAL (default 90)
+#
+# Exit status: 0 OK, 1 WARNING, 2 CRITICAL, 3 UNKNOWN.
+
+# Nagios Exit Codes
+STATE_OK=0
+STATE_WARNING=1
+STATE_CRITICAL=2
+STATE_UNKNOWN=3
+
+# Default values (in percentage)
+warning=80
+critical=90
+
+# Start empty so a "disk" variable inherited from the environment is ignored
+disk=""
+
+# Retrieve arguments
+while getopts "d:w:c:" opt; do
+  case $opt in
+    d) disk=$OPTARG ;;
+    w) warning=$OPTARG ;;
+    c) critical=$OPTARG ;;
+    *)
+      echo "Usage: $0 -d <device> -w <warning> -c <critical>"
+      exit $STATE_UNKNOWN
+      ;;
+  esac
+done
+
+# Check if disk is specified
+if [ -z "$disk" ]; then
+  echo "ERROR: Device (e.g., /dev/nvme0) not specified."
+  exit $STATE_UNKNOWN
+fi
+
+# Thresholds must be plain integers: a non-numeric value would make the
+# numeric comparisons below fail and silently fall through to the OK branch.
+if ! [[ "$warning" =~ ^[0-9]+$ && "$critical" =~ ^[0-9]+$ ]]; then
+  echo "UNKNOWN - Thresholds must be integers (warning: $warning, critical: $critical)"
+  exit $STATE_UNKNOWN
+fi
+
+# Check if smartctl is installed
+if ! command -v smartctl &> /dev/null; then
+  echo "ERROR: smartctl is not installed."
+  exit $STATE_UNKNOWN
+fi
+
+# Retrieve SMART data (using sudo for permissions)
+SMART_DATA=$(sudo smartctl -A "$disk" 2>/dev/null)
+SMART_RC=$?
+
+# smartctl's exit status is a bit mask. Only bit 0 (command line did not
+# parse) and bit 1 (device open failed) are treated as "could not run";
+# any other failure leaves fields missing and is caught by the parse check.
+if (( SMART_RC & 3 )); then
+  echo "UNKNOWN - Unable to run smartctl on $disk (check sudo permissions)"
+  exit $STATE_UNKNOWN
+fi
+
+# Parse the data (first matching line only)
+USED=$(awk '/Percentage Used:/ { print $3; exit }' <<< "$SMART_DATA" | tr -d '%')
+HW_WARNING=$(awk '/Critical Warning:/ { print $3; exit }' <<< "$SMART_DATA")
+
+# Check if data was successfully parsed; USED must be an integer for the
+# threshold comparisons below to be meaningful
+if ! [[ "$USED" =~ ^[0-9]+$ ]] || [ -z "$HW_WARNING" ]; then
+  echo "UNKNOWN - Could not parse SMART data for $disk"
+  exit $STATE_UNKNOWN
+fi
+
+# Performance Data string for Nagios graphing
+PERFDATA="wear_out=${USED}%;${warning};${critical};0;100"
+
+# Evaluation Logic
+if [ "$HW_WARNING" != "0x00" ]; then
+  echo "CRITICAL - Hardware Controller Alert on $disk (Code: $HW_WARNING) | $PERFDATA"
+  exit $STATE_CRITICAL
+elif [ "$USED" -ge "$critical" ]; then
+  echo "CRITICAL - Wear level is too high: ${USED}% (Threshold: ${critical}%) | $PERFDATA"
+  exit $STATE_CRITICAL
+elif [ "$USED" -ge "$warning" ]; then
+  echo "WARNING - Wear level is increasing: ${USED}% (Threshold: ${warning}%) | $PERFDATA"
+  exit $STATE_WARNING
+else
+  echo "OK - $disk healthy (Wear: ${USED}%) | $PERFDATA"
+  exit $STATE_OK
+fi
diff --git a/files/nrpe/check_nvme_temperature b/files/nrpe/check_nvme_temperature
new file mode 100644
index 0000000..a5207b1
--- /dev/null
+++ b/files/nrpe/check_nvme_temperature
@@ -0,0 +1,81 @@
+#!/bin/bash
+#
+# Nagios/NRPE check: NVMe temperature, read from the "Temperature:" line
+# of the output of `smartctl -A`.
+#
+# Usage: check_nvme_temperature -d <device> [-w <warning>] [-c <critical>]
+#   -d  NVMe device to query (e.g. /dev/nvme0); required
+#   -w  temperature that raises WARNING (default 70)
+#   -c  temperature that raises CRITICAL (default 80)
+#
+# Exit status: 0 OK, 1 WARNING, 2 CRITICAL, 3 UNKNOWN.
+
+# Nagios Exit Codes
+STATE_OK=0
+STATE_WARNING=1
+STATE_CRITICAL=2
+STATE_UNKNOWN=3
+
+# Default values
+warning=70
+critical=80
+
+# Start empty so a "disk" variable inherited from the environment is ignored
+disk=""
+
+# Retrieve arguments
+while getopts "d:w:c:" opt; do
+  case $opt in
+    d) disk=$OPTARG ;;
+    w) warning=$OPTARG ;;
+    c) critical=$OPTARG ;;
+    *)
+      echo "Usage: $0 -d <device> -w <warning> -c <critical>"
+      exit $STATE_UNKNOWN
+      ;;
+  esac
+done
+
+# Check if disk is specified
+if [ -z "$disk" ]; then
+  echo "ERROR: Device (e.g., /dev/nvme0) not specified."
+  exit $STATE_UNKNOWN
+fi
+
+# Thresholds must be plain integers: a non-numeric value would make the
+# numeric comparisons below fail and silently fall through to the OK branch.
+if ! [[ "$warning" =~ ^[0-9]+$ && "$critical" =~ ^[0-9]+$ ]]; then
+  echo "UNKNOWN - Thresholds must be integers (warning: $warning, critical: $critical)"
+  exit $STATE_UNKNOWN
+fi
+
+# Check if smartctl is installed
+if ! command -v smartctl &> /dev/null; then
+  echo "ERROR: smartctl is not installed."
+  exit $STATE_UNKNOWN
+fi
+
+# Extract temperature (looking for 'Temperature:' in smartctl output).
+# Run through sudo like check_nvme_smart does: the failure message below
+# already points at sudo permissions. Only the first matching line is used
+# and smartctl's stderr is discarded, as in check_nvme_smart.
+TEMP=$(sudo smartctl -A "$disk" 2>/dev/null \
+  | awk 'tolower($1) == "temperature:" { print $2; exit }')
+
+# Check if a numeric value was retrieved
+if ! [[ "$TEMP" =~ ^[0-9]+$ ]]; then
+  echo "ERROR: Could not read temperature on $disk (check sudo permissions)."
+  exit $STATE_UNKNOWN
+fi
+
+# Nagios output logic
+if [ "$TEMP" -ge "$critical" ]; then
+  echo "CRITICAL - NVMe Temperature: ${TEMP}°C (Threshold: ${critical}) | temp=${TEMP};${warning};${critical};0;100"
+  exit $STATE_CRITICAL
+elif [ "$TEMP" -ge "$warning" ]; then
+  echo "WARNING - NVMe Temperature: ${TEMP}°C (Threshold: ${warning}) | temp=${TEMP};${warning};${critical};0;100"
+  exit $STATE_WARNING
+else
+  echo "OK - NVMe Temperature: ${TEMP}°C | temp=${TEMP};${warning};${critical};0;100"
+  exit $STATE_OK
+fi