feat - add ceph check

This commit is contained in:
Ludovic Cartier
2026-04-24 21:29:27 +02:00
parent 9b2412c775
commit 2dc995209c
4 changed files with 122 additions and 0 deletions
+104
View File
@@ -0,0 +1,104 @@
#!/bin/bash
#
# Nagios-style check for a Ceph cluster.
#
# Usage: <script> [health|osd|mon|cap|pg]
#
#   health  maps the output of `ceph health` to OK / WARNING / CRITICAL
#   osd     compares the number of up / in OSDs with the total number of OSDs
#   mon     checks how many monitors are in quorum
#   cap     checks the raw usage ratio reported by `ceph df`
#   pg      checks that `ceph pg stat` reports active+clean placement groups
#
# Every ceph command is invoked through sudo. The script prints one status
# line (with perfdata after "|" where available) on stdout and exits with the
# matching Nagios exit code.

# Nagios Exit Codes
readonly OK=0
readonly WARNING=1
readonly CRITICAL=2
readonly UNKNOWN=3

# Check thresholds.
readonly EXPECTED_MONS=3   # monitors that must be in quorum
readonly CAP_WARN_PCT=75   # capacity usage (percent) that raises WARNING
readonly CAP_CRIT_PCT=85   # capacity usage (percent) that raises CRITICAL

#######################################
# Prints the status line and terminates the script.
# Arguments: $1 - exit code; remaining arguments - text of the status line
# Outputs:   the status line on stdout
#######################################
finish() {
  local status=$1
  shift
  printf '%s\n' "$*"
  exit "$status"
}

#######################################
# Tells whether every argument is a non-negative decimal integer.
# Arguments: one or more strings to validate
# Returns:   0 when all arguments are made of digits only, 1 otherwise
#            (also 1 when called without arguments)
#######################################
is_uint() {
  local value
  (( $# > 0 )) || return 1
  for value in "$@"; do
    [[ $value =~ ^[0-9]+$ ]] || return 1
  done
  return 0
}

if [ -z "${1:-}" ]; then
  finish "$UNKNOWN" "Usage: $0 [health|osd|mon|cap|pg]"
fi

COMMAND=$1

case "$COMMAND" in
  health)
    HEALTH_DATA=$(sudo ceph health)
    if [[ $HEALTH_DATA == *"HEALTH_OK"* ]]; then
      finish "$OK" "OK: Ceph is healthy"
    elif [[ $HEALTH_DATA == *"HEALTH_WARN"* ]]; then
      finish "$WARNING" "WARNING: $HEALTH_DATA"
    else
      # Anything that is neither HEALTH_OK nor HEALTH_WARN lands here,
      # including empty output; keep the status line informative in that case.
      finish "$CRITICAL" "CRITICAL: ${HEALTH_DATA:-no output from ceph health}"
    fi
    ;;

  osd)
    OSD_JSON=$(sudo ceph osd stat --format json)
    # A single interpreter run extracts the three counters. A missing key or
    # invalid JSON produces no output, which is reported as UNKNOWN below
    # instead of being silently counted as zero.
    # NOTE(review): some Ceph releases appear to nest these counters under an
    # 'osdmap' key; both layouts are accepted - confirm against the cluster.
    OSD_COUNTS=$(python3 -c "
import sys, json
try:
    data = json.load(sys.stdin)
    data = data.get('osdmap', data)
    print(data['num_osds'], data['num_up_osds'], data['num_in_osds'])
except Exception:
    sys.exit(1)
" <<<"$OSD_JSON")
    TOTAL='' UP='' IN=''
    read -r TOTAL UP IN <<<"$OSD_COUNTS"

    # Without this guard the integer tests below fail on empty values and
    # control falls through to the OK branch.
    if ! is_uint "$TOTAL" "$UP" "$IN"; then
      finish "$UNKNOWN" "UNKNOWN: Unable to parse Ceph OSD status"
    fi

    OSD_PERFDATA="osds=$TOTAL up=$UP in=$IN"
    if [ "$UP" -lt "$TOTAL" ]; then
      finish "$CRITICAL" "CRITICAL: OSD down! ($UP/$TOTAL up) | $OSD_PERFDATA"
    elif [ "$IN" -lt "$TOTAL" ]; then
      finish "$WARNING" "WARNING: OSD out! ($IN/$TOTAL in) | $OSD_PERFDATA"
    else
      finish "$OK" "OK: All OSDs are up and in ($TOTAL/$TOTAL) | $OSD_PERFDATA"
    fi
    ;;

  mon)
    MON_JSON=$(sudo ceph mon stat --format json)
    # Number of entries in the 'quorum' list; no output when the JSON cannot
    # be read, so that a failed command is not mistaken for a healthy quorum.
    QUORUM_COUNT=$(python3 -c "
import sys, json
try:
    print(len(json.load(sys.stdin).get('quorum', [])))
except Exception:
    sys.exit(1)
" <<<"$MON_JSON")

    if ! is_uint "$QUORUM_COUNT"; then
      finish "$UNKNOWN" "UNKNOWN: Unable to parse Ceph monitor status"
    fi

    if [ "$QUORUM_COUNT" -lt "$EXPECTED_MONS" ]; then
      finish "$CRITICAL" "CRITICAL: Monitor Quorum degraded! ($QUORUM_COUNT/$EXPECTED_MONS) | quorum=$QUORUM_COUNT"
    else
      finish "$OK" "OK: Monitor Quorum is $EXPECTED_MONS/$EXPECTED_MONS | quorum=$QUORUM_COUNT"
    fi
    ;;

  cap)
    CAP_JSON=$(sudo ceph df --format json)
    # The ratio is multiplied by 100 and truncated to a whole percentage;
    # -1 signals that the JSON could not be parsed.
    USAGE=$(python3 -c "
import sys, json
try:
    stats = json.load(sys.stdin)['stats']
    ratio = stats.get('total_used_raw_ratio', stats.get('total_used_ratio', 0))
    print(int(float(ratio) * 100))
except Exception:
    print(-1)
" <<<"$CAP_JSON")

    # Rejects the -1 marker as well as empty output (e.g. python3 failed to
    # start), which would otherwise fall through to the OK branch.
    if ! is_uint "$USAGE"; then
      finish "$UNKNOWN" "UNKNOWN: Unable to parse Ceph capacity"
    fi

    if [ "$USAGE" -ge "$CAP_CRIT_PCT" ]; then
      finish "$CRITICAL" "CRITICAL: Ceph capacity at $USAGE% | usage=$USAGE%"
    elif [ "$USAGE" -ge "$CAP_WARN_PCT" ]; then
      finish "$WARNING" "WARNING: Ceph capacity at $USAGE% | usage=$USAGE%"
    else
      finish "$OK" "OK: Ceph capacity at $USAGE% | usage=$USAGE%"
    fi
    ;;

  pg)
    PG_STAT=$(sudo ceph pg stat)
    # NOTE(review): these are substring matches on the whole summary line, so
    # a mix of active+clean PGs and PGs in another state that is neither
    # "degraded" nor "undersized" still reports OK - confirm this is intended.
    if [[ $PG_STAT == *"active+clean"* && $PG_STAT != *"degraded"* && $PG_STAT != *"undersized"* ]]; then
      finish "$OK" "OK: PGs are active+clean"
    else
      finish "$WARNING" "WARNING: PGs status: ${PG_STAT:-no output from ceph pg stat}"
    fi
    ;;

  *)
    finish "$UNKNOWN" "Unknown argument: $COMMAND"
    ;;
esac