feat - add ceph check

This commit is contained in:
Ludovic Cartier
2026-04-24 21:29:27 +02:00
parent 9b2412c775
commit 2dc995209c
4 changed files with 122 additions and 0 deletions
+1
View File
@@ -117,6 +117,7 @@ The following checks are deployed to `/usr/lib/nagios/plugins/` (or configured p
| `nrpe_redis_fragments_critical` | `2.0` | `check_redis_health` | Critical threshold for fragmentation ratio. |
| `nrpe_redis_replication_lag_warning` | `10` | `check_redis_health` | Warning threshold for replication lag (seconds). |
| `nrpe_redis_replication_lag_critical` | `60` | `check_redis_health` | Critical threshold for replication lag (seconds). |
| `nrpe_ceph` | `false` | `check_ceph` | Enables the Ceph cluster checks (health, OSD, monitor quorum, capacity, placement groups). |
## Example Playbooks
+104
View File
@@ -0,0 +1,104 @@
#!/bin/bash
#
# check_ceph - Nagios/NRPE plugin that reports the state of a Ceph cluster.
#
# Usage: check_ceph [health|osd|mon|cap|pg]
#   health  overall cluster status taken from `ceph health`
#   osd     number of OSDs that are up / in, compared with the total
#   mon     number of monitors in quorum (3 are expected)
#   cap     used capacity in percent (WARNING >= 75, CRITICAL >= 85)
#   pg      placement-group states taken from `ceph pg stat`
#
# Every ceph command is run through sudo. The argument lists are left exactly
# as they are on purpose: sudoers rules may match on the complete command
# line, so adding or removing an option here can make sudo refuse the call.
#
# `set -e` / `set -u` are intentionally not enabled: an abort would exit with
# status 1, which Nagios reads as WARNING. Failures are detected explicitly
# and reported as UNKNOWN instead.

# Nagios Exit Codes
readonly OK=0
readonly WARNING=1
readonly CRITICAL=2
readonly UNKNOWN=3

# Number of monitors that must be in quorum for the mon check to pass.
readonly EXPECTED_MONS=3

# Capacity thresholds, in percent of used space.
readonly CAP_WARNING=75
readonly CAP_CRITICAL=85

# Print "UNKNOWN: <message>" and exit with the UNKNOWN status.
unknown() {
  echo "UNKNOWN: $*"
  exit "$UNKNOWN"
}

# Succeed when $1 is a non-negative decimal integer.
# Guards the numeric tests below: `[ "" -lt 3 ]` is an error (status 2), not
# "false", and would otherwise fall through to the OK branch.
is_uint() {
  [[ ${1:-} =~ ^[0-9]+$ ]]
}

# health: map the `ceph health` output to a Nagios state.
check_health() {
  local health_data
  health_data=$(sudo ceph health)

  # No output at all means sudo or ceph failed; that is not a cluster state.
  [[ -n $health_data ]] || unknown "Unable to query Ceph health"

  if [[ $health_data == *"HEALTH_OK"* ]]; then
    echo "OK: Ceph is healthy"
    exit "$OK"
  elif [[ $health_data == *"HEALTH_WARN"* ]]; then
    echo "WARNING: $health_data"
    exit "$WARNING"
  else
    echo "CRITICAL: $health_data"
    exit "$CRITICAL"
  fi
}

# osd: CRITICAL when an OSD is down, WARNING when an OSD is out.
check_osd() {
  local osd_json total num_up num_in
  osd_json=$(sudo ceph osd stat --format json)

  # One parser run yields all three counters. A missing key or invalid JSON
  # makes python exit without output (traceback discarded on purpose), which
  # leaves the variables empty and is reported as UNKNOWN below.
  read -r total num_up num_in < <(printf '%s' "$osd_json" | python3 -c '
import sys, json
data = json.load(sys.stdin)
# The counters may be wrapped in an "osdmap" object (believed to be the case
# on some Ceph releases); unwrap it when present.
while isinstance(data.get("osdmap"), dict):
    data = data["osdmap"]
print(data["num_osds"], data["num_up_osds"], data["num_in_osds"])
' 2>/dev/null)

  if ! is_uint "$total" || ! is_uint "$num_up" || ! is_uint "$num_in"; then
    unknown "Unable to parse Ceph OSD status"
  fi

  if [ "$num_up" -lt "$total" ]; then
    echo "CRITICAL: OSD down! ($num_up/$total up) | osds=$total up=$num_up in=$num_in"
    exit "$CRITICAL"
  elif [ "$num_in" -lt "$total" ]; then
    echo "WARNING: OSD out! ($num_in/$total in) | osds=$total up=$num_up in=$num_in"
    exit "$WARNING"
  else
    echo "OK: All OSDs are up and in ($total/$total) | osds=$total up=$num_up in=$num_in"
    exit "$OK"
  fi
}

# mon: CRITICAL when fewer than EXPECTED_MONS monitors are in quorum.
check_mon() {
  local mon_json quorum_count
  mon_json=$(sudo ceph mon stat --format json)
  quorum_count=$(printf '%s' "$mon_json" \
    | python3 -c 'import sys, json; print(len(json.load(sys.stdin).get("quorum", [])))' 2>/dev/null)

  is_uint "$quorum_count" || unknown "Unable to parse Ceph monitor status"

  if [ "$quorum_count" -lt "$EXPECTED_MONS" ]; then
    echo "CRITICAL: Monitor Quorum degraded! ($quorum_count/$EXPECTED_MONS) | quorum=$quorum_count"
    exit "$CRITICAL"
  else
    echo "OK: Monitor Quorum is $EXPECTED_MONS/$EXPECTED_MONS | quorum=$quorum_count"
    exit "$OK"
  fi
}

# cap: compare the used-capacity ratio from `ceph df` with the thresholds.
check_cap() {
  local cap_json usage
  cap_json=$(sudo ceph df --format json)

  # Prints the usage as a whole percentage, or -1 when it cannot be derived.
  usage=$(printf '%s' "$cap_json" | python3 -c '
import sys, json
try:
    stats = json.load(sys.stdin)["stats"]
    ratio = stats.get("total_used_raw_ratio", stats.get("total_used_ratio"))
    # float(None) raises, so a missing ratio ends up as -1 instead of 0%.
    print(int(float(ratio) * 100))
except Exception:
    print(-1)
' 2>/dev/null)

  # python3 itself may be missing or may have crashed: no usable number.
  [[ $usage =~ ^-?[0-9]+$ ]] || usage=-1

  if [ "$usage" -lt 0 ]; then
    unknown "Unable to parse Ceph capacity"
  elif [ "$usage" -ge "$CAP_CRITICAL" ]; then
    echo "CRITICAL: Ceph capacity at $usage% | usage=$usage%"
    exit "$CRITICAL"
  elif [ "$usage" -ge "$CAP_WARNING" ]; then
    echo "WARNING: Ceph capacity at $usage% | usage=$usage%"
    exit "$WARNING"
  else
    echo "OK: Ceph capacity at $usage% | usage=$usage%"
    exit "$OK"
  fi
}

# pg: OK only when the summary mentions active+clean and neither degraded nor
# undersized placement groups.
check_pg() {
  local pg_stat
  pg_stat=$(sudo ceph pg stat)

  # No output at all means sudo or ceph failed; that is not a PG state.
  [[ -n $pg_stat ]] || unknown "Unable to query Ceph PG status"

  if [[ $pg_stat == *"active+clean"* && $pg_stat != *"degraded"* && $pg_stat != *"undersized"* ]]; then
    echo "OK: PGs are active+clean"
    exit "$OK"
  else
    echo "WARNING: PGs status: $pg_stat"
    exit "$WARNING"
  fi
}

# Dispatch the requested check; every branch terminates the script itself.
main() {
  if [ -z "${1:-}" ]; then
    echo "Usage: $0 [health|osd|mon|cap|pg]"
    exit "$UNKNOWN"
  fi

  local check=$1
  case "$check" in
    health) check_health ;;
    osd)    check_osd ;;
    mon)    check_mon ;;
    cap)    check_cap ;;
    pg)     check_pg ;;
    *)
      echo "Unknown argument: $check"
      exit "$UNKNOWN"
      ;;
  esac
}

main "$@"
+9
View File
@@ -145,3 +145,12 @@ command[check_nvme_temperature]=/usr/bin/sudo /usr/lib/nagios/plugins/check_nvme
# zpool # zpool
command[check_zpool_health]=/usr/bin/sudo /usr/lib/nagios/plugins/check_zpool_health {{ nrpe_zpool_name }} command[check_zpool_health]=/usr/bin/sudo /usr/lib/nagios/plugins/check_zpool_health {{ nrpe_zpool_name }}
{% endif %} {% endif %}
{% if nrpe_ceph | default(false) | bool %}
# ceph (no sudo prefix here: check_ceph calls sudo itself for each ceph command)
command[check_ceph_health]=/usr/lib/nagios/plugins/check_ceph health
command[check_ceph_osd]=/usr/lib/nagios/plugins/check_ceph osd
command[check_ceph_mon]=/usr/lib/nagios/plugins/check_ceph mon
command[check_ceph_cap]=/usr/lib/nagios/plugins/check_ceph cap
command[check_ceph_pg]=/usr/lib/nagios/plugins/check_ceph pg
{% endif %}
+8
View File
@@ -36,3 +36,11 @@ nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_pvesr
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_nvme_smart -d {{ nrpe_nvme_device }} -w {{ nrpe_nvme_smart_warning }} -c {{ nrpe_nvme_smart_critical }} nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_nvme_smart -d {{ nrpe_nvme_device }} -w {{ nrpe_nvme_smart_warning }} -c {{ nrpe_nvme_smart_critical }}
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_nvme_temperature -d {{ nrpe_nvme_device }} -w {{ nrpe_nvme_temperature_warning }} -c {{ nrpe_nvme_temperature_critical }} nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_nvme_temperature -d {{ nrpe_nvme_device }} -w {{ nrpe_nvme_temperature_warning }} -c {{ nrpe_nvme_temperature_critical }}
{% endif %} {% endif %}
{% if nrpe_ceph | default(false) | bool %}
# ceph: each rule must match, argument for argument, a command that check_ceph runs through sudo
nagios ALL=(ALL) NOPASSWD: /usr/bin/ceph health
nagios ALL=(ALL) NOPASSWD: /usr/bin/ceph osd stat --format json
nagios ALL=(ALL) NOPASSWD: /usr/bin/ceph mon stat --format json
nagios ALL=(ALL) NOPASSWD: /usr/bin/ceph df --format json
nagios ALL=(ALL) NOPASSWD: /usr/bin/ceph pg stat
{% endif %}