You've already forked nrpe
feat - add ceph check
This commit is contained in:
@@ -117,6 +117,7 @@ The following checks are deployed to `/usr/lib/nagios/plugins/` (or configured p
|
||||
| `nrpe_redis_fragments_critical` | `2.0` | `check_redis_health` | Critical threshold for fragmentation ratio. |
|
||||
| `nrpe_redis_replication_lag_warning` | `10` | `check_redis_health` | Warning threshold for replication lag (seconds). |
|
||||
| `nrpe_redis_replication_lag_critical` | `60` | `check_redis_health` | Critical threshold for replication lag (seconds). |
|
||||
| `nrpe_ceph` | `false` | `check_ceph` | Enables Ceph cluster monitoring (health, OSD, monitor quorum, capacity and PG state checks). |
|
||||
|
||||
## Example Playbooks
|
||||
|
||||
|
||||
Executable
+104
@@ -0,0 +1,104 @@
|
||||
#!/bin/bash
#
# check_ceph - Nagios/NRPE plugin reporting the state of a Ceph cluster.
#
# Usage: check_ceph [health|osd|mon|cap|pg]
#   health  overall status line printed by "ceph health"
#   osd     OSD total/up/in counters from "ceph osd stat --format json"
#   mon     monitor quorum size from "ceph mon stat --format json"
#           (fewer than 3 quorum members is CRITICAL)
#   cap     raw capacity usage from "ceph df --format json"
#           (WARNING at >= 75%, CRITICAL at >= 85%)
#   pg      placement-group summary printed by "ceph pg stat"
#
# Every ceph call goes through "sudo -n", so the invoking user needs
# passwordless sudo rules that match the exact command lines used below
# (including "ceph pg stat" without any --format option).
# JSON output is parsed with python3.

# Nagios Exit Codes
readonly OK=0
readonly WARNING=1
readonly CRITICAL=2
readonly UNKNOWN=3

#######################################
# Prints the plugin status line on stdout and terminates the script.
# Arguments: $1 - exit code; remaining arguments - status text
#######################################
finish() {
  local code=$1
  shift
  printf '%s\n' "$*"
  exit "$code"
}

#######################################
# Runs "sudo -n ceph ARGS..." and stores its stdout in CEPH_OUT.
# -n makes sudo fail immediately instead of prompting for a password.
# Exits UNKNOWN when the command fails or prints nothing, so that a
# broken query can never be mistaken for a healthy cluster.
# Globals:   CEPH_OUT (written)
# Arguments: the ceph sub-command and its options
#######################################
ceph_query() {
  if ! CEPH_OUT=$(sudo -n ceph "$@") || [[ -z "$CEPH_OUT" ]]; then
    finish "$UNKNOWN" "UNKNOWN: Unable to run 'ceph $*'"
  fi
}

if [[ -z "${1:-}" ]]; then
  finish "$UNKNOWN" "Usage: $0 [health|osd|mon|cap|pg]"
fi

COMMAND=$1

case "$COMMAND" in
  health)
    # Not routed through ceph_query: a HEALTH_* status line is honoured
    # even if the command itself returned a non-zero exit status.
    health_data=$(sudo -n ceph health)
    health_rc=$?
    if [[ $health_data == *"HEALTH_OK"* ]]; then
      finish "$OK" "OK: Ceph is healthy"
    elif [[ $health_data == *"HEALTH_WARN"* ]]; then
      finish "$WARNING" "WARNING: $health_data"
    elif [[ $health_data != *"HEALTH_ERR"* ]] \
      && { ((health_rc != 0)) || [[ -z $health_data ]]; }; then
      # No status could be obtained at all: that is UNKNOWN, not CRITICAL.
      finish "$UNKNOWN" "UNKNOWN: Unable to run 'ceph health'"
    else
      finish "$CRITICAL" "CRITICAL: $health_data"
    fi
    ;;

  osd)
    ceph_query osd stat --format json
    # One python run extracts all three counters. Missing or non-numeric
    # counters make python exit non-zero instead of silently becoming 0.
    # NOTE(review): older Ceph releases are believed to nest these counters
    # under an "osdmap" key; both layouts are accepted - confirm.
    # python's stderr is discarded on purpose: failure is reported through
    # the UNKNOWN status line below.
    if ! osd_counts=$(python3 -c '
import json, sys
stat = json.load(sys.stdin)
stat = stat.get("osdmap", stat)
print(int(stat["num_osds"]), int(stat["num_up_osds"]), int(stat["num_in_osds"]))
' <<<"$CEPH_OUT" 2>/dev/null); then
      finish "$UNKNOWN" "UNKNOWN: Unable to parse Ceph OSD status"
    fi
    read -r osd_total osd_up osd_in <<<"$osd_counts"
    if [[ ! $osd_total =~ ^[0-9]+$ || ! $osd_up =~ ^[0-9]+$ \
      || ! $osd_in =~ ^[0-9]+$ ]]; then
      finish "$UNKNOWN" "UNKNOWN: Unable to parse Ceph OSD status"
    fi

    perfdata="osds=$osd_total up=$osd_up in=$osd_in"
    if ((osd_up < osd_total)); then
      finish "$CRITICAL" "CRITICAL: OSD down! ($osd_up/$osd_total up) | $perfdata"
    elif ((osd_in < osd_total)); then
      finish "$WARNING" "WARNING: OSD out! ($osd_in/$osd_total in) | $perfdata"
    else
      finish "$OK" "OK: All OSDs are up and in ($osd_total/$osd_total) | $perfdata"
    fi
    ;;

  mon)
    ceph_query mon stat --format json
    # A missing "quorum" list counts as zero members (reported CRITICAL).
    quorum_count=$(python3 -c '
import json, sys
print(len(json.load(sys.stdin).get("quorum", [])))
' <<<"$CEPH_OUT" 2>/dev/null)
    # Guard against python3 failing or being absent: an empty value must
    # not fall through to the OK branch.
    if [[ ! $quorum_count =~ ^[0-9]+$ ]]; then
      finish "$UNKNOWN" "UNKNOWN: Unable to parse Ceph monitor status"
    fi

    if ((quorum_count < 3)); then
      finish "$CRITICAL" "CRITICAL: Monitor Quorum degraded! ($quorum_count/3) | quorum=$quorum_count"
    else
      finish "$OK" "OK: Monitor Quorum is 3/3 | quorum=$quorum_count"
    fi
    ;;

  cap)
    ceph_query df --format json
    # Prints the used ratio as a whole percentage, or -1 when the document
    # cannot be interpreted.
    usage=$(python3 -c '
import json, sys
try:
    stats = json.load(sys.stdin)["stats"]
    ratio = stats.get("total_used_raw_ratio", stats.get("total_used_ratio", 0))
    print(int(float(ratio) * 100))
except Exception:
    print(-1)
' <<<"$CEPH_OUT" 2>/dev/null)

    # Anything but a non-negative integer (-1, empty output because python3
    # is unavailable, ...) means the capacity is not known.
    if [[ ! $usage =~ ^[0-9]+$ ]]; then
      finish "$UNKNOWN" "UNKNOWN: Unable to parse Ceph capacity"
    elif ((usage >= 85)); then
      finish "$CRITICAL" "CRITICAL: Ceph capacity at ${usage}% | usage=${usage}%"
    elif ((usage >= 75)); then
      finish "$WARNING" "WARNING: Ceph capacity at ${usage}% | usage=${usage}%"
    else
      finish "$OK" "OK: Ceph capacity at ${usage}% | usage=${usage}%"
    fi
    ;;

  pg)
    # Plain-text output is matched here, so no --format option is passed.
    ceph_query pg stat
    if [[ $CEPH_OUT == *"active+clean"* && $CEPH_OUT != *"degraded"* \
      && $CEPH_OUT != *"undersized"* ]]; then
      finish "$OK" "OK: PGs are active+clean"
    else
      finish "$WARNING" "WARNING: PGs status: $CEPH_OUT"
    fi
    ;;

  *)
    finish "$UNKNOWN" "Unknown argument: $COMMAND"
    ;;
esac
|
||||
@@ -145,3 +145,12 @@ command[check_nvme_temperature]=/usr/bin/sudo /usr/lib/nagios/plugins/check_nvme
|
||||
# zpool
|
||||
command[check_zpool_health]=/usr/bin/sudo /usr/lib/nagios/plugins/check_zpool_health {{ nrpe_zpool_name }}
|
||||
{% endif %}
|
||||
|
||||
{% if nrpe_ceph | default(false) | bool %}
# ceph
# check_ceph invokes sudo itself for each ceph call, so these commands are
# not prefixed with /usr/bin/sudo.
command[check_ceph_health]=/usr/lib/nagios/plugins/check_ceph health
command[check_ceph_osd]=/usr/lib/nagios/plugins/check_ceph osd
command[check_ceph_mon]=/usr/lib/nagios/plugins/check_ceph mon
command[check_ceph_cap]=/usr/lib/nagios/plugins/check_ceph cap
command[check_ceph_pg]=/usr/lib/nagios/plugins/check_ceph pg
{% endif %}
|
||||
@@ -36,3 +36,11 @@ nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_pvesr
|
||||
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_nvme_smart -d {{ nrpe_nvme_device }} -w {{ nrpe_nvme_smart_warning }} -c {{ nrpe_nvme_smart_critical }}
|
||||
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_nvme_temperature -d {{ nrpe_nvme_device }} -w {{ nrpe_nvme_temperature_warning }} -c {{ nrpe_nvme_temperature_critical }}
|
||||
{% endif %}
|
||||
|
||||
{% if nrpe_ceph | default(false) | bool %}
# ceph: commands run by check_ceph. sudoers matches the argument list
# exactly, so each rule must mirror the plugin's invocation word for word.
nagios ALL=(ALL) NOPASSWD: /usr/bin/ceph health
nagios ALL=(ALL) NOPASSWD: /usr/bin/ceph osd stat --format json
nagios ALL=(ALL) NOPASSWD: /usr/bin/ceph mon stat --format json
nagios ALL=(ALL) NOPASSWD: /usr/bin/ceph df --format json
nagios ALL=(ALL) NOPASSWD: /usr/bin/ceph pg stat
{% endif %}
|
||||
Reference in New Issue
Block a user