You've already forked nrpe
feat - add ceph check
This commit is contained in:
@@ -117,6 +117,7 @@ The following checks are deployed to `/usr/lib/nagios/plugins/` (or configured p
|
|||||||
| `nrpe_redis_fragments_critical` | `2.0` | `check_redis_health` | Critical threshold for fragmentation ratio. |
|
| `nrpe_redis_fragments_critical` | `2.0` | `check_redis_health` | Critical threshold for fragmentation ratio. |
|
||||||
| `nrpe_redis_replication_lag_warning` | `10` | `check_redis_health` | Warning threshold for replication lag (seconds). |
|
| `nrpe_redis_replication_lag_warning` | `10` | `check_redis_health` | Warning threshold for replication lag (seconds). |
|
||||||
| `nrpe_redis_replication_lag_critical` | `60` | `check_redis_health` | Critical threshold for replication lag (seconds). |
|
| `nrpe_redis_replication_lag_critical` | `60` | `check_redis_health` | Critical threshold for replication lag (seconds). |
|
||||||
|
| `nrpe_ceph` | `false` | `check_ceph` | Enables the Ceph cluster checks (health, OSD, monitor quorum, capacity and placement groups). |
|
||||||
|
|
||||||
## Example Playbooks
|
## Example Playbooks
|
||||||
|
|
||||||
|
|||||||
Executable
+104
@@ -0,0 +1,104 @@
|
|||||||
|
#!/bin/bash
#
# check_ceph - Nagios/NRPE plugin reporting on the state of a Ceph cluster.
#
# Usage: check_ceph <health|osd|mon|cap|pg> [expected_mons]
#
#   health  overall cluster health as printed by `ceph health`
#   osd     CRITICAL when an OSD is down, WARNING when an OSD is out
#   mon     CRITICAL when fewer than expected_mons monitors are in quorum
#           (expected_mons is optional and defaults to 3)
#   cap     raw capacity usage: WARNING at >= 75%, CRITICAL at >= 85%
#   pg      WARNING unless placement groups are active+clean and neither
#           degraded nor undersized
#
# Every ceph call goes through `sudo -n`, so the nagios user needs matching
# NOPASSWD sudoers rules. JSON output is parsed with python3.
#
# All results are written to stdout because that is the stream NRPE relays.

# Nagios Exit Codes
readonly OK=0
readonly WARNING=1
readonly CRITICAL=2
readonly UNKNOWN=3

# Quorum size assumed by the "mon" check when no second argument is given.
readonly DEFAULT_EXPECTED_MONS=3

# Output of the most recent ceph_query call.
CEPH_OUT=""

#######################################
# Print an UNKNOWN status line and terminate the plugin.
# Arguments: $* - message text
#######################################
unknown() {
  echo "UNKNOWN: $*"
  exit "$UNKNOWN"
}

#######################################
# Succeed when $1 consists only of decimal digits.
#######################################
is_uint() {
  [[ "$1" =~ ^[0-9]+$ ]]
}

#######################################
# Run `sudo -n ceph <args>` and store its stdout in CEPH_OUT.
# -n makes sudo fail immediately instead of waiting for a password that
# can never be typed under NRPE. An empty result means the command could
# not deliver data, so the plugin stops with UNKNOWN rather than letting
# a check evaluate empty values.
# Globals:   CEPH_OUT (written)
# Arguments: $@ - arguments passed to ceph
#######################################
ceph_query() {
  CEPH_OUT=$(sudo -n ceph "$@") || true
  if [[ -z "$CEPH_OUT" ]]; then
    unknown "'ceph $*' returned no output"
  fi
}

#######################################
# Map the `ceph health` summary to a Nagios state.
#######################################
check_health() {
  ceph_query health
  case "$CEPH_OUT" in
    *HEALTH_OK*)
      echo "OK: Ceph is healthy"
      exit "$OK"
      ;;
    *HEALTH_WARN*)
      echo "WARNING: $CEPH_OUT"
      exit "$WARNING"
      ;;
    *)
      # HEALTH_ERR and anything unrecognised is treated as critical.
      echo "CRITICAL: $CEPH_OUT"
      exit "$CRITICAL"
      ;;
  esac
}

#######################################
# Compare the number of up / in OSDs with the total number of OSDs.
#######################################
check_osd() {
  local total="" up="" in_count=""

  ceph_query osd stat --format json
  # NOTE(review): older Ceph releases are believed to nest the counters in
  # an "osdmap" object - confirm against the releases in use. A missing key
  # raises in Python, leaving the variables empty so the check ends UNKNOWN.
  read -r total up in_count < <(
    printf '%s' "$CEPH_OUT" | python3 -c '
import json, sys
d = json.load(sys.stdin)
d = d.get("osdmap", d)
print(d["num_osds"], d["num_up_osds"], d["num_in_osds"])
' 2>/dev/null
  )

  if ! is_uint "$total" || ! is_uint "$up" || ! is_uint "$in_count"; then
    unknown "Unable to parse Ceph OSD status"
  fi

  if [ "$up" -lt "$total" ]; then
    echo "CRITICAL: OSD down! ($up/$total up) | osds=$total up=$up in=$in_count"
    exit "$CRITICAL"
  elif [ "$in_count" -lt "$total" ]; then
    echo "WARNING: OSD out! ($in_count/$total in) | osds=$total up=$up in=$in_count"
    exit "$WARNING"
  fi

  echo "OK: All OSDs are up and in ($total/$total) | osds=$total up=$up in=$in_count"
  exit "$OK"
}

#######################################
# Check that enough monitors are in quorum.
# Arguments: $1 - number of monitors expected in quorum
#######################################
check_mon() {
  local expected=$1
  local quorum_count=""

  if ! is_uint "$expected" || [ "$expected" -lt 1 ]; then
    unknown "Expected monitor count must be a positive integer, got '$expected'"
  fi

  ceph_query mon stat --format json
  quorum_count=$(
    printf '%s' "$CEPH_OUT" | python3 -c '
import json, sys
print(len(json.load(sys.stdin)["quorum"]))
' 2>/dev/null
  )

  if ! is_uint "$quorum_count"; then
    unknown "Unable to parse Ceph monitor status"
  fi

  if [ "$quorum_count" -lt "$expected" ]; then
    echo "CRITICAL: Monitor Quorum degraded! ($quorum_count/$expected) | quorum=$quorum_count"
    exit "$CRITICAL"
  fi

  echo "OK: Monitor Quorum is $quorum_count/$expected | quorum=$quorum_count"
  exit "$OK"
}

#######################################
# Check raw cluster capacity usage against the 75% / 85% thresholds.
#######################################
check_cap() {
  local usage=""

  ceph_query df --format json
  # Use the ratio reported by ceph when present. If neither ratio key
  # exists, derive it from the byte counters instead of assuming 0%.
  usage=$(
    printf '%s' "$CEPH_OUT" | python3 -c '
import json, sys
stats = json.load(sys.stdin)["stats"]
ratio = stats.get("total_used_raw_ratio", stats.get("total_used_ratio"))
if ratio is None:
    used = stats.get("total_used_raw_bytes", stats.get("total_used_bytes"))
    ratio = float(used) / float(stats["total_bytes"])
print(int(float(ratio) * 100))
' 2>/dev/null
  )

  if ! is_uint "$usage"; then
    unknown "Unable to parse Ceph capacity"
  fi

  if [ "$usage" -ge 85 ]; then
    echo "CRITICAL: Ceph capacity at $usage% | usage=$usage%"
    exit "$CRITICAL"
  elif [ "$usage" -ge 75 ]; then
    echo "WARNING: Ceph capacity at $usage% | usage=$usage%"
    exit "$WARNING"
  fi

  echo "OK: Ceph capacity at $usage% | usage=$usage%"
  exit "$OK"
}

#######################################
# Check the placement group summary printed by `ceph pg stat`.
#######################################
check_pg() {
  ceph_query pg stat
  if [[ "$CEPH_OUT" == *"active+clean"* &&
    "$CEPH_OUT" != *"degraded"* &&
    "$CEPH_OUT" != *"undersized"* ]]; then
    echo "OK: PGs are active+clean"
    exit "$OK"
  fi

  echo "WARNING: PGs status: $CEPH_OUT"
  exit "$WARNING"
}

#######################################
# Entry point: dispatch on the requested check.
# Arguments: $1 - check name, $2 - optional expected monitor count
#######################################
main() {
  local command=${1:-}

  if [[ -z "$command" ]]; then
    echo "Usage: $0 [health|osd|mon|cap|pg]"
    exit "$UNKNOWN"
  fi

  case "$command" in
    health) check_health ;;
    osd) check_osd ;;
    mon) check_mon "${2:-$DEFAULT_EXPECTED_MONS}" ;;
    cap) check_cap ;;
    pg) check_pg ;;
    *)
      echo "Unknown argument: $command"
      exit "$UNKNOWN"
      ;;
  esac
}

main "$@"
|
||||||
@@ -145,3 +145,12 @@ command[check_nvme_temperature]=/usr/bin/sudo /usr/lib/nagios/plugins/check_nvme
|
|||||||
# zpool
|
# zpool
|
||||||
command[check_zpool_health]=/usr/bin/sudo /usr/lib/nagios/plugins/check_zpool_health {{ nrpe_zpool_name }}
|
command[check_zpool_health]=/usr/bin/sudo /usr/lib/nagios/plugins/check_zpool_health {{ nrpe_zpool_name }}
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
|
||||||
|
{% if nrpe_ceph | default(false) | bool %}
# ceph
# check_ceph calls sudo itself for each ceph command, so the plugin is not
# wrapped in sudo here.
command[check_ceph_health]=/usr/lib/nagios/plugins/check_ceph health
command[check_ceph_osd]=/usr/lib/nagios/plugins/check_ceph osd
command[check_ceph_mon]=/usr/lib/nagios/plugins/check_ceph mon
command[check_ceph_cap]=/usr/lib/nagios/plugins/check_ceph cap
command[check_ceph_pg]=/usr/lib/nagios/plugins/check_ceph pg
{% endif %}
|
||||||
@@ -36,3 +36,11 @@ nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_pvesr
|
|||||||
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_nvme_smart -d {{ nrpe_nvme_device }} -w {{ nrpe_nvme_smart_warning }} -c {{ nrpe_nvme_smart_critical }}
|
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_nvme_smart -d {{ nrpe_nvme_device }} -w {{ nrpe_nvme_smart_warning }} -c {{ nrpe_nvme_smart_critical }}
|
||||||
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_nvme_temperature -d {{ nrpe_nvme_device }} -w {{ nrpe_nvme_temperature_warning }} -c {{ nrpe_nvme_temperature_critical }}
|
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_nvme_temperature -d {{ nrpe_nvme_device }} -w {{ nrpe_nvme_temperature_warning }} -c {{ nrpe_nvme_temperature_critical }}
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
|
||||||
|
{% if nrpe_ceph | default(false) | bool %}
# ceph: sudo matches the argument list exactly, so every rule below must
# mirror the command line used by the check_ceph plugin.
nagios ALL=(ALL) NOPASSWD: /usr/bin/ceph health
nagios ALL=(ALL) NOPASSWD: /usr/bin/ceph osd stat --format json
nagios ALL=(ALL) NOPASSWD: /usr/bin/ceph mon stat --format json
nagios ALL=(ALL) NOPASSWD: /usr/bin/ceph df --format json
nagios ALL=(ALL) NOPASSWD: /usr/bin/ceph pg stat
{% endif %}
|
||||||
Reference in New Issue
Block a user