diff --git a/README.md b/README.md
index 94b5a89..4d070f0 100644
--- a/README.md
+++ b/README.md
@@ -117,6 +117,7 @@ The following checks are deployed to `/usr/lib/nagios/plugins/` (or configured p
 | `nrpe_redis_fragments_critical` | `2.0` | `check_redis_health` | Critical threshold for fragmentation ratio. |
 | `nrpe_redis_replication_lag_warning` | `10` | `check_redis_health` | Warning threshold for replication lag (seconds). |
 | `nrpe_redis_replication_lag_critical` | `60` | `check_redis_health` | Critical threshold for replication lag (seconds). |
+| `nrpe_ceph` | `false` | `check_ceph` | Enable the Ceph cluster checks (`health`, `osd`, `mon`, `cap`, `pg`). |
 
 ## Example Playbooks
 
diff --git a/files/nrpe/check_ceph b/files/nrpe/check_ceph
new file mode 100755
index 0000000..33f13fc
--- /dev/null
+++ b/files/nrpe/check_ceph
@@ -0,0 +1,191 @@
+#!/bin/bash
+#
+# check_ceph - Nagios/NRPE plugin for basic Ceph cluster monitoring.
+#
+# Usage: check_ceph {health|osd|mon|cap|pg}
+#
+#   health  overall cluster health (HEALTH_OK / HEALTH_WARN / HEALTH_ERR)
+#   osd     all OSDs up (else CRITICAL) and in (else WARNING)
+#   mon     every monitor in the monmap is part of the quorum
+#   cap     raw capacity usage against the CAP_* thresholds below
+#   pg      every placement group is active+clean
+#
+# Each check runs "sudo ceph ..." with exactly the argument list that is
+# whitelisted in the nrpe sudoers template; keep the two in sync. Any
+# failure to run ceph or to parse its output is reported as UNKNOWN so a
+# broken check is never mistaken for a healthy cluster.
+
+# Nagios exit codes
+readonly OK=0
+readonly WARNING=1
+readonly CRITICAL=2
+readonly UNKNOWN=3
+
+# Capacity thresholds (percent of raw space used)
+readonly CAP_WARNING=75
+readonly CAP_CRITICAL=85
+
+# Absolute paths, as referenced by the sudoers rules
+readonly SUDO=/usr/bin/sudo
+readonly CEPH=/usr/bin/ceph
+
+# Print an UNKNOWN status line built from the arguments and exit.
+unknown() {
+  echo "UNKNOWN: $*"
+  exit "$UNKNOWN"
+}
+
+# Run "ceph <args>" via sudo and print its stdout.
+# -n makes sudo fail immediately instead of prompting for a password
+# when no matching sudoers rule exists.
+run_ceph() {
+  "$SUDO" -n "$CEPH" "$@" 2>/dev/null
+}
+
+# Parse JSON from stdin and run the Python statements given in $1 with
+# the parsed document bound to "data". Fails on any Python error.
+json_query() {
+  python3 -c "import json, sys
+data = json.load(sys.stdin)
+$1" 2>/dev/null
+}
+
+# Exit UNKNOWN unless every argument is a non-negative integer.
+require_uint() {
+  local value
+  for value in "$@"; do
+    [[ $value =~ ^[0-9]+$ ]] || unknown "Unexpected value '$value' in ceph output"
+  done
+}
+
+if [ -z "${1:-}" ]; then
+  echo "Usage: $0 [health|osd|mon|cap|pg]"
+  exit "$UNKNOWN"
+fi
+
+COMMAND=$1
+
+case "$COMMAND" in
+  health)
+    # Classify on the status keyword; empty or unrecognised output
+    # (for example when sudo is denied) is UNKNOWN, not CRITICAL.
+    HEALTH_DATA=$(run_ceph health)
+    case "$HEALTH_DATA" in
+      *HEALTH_OK*)
+        echo "OK: Ceph is healthy"
+        exit "$OK"
+        ;;
+      *HEALTH_WARN*)
+        echo "WARNING: $HEALTH_DATA"
+        exit "$WARNING"
+        ;;
+      *HEALTH_ERR*)
+        echo "CRITICAL: $HEALTH_DATA"
+        exit "$CRITICAL"
+        ;;
+      *)
+        unknown "Unable to determine Ceph health"
+        ;;
+    esac
+    ;;
+
+  osd)
+    OSD_JSON=$(run_ceph osd stat --format json) \
+      || unknown "Unable to run 'ceph osd stat'"
+    # Some Ceph releases nest the counters under an "osdmap" object.
+    OSD_STATS=$(json_query '
+s = data.get("osdmap", data)
+print(s["num_osds"], s["num_up_osds"], s["num_in_osds"])' <<<"$OSD_JSON") \
+      || unknown "Unable to parse Ceph OSD status"
+    read -r TOTAL UP IN <<<"$OSD_STATS"
+    require_uint "$TOTAL" "$UP" "$IN"
+
+    if [ "$UP" -lt "$TOTAL" ]; then
+      echo "CRITICAL: OSD down! ($UP/$TOTAL up) | osds=$TOTAL up=$UP in=$IN"
+      exit "$CRITICAL"
+    elif [ "$IN" -lt "$TOTAL" ]; then
+      echo "WARNING: OSD out! ($IN/$TOTAL in) | osds=$TOTAL up=$UP in=$IN"
+      exit "$WARNING"
+    else
+      echo "OK: All OSDs are up and in ($TOTAL/$TOTAL) | osds=$TOTAL up=$UP in=$IN"
+      exit "$OK"
+    fi
+    ;;
+
+  mon)
+    MON_JSON=$(run_ceph mon stat --format json) \
+      || unknown "Unable to run 'ceph mon stat'"
+    # Compare against the monmap size when ceph reports it; fall back to
+    # the conventional three monitors otherwise.
+    MON_STATS=$(json_query '
+print(len(data["quorum"]), data.get("num_mons", 3))' <<<"$MON_JSON") \
+      || unknown "Unable to parse Ceph monitor status"
+    read -r QUORUM_COUNT MON_COUNT <<<"$MON_STATS"
+    require_uint "$QUORUM_COUNT" "$MON_COUNT"
+
+    if [ "$QUORUM_COUNT" -lt "$MON_COUNT" ]; then
+      echo "CRITICAL: Monitor Quorum degraded! ($QUORUM_COUNT/$MON_COUNT) | quorum=$QUORUM_COUNT"
+      exit "$CRITICAL"
+    else
+      echo "OK: Monitor Quorum is $QUORUM_COUNT/$MON_COUNT | quorum=$QUORUM_COUNT"
+      exit "$OK"
+    fi
+    ;;
+
+  cap)
+    CAP_JSON=$(run_ceph df --format json) \
+      || unknown "Unable to run 'ceph df'"
+    # Prefer the ratio ceph reports; derive it from the byte counters
+    # when it is absent instead of silently assuming 0%.
+    USAGE=$(json_query '
+s = data["stats"]
+ratio = s.get("total_used_raw_ratio", s.get("total_used_ratio"))
+if ratio is None:
+    key = "total_used_raw_bytes" if "total_used_raw_bytes" in s else "total_used_bytes"
+    ratio = s[key] / s["total_bytes"]
+print(int(float(ratio) * 100))' <<<"$CAP_JSON") \
+      || unknown "Unable to parse Ceph capacity"
+    require_uint "$USAGE"
+
+    PERFDATA="usage=${USAGE}%;${CAP_WARNING};${CAP_CRITICAL}"
+    if [ "$USAGE" -ge "$CAP_CRITICAL" ]; then
+      echo "CRITICAL: Ceph capacity at $USAGE% | $PERFDATA"
+      exit "$CRITICAL"
+    elif [ "$USAGE" -ge "$CAP_WARNING" ]; then
+      echo "WARNING: Ceph capacity at $USAGE% | $PERFDATA"
+      exit "$WARNING"
+    else
+      echo "OK: Ceph capacity at $USAGE% | $PERFDATA"
+      exit "$OK"
+    fi
+    ;;
+
+  pg)
+    PG_JSON=$(run_ceph pg stat --format json) \
+      || unknown "Unable to run 'ceph pg stat'"
+    # Count PGs per state so that a mix of clean and unhealthy PGs is
+    # detected; states such as active+clean+scrubbing count as clean.
+    # Some Ceph releases nest the counters under "pg_summary".
+    PG_STATS=$(json_query '
+s = data.get("pg_summary", data)
+states = s["num_pg_by_state"]
+clean = sum(st["num"] for st in states if {"active", "clean"} <= set(st["name"].split("+")))
+print(s["num_pgs"], clean, ", ".join("%d %s" % (st["num"], st["name"]) for st in states))' <<<"$PG_JSON") \
+      || unknown "Unable to parse Ceph PG status"
+    read -r PG_TOTAL PG_CLEAN PG_STATES <<<"$PG_STATS"
+    require_uint "$PG_TOTAL" "$PG_CLEAN"
+
+    if [ "$PG_TOTAL" -gt 0 ] && [ "$PG_CLEAN" -eq "$PG_TOTAL" ]; then
+      echo "OK: PGs are active+clean | pgs=$PG_TOTAL active_clean=$PG_CLEAN"
+      exit "$OK"
+    else
+      echo "WARNING: PGs status: $PG_CLEAN/$PG_TOTAL active+clean ($PG_STATES) | pgs=$PG_TOTAL active_clean=$PG_CLEAN"
+      exit "$WARNING"
+    fi
+    ;;
+
+  *)
+    echo "Unknown argument: $COMMAND"
+    exit "$UNKNOWN"
+    ;;
+esac
diff --git a/templates/nrpe.j2 b/templates/nrpe.j2
index 3585a1f..632c60e 100644
--- a/templates/nrpe.j2
+++ b/templates/nrpe.j2
@@ -145,3 +145,12 @@ command[check_nvme_temperature]=/usr/bin/sudo /usr/lib/nagios/plugins/check_nvme
 # zpool
 command[check_zpool_health]=/usr/bin/sudo /usr/lib/nagios/plugins/check_zpool_health {{ nrpe_zpool_name }}
 {% endif %}
+
+{% if nrpe_ceph | default(false) | bool %}
+# ceph (check_ceph invokes sudo itself, see nrpe.sudoers.j2)
+command[check_ceph_health]=/usr/lib/nagios/plugins/check_ceph health
+command[check_ceph_osd]=/usr/lib/nagios/plugins/check_ceph osd
+command[check_ceph_mon]=/usr/lib/nagios/plugins/check_ceph mon
+command[check_ceph_cap]=/usr/lib/nagios/plugins/check_ceph cap
+command[check_ceph_pg]=/usr/lib/nagios/plugins/check_ceph pg
+{% endif %}
diff --git a/templates/nrpe.sudoers.j2 b/templates/nrpe.sudoers.j2
index c4eac59..6de5c22 100644
--- a/templates/nrpe.sudoers.j2
+++ b/templates/nrpe.sudoers.j2
@@ -35,4 +35,12 @@ nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_pvesr
 {% if nrpe_nvme_device is defined %}
 nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_nvme_smart -d {{ nrpe_nvme_device }} -w {{ nrpe_nvme_smart_warning }} -c {{ nrpe_nvme_smart_critical }}
 nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_nvme_temperature -d {{ nrpe_nvme_device }} -w {{ nrpe_nvme_temperature_warning }} -c {{ nrpe_nvme_temperature_critical }}
+{% endif %}
+
+{% if nrpe_ceph | default(false) | bool %}
+nagios ALL=(ALL) NOPASSWD: /usr/bin/ceph health
+nagios ALL=(ALL) NOPASSWD: /usr/bin/ceph osd stat --format json
+nagios ALL=(ALL) NOPASSWD: /usr/bin/ceph mon stat --format json
+nagios ALL=(ALL) NOPASSWD: /usr/bin/ceph df --format json
+nagios ALL=(ALL) NOPASSWD: /usr/bin/ceph pg stat --format json
 {% endif %}
\ No newline at end of file