diff --git a/defaults/main.yml b/defaults/main.yml
index 430ebd5..5581355 100644
--- a/defaults/main.yml
+++ b/defaults/main.yml
@@ -45,3 +45,17 @@ nrpe_mysql_longqueries_critical: 1200
 
 nrpe_proc_age_warning: 400
 nrpe_proc_age_critical: 600
+
+# Thresholds passed to check_redis_health as -w / -c.
+# memory and hitrate are percentages; replication lag is in seconds.
+# hitrate alerts when the value is at or below the threshold (warning > critical).
+nrpe_redis_memory_warning: 80
+nrpe_redis_memory_critical: 90
+nrpe_redis_connected_clients_warning: 200
+nrpe_redis_connected_clients_critical: 500
+nrpe_redis_hitrate_warning: 80
+nrpe_redis_hitrate_critical: 50
+nrpe_redis_fragments_warning: '1.5'
+nrpe_redis_fragments_critical: '2.0'
+nrpe_redis_replication_lag_warning: 10
+nrpe_redis_replication_lag_critical: 60
diff --git a/files/nrpe/check_redis_health b/files/nrpe/check_redis_health
new file mode 100755
index 0000000..af6a596
--- /dev/null
+++ b/files/nrpe/check_redis_health
@@ -0,0 +1,268 @@
+#!/bin/bash
+#
+# Nagios plugin to check for redis health metrics.
+#
+# Copyright (c) 2026, GitHub Copilot
+#
+# This script is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This script is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+# Every redis-server instance listening on a local TCP port is discovered
+# and queried with redis-cli. Per-instance results are aggregated into one
+# Nagios status line; the exit code is the worst state found.
+
+# Nagios exit codes
+STATE_OK=0
+STATE_WARNING=1
+STATE_CRITICAL=2
+STATE_UNKNOWN=3
+
+# --- Defaults ---
+CHECK_TYPE=""
+WARN=0
+CRIT=0
+AUTH=""
+
+# Value shown when redis answers PING but a required metric cannot be read.
+NO_DATA="Métrique indisponible"
+
+# --- Help ---
+# Prints the usage text and exits with STATE_UNKNOWN.
+usage() {
+  cat << EOF
+Usage: $0 -x [type] -w <warn> -c <crit> [-a <password>]
+
+Types de check (-x):
+  ping        : Test de connexion simple (PONG)
+  memory      : Utilisation mémoire en % (Ex: -w 80 -c 90)
+  frag        : Ratio de fragmentation (Ex: -w 1.5 -c 2.0)
+  hitrate     : Taux d'efficacité du cache en % (Ex: -w 80 -c 50)
+  persistence : État des sauvegardes RDB/AOF
+  replication : Statut Master/Slave et lag (Ex: -w 10 -c 60)
+  clients     : Nombre de clients connectés (Ex: -w 200 -c 500)
+
+Optionnelle:
+  -a <password> : Mot de passe Redis (sera passé via REDISCLI_AUTH)
+EOF
+  exit $STATE_UNKNOWN
+}
+
+# Succeeds when $1 is a non-negative integer.
+is_uint() {
+  [[ "$1" =~ ^[0-9]+$ ]]
+}
+
+# Succeeds when $1 is a non-negative decimal number (e.g. 2 or 1.5).
+is_decimal() {
+  [[ "$1" =~ ^[0-9]+([.][0-9]+)?$ ]]
+}
+
+# Succeeds when decimal $1 >= decimal $2. awk is used because bash has no
+# floating point arithmetic and awk is already required for port discovery.
+float_ge() {
+  awk -v a="$1" -v b="$2" 'BEGIN { exit !(a + 0 >= b + 0) }'
+}
+
+# Prints the value of field $2 ("key:value" line) from the INFO output in $1.
+info_field() {
+  awk -F: -v key="$2" '$1 == key { sub(/\r$/, "", $2); print $2; exit }' <<< "$1"
+}
+
+# Records one instance result ($1 = Nagios state, $2 = message) in the
+# matching list and raises EXIT_CODE to the worst state seen so far.
+add_result() {
+  case "$1" in
+    "$STATE_CRITICAL")
+      CRIT_LIST="${CRIT_LIST:+$CRIT_LIST, }$2"
+      EXIT_CODE=$STATE_CRITICAL
+      ;;
+    "$STATE_WARNING")
+      WARN_LIST="${WARN_LIST:+$WARN_LIST, }$2"
+      if [ "$EXIT_CODE" -lt "$STATE_WARNING" ]; then EXIT_CODE=$STATE_WARNING; fi
+      ;;
+    *)
+      OK_LIST="${OK_LIST:+$OK_LIST, }$2"
+      ;;
+  esac
+}
+
+# --- Argument parsing ---
+while getopts "x:w:c:a:?" opt; do
+  case "$opt" in
+    x) CHECK_TYPE="$OPTARG" ;;
+    w) WARN="$OPTARG" ;;
+    c) CRIT="$OPTARG" ;;
+    a) AUTH="$OPTARG" ;;
+    *) usage ;;
+  esac
+done
+
+# Validate the check type and its thresholds up front: an unknown type or a
+# non-numeric threshold must yield UNKNOWN, not a silent OK further down.
+case "$CHECK_TYPE" in
+  ping|persistence) ;;
+  memory|hitrate|replication|clients)
+    if ! is_uint "$WARN" || ! is_uint "$CRIT"; then usage; fi
+    ;;
+  frag)
+    if ! is_decimal "$WARN" || ! is_decimal "$CRIT"; then usage; fi
+    ;;
+  *) usage ;;
+esac
+
+# Export the password for redis-cli (kept out of the redis-cli command line)
+if [ -n "$AUTH" ]; then
+  export REDISCLI_AUTH="$AUTH"
+fi
+
+EXIT_CODE=$STATE_OK
+INSTANCES_FOUND=0
+
+# Lists holding the per-instance messages for each state
+CRIT_LIST=""
+WARN_LIST=""
+OK_LIST=""
+
+# 1. Port discovery: ss, then netstat, then the process list as a last resort
+if command -v ss &> /dev/null; then
+  PORTS=$(ss -tlnp | grep 'redis-server' | awk '{print $4}' | awk -F: '{print $NF}' | sort -u)
+elif command -v netstat &> /dev/null; then
+  PORTS=$(netstat -tlnp | grep 'redis-server' | awk '{print $4}' | awk -F: '{print $NF}' | sort -u)
+else
+  PORTS=$(ps -ef | grep 'redis-server' | grep -v grep | grep -oP '(?<=:)\d{4,5}' | sort -u)
+fi
+
+if [ -z "$PORTS" ]; then
+  echo "OK: Aucun Redis détecté"
+  exit $STATE_OK
+fi
+
+# 2. Run the requested check against every instance
+for PORT in $PORTS; do
+  # Skip anything that is not a plain, unprivileged (>= 1024) port number
+  if ! is_uint "$PORT" || [ "$PORT" -lt 1024 ]; then continue; fi
+
+  INSTANCES_FOUND=$((INSTANCES_FOUND + 1))
+
+  # Make sure the instance answers before reading any metric
+  if ! redis-cli -p "$PORT" PING 2>/dev/null | grep -q "PONG"; then
+    add_result "$STATE_CRITICAL" "Port $PORT (Erreur Connexion/Auth)"
+    continue
+  fi
+
+  CURR_STATUS=$STATE_OK
+  VAL_STR=""
+
+  case "$CHECK_TYPE" in
+    ping)
+      VAL_STR="PONG"
+      ;;
+    memory)
+      MEM_INFO=$(redis-cli -p "$PORT" info memory 2>/dev/null)
+      USED=$(info_field "$MEM_INFO" used_memory)
+      MAX=$(info_field "$MEM_INFO" maxmemory)
+      USED_H=$(info_field "$MEM_INFO" used_memory_human)
+      MAX_H=$(info_field "$MEM_INFO" maxmemory_human)
+      if ! is_uint "$USED" || ! is_uint "$MAX"; then
+        CURR_STATUS=$STATE_CRITICAL
+        VAL_STR="$NO_DATA"
+      elif [ "$MAX" -gt 0 ]; then
+        VALUE=$(( USED * 100 / MAX ))
+        VAL_STR="${VALUE}% (Used:${USED_H}, Max:${MAX_H})"
+        if [ "$VALUE" -ge "$CRIT" ]; then CURR_STATUS=$STATE_CRITICAL; elif [ "$VALUE" -ge "$WARN" ]; then CURR_STATUS=$STATE_WARNING; fi
+      else
+        # maxmemory is 0: there is no limit to compare against, always OK
+        VAL_STR="NoLimit (Used:${USED_H})"
+      fi
+      ;;
+    frag)
+      FRAG=$(info_field "$(redis-cli -p "$PORT" info memory 2>/dev/null)" mem_fragmentation_ratio)
+      if ! is_decimal "$FRAG"; then
+        CURR_STATUS=$STATE_CRITICAL
+        VAL_STR="$NO_DATA"
+      else
+        VAL_STR="Ratio:$FRAG"
+        if float_ge "$FRAG" "$CRIT"; then CURR_STATUS=$STATE_CRITICAL; elif float_ge "$FRAG" "$WARN"; then CURR_STATUS=$STATE_WARNING; fi
+      fi
+      ;;
+    hitrate)
+      STATS=$(redis-cli -p "$PORT" info stats 2>/dev/null)
+      HITS=$(info_field "$STATS" keyspace_hits)
+      MISSES=$(info_field "$STATS" keyspace_misses)
+      if ! is_uint "$HITS" || ! is_uint "$MISSES"; then
+        CURR_STATUS=$STATE_CRITICAL
+        VAL_STR="$NO_DATA"
+      else
+        TOTAL=$((HITS + MISSES))
+        # With no lookups recorded the hit rate is reported as 100%
+        VALUE=100
+        if [ "$TOTAL" -gt 0 ]; then VALUE=$(( HITS * 100 / TOTAL )); fi
+        VAL_STR="${VALUE}% (Hits:$HITS, Misses:$MISSES)"
+        # Lower is worse here, hence -le and warning > critical
+        if [ "$VALUE" -le "$CRIT" ]; then CURR_STATUS=$STATE_CRITICAL; elif [ "$VALUE" -le "$WARN" ]; then CURR_STATUS=$STATE_WARNING; fi
+      fi
+      ;;
+    persistence)
+      PERS=$(redis-cli -p "$PORT" info persistence 2>/dev/null)
+      RDB=$(info_field "$PERS" rdb_last_bgsave_status)
+      AOF=$(info_field "$PERS" aof_last_write_status)
+      VAL_STR="RDB:$RDB, AOF:$AOF"
+      if [ "$RDB" != "ok" ] || [ "$AOF" != "ok" ]; then CURR_STATUS=$STATE_CRITICAL; fi
+      ;;
+    replication)
+      REPL=$(redis-cli -p "$PORT" info replication 2>/dev/null)
+      ROLE=$(info_field "$REPL" role)
+      if [ "$ROLE" == "slave" ]; then
+        LINK=$(info_field "$REPL" master_link_status)
+        LAG=$(info_field "$REPL" master_last_io_seconds_ago)
+        VAL_STR="Slave, Link:$LINK, Lag:${LAG}s"
+        # A lag that is not a non-negative integer is critical, not a test error
+        if [ "$LINK" != "up" ] || ! is_uint "$LAG" || [ "$LAG" -ge "$CRIT" ]; then CURR_STATUS=$STATE_CRITICAL; elif [ "$LAG" -ge "$WARN" ]; then CURR_STATUS=$STATE_WARNING; fi
+      else
+        SLAVE_COUNT=$(info_field "$REPL" connected_slaves)
+        VAL_STR="Master, Slaves:$SLAVE_COUNT"
+      fi
+      ;;
+    clients)
+      CLIENTS=$(info_field "$(redis-cli -p "$PORT" info clients 2>/dev/null)" connected_clients)
+      MAX_CLIENTS=$(redis-cli -p "$PORT" config get maxclients 2>/dev/null | tail -n1)
+      if ! is_uint "$CLIENTS"; then
+        CURR_STATUS=$STATE_CRITICAL
+        VAL_STR="$NO_DATA"
+      else
+        VAL_STR="$CLIENTS clients"
+        # Only show the limit when CONFIG GET returned a number
+        if is_uint "$MAX_CLIENTS"; then VAL_STR="$VAL_STR (Max:$MAX_CLIENTS)"; fi
+        if [ "$CLIENTS" -ge "$CRIT" ]; then CURR_STATUS=$STATE_CRITICAL; elif [ "$CLIENTS" -ge "$WARN" ]; then CURR_STATUS=$STATE_WARNING; fi
+      fi
+      ;;
+  esac
+
+  add_result "$CURR_STATUS" "Port $PORT ($VAL_STR)"
+done
+
+# Build the final status line
+if [ "$EXIT_CODE" -eq "$STATE_OK" ]; then
+  echo "OK: All $INSTANCES_FOUND instance(s) are healthy ($CHECK_TYPE). $OK_LIST"
+  exit $STATE_OK
+fi
+
+if [ "$EXIT_CODE" -eq "$STATE_CRITICAL" ]; then
+  FINAL_MSG="CRITICAL: $CRIT_LIST"
+  if [ -n "$WARN_LIST" ]; then FINAL_MSG="$FINAL_MSG - WARNING: $WARN_LIST"; fi
+else
+  FINAL_MSG="WARNING: $WARN_LIST"
+fi
+
+echo "$FINAL_MSG"
+exit "$EXIT_CODE"
diff --git a/templates/nrpe.j2 b/templates/nrpe.j2
index 100cdd0..e957c39 100644
--- a/templates/nrpe.j2
+++ b/templates/nrpe.j2
@@ -103,4 +103,26 @@ command[check_k8s_pv_pvc]=/usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_pv_pvc
 command[check_k8s_replicasets]=/usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_replicasets
 command[check_k8s_pod_restarts]=/usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_pod_restarts
 {% endif %}
+{% endif %}
+
+{# Threshold-based redis commands are rendered only when BOTH of their thresholds are defined, so an undefined variable never reaches the command line. #}
+{% if nrpe_redis_memory_warning is defined or nrpe_redis_memory_critical is defined or nrpe_redis_persistence is defined %}
+# redis
+command[check_redis_health]=/usr/bin/sudo /usr/lib/nagios/plugins/check_redis_health -x ping
+{% if nrpe_redis_memory_warning is defined and nrpe_redis_memory_critical is defined %}
+command[check_redis_memory]=/usr/bin/sudo /usr/lib/nagios/plugins/check_redis_health -x memory -w {{ nrpe_redis_memory_warning }} -c {{ nrpe_redis_memory_critical }}
+{% endif %}
+command[check_redis_persistence]=/usr/bin/sudo /usr/lib/nagios/plugins/check_redis_health -x persistence
+{% if nrpe_redis_connected_clients_warning is defined and nrpe_redis_connected_clients_critical is defined %}
+command[check_redis_health_clients]=/usr/bin/sudo /usr/lib/nagios/plugins/check_redis_health -x clients -w {{ nrpe_redis_connected_clients_warning }} -c {{ nrpe_redis_connected_clients_critical }}
+{% endif %}
+{% if nrpe_redis_hitrate_warning is defined and nrpe_redis_hitrate_critical is defined %}
+command[check_redis_health_hitrate]=/usr/bin/sudo /usr/lib/nagios/plugins/check_redis_health -x hitrate -w {{ nrpe_redis_hitrate_warning }} -c {{ nrpe_redis_hitrate_critical }}
+{% endif %}
+{% if nrpe_redis_fragments_warning is defined and nrpe_redis_fragments_critical is defined %}
+command[check_redis_health_frag]=/usr/bin/sudo /usr/lib/nagios/plugins/check_redis_health -x frag -w {{ nrpe_redis_fragments_warning }} -c {{ nrpe_redis_fragments_critical }}
+{% endif %}
+{% if nrpe_redis_replication_lag_warning is defined and nrpe_redis_replication_lag_critical is defined %}
+command[check_redis_health_replication]=/usr/bin/sudo /usr/lib/nagios/plugins/check_redis_health -x replication -w {{ nrpe_redis_replication_lag_warning }} -c {{ nrpe_redis_replication_lag_critical }}
+{% endif %}
 {% endif %}
\ No newline at end of file
diff --git a/templates/nrpe.sudoers.j2 b/templates/nrpe.sudoers.j2
index 86e4cf0..62d0cb2 100644
--- a/templates/nrpe.sudoers.j2
+++ b/templates/nrpe.sudoers.j2
@@ -11,4 +11,5 @@ nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_jobs_cronjobs
 nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_pki_certs
 nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_pv_pvc
 nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_replicasets
-nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_pod_restarts
\ No newline at end of file
+nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_pod_restarts
+nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_redis_health