add redis

This commit is contained in:
Ludovic Cartier
2026-01-02 14:26:58 +01:00
parent c7775b37e1
commit 9a6dfd6918
4 changed files with 243 additions and 1 deletions

View File

@@ -45,3 +45,14 @@ nrpe_mysql_longqueries_critical: 1200
nrpe_proc_age_warning: 400
nrpe_proc_age_critical: 600
# Redis thresholds passed as -w / -c to check_redis_health.
# Memory: used_memory as a percentage of maxmemory (higher is worse).
nrpe_redis_memory_warning: 80
nrpe_redis_memory_critical: 90
# Clients: number of connected clients (higher is worse).
nrpe_redis_connected_clients_warning: 200
nrpe_redis_connected_clients_critical: 500
# Hit rate: keyspace hits as a percentage of hits + misses (lower is worse,
# hence warning > critical).
nrpe_redis_hitrate_warning: 80
nrpe_redis_hitrate_critical: 50
# Fragmentation: mem_fragmentation_ratio, a decimal value (higher is worse).
nrpe_redis_fragments_warning: '1.5'
nrpe_redis_fragments_critical: '2.0'
# Replication: master_last_io_seconds_ago on a replica, in seconds.
nrpe_redis_replication_lag_warning: 10
nrpe_redis_replication_lag_critical: 60

211
files/nrpe/check_redis_health Executable file
View File

@@ -0,0 +1,211 @@
#!/bin/bash
#
# Nagios plugin to check for redis health metrics.
#
# Copyright (c) 2026, GitHub Copilot
#
# This script is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This script is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
# Nagios exit codes. Read-only so that a later typo or stray assignment cannot
# silently change what a status means.
readonly STATE_OK=0
readonly STATE_WARNING=1
readonly STATE_CRITICAL=2
readonly STATE_UNKNOWN=3

# --- Default values (overridden by the command-line options) ---
CHECK_TYPE=""   # check selected with -x
WARN=0          # warning threshold (-w)
CRIT=0          # critical threshold (-c)
AUTH=""         # Redis password (-a); when empty, REDISCLI_AUTH is not exported
# --- Help ---
#######################################
# Print the command-line help and terminate the plugin.
# Globals:   STATE_UNKNOWN (read)
# Arguments: none
# Outputs:   the usage text on stdout
# Returns:   never; exits with STATE_UNKNOWN
#######################################
usage() {
  local -a help_lines=(
    "Usage: $0 -x [type] -w <warning> -c <critical> [-a <password>]"
    "Types de check (-x):"
    "ping : Test de connexion simple (PONG)"
    "memory : Utilisation mémoire en % (Ex: -w 80 -c 90)"
    "frag : Ratio de fragmentation (Ex: -w 1.5 -c 2.0)"
    "hitrate : Taux d'efficacité du cache en % (Ex: -w 80 -c 50)"
    "persistence : État des sauvegardes RDB/AOF"
    "replication : Statut Master/Slave et lag (Ex: -w 10 -c 60)"
    "clients : Nombre de clients connectés (Ex: -w 200 -c 500)"
    "Optionnelle:"
    "-a : Mot de passe Redis (sera passé via REDISCLI_AUTH)"
  )
  # One array element per output line.
  printf '%s\n' "${help_lines[@]}"
  exit "$STATE_UNKNOWN"
}
# --- Argument parsing ---
#######################################
# Parse and validate the command-line options.
#
# The check type is validated here, before any Redis instance is looked up, so
# a typo in -x yields UNKNOWN even on hosts without Redis. Thresholds are
# validated for the checks that use them: a non-numeric value would otherwise
# make the later integer comparisons fail and the instance be reported OK.
#
# Globals:   CHECK_TYPE, WARN, CRIT, AUTH (written);
#            REDISCLI_AUTH (exported when -a is given)
# Arguments: the script's positional parameters ("$@")
# Returns:   only on valid input; otherwise usage() terminates the script
#######################################
parse_args() {
  local opt OPTIND=1
  local int_re='^[0-9]+$'
  local dec_re='^([0-9]+([.][0-9]*)?|[.][0-9]+)$'
  local threshold_re=''

  while getopts "x:w:c:a:h" opt; do
    case "$opt" in
      x) CHECK_TYPE="$OPTARG" ;;
      w) WARN="$OPTARG" ;;
      c) CRIT="$OPTARG" ;;
      a) AUTH="$OPTARG" ;;
      *) usage ;;   # -h, unknown option or missing argument
    esac
  done

  # Reject a missing/unknown check type and pick the threshold syntax.
  case "$CHECK_TYPE" in
    ping|persistence) ;;                                   # no thresholds used
    memory|hitrate|replication|clients) threshold_re=$int_re ;;
    frag) threshold_re=$dec_re ;;                          # ratio, e.g. 1.5
    *) usage ;;
  esac

  if [[ -n "$threshold_re" ]]; then
    [[ "$WARN" =~ $threshold_re && "$CRIT" =~ $threshold_re ]] || usage
  fi

  # Hand the password to redis-cli through its environment variable.
  # NOTE(review): a password given with -a is still part of this script's own
  # command line — confirm that is acceptable on the monitored hosts.
  if [[ -n "$AUTH" ]]; then
    export REDISCLI_AUTH="$AUTH"
  fi
}
parse_args "$@"
# Worst status seen so far and number of instances examined.
EXIT_CODE=$STATE_OK
INSTANCES_FOUND=0
# Result lists, one per severity, filled by the check loop.
CRIT_LIST=""
WARN_LIST=""
OK_LIST=""

# 1. Port detection

# Filter for `ss -tlnp` / `netstat -tlnp` output read on stdin: for every line
# mentioning redis-server, print the text after the last ':' of column 4
# (the local address column).
redis_listen_ports() {
  awk '/redis-server/ { n = split($4, part, ":"); print part[n] }'
}

if command -v ss &> /dev/null; then
  PORTS=$(ss -tlnp | redis_listen_ports)
elif command -v netstat &> /dev/null; then
  PORTS=$(netstat -tlnp | redis_listen_ports)
else
  # Last resort: take ":<4-5 digits>" from the process list. The [r] bracket
  # keeps grep from matching its own command line, and -E replaces the former
  # -P look-behind so no PCRE-enabled grep is required.
  PORTS=$(ps -ef | grep '[r]edis-server' | grep -oE ':[0-9]{4,5}' | tr -d ':')
fi

# Keep purely numeric tokens only (the check loop compares ports as integers)
# and drop duplicates such as the same port bound on IPv4 and IPv6.
PORTS=$(printf '%s\n' "$PORTS" | grep -E '^[0-9]+$' | sort -u)

if [ -z "$PORTS" ]; then
  echo "OK: Aucun Redis détecté"
  exit $STATE_OK
fi

# Without redis-cli every probe below would fail and be reported as a
# connection/auth error; report the real cause instead.
if ! command -v redis-cli &> /dev/null; then
  echo "UNKNOWN: redis-cli introuvable"
  exit "$STATE_UNKNOWN"
fi
# 2. Run the selected check on every detected instance

# Print the value of field $2 from the INFO-style "field:value" text in $1.
# Carriage returns are stripped; the field name must start the line, and only
# the first matching line is used.
info_field() {
  printf '%s\n' "$1" | tr -d '\r' \
    | awk -v key="$2" 'index($0, key ":") == 1 { print substr($0, length(key) + 2); exit }'
}

# Succeed when $1 is a non-negative integer.
is_uint() { [[ "$1" =~ ^[0-9]+$ ]]; }

# Succeed when $1 is a non-negative decimal number such as 1 or 1.03.
is_decimal() { [[ "$1" =~ ^[0-9]+([.][0-9]+)?$ ]]; }

# Succeed when number $1 >= number $2. awk does the floating-point comparison,
# so the frag check no longer depends on bc being installed.
decimal_ge() { awk -v a="$1" -v b="$2" 'BEGIN { exit !(a + 0 >= b + 0) }'; }

# Set CURR_STATUS from integer $1 against WARN/CRIT when larger is worse.
grade_high() {
  if [ "$1" -ge "$CRIT" ]; then
    CURR_STATUS=$STATE_CRITICAL
  elif [ "$1" -ge "$WARN" ]; then
    CURR_STATUS=$STATE_WARNING
  else
    CURR_STATUS=$STATE_OK
  fi
}

# Set CURR_STATUS from integer $1 against WARN/CRIT when smaller is worse.
grade_low() {
  if [ "$1" -le "$CRIT" ]; then
    CURR_STATUS=$STATE_CRITICAL
  elif [ "$1" -le "$WARN" ]; then
    CURR_STATUS=$STATE_WARNING
  else
    CURR_STATUS=$STATE_OK
  fi
}

# Flag the current instance CRITICAL because the metric for CHECK_TYPE could
# not be read. The persistence check already treats missing values as
# CRITICAL; the other checks now do the same instead of reporting OK.
mark_unreadable() {
  VAL_STR="Erreur lecture $CHECK_TYPE"
  CURR_STATUS=$STATE_CRITICAL
}

# Append message $2 to the comma-separated list stored in the global named $1.
append_to_list() {
  local list_name=$1 msg=$2
  if [[ -z "${!list_name}" ]]; then
    printf -v "$list_name" '%s' "$msg"
  else
    printf -v "$list_name" '%s, %s' "${!list_name}" "$msg"
  fi
}

# Store message $2 in the list matching status $1 and raise EXIT_CODE when
# that status is worse than anything seen so far.
record_result() {
  local status=$1 msg=$2
  case "$status" in
    "$STATE_CRITICAL") append_to_list CRIT_LIST "$msg" ;;
    "$STATE_WARNING") append_to_list WARN_LIST "$msg" ;;
    "$STATE_OK") append_to_list OK_LIST "$msg" ;;
  esac
  if [ "$status" -gt "$EXIT_CODE" ]; then
    EXIT_CODE=$status
  fi
}

# PORTS is a whitespace-separated list: word splitting is intended here.
for PORT in $PORTS; do
  # Skip anything that is not a port number, and privileged ports (< 1024).
  # (The former "except 6379" clause could never apply: 6379 is not < 1024.)
  is_uint "$PORT" || continue
  if [ "$PORT" -lt 1024 ]; then continue; fi
  ((INSTANCES_FOUND++))

  # Make sure the instance answers before querying any metric.
  if ! redis-cli -p "$PORT" PING 2>/dev/null | grep -q "PONG"; then
    record_result "$STATE_CRITICAL" "Port $PORT (Erreur Connexion/Auth)"
    continue
  fi

  CURR_STATUS=$STATE_OK
  VAL_STR=""
  case "$CHECK_TYPE" in
    ping)
      VAL_STR="PONG"
      ;;
    memory)
      MEM_INFO=$(redis-cli -p "$PORT" info memory 2>/dev/null)
      USED=$(info_field "$MEM_INFO" used_memory)
      MAX=$(info_field "$MEM_INFO" maxmemory)
      USED_H=$(info_field "$MEM_INFO" used_memory_human)
      MAX_H=$(info_field "$MEM_INFO" maxmemory_human)
      if ! is_uint "$USED" || ! is_uint "$MAX"; then
        mark_unreadable
      elif [ "$MAX" -gt 0 ]; then
        VALUE=$(( USED * 100 / MAX ))
        VAL_STR="${VALUE}% (Used:${USED_H}, Max:${MAX_H})"
        grade_high "$VALUE"
      else
        # maxmemory 0: no limit configured, so there is no percentage to grade.
        VAL_STR="NoLimit (Used:${USED_H})"
      fi
      ;;
    frag)
      FRAG=$(info_field "$(redis-cli -p "$PORT" info memory 2>/dev/null)" mem_fragmentation_ratio)
      if ! is_decimal "$FRAG"; then
        mark_unreadable
      else
        VAL_STR="Ratio:$FRAG"
        if decimal_ge "$FRAG" "$CRIT"; then
          CURR_STATUS=$STATE_CRITICAL
        elif decimal_ge "$FRAG" "$WARN"; then
          CURR_STATUS=$STATE_WARNING
        fi
      fi
      ;;
    hitrate)
      STATS=$(redis-cli -p "$PORT" info stats 2>/dev/null)
      HITS=$(info_field "$STATS" keyspace_hits)
      MISSES=$(info_field "$STATS" keyspace_misses)
      if ! is_uint "$HITS" || ! is_uint "$MISSES"; then
        mark_unreadable
      else
        TOTAL=$(( HITS + MISSES ))
        # No lookups yet counts as a perfect hit rate.
        VALUE=100
        if [ "$TOTAL" -gt 0 ]; then VALUE=$(( HITS * 100 / TOTAL )); fi
        VAL_STR="${VALUE}% (Hits:$HITS, Misses:$MISSES)"
        grade_low "$VALUE"
      fi
      ;;
    persistence)
      PERS=$(redis-cli -p "$PORT" info persistence 2>/dev/null)
      RDB=$(info_field "$PERS" rdb_last_bgsave_status)
      AOF=$(info_field "$PERS" aof_last_write_status)
      VAL_STR="RDB:$RDB, AOF:$AOF"
      if [ "$RDB" != "ok" ] || [ "$AOF" != "ok" ]; then
        CURR_STATUS=$STATE_CRITICAL
      fi
      ;;
    replication)
      REPL=$(redis-cli -p "$PORT" info replication 2>/dev/null)
      ROLE=$(info_field "$REPL" role)
      if [ "$ROLE" == "slave" ]; then
        LINK=$(info_field "$REPL" master_link_status)
        LAG=$(info_field "$REPL" master_last_io_seconds_ago)
        VAL_STR="Slave, Link:$LINK, Lag:${LAG}s"
        # A broken link or an unusable lag value is critical; otherwise the
        # lag is graded against the thresholds.
        if [ "$LINK" != "up" ] || ! is_uint "$LAG"; then
          CURR_STATUS=$STATE_CRITICAL
        else
          grade_high "$LAG"
        fi
      else
        SLAVE_COUNT=$(info_field "$REPL" connected_slaves)
        VAL_STR="Master, Slaves:$SLAVE_COUNT"
      fi
      ;;
    clients)
      CLIENTS=$(info_field "$(redis-cli -p "$PORT" info clients 2>/dev/null)" connected_clients)
      MAX_CLIENTS=$(redis-cli -p "$PORT" config get maxclients 2>/dev/null | tail -n1)
      if ! is_uint "$CLIENTS"; then
        mark_unreadable
      else
        VAL_STR="$CLIENTS clients"
        if [ -n "$MAX_CLIENTS" ]; then VAL_STR="$VAL_STR (Max:$MAX_CLIENTS)"; fi
        grade_high "$CLIENTS"
      fi
      ;;
    *) usage ;;
  esac

  # Store the result for this instance.
  record_result "$CURR_STATUS" "Port $PORT ($VAL_STR)"
done
# Build the plugin's single output line and exit with the aggregated status.

# Everything healthy: list every instance on one OK line.
if [ "$EXIT_CODE" -eq "$STATE_OK" ]; then
  printf '%s\n' "OK: All $INSTANCES_FOUND instance(s) are healthy ($CHECK_TYPE). $OK_LIST"
  exit $STATE_OK
fi

# Otherwise report the failing instances, critical ones first. Healthy
# instances are not listed in this case.
summary=""
if [ "$EXIT_CODE" -eq "$STATE_CRITICAL" ]; then
  summary="CRITICAL: ${CRIT_LIST}"
  if [ -n "$WARN_LIST" ]; then
    # The " - " separator is only needed when critical entries precede it.
    summary+="${CRIT_LIST:+ - }WARNING: ${WARN_LIST}"
  fi
elif [ "$EXIT_CODE" -eq "$STATE_WARNING" ]; then
  summary="WARNING: $WARN_LIST"
fi

printf '%s\n' "$summary"
exit "$EXIT_CODE"

View File

@@ -103,4 +103,23 @@ command[check_k8s_pv_pvc]=/usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_pv_pvc
command[check_k8s_replicasets]=/usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_replicasets
command[check_k8s_pod_restarts]=/usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_pod_restarts
{% endif %}
{% endif %}
{# Redis checks. The section is emitted when any of the variables tested below
   is defined. Each thresholded command interpolates BOTH its warning and its
   critical value, so it is only emitted when both are defined. #}
{% if nrpe_redis_memory_warning is defined or nrpe_redis_memory_critical is defined or nrpe_redis_persistence is defined %}
# redis
command[check_redis_health]=/usr/bin/sudo /usr/lib/nagios/plugins/check_redis_health -x ping
{% if nrpe_redis_memory_warning is defined and nrpe_redis_memory_critical is defined %}
command[check_redis_memory]=/usr/bin/sudo /usr/lib/nagios/plugins/check_redis_health -x memory -w {{ nrpe_redis_memory_warning }} -c {{ nrpe_redis_memory_critical }}
{% endif %}
command[check_redis_persistence]=/usr/bin/sudo /usr/lib/nagios/plugins/check_redis_health -x persistence
{% if nrpe_redis_connected_clients_warning is defined and nrpe_redis_connected_clients_critical is defined %}
command[check_redis_health_clients]=/usr/bin/sudo /usr/lib/nagios/plugins/check_redis_health -x clients -w {{ nrpe_redis_connected_clients_warning }} -c {{ nrpe_redis_connected_clients_critical }}
{% endif %}
{% if nrpe_redis_hitrate_warning is defined and nrpe_redis_hitrate_critical is defined %}
command[check_redis_health_hitrate]=/usr/bin/sudo /usr/lib/nagios/plugins/check_redis_health -x hitrate -w {{ nrpe_redis_hitrate_warning }} -c {{ nrpe_redis_hitrate_critical }}
{% endif %}
{% if nrpe_redis_fragments_warning is defined and nrpe_redis_fragments_critical is defined %}
command[check_redis_health_frag]=/usr/bin/sudo /usr/lib/nagios/plugins/check_redis_health -x frag -w {{ nrpe_redis_fragments_warning }} -c {{ nrpe_redis_fragments_critical }}
{% endif %}
{% if nrpe_redis_replication_lag_warning is defined and nrpe_redis_replication_lag_critical is defined %}
command[check_redis_health_replication]=/usr/bin/sudo /usr/lib/nagios/plugins/check_redis_health -x replication -w {{ nrpe_redis_replication_lag_warning }} -c {{ nrpe_redis_replication_lag_critical }}
{% endif %}
{% endif %}

View File

@@ -11,4 +11,5 @@ nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_jobs_cronjobs
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_pki_certs
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_pv_pvc
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_replicasets
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_pod_restarts
# Redis health plugin: the NRPE redis commands invoke it through /usr/bin/sudo.
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_redis_health