#!/bin/bash
#
# Nagios plugin to check for redis health metrics.
#
# Copyright (c) 2026, GitHub Copilot
#
# This script is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This script is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

# Exit statuses understood by Nagios (never reassigned by this script).
readonly STATE_OK=0 STATE_WARNING=1 STATE_CRITICAL=2 STATE_UNKNOWN=3

# --- Default values (overridden by the command-line options) ---
CHECK_TYPE=''   # check selected with -x
WARN=0          # warning threshold given with -w
CRIT=0          # critical threshold given with -c
AUTH=''         # Redis password given with -a

# --- Help ---
# Prints the usage text on stdout, one line per printf argument, then ends
# the script with the UNKNOWN status.
usage() {
    printf '%s\n' \
        "Usage: $0 -x [type] -w <warning> -c <critical> [-a <password>]" \
        '' \
        'Types de check (-x):' \
        '  ping        : Test de connexion simple (PONG)' \
        '  memory      : Utilisation mémoire en % (Ex: -w 80 -c 90)' \
        '  frag        : Ratio de fragmentation (Ex: -w 1.5 -c 2.0)' \
        "  hitrate     : Taux d'efficacité du cache en % (Ex: -w 80 -c 50)" \
        '  persistence : État des sauvegardes RDB/AOF' \
        '  replication : Statut Master/Slave et lag (Ex: -w 10 -c 60)' \
        '  clients     : Nombre de clients connectés (Ex: -w 200 -c 500)' \
        '' \
        'Optionnelle:' \
        '  -a : Mot de passe Redis (sera passé via REDISCLI_AUTH)'
    exit "$STATE_UNKNOWN"
}

# --- Argument parsing ---

# Succeeds when $2 is a well-formed threshold of kind $1.
#   $1 - "float" for a decimal number (optional leading '-'); any other
#        value means a plain integer (optional leading sign)
#   $2 - the threshold text to validate
is_valid_threshold() {
    local kind=$1 value=$2
    local int_re='^[-+]?[0-9]+$'
    local float_re='^-?([0-9]+([.][0-9]*)?|[.][0-9]+)$'
    if [ "$kind" = "float" ]; then
        [[ "$value" =~ $float_re ]]
    else
        [[ "$value" =~ $int_re ]]
    fi
}

while getopts "x:w:c:a:?" opt; do
    case "$opt" in
        x) CHECK_TYPE="$OPTARG" ;;
        w) WARN="$OPTARG" ;;
        c) CRIT="$OPTARG" ;;
        a) AUTH="$OPTARG" ;;
        *) usage ;;
    esac
done

# A check type is mandatory.
if [ -z "$CHECK_TYPE" ]; then usage; fi

# Validate the thresholds of the checks that compare against them. Without
# this, a non-numeric -w/-c makes the later numeric tests fail and the check
# silently stays OK. "frag" compares decimals; the other threshold-based
# checks use integer tests. Unknown check types are left to the main loop.
case "$CHECK_TYPE" in
    frag) THRESHOLD_KIND="float" ;;
    memory|hitrate|replication|clients) THRESHOLD_KIND="int" ;;
    *) THRESHOLD_KIND="" ;;
esac
if [ -n "$THRESHOLD_KIND" ]; then
    for THRESHOLD in "$WARN" "$CRIT"; do
        if ! is_valid_threshold "$THRESHOLD_KIND" "$THRESHOLD"; then
            echo "UNKNOWN: invalid threshold '$THRESHOLD' for check '$CHECK_TYPE' (-w/-c must be numeric)"
            exit "$STATE_UNKNOWN"
        fi
    done
fi

# Export the password for redis-cli (read from REDISCLI_AUTH).
if [ -n "$AUTH" ]; then
    export REDISCLI_AUTH="$AUTH"
fi

EXIT_CODE=$STATE_OK
INSTANCES_FOUND=0

# Lists collecting the per-instance results, one per status
CRIT_LIST=""
WARN_LIST=""
OK_LIST=""

# 1. Port detection

# Reads an `ss -tlnp` / `netstat -tlnp` listing on stdin and prints the
# unique ports (text after the last ':' of the 4th column) of the lines
# mentioning redis-server.
listening_redis_ports() {
    grep 'redis-server' | awk '{print $4}' | awk -F: '{print $NF}' | sort -u
}

# Try each method in turn and fall through when the previous one found
# nothing. ss/netstat may omit the owning process name when run without
# sufficient privileges, in which case the grep matches nothing even though
# Redis is listening; stopping there would report a false "no Redis" OK.
PORTS=""
if command -v ss &> /dev/null; then
    PORTS=$(ss -tlnp | listening_redis_ports)
fi
if [ -z "$PORTS" ] && command -v netstat &> /dev/null; then
    PORTS=$(netstat -tlnp | listening_redis_ports)
fi
if [ -z "$PORTS" ]; then
    # Last resort: 4-5 digit numbers following a ':' in the command lines
    # of redis-server processes.
    PORTS=$(ps -ef | grep 'redis-server' | grep -v grep | grep -oP '(?<=:)\d{4,5}' | sort -u)
fi

# No redis-server found by any method: reported as OK.
if [ -z "$PORTS" ]; then
    echo "OK: Aucun Redis détecté"
    exit $STATE_OK
fi

# 2. Run the selected check against every detected instance

# Prints the value of field $2 found in the redis-cli INFO output passed as
# $1 (lines of the form "field:value"). Carriage returns are stripped.
# Prints nothing when the field is absent.
info_field() {
    printf '%s\n' "$1" | grep -- "^$2:" | cut -d: -f2 | tr -d '\r'
}

# Succeeds when $1 >= $2, both being decimal numbers. Fails when either
# argument is not a number. Uses awk rather than bc: if bc is not installed
# the former comparison printed nothing and the frag check silently stayed OK.
float_ge() {
    local num_re='^-?([0-9]+([.][0-9]*)?|[.][0-9]+)$'
    [[ "$1" =~ $num_re && "$2" =~ $num_re ]] || return 1
    awk -v value="$1" -v limit="$2" 'BEGIN { exit !((value + 0) >= (limit + 0)) }'
}

# $PORTS is deliberately unquoted: it holds one port per line.
for PORT in $PORTS; do
    # Skip ports below 1024. (The former "except 6379" clause could never
    # apply, since 6379 is not below 1024.)
    if [ "$PORT" -lt 1024 ]; then continue; fi

    INSTANCES_FOUND=$((INSTANCES_FOUND + 1))

    # An instance that does not answer PING is reported as CRITICAL.
    if ! redis-cli -p "$PORT" PING 2>/dev/null | grep -q "PONG"; then
        MSG="Port $PORT (Erreur Connexion/Auth)"
        CRIT_LIST="${CRIT_LIST:+$CRIT_LIST, }$MSG"
        if [ "$EXIT_CODE" -lt "$STATE_CRITICAL" ]; then EXIT_CODE=$STATE_CRITICAL; fi
        continue
    fi

    CURR_STATUS=$STATE_OK
    VAL_STR=""

    case "$CHECK_TYPE" in
        ping)
            VAL_STR="PONG"
            ;;
        memory)
            # Used memory as a percentage of maxmemory; no limit means no alert.
            MEM_INFO=$(redis-cli -p "$PORT" info memory 2>/dev/null)
            USED=$(info_field "$MEM_INFO" used_memory)
            MAX=$(info_field "$MEM_INFO" maxmemory)
            USED_H=$(info_field "$MEM_INFO" used_memory_human)
            MAX_H=$(info_field "$MEM_INFO" maxmemory_human)
            if [ "$MAX" -gt 0 ]; then
                VALUE=$(( USED * 100 / MAX ))
                VAL_STR="${VALUE}% (Used:${USED_H}, Max:${MAX_H})"
                if [ "$VALUE" -ge "$CRIT" ]; then
                    CURR_STATUS=$STATE_CRITICAL
                elif [ "$VALUE" -ge "$WARN" ]; then
                    CURR_STATUS=$STATE_WARNING
                fi
            else
                VAL_STR="NoLimit (Used:${USED_H})"
            fi
            ;;
        frag)
            # Decimal ratio, hence the float comparison helper.
            FRAG=$(info_field "$(redis-cli -p "$PORT" info memory 2>/dev/null)" mem_fragmentation_ratio)
            VAL_STR="Ratio:$FRAG"
            if float_ge "$FRAG" "$CRIT"; then
                CURR_STATUS=$STATE_CRITICAL
            elif float_ge "$FRAG" "$WARN"; then
                CURR_STATUS=$STATE_WARNING
            fi
            ;;
        hitrate)
            # Lower is worse here: alert when the rate drops to the threshold.
            STATS=$(redis-cli -p "$PORT" info stats 2>/dev/null)
            HITS=$(info_field "$STATS" keyspace_hits)
            MISSES=$(info_field "$STATS" keyspace_misses)
            TOTAL=$((HITS + MISSES))
            # With no lookups at all the hit rate is reported as 100%.
            VALUE=100
            if [ "$TOTAL" -gt 0 ]; then VALUE=$(( HITS * 100 / TOTAL )); fi
            VAL_STR="${VALUE}% (Hits:$HITS, Misses:$MISSES)"
            if [ "$VALUE" -le "$CRIT" ]; then
                CURR_STATUS=$STATE_CRITICAL
            elif [ "$VALUE" -le "$WARN" ]; then
                CURR_STATUS=$STATE_WARNING
            fi
            ;;
        persistence)
            # CRITICAL unless both the last RDB save and the last AOF write are "ok".
            PERS=$(redis-cli -p "$PORT" info persistence 2>/dev/null)
            RDB=$(info_field "$PERS" rdb_last_bgsave_status)
            AOF=$(info_field "$PERS" aof_last_write_status)
            VAL_STR="RDB:$RDB, AOF:$AOF"
            if [ "$RDB" != "ok" ] || [ "$AOF" != "ok" ]; then CURR_STATUS=$STATE_CRITICAL; fi
            ;;
        replication)
            REPL=$(redis-cli -p "$PORT" info replication 2>/dev/null)
            ROLE=$(info_field "$REPL" role)
            if [ "$ROLE" == "slave" ]; then
                # Replica: the link must be up and the lag below the thresholds.
                LINK=$(info_field "$REPL" master_link_status)
                LAG=$(info_field "$REPL" master_last_io_seconds_ago)
                VAL_STR="Slave, Link:$LINK, Lag:${LAG}s"
                if [ "$LINK" != "up" ] || [ "$LAG" -ge "$CRIT" ]; then
                    CURR_STATUS=$STATE_CRITICAL
                elif [ "$LAG" -ge "$WARN" ]; then
                    CURR_STATUS=$STATE_WARNING
                fi
            else
                # Any other role is only reported, never alerted on.
                SLAVE_COUNT=$(info_field "$REPL" connected_slaves)
                VAL_STR="Master, Slaves:$SLAVE_COUNT"
            fi
            ;;
        clients)
            CLIENTS=$(info_field "$(redis-cli -p "$PORT" info clients 2>/dev/null)" connected_clients)
            # The configured maximum is shown for information only.
            MAX_CLIENTS=$(redis-cli -p "$PORT" config get maxclients 2>/dev/null | tail -n1)
            VAL_STR="$CLIENTS clients"
            if [ -n "$MAX_CLIENTS" ]; then VAL_STR="$VAL_STR (Max:$MAX_CLIENTS)"; fi
            if [ "$CLIENTS" -ge "$CRIT" ]; then
                CURR_STATUS=$STATE_CRITICAL
            elif [ "$CLIENTS" -ge "$WARN" ]; then
                CURR_STATUS=$STATE_WARNING
            fi
            ;;
        *) usage ;;
    esac

    # Store the result in the list matching its status and raise the global
    # exit code if this instance is worse than what was seen so far.
    MSG="Port $PORT ($VAL_STR)"
    case "$CURR_STATUS" in
        "$STATE_CRITICAL")
            CRIT_LIST="${CRIT_LIST:+$CRIT_LIST, }$MSG"
            if [ "$EXIT_CODE" -lt "$STATE_CRITICAL" ]; then EXIT_CODE=$STATE_CRITICAL; fi
            ;;
        "$STATE_WARNING")
            WARN_LIST="${WARN_LIST:+$WARN_LIST, }$MSG"
            if [ "$EXIT_CODE" -lt "$STATE_WARNING" ]; then EXIT_CODE=$STATE_WARNING; fi
            ;;
        "$STATE_OK")
            OK_LIST="${OK_LIST:+$OK_LIST, }$MSG"
            ;;
    esac
done

# Build the final plugin output line from the global exit code.
OUTPUT=""
case "$EXIT_CODE" in
    "$STATE_OK")
        # Everything healthy: list every instance and stop here.
        echo "OK: All $INSTANCES_FOUND instance(s) are healthy ($CHECK_TYPE). $OK_LIST"
        exit $STATE_OK
        ;;
    "$STATE_CRITICAL")
        # Critical instances first, then any warnings after a " - " separator
        # (the separator is only emitted when the critical list is non-empty).
        OUTPUT="CRITICAL: ${CRIT_LIST}"
        if [ -n "$WARN_LIST" ]; then
            OUTPUT="${OUTPUT}${CRIT_LIST:+ - }WARNING: ${WARN_LIST}"
        fi
        ;;
    "$STATE_WARNING")
        OUTPUT="WARNING: $WARN_LIST"
        ;;
esac

echo "$OUTPUT"
exit "$EXIT_CODE"
