Files
nrpe/files/nrpe/check_etcd_health
2025-12-31 15:17:51 +01:00

230 lines
7.8 KiB
Bash
Executable File

#!/usr/bin/env bash
# check_etcd_health
# Verifie la santé d'etcd et (optionnel) la creation/verification des snapshots.
# Retourne : 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
#
# Usage example:
# sudo /usr/lib/nagios/plugins/check_etcd_health \
# --endpoints "https://192.168.1.41:2379,https://192.168.1.42:2379" \
# --cacert /etc/ssl/etcd/ssl/ca.pem --cert /etc/ssl/etcd/ssl/admin.pem --key /etc/ssl/etcd/ssl/admin-key.pem \
# --test-snapshot --snapshot-dir /var/backups/etcd --snapshot-max-age 24
#
# Notes:
# - Par securite, execute ce script sur un master (ou via NRPE/SSH) avec un utilisateur ayant acces aux clefs.
# - --snapshot-max-age en heures (defaut 24). Mettre 0 pour desactiver la verification d'age.
# - --test-snapshot creera un snapshot temporaire pour valider la creation + verification via `etcdctl snapshot status`.
# - Si --keep-snapshot-on-failure est active, le snapshot temporaire sera conserve en cas d'erreur pour debug.
# Path of the etcdctl binary; a non-empty ETCDCTL from the environment wins.
: "${ETCDCTL:=/usr/local/bin/etcdctl}"
# Write the synopsis and the option list to stdout, one entry per line.
# Takes no arguments; $0 is interpolated into the first line.
print_usage() {
  printf '%s\n' \
    "Usage: $0 --endpoints ENDPOINTS --cacert CA --cert CERT --key KEY [options]" \
    'Options:' \
    '--warn-db-mb N avertissement si DB >= N MB (default 1024)' \
    '--crit-db-mb M critique si DB >= M MB (default 1800)' \
    '--timeout SECS etcdctl timeout (default 10)' \
    '--test-snapshot tenter de creer un snapshot temporaire et verifier son status' \
    '--snapshot-dir DIR repertoire pour snapshots temporaires (default /var/backups/etcd)' \
    '--keep-snapshot-on-failure conserver le snapshot temporaire si creation echoue (default false)' \
    "--snapshot-max-age HRS verifier qu'il existe un snapshot plus recent que HRS heures (default 24). Mettre 0 pour desactiver." \
    '-h, --help affiche cette aide'
}
# Defaults
# Tunables keep a non-empty value inherited from the environment and fall
# back to the built-in default otherwise.
: "${WARN_DB_MB:=1024}"
: "${CRIT_DB_MB:=1800}"
: "${TIMEOUT:=10}"
: "${SNAPSHOT_DIR:=/var/backups/etcd}"
: "${SNAPSHOT_MAX_AGE_HOURS:=24}"
# Boolean switches always start disabled; only command-line flags enable them.
TEST_SNAPSHOT=0
KEEP_SNAPSHOT_ON_FAILURE=0
# Parse args

# Abort with UNKNOWN (3) when a value-taking option is the last word on the
# command line.
#   $1 - option name (for the message)
#   $2 - number of arguments still unparsed, including the option itself
require_option_value() {
  if (( $2 < 2 )); then
    echo "UNKNOWN - option $1 requires a value"
    print_usage
    exit 3
  fi
}

# Walk the given argument list and store every recognised option in its
# global variable. -h/--help and unknown options print the usage text and
# exit 3 (UNKNOWN).
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      # `shift 2` with only one argument left fails WITHOUT shifting, which
      # would spin this loop forever; hence the explicit guard on each option.
      --endpoints) require_option_value "$1" "$#"; ENDPOINTS="$2"; shift 2 ;;
      --cacert) require_option_value "$1" "$#"; CACERT="$2"; shift 2 ;;
      --cert) require_option_value "$1" "$#"; CERT="$2"; shift 2 ;;
      --key) require_option_value "$1" "$#"; KEY="$2"; shift 2 ;;
      --warn-db-mb) require_option_value "$1" "$#"; WARN_DB_MB="$2"; shift 2 ;;
      --crit-db-mb) require_option_value "$1" "$#"; CRIT_DB_MB="$2"; shift 2 ;;
      --timeout) require_option_value "$1" "$#"; TIMEOUT="$2"; shift 2 ;;
      --test-snapshot) TEST_SNAPSHOT=1; shift 1 ;;
      --snapshot-dir) require_option_value "$1" "$#"; SNAPSHOT_DIR="$2"; shift 2 ;;
      --keep-snapshot-on-failure) KEEP_SNAPSHOT_ON_FAILURE=1; shift 1 ;;
      --snapshot-max-age) require_option_value "$1" "$#"; SNAPSHOT_MAX_AGE_HOURS="$2"; shift 2 ;;
      -h|--help) print_usage; exit 3 ;;
      *) echo "Unknown arg: $1"; print_usage; exit 3 ;;
    esac
  done
}
parse_args "$@"
# Allow env fallback (if ETCDCTL_* env vars set)
ENDPOINTS=${ENDPOINTS:-${ETCDCTL_ENDPOINTS:-}}
CACERT=${CACERT:-${ETCDCTL_CACERT:-}}
CERT=${CERT:-${ETCDCTL_CERT:-}}
KEY=${KEY:-${ETCDCTL_KEY:-}}
# All four connection settings are mandatory.
if [[ -z "${ENDPOINTS:-}" || -z "${CACERT:-}" || -z "${CERT:-}" || -z "${KEY:-}" ]]; then
  echo "UNKNOWN - missing required args/certs"
  print_usage
  exit 3
fi
# The thresholds are later used inside (( )) arithmetic, where a non-numeric
# word silently evaluates as 0 and a decimal or leading-zero value is a syntax
# error or an octal literal. Reject such values here with a clear UNKNOWN.
uint_re='^(0|[1-9][0-9]*)$'
for numeric_opt in WARN_DB_MB CRIT_DB_MB; do
  if [[ ! "${!numeric_opt:-}" =~ $uint_re ]]; then
    echo "UNKNOWN - $numeric_opt must be a non-negative integer (got '${!numeric_opt:-}')"
    exit 3
  fi
done
# An empty max-age stays accepted: the age check treats it as "disabled".
if [[ -n "${SNAPSHOT_MAX_AGE_HOURS:-}" && ! "$SNAPSHOT_MAX_AGE_HOURS" =~ $uint_re ]]; then
  echo "UNKNOWN - SNAPSHOT_MAX_AGE_HOURS must be a non-negative integer (got '$SNAPSHOT_MAX_AGE_HOURS')"
  exit 3
fi
# TIMEOUT is only interpolated into "--command-timeout=<N>s", so decimals are fine.
timeout_re='^[0-9]+(\.[0-9]+)?$'
if [[ ! "${TIMEOUT:-}" =~ $timeout_re ]]; then
  echo "UNKNOWN - TIMEOUT must be a number of seconds (got '${TIMEOUT:-}')"
  exit 3
fi
# The binary must exist and be executable by the current user.
if [[ ! -x "$ETCDCTL" ]]; then
  echo "UNKNOWN - etcdctl not found at $ETCDCTL"
  exit 3
fi
# Unreadable TLS material is reported as CRITICAL together with the paths tried.
if [[ ! -r "$CACERT" || ! -r "$CERT" || ! -r "$KEY" ]]; then
  echo "CRITICAL - cannot read certificate files (permissions?)"
  echo "CACERT=$CACERT CERT=$CERT KEY=$KEY"
  exit 2
fi
# 1) endpoint status check
# Export ETCDCTL_API=3 so every etcdctl child process inherits it.
ETCDCTL_API=3
export ETCDCTL_API
# Capture stdout and stderr together; any non-zero exit is reported as CRITICAL
# with the captured text.
if ! OUT=$(
  "$ETCDCTL" \
    --command-timeout="${TIMEOUT}s" \
    --endpoints="${ENDPOINTS}" \
    --cacert="$CACERT" \
    --cert="$CERT" \
    --key="$KEY" \
    endpoint status 2>&1
); then
  echo "CRITICAL - etcdctl endpoint status failed: $OUT"
  exit 2
fi
leaders=0
total=0
max_db_mb=0

# Convert a human-readable size such as "25 kB", "2.1 GB" or "512" into whole
# megabytes, printed on stdout. Anything whose first word is not a plain
# decimal number yields 0. A missing or unrecognised unit is treated as MB.
#   $1 - size text (surrounding whitespace is ignored)
dbsize_to_mb() {
  local IFS=$' \t\n'
  local num="" unit=""
  read -r num unit _ <<<"$1"
  local number_re='^[0-9]+(\.[0-9]+)?$'
  if [[ ! "$num" =~ $number_re ]]; then
    printf '0\n'
    return 0
  fi
  # The number may carry a fractional part, which bash integer arithmetic
  # rejects, so the conversion is done in awk. LC_ALL=C keeps "." as the
  # decimal separator whatever the caller's locale is.
  LC_ALL=C awk -v n="$num" -v u="${unit^^}" 'BEGIN {
    if (u == "B") v = int(n / 1048576)
    else if (u == "KB") v = int(n / 1024)
    else if (u == "GB") v = n * 1024
    else if (u == "TB") v = n * 1048576
    else v = n
    printf "%.0f\n", v
  }'
}

# Parse comma-separated `etcdctl endpoint status` lines and fill the globals:
#   total     - number of non-empty lines seen
#   leaders   - number of lines whose 5th field is "true" (case-insensitive)
#   max_db_mb - largest 4th-field size, in MB
#   $1 - captured etcdctl output
parse_endpoint_status() {
  local line dbsize isLeader db_mb
  leaders=0
  total=0
  max_db_mb=0
  while IFS= read -r line; do
    line=${line//$'\r'/}
    [[ -z "$line" ]] && continue
    total=$((total + 1))
    dbsize=""
    isLeader=""
    # Fields: endpoint, id, version, db size, is-leader, then the rest.
    IFS=',' read -r _ _ _ dbsize isLeader _ <<<"$line"
    isLeader=${isLeader// /}
    if [[ "${isLeader,,}" == "true" ]]; then
      leaders=$((leaders + 1))
    fi
    db_mb=$(dbsize_to_mb "$dbsize")
    db_mb=${db_mb:-0}
    if (( db_mb > max_db_mb )); then
      max_db_mb=$db_mb
    fi
  done <<<"$1"
}
parse_endpoint_status "${OUT:-}"
# Evaluate the parsed endpoint data. Rules are tried in order; the first one
# that matches prints its status line and ends the script with that exit code.

# No data at all.
(( total == 0 )) && {
  echo "CRITICAL - no endpoints returned by etcdctl"
  exit 2
}

# Exactly one leader is expected across the listed endpoints.
(( leaders == 0 )) && {
  echo "CRITICAL - no leader found among $total endpoints; detail: $OUT"
  exit 2
}
(( leaders > 1 )) && {
  echo "WARNING - multiple leaders detected: $leaders (possible split-brain); detail: $OUT"
  exit 1
}

# Database size: the critical threshold is tested before the warning one.
(( max_db_mb >= CRIT_DB_MB )) && {
  echo "CRITICAL - etcd DB size ${max_db_mb}MB >= ${CRIT_DB_MB}MB"
  exit 2
}
(( max_db_mb >= WARN_DB_MB )) && {
  echo "WARNING - etcd DB size ${max_db_mb}MB >= ${WARN_DB_MB}MB"
  exit 1
}
# 2) Verification of recent snapshot files (optional, default 24h)
SNAP_CHECK_MSG=""

# Print the most recently modified regular file matching snapshot-*.db in the
# given directory; print nothing when there is none.
#   $1 - directory to scan (not recursive)
latest_snapshot_in() {
  local dir=$1 candidate newest=""
  # Glob instead of parsing `ls` output; the -f test also discards the
  # literal pattern that is left behind when nothing matches.
  for candidate in "$dir"/snapshot-*.db; do
    [[ -f "$candidate" ]] || continue
    if [[ -z "$newest" || "$candidate" -nt "$newest" ]]; then
      newest=$candidate
    fi
  done
  printf '%s' "$newest"
}

# Succeed when an age in seconds is strictly beyond a limit given in hours.
# Comparing seconds avoids truncating the age to whole hours first, which
# used to let a snapshot up to 59 minutes past the limit pass.
#   $1 - age in seconds
#   $2 - limit in hours
snapshot_age_exceeds() {
  local age_seconds=$1 max_hours=$2
  (( age_seconds > max_hours * 3600 ))
}

# An empty or zero SNAPSHOT_MAX_AGE_HOURS disables the check.
if [[ -n "${SNAPSHOT_MAX_AGE_HOURS:-}" ]] && (( SNAPSHOT_MAX_AGE_HOURS > 0 )); then
  mkdir -p "$SNAPSHOT_DIR" 2>/dev/null || {
    echo "CRITICAL - cannot create/access snapshot dir $SNAPSHOT_DIR"
    exit 2
  }
  latest_snapshot=$(latest_snapshot_in "$SNAPSHOT_DIR")
  if [[ -z "$latest_snapshot" ]]; then
    SNAP_CHECK_MSG="no snapshot files found in $SNAPSHOT_DIR"
    echo "CRITICAL - $SNAP_CHECK_MSG (no snapshots)"
    exit 2
  fi
  now_s=$(date +%s)
  # Without a usable mtime the age cannot be judged, so report UNKNOWN rather
  # than computing a meaningless age from an empty value.
  snap_mtime_s=$(stat -c %Y -- "$latest_snapshot" 2>/dev/null) || {
    echo "UNKNOWN - cannot stat snapshot $latest_snapshot"
    exit 3
  }
  age_s=$(( now_s - snap_mtime_s ))
  # A modification time in the future (clock skew) counts as age 0.
  if (( age_s < 0 )); then
    age_s=0
  fi
  age_h=$(( age_s / 3600 ))
  age_m=$(( (age_s % 3600) / 60 ))
  # Minutes are shown so that a "24h59m old (> 24h)" verdict is self-explanatory.
  if snapshot_age_exceeds "$age_s" "$SNAPSHOT_MAX_AGE_HOURS"; then
    SNAP_CHECK_MSG="latest snapshot $latest_snapshot is ${age_h}h${age_m}m old (> ${SNAPSHOT_MAX_AGE_HOURS}h)"
    echo "CRITICAL - $SNAP_CHECK_MSG"
    exit 2
  fi
  SNAP_CHECK_MSG="latest snapshot $latest_snapshot is ${age_h}h${age_m}m old (<= ${SNAPSHOT_MAX_AGE_HOURS}h)"
fi
# 3) Optional: test snapshot creation and status
SNAP_TEST_MSG=""

# Join multi-line command output into a single line and squeeze runs of
# spaces, so it can be embedded in the one-line plugin message.
#   $1 - text to flatten
flatten_status_output() {
  printf '%s' "$1" | tr '\n' ' ' | tr -s ' '
}

# EXIT handler for the snapshot test. Removes the temporary snapshot unless
# the script is failing AND --keep-snapshot-on-failure was given, in which
# case the file is left in place and its path is printed. Returns the exit
# status that was current when the handler started.
cleanup() {
  local rc=$?
  if [[ $rc -ne 0 && $KEEP_SNAPSHOT_ON_FAILURE -ne 0 ]]; then
    echo "NOTICE - snapshot kept at $SNAPFILE for debugging"
  else
    rm -f -- "$SNAPFILE" 2>/dev/null || true
  fi
  return "$rc"
}

if (( TEST_SNAPSHOT == 1 )); then
  mkdir -p "$SNAPSHOT_DIR" 2>/dev/null || {
    echo "CRITICAL - cannot create/access snapshot dir $SNAPSHOT_DIR"
    exit 2
  }
  if [[ ! -w "$SNAPSHOT_DIR" ]]; then
    echo "CRITICAL - snapshot dir not writable: $SNAPSHOT_DIR"
    exit 2
  fi
  # The temporary file must NOT match "snapshot-*.db": that pattern is what
  # the snapshot age check looks for, so a kept or leftover test file would
  # otherwise be mistaken for a fresh backup on a later run.
  SNAPFILE=$(mktemp "${SNAPSHOT_DIR}/nrpe-test-snapshot-XXXXXX.db") || {
    echo "CRITICAL - mktemp failed in $SNAPSHOT_DIR"
    exit 2
  }
  trap 'cleanup' EXIT
  SAVE_OUT=$("$ETCDCTL" --command-timeout="${TIMEOUT}s" --endpoints="${ENDPOINTS}" --cacert="$CACERT" --cert="$CERT" --key="$KEY" snapshot save "$SNAPFILE" 2>&1) || {
    echo "CRITICAL - snapshot save failed: $SAVE_OUT"
    exit 2
  }
  STATUS_OUT=$("$ETCDCTL" snapshot status "$SNAPFILE" 2>&1) || {
    echo "CRITICAL - snapshot status failed: $STATUS_OUT"
    exit 2
  }
  # If we reach here, creation+status ok
  SNAP_TEST_MSG="snapshot test ok: $SNAPFILE ; status: $(flatten_status_output "$STATUS_OUT")"
  # cleanup will remove the snapshot (unless KEEP_SNAPSHOT_ON_FAILURE and rc != 0)
fi
# Compose final message
# Start from the endpoint summary, then append each snapshot note that was
# produced, separated by " ; ". Empty notes add nothing.
MSG="OK - $total endpoints checked, leaders=$leaders, max_db=${max_db_mb}MB"
MSG+="${SNAP_CHECK_MSG:+ ; ${SNAP_CHECK_MSG}}"
MSG+="${SNAP_TEST_MSG:+ ; ${SNAP_TEST_MSG}}"
echo "$MSG"
exit 0