#!/usr/bin/env bash
# check_etcd_health
# Checks the health of etcd and (optionally) the creation/verification of snapshots.
# Returns: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
#
# Usage example:
#   sudo /usr/lib/nagios/plugins/check_etcd_health \
#     --endpoints "https://192.168.1.41:2379,https://192.168.1.42:2379" \
#     --cacert /etc/ssl/etcd/ssl/ca.pem --cert /etc/ssl/etcd/ssl/admin.pem --key /etc/ssl/etcd/ssl/admin-key.pem \
#     --test-snapshot --snapshot-dir /var/backups/etcd --snapshot-max-age 24
#
# Notes:
#   - For safety, run this script on a master (or via NRPE/SSH) as a user that has access to the keys.
#   - --snapshot-max-age is given in hours (default 24). Set it to 0 to disable the age check.
#   - --test-snapshot creates a temporary snapshot to validate creation + verification via `etcdctl snapshot status`.
#   - If --keep-snapshot-on-failure is enabled, the temporary snapshot is kept on error for debugging.

# Path to the etcdctl binary; an ETCDCTL value from the environment takes precedence.
ETCDCTL=${ETCDCTL:-/usr/local/bin/etcdctl}

#######################################
# Print the command-line help text.
# Globals:   none
# Arguments: none
# Outputs:   usage text on stdout; $0 is expanded inside the here-document
#######################################
print_usage() {
  cat <<EOF
Usage: $0 --endpoints ENDPOINTS --cacert CA --cert CERT --key KEY [options]
Options:
--warn-db-mb N avertissement si DB >= N MB (default 1024)
--crit-db-mb M critique si DB >= M MB (default 1800)
--timeout SECS etcdctl timeout (default 10)
--test-snapshot tenter de creer un snapshot temporaire et verifier son status
--snapshot-dir DIR repertoire pour snapshots temporaires (default /var/backups/etcd)
--keep-snapshot-on-failure conserver le snapshot temporaire si creation echoue (default false)
--snapshot-max-age HRS verifier qu'il existe un snapshot plus recent que HRS heures (default 24). Mettre 0 pour desactiver.
-h, --help affiche cette aide
EOF
}

# Defaults. The ${VAR:-value} settings can be pre-seeded through an environment
# variable of the same name; the two flags below always start disabled.
WARN_DB_MB=${WARN_DB_MB:-1024}
CRIT_DB_MB=${CRIT_DB_MB:-1800}
TIMEOUT=${TIMEOUT:-10}
TEST_SNAPSHOT=0
SNAPSHOT_DIR=${SNAPSHOT_DIR:-/var/backups/etcd}
KEEP_SNAPSHOT_ON_FAILURE=0
SNAPSHOT_MAX_AGE_HOURS=${SNAPSHOT_MAX_AGE_HOURS:-24}

# Parse args

#######################################
# Abort with UNKNOWN unless a value is a non-negative integer.
# Arguments: $1 - option name (for the message), $2 - value to check
# Outputs:   error and usage text on stdout when the value is rejected
# Returns:   0 when valid; otherwise exits the script with status 3
#######################################
_require_uint() {
  if [[ ! "$2" =~ ^[0-9]+$ ]]; then
    echo "UNKNOWN - option $1 expects a non-negative integer, got '$2'"
    print_usage
    exit 3
  fi
}

#######################################
# Parse command-line options into the global configuration variables.
# Globals:   ENDPOINTS, CACERT, CERT, KEY, WARN_DB_MB, CRIT_DB_MB, TIMEOUT,
#            TEST_SNAPSHOT, SNAPSHOT_DIR, KEEP_SNAPSHOT_ON_FAILURE,
#            SNAPSHOT_MAX_AGE_HOURS (written)
# Arguments: the script's positional parameters
# Outputs:   error and usage text on stdout for invalid input
# Returns:   0 on success; exits 3 (UNKNOWN) on -h/--help, an unknown
#            option, a missing option value or a non-integer number
#######################################
parse_args() {
  local opt
  while (( $# > 0 )); do
    opt=$1

    # Options that take a value need a second parameter. `shift 2` fails
    # WITHOUT shifting when only one parameter is left, which would make
    # this loop spin forever, so reject that case up front.
    case "$opt" in
      --endpoints | --cacert | --cert | --key | --warn-db-mb | --crit-db-mb | \
      --timeout | --snapshot-dir | --snapshot-max-age)
        if (( $# < 2 )); then
          echo "UNKNOWN - option $opt requires a value"
          print_usage
          exit 3
        fi
        ;;
    esac

    case "$opt" in
      --endpoints) ENDPOINTS=$2; shift 2 ;;
      --cacert) CACERT=$2; shift 2 ;;
      --cert) CERT=$2; shift 2 ;;
      --key) KEY=$2; shift 2 ;;
      # The numeric settings are later used in (( ... )) arithmetic, so they
      # must be plain integers; 10# strips leading zeros (no octal parsing).
      --warn-db-mb) _require_uint "$opt" "$2"; WARN_DB_MB=$((10#$2)); shift 2 ;;
      --crit-db-mb) _require_uint "$opt" "$2"; CRIT_DB_MB=$((10#$2)); shift 2 ;;
      --timeout) TIMEOUT=$2; shift 2 ;;
      --test-snapshot) TEST_SNAPSHOT=1; shift 1 ;;
      --snapshot-dir) SNAPSHOT_DIR=$2; shift 2 ;;
      --keep-snapshot-on-failure) KEEP_SNAPSHOT_ON_FAILURE=1; shift 1 ;;
      --snapshot-max-age) _require_uint "$opt" "$2"; SNAPSHOT_MAX_AGE_HOURS=$((10#$2)); shift 2 ;;
      -h | --help) print_usage; exit 3 ;;
      *) echo "Unknown arg: $opt"; print_usage; exit 3 ;;
    esac
  done
}

parse_args "$@"

# Allow env fallback (if ETCDCTL_* env vars set)

#######################################
# Fill connection settings that were not given on the command line from the
# matching ETCDCTL_* environment variables.
# Globals:   ENDPOINTS, CACERT, CERT, KEY (written);
#            ETCDCTL_ENDPOINTS, ETCDCTL_CACERT, ETCDCTL_CERT, ETCDCTL_KEY
#            (read, then unset)
# Arguments: none
#######################################
apply_env_fallback() {
  ENDPOINTS=${ENDPOINTS:-${ETCDCTL_ENDPOINTS:-}}
  CACERT=${CACERT:-${ETCDCTL_CACERT:-}}
  CERT=${CERT:-${ETCDCTL_CERT:-}}
  KEY=${KEY:-${ETCDCTL_KEY:-}}

  # The script passes these settings to etcdctl as explicit flags, so the
  # variables are no longer needed once copied.
  # NOTE(review): some etcdctl releases are reported to abort with
  # "conflicting environment variable ... is shadowed by corresponding
  # command-line flag" when a setting arrives both ways - confirm against
  # the deployed version.
  unset ETCDCTL_ENDPOINTS ETCDCTL_CACERT ETCDCTL_CERT ETCDCTL_KEY
}

apply_env_fallback

# Pre-flight checks: stop early with a clear status line when the check
# cannot possibly run.

# All four connection settings are mandatory.
if [[ -z "${ENDPOINTS:-}" || -z "${CACERT:-}" || -z "${CERT:-}" || -z "${KEY:-}" ]]; then
  echo "UNKNOWN - missing required args/certs"
  print_usage
  exit 3
fi

# The configured etcdctl path must exist and be executable.
if [[ ! -x "$ETCDCTL" ]]; then
  echo "UNKNOWN - etcdctl not found at $ETCDCTL"
  exit 3
fi

# The TLS material must be readable by the user running the check.
if [[ ! -r "$CACERT" || ! -r "$CERT" || ! -r "$KEY" ]]; then
  echo "CRITICAL - cannot read certificate files (permissions?)"
  echo "CACERT=$CACERT CERT=$CERT KEY=$KEY"
  exit 2
fi

# Exported so that every etcdctl invocation below sees ETCDCTL_API=3.
export ETCDCTL_API=3

# 1) endpoint status check
# stderr is merged into OUT so that a failure can be reported on the status line.
OUT=$(
  "$ETCDCTL" \
    --command-timeout="${TIMEOUT}s" \
    --endpoints="${ENDPOINTS}" \
    --cacert="$CACERT" --cert="$CERT" --key="$KEY" \
    endpoint status 2>&1
) || {
  echo "CRITICAL - etcdctl endpoint status failed: $OUT"
  exit 2
}

#######################################
# Convert a human-readable size such as "25 kB" or "1.5 GB" to whole MB.
# The conversion runs in awk because Bash arithmetic rejects decimal
# numbers like "1.5".
# Arguments: $1 - size text: a number optionally followed by a unit
#                 (B, KB, MB, GB, TB, case-insensitive; no/unknown unit = MB)
# Outputs:   integer MB on stdout; 0 when the number cannot be parsed
#######################################
db_size_to_mb() {
  local num unit
  IFS=$' \t' read -r num unit _ <<<"$1"
  if [[ ! "$num" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
    echo 0
    return 0
  fi
  # B and KB are truncated, the larger units rounded.
  LC_ALL=C awk -v n="$num" -v u="${unit^^}" 'BEGIN {
    if (u == "B")       printf "%d\n", int(n / 1024 / 1024)
    else if (u == "KB") printf "%d\n", int(n / 1024)
    else if (u == "GB") printf "%.0f\n", n * 1024
    else if (u == "TB") printf "%.0f\n", n * 1024 * 1024
    else                printf "%.0f\n", n
  }'
}

#######################################
# Summarise comma-separated `etcdctl endpoint status` rows.
# Every non-empty line counts as one endpoint; the 4th field is read as the
# DB size and the 5th as the is-leader flag.
# Globals:   total, leaders, max_db_mb (written)
# Arguments: $1 - the captured etcdctl output
#######################################
parse_endpoint_status() {
  local row db_size is_leader db_mb
  leaders=0
  total=0
  max_db_mb=0
  while IFS= read -r row; do
    row=${row//$'\r'/}             # drop carriage returns
    [[ -n "$row" ]] || continue
    total=$((total + 1))

    IFS=',' read -r _ _ _ db_size is_leader _ <<<"$row"

    is_leader=${is_leader// /}
    is_leader=${is_leader,,}
    if [[ "$is_leader" == "true" ]]; then
      leaders=$((leaders + 1))
    fi

    db_mb=$(db_size_to_mb "${db_size:-}")
    db_mb=${db_mb:-0}              # guard against an empty result
    if (( db_mb > max_db_mb )); then
      max_db_mb=$db_mb
    fi
  done <<<"$1"
}

parse_endpoint_status "${OUT:-}"

# Evaluate the endpoint summary. The first matching condition prints one
# status line and terminates the script.
# NOTE(review): because each branch exits, a DB-size WARNING ends the run
# before the snapshot checks later in the script are evaluated.

if (( total == 0 )); then
  echo "CRITICAL - no endpoints returned by etcdctl"
  exit 2
fi

if (( leaders == 0 )); then
  echo "CRITICAL - no leader found among $total endpoints; detail: $OUT"
  exit 2
fi

if (( leaders > 1 )); then
  echo "WARNING - multiple leaders detected: $leaders (possible split-brain); detail: $OUT"
  exit 1
fi

# The critical threshold is tested first so it wins when both are exceeded.
if (( max_db_mb >= CRIT_DB_MB )); then
  echo "CRITICAL - etcd DB size ${max_db_mb}MB >= ${CRIT_DB_MB}MB"
  exit 2
fi

if (( max_db_mb >= WARN_DB_MB )); then
  echo "WARNING - etcd DB size ${max_db_mb}MB >= ${WARN_DB_MB}MB"
  exit 1
fi

# 2) Verification of recent snapshot files (optional, default 24h)

#######################################
# Print the most recently modified snapshot-*.db file of a directory.
# Uses a glob and the -nt file test instead of parsing `ls` output, so file
# names containing whitespace are handled.
# Arguments: $1 - directory to search
# Outputs:   path of the newest snapshot on stdout; nothing when none exists
#######################################
find_latest_snapshot() {
  local dir=$1 candidate newest=""
  for candidate in "$dir"/snapshot-*.db; do
    [[ -e "$candidate" ]] || continue   # an unmatched glob stays literal
    if [[ -z "$newest" || "$candidate" -nt "$newest" ]]; then
      newest=$candidate
    fi
  done
  if [[ -n "$newest" ]]; then
    printf '%s\n' "$newest"
  fi
}

SNAP_CHECK_MSG=""
# An empty or zero SNAPSHOT_MAX_AGE_HOURS disables the age check.
if [[ -n "${SNAPSHOT_MAX_AGE_HOURS:-}" ]] && (( SNAPSHOT_MAX_AGE_HOURS > 0 )); then
  mkdir -p "$SNAPSHOT_DIR" 2>/dev/null || {
    echo "CRITICAL - cannot create/access snapshot dir $SNAPSHOT_DIR"
    exit 2
  }

  latest_snapshot=$(find_latest_snapshot "$SNAPSHOT_DIR")
  if [[ -z "$latest_snapshot" ]]; then
    SNAP_CHECK_MSG="no snapshot files found in $SNAPSHOT_DIR"
    echo "CRITICAL - $SNAP_CHECK_MSG (no snapshots)"
    exit 2
  fi

  now_s=$(date +%s)
  snap_mtime_s=$(stat -c %Y "$latest_snapshot" 2>/dev/null) || {
    echo "UNKNOWN - cannot read modification time of $latest_snapshot"
    exit 3
  }
  age_s=$(( now_s - snap_mtime_s ))
  if (( age_s < 0 )); then
    age_s=0                      # modification time lies in the future
  fi
  age_h=$(( age_s / 3600 ))
  printf -v age_txt '%dh%02dm' "$age_h" "$(( age_s % 3600 / 60 ))"

  # Compare in seconds: comparing truncated hours would accept a snapshot
  # that is up to 59 minutes older than the configured limit.
  if (( age_s > SNAPSHOT_MAX_AGE_HOURS * 3600 )); then
    SNAP_CHECK_MSG="latest snapshot $latest_snapshot is ${age_txt} old (> ${SNAPSHOT_MAX_AGE_HOURS}h)"
    echo "CRITICAL - $SNAP_CHECK_MSG"
    exit 2
  fi
  SNAP_CHECK_MSG="latest snapshot $latest_snapshot is ${age_txt} old (<= ${SNAPSHOT_MAX_AGE_HOURS}h)"
fi

# 3) Optional: test snapshot creation and status

#######################################
# EXIT handler: dispose of the temporary test snapshot.
# The file is removed unless the script is failing (non-zero status) and
# KEEP_SNAPSHOT_ON_FAILURE is set.
# Globals:   SNAPFILE (read), KEEP_SNAPSHOT_ON_FAILURE (read)
# Outputs:   a NOTICE line on stdout when the snapshot is kept
# Returns:   the exit status that was current when the handler started
#######################################
cleanup() {
  local rc=$?
  if (( rc != 0 && KEEP_SNAPSHOT_ON_FAILURE != 0 )); then
    echo "NOTICE - snapshot kept at $SNAPFILE for debugging"
  else
    rm -f -- "$SNAPFILE" 2>/dev/null || true   # best effort
  fi
  return "$rc"
}

SNAP_TEST_MSG=""
if (( TEST_SNAPSHOT == 1 )); then
  mkdir -p "$SNAPSHOT_DIR" 2>/dev/null || {
    echo "CRITICAL - cannot create/access snapshot dir $SNAPSHOT_DIR"
    exit 2
  }
  if [[ ! -w "$SNAPSHOT_DIR" ]]; then
    echo "CRITICAL - snapshot dir not writable: $SNAPSHOT_DIR"
    exit 2
  fi

  # The "test-" prefix keeps this file out of the snapshot-*.db glob used by
  # the snapshot age check, so a leftover test file is never mistaken for a
  # real backup.
  SNAPFILE=$(mktemp "${SNAPSHOT_DIR}/test-snapshot-XXXXXX.db") || {
    echo "CRITICAL - mktemp failed in $SNAPSHOT_DIR"
    exit 2
  }
  trap 'cleanup' EXIT

  # `snapshot save` must be sent to exactly one member; etcdctl rejects a
  # list of endpoints, so only the first configured endpoint is used.
  snapshot_endpoint=${ENDPOINTS%%,*}
  SAVE_OUT=$(
    "$ETCDCTL" \
      --command-timeout="${TIMEOUT}s" \
      --endpoints="$snapshot_endpoint" \
      --cacert="$CACERT" --cert="$CERT" --key="$KEY" \
      snapshot save "$SNAPFILE" 2>&1
  ) || {
    echo "CRITICAL - snapshot save failed: $SAVE_OUT"
    exit 2
  }

  STATUS_OUT=$("$ETCDCTL" snapshot status "$SNAPFILE" 2>&1) || {
    echo "CRITICAL - snapshot status failed: $STATUS_OUT"
    exit 2
  }

  # If we reach here, creation+status ok. Newlines become spaces and runs of
  # whitespace are squeezed so the result fits on one status line.
  SNAP_TEST_MSG="snapshot test ok: $SNAPFILE ; status: $(printf '%s\n' "$STATUS_OUT" | tr -s '[:space:]' ' ')"
  # cleanup removes the snapshot at exit (unless kept for debugging on failure)
fi

# Compose final message: the endpoint summary followed by the results of the
# optional snapshot checks, when they produced one.
MSG="OK - $total endpoints checked, leaders=$leaders, max_db=${max_db_mb}MB"
for extra_msg in "$SNAP_CHECK_MSG" "$SNAP_TEST_MSG"; do
  if [[ -n "$extra_msg" ]]; then
    MSG="$MSG ; $extra_msg"
  fi
done

echo "$MSG"
exit 0