nrpe/files/nrpe/check_etcd_health

#!/usr/bin/env bash
# check_etcd_health
# Verifie la santé d'etcd et (optionnel) la creation/verification des snapshots.
# Retourne : 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
#
# Usage example:
#  sudo /usr/lib/nagios/plugins/check_etcd_health \
#    --endpoints "https://192.168.1.41:2379,https://192.168.1.42:2379" \
#    --cacert /etc/ssl/etcd/ssl/ca.pem --cert /etc/ssl/etcd/ssl/admin.pem --key /etc/ssl/etcd/ssl/admin-key.pem \
#    --test-snapshot --snapshot-dir /var/backups/etcd --snapshot-max-age 24
#
# Notes:
# - Par securite, execute ce script sur un master (ou via NRPE/SSH) avec un utilisateur ayant acces aux clefs.
# - --snapshot-max-age en heures (defaut 24). Mettre 0 pour desactiver la verification d'age.
# - --test-snapshot creera un snapshot temporaire pour valider la creation + verification via `etcdctl snapshot status`.
# - Si --keep-snapshot-on-failure est active, le snapshot temporaire sera conserve en cas d'erreur pour debug.

# Path of the etcdctl binary. A non-empty ETCDCTL already present in the
# environment wins; otherwise the built-in default below is used.
: "${ETCDCTL:=/usr/local/bin/etcdctl}"

#######################################
# Print the command-line help text.
# Arguments: none (the script name is taken from $0)
# Outputs:   usage line followed by the option list, on stdout
#######################################
print_usage() {
  printf 'Usage: %s --endpoints ENDPOINTS --cacert CA --cert CERT --key KEY [options]\n' "$0"
  printf '%s\n' \
    'Options:' \
    '  --warn-db-mb N           avertissement si DB >= N MB (default 1024)' \
    '  --crit-db-mb M           critique si DB >= M MB (default 1800)' \
    '  --timeout SECS           etcdctl timeout (default 10)' \
    '  --test-snapshot          tenter de creer un snapshot temporaire et verifier son status' \
    '  --snapshot-dir DIR       repertoire pour snapshots temporaires (default /var/backups/etcd)' \
    '  --keep-snapshot-on-failure  conserver le snapshot temporaire si creation echoue (default false)' \
    "  --snapshot-max-age HRS   verifier qu'il existe un snapshot plus recent que HRS heures (default 24). Mettre 0 pour desactiver." \
    '  -h, --help               affiche cette aide'
}

# Defaults.
# Tunables: a non-empty value inherited from the environment is kept,
# otherwise the built-in default is assigned.
: "${WARN_DB_MB:=1024}"
: "${CRIT_DB_MB:=1800}"
: "${TIMEOUT:=10}"
: "${SNAPSHOT_DIR:=/var/backups/etcd}"
: "${SNAPSHOT_MAX_AGE_HOURS:=24}"

# Switches: always start disabled; only the command line can enable them.
TEST_SNAPSHOT=0
KEEP_SNAPSHOT_ON_FAILURE=0

# Parse args
#######################################
# Parse command-line options into the script's global settings.
# Globals (written): ENDPOINTS, CACERT, CERT, KEY, WARN_DB_MB, CRIT_DB_MB,
#   TIMEOUT, TEST_SNAPSHOT, SNAPSHOT_DIR, KEEP_SNAPSHOT_ON_FAILURE,
#   SNAPSHOT_MAX_AGE_HOURS
# Arguments: the script's command line ("$@")
# Outputs:   an error line (and, where noted below, the usage text) on stdout
# Exits:     3 (UNKNOWN) on --help, an unknown option, a missing value or a
#            non-numeric value for an option that is later used in arithmetic
#######################################
parse_args() {
  local opt val
  while (( $# > 0 )); do
    opt=$1
    case "$opt" in
      --test-snapshot) TEST_SNAPSHOT=1; shift; continue ;;
      --keep-snapshot-on-failure) KEEP_SNAPSHOT_ON_FAILURE=1; shift; continue ;;
      -h|--help) print_usage; exit 3 ;;
      --endpoints|--cacert|--cert|--key|--warn-db-mb|--crit-db-mb|--timeout|--snapshot-dir|--snapshot-max-age)
        # `shift 2` with a single argument left fails WITHOUT shifting, which
        # would spin this loop forever; reject the dangling option instead.
        if (( $# < 2 )); then
          echo "UNKNOWN - missing value for $opt"
          print_usage
          exit 3
        fi
        val=$2
        ;;
      *) echo "Unknown arg: $1"; print_usage; exit 3 ;;
    esac

    # These three values are later evaluated inside (( )), where a non-numeric
    # string would be treated as an expression; accept plain digits only.
    case "$opt" in
      --warn-db-mb|--crit-db-mb|--snapshot-max-age)
        if [[ "$opt" == "--snapshot-max-age" && -z "$val" ]]; then
          : # an empty value keeps its previous meaning: age check skipped
        elif [[ "$val" =~ ^[0-9]+$ ]]; then
          val=$((10#$val)) # force base 10 so "08" is not read as bad octal
        else
          echo "UNKNOWN - $opt expects a non-negative integer, got: $val"
          exit 3
        fi
        ;;
    esac

    case "$opt" in
      --endpoints) ENDPOINTS=$val ;;
      --cacert) CACERT=$val ;;
      --cert) CERT=$val ;;
      --key) KEY=$val ;;
      --warn-db-mb) WARN_DB_MB=$val ;;
      --crit-db-mb) CRIT_DB_MB=$val ;;
      --timeout) TIMEOUT=$val ;;
      --snapshot-dir) SNAPSHOT_DIR=$val ;;
      --snapshot-max-age) SNAPSHOT_MAX_AGE_HOURS=$val ;;
    esac
    shift 2
  done
}

parse_args "$@"

# Allow env fallback: any connection setting still empty after argument
# parsing is taken from the matching ETCDCTL_* environment variable.
if [[ -z "${ENDPOINTS:-}" ]]; then ENDPOINTS=${ETCDCTL_ENDPOINTS:-}; fi
if [[ -z "${CACERT:-}" ]]; then CACERT=${ETCDCTL_CACERT:-}; fi
if [[ -z "${CERT:-}" ]]; then CERT=${ETCDCTL_CERT:-}; fi
if [[ -z "${KEY:-}" ]]; then KEY=${ETCDCTL_KEY:-}; fi

# All four connection settings are mandatory; the first empty one ends the run.
for setting in "$ENDPOINTS" "$CACERT" "$CERT" "$KEY"; do
  if [[ -z "$setting" ]]; then
    echo "UNKNOWN - missing required args/certs"
    print_usage
    exit 3
  fi
done

# Nothing below can work without an executable etcdctl.
[[ -x "$ETCDCTL" ]] || {
  echo "UNKNOWN - etcdctl not found at $ETCDCTL"
  exit 3
}

# Every TLS file must be readable by the user running the check.
for pem in "$CACERT" "$CERT" "$KEY"; do
  if [[ ! -r "$pem" ]]; then
    echo "CRITICAL - cannot read certificate files (permissions?)"
    echo "CACERT=$CACERT CERT=$CERT KEY=$KEY"
    exit 2
  fi
done

# Exported so that every etcdctl invocation made below sees ETCDCTL_API=3.
ETCDCTL_API=3
export ETCDCTL_API

# 1) endpoint status check: stdout and stderr are captured together; a
# non-zero exit from etcdctl is reported as CRITICAL with that output.
if ! OUT=$("$ETCDCTL" \
  --command-timeout="${TIMEOUT}s" \
  --endpoints="${ENDPOINTS}" \
  --cacert="$CACERT" \
  --cert="$CERT" \
  --key="$KEY" \
  endpoint status 2>&1); then
  echo "CRITICAL - etcdctl endpoint status failed: $OUT"
  exit 2
fi

#######################################
# Convert a human-readable size such as "25 kB", "3.2 MB" or "1.5 GB" into
# whole megabytes.
# The conversion runs in awk because shell arithmetic is integer-only: a
# decimal value inside $(( )) is a syntax error that aborts the enclosing loop.
# Arguments: $1 - size string: a number, optionally followed by a unit
#                 (B, KB, MB, GB, TB - case-insensitive; anything else is
#                 treated as MB)
# Outputs:   integer MB on stdout; 0 when the number cannot be parsed.
#            B and KB values are truncated, larger units are rounded.
#######################################
dbsize_to_mb() {
  local IFS=$' \t\n'
  local num unit
  local mul=1 div=1
  read -r num unit _ <<<"${1:-}"
  if [[ ! "$num" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
    printf '0\n'
    return 0
  fi
  case "${unit^^}" in
    B) div=1048576 ;;
    KB) div=1024 ;;
    GB) mul=1024 ;;
    TB) mul=1048576 ;;
  esac
  # LC_ALL=C: keep "." as the decimal separator whatever the system locale is.
  LC_ALL=C awk -v n="$num" -v mul="$mul" -v div="$div" \
    'BEGIN { v = n * mul; if (div > 1) v = int(v / div); printf "%.0f\n", v }'
}

# Walk the comma-separated `endpoint status` output captured in OUT, one
# endpoint per line, counting endpoints and leaders and tracking the largest
# reported DB size (in MB).
leaders=0
total=0
max_db_mb=0
while IFS= read -r line; do
  line=${line//$'\r'/}
  [[ -z "$line" ]] && continue
  total=$((total+1))
  # Only the 4th (DB size) and 5th (is-leader) fields are used.
  IFS=',' read -r _ _ _ dbsize isLeader _ <<<"$line"
  isLeader=${isLeader//[[:space:]]/}
  if [[ "${isLeader,,}" == "true" ]]; then leaders=$((leaders+1)); fi
  db_mb=$(dbsize_to_mb "${dbsize:-}")
  # Defensive: never feed a non-integer into the arithmetic comparison below.
  [[ "$db_mb" =~ ^[0-9]+$ ]] || db_mb=0
  if (( db_mb > max_db_mb )); then max_db_mb=$db_mb; fi
done <<< "$OUT"

# Evaluate the figures gathered from `endpoint status`. Conditions are tested
# in a fixed order and every branch terminates the script with its status.
if (( total == 0 )); then
  echo "CRITICAL - no endpoints returned by etcdctl"
  exit 2
elif (( leaders == 0 )); then
  echo "CRITICAL - no leader found among $total endpoints; detail: $OUT"
  exit 2
elif (( leaders > 1 )); then
  echo "WARNING - multiple leaders detected: $leaders (possible split-brain); detail: $OUT"
  exit 1
elif (( CRIT_DB_MB <= max_db_mb )); then
  echo "CRITICAL - etcd DB size ${max_db_mb}MB >= ${CRIT_DB_MB}MB"
  exit 2
elif (( WARN_DB_MB <= max_db_mb )); then
  echo "WARNING - etcd DB size ${max_db_mb}MB >= ${WARN_DB_MB}MB"
  exit 1
fi

#######################################
# Find the most recently modified regular file named snapshot-*.db.
# Uses a glob plus -nt comparisons instead of parsing `ls` output.
# Arguments: $1 - directory to search (not recursive)
# Outputs:   path of the newest match on stdout; an empty line if none
#######################################
latest_snapshot_in() {
  local dir=$1 candidate newest=""
  for candidate in "$dir"/snapshot-*.db; do
    # Skips directories and the literal pattern left when nothing matches.
    [[ -f "$candidate" ]] || continue
    if [[ -z "$newest" || "$candidate" -nt "$newest" ]]; then
      newest=$candidate
    fi
  done
  printf '%s\n' "$newest"
}

# 2) Verification of recent snapshot files (optional, default 24h)
SNAP_CHECK_MSG=""
if [[ -n "${SNAPSHOT_MAX_AGE_HOURS:-}" ]]; then
  # The value is used in arithmetic below; accept plain digits only.
  if [[ ! "$SNAPSHOT_MAX_AGE_HOURS" =~ ^[0-9]+$ ]]; then
    echo "UNKNOWN - snapshot max age must be a non-negative integer (hours), got: $SNAPSHOT_MAX_AGE_HOURS"
    exit 3
  fi
  max_age_h=$((10#$SNAPSHOT_MAX_AGE_HOURS)) # base 10 even with leading zeros
  # max_age_h == 0 -> disabled
  if (( max_age_h > 0 )); then
    mkdir -p "$SNAPSHOT_DIR" 2>/dev/null || {
      echo "CRITICAL - cannot create/access snapshot dir $SNAPSHOT_DIR"
      exit 2
    }
    latest_snapshot=$(latest_snapshot_in "$SNAPSHOT_DIR")
    if [[ -z "$latest_snapshot" ]]; then
      SNAP_CHECK_MSG="no snapshot files found in $SNAPSHOT_DIR"
      echo "CRITICAL - $SNAP_CHECK_MSG (no snapshots)"
      exit 2
    fi
    # The file may vanish between the listing and stat (e.g. backup rotation).
    snap_mtime_s=$(stat -c %Y -- "$latest_snapshot" 2>/dev/null) || snap_mtime_s=""
    if [[ ! "$snap_mtime_s" =~ ^[0-9]+$ ]]; then
      echo "CRITICAL - cannot read modification time of $latest_snapshot"
      exit 2
    fi
    now_s=$(date +%s)
    age_s=$(( now_s - snap_mtime_s ))
    if (( age_s < 0 )); then age_s=0; fi # mtime in the future: treat as brand new
    age_h=$(( age_s / 3600 ))
    age_m=$(( age_s % 3600 / 60 ))
    # Compare in seconds: comparing truncated hours would let a snapshot up to
    # 59 minutes older than the limit pass.
    if (( age_s > max_age_h * 3600 )); then
      SNAP_CHECK_MSG="latest snapshot $latest_snapshot is ${age_h}h${age_m}m old (> ${max_age_h}h)"
      echo "CRITICAL - $SNAP_CHECK_MSG"
      exit 2
    fi
    SNAP_CHECK_MSG="latest snapshot $latest_snapshot is ${age_h}h${age_m}m old (<= ${max_age_h}h)"
  fi
fi

#######################################
# Extract the first endpoint from a comma-separated endpoint list.
# Arguments: $1 - endpoint list, e.g. "https://a:2379,https://b:2379"
# Outputs:   the first entry, surrounding whitespace removed, on stdout
#######################################
first_endpoint() {
  local first=${1%%,*}
  first=${first#"${first%%[![:space:]]*}"} # strip leading whitespace
  first=${first%"${first##*[![:space:]]}"} # strip trailing whitespace
  printf '%s\n' "$first"
}

# 3) Optional: test snapshot creation and status
SNAP_TEST_MSG=""
if (( TEST_SNAPSHOT == 1 )); then
  mkdir -p "$SNAPSHOT_DIR" 2>/dev/null || {
    echo "CRITICAL - cannot create/access snapshot dir $SNAPSHOT_DIR"
    exit 2
  }
  if [[ ! -w "$SNAPSHOT_DIR" ]]; then
    echo "CRITICAL - snapshot dir not writable: $SNAPSHOT_DIR"
    exit 2
  fi

  # The probe file name deliberately does NOT match snapshot-*.db, so a probe
  # kept after a failure (or one being written by a concurrent run) is never
  # mistaken for a real backup by the snapshot age check.
  SNAPFILE=$(mktemp "${SNAPSHOT_DIR}/check-etcd-snapshot-XXXXXX.db") || {
    echo "CRITICAL - mktemp failed in $SNAPSHOT_DIR"
    exit 2
  }

  # EXIT handler: removes the probe snapshot, except when the script is
  # failing and --keep-snapshot-on-failure was given. The script's exit
  # status is passed through unchanged.
  cleanup() {
    local rc=$?
    if (( rc != 0 && KEEP_SNAPSHOT_ON_FAILURE != 0 )); then
      echo "NOTICE - snapshot kept at $SNAPFILE for debugging"
    else
      rm -f -- "$SNAPFILE" 2>/dev/null || true
    fi
    return "$rc"
  }
  trap 'cleanup' EXIT

  # etcdctl refuses `snapshot save` when more than one endpoint is given
  # ("snapshot must be requested to one selected node"), so only the first
  # endpoint of the list is used for the probe.
  SNAP_ENDPOINT=$(first_endpoint "$ENDPOINTS")
  SAVE_OUT=$("$ETCDCTL" --command-timeout="${TIMEOUT}s" --endpoints="$SNAP_ENDPOINT" --cacert="$CACERT" --cert="$CERT" --key="$KEY" snapshot save "$SNAPFILE" 2>&1) || {
    echo "CRITICAL - snapshot save failed: $SAVE_OUT"
    exit 2
  }

  # Local sanity check of the file just written.
  STATUS_OUT=$("$ETCDCTL" snapshot status "$SNAPFILE" 2>&1) || {
    echo "CRITICAL - snapshot status failed: $STATUS_OUT"
    exit 2
  }

  # If we reach here, creation+status ok
  SNAP_TEST_MSG="snapshot test ok: $SNAPFILE ; status: $(echo "$STATUS_OUT" | tr '\n' ' ' | sed 's/  */ /g')"
  # cleanup will remove the snapshot (unless KEEP_SNAPSHOT_ON_FAILURE and rc != 0)
fi

# Compose final message: the core summary first, then whichever optional
# snapshot reports are non-empty, each appended after a " ; " separator.
MSG="OK - $total endpoints checked, leaders=$leaders, max_db=${max_db_mb}MB"
for part in "$SNAP_CHECK_MSG" "$SNAP_TEST_MSG"; do
  if [[ -n "$part" ]]; then
    MSG+=" ; $part"
  fi
done

printf '%s\n' "$MSG"
exit 0