add k8s check & config

2025-11-24 08:38:24 +01:00
parent 0045a21479
commit 1730b93c3f
12 changed files with 1888 additions and 0 deletions
@@ -0,0 +1,135 @@
#!/usr/bin/env bash
#
# check_k8s_replicasets
#
# Checks Kubernetes ReplicaSets and reports those whose status.readyReplicas
# is lower than spec.replicas.
#
# Exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
#
# Usage:
#  sudo /usr/lib/nagios/plugins/check_k8s_replicasets [--warn N] [--crit M] [--ignore-ns ns1,ns2] [--namespaces ns1,ns2] [--age-min MINUTES]
#
set -o errexit -o nounset -o pipefail

# Thresholds: a value already present in the environment is kept, otherwise
# the default applies.
: "${WARN:=0}"   # number of failing ReplicaSets that triggers WARNING
: "${CRIT:=1}"   # number of failing ReplicaSets that triggers CRITICAL (1 => a single one is CRITICAL)

# Namespace filters (comma separated lists) and minimum age in minutes.
IGNORE_NS=''
INCLUDE_NS=''
AGE_MIN=0

# Print the command-line help to stdout, one option per line.
# The first line shows the script path exactly as it was invoked ($0).
print_usage() {
  printf '%s\n' \
    "Usage: $0 [--warn N] [--crit M] [--ignore-ns ns1,ns2] [--namespaces ns1,ns2] [--age-min MINUTES]" \
    ' --warn N         : seuil warn si >=N ReplicaSets en erreur (default 0)' \
    ' --crit M         : seuil crit si >=M ReplicaSets en erreur (default 1)' \
    ' --ignore-ns LIST : comma separated namespaces to ignore (default none)' \
    ' --namespaces LIST: comma separated namespaces to check only (default all)' \
    ' --age-min N      : ignore ReplicaSets created less than N minutes ago (avoid flapping during rollout)'
}

# --- Argument parsing -------------------------------------------------------
#
# Every rejection below exits with 3 (UNKNOWN). Letting `set -u` abort on a
# missing option value would exit with 1, which a Nagios-style scheduler
# reads as WARNING.

# Abort with UNKNOWN unless VALUE is a non-negative decimal integer.
# Arguments: $1 - option or variable name used in the message
#            $2 - value to validate
require_uint() {
  if [[ ! "$2" =~ ^[0-9]+$ ]]; then
    echo "UNKNOWN - $1 expects a non-negative integer, got '$2'"
    exit 3
  fi
}

# Parse the command line into WARN, CRIT, IGNORE_NS, INCLUDE_NS and AGE_MIN.
# Arguments: the script's positional parameters
# Globals written: WARN, CRIT, IGNORE_NS, INCLUDE_NS, AGE_MIN
# Exits: 3 on --help, on an unknown argument, on a missing option value and
#        on a non-numeric threshold.
parse_args() {
  while (( $# > 0 )); do
    # All value-taking options need a second word; check before touching $2.
    case "$1" in
      --warn | --crit | --ignore-ns | --namespaces | --age-min)
        if (( $# < 2 )); then
          echo "UNKNOWN - option $1 requires a value"
          exit 3
        fi
        ;;
    esac

    case "$1" in
      # 10#... forces base 10 so that "08" is not rejected as invalid octal.
      --warn)       require_uint "$1" "$2"; WARN=$(( 10#$2 )); shift 2 ;;
      --crit)       require_uint "$1" "$2"; CRIT=$(( 10#$2 )); shift 2 ;;
      --ignore-ns)  IGNORE_NS="$2"; shift 2 ;;
      --namespaces) INCLUDE_NS="$2"; shift 2 ;;
      --age-min)    require_uint "$1" "$2"; AGE_MIN=$(( 10#$2 )); shift 2 ;;
      -h | --help)  print_usage; exit 3 ;;
      *)            echo "Unknown arg: $1"; print_usage; exit 3 ;;
    esac
  done

  # WARN and CRIT may also come from the environment; validate those too so
  # the threshold arithmetic later on cannot fail on garbage.
  require_uint WARN "${WARN:-0}"
  require_uint CRIT "${CRIT:-1}"
  WARN=$(( 10#${WARN:-0} ))
  CRIT=$(( 10#${CRIT:-1} ))
}

parse_args "$@"

# The ReplicaSet data is fetched with kubectl; when the binary is not on PATH
# the check cannot say anything about the cluster, hence UNKNOWN (3).
command -v kubectl >/dev/null 2>&1 || {
  echo "UNKNOWN - kubectl not found"
  exit 3
}

# --- Namespace filters ------------------------------------------------------

# Turn a comma separated namespace list into an alternation of anchored
# patterns, e.g. "kube-system, default" -> "^kube-system$|^default$".
#
# Items are trimmed and empty items are dropped: without trimming, a space
# after a comma becomes part of the anchored pattern and that namespace is
# silently never matched. Items are inserted as-is, so regex syntax inside an
# item keeps working.
#
# Arguments: $1 - comma separated list (may be empty)
# Outputs:   the regex on stdout, nothing when the list has no usable item
build_ns_regex() {
  local list=$1 item regex=''
  local -a items=()

  [[ -n "$list" ]] || return 0
  IFS=',' read -ra items <<< "$list"

  # The ${arr[@]+...} form keeps `set -u` quiet on an empty array with old bash.
  for item in ${items[@]+"${items[@]}"}; do
    item=${item#"${item%%[![:space:]]*}"}   # strip leading whitespace
    item=${item%"${item##*[![:space:]]}"}   # strip trailing whitespace
    [[ -n "$item" ]] || continue
    regex+="${regex:+|}^${item}\$"
  done

  printf '%s' "$regex"
}

# Empty pattern means "filter disabled".
ignore_pattern=$(build_ns_regex "$IGNORE_NS")
include_pattern=$(build_ns_regex "$INCLUDE_NS")

# Findings accumulate here. Initialised up front so that reading the array is
# safe under `set -u` (avoids an "unbound variable" error).
failures=()

# One record per ReplicaSet, tab separated:
#   namespace, name, desired (spec.replicas), ready (status.readyReplicas),
#   creationTimestamp
# A field that is absent from the object is printed as an empty string and is
# normalised by the consumer.
rs_jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.name}{"\t"}{.spec.replicas}{"\t"}{.status.readyReplicas}{"\t"}{.metadata.creationTimestamp}{"\n"}{end}'

# A failing kubectl call (API unreachable, no credentials, RBAC denial, ...)
# must not be mistaken for "no ReplicaSets": an empty list would end up as a
# false OK. Report UNKNOWN instead.
if ! rs_raw=$(kubectl get rs -A -o jsonpath="$rs_jsonpath" 2>/dev/null); then
  echo "UNKNOWN - kubectl get rs failed (API server unreachable or access denied?)"
  exit 3
fi

lines=()
if [[ -n "$rs_raw" ]]; then
  mapfile -t lines <<< "$rs_raw"
fi

# Reference time (epoch seconds) for the age filter.
now_s=$(date +%s)

# Decide whether one ReplicaSet record has to be reported.
#
# Arguments: $1 - "namespace<TAB>name<TAB>desired<TAB>ready<TAB>created"
# Globals read:    include_pattern, ignore_pattern, AGE_MIN, now_s
# Globals written: rs_failure - "ns/name (desired=D,ready=R)" when reported,
#                  empty otherwise
# Returns: 0 when the ReplicaSet passes the filters and has fewer ready
#          replicas than desired, 1 in every other case.
rs_check_line() {
  local ns name desired ready created created_s
  rs_failure=''

  # Tab counts as IFS whitespace, so `read` would merge consecutive tabs and
  # shift the fields whenever one is empty (typically a missing
  # readyReplicas). Splitting on a non-whitespace separator keeps them.
  IFS=$'\x1f' read -r ns name desired ready created <<< "${1//$'\t'/$'\x1f'}"

  # Missing or non-numeric counters are treated as 0; this also keeps
  # arbitrary text out of the arithmetic evaluation below.
  [[ "$desired" =~ ^[0-9]+$ ]] || desired=0
  [[ "$ready" =~ ^[0-9]+$ ]] || ready=0

  # Namespace filtering, done in-process instead of via the obsolescent egrep.
  if [[ -n "$include_pattern" && ! "$ns" =~ $include_pattern ]]; then
    return 1
  fi
  if [[ -n "$ignore_pattern" && "$ns" =~ $ignore_pattern ]]; then
    return 1
  fi

  # Age filtering: skip very recent ReplicaSets. When the timestamp cannot be
  # converted, created_s stays 0 so the ReplicaSet is NOT skipped.
  if [[ -n "$created" ]] && (( AGE_MIN > 0 )); then
    created_s=$(date -d "$created" +%s 2>/dev/null) || created_s=0
    if (( (now_s - created_s) / 60 < AGE_MIN )); then
      return 1
    fi
  fi

  # Only ReplicaSets that want at least one replica are considered.
  if (( desired > 0 && ready < desired )); then
    rs_failure="${ns}/${name} (desired=${desired},ready=${ready})"
    return 0
  fi
  return 1
}

# The ${arr[@]+...} form keeps `set -u` quiet on an empty array with old bash.
for line in ${lines[@]+"${lines[@]}"}; do
  # Skip empty lines if any
  [[ -n "$line" ]] || continue
  if rs_check_line "$line"; then
    failures+=("$rs_failure")
  fi
done

# --- Verdict ----------------------------------------------------------------
# Pick the status line and exit code from the number of failing ReplicaSets,
# then emit both in one place. Order matters: "no failure" wins over any
# threshold, and CRIT is evaluated before WARN.
count=${#failures[@]}

if (( count == 0 )); then
  status_line="OK - all ReplicaSets report ready==desired"
  exit_code=0
elif (( count >= CRIT )); then
  status_line="CRITICAL - ${count} ReplicaSets not fully ready: ${failures[*]}"
  exit_code=2
elif (( count >= WARN )); then
  status_line="WARNING - ${count} ReplicaSets not fully ready: ${failures[*]}"
  exit_code=1
else
  # Failures exist but stay under both thresholds.
  status_line="OK - ${count} ReplicaSets not fully ready but below thresholds"
  exit_code=0
fi

echo "$status_line"
exit "$exit_code"