add k8s check & config

This commit is contained in:
Ludovic Cartier
2025-11-24 08:38:24 +01:00
parent 0045a21479
commit 1730b93c3f
12 changed files with 1888 additions and 0 deletions

View File

@@ -0,0 +1,307 @@
#!/usr/bin/env bash
# check_cilium_health
# Checks Cilium health (pods, daemonsets, operator) and optionally runs the
# `cilium status -o json` binary as an additional probe.
# Exit codes (Nagios convention): 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
#
# Usage:
# sudo /usr/lib/nagios/plugins/check_cilium_health [--namespace N] [--label LABEL] [--warn-not-ready N] [--crit-not-ready M] [--use-cilium-cli] [--timeout SECS]
#
set -euo pipefail
# Defaults -- each may be preseeded via the environment; CLI flags parsed
# below take precedence.
NAMESPACE=${NAMESPACE:-kube-system}
LABEL=${LABEL:-k8s-app=cilium}
WARN_NOT_READY=${WARN_NOT_READY:-1}
CRIT_NOT_READY=${CRIT_NOT_READY:-2}
WARN_RESTARTS=${WARN_RESTARTS:-3}
CRIT_RESTARTS=${CRIT_RESTARTS:-10}
USE_CILIUM_CLI=0
TIMEOUT=${TIMEOUT:-10}
# Print CLI usage to stdout; the heredoc expands the current default values.
print_usage() {
cat <<EOF
Usage: $0 [options]
Options:
--namespace N namespace (default: kube-system)
--label LABEL pod label selector (default: "k8s-app=cilium")
--warn-not-ready N warn if >= N pods not ready (default ${WARN_NOT_READY})
--crit-not-ready M critical if >= M pods not ready (default ${CRIT_NOT_READY})
--warn-restarts R warn if restartCount >= R per pod (default ${WARN_RESTARTS})
--crit-restarts S critical if restartCount >= S per pod (default ${CRIT_RESTARTS})
--use-cilium-cli run 'cilium status -o json' as additional check (requires cilium binary)
--timeout SECS kubectl timeout in seconds (default ${TIMEOUT})
-h, --help show this help
EOF
}
# Parse command-line arguments; --help and unknown flags exit 3 (UNKNOWN).
while [[ $# -gt 0 ]]; do
case "$1" in
--namespace) NAMESPACE="$2"; shift 2;;
--label) LABEL="$2"; shift 2;;
--warn-not-ready) WARN_NOT_READY="$2"; shift 2;;
--crit-not-ready) CRIT_NOT_READY="$2"; shift 2;;
--warn-restarts) WARN_RESTARTS="$2"; shift 2;;
--crit-restarts) CRIT_RESTARTS="$2"; shift 2;;
--use-cilium-cli) USE_CILIUM_CLI=1; shift 1;;
--timeout) TIMEOUT="$2"; shift 2;;
-h|--help) print_usage; exit 3;;
*) echo "Unknown arg: $1"; print_usage; exit 3;;
esac
done
# Hard requirements: kubectl for the API calls, python3 for JSON parsing.
if ! command -v kubectl >/dev/null 2>&1; then
echo "UNKNOWN - kubectl not found in PATH"
exit 3
fi
if ! command -v python3 >/dev/null 2>&1; then
echo "UNKNOWN - python3 not found in PATH (required for JSON parsing)"
exit 3
fi
# ---- kubeconfig handling ----
# If KUBECONFIG is not set, try sensible defaults so sudo/nagios runs succeed.
# Priority:
# 1) env KUBECONFIG if already defined
# 2) /etc/kubernetes/admin.conf if present (common on control-planes)
# 3) /root/.kube/config if present
# 4) fallback to empty (kubectl will then try defaults and may fail)
if [[ -z "${KUBECONFIG:-}" ]]; then
if [[ -r "/etc/kubernetes/admin.conf" ]]; then
export KUBECONFIG="/etc/kubernetes/admin.conf"
elif [[ -r "/root/.kube/config" ]]; then
export KUBECONFIG="/root/.kube/config"
else
# leave unset; kubectl will attempt defaults
unset KUBECONFIG || true
fi
fi
# Use explicit kubeconfig for kubectl invocations to avoid home/KUBECONFIG differences under sudo.
# NOTE(review): KC is a plain string expanded unquoted later ($KC ...); this
# relies on word splitting and breaks if the kubeconfig path contains spaces.
# The sibling check_coredns_health builds an array for the same purpose.
if [[ -n "${KUBECONFIG:-}" ]]; then
KC="kubectl --kubeconfig=${KUBECONFIG} --request-timeout=${TIMEOUT}s"
else
KC="kubectl --request-timeout=${TIMEOUT}s"
fi
# run_python_parser INPUT PYPROG
# Feed INPUT to python3 running PYPROG and print the parser's stdout.
# Returns python3's exit status; the parser's stderr is discarded.
# Simplified from the previous temp-file version: `python3 -c` takes the
# program directly, so there are no temp files to leak if the script is
# killed mid-parse, and the observable contract is unchanged (all callers
# capture the output via command substitution).
run_python_parser() {
  local input="$1"
  local pyprog="$2"
  local out rc=0
  # `|| rc=$?` keeps `set -e` from aborting the whole script on parse errors.
  out=$(printf '%s' "$input" | python3 -c "$pyprog" 2>/dev/null) || rc=$?
  if (( rc != 0 )); then
    return "$rc"
  fi
  printf '%s\n' "$out"
  return 0
}
# 1) Fetch the Cilium pod list as JSON (stderr captured for diagnostics).
set +e
pods_json=$($KC -n "$NAMESPACE" get pods -l "$LABEL" -o json 2>&1)
rc_kubectl=$?
set -e
if (( rc_kubectl != 0 )); then
  # Collapse newlines so the Nagios status stays on a single line.
  # BUG FIX: the old replacement ` ' '` inserted literal quote characters.
  echo "CRITICAL - kubectl failed to list Cilium pods: ${pods_json//$'\n'/ }"
  exit 2
fi
# 2) Parse the pod JSON with python; one TSV record per pod:
#    name<TAB>phase<TAB>ready/total<TAB>restarts<TAB>node
pod_python_prog=$'import sys,json\ntry:\n data=json.load(sys.stdin)\nexcept Exception:\n sys.exit(1)\nitems=data.get(\"items\",[])\nfor it in items:\n name=it.get(\"metadata\",{}).get(\"name\",\"<noname>\")\n node=it.get(\"spec\",{}).get(\"nodeName\",\"\")\n phase=it.get(\"status\",{}).get(\"phase\",\"\")\n cs=it.get(\"status\",{}).get(\"containerStatuses\",[]) or []\n total_cont=len(cs)\n ready_cnt=sum(1 for c in cs if c.get(\"ready\") is True)\n restarts=sum(int(c.get(\"restartCount\",0) or 0) for c in cs)\n ready_str = f\"{ready_cnt}/{total_cont}\"\n print(f\"{name}\\t{phase}\\t{ready_str}\\t{restarts}\\t{node}\")\n'
pod_lines=()
if pod_out=$(run_python_parser "$pods_json" "$pod_python_prog"); then
  # mapfile splits on newlines without the read -d '' contortions.
  mapfile -t pod_lines <<< "$pod_out"
fi
# Fallback when JSON parsing failed or yielded nothing: plain kubectl columns.
if [[ ${#pod_lines[@]} -eq 0 || -z "${pod_lines[0]:-}" ]]; then
  pod_lines=()
  simple=$($KC -n "$NAMESPACE" get pods -l "$LABEL" --no-headers 2>&1 || true)
  count_simple=$(printf '%s\n' "$simple" | sed '/^\s*$/d' | wc -l)
  if [[ "$count_simple" -eq 0 ]]; then
    echo "CRITICAL - no Cilium pods found or kubectl output unparsable. kubectl output: ${simple//$'\n'/ }"
    exit 2
  fi
  # Derive minimal TSV records from the "NAME READY ..." columns.
  while IFS= read -r l; do
    [[ -z "$l" ]] && continue
    name=$(awk '{print $1}' <<< "$l")
    readycol=$(awk '{print $2}' <<< "$l")
    if [[ "$readycol" == */* ]]; then
      rnum=${readycol%%/*}
      rtot=${readycol##*/}
    else
      rnum=0; rtot=0
    fi
    if [[ "$rnum" == "$rtot" && "$rtot" != "0" ]]; then
      phase="Running"
    else
      phase="NotReady"
    fi
    # BUG FIX: fields must be joined with a real TAB ($'\t'); the previous
    # quoted "\t" produced the two literal characters backslash-t, so the
    # later IFS=$'\t' read never split these records.
    pod_lines+=("${name}"$'\t'"${phase}"$'\t'"${rnum}/${rtot}"$'\t'"0"$'\t'"")
  done < <(printf '%s\n' "$simple")
fi
# Evaluate each pod record and tally not-ready pods plus pods with elevated
# restart counts. Records are expected as TAB-separated fields:
# name, phase, ready x/y, restarts, node.
total_pods=0
not_ready=0
not_ready_list=()
high_restart_pods=()
for line in "${pod_lines[@]}"; do
[[ -z "$line" ]] && continue
total_pods=$((total_pods+1))
# NOTE(review): assumes fields are joined with real TAB characters.
IFS=$'\t' read -r pname pphase pready prest pnode <<< "$line"
ready_num=${pready%/*}
ready_tot=${pready#*/}
ready_num=${ready_num:-0}
ready_tot=${ready_tot:-0}
# A pod counts as not ready when its phase is not Running or fewer
# containers are ready than exist in the pod.
if [[ "$pphase" != "Running" ]] || (( ready_num < ready_tot )); then
not_ready=$((not_ready+1))
not_ready_list+=("${pname}:${pphase}:${pready}")
fi
prest=${prest:-0}
# Classify restart counts against the per-pod thresholds.
if (( prest >= CRIT_RESTARTS )); then
high_restart_pods+=("${pname}:${prest}:CRITICAL")
elif (( prest >= WARN_RESTARTS )); then
high_restart_pods+=("${pname}:${prest}:WARN")
fi
done
# DaemonSet check: sum desiredNumberScheduled vs numberReady across all
# daemonsets matching the label (parsed safely via python).
set +e
ds_out=$($KC -n "$NAMESPACE" get ds -l "$LABEL" -o json 2>&1)
rc_ds=$?
set -e
ds_desired=0; ds_ready=0
if (( rc_ds == 0 )); then
# Emits one "desired<TAB>ready" line per daemonset.
ds_python_prog=$'import sys,json\ndata=json.load(sys.stdin)\nfor it in data.get(\"items\",[]):\n s=it.get(\"status\",{})\n desired=int(s.get(\"desiredNumberScheduled\") or 0)\n ready=int(s.get(\"numberReady\") or 0)\n print(f\"{desired}\\t{ready}\")\n'
if ds_out_parsed=$(run_python_parser "$ds_out" "$ds_python_prog"); then
while IFS=$'\n' read -r d; do
[[ -z "$d" ]] && continue
ddesired=$(echo "$d" | cut -f1)
dready=$(echo "$d" | cut -f2)
ds_desired=$((ds_desired+ddesired))
ds_ready=$((ds_ready+dready))
done <<< "$ds_out_parsed"
fi
fi
# cilium-operator Deployment: compare availableReplicas to spec.replicas.
# stderr is discarded: a missing deployment leaves op_json empty and the
# whole check is silently skipped (op_msg stays empty).
op_ok=1
op_msg=""
set +e
op_json=$($KC -n "$NAMESPACE" get deploy cilium-operator -o json 2>/dev/null || true)
set -e
if [[ -n "$op_json" ]]; then
# Prints "replicas<TAB>available"; spec.replicas defaults to 1 when unset.
op_python_prog=$'import sys,json\ndata=json.load(sys.stdin)\nspec=data.get(\"spec\",{})\nstatus=data.get(\"status\",{})\nreplicas=int(spec.get(\"replicas\") or 1)\navailable=int(status.get(\"availableReplicas\") or 0)\nprint(f\"{replicas}\\t{available}\")\n'
if op_line=$(run_python_parser "$op_json" "$op_python_prog"); then
IFS=$'\t' read -r op_repl op_avail <<< "$op_line"
if (( op_avail < op_repl )); then
op_ok=0
op_msg="operator available=${op_avail}/${op_repl}"
else
op_msg="operator available=${op_avail}/${op_repl}"
fi
fi
fi
# Optional additional probe via the cilium CLI itself (--use-cilium-cli).
cilium_ok=1
cilium_summary=""
if (( USE_CILIUM_CLI == 1 )); then
  if ! command -v cilium >/dev/null 2>&1; then
    cilium_ok=0
    cilium_summary="cilium binary not in PATH"
  else
    set +e
    cilium_raw=$(cilium status -o json 2>&1)
    # BUG FIX: the previous `|| true` on the assignment forced $? to 0,
    # so rc_cilium was always 0 and the failure branch was unreachable.
    rc_cilium=$?
    set -e
    if (( rc_cilium != 0 )); then
      cilium_ok=0
      cilium_summary="cilium status failed: ${cilium_raw//$'\n'/ }"
    else
      cilium_ok=1
      # Flatten and truncate to keep the Nagios status line short.
      # BUG FIX: `sed 's/ */ /g'` matches zero-length runs and inserts a
      # space between every character; `tr -s ' '` squeezes runs instead.
      cilium_summary=$(printf '%s' "$cilium_raw" | tr '\n' ' ' | tr -s ' ' | cut -c1-300)
    fi
  fi
fi
# ---- Compose the final Nagios status ---------------------------------------
# code holds the worst severity seen (0=OK, 1=WARNING, 2=CRITICAL);
# msgs collects the message fragments joined with " ; " at the end.
code=0
msgs=()
if (( not_ready >= CRIT_NOT_READY )); then
  code=2
  msgs+=("CRITICAL - ${not_ready}/${total_pods} pods not ready")
elif (( not_ready >= WARN_NOT_READY )); then
  if (( code < 1 )); then code=1; fi
  msgs+=("WARNING - ${not_ready}/${total_pods} pods not ready")
else
  msgs+=("OK - ${total_pods} pods, not-ready=${not_ready}")
fi
# DaemonSet shortfall reuses the pod-count thresholds for severity.
if (( ds_desired > 0 )) && (( ds_ready < ds_desired )); then
  if (( ds_desired - ds_ready >= CRIT_NOT_READY )); then
    code=2
    msgs+=("CRITICAL - daemonsets ready=${ds_ready}/${ds_desired}")
  else
    if (( code < 1 )); then code=1; fi
    msgs+=("WARNING - daemonsets ready=${ds_ready}/${ds_desired}")
  fi
fi
# Operator result (op_msg is empty when the deployment was not found).
if [[ -n "$op_msg" ]]; then
  if (( op_ok == 0 )); then
    code=2
    msgs+=("CRITICAL - ${op_msg}")
  else
    msgs+=("${op_msg}")
  fi
fi
# Restart counts: CRITICAL dominates WARN.
if (( ${#high_restart_pods[@]} > 0 )); then
  crit_restart=0; warn_restart=0
  for r in "${high_restart_pods[@]}"; do
    [[ "$r" == *":CRITICAL" ]] && crit_restart=1
    [[ "$r" == *":WARN" ]] && warn_restart=1
  done
  if (( crit_restart == 1 )); then
    code=2
    msgs+=("CRITICAL - pods with high restart counts: ${high_restart_pods[*]}")
  elif (( warn_restart == 1 )); then
    if (( code < 1 )); then code=1; fi
    msgs+=("WARNING - pods with elevated restarts: ${high_restart_pods[*]}")
  fi
fi
if (( USE_CILIUM_CLI == 1 )); then
  if (( cilium_ok == 0 )); then
    code=2
    msgs+=("CRITICAL - cilium-cli: ${cilium_summary}")
  else
    msgs+=("cilium-cli ok: ${cilium_summary}")
  fi
fi
if (( not_ready > 0 )); then
  truncated=$(printf "%s, " "${not_ready_list[@]}" | sed 's/, $//')
  msgs+=("not-ready-list: ${truncated}")
fi
# Join fragments with " ; ".  BUG FIX: the previous `IFS=' ; '` trick joined
# ${msgs[*]} with only the FIRST character of IFS (a bare space).
joined=$(printf '%s ; ' "${msgs[@]}")
echo "${joined% ; }"
exit "${code}"

View File

@@ -0,0 +1,158 @@
#!/usr/bin/env bash
# check_coredns_health
# Checks CoreDNS health (Endpoints + EndpointSlices + pod fallback).
# Exit codes (Nagios): 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
#
# Usage:
# sudo /usr/lib/nagios/plugins/check_coredns_health [--namespace N] [--service NAME] [--label-fallback LABEL] [--kubeconfig PATH]
#
set -euo pipefail
# Defaults; overridable from the environment or via the CLI flags below.
NAMESPACE=${NAMESPACE:-kube-system}
SERVICE_NAME=${SERVICE_NAME:-coredns}
LABEL_FALLBACK=${LABEL_FALLBACK:-k8s-app=kube-dns}
TIMEOUT=${TIMEOUT:-10}
# Print usage to stdout (expands current defaults).
usage() {
cat <<EOF
Usage: $0 [--namespace N] [--service NAME] [--label-fallback LABEL] [--kubeconfig PATH]
Defaults: namespace=$NAMESPACE service=$SERVICE_NAME
EOF
}
# Parse command-line arguments; --help and unknown flags exit 3 (UNKNOWN).
while [[ $# -gt 0 ]]; do
case "$1" in
--namespace) NAMESPACE="$2"; shift 2;;
--service) SERVICE_NAME="$2"; shift 2;;
--label-fallback) LABEL_FALLBACK="$2"; shift 2;;
--kubeconfig) export KUBECONFIG="$2"; shift 2;;
-h|--help) usage; exit 3;;
*) echo "Unknown arg: $1"; usage; exit 3;;
esac
done
if ! command -v kubectl >/dev/null 2>&1; then
echo "UNKNOWN - kubectl not found"
exit 3
fi
# If KUBECONFIG not set, try sensible defaults so sudo/nagios runs succeed.
if [[ -z "${KUBECONFIG:-}" ]]; then
if [[ -r "/etc/kubernetes/admin.conf" ]]; then
export KUBECONFIG="/etc/kubernetes/admin.conf"
elif [[ -r "/root/.kube/config" ]]; then
export KUBECONFIG="/root/.kube/config"
fi
fi
# Build the kubectl command as an array so a kubeconfig path containing
# spaces survives quoting when expanded as "${KC[@]}".
if [[ -n "${KUBECONFIG:-}" ]]; then
KC=(kubectl --kubeconfig="${KUBECONFIG}" --request-timeout="${TIMEOUT}s")
else
KC=(kubectl --request-timeout="${TIMEOUT}s")
fi
# run_kc ARGS...: run kubectl (from the KC array) with stderr discarded.
# Prints the captured stdout and returns kubectl's exit status.
# BUG FIX: under `set -e` a bare `out="$(...)"` that fails aborted the whole
# script before the status could be returned; `|| rc=$?` captures it safely.
# Callers must likewise use `val=$(run_kc ...) || rc=$?`.
run_kc() {
  local out rc=0
  out="$("${KC[@]}" "$@" 2>/dev/null)" || rc=$?
  printf '%s' "$out"
  return "$rc"
}
# 1) Endpoints resource: any listed address means the service is served.
# BUG FIX: under `set -e`, `var=$(run_kc ...)` followed by `rc=$?` was dead
# code -- a kubectl failure aborted the script (with kubectl's exit code,
# not 2) before the CRITICAL branch could run. `|| rc=$?` captures it.
rc=0
ep_out=$(run_kc -n "$NAMESPACE" get endpoints "$SERVICE_NAME" -o jsonpath='{.subsets[*].addresses[*].ip}') || rc=$?
if (( rc != 0 )); then
  echo "CRITICAL - kubectl failed to get Endpoints (exit code ${rc})"
  exit 2
fi
if [[ -n "${ep_out// /}" ]]; then
  echo "OK - service ${SERVICE_NAME} in ${NAMESPACE} has endpoints: $(echo "$ep_out" | tr ' ' ',')"
  exit 0
fi
# 2) EndpointSlices (k8s >= 1.17): one address list per line.
rc=0
eps_out=$(run_kc -n "$NAMESPACE" get endpointslices -l "kubernetes.io/service-name=${SERVICE_NAME}" -o jsonpath='{range .items[*]}{range .endpoints[*]}{.addresses[*]}{"\n"}{end}{end}') || rc=$?
if (( rc != 0 )); then
  echo "CRITICAL - kubectl failed to get EndpointSlices (exit code ${rc})"
  exit 2
fi
if [[ -n "${eps_out// /}" ]]; then
  tops=$(printf '%s\n' "$eps_out" | sed '/^\s*$/d' | tr '\n' ',' | sed 's/,$//')
  echo "OK - service ${SERVICE_NAME} in ${NAMESPACE} has EndpointSlices addresses: ${tops}"
  exit 0
fi
# 3) Fallback: derive the service's pod selector and inspect matching pods.
# BUG FIX: `{range $k,$v := ...}` is go-template syntax which kubectl's
# jsonpath printer rejects; use -o go-template for map iteration.
# Same `|| rc=$?` pattern as above so failures survive `set -e`.
rc=0
svc_out=$(run_kc -n "$NAMESPACE" get svc "$SERVICE_NAME" -o go-template='{{range $k, $v := .spec.selector}}{{$k}}={{$v}};{{end}}') || rc=$?
if (( rc != 0 )); then
  echo "CRITICAL - kubectl failed to get Service selector (exit code ${rc})"
  exit 2
fi
SEL="$svc_out"
if [[ -z "$SEL" ]]; then
  SEL="$LABEL_FALLBACK"
fi
# kubectl -l wants comma separators: convert all ';' (the template emits one
# per key) and drop any trailing separator. The old code only converted the
# fallback label, so multi-key service selectors produced an invalid -l arg.
SEL=${SEL//;/,}
SEL=${SEL%[;,]}
# Pods matching the selector, as "READY NAME" lines.
rc=0
pods_out=$(run_kc -n "$NAMESPACE" get pods -l "$SEL" --no-headers -o custom-columns=READY:.status.containerStatuses[0].ready,NAME:.metadata.name) || rc=$?
if (( rc != 0 )); then
  echo "CRITICAL - kubectl failed to list pods for selector '${SEL}' (exit code ${rc})"
  exit 2
fi
if [[ -z "${pods_out// /}" ]]; then
  # Try an alternative label commonly used for CoreDNS.
  rc=0
  pods_alt=$(run_kc -n "$NAMESPACE" get pods -l k8s-app=coredns --no-headers -o custom-columns=READY:.status.containerStatuses[0].ready,NAME:.metadata.name) || rc=$?
  if (( rc != 0 )); then
    echo "CRITICAL - kubectl failed to list pods for fallback selector (exit code ${rc})"
    exit 2
  fi
  if [[ -n "${pods_alt// /}" ]]; then
    pods_out="$pods_alt"
    SEL="k8s-app=coredns (fallback)"
  fi
fi
if [[ -z "${pods_out// /}" ]]; then
  echo "CRITICAL - service ${SERVICE_NAME} in ${NAMESPACE} has no endpoints and no pods match selector '${SEL}'"
  exit 2
fi
# Count Ready pods from the "READY NAME" custom-columns output.
not_ready_count=0
total_count=0
not_ready_list=()
while IFS= read -r line; do
[[ -z "$line" ]] && continue
total_count=$((total_count+1))
ready_flag=$(echo "$line" | awk '{print $1}')
pod_name=$(echo "$line" | awk '{print $2}')
# kubectl prints the boolean as "true"; accept common variants.
if [[ "$ready_flag" != "true" && "$ready_flag" != "True" && "$ready_flag" != "1" ]]; then
not_ready_count=$((not_ready_count+1))
not_ready_list+=("$pod_name")
fi
done <<< "$pods_out"
if (( total_count == 0 )); then
echo "CRITICAL - service ${SERVICE_NAME} in ${NAMESPACE} has no endpoints and no pods found for selector '${SEL}'"
exit 2
fi
if (( not_ready_count > 0 )); then
echo "WARNING - service ${SERVICE_NAME} in ${NAMESPACE} has no endpoints, but ${not_ready_count}/${total_count} pods matching selector '${SEL}' are not Ready: ${not_ready_list[*]}"
exit 1
fi
# If pods exist and are Ready but no Endpoints/EndpointSlices -> likely endpointcontroller/roles mismatch; consider OK but log it
echo "OK - service ${SERVICE_NAME} in ${NAMESPACE} has no Endpoints resource but ${total_count} pods matching selector '${SEL}' are Ready (EndpointSlices absent or controller issue)"
exit 0

View File

@@ -0,0 +1,230 @@
#!/usr/bin/env bash
# check_etcd_health
# Checks etcd health and (optionally) snapshot creation/verification.
# Exit codes (Nagios): 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
#
# Usage example:
# sudo /usr/lib/nagios/plugins/check_etcd_health \
# --endpoints "https://192.168.1.41:2379,https://192.168.1.42:2379" \
# --cacert /etc/ssl/etcd/ssl/ca.pem --cert /etc/ssl/etcd/ssl/admin.pem --key /etc/ssl/etcd/ssl/admin-key.pem \
# --test-snapshot --snapshot-dir /var/backups/etcd --snapshot-max-age 24
#
# Notes:
# - For safety, run this script on a master (or via NRPE/SSH) with a user able to read the keys.
# - --snapshot-max-age is in hours (default 24). Set 0 to disable the age check.
# - --test-snapshot creates a temporary snapshot to validate creation + verification via `etcdctl snapshot status`.
# - With --keep-snapshot-on-failure, the temporary snapshot is kept on error for debugging.
# NOTE(review): unlike the sibling checks this script does not `set -euo
# pipefail`; every etcdctl call checks its own exit status -- confirm this
# is intentional.
ETCDCTL=${ETCDCTL:-/usr/local/bin/etcdctl}
# Print usage to stdout.
print_usage() {
cat <<EOF
Usage: $0 --endpoints ENDPOINTS --cacert CA --cert CERT --key KEY [options]
Options:
--warn-db-mb N avertissement si DB >= N MB (default 1024)
--crit-db-mb M critique si DB >= M MB (default 1800)
--timeout SECS etcdctl timeout (default 10)
--test-snapshot tenter de creer un snapshot temporaire et verifier son status
--snapshot-dir DIR repertoire pour snapshots temporaires (default /var/backups/etcd)
--keep-snapshot-on-failure conserver le snapshot temporaire si creation echoue (default false)
--snapshot-max-age HRS verifier qu'il existe un snapshot plus recent que HRS heures (default 24). Mettre 0 pour desactiver.
-h, --help affiche cette aide
EOF
}
# Defaults (environment-overridable; CLI flags below take precedence)
WARN_DB_MB=${WARN_DB_MB:-1024}
CRIT_DB_MB=${CRIT_DB_MB:-1800}
TIMEOUT=${TIMEOUT:-10}
TEST_SNAPSHOT=0
SNAPSHOT_DIR=${SNAPSHOT_DIR:-/var/backups/etcd}
KEEP_SNAPSHOT_ON_FAILURE=0
SNAPSHOT_MAX_AGE_HOURS=${SNAPSHOT_MAX_AGE_HOURS:-24}
# Parse command-line arguments; --help and unknown flags exit 3 (UNKNOWN).
while [[ $# -gt 0 ]]; do
case "$1" in
--endpoints) ENDPOINTS="$2"; shift 2;;
--cacert) CACERT="$2"; shift 2;;
--cert) CERT="$2"; shift 2;;
--key) KEY="$2"; shift 2;;
--warn-db-mb) WARN_DB_MB="$2"; shift 2;;
--crit-db-mb) CRIT_DB_MB="$2"; shift 2;;
--timeout) TIMEOUT="$2"; shift 2;;
--test-snapshot) TEST_SNAPSHOT=1; shift 1;;
--snapshot-dir) SNAPSHOT_DIR="$2"; shift 2;;
--keep-snapshot-on-failure) KEEP_SNAPSHOT_ON_FAILURE=1; shift 1;;
--snapshot-max-age) SNAPSHOT_MAX_AGE_HOURS="$2"; shift 2;;
-h|--help) print_usage; exit 3;;
*) echo "Unknown arg: $1"; print_usage; exit 3;;
esac
done
# Fall back to the standard ETCDCTL_* environment variables when the flags
# were not supplied.
ENDPOINTS=${ENDPOINTS:-${ETCDCTL_ENDPOINTS:-}}
CACERT=${CACERT:-${ETCDCTL_CACERT:-}}
CERT=${CERT:-${ETCDCTL_CERT:-}}
KEY=${KEY:-${ETCDCTL_KEY:-}}
if [[ -z "${ENDPOINTS:-}" || -z "${CACERT:-}" || -z "${CERT:-}" || -z "${KEY:-}" ]]; then
echo "UNKNOWN - missing required args/certs"
print_usage
exit 3
fi
if [[ ! -x "$ETCDCTL" ]]; then
echo "UNKNOWN - etcdctl not found at $ETCDCTL"
exit 3
fi
# Unreadable certificates are reported as CRITICAL (likely permissions).
if [[ ! -r "$CACERT" || ! -r "$CERT" || ! -r "$KEY" ]]; then
echo "CRITICAL - cannot read certificate files (permissions?)"
echo "CACERT=$CACERT CERT=$CERT KEY=$KEY"
exit 2
fi
export ETCDCTL_API=3
# 1) endpoint status: the "simple" output is one CSV line per endpoint:
#    endpoint, ID, version, db size (e.g. "25 kB"), isLeader, isLearner, ...
OUT=$("$ETCDCTL" --command-timeout="${TIMEOUT}s" --endpoints="${ENDPOINTS}" --cacert="$CACERT" --cert="$CERT" --key="$KEY" endpoint status 2>&1) || {
  echo "CRITICAL - etcdctl endpoint status failed: $OUT"
  exit 2
}
leaders=0
total=0
max_db_mb=0
while IFS= read -r line; do
  line=${line//$'\r'/}
  [[ -z "$line" ]] && continue
  total=$((total+1))
  IFS=',' read -r endpoint id version dbsize isLeader isLearner memberCount rest <<<"$line"
  isLeader=$(echo "${isLeader:-}" | tr -d ' ' | tr '[:upper:]' '[:lower:]')
  if [[ "$isLeader" == "true" ]]; then leaders=$((leaders+1)); fi
  db_mb=0
  if [[ -n "${dbsize:-}" ]]; then
    dbsize=$(echo "$dbsize" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')
    num=$(echo "$dbsize" | awk '{print $1}' 2>/dev/null || echo "")
    unit=$(echo "$dbsize" | awk '{print $2}' 2>/dev/null || echo "")
    if [[ "$num" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
      # BUG FIX: bash integer arithmetic ($(( num / 1024 ))) errors out on
      # decimal sizes such as "25.6", which the regex above explicitly
      # accepts; delegate unit conversion to awk, which handles floats.
      case "${unit^^}" in
        B)  db_mb=$(awk -v n="$num" 'BEGIN{printf "%d", n/1048576}') ;;
        KB) db_mb=$(awk -v n="$num" 'BEGIN{printf "%d", n/1024}') ;;
        MB) db_mb=$(awk -v n="$num" 'BEGIN{printf "%.0f", n}') ;;
        GB) db_mb=$(awk -v n="$num" 'BEGIN{printf "%.0f", n*1024}') ;;
        *)  db_mb=$(awk -v n="$num" 'BEGIN{printf "%.0f", n}') ;;
      esac
    fi
  fi
  # Track the largest DB across all endpoints for the threshold checks.
  if (( db_mb > max_db_mb )); then max_db_mb=$db_mb; fi
done <<< "$OUT"
# Cluster-level verdicts from the parsed endpoint data.
if (( total == 0 )); then
echo "CRITICAL - no endpoints returned by etcdctl"
exit 2
fi
if (( leaders == 0 )); then
echo "CRITICAL - no leader found among $total endpoints; detail: $OUT"
exit 2
fi
if (( leaders > 1 )); then
echo "WARNING - multiple leaders detected: $leaders (possible split-brain); detail: $OUT"
exit 1
fi
# DB size thresholds are applied to the largest DB seen on any endpoint.
if (( max_db_mb >= CRIT_DB_MB )); then
echo "CRITICAL - etcd DB size ${max_db_mb}MB >= ${CRIT_DB_MB}MB"
exit 2
fi
if (( max_db_mb >= WARN_DB_MB )); then
echo "WARNING - etcd DB size ${max_db_mb}MB >= ${WARN_DB_MB}MB"
exit 1
fi
# 2) Verify a recent snapshot file exists (optional; 0 disables the check).
SNAP_CHECK_MSG=""
if [[ -n "$SNAPSHOT_MAX_AGE_HOURS" ]]; then
  # SNAPSHOT_MAX_AGE_HOURS == 0 -> disabled
  if (( SNAPSHOT_MAX_AGE_HOURS > 0 )); then
    mkdir -p "$SNAPSHOT_DIR" 2>/dev/null || {
      echo "CRITICAL - cannot create/access snapshot dir $SNAPSHOT_DIR"
      exit 2
    }
    # Newest snapshot by mtime. Parsing `ls -1t` output breaks on unusual
    # filenames; find -printf with a numeric sort is robust.
    latest_snapshot=$(find "$SNAPSHOT_DIR" -maxdepth 1 -name 'snapshot-*.db' -printf '%T@\t%p\n' 2>/dev/null | sort -rn | head -n1 | cut -f2-)
    if [[ -z "$latest_snapshot" ]]; then
      SNAP_CHECK_MSG="no snapshot files found in $SNAPSHOT_DIR"
      echo "CRITICAL - $SNAP_CHECK_MSG (no snapshots)"
      exit 2
    else
      now_s=$(date +%s)
      snap_mtime_s=$(stat -c %Y "$latest_snapshot")
      age_s=$(( now_s - snap_mtime_s ))
      age_h=$(( age_s / 3600 ))
      if (( age_h > SNAPSHOT_MAX_AGE_HOURS )); then
        SNAP_CHECK_MSG="latest snapshot $latest_snapshot is ${age_h}h old (> ${SNAPSHOT_MAX_AGE_HOURS}h)"
        echo "CRITICAL - $SNAP_CHECK_MSG"
        exit 2
      else
        SNAP_CHECK_MSG="latest snapshot $latest_snapshot is ${age_h}h old (<= ${SNAPSHOT_MAX_AGE_HOURS}h)"
      fi
    fi
  fi
fi
# 3) Optional: exercise snapshot creation end-to-end and verify the result.
SNAP_TEST_MSG=""
if (( TEST_SNAPSHOT == 1 )); then
  mkdir -p "$SNAPSHOT_DIR" 2>/dev/null || {
    echo "CRITICAL - cannot create/access snapshot dir $SNAPSHOT_DIR"
    exit 2
  }
  if [[ ! -w "$SNAPSHOT_DIR" ]]; then
    echo "CRITICAL - snapshot dir not writable: $SNAPSHOT_DIR"
    exit 2
  fi
  SNAPFILE=$(mktemp "${SNAPSHOT_DIR}/snapshot-XXXXXX.db") || {
    echo "CRITICAL - mktemp failed in $SNAPSHOT_DIR"
    exit 2
  }
  # EXIT trap: remove the temporary snapshot on success; on failure keep it
  # only when --keep-snapshot-on-failure was requested.
  cleanup() {
    rc=$?
    if [[ $rc -eq 0 ]]; then
      rm -f "$SNAPFILE" 2>/dev/null || true
    else
      if [[ $KEEP_SNAPSHOT_ON_FAILURE -eq 0 ]]; then
        rm -f "$SNAPFILE" 2>/dev/null || true
      else
        echo "NOTICE - snapshot kept at $SNAPFILE for debugging"
      fi
    fi
    return $rc
  }
  trap 'cleanup' EXIT
  SAVE_OUT=$("$ETCDCTL" --command-timeout="${TIMEOUT}s" --endpoints="${ENDPOINTS}" --cacert="$CACERT" --cert="$CERT" --key="$KEY" snapshot save "$SNAPFILE" 2>&1) || {
    echo "CRITICAL - snapshot save failed: $SAVE_OUT"
    exit 2
  }
  STATUS_OUT=$("$ETCDCTL" snapshot status "$SNAPFILE" 2>&1) || {
    echo "CRITICAL - snapshot status failed: $STATUS_OUT"
    exit 2
  }
  # Flatten the status output onto one line.  BUG FIX: `sed 's/ */ /g'`
  # matches zero-length runs and inserts a space between every character;
  # `tr -s ' '` squeezes repeated spaces instead.
  SNAP_TEST_MSG="snapshot test ok: $SNAPFILE ; status: $(echo "$STATUS_OUT" | tr '\n' ' ' | tr -s ' ')"
  # cleanup (EXIT trap) removes the snapshot unless kept for debugging
fi
# Assemble the final OK line, appending the optional snapshot details.
MSG="OK - $total endpoints checked, leaders=$leaders, max_db=${max_db_mb}MB"
for extra in "$SNAP_CHECK_MSG" "$SNAP_TEST_MSG"; do
  if [[ -n "$extra" ]]; then
    MSG+=" ; $extra"
  fi
done
echo "$MSG"
exit 0

View File

@@ -0,0 +1,214 @@
#!/usr/bin/env bash
# check_k8s_apiserver_access
# Counts HTTP 403 responses in the kube-apiserver logs over a time window.
# Exit codes (Nagios): 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
#
# Default: uses journalctl -u kube-apiserver --since="${WINDOW} minutes ago"
# Option --kubectl : uses "kubectl logs" on the pods matching the selector.
#
# Usage examples:
# sudo /usr/lib/nagios/plugins/check_k8s_apiserver_access --window 5 --warn 10 --crit 50
# sudo /usr/lib/nagios/plugins/check_k8s_apiserver_access --kubectl --selector 'k8s-app=kube-apiserver' --window 10 --crit 100
#
set -euo pipefail
PROG_NAME=$(basename "$0")
# Defaults (overridden by the CLI flags parsed below)
WINDOW_MINUTES=5
WARN_THRESHOLD=10
CRIT_THRESHOLD=50
USE_KUBECTL=0
KUBECTL_NAMESPACE="kube-system"
KUBECTL_SELECTOR="" # if empty, we'll try -l component=kube-apiserver or label provided
JOURNAL_UNIT="kube-apiserver" # systemd unit name; adapt if different
PATTERN='' # optional custom grep regex
TOP_N=5 # number of top offenders to show
# Print help text to stdout; the heredoc expands the current defaults.
print_help() {
cat <<EOF
$PROG_NAME - check apiserver 403 rate in logs
Options:
--window N Window in minutes to look back (default: ${WINDOW_MINUTES})
--warn N WARN threshold: count >= N -> WARNING (default: ${WARN_THRESHOLD})
--crit N CRIT threshold: count >= N -> CRITICAL (default: ${CRIT_THRESHOLD})
--kubectl Use 'kubectl logs' on apiserver pods instead of journalctl
--namespace NS Namespace for kubectl logs (default: ${KUBECTL_NAMESPACE})
--selector SEL Label selector for kubectl logs (e.g. "component=kube-apiserver" or "k8s-app=kube-apiserver")
--unit UNIT systemd unit for journalctl (default: ${JOURNAL_UNIT})
--pattern REGEX custom grep regex to detect 403 entries (overrides built-in heuristics)
--top N show top N request lines causing 403 (default ${TOP_N})
-h, --help show this help
Examples:
# check last 5 minutes using journalctl
sudo ./check_apiserver_403.sh --window 5 --warn 20 --crit 50
# check last 10 minutes using kubectl logs for apiserver static-pods
sudo ./check_apiserver_403.sh --kubectl --namespace kube-system --selector 'k8s-app=kube-apiserver' --window 10 --crit 100
EOF
}
# Parse command-line arguments; --help and unknown flags exit 3 (UNKNOWN).
while [[ $# -gt 0 ]]; do
case "$1" in
--window) WINDOW_MINUTES="$2"; shift 2;;
--warn) WARN_THRESHOLD="$2"; shift 2;;
--crit) CRIT_THRESHOLD="$2"; shift 2;;
--kubectl) USE_KUBECTL=1; shift 1;;
--namespace) KUBECTL_NAMESPACE="$2"; shift 2;;
--selector) KUBECTL_SELECTOR="$2"; shift 2;;
--unit) JOURNAL_UNIT="$2"; shift 2;;
--pattern) PATTERN="$2"; shift 2;;
--top) TOP_N="$2"; shift 2;;
-h|--help) print_help; exit 3;;
*) echo "Unknown argument: $1"; print_help; exit 3;;
esac
done
# Validate numeric args
if ! [[ "$WINDOW_MINUTES" =~ ^[0-9]+$ ]]; then echo "UNKNOWN - invalid --window"; exit 3; fi
if ! [[ "$WARN_THRESHOLD" =~ ^[0-9]+$ ]]; then echo "UNKNOWN - invalid --warn"; exit 3; fi
if ! [[ "$CRIT_THRESHOLD" =~ ^[0-9]+$ ]]; then echo "UNKNOWN - invalid --crit"; exit 3; fi
if ! [[ "$TOP_N" =~ ^[0-9]+$ ]]; then echo "UNKNOWN - invalid --top"; exit 3; fi
# Build detection regex if not provided
if [[ -z "$PATTERN" ]]; then
# heuristics: try to match common apiserver log patterns that indicate a 403/Forbidden
# examples: "\" 403 ", "code=403", "403 Forbidden", "Forbidden" combined with "Denied" etc.
# NOTE(review): the bare `Forbidden` alternative also matches any log line
# containing the word; supply --pattern if this over-counts in practice.
PATTERN='(" 403 |\" 403 |code=403|403 Forbidden|Forbidden|\"Reason=Forbidden\"|\"message=.*Forbidden)'
# note: portable grep -E will accept that pattern
fi
# ---- log collection ----
# Collect apiserver logs from the systemd journal for JOURNAL_UNIT over the
# configured window. Prints raw log lines; returns 1 when journalctl is
# missing or fails (e.g. unknown unit).
get_logs_journal() {
  command -v journalctl >/dev/null 2>&1 || {
    echo "ERROR_NO_JOURNAL" 1>&2
    return 1
  }
  # --no-pager keeps output streaming; journalctl fails for unknown units.
  journalctl -u "${JOURNAL_UNIT}" --since="${WINDOW_MINUTES} minutes ago" --no-pager 2>/dev/null || return 1
}
# Collect apiserver logs via `kubectl logs`.
# Tries the configured selector, then common control-plane labels, and as a
# last resort any pod in the namespace whose name contains "apiserver".
# Prints the concatenated logs; returns 1 when nothing could be collected.
get_logs_kubectl() {
  if ! command -v kubectl >/dev/null 2>&1; then
    echo "ERROR_NO_KUBECTL" 1>&2
    return 1
  fi
  local sel="${KUBECTL_SELECTOR}"
  if [[ -z "$sel" ]]; then
    # Probe common control-plane labels until one matches pods.
    local try count
    for try in 'component=kube-apiserver' 'k8s-app=kube-apiserver' 'tier=control-plane' ''; do
      if [[ -z "$try" ]]; then
        sel=""
        break
      fi
      count=$(kubectl -n "${KUBECTL_NAMESPACE}" get pods -l "${try}" --no-headers 2>/dev/null | wc -l || echo 0)
      if [[ "$count" -gt 0 ]]; then
        sel="${try}"
        break
      fi
    done
  fi
  local out="" p
  if [[ -z "$sel" ]]; then
    # Fallback: filter pods by name containing "apiserver" (case-insensitive,
    # matching the old `grep -qi`).
    local pods
    pods=$(kubectl -n "${KUBECTL_NAMESPACE}" get pods --no-headers -o custom-columns=':metadata.name' 2>/dev/null || true)
    [[ -z "$pods" ]] && return 1
    while IFS= read -r p; do
      [[ -z "$p" ]] && continue
      if [[ "${p,,}" == *apiserver* ]]; then
        # BUG FIX: $'\n' inside double quotes is NOT a newline (it appended
        # the four literal characters $, ', \, n); append a real newline
        # outside the double quotes. Expansions are also quoted now.
        out+="$(kubectl -n "${KUBECTL_NAMESPACE}" logs --since="${WINDOW_MINUTES}m" "${p}" --all-containers 2>/dev/null || true)"$'\n'
      fi
    done <<< "$pods"
    printf '%s' "$out"
    return 0
  fi
  # Gather logs from every pod matching the selector.
  local podnames
  podnames=$(kubectl -n "${KUBECTL_NAMESPACE}" get pods -l "${sel}" -o custom-columns=':metadata.name' --no-headers 2>/dev/null || true)
  [[ -z "$podnames" ]] && return 1
  while IFS= read -r p; do
    [[ -z "$p" ]] && continue
    out+="$(kubectl -n "${KUBECTL_NAMESPACE}" logs --since="${WINDOW_MINUTES}m" "${p}" --all-containers 2>/dev/null || true)"$'\n'
  done <<< "$podnames"
  printf '%s' "$out"
  return 0
}
# Retrieve the raw log text into LOGS using the selected backend.
LOGS=""
if (( USE_KUBECTL == 1 )); then
if ! LOGS=$(get_logs_kubectl); then
echo "CRITICAL - failed to collect logs via kubectl (check KUBECONFIG, namespace/selector, permissions)"
exit 2
fi
else
if ! LOGS=$(get_logs_journal); then
echo "CRITICAL - failed to collect logs via journalctl for unit '${JOURNAL_UNIT}' (check unit name/permissions)"
exit 2
fi
fi
# If logs empty -> OK (no traffic) BUT treat with UNKNOWN if we expected logs
if [[ -z "$LOGS" ]]; then
echo "OK - no apiserver logs found in the last ${WINDOW_MINUTES}m (count=0)"
exit 0
fi
# Count matches of 403 using grep -E (case-insensitive).
# `|| true` guards pipefail: grep exits 1 when nothing matches but still
# prints the count "0".
count_403=$(printf '%s\n' "$LOGS" | grep -E -i -c "$PATTERN" || true)
count_403=${count_403:-0}
# Optionally extract top request lines that caused 403
# Try to extract HTTP method + path if present, otherwise use whole line truncated
top_requests=$(printf '%s\n' "$LOGS" | grep -E -i "$PATTERN" || true)
if [[ -n "$top_requests" ]]; then
# try to extract method+path like: "GET /api/..." or GET /api/...
top_paths=$(printf '%s\n' "$top_requests" | grep -oE '(GET|POST|PUT|DELETE|PATCH) [^" ]+' | sed 's/"$//' | sort | uniq -c | sort -rn | head -n "${TOP_N}" || true)
if [[ -z "$top_paths" ]]; then
# fallback: show most frequent truncated lines
top_paths=$(printf '%s\n' "$top_requests" | sed 's/^[[:space:]]*//; s/[[:space:]]\+/ /g' | cut -c1-200 | sort | uniq -c | sort -rn | head -n "${TOP_N}" || true)
fi
else
top_paths=""
fi
# Map the 403 count to a Nagios state; both thresholds are inclusive (>=).
status=0
state="OK"
if (( count_403 >= CRIT_THRESHOLD )); then
  status=2
  state="CRITICAL"
elif (( count_403 >= WARN_THRESHOLD )); then
  status=1
  state="WARNING"
fi
# Single status line; the top offenders (when any) are folded onto it with
# '|' separators, dropping the trailing separator.
msg="${state} - ${count_403} occurrences of 403 in last ${WINDOW_MINUTES}m (warn=${WARN_THRESHOLD},crit=${CRIT_THRESHOLD})"
if [[ -n "$top_paths" ]]; then
  joined=$(printf '%s' "$top_paths" | tr '\n' '|')
  msg+=" ; top=${TOP_N}: ${joined%|}"
fi
echo "$msg"
exit $status

View File

@@ -0,0 +1,138 @@
#!/usr/bin/env bash
# check_k8s_deployments
# Nagios/NRPE plugin: flag Deployments whose status.availableReplicas is
# lower than spec.replicas.
# Exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
#
# Usage:
#   sudo /usr/lib/nagios/plugins/check_k8s_deployments [--warn N] [--crit M] [--ignore-ns ns1,ns2] [--namespaces ns1,ns2] [--age-min MINUTES]
#
# Examples:
#   sudo /usr/lib/nagios/plugins/check_k8s_deployments --crit 1
#   sudo /usr/lib/nagios/plugins/check_k8s_deployments --ignore-ns kube-system,monitoring
#
set -euo pipefail

WARN=${WARN:-0}   # WARNING when >= WARN deployments are degraded
CRIT=${CRIT:-1}   # CRITICAL when >= CRIT deployments are degraded (default: any)
IGNORE_NS=""
INCLUDE_NS=""
AGE_MIN=0

print_usage() {
  cat <<EOF
Usage: $0 [--warn N] [--crit M] [--ignore-ns ns1,ns2] [--namespaces ns1,ns2] [--age-min MINUTES]
--warn N : seuil warn si >=N déploiements en erreur (default 0)
--crit M : seuil crit si >=M déploiements en erreur (default 1)
--ignore-ns LIST : comma separated namespaces to ignore (default none)
--namespaces LIST: comma separated namespaces to check only (default all)
--age-min N : ignore deployments created less than N minutes ago (avoid flapping during rollout)
EOF
}

# Parse command line arguments.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --warn) WARN="$2"; shift 2;;
    --crit) CRIT="$2"; shift 2;;
    --ignore-ns) IGNORE_NS="$2"; shift 2;;
    --namespaces) INCLUDE_NS="$2"; shift 2;;
    --age-min) AGE_MIN="$2"; shift 2;;
    -h|--help) print_usage; exit 3;;
    *) echo "Unknown arg: $1"; print_usage; exit 3;;
  esac
done

if ! command -v kubectl >/dev/null 2>&1; then
  echo "UNKNOWN - kubectl not found"
  exit 3
fi

# Build an anchored alternation regex ("^a$|^b$") from a comma separated
# namespace list; prints nothing for an empty list.
build_ns_regex() {
  local csv=$1 regex="" ns
  local -a parts=()
  [[ -z "$csv" ]] && return 0
  IFS=',' read -ra parts <<< "$csv"
  for ns in "${parts[@]}"; do
    regex="${regex}|^${ns}\$"
  done
  printf '%s' "${regex#|}"
}
ignore_pattern=$(build_ns_regex "$IGNORE_NS")
include_pattern=$(build_ns_regex "$INCLUDE_NS")

# Collect: namespace, name, desired, available, creationTimestamp (tab separated).
failures=()
mapfile -t lines < <(kubectl get deploy -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.name}{"\t"}{.spec.replicas}{"\t"}{.status.availableReplicas}{"\t"}{.metadata.creationTimestamp}{"\n"}{end}' 2>/dev/null || true)
now_s=$(date +%s)

for line in "${lines[@]}"; do
  # skip empty lines
  [[ -z "${line}" ]] && continue
  # One read replaces the five awk processes previously spawned per line.
  IFS=$'\t' read -r ns name desired available created <<< "$line"
  # normalize missing jsonpath fields to 0
  desired=${desired:-0}
  available=${available:-0}
  # Namespace filtering with bash regex matching; 'egrep' is obsolescent in
  # GNU grep and spawned a process per deployment.
  if [[ -n "$include_pattern" && ! "$ns" =~ $include_pattern ]]; then
    continue
  fi
  if [[ -n "$ignore_pattern" && "$ns" =~ $ignore_pattern ]]; then
    continue
  fi
  # Skip deployments younger than AGE_MIN minutes (likely still rolling out).
  if [[ -n "$created" ]] && (( AGE_MIN > 0 )); then
    # unparsable timestamps yield epoch 0, i.e. "very old" => never skipped
    created_s=$(date -d "$created" +%s 2>/dev/null || echo 0)
    age_min=$(( (now_s - created_s) / 60 ))
    if (( age_min < AGE_MIN )); then
      continue
    fi
  fi
  if (( available < desired )); then
    failures+=("${ns}/${name} (desired=${desired},available=${available})")
  fi
done

count=${#failures[@]}
if (( count == 0 )); then
  echo "OK - all deployments report desired==available"
  exit 0
fi
# Severity decision against thresholds.
if (( count >= CRIT )); then
  echo "CRITICAL - ${count} deployments not available: ${failures[*]}"
  exit 2
elif (( count >= WARN )); then
  echo "WARNING - ${count} deployments not available: ${failures[*]}"
  exit 1
else
  echo "OK - ${count} deployments not available but below thresholds"
  exit 0
fi

View File

@@ -0,0 +1,232 @@
#!/usr/bin/env bash
# check_k8s_jobs_cronjobs
# Nagios/NRPE plugin: check the health of Kubernetes Jobs and CronJobs.
# Exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
#
# Checks performed:
#   - Jobs with failures (.status.failed > 0) or active Jobs running longer
#     than --age-min minutes
#   - recent Warning events attached to Jobs (last --recent-minutes)
#   - CronJobs (unless suspended) whose lastScheduleTime is older than
#     --cron-max-age minutes
#
# Usage (examples):
#   sudo /usr/lib/nagios/plugins/check_k8s_jobs_cronjobs --crit 1 --recent-minutes 5
#   sudo /usr/lib/nagios/plugins/check_k8s_jobs_cronjobs --ignore-ns kube-system --cron-max-age 120
#
set -euo pipefail

# Defaults (overridable via environment)
WARN=${WARN:-0}
CRIT=${CRIT:-1}
IGNORE_NS=""
INCLUDE_NS=""
AGE_MIN=${AGE_MIN:-60}
RECENT_MINUTES=${RECENT_MINUTES:-5}
CHECK_CRON=1
CRON_MAX_AGE_MIN=${CRON_MAX_AGE_MIN:-60}

print_usage() {
  cat <<EOF
Usage: $0 [options]
Options:
--warn N seuil WARN si >= N objets en erreur (default 0)
--crit M seuil CRIT si >= M objets en erreur (default 1)
--ignore-ns ns1,ns2 namespaces à ignorer
--namespaces ns1,ns2 limiter aux namespaces donnés (comma separated)
--age-min MINUTES considérer un job "actif" normal si démarré moins de MINUTES (default 60)
--recent-minutes MIN chercher événements de Job (Warning) dans les MIN dernières minutes (default 5)
--check-cron activer la vérification des CronJobs (default ON)
--no-cron désactiver la vérification des CronJobs
--cron-max-age MINUTES si lastScheduleTime > MINUTES => alerter (default 60). Mettre 0 pour désactiver.
-h, --help : affiche l'aide
EOF
}

# Parse args
while [[ $# -gt 0 ]]; do
  case "$1" in
    --warn) WARN="$2"; shift 2;;
    --crit) CRIT="$2"; shift 2;;
    --ignore-ns) IGNORE_NS="$2"; shift 2;;
    --namespaces) INCLUDE_NS="$2"; shift 2;;
    --age-min) AGE_MIN="$2"; shift 2;;
    --recent-minutes) RECENT_MINUTES="$2"; shift 2;;
    # BUGFIX: --check-cron was documented in the usage text but not accepted
    # by the parser (it fell through to "Unknown arg" and exited 3).
    --check-cron) CHECK_CRON=1; shift 1;;
    --no-cron) CHECK_CRON=0; shift 1;;
    --cron-max-age) CRON_MAX_AGE_MIN="$2"; shift 2;;
    -h|--help) print_usage; exit 3;;
    *) echo "Unknown arg: $1"; print_usage; exit 3;;
  esac
done

if ! command -v kubectl >/dev/null 2>&1; then
  echo "UNKNOWN - kubectl not found"
  exit 3
fi

# Build an anchored alternation regex ("^a$|^b$") from a comma separated
# namespace list; prints nothing for an empty list.
build_ns_regex() {
  local csv=$1 regex="" ns
  local -a parts=()
  [[ -z "$csv" ]] && return 0
  IFS=',' read -ra parts <<< "$csv"
  for ns in "${parts[@]}"; do
    regex="${regex}|^${ns}\$"
  done
  printf '%s' "${regex#|}"
}
ignore_pattern=$(build_ns_regex "$IGNORE_NS")
include_pattern=$(build_ns_regex "$INCLUDE_NS")

# Return 0 when the namespace passes the include/ignore filters.
# Uses bash regex matching instead of the obsolescent 'egrep'.
ns_allowed() {
  local ns="$1"
  if [[ -n "$include_pattern" && ! "$ns" =~ $include_pattern ]]; then
    return 1
  fi
  if [[ -n "$ignore_pattern" && "$ns" =~ $ignore_pattern ]]; then
    return 1
  fi
  return 0
}

now_s=$(date +%s)
# Initialize problems array explicitly (required with set -u on older bash)
problems=()

# ---------------------------
# 1) Inspect Jobs
# ---------------------------
# Fields: namespace, name, active, succeeded, failed, startTime, completionTime
mapfile -t job_lines < <(kubectl get jobs -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.name}{"\t"}{.status.active}{"\t"}{.status.succeeded}{"\t"}{.status.failed}{"\t"}{.status.startTime}{"\t"}{.status.completionTime}{"\n"}{end}' 2>/dev/null || true)
for line in "${job_lines[@]}"; do
  [[ -z "$line" ]] && continue
  # One read replaces five awk processes per line.
  IFS=$'\t' read -r ns name active succeeded failed start completion <<< "$line"
  active=${active:-0}
  succeeded=${succeeded:-0}
  failed=${failed:-0}
  ns_allowed "$ns" || continue
  # 1.a) Jobs with failures
  if (( failed > 0 )); then
    problems+=("Job ${ns}/${name} failedCount=${failed}")
    continue
  fi
  # 1.b) Active jobs running too long
  if (( active > 0 )); then
    if [[ -n "$start" && "$start" != "null" ]]; then
      # convert start timestamp to epoch (GNU date); 0 on parse failure
      start_s=$(date -d "$start" +%s 2>/dev/null || echo 0)
      if (( start_s > 0 )); then
        age_min=$(( (now_s - start_s) / 60 ))
        if (( age_min >= AGE_MIN )); then
          problems+=("Job ${ns}/${name} active for ${age_min}min >= ${AGE_MIN}min")
        fi
      fi
    else
      # no start time but active > 0 -> flag
      problems+=("Job ${ns}/${name} active but no startTime recorded")
    fi
  fi
done

# 1.c) Recent Job warning events (type=Warning) in last RECENT_MINUTES
if (( RECENT_MINUTES > 0 )); then
  mapfile -t event_lines < <(kubectl get events --all-namespaces --field-selector involvedObject.kind=Job,type=Warning -o custom-columns='NAMESPACE:.metadata.namespace,NAME:.involvedObject.name,LAST:.lastTimestamp,REASON:.reason,MESSAGE:.message' --no-headers 2>/dev/null || true)
  cutoff_s=$(( now_s - RECENT_MINUTES * 60 ))
  for ev in "${event_lines[@]}"; do
    [[ -z "$ev" ]] && continue
    # custom-columns output is space padded: split on default whitespace IFS
    read -r ns name last _ <<< "$ev"
    ns_allowed "$ns" || continue
    if [[ -n "$last" && "$last" != "<none>" ]]; then
      ts=$(date -d "$last" +%s 2>/dev/null || echo 0)
      if (( ts >= cutoff_s )); then
        problems+=("Job event Warning ${ns}/${name} at $last")
      fi
    fi
  done
fi

# ---------------------------
# 2) Inspect CronJobs (optional)
# ---------------------------
if (( CHECK_CRON == 1 )) && (( CRON_MAX_AGE_MIN > 0 )); then
  # Fields: namespace, name, suspend (true/false/null), lastScheduleTime
  mapfile -t cron_lines < <(kubectl get cronjob -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.name}{"\t"}{.spec.suspend}{"\t"}{.status.lastScheduleTime}{"\n"}{end}' 2>/dev/null || true)
  for line in "${cron_lines[@]}"; do
    [[ -z "$line" ]] && continue
    IFS=$'\t' read -r ns name suspend last <<< "$line"
    ns_allowed "$ns" || continue
    # a suspended CronJob is not a problem
    [[ "$suspend" == "true" ]] && continue
    if [[ -z "$last" || "$last" == "null" ]]; then
      # Never scheduled yet: useful to detect misconfigured cronjobs
      problems+=("CronJob ${ns}/${name} has no lastScheduleTime (never scheduled?)")
      continue
    fi
    last_s=$(date -d "$last" +%s 2>/dev/null || echo 0)
    if (( last_s > 0 )); then
      age_min=$(( (now_s - last_s) / 60 ))
      if (( age_min > CRON_MAX_AGE_MIN )); then
        problems+=("CronJob ${ns}/${name} lastSchedule ${age_min}min ago > ${CRON_MAX_AGE_MIN}min")
      fi
    else
      problems+=("CronJob ${ns}/${name} lastScheduleTime unparsable: ${last}")
    fi
  done
fi

# ---------------------------
# Final decision & output
# ---------------------------
count=${#problems[@]}
if (( count == 0 )); then
  echo "OK - Jobs/CronJobs checks passed"
  exit 0
fi
# Severity decision
if (( count >= CRIT )); then
  echo "CRITICAL - ${count} problems found: ${problems[*]}"
  exit 2
elif (( count >= WARN )); then
  echo "WARNING - ${count} problems found: ${problems[*]}"
  exit 1
else
  echo "OK - ${count} problems found but below thresholds"
  exit 0
fi

View File

@@ -0,0 +1,194 @@
#!/usr/bin/env bash
# check_k8s_pki_certs
# Nagios/NRPE plugin: scan PEM certificates under /etc/kubernetes/pki (by
# default) and alert when a certificate expires within --warn-days/--crit-days.
# Exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
#
# Usage:
#   sudo /usr/lib/nagios/plugins/check_k8s_pki_certs
#   sudo /usr/lib/nagios/plugins/check_k8s_pki_certs --path /etc/kubernetes/ssl --warn-days 30 --crit-days 7 --recursive
#
set -euo pipefail

PKI_PATH=${PKI_PATH:-/etc/kubernetes/pki}
WARN_DAYS=${WARN_DAYS:-30}
CRIT_DAYS=${CRIT_DAYS:-7}
RECURSIVE=0

print_usage() {
  cat <<EOF
Usage: $0 [--path PATH] [--warn-days N] [--crit-days M] [--recursive] [-h|--help]
Options:
--path PATH répertoire à scanner (default: $PKI_PATH)
--warn-days N seuil warning en jours (default: $WARN_DAYS)
--crit-days M seuil critical en jours (default: $CRIT_DAYS)
--recursive scanner récursivement PATH et sous-dirs
-h, --help affiche cette aide
EOF
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --path) PKI_PATH="$2"; shift 2;;
    --warn-days) WARN_DAYS="$2"; shift 2;;
    --crit-days) CRIT_DAYS="$2"; shift 2;;
    --recursive) RECURSIVE=1; shift 1;;
    -h|--help) print_usage; exit 3;;
    *) echo "Unknown arg: $1"; print_usage; exit 3;;
  esac
done

# Required external tools (one loop instead of five copy/pasted checks).
for tool in openssl date sed awk find; do
  if ! command -v "$tool" >/dev/null 2>&1; then
    echo "UNKNOWN - $tool not found"
    exit 3
  fi
done

# Resolve symlinks (realpath or readlink -f) so a linked pki directory is
# scanned at its real location.
if command -v realpath >/dev/null 2>&1; then
  PKI_PATH_RESOLVED=$(realpath -e "$PKI_PATH" 2>/dev/null || true)
else
  PKI_PATH_RESOLVED=$(readlink -f "$PKI_PATH" 2>/dev/null || true)
fi
if [[ -n "$PKI_PATH_RESOLVED" && -d "$PKI_PATH_RESOLVED" ]]; then
  PKI_PATH="$PKI_PATH_RESOLVED"
fi
if [[ ! -d "$PKI_PATH" ]]; then
  echo "UNKNOWN - path $PKI_PATH not found or not a directory"
  exit 3
fi

now_s=$(date +%s)
# Initialize arrays explicitly to avoid unbound-variable errors with set -u
critical=()
warning=()
ok=()
errors=()
file_count=0
cert_count=0

# Build find command: follow symlinks (-L) so symlinked dirs/files are handled
if [[ $RECURSIVE -eq 1 ]]; then
  FIND_CMD=(find -L "$PKI_PATH" -type f -print0)
else
  FIND_CMD=(find -L "$PKI_PATH" -maxdepth 1 -type f -print0)
fi

# Iterate files found (NUL-delimited to survive any filename)
while IFS= read -r -d '' file; do
  file_count=$((file_count+1))
  # skip unreadable files
  if [[ ! -r "$file" ]]; then
    errors+=("Unreadable file: $file")
    continue
  fi
  # skip files without a PEM certificate marker (e.g. private keys)
  if ! grep -q "BEGIN CERTIFICATE" "$file" 2>/dev/null; then
    continue
  fi
  # Emit "start:end" line-number pairs for each certificate block in the file
  mapfile -t pairs < <(awk '
    /BEGIN CERTIFICATE/ {start=NR}
    /END CERTIFICATE/ && start { print start ":" NR; start=0 }
  ' "$file" 2>/dev/null || true)
  if [[ ${#pairs[@]} -eq 0 ]]; then
    errors+=("No certificate block pairs found in $file")
    continue
  fi
  for p in "${pairs[@]}"; do
    start=${p%%:*}
    end=${p##*:}
    # extract the PEM block by line range
    cert_block=$(sed -n "${start},${end}p" "$file" 2>/dev/null || true)
    if [[ -z "$cert_block" ]]; then
      errors+=("Failed to extract certificate block ${start}-${end} from $file")
      continue
    fi
    # openssl x509 reads the PEM block from stdin by default
    endline=$(printf '%s\n' "$cert_block" | openssl x509 -noout -enddate 2>/dev/null) || {
      errors+=("Failed to parse certificate block ${start}-${end} in $file with openssl")
      continue
    }
    # sample endline: notAfter=Oct 27 16:15:30 2125 GMT
    notAfter=${endline#notAfter=}
    expiry_s=$(date -d "$notAfter" +%s 2>/dev/null) || {
      errors+=("Cannot parse date '$notAfter' for cert in $file")
      continue
    }
    days_left=$(( (expiry_s - now_s) / 86400 ))
    subj=$(printf '%s\n' "$cert_block" | openssl x509 -noout -subject 2>/dev/null || true)
    # BUGFIX: OpenSSL >= 1.1 prints "subject=CN = foo" (no space after '='),
    # while 1.0 printed "subject= CN=foo". The previous ${subj#subject= }
    # only matched the legacy format and left the "subject=" prefix in the
    # output on current systems. Strip the prefix, then an optional space.
    subj=${subj#subject=}
    subj=${subj# }
    info="${file} :: ${subj} :: expires in ${days_left}d on ${notAfter}"
    cert_count=$((cert_count+1))
    if (( days_left <= CRIT_DAYS )); then
      critical+=("$info")
    elif (( days_left <= WARN_DAYS )); then
      warning+=("$info")
    else
      ok+=("$info")
    fi
  done
done < <("${FIND_CMD[@]}")

# Results and exit codes: any parse error makes the whole check UNKNOWN so a
# broken PKI file is never silently ignored.
if [[ ${#errors[@]} -gt 0 ]]; then
  echo "UNKNOWN - parsing errors: ${errors[*]}"
  exit 3
fi
if (( cert_count == 0 )); then
  echo "UNKNOWN - no certificates found under $PKI_PATH"
  exit 3
fi
if (( ${#critical[@]} > 0 )); then
  echo "CRITICAL - ${#critical[@]} certificate(s) expiring soon (<= ${CRIT_DAYS} days):"
  for c in "${critical[@]}"; do
    echo " - $c"
  done
  if (( ${#warning[@]} > 0 )); then
    echo "WARN (additional ${#warning[@]} cert(s) <= ${WARN_DAYS} days):"
    for w in "${warning[@]}"; do
      echo " - $w"
    done
  fi
  exit 2
fi
if (( ${#warning[@]} > 0 )); then
  echo "WARNING - ${#warning[@]} certificate(s) expiring within ${WARN_DAYS} days:"
  for w in "${warning[@]}"; do
    echo " - $w"
  done
  exit 1
fi
echo "OK - ${cert_count} cert(s) checked in ${file_count} file(s), no expiry within ${WARN_DAYS} days"
exit 0

View File

@@ -0,0 +1,49 @@
#!/usr/bin/env bash
# check_k8s_pod_restarts
# Nagios/NRPE plugin: alert when pods were killed ("Killing" events) within
# the last X minutes.
# Exit codes: 0=OK, 2=CRITICAL, 3=UNKNOWN
#
# Usage:
#   sudo /usr/lib/nagios/plugins/check_k8s_pod_restarts [minutes]
#
MINUTES=${1:-5}

# Require kubectl
if ! command -v kubectl >/dev/null 2>&1; then
  echo "UNKNOWN - kubectl not found"
  exit 3
fi

# Cutoff as epoch seconds (GNU date)
if ! cutoff=$(date -d "$MINUTES minutes ago" +%s 2>/dev/null); then
  echo "UNKNOWN - date parsing failed (on macOS use gdate from coreutils)"
  exit 3
fi

matches=()
# BUGFIX: kubectl's custom-columns output is padded with SPACES, not tabs.
# The previous 'IFS=$'\''\t'\'' read' put the whole line into the first field,
# leaving "last" empty, so every event was skipped and the check always
# reported OK. Split on the default whitespace IFS instead; "msg" greedily
# receives the remainder of the line.
while read -r ns pod last msg; do
  # skip blank/short lines
  [[ -z "${last:-}" ]] && continue
  # convert the event timestamp to epoch; skip events date(1) cannot parse
  if ! ts=$(date -d "$last" +%s 2>/dev/null); then
    continue
  fi
  if (( ts >= cutoff )); then
    # safe message truncation
    shortmsg=$(printf '%s' "$msg" | tr '\n' ' ' | cut -c1-300)
    # BUGFIX: use real tab characters ($'\t') as field separators; the old
    # "$ns\t$pod\t..." stored a literal backslash-t, which the later
    # IFS=$'\t' read could not split on.
    matches+=("${ns}"$'\t'"${pod}"$'\t'"${last}"$'\t'"${shortmsg}")
  fi
done < <(kubectl get events --all-namespaces --field-selector reason=Killing -o custom-columns='NAMESPACE:.metadata.namespace,NAME:.involvedObject.name,LAST:.lastTimestamp,MESSAGE:.message' --no-headers 2>/dev/null || true)

if [[ ${#matches[@]} -eq 0 ]]; then
  echo "OK - no pod restarts in the last ${MINUTES} minutes"
  exit 0
else
  echo "CRITICAL - ${#matches[@]} pod restarts in the last ${MINUTES} minutes:"
  for m in "${matches[@]}"; do
    IFS=$'\t' read -r ns pod last shortmsg <<< "$m"
    echo " - ${ns}/${pod} at ${last} : ${shortmsg}"
  done
  exit 2
fi

202
files/nrpe/check_k8s_pv_pvc Normal file
View File

@@ -0,0 +1,202 @@
#!/usr/bin/env bash
# check_k8s_pv_pvc
# Nagios/NRPE plugin: check Kubernetes PersistentVolumes (PV) and
# PersistentVolumeClaims (PVC).
# Exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
#
# Usage examples:
#   sudo /usr/lib/nagios/plugins/check_k8s_pv_pvc --crit 1               # CRITICAL si >=1 problème
#   sudo /usr/lib/nagios/plugins/check_k8s_pv_pvc --ignore-ns kube-system # ignorer kube-system
#   sudo /usr/lib/nagios/plugins/check_k8s_pv_pvc --pvc-age-min 10 --crit 2 # ignorer PVC récents <10min, CRIT si >=2
#
set -euo pipefail

# Defaults (overridable via environment)
WARN=${WARN:-0}
CRIT=${CRIT:-1}
IGNORE_NS=""
INCLUDE_NS=""
PVC_AGE_MIN=${PVC_AGE_MIN:-5}   # minutes: ignore PVCs created less than this long ago
CHECK_PV=1
CHECK_PVC=1

print_usage() {
  cat <<EOF
Usage: $0 [options]
Options:
--warn N seuil WARN si >= N objets en erreur (default 0)
--crit M seuil CRIT si >= M objets en erreur (default 1)
--ignore-ns a,b,c namespaces à ignorer (comma separated)
--namespaces a,b limiter aux namespaces donnés (comma separated)
--pvc-age-min N ignore PVC créés il y a moins de N minutes (default 5)
--no-pv disable PV checks
--no-pvc disable PVC checks
-h, --help affiche cette aide
EOF
}

# Parse args
while [[ $# -gt 0 ]]; do
  case "$1" in
    --warn) WARN="$2"; shift 2;;
    --crit) CRIT="$2"; shift 2;;
    --ignore-ns) IGNORE_NS="$2"; shift 2;;
    --namespaces) INCLUDE_NS="$2"; shift 2;;
    --pvc-age-min) PVC_AGE_MIN="$2"; shift 2;;
    --no-pv) CHECK_PV=0; shift 1;;
    --no-pvc) CHECK_PVC=0; shift 1;;
    -h|--help) print_usage; exit 3;;
    *) echo "Unknown arg: $1"; print_usage; exit 3;;
  esac
done

if ! command -v kubectl >/dev/null 2>&1; then
  echo "UNKNOWN - kubectl not found"
  exit 3
fi

# Build an anchored alternation regex ("^a$|^b$") from a comma separated
# namespace list; prints nothing for an empty list.
build_ns_regex() {
  local csv=$1 regex="" ns
  local -a parts=()
  [[ -z "$csv" ]] && return 0
  IFS=',' read -ra parts <<< "$csv"
  for ns in "${parts[@]}"; do
    regex="${regex}|^${ns}\$"
  done
  printf '%s' "${regex#|}"
}
ignore_pattern=$(build_ns_regex "$IGNORE_NS")
include_pattern=$(build_ns_regex "$INCLUDE_NS")

now_s=$(date +%s)
# Initialize problems array explicitly (required with set -u on older bash)
problems=()

# Return 0 when the namespace passes the include/ignore filters.
# Uses bash regex matching instead of the obsolescent 'egrep'.
ns_allowed() {
  local ns="$1"
  if [[ -n "$include_pattern" && ! "$ns" =~ $include_pattern ]]; then
    return 1
  fi
  if [[ -n "$ignore_pattern" && "$ns" =~ $ignore_pattern ]]; then
    return 1
  fi
  return 0
}

# 1) Check PVCs: any non-Bound phase (Pending, Lost, Failed) is a problem,
#    as is a Bound PVC without an assigned volumeName.
if (( CHECK_PVC == 1 )); then
  # gather: namespace, name, phase, volumeName, creationTimestamp
  mapfile -t pvc_lines < <(kubectl get pvc -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.name}{"\t"}{.status.phase}{"\t"}{.spec.volumeName}{"\t"}{.metadata.creationTimestamp}{"\n"}{end}' 2>/dev/null || true)
  for line in "${pvc_lines[@]}"; do
    [[ -z "$line" ]] && continue
    # One read replaces five awk processes per line.
    IFS=$'\t' read -r ns name phase vol created <<< "$line"
    ns_allowed "$ns" || continue
    # ignore freshly created PVCs to avoid noise during normal provisioning
    if [[ -n "$created" ]] && (( PVC_AGE_MIN > 0 )); then
      created_s=$(date -d "$created" +%s 2>/dev/null || echo 0)
      age_min=$(( (now_s - created_s) / 60 ))
      if (( age_min < PVC_AGE_MIN )); then
        continue
      fi
    fi
    if [[ "$phase" != "Bound" ]]; then
      problems+=("PVC ${ns}/${name} phase=${phase} created=${created}")
      continue
    fi
    if [[ -z "$vol" || "$vol" == "null" ]]; then
      problems+=("PVC ${ns}/${name} Bound but no volumeName assigned")
      continue
    fi
  done
fi

# 2) Check PVs: Released/Failed unbound PVs are problems; Bound PVs must
#    carry a resolvable claimRef.
if (( CHECK_PV == 1 )); then
  # gather: name, phase, capacity.storage, claimRef.namespace, claimRef.name, reclaimPolicy
  mapfile -t pv_lines < <(kubectl get pv -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.phase}{"\t"}{.spec.capacity.storage}{"\t"}{.spec.claimRef.namespace}{"\t"}{.spec.claimRef.name}{"\t"}{.spec.persistentVolumeReclaimPolicy}{"\n"}{end}' 2>/dev/null || true)
  for line in "${pv_lines[@]}"; do
    [[ -z "$line" ]] && continue
    IFS=$'\t' read -r name phase cap claim_ns claim_name reclaim <<< "$line"
    if [[ -n "$claim_ns" && "$claim_ns" != "null" ]]; then
      # bound PVs are only reported when their claim's namespace is allowed
      ns_allowed "$claim_ns" || continue
    else
      # no claim namespace => PV not bound to a claim
      if [[ "$phase" == "Released" || "$phase" == "Failed" ]]; then
        problems+=("PV ${name} phase=${phase} reclaim=${reclaim} (no claim)")
        continue
      fi
      # NOTE: an Available (unbound) PV could be flagged as possibly orphaned
      # (capacity in $cap); deliberately not reported to avoid false positives.
    fi
    # If bound, sanity-check the claimRef and the referenced PVC
    if [[ "$phase" == "Bound" ]]; then
      if [[ -z "$claim_ns" || -z "$claim_name" || "$claim_ns" == "null" || "$claim_name" == "null" ]]; then
        problems+=("PV ${name} Bound but missing claimRef (phase=${phase})")
        continue
      fi
      # the referenced PVC must exist (namespace-filtered PVs were skipped above)
      if ! kubectl get pvc -n "${claim_ns}" "${claim_name}" >/dev/null 2>&1; then
        problems+=("PV ${name} Bound to ${claim_ns}/${claim_name} but PVC resource not found")
      fi
    fi
  done
fi

count=${#problems[@]}
if (( count == 0 )); then
  echo "OK - PV/PVC checks passed"
  exit 0
fi
# Severity decision
if (( count >= CRIT )); then
  echo "CRITICAL - ${count} PV/PVC problems: ${problems[*]}"
  exit 2
elif (( count >= WARN )); then
  echo "WARNING - ${count} PV/PVC problems: ${problems[*]}"
  exit 1
else
  echo "OK - ${count} PV/PVC problems but below thresholds"
  exit 0
fi

View File

@@ -0,0 +1,135 @@
#!/usr/bin/env bash
# check_k8s_replicasets
# Nagios/NRPE plugin: flag ReplicaSets whose status.readyReplicas is lower
# than spec.replicas (zero-scaled ReplicaSets are ignored).
# Exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
#
# Usage:
#   sudo /usr/lib/nagios/plugins/check_k8s_replicasets [--warn N] [--crit M] [--ignore-ns ns1,ns2] [--namespaces ns1,ns2] [--age-min MINUTES]
#
set -euo pipefail

WARN=${WARN:-0}   # WARNING when >= WARN ReplicaSets are degraded
CRIT=${CRIT:-1}   # CRITICAL when >= CRIT ReplicaSets are degraded (default: any)
IGNORE_NS=""
INCLUDE_NS=""
AGE_MIN=0

print_usage() {
  cat <<EOF
Usage: $0 [--warn N] [--crit M] [--ignore-ns ns1,ns2] [--namespaces ns1,ns2] [--age-min MINUTES]
--warn N : seuil warn si >=N ReplicaSets en erreur (default 0)
--crit M : seuil crit si >=M ReplicaSets en erreur (default 1)
--ignore-ns LIST : comma separated namespaces to ignore (default none)
--namespaces LIST: comma separated namespaces to check only (default all)
--age-min N : ignore ReplicaSets created less than N minutes ago (avoid flapping during rollout)
EOF
}

# parse args
while [[ $# -gt 0 ]]; do
  case "$1" in
    --warn) WARN="$2"; shift 2;;
    --crit) CRIT="$2"; shift 2;;
    --ignore-ns) IGNORE_NS="$2"; shift 2;;
    --namespaces) INCLUDE_NS="$2"; shift 2;;
    --age-min) AGE_MIN="$2"; shift 2;;
    -h|--help) print_usage; exit 3;;
    *) echo "Unknown arg: $1"; print_usage; exit 3;;
  esac
done

if ! command -v kubectl >/dev/null 2>&1; then
  echo "UNKNOWN - kubectl not found"
  exit 3
fi

# Build an anchored alternation regex ("^a$|^b$") from a comma separated
# namespace list; prints nothing for an empty list.
build_ns_regex() {
  local csv=$1 regex="" ns
  local -a parts=()
  [[ -z "$csv" ]] && return 0
  IFS=',' read -ra parts <<< "$csv"
  for ns in "${parts[@]}"; do
    regex="${regex}|^${ns}\$"
  done
  printf '%s' "${regex#|}"
}
ignore_pattern=$(build_ns_regex "$IGNORE_NS")
include_pattern=$(build_ns_regex "$INCLUDE_NS")

# Collect: namespace, name, desired(spec.replicas), ready(status.readyReplicas), creationTimestamp
failures=()
mapfile -t lines < <(kubectl get rs -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.name}{"\t"}{.spec.replicas}{"\t"}{.status.readyReplicas}{"\t"}{.metadata.creationTimestamp}{"\n"}{end}' 2>/dev/null || true)
now_s=$(date +%s)

for line in "${lines[@]}"; do
  # skip empty lines if any
  [[ -z "$line" ]] && continue
  # One read replaces the five awk processes previously spawned per line.
  IFS=$'\t' read -r ns name desired ready created <<< "$line"
  # normalize missing jsonpath fields to 0
  desired=${desired:-0}
  ready=${ready:-0}
  # Namespace filtering with bash regex matching ('egrep' is obsolescent).
  if [[ -n "$include_pattern" && ! "$ns" =~ $include_pattern ]]; then
    continue
  fi
  if [[ -n "$ignore_pattern" && "$ns" =~ $ignore_pattern ]]; then
    continue
  fi
  # age filtering (skip very recent RS, likely mid-rollout)
  if [[ -n "$created" ]] && (( AGE_MIN > 0 )); then
    # unparsable timestamps yield epoch 0 ("very old") so the RS is not skipped
    created_s=$(date -d "$created" +%s 2>/dev/null || echo 0)
    age_min=$(( (now_s - created_s) / 60 ))
    if (( age_min < AGE_MIN )); then
      continue
    fi
  fi
  # Only consider RS that want replicas (skip zero-scale RS)
  if (( desired > 0 )) && (( ready < desired )); then
    failures+=("${ns}/${name} (desired=${desired},ready=${ready})")
  fi
done

count=${#failures[@]}
if (( count == 0 )); then
  echo "OK - all ReplicaSets report ready==desired"
  exit 0
fi
# Determine severity based on thresholds
if (( count >= CRIT )); then
  echo "CRITICAL - ${count} ReplicaSets not fully ready: ${failures[*]}"
  exit 2
elif (( count >= WARN )); then
  echo "WARNING - ${count} ReplicaSets not fully ready: ${failures[*]}"
  exit 1
else
  echo "OK - ${count} ReplicaSets not fully ready but below thresholds"
  exit 0
fi

View File

@@ -72,13 +72,32 @@ command[check_docker_{{ container }}]=/usr/lib/nagios/plugins/check_docker --con
{% endif %} {% endif %}
{% if nrpe_process is defined %} {% if nrpe_process is defined %}
# process
{% for process in nrpe_process %} {% for process in nrpe_process %}
command[check_proc_{{ process }}]=/usr/lib/nagios/plugins/check_systemd_service {{ process }} command[check_proc_{{ process }}]=/usr/lib/nagios/plugins/check_systemd_service {{ process }}
{% endfor %} {% endfor %}
{% endif %} {% endif %}
{% if nrpe_kubernetes is defined or nrpe_kubernetes_manager is defined %}
# kubernetes
{% if nrpe_kubernetes is defined %} {% if nrpe_kubernetes is defined %}
## nodes
command[check_proc_kubelet]=/usr/lib/nagios/plugins/check_systemd_service kubelet command[check_proc_kubelet]=/usr/lib/nagios/plugins/check_systemd_service kubelet
command[check_proc_etcd]=/usr/lib/nagios/plugins/check_systemd_service etcd command[check_proc_etcd]=/usr/lib/nagios/plugins/check_systemd_service etcd
command[check_proc_containerd]=/usr/lib/nagios/plugins/check_systemd_service containerd command[check_proc_containerd]=/usr/lib/nagios/plugins/check_systemd_service containerd
{% endif %} {% endif %}
{% if nrpe_kubernetes_manager is defined %}
## manager / control plane
command[check_k8s_health]=/usr/lib/nagios/plugins/check_http -I {{ ansible_default_ipv4.address }} -p 6443 -S -u /healthz --continue-after-certificate -r ok -w 1 -c 2
command[check_cilium_health]=/usr/bin/sudo /usr/lib/nagios/plugins/check_cilium_health
command[check_coredns_health]=/usr/bin/sudo /usr/lib/nagios/plugins/check_coredns_health
command[check_etcd_health]=/usr/bin/sudo /usr/lib/nagios/plugins/check_etcd_health --endpoints "https://{{ ansible_default_ipv4.address }}:2379" --cacert /etc/ssl/etcd/ssl/ca.pem --cert /etc/ssl/etcd/ssl/node-{{ nrpe_kubernetes_manager_nodename }}.pem --key /etc/ssl/etcd/ssl/node-{{ nrpe_kubernetes_manager_nodename }}-key.pem
command[check_k8s_apiserver_access]=/usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_apiserver_access
command[check_k8s_deployments]=/usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_deployments
command[check_k8s_jobs_cronjobs]=/usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_jobs_cronjobs
command[check_k8s_pki_certs]=/usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_pki_certs
command[check_k8s_pv_pvc]=/usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_pv_pvc
command[check_k8s_replicasets]=/usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_replicasets
command[check_k8s_pod_restarts]=/usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_pod_restarts
{% endif %}
{% endif %}

View File

@@ -2,3 +2,13 @@ nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_postfix_mailqueue -w {{
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_exim_mailqueue -w {{ nrpe_mailq_warning }} -c {{ nrpe_mailq_critical }} nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_exim_mailqueue -w {{ nrpe_mailq_warning }} -c {{ nrpe_mailq_critical }}
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_raid nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_raid
nagios ALL=(ALL) NOPASSWD: /usr/sbin/needrestart -b -l nagios ALL=(ALL) NOPASSWD: /usr/sbin/needrestart -b -l
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_cilium_health
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_coredns_health
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_etcd_health
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_apiserver_access
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_deployments
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_jobs_cronjobs
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_pki_certs
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_pv_pvc
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_replicasets
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_pod_restarts