add k8s check & config

2025-11-24 08:38:24 +01:00
parent 0045a21479
commit 1730b93c3f
12 changed files with 1888 additions and 0 deletions
--- a/files/nrpe/check_cilium_health
+++ b/files/nrpe/check_cilium_health
@@ -0,0 +1,307 @@
+#!/usr/bin/env bash
+# check_cilium_health
+# Vérifie la santé de Cilium (pods, daemonsets, operator) et optionnellement utilise le binaire `cilium status -o json`.
+# Retour: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
+#
+# Usage:
+#  sudo /usr/lib/nagios/plugins/check_cilium_health [--namespace N] [--label LABEL] [--warn-not-ready N] [--crit-not-ready M] [--warn-restarts R] [--crit-restarts S] [--use-cilium-cli] [--timeout SECS]
+#
+set -euo pipefail
+
+# Default settings. Every value except USE_CILIUM_CLI can be pre-seeded from
+# the environment; the command-line options parsed below take precedence.
+: "${NAMESPACE:=kube-system}"
+: "${LABEL:=k8s-app=cilium}"
+: "${WARN_NOT_READY:=1}"
+: "${CRIT_NOT_READY:=2}"
+: "${WARN_RESTARTS:=3}"
+: "${CRIT_RESTARTS:=10}"
+: "${TIMEOUT:=10}"
+# The cilium CLI check is opt-in and only enabled by --use-cilium-cli.
+USE_CILIUM_CLI=0
+
+# print_usage
+# Writes the option summary to stdout, showing the currently effective
+# threshold and timeout values as defaults.
+print_usage() {
+  cat <<USAGE
+Usage: $0 [options]
+Options:
+  --namespace N          namespace (default: kube-system)
+  --label LABEL          pod label selector (default: "k8s-app=cilium")
+  --warn-not-ready N     warn if >= N pods not ready (default ${WARN_NOT_READY})
+  --crit-not-ready M     critical if >= M pods not ready (default ${CRIT_NOT_READY})
+  --warn-restarts R      warn if restartCount >= R per pod (default ${WARN_RESTARTS})
+  --crit-restarts S      critical if restartCount >= S per pod (default ${CRIT_RESTARTS})
+  --use-cilium-cli       run 'cilium status -o json' as additional check (requires cilium binary)
+  --timeout SECS         kubectl timeout in seconds (default ${TIMEOUT})
+  -h, --help             show this help
+USAGE
+}
+
+# Parse args
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --namespace) NAMESPACE="$2"; shift 2;;
+    --label) LABEL="$2"; shift 2;;
+    --warn-not-ready) WARN_NOT_READY="$2"; shift 2;;
+    --crit-not-ready) CRIT_NOT_READY="$2"; shift 2;;
+    --warn-restarts) WARN_RESTARTS="$2"; shift 2;;
+    --crit-restarts) CRIT_RESTARTS="$2"; shift 2;;
+    --use-cilium-cli) USE_CILIUM_CLI=1; shift 1;;
+    --timeout) TIMEOUT="$2"; shift 2;;
+    -h|--help) print_usage; exit 3;;
+    *) echo "Unknown arg: $1"; print_usage; exit 3;;
+  esac
+done
+
+# ensure kubectl & python present
+if ! command -v kubectl >/dev/null 2>&1; then
+  echo "UNKNOWN - kubectl not found in PATH"
+  exit 3
+fi
+if ! command -v python3 >/dev/null 2>&1; then
+  echo "UNKNOWN - python3 not found in PATH (required for JSON parsing)"
+  exit 3
+fi
+
+# ---- kubeconfig handling ----
+# If KUBECONFIG is not set, try sensible defaults so sudo/nagios runs succeed.
+# Priority:
+# 1) env KUBECONFIG if already defined
+# 2) /etc/kubernetes/admin.conf if present (common on control-planes)
+# 3) /root/.kube/config if present
+# 4) fallback to empty (kubectl will then try defaults and may fail)
+if [[ -z "${KUBECONFIG:-}" ]]; then
+  if [[ -r "/etc/kubernetes/admin.conf" ]]; then
+    export KUBECONFIG="/etc/kubernetes/admin.conf"
+  elif [[ -r "/root/.kube/config" ]]; then
+    export KUBECONFIG="/root/.kube/config"
+  else
+    # leave unset; kubectl will attempt defaults
+    unset KUBECONFIG || true
+  fi
+fi
+
+# Use explicit kubeconfig for kubectl invocations to avoid home/KUBECONFIG differences under sudo
+if [[ -n "${KUBECONFIG:-}" ]]; then
+  KC="kubectl --kubeconfig=${KUBECONFIG} --request-timeout=${TIMEOUT}s"
+else
+  KC="kubectl --request-timeout=${TIMEOUT}s"
+fi
+
+# Helper to run python parser safely via temp file
+run_python_parser() {
+  # $1 = input (stdin), $2 = python here-doc content (as a bash string)
+  local input="$1"
+  local pyprog="$2"
+  local tmp pyfile
+  tmp=$(mktemp) || return 1
+  pyfile=$(mktemp) || { rm -f "$tmp"; return 1; }
+  printf '%s\n' "$pyprog" > "$pyfile"
+  printf '%s' "$input" | python3 "$pyfile" > "$tmp" 2>/dev/null
+  local rc=$?
+  rm -f "$pyfile"
+  if [[ $rc -ne 0 ]]; then
+    rm -f "$tmp"
+    return $rc
+  fi
+  cat "$tmp"
+  rm -f "$tmp"
+  return 0
+}
+
+# 1) Fetch the Cilium pods as JSON. errexit is suspended so that a kubectl
+#    failure is reported as CRITICAL instead of silently aborting the script.
+set +e
+pods_json=$($KC -n "$NAMESPACE" get pods -l "$LABEL" -o json 2>&1)
+rc_kubectl=$?
+set -e
+if (( rc_kubectl != 0 )); then
+  echo "CRITICAL - kubectl failed to list Cilium pods: ${pods_json//$'\n'/ ' '}"
+  exit 2
+fi
+
+# 2) Turn the JSON into one tab-separated record per pod:
+#    name <TAB> phase <TAB> ready/total <TAB> restarts <TAB> node
+pod_python_prog=$'import sys,json\ntry:\n    data=json.load(sys.stdin)\nexcept Exception:\n    sys.exit(1)\nitems=data.get(\"items\",[])\nfor it in items:\n    name=it.get(\"metadata\",{}).get(\"name\",\"<noname>\")\n    node=it.get(\"spec\",{}).get(\"nodeName\",\"\")\n    phase=it.get(\"status\",{}).get(\"phase\",\"\")\n    cs=it.get(\"status\",{}).get(\"containerStatuses\",[]) or []\n    total_cont=len(cs)\n    ready_cnt=sum(1 for c in cs if c.get(\"ready\") is True)\n    restarts=sum(int(c.get(\"restartCount\",0) or 0) for c in cs)\n    ready_str = f\"{ready_cnt}/{total_cont}\"\n    print(f\"{name}\\t{phase}\\t{ready_str}\\t{restarts}\\t{node}\")\n'
+
+pod_lines=()
+if pod_out=$(run_python_parser "$pods_json" "$pod_python_prog"); then
+  while IFS= read -r parsed_line; do
+    if [[ -n "$parsed_line" ]]; then
+      pod_lines+=("$parsed_line")
+    fi
+  done <<< "$pod_out"
+fi
+
+# Fallback when parsing failed or produced nothing: read the plain
+# `kubectl get pods --no-headers` table (NAME READY ...).
+if [[ ${#pod_lines[@]} -eq 0 ]]; then
+  simple=$($KC -n "$NAMESPACE" get pods -l "$LABEL" --no-headers 2>&1 || true)
+  while IFS= read -r l; do
+    [[ -z "$l" ]] && continue
+    read -r name readycol _ <<< "$l"
+    # Only lines whose second column looks like "ready/total" are pods; this
+    # skips kubectl diagnostics such as "No resources found ..." that were
+    # captured from stderr.
+    [[ "$readycol" =~ ^[0-9]+/[0-9]+$ ]] || continue
+    rnum=${readycol%/*}
+    rtot=${readycol#*/}
+    if [[ "$rnum" == "$rtot" && "$rtot" != "0" ]]; then
+      phase="Running"
+    else
+      phase="NotReady"
+    fi
+    restarts=0
+    node=""
+    # Real tab characters are required: the evaluation loop splits each record
+    # with IFS=$'\t', and "\t" inside double quotes is a literal backslash-t.
+    pod_lines+=("${name}"$'\t'"${phase}"$'\t'"${rnum}/${rtot}"$'\t'"${restarts}"$'\t'"${node}")
+  done <<< "$simple"
+  if [[ ${#pod_lines[@]} -eq 0 ]]; then
+    echo "CRITICAL - no Cilium pods found or kubectl output unparsable. kubectl output: ${simple//$'\n'/ ' '}"
+    exit 2
+  fi
+fi
+
+# Now evaluate pod_lines
+total_pods=0
+not_ready=0
+not_ready_list=()
+high_restart_pods=()
+
+for line in "${pod_lines[@]}"; do
+  [[ -z "$line" ]] && continue
+  total_pods=$((total_pods+1))
+  IFS=$'\t' read -r pname pphase pready prest pnode <<< "$line"
+  ready_num=${pready%/*}
+  ready_tot=${pready#*/}
+  ready_num=${ready_num:-0}
+  ready_tot=${ready_tot:-0}
+  if [[ "$pphase" != "Running" ]] || (( ready_num < ready_tot )); then
+    not_ready=$((not_ready+1))
+    not_ready_list+=("${pname}:${pphase}:${pready}")
+  fi
+  prest=${prest:-0}
+  if (( prest >= CRIT_RESTARTS )); then
+    high_restart_pods+=("${pname}:${prest}:CRITICAL")
+  elif (( prest >= WARN_RESTARTS )); then
+    high_restart_pods+=("${pname}:${prest}:WARN")
+  fi
+done
+
+# DaemonSet check: add up desiredNumberScheduled and numberReady over every
+# DaemonSet selected by $LABEL. A kubectl or parsing failure leaves both at 0.
+ds_desired=0
+ds_ready=0
+rc_ds=0
+ds_out=$($KC -n "$NAMESPACE" get ds -l "$LABEL" -o json 2>&1) || rc_ds=$?
+if (( rc_ds == 0 )); then
+  ds_python_prog=$'import sys,json\ndata=json.load(sys.stdin)\nfor it in data.get(\"items\",[]):\n    s=it.get(\"status\",{})\n    desired=int(s.get(\"desiredNumberScheduled\") or 0)\n    ready=int(s.get(\"numberReady\") or 0)\n    print(f\"{desired}\\t{ready}\")\n'
+  if ds_out_parsed=$(run_python_parser "$ds_out" "$ds_python_prog"); then
+    # One "desired<TAB>ready" record per DaemonSet.
+    while IFS= read -r d; do
+      if [[ -z "$d" ]]; then
+        continue
+      fi
+      IFS=$'\t' read -r ddesired dready <<< "$d"
+      ds_desired=$(( ds_desired + ddesired ))
+      ds_ready=$(( ds_ready + dready ))
+    done <<< "$ds_out_parsed"
+  fi
+fi
+
+# cilium-operator Deployment check: compare availableReplicas with
+# spec.replicas. When the Deployment cannot be fetched or parsed, op_msg stays
+# empty and the operator is left out of the final status.
+op_ok=1
+op_msg=""
+op_json=$($KC -n "$NAMESPACE" get deploy cilium-operator -o json 2>/dev/null || true)
+if [[ -n "$op_json" ]]; then
+  op_python_prog=$'import sys,json\ndata=json.load(sys.stdin)\nspec=data.get(\"spec\",{})\nstatus=data.get(\"status\",{})\nreplicas=int(spec.get(\"replicas\") or 1)\navailable=int(status.get(\"availableReplicas\") or 0)\nprint(f\"{replicas}\\t{available}\")\n'
+  if op_line=$(run_python_parser "$op_json" "$op_python_prog"); then
+    IFS=$'\t' read -r op_repl op_avail <<< "$op_line"
+    # The message is the same in both cases; only op_ok records the verdict.
+    op_msg="operator available=${op_avail}/${op_repl}"
+    if (( op_avail < op_repl )); then
+      op_ok=0
+    fi
+  fi
+fi
+
+# Optional: cilium CLI check (only with --use-cilium-cli).
+# cilium_ok=0 marks a failure (binary missing or non-zero exit status);
+# cilium_summary carries either the error text or a shortened status output.
+cilium_ok=1
+cilium_summary=""
+if (( USE_CILIUM_CLI == 1 )); then
+  if ! command -v cilium >/dev/null 2>&1; then
+    cilium_ok=0
+    cilium_summary="cilium binary not in PATH"
+  else
+    # Capture the real exit status. Appending `|| true` to the assignment and
+    # reading $? afterwards always yields 0, hiding every cilium failure.
+    rc_cilium=0
+    cilium_raw=$(cilium status -o json 2>&1) || rc_cilium=$?
+    if (( rc_cilium != 0 )); then
+      cilium_ok=0
+      cilium_summary="cilium status failed: ${cilium_raw//$'\n'/ ' '}"
+    else
+      cilium_ok=1
+      # Flatten to one line, squeeze spaces and keep the first 300 characters.
+      cilium_summary=$(printf '%s' "$cilium_raw" | tr '\n' ' ' | sed 's/  */ /g' | cut -c1-300)
+    fi
+  fi
+fi
+
+# Compose status
+code=0
+msgs=()
+
+if (( not_ready >= CRIT_NOT_READY )); then
+  code=2
+  msgs+=("CRITICAL - ${not_ready}/${total_pods} pods not ready")
+elif (( not_ready >= WARN_NOT_READY )); then
+  if (( code < 1 )); then code=1; fi
+  msgs+=("WARNING - ${not_ready}/${total_pods} pods not ready")
+else
+  msgs+=("OK - ${total_pods} pods, not-ready=${not_ready}")
+fi
+
+if (( ds_desired > 0 )) && (( ds_ready < ds_desired )); then
+  if (( ds_desired - ds_ready >= CRIT_NOT_READY )); then
+    code=2
+    msgs+=("CRITICAL - daemonsets ready=${ds_ready}/${ds_desired}")
+  else
+    if (( code < 1 )); then code=1; fi
+    msgs+=("WARNING - daemonsets ready=${ds_ready}/${ds_desired}")
+  fi
+fi
+
+if [[ -n "$op_msg" ]]; then
+  if (( op_ok == 0 )); then
+    code=2
+    msgs+=("CRITICAL - ${op_msg}")
+  else
+    msgs+=("${op_msg}")
+  fi
+fi
+
+if (( ${#high_restart_pods[@]} > 0 )); then
+  crit_restart=0; warn_restart=0
+  for r in "${high_restart_pods[@]}"; do
+    [[ "$r" == *":CRITICAL" ]] && crit_restart=1
+    [[ "$r" == *":WARN" ]] && warn_restart=1
+  done
+  if (( crit_restart == 1 )); then
+    code=2
+    msgs+=("CRITICAL - pods with high restart counts: ${high_restart_pods[*]}")
+  elif (( warn_restart == 1 )); then
+    if (( code < 1 )); then code=1; fi
+    msgs+=("WARNING - pods with elevated restarts: ${high_restart_pods[*]}")
+  fi
+fi
+
+if (( USE_CILIUM_CLI == 1 )); then
+  if (( cilium_ok == 0 )); then
+    code=2
+    msgs+=("CRITICAL - cilium-cli: ${cilium_summary}")
+  else
+    msgs+=("cilium-cli ok: ${cilium_summary}")
+  fi
+fi
+
+if (( not_ready > 0 )); then
+  truncated=$(printf "%s, " "${not_ready_list[@]}" | sed 's/, $//')
+  msgs+=("not-ready-list: ${truncated}")
+fi
+
+echo "$(IFS=' ; '; echo "${msgs[*]}")"
+exit "${code}"
--- a/files/nrpe/check_coredns_health
+++ b/files/nrpe/check_coredns_health
@@ -0,0 +1,158 @@
+#!/usr/bin/env bash
+# check_coredns_health
+# Vérifie la santé de CoreDNS (endpoints + endpointslices + fallback pods)
+# Retour codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
+#
+# Usage:
+#  sudo /usr/lib/nagios/plugins/check_coredns_health [--namespace N] [--service NAME] [--label-fallback LABEL] [--kubeconfig PATH]
+#
+set -euo pipefail
+
+# Defaults; each one can be pre-seeded from the environment.
+: "${NAMESPACE:=kube-system}"
+: "${SERVICE_NAME:=coredns}"
+: "${LABEL_FALLBACK:=k8s-app=kube-dns}"
+: "${TIMEOUT:=10}"
+
+# usage
+# Writes the synopsis and the effective namespace/service defaults to stdout.
+usage() {
+  cat <<USAGE
+Usage: $0 [--namespace N] [--service NAME] [--label-fallback LABEL] [--kubeconfig PATH]
+Defaults: namespace=$NAMESPACE service=$SERVICE_NAME
+USAGE
+}
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --namespace) NAMESPACE="$2"; shift 2;;
+    --service) SERVICE_NAME="$2"; shift 2;;
+    --label-fallback) LABEL_FALLBACK="$2"; shift 2;;
+    --kubeconfig) export KUBECONFIG="$2"; shift 2;;
+    -h|--help) usage; exit 3;;
+    *) echo "Unknown arg: $1"; usage; exit 3;;
+  esac
+done
+
+if ! command -v kubectl >/dev/null 2>&1; then
+  echo "UNKNOWN - kubectl not found"
+  exit 3
+fi
+
+# If KUBECONFIG not set, try sensible defaults so sudo/nagios runs succeed.
+if [[ -z "${KUBECONFIG:-}" ]]; then
+  if [[ -r "/etc/kubernetes/admin.conf" ]]; then
+    export KUBECONFIG="/etc/kubernetes/admin.conf"
+  elif [[ -r "/root/.kube/config" ]]; then
+    export KUBECONFIG="/root/.kube/config"
+  fi
+fi
+
+# Build kubectl command with explicit kubeconfig when available
+if [[ -n "${KUBECONFIG:-}" ]]; then
+  KC=(kubectl --kubeconfig="${KUBECONFIG}" --request-timeout="${TIMEOUT}s")
+else
+  KC=(kubectl --request-timeout="${TIMEOUT}s")
+fi
+
+# run_kc ARGS...
+# Runs kubectl (the KC prefix plus ARGS) with stderr discarded, prints the
+# captured stdout without trailing newlines and returns kubectl's exit status.
+# The status is captured with `|| status=$?` so the helper also works when it
+# is executed with errexit active: a bare failing assignment would abort the
+# shell before the output is printed and the status returned.
+run_kc() {
+  local captured status=0
+  captured="$("${KC[@]}" "$@" 2>/dev/null)" || status=$?
+  printf '%s' "$captured"
+  return "$status"
+}
+
+# NOTE: the exit status of each query is captured with `|| rc=$?`. With
+# `set -e`, a plain `var=$(run_kc ...)` followed by `rc=$?` aborts the script
+# on failure before the CRITICAL message can be printed.
+
+# 1) try Endpoints resource
+rc=0
+ep_out=$(run_kc -n "$NAMESPACE" get endpoints "$SERVICE_NAME" -o jsonpath='{.subsets[*].addresses[*].ip}') || rc=$?
+if (( rc != 0 )); then
+  echo "CRITICAL - kubectl failed to get Endpoints (exit code ${rc})"
+  exit 2
+fi
+if [[ -n "${ep_out// /}" ]]; then
+  echo "OK - service ${SERVICE_NAME} in ${NAMESPACE} has endpoints: $(echo "$ep_out" | tr ' ' ',')"
+  exit 0
+fi
+
+# 2) try EndpointSlices (k8s >= 1.17): one output line per endpoint
+rc=0
+eps_out=$(run_kc -n "$NAMESPACE" get endpointslices -l "kubernetes.io/service-name=${SERVICE_NAME}" -o jsonpath='{range .items[*]}{range .endpoints[*]}{.addresses[*]}{"\n"}{end}{end}') || rc=$?
+if (( rc != 0 )); then
+  echo "CRITICAL - kubectl failed to get EndpointSlices (exit code ${rc})"
+  exit 2
+fi
+if [[ -n "${eps_out// /}" ]]; then
+  # Drop blank lines and join the addresses with commas.
+  tops=$(printf '%s\n' "$eps_out" | sed '/^\s*$/d' | tr '\n' ',' | sed 's/,$//')
+  echo "OK - service ${SERVICE_NAME} in ${NAMESPACE} has EndpointSlices addresses: ${tops}"
+  exit 0
+fi
+
+# 3) fallback: check service selector and pods matching it
+# The selector map is rendered with a Go template as "k1=v1,k2=v2,". kubectl's
+# jsonpath output has no `$k,$v :=` map iteration or printf, and label
+# selectors are comma-separated, not ';'-separated.
+# Exit statuses are captured with `|| rc=$?` because under `set -e` a failing
+# plain assignment would abort the script before the CRITICAL branch runs.
+rc=0
+svc_out=$(run_kc -n "$NAMESPACE" get svc "$SERVICE_NAME" -o go-template='{{range $k, $v := .spec.selector}}{{printf "%s=%s," $k $v}}{{end}}') || rc=$?
+if (( rc != 0 )); then
+  echo "CRITICAL - kubectl failed to get Service selector (exit code ${rc})"
+  exit 2
+fi
+
+# Use the Service selector when it has one, otherwise the fallback label
+# (';' is accepted there as an alternative separator).
+SEL=${svc_out%,}
+if [[ -z "$SEL" ]]; then
+  SEL=${LABEL_FALLBACK//;/,}
+  SEL=${SEL%,}
+fi
+
+# Quoted so the shell never treats "[0]" as a glob pattern.
+pod_columns='READY:.status.containerStatuses[0].ready,NAME:.metadata.name'
+
+# get pods by selector
+rc=0
+pods_out=$(run_kc -n "$NAMESPACE" get pods -l "$SEL" --no-headers -o custom-columns="$pod_columns") || rc=$?
+if (( rc != 0 )); then
+  echo "CRITICAL - kubectl failed to list pods for selector '${SEL}' (exit code ${rc})"
+  exit 2
+fi
+
+if [[ -z "${pods_out// /}" ]]; then
+  # try alternative labels common for CoreDNS (k8s-app=coredns)
+  rc=0
+  pods_alt=$(run_kc -n "$NAMESPACE" get pods -l k8s-app=coredns --no-headers -o custom-columns="$pod_columns") || rc=$?
+  if (( rc != 0 )); then
+    echo "CRITICAL - kubectl failed to list pods for fallback selector (exit code ${rc})"
+    exit 2
+  fi
+  if [[ -n "${pods_alt// /}" ]]; then
+    pods_out="$pods_alt"
+    SEL="k8s-app=coredns (fallback)"
+  fi
+fi
+
+if [[ -z "${pods_out// /}" ]]; then
+  echo "CRITICAL - service ${SERVICE_NAME} in ${NAMESPACE} has no endpoints and no pods match selector '${SEL}'"
+  exit 2
+fi
+
+# Count the pods and those whose first container is not ready. Each input line
+# is "<ready-flag> <pod-name>" as produced by the custom-columns query.
+total_count=0
+not_ready_count=0
+not_ready_list=()
+while IFS= read -r line; do
+  if [[ -z "$line" ]]; then
+    continue
+  fi
+  total_count=$(( total_count + 1 ))
+  read -r ready_flag pod_name _rest <<< "$line"
+  case "$ready_flag" in
+    true|True|1)
+      ;;
+    *)
+      not_ready_count=$(( not_ready_count + 1 ))
+      not_ready_list+=("$pod_name")
+      ;;
+  esac
+done <<< "$pods_out"
+
+# Verdict: no pods -> CRITICAL, some pods not ready -> WARNING, otherwise OK.
+if (( total_count == 0 )); then
+  echo "CRITICAL - service ${SERVICE_NAME} in ${NAMESPACE} has no endpoints and no pods found for selector '${SEL}'"
+  exit 2
+elif (( not_ready_count > 0 )); then
+  echo "WARNING - service ${SERVICE_NAME} in ${NAMESPACE} has no endpoints, but ${not_ready_count}/${total_count} pods matching selector '${SEL}' are not Ready: ${not_ready_list[*]}"
+  exit 1
+fi
+
+# Pods exist and are Ready although neither Endpoints nor EndpointSlices list
+# an address: reported as OK, with a hint in the message.
+echo "OK - service ${SERVICE_NAME} in ${NAMESPACE} has no Endpoints resource but ${total_count} pods matching selector '${SEL}' are Ready (EndpointSlices absent or controller issue)"
+exit 0
--- a/files/nrpe/check_etcd_health
+++ b/files/nrpe/check_etcd_health
@@ -0,0 +1,230 @@
+#!/usr/bin/env bash
+# check_etcd_health
+# Vérifie la santé d'etcd et (optionnel) la création/vérification des snapshots.
+# Retourne : 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
+#
+# Usage example:
+#  sudo /usr/lib/nagios/plugins/check_etcd_health \
+#    --endpoints "https://192.168.1.41:2379,https://192.168.1.42:2379" \
+#    --cacert /etc/ssl/etcd/ssl/ca.pem --cert /etc/ssl/etcd/ssl/admin.pem --key /etc/ssl/etcd/ssl/admin-key.pem \
+#    --test-snapshot --snapshot-dir /var/backups/etcd --snapshot-max-age 24
+#
+# Notes:
+# - Par securite, execute ce script sur un master (ou via NRPE/SSH) avec un utilisateur ayant acces aux clefs.
+# - --snapshot-max-age en heures (defaut 24). Mettre 0 pour desactiver la verification d'age.
+# - --test-snapshot creera un snapshot temporaire pour valider la creation + verification via `etcdctl snapshot status`.
+# - Si --keep-snapshot-on-failure est active, le snapshot temporaire sera conserve en cas d'erreur pour debug.
+
+# Path of the etcdctl binary; can be overridden through the environment.
+: "${ETCDCTL:=/usr/local/bin/etcdctl}"
+
+# print_usage
+# Writes the synopsis and the option list to stdout.
+print_usage() {
+  cat <<USAGE
+Usage: $0 --endpoints ENDPOINTS --cacert CA --cert CERT --key KEY [options]
+Options:
+  --warn-db-mb N           avertissement si DB >= N MB (default 1024)
+  --crit-db-mb M           critique si DB >= M MB (default 1800)
+  --timeout SECS           etcdctl timeout (default 10)
+  --test-snapshot          tenter de creer un snapshot temporaire et verifier son status
+  --snapshot-dir DIR       repertoire pour snapshots temporaires (default /var/backups/etcd)
+  --keep-snapshot-on-failure  conserver le snapshot temporaire si creation echoue (default false)
+  --snapshot-max-age HRS   verifier qu'il existe un snapshot plus recent que HRS heures (default 24). Mettre 0 pour desactiver.
+  -h, --help               affiche cette aide
+USAGE
+}
+
+# Defaults. The thresholds, timeout and snapshot settings can be pre-seeded
+# from the environment; the two flags are only set by command-line options.
+: "${WARN_DB_MB:=1024}"
+: "${CRIT_DB_MB:=1800}"
+: "${TIMEOUT:=10}"
+: "${SNAPSHOT_DIR:=/var/backups/etcd}"
+: "${SNAPSHOT_MAX_AGE_HOURS:=24}"
+TEST_SNAPSHOT=0
+KEEP_SNAPSHOT_ON_FAILURE=0
+
+# Parse command-line options.
+# Value-taking options are checked first. This script does not run with
+# `set -e`, and `shift 2` with a single remaining argument fails WITHOUT
+# shifting, so a trailing option lacking its value would otherwise loop forever.
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --endpoints|--cacert|--cert|--key|--warn-db-mb|--crit-db-mb|--timeout|--snapshot-dir|--snapshot-max-age)
+      if [[ $# -lt 2 ]]; then
+        echo "UNKNOWN - option $1 requires a value"
+        print_usage
+        exit 3
+      fi
+      ;;
+  esac
+  case "$1" in
+    --endpoints) ENDPOINTS="$2"; shift 2;;
+    --cacert) CACERT="$2"; shift 2;;
+    --cert) CERT="$2"; shift 2;;
+    --key) KEY="$2"; shift 2;;
+    --warn-db-mb) WARN_DB_MB="$2"; shift 2;;
+    --crit-db-mb) CRIT_DB_MB="$2"; shift 2;;
+    --timeout) TIMEOUT="$2"; shift 2;;
+    --test-snapshot) TEST_SNAPSHOT=1; shift 1;;
+    --snapshot-dir) SNAPSHOT_DIR="$2"; shift 2;;
+    --keep-snapshot-on-failure) KEEP_SNAPSHOT_ON_FAILURE=1; shift 1;;
+    --snapshot-max-age) SNAPSHOT_MAX_AGE_HOURS="$2"; shift 2;;
+    -h|--help) print_usage; exit 3;;
+    *) echo "Unknown arg: $1"; print_usage; exit 3;;
+  esac
+done
+
+# The size thresholds and the snapshot age are used in (( )) arithmetic, where
+# a non-numeric word silently evaluates as a variable name (usually 0).
+for numeric_setting in WARN_DB_MB CRIT_DB_MB; do
+  if ! [[ "${!numeric_setting}" =~ ^[0-9]+$ ]]; then
+    echo "UNKNOWN - ${numeric_setting} must be a non-negative integer (got '${!numeric_setting}')"
+    exit 3
+  fi
+done
+# An empty snapshot age is still accepted: the age check skips itself then.
+if [[ -n "$SNAPSHOT_MAX_AGE_HOURS" ]] && ! [[ "$SNAPSHOT_MAX_AGE_HOURS" =~ ^[0-9]+$ ]]; then
+  echo "UNKNOWN - SNAPSHOT_MAX_AGE_HOURS must be a non-negative integer (got '${SNAPSHOT_MAX_AGE_HOURS}')"
+  exit 3
+fi
+
+# Allow env fallback (if ETCDCTL_* env vars set)
+ENDPOINTS=${ENDPOINTS:-${ETCDCTL_ENDPOINTS:-}}
+CACERT=${CACERT:-${ETCDCTL_CACERT:-}}
+CERT=${CERT:-${ETCDCTL_CERT:-}}
+KEY=${KEY:-${ETCDCTL_KEY:-}}
+
+if [[ -z "${ENDPOINTS:-}" || -z "${CACERT:-}" || -z "${CERT:-}" || -z "${KEY:-}" ]]; then
+  echo "UNKNOWN - missing required args/certs"
+  print_usage
+  exit 3
+fi
+
+if [[ ! -x "$ETCDCTL" ]]; then
+  echo "UNKNOWN - etcdctl not found at $ETCDCTL"
+  exit 3
+fi
+
+if [[ ! -r "$CACERT" || ! -r "$CERT" || ! -r "$KEY" ]]; then
+  echo "CRITICAL - cannot read certificate files (permissions?)"
+  echo "CACERT=$CACERT CERT=$CERT KEY=$KEY"
+  exit 2
+fi
+
+export ETCDCTL_API=3
+
+# 1) endpoint status check
+OUT=$("$ETCDCTL" --command-timeout="${TIMEOUT}s" --endpoints="${ENDPOINTS}" --cacert="$CACERT" --cert="$CERT" --key="$KEY" endpoint status 2>&1) || {
+  echo "CRITICAL - etcdctl endpoint status failed: $OUT"
+  exit 2
+}
+
+# Each output line is split on commas.
+# NOTE(review): the field positions (endpoint, id, version, db size,
+# is leader, ...) are assumed from this parsing code - confirm them against
+# the etcdctl version in use.
+leaders=0
+total=0
+max_db_mb=0
+while IFS= read -r line; do
+  line=${line//$'\r'/}
+  [[ -z "$line" ]] && continue
+  total=$((total+1))
+  IFS=',' read -r endpoint id version dbsize isLeader isLearner memberCount rest <<<"$line"
+  isLeader=$(echo "${isLeader:-}" | tr -d ' ' | tr '[:upper:]' '[:lower:]')
+  if [[ "$isLeader" == "true" ]]; then leaders=$((leaders+1)); fi
+  db_mb=0
+  if [[ -n "${dbsize:-}" ]]; then
+    # dbsize looks like "<number> <unit>"; read trims the surrounding blanks.
+    read -r num unit _ <<< "$dbsize"
+    if [[ "$num" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
+      # Convert to whole MB with awk: shell arithmetic is integer-only, so a
+      # fractional size such as "1.2 GB" makes $(( )) fail and leaves db_mb at
+      # 0, which would silently skip the size thresholds.
+      db_mb=$(LC_ALL=C awk -v n="$num" -v u="${unit^^}" 'BEGIN {
+        if (u == "B") mb = int(n / 1024 / 1024)
+        else if (u == "KB") mb = int(n / 1024)
+        else if (u == "GB") mb = int(n * 1024 + 0.5)
+        else mb = int(n + 0.5)
+        printf "%d", mb
+      }')
+      db_mb=${db_mb:-0}
+    fi
+  fi
+  if (( db_mb > max_db_mb )); then max_db_mb=$db_mb; fi
+done <<< "$OUT"
+
+if (( total == 0 )); then
+  echo "CRITICAL - no endpoints returned by etcdctl"
+  exit 2
+fi
+if (( leaders == 0 )); then
+  echo "CRITICAL - no leader found among $total endpoints; detail: $OUT"
+  exit 2
+fi
+if (( leaders > 1 )); then
+  echo "WARNING - multiple leaders detected: $leaders (possible split-brain); detail: $OUT"
+  exit 1
+fi
+if (( max_db_mb >= CRIT_DB_MB )); then
+  echo "CRITICAL - etcd DB size ${max_db_mb}MB >= ${CRIT_DB_MB}MB"
+  exit 2
+fi
+if (( max_db_mb >= WARN_DB_MB )); then
+  echo "WARNING - etcd DB size ${max_db_mb}MB >= ${WARN_DB_MB}MB"
+  exit 1
+fi
+
+# 2) Verification of recent snapshot files (optional, default 24h)
+# Looks for the newest snapshot-*.db file in SNAPSHOT_DIR and reports CRITICAL
+# when there is none or when it is older than SNAPSHOT_MAX_AGE_HOURS.
+SNAP_CHECK_MSG=""
+if [[ -n "$SNAPSHOT_MAX_AGE_HOURS" ]]; then
+  # SNAPSHOT_MAX_AGE_HOURS == 0 -> disabled
+  if (( SNAPSHOT_MAX_AGE_HOURS > 0 )); then
+    mkdir -p "$SNAPSHOT_DIR" 2>/dev/null || {
+      echo "CRITICAL - cannot create/access snapshot dir $SNAPSHOT_DIR"
+      exit 2
+    }
+    # Pick the newest file by comparing modification times with -nt instead of
+    # parsing `ls` output, which breaks on unusual file names.
+    latest_snapshot=""
+    for snapshot_candidate in "$SNAPSHOT_DIR"/snapshot-*.db; do
+      # An unmatched glob stays literal; skip anything that does not exist.
+      [[ -e "$snapshot_candidate" ]] || continue
+      if [[ -z "$latest_snapshot" || "$snapshot_candidate" -nt "$latest_snapshot" ]]; then
+        latest_snapshot="$snapshot_candidate"
+      fi
+    done
+    if [[ -z "$latest_snapshot" ]]; then
+      SNAP_CHECK_MSG="no snapshot files found in $SNAPSHOT_DIR"
+      echo "CRITICAL - $SNAP_CHECK_MSG (no snapshots)"
+      exit 2
+    else
+      now_s=$(date +%s)
+      # A failing stat (e.g. the file vanished meanwhile) must not be turned
+      # into a bogus, enormous age.
+      snap_mtime_s=$(stat -c %Y -- "$latest_snapshot" 2>/dev/null) || {
+        echo "CRITICAL - cannot stat snapshot $latest_snapshot"
+        exit 2
+      }
+      age_s=$(( now_s - snap_mtime_s ))
+      age_h=$(( age_s / 3600 ))
+      if (( age_h > SNAPSHOT_MAX_AGE_HOURS )); then
+        SNAP_CHECK_MSG="latest snapshot $latest_snapshot is ${age_h}h old (> ${SNAPSHOT_MAX_AGE_HOURS}h)"
+        echo "CRITICAL - $SNAP_CHECK_MSG"
+        exit 2
+      else
+        SNAP_CHECK_MSG="latest snapshot $latest_snapshot is ${age_h}h old (<= ${SNAPSHOT_MAX_AGE_HOURS}h)"
+      fi
+    fi
+  fi
+fi
+
+# 3) Optional: test snapshot creation and status
+# Creates a temporary snapshot, verifies it with `etcdctl snapshot status` and
+# removes it again through the EXIT trap.
+SNAP_TEST_MSG=""
+if (( TEST_SNAPSHOT == 1 )); then
+  mkdir -p "$SNAPSHOT_DIR" 2>/dev/null || {
+    echo "CRITICAL - cannot create/access snapshot dir $SNAPSHOT_DIR"
+    exit 2
+  }
+  if [[ ! -w "$SNAPSHOT_DIR" ]]; then
+    echo "CRITICAL - snapshot dir not writable: $SNAPSHOT_DIR"
+    exit 2
+  fi
+
+  # The temporary file deliberately does NOT match the snapshot-*.db pattern
+  # scanned by the snapshot age check: a test file kept after a failure (it may
+  # be empty) must never be mistaken for a recent real backup on later runs.
+  SNAPFILE=$(mktemp "${SNAPSHOT_DIR}/test-snapshot-XXXXXX.db") || {
+    echo "CRITICAL - mktemp failed in $SNAPSHOT_DIR"
+    exit 2
+  }
+
+  # EXIT handler: always removes the test snapshot on success; on failure it is
+  # removed unless --keep-snapshot-on-failure was given.
+  cleanup() {
+    local rc=$?
+    if [[ $rc -eq 0 ]]; then
+      rm -f "$SNAPFILE" 2>/dev/null || true
+    else
+      if [[ $KEEP_SNAPSHOT_ON_FAILURE -eq 0 ]]; then
+        rm -f "$SNAPFILE" 2>/dev/null || true
+      else
+        echo "NOTICE - snapshot kept at $SNAPFILE for debugging"
+      fi
+    fi
+    return $rc
+  }
+  trap 'cleanup' EXIT
+
+  # `etcdctl snapshot save` must be requested from exactly one member and
+  # fails when several endpoints are configured, so only the first endpoint of
+  # the comma-separated list is used here.
+  SNAP_ENDPOINT=${ENDPOINTS%%,*}
+  SAVE_OUT=$("$ETCDCTL" --command-timeout="${TIMEOUT}s" --endpoints="${SNAP_ENDPOINT}" --cacert="$CACERT" --cert="$CERT" --key="$KEY" snapshot save "$SNAPFILE" 2>&1) || {
+    echo "CRITICAL - snapshot save failed: $SAVE_OUT"
+    exit 2
+  }
+
+  STATUS_OUT=$("$ETCDCTL" snapshot status "$SNAPFILE" 2>&1) || {
+    echo "CRITICAL - snapshot status failed: $STATUS_OUT"
+    exit 2
+  }
+
+  # If we reach here, creation+status ok
+  SNAP_TEST_MSG="snapshot test ok: $SNAPFILE ; status: $(echo "$STATUS_OUT" | tr '\n' ' ' | sed 's/  */ /g')"
+  # cleanup will remove the snapshot (unless KEEP_SNAPSHOT_ON_FAILURE and rc != 0)
+fi
+
+# Compose final message
+MSG="OK - $total endpoints checked, leaders=$leaders, max_db=${max_db_mb}MB"
+if [[ -n "$SNAP_CHECK_MSG" ]]; then
+  MSG="$MSG ; $SNAP_CHECK_MSG"
+fi
+if [[ -n "$SNAP_TEST_MSG" ]]; then
+  MSG="$MSG ; $SNAP_TEST_MSG"
+fi
+
+echo "$MSG"
+exit 0
--- a/files/nrpe/check_k8s_apiserver_access
+++ b/files/nrpe/check_k8s_apiserver_access
@@ -0,0 +1,214 @@
+#!/usr/bin/env bash
+# check_k8s_apiserver_access
+# Vérifie le nombre de réponses HTTP 403 dans les logs de kube-apiserver.
+# Retour codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
+#
+# Par défaut: utilise journalctl -u kube-apiserver --since="${WINDOW} minutes ago"
+# Option --kubectl : utilise "kubectl logs" sur les pods correspondant au sélecteur.
+#
+# Usage examples:
+#   sudo /usr/lib/nagios/plugins/check_k8s_apiserver_access --window 5 --warn 10 --crit 50
+#   sudo /usr/lib/nagios/plugins/check_k8s_apiserver_access --kubectl --selector 'k8s-app=kube-apiserver' --window 10 --crit 100
+#
+set -euo pipefail
+
+# Script name without its directory part, used in the help text.
+PROG_NAME=${0##*/}
+
+# --- Thresholds and time window ---
+WINDOW_MINUTES=5     # how far back to read the logs, in minutes
+WARN_THRESHOLD=10    # match count >= this -> WARNING
+CRIT_THRESHOLD=50    # match count >= this -> CRITICAL
+TOP_N=5              # number of top offenders to show
+
+# --- Log source ---
+USE_KUBECTL=0                  # 0 = journalctl, 1 = kubectl logs
+JOURNAL_UNIT="kube-apiserver"  # systemd unit name; adapt if different
+KUBECTL_NAMESPACE="kube-system"
+KUBECTL_SELECTOR=""            # empty = try common apiserver labels
+
+# --- Matching ---
+PATTERN=''  # optional custom grep regex; empty = built-in heuristics
+
+print_help() {
+  cat <<EOF
+$PROG_NAME - check apiserver 403 rate in logs
+
+Options:
+  --window N           Window in minutes to look back (default: ${WINDOW_MINUTES})
+  --warn N             WARN threshold: count >= N -> WARNING (default: ${WARN_THRESHOLD})
+  --crit N             CRIT threshold: count >= N -> CRITICAL (default: ${CRIT_THRESHOLD})
+  --kubectl            Use 'kubectl logs' on apiserver pods instead of journalctl
+  --namespace NS       Namespace for kubectl logs (default: ${KUBECTL_NAMESPACE})
+  --selector SEL       Label selector for kubectl logs (e.g. "component=kube-apiserver" or "k8s-app=kube-apiserver")
+  --unit UNIT          systemd unit for journalctl (default: ${JOURNAL_UNIT})
+  --pattern REGEX      custom grep regex to detect 403 entries (overrides built-in heuristics)
+  --top N              show top N request lines causing 403 (default ${TOP_N})
+  -h, --help           show this help
+
+Examples:
+  # check last 5 minutes using journalctl
+  sudo ./check_apiserver_403.sh --window 5 --warn 20 --crit 50
+
+  # check last 10 minutes using kubectl logs for apiserver static-pods
+  sudo ./check_apiserver_403.sh --kubectl --namespace kube-system --selector 'k8s-app=kube-apiserver' --window 10 --crit 100
+EOF
+}
+
+# Parse args
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --window) WINDOW_MINUTES="$2"; shift 2;;
+    --warn) WARN_THRESHOLD="$2"; shift 2;;
+    --crit) CRIT_THRESHOLD="$2"; shift 2;;
+    --kubectl) USE_KUBECTL=1; shift 1;;
+    --namespace) KUBECTL_NAMESPACE="$2"; shift 2;;
+    --selector) KUBECTL_SELECTOR="$2"; shift 2;;
+    --unit) JOURNAL_UNIT="$2"; shift 2;;
+    --pattern) PATTERN="$2"; shift 2;;
+    --top) TOP_N="$2"; shift 2;;
+    -h|--help) print_help; exit 3;;
+    *) echo "Unknown argument: $1"; print_help; exit 3;;
+  esac
+done
+
+# Validate numeric args
+if ! [[ "$WINDOW_MINUTES" =~ ^[0-9]+$ ]]; then echo "UNKNOWN - invalid --window"; exit 3; fi
+if ! [[ "$WARN_THRESHOLD" =~ ^[0-9]+$ ]]; then echo "UNKNOWN - invalid --warn"; exit 3; fi
+if ! [[ "$CRIT_THRESHOLD" =~ ^[0-9]+$ ]]; then echo "UNKNOWN - invalid --crit"; exit 3; fi
+if ! [[ "$TOP_N" =~ ^[0-9]+$ ]]; then echo "UNKNOWN - invalid --top"; exit 3; fi
+
+# Build detection regex if not provided
+if [[ -z "$PATTERN" ]]; then
+  # heuristics: try to match common apiserver log patterns that indicate a 403/Forbidden
+  # examples: "\" 403 ", "code=403", "403 Forbidden", "Forbidden" combined with "Denied" etc.
+  PATTERN='(" 403 |\" 403 |code=403|403 Forbidden|Forbidden|\"Reason=Forbidden\"|\"message=.*Forbidden)'
+
+  # note: portable grep -E will accept that pattern
+fi
+
+# Grab logs
+get_logs_journal() {
+  # Use journalctl if available
+  if ! command -v journalctl >/dev/null 2>&1; then
+    echo "ERROR_NO_JOURNAL" 1>&2
+    return 1
+  fi
+  # We use --no-pager; use unit name. If unit not present, journalctl returns non-zero.
+  # Example: journalctl -u kube-apiserver --since "5 minutes ago"
+  journalctl -u "${JOURNAL_UNIT}" --since="${WINDOW_MINUTES} minutes ago" --no-pager 2>/dev/null || return 1
+}
+
+get_logs_kubectl() {
+  if ! command -v kubectl >/dev/null 2>&1; then
+    echo "ERROR_NO_KUBECTL" 1>&2
+    return 1
+  fi
+  # If no selector given try common selectors
+  sel="${KUBECTL_SELECTOR}"
+  if [[ -z "$sel" ]]; then
+    # try common labels
+    for try in 'component=kube-apiserver' 'k8s-app=kube-apiserver' 'tier=control-plane' ''; do
+      if [[ -z "$try" ]]; then
+        sel=""
+        break
+      fi
+      # test if any pods match
+      count=$(kubectl -n "${KUBECTL_NAMESPACE}" get pods -l "${try}" --no-headers 2>/dev/null | wc -l || echo 0)
+      if [[ "$count" -gt 0 ]]; then
+        sel="${try}"
+        break
+      fi
+    done
+  fi
+
+  if [[ -z "$sel" ]]; then
+    # fallback: get all pods in namespace and try to find apiserver in name
+    pods=$(kubectl -n "${KUBECTL_NAMESPACE}" get pods --no-headers -o custom-columns=':metadata.name' 2>/dev/null || true)
+    if [[ -z "$pods" ]]; then
+      return 1
+    fi
+    # build selector as empty and we'll filter by name
+    # collect logs from pods whose name contains "apiserver"
+    out=""
+    while IFS= read -r p; do
+      [[ -z "$p" ]] && continue
+      if echo "$p" | grep -qi 'apiserver'; then
+        out="${out}$(kubectl -n ${KUBECTL_NAMESPACE} logs --since=${WINDOW_MINUTES}m ${p} --all-containers 2>/dev/null || true)$'\n'"
+      fi
+    done <<< "$pods"
+    printf '%s' "$out"
+    return 0
+  else
+    # gather logs from all pods matching selector
+    podnames=$(kubectl -n "${KUBECTL_NAMESPACE}" get pods -l "${sel}" -o custom-columns=':metadata.name' --no-headers 2>/dev/null || true)
+    if [[ -z "$podnames" ]]; then
+      return 1
+    fi
+    out=""
+    while IFS= read -r p; do
+      [[ -z "$p" ]] && continue
+      out="${out}$(kubectl -n ${KUBECTL_NAMESPACE} logs --since=${WINDOW_MINUTES}m ${p} --all-containers 2>/dev/null || true)$'\n'"
+    done <<< "$podnames"
+    printf '%s' "$out"
+    return 0
+  fi
+}
+
+# Collect the logs into LOGS from the selected source; a collector failure is
+# reported as CRITICAL.
+LOGS=""
+if (( USE_KUBECTL == 1 )); then
+  LOGS=$(get_logs_kubectl) || {
+    echo "CRITICAL - failed to collect logs via kubectl (check KUBECONFIG, namespace/selector, permissions)"
+    exit 2
+  }
+else
+  LOGS=$(get_logs_journal) || {
+    echo "CRITICAL - failed to collect logs via journalctl for unit '${JOURNAL_UNIT}' (check unit name/permissions)"
+    exit 2
+  }
+fi
+
+# Empty log output is reported as OK with a count of 0.
+if [[ -z "$LOGS" ]]; then
+  echo "OK - no apiserver logs found in the last ${WINDOW_MINUTES}m (count=0)"
+  exit 0
+fi
+
+# Number of log lines matching the pattern (grep -E, case-insensitive).
+# `|| true` keeps a zero-match result from aborting the script.
+count_403=$(grep -E -i -c "$PATTERN" <<< "$LOGS" || true)
+count_403=${count_403:-0}
+
+# Top offenders: prefer "METHOD /path" pairs extracted from the matching
+# lines; if none can be extracted, fall back to the most frequent matching
+# lines, whitespace-normalised and cut to 200 characters.
+top_paths=""
+top_requests=$(grep -E -i "$PATTERN" <<< "$LOGS" || true)
+if [[ -n "$top_requests" ]]; then
+  top_paths=$(grep -oE '(GET|POST|PUT|DELETE|PATCH) [^" ]+' <<< "$top_requests" | sed 's/"$//' | sort | uniq -c | sort -rn | head -n "${TOP_N}" || true)
+  if [[ -z "$top_paths" ]]; then
+    top_paths=$(sed 's/^[[:space:]]*//; s/[[:space:]]\+/ /g' <<< "$top_requests" | cut -c1-200 | sort | uniq -c | sort -rn | head -n "${TOP_N}" || true)
+  fi
+fi
+
+# Severity: start at OK and escalate; the critical threshold is applied last
+# so it always wins over the warning threshold.
+status=0
+state="OK"
+if (( count_403 >= WARN_THRESHOLD )); then
+  status=1
+  state="WARNING"
+fi
+if (( count_403 >= CRIT_THRESHOLD )); then
+  status=2
+  state="CRITICAL"
+fi
+
+# Status line, optionally followed by the top offenders joined with '|'.
+msg="${state} - ${count_403} occurrences of 403 in last ${WINDOW_MINUTES}m (warn=${WARN_THRESHOLD},crit=${CRIT_THRESHOLD})"
+if [[ -n "$top_paths" ]]; then
+  top_summary=$(printf '%s' "$top_paths" | tr '\n' '|' | sed 's/|$//')
+  msg="${msg} ; top=${TOP_N}: ${top_summary}"
+fi
+
+echo "$msg"
+exit $status
--- a/files/nrpe/check_k8s_deployments
+++ b/files/nrpe/check_k8s_deployments
@@ -0,0 +1,138 @@
+#!/usr/bin/env bash
+# check_k8s_deployments
+# Checks Kubernetes Deployments and reports those with availableReplicas < spec.replicas
+# Exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
+#
+# Usage:
+#  sudo /usr/lib/nagios/plugins/check_k8s_deployments [--warn N] [--crit M] [--ignore-ns ns1,ns2] [--namespaces ns1,ns2] [--age-min MINUTES]
+#
+# Examples:
+#  sudo /usr/lib/nagios/plugins/check_k8s_deployments --crit 1
+#  sudo /usr/lib/nagios/plugins/check_k8s_deployments --ignore-ns kube-system,monitoring
+#
+set -euo pipefail
+
+WARN=${WARN:-0}   # number of failing deployments that triggers WARNING
+CRIT=${CRIT:-1}   # number of failing deployments that triggers CRITICAL (default 1 => any problem is CRITICAL)
+IGNORE_NS=""      # comma-separated namespaces to skip (--ignore-ns)
+INCLUDE_NS=""     # comma-separated namespaces to restrict the check to (--namespaces)
+AGE_MIN=0         # minimum age in minutes before a deployment is checked (--age-min)
+
+print_usage() {
+  # Print the command-line help on stdout, one printf argument per output line.
+  printf '%s\n' \
+    "Usage: $0 [--warn N] [--crit M] [--ignore-ns ns1,ns2] [--namespaces ns1,ns2] [--age-min MINUTES]" \
+    " --warn N         : seuil warn si >=N déploiements en erreur (default 0)" \
+    " --crit M         : seuil crit si >=M déploiements en erreur (default 1)" \
+    " --ignore-ns LIST : comma separated namespaces to ignore (default none)" \
+    " --namespaces LIST: comma separated namespaces to check only (default all)" \
+    " --age-min N      : ignore deployments created less than N minutes ago (avoid flapping during rollout)"
+}
+
+# Consume the argument list option by option (value options also take the next word).
+while (( $# > 0 )); do
+  case $1 in
+    --warn)       WARN=$2;       shift 2 ;;
+    --crit)       CRIT=$2;       shift 2 ;;
+    --ignore-ns)  IGNORE_NS=$2;  shift 2 ;;
+    --namespaces) INCLUDE_NS=$2; shift 2 ;;
+    --age-min)    AGE_MIN=$2;    shift 2 ;;
+    --help|-h)    print_usage; exit 3 ;;
+    *)            echo "Unknown arg: $1"; print_usage; exit 3 ;;
+  esac
+done
+
+# kubectl is mandatory: without it nothing can be queried, so answer UNKNOWN.
+command -v kubectl >/dev/null 2>&1 || {
+  echo "UNKNOWN - kubectl not found"; exit 3
+}
+
+# Namespace filters: each comma-separated list becomes an alternation of
+# anchored names ("a,b" -> "^a$|^b$"); an empty list leaves the pattern empty.
+ignore_pattern=""
+if [[ -n "$IGNORE_NS" ]]; then
+  IFS=',' read -ra arr <<< "$IGNORE_NS"
+  for ns in "${arr[@]}"; do
+    ignore_pattern+="|^${ns}\$"
+  done
+  ignore_pattern=${ignore_pattern:1}   # drop the leading "|"
+fi
+
+include_pattern=""
+if [[ -n "$INCLUDE_NS" ]]; then
+  IFS=',' read -ra arr2 <<< "$INCLUDE_NS"
+  for ns in "${arr2[@]}"; do
+    include_pattern+="|^${ns}\$"
+  done
+  include_pattern=${include_pattern:1}   # drop the leading "|"
+fi
+
+# result collection
+# Initialize the failures array so that "${#failures[@]}" is safe under set -u
+failures=()
+
+# Fetch one tab-separated record per Deployment:
+#   namespace, name, spec.replicas, status.availableReplicas, creationTimestamp
+# A kubectl failure is reported as UNKNOWN; swallowing it used to end in a misleading "OK".
+if ! raw=$(kubectl get deploy -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.name}{"\t"}{.spec.replicas}{"\t"}{.status.availableReplicas}{"\t"}{.metadata.creationTimestamp}{"\n"}{end}' 2>/dev/null); then
+  echo "UNKNOWN - kubectl get deploy failed (API unreachable or access denied?)"
+  exit 3
+fi
+mapfile -t lines <<< "$raw"
+
+now_s=$(date +%s)
+
+for line in "${lines[@]}"; do
+  # skip empty lines (an empty kubectl answer yields exactly one)
+  [[ -z "${line}" ]] && continue
+
+  # Split on tabs while keeping empty fields (a counter may be missing from the output):
+  # tabs become a non-whitespace separator, which read does not collapse.
+  fields=${line//$'\t'/$'\x1f'}
+  IFS=$'\x1f' read -r ns name desired available created <<< "$fields"
+
+  # normalize missing counters to 0
+  desired=${desired:-0}
+  available=${available:-0}
+
+  # namespace filtering with bash's ERE operator (egrep is obsolescent and costs a fork)
+  if [[ -n "$include_pattern" && ! "$ns" =~ $include_pattern ]]; then
+    continue
+  fi
+  if [[ -n "$ignore_pattern" && "$ns" =~ $ignore_pattern ]]; then
+    continue
+  fi
+
+  # age filtering: skip deployments younger than AGE_MIN minutes (they might still be rolling out);
+  # an unparsable timestamp becomes epoch 0, i.e. "old", so the deployment is still checked
+  if [[ -n "$created" ]] && (( AGE_MIN > 0 )); then
+    created_s=$(date -d "$created" +%s 2>/dev/null || echo 0)
+    age_min=$(( (now_s - created_s) / 60 ))
+    if (( age_min < AGE_MIN )); then
+      continue
+    fi
+  fi
+
+  if (( available < desired )); then
+    failures+=("${ns}/${name} (desired=${desired},available=${available})")
+  fi
+done
+
+count=${#failures[@]}   # deployments with fewer available than desired replicas
+
+if [[ $count -eq 0 ]]; then
+  echo "OK - all deployments report desired==available"
+  exit 0
+fi
+
+# Pick the state from the thresholds (CRIT is tested first); both alert states share one message.
+if (( count >= CRIT )); then
+  rc=2; label="CRITICAL"
+elif (( count >= WARN )); then
+  rc=1; label="WARNING"
+else
+  echo "OK - ${count} deployments not available but below thresholds"
+  exit 0
+fi
+echo "${label} - ${count} deployments not available: ${failures[*]}"
+exit "$rc"
--- a/files/nrpe/check_k8s_jobs_cronjobs
+++ b/files/nrpe/check_k8s_jobs_cronjobs
@@ -0,0 +1,232 @@
+#!/usr/bin/env bash
+# check_k8s_jobs_cronjobs
+# Checks the state of Kubernetes Jobs and CronJobs.
+# Exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
+#
+# Main checks:
+#  - detects Jobs with failures (.status.failed > 0) or "active" Jobs that have been running too long
+#  - looks for recent events (type=Warning) attached to Jobs within the last X minutes
+#  - for CronJobs that are not suspended, verifies that lastScheduleTime is not too old (configurable)
+#
+# Usage (examples):
+#  sudo /usr/lib/nagios/plugins/check_k8s_jobs_cronjobs --crit 1 --recent-minutes 5
+#  sudo /usr/lib/nagios/plugins/check_k8s_jobs_cronjobs --ignore-ns kube-system --cron-max-age 120
+#
+set -euo pipefail
+
+# Defaults (those written ${VAR:-value} can be preset through the environment)
+WARN=${WARN:-0}
+CRIT=${CRIT:-1}
+IGNORE_NS=""
+INCLUDE_NS=""
+AGE_MIN=${AGE_MIN:-60}
+RECENT_MINUTES=${RECENT_MINUTES:-5}
+CHECK_CRON=1
+CRON_MAX_AGE_MIN=${CRON_MAX_AGE_MIN:-60}
+
+print_usage() {  # print the command-line help on stdout
+  cat <<EOF
+Usage: $0 [options]
+Options:
+  --warn N                seuil WARN si >= N objets en erreur (default 0)
+  --crit M                seuil CRIT si >= M objets en erreur (default 1)
+  --ignore-ns ns1,ns2     namespaces à ignorer
+  --namespaces ns1,ns2    limiter aux namespaces donnés (comma separated)
+  --age-min MINUTES       considérer un job "actif" normal si démarré moins de MINUTES (default 60)
+  --recent-minutes MIN    chercher événements de Job (Warning) dans les MIN dernières minutes (default 5)
+  --check-cron, --no-cron activer/désactiver la vérification des CronJobs (default ON)
+  --cron-max-age MINUTES  si lastScheduleTime > MINUTES => alerter (default 60). Mettre 0 pour désactiver.
+  -h, --help              : affiche l'aide
+EOF
+}
+
+while [[ $# -gt 0 ]]; do  # parse args; every option listed in the help text is accepted
+  case "$1" in
+    --warn) WARN="$2"; shift 2;;
+    --crit) CRIT="$2"; shift 2;;
+    --ignore-ns) IGNORE_NS="$2"; shift 2;;
+    --namespaces) INCLUDE_NS="$2"; shift 2;;
+    --age-min) AGE_MIN="$2"; shift 2;;
+    --recent-minutes) RECENT_MINUTES="$2"; shift 2;;
+    --check-cron) CHECK_CRON=1; shift 1;;
+    --no-cron) CHECK_CRON=0; shift 1;;
+    --cron-max-age) CRON_MAX_AGE_MIN="$2"; shift 2;;
+    -h|--help) print_usage; exit 3;;
+    *) echo "Unknown arg: $1"; print_usage; exit 3;;
+  esac
+done
+
+# kubectl is mandatory: without it nothing can be queried, so answer UNKNOWN.
+command -v kubectl >/dev/null 2>&1 || {
+  echo "UNKNOWN - kubectl not found"; exit 3
+}
+
+# Namespace filters: "a,b" becomes the anchored alternation "^a$|^b$" (empty list -> empty pattern).
+ignore_pattern=""
+if [[ -n "$IGNORE_NS" ]]; then
+  IFS=',' read -ra arr <<< "$IGNORE_NS"
+  for ns in "${arr[@]}"; do
+    ignore_pattern+="|^${ns}\$"
+  done
+  ignore_pattern=${ignore_pattern:1}   # drop the leading "|"
+fi
+
+include_pattern=""
+if [[ -n "$INCLUDE_NS" ]]; then
+  IFS=',' read -ra arr2 <<< "$INCLUDE_NS"
+  for ns in "${arr2[@]}"; do
+    include_pattern+="|^${ns}\$"
+  done
+  include_pattern=${include_pattern:1}   # drop the leading "|"
+fi
+
+# ns_allowed NS -> returns 0 if namespace NS passes the include/ignore filters, 1 otherwise.
+# An empty pattern disables its filter. Uses bash's ERE operator instead of the obsolescent egrep.
+ns_allowed() {
+  local ns="$1"
+  # include list given: the namespace has to match it
+  if [[ -n "$include_pattern" && ! "$ns" =~ $include_pattern ]]; then
+    return 1
+  fi
+  # ignore list given: the namespace must not match it
+  if [[ -n "$ignore_pattern" && "$ns" =~ $ignore_pattern ]]; then
+    return 1
+  fi
+  return 0
+}
+
+now_s=$(date +%s)
+
+# Initialize problems array safely
+problems=()
+
+# ---------------------------
+# 1) Inspect Jobs
+# ---------------------------
+# One tab-separated record per Job: namespace, name, active, succeeded, failed, startTime, completionTime.
+# A failing kubectl is reported as UNKNOWN instead of being read as "no jobs, no problem".
+if ! jobs_raw=$(kubectl get jobs -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.name}{"\t"}{.status.active}{"\t"}{.status.succeeded}{"\t"}{.status.failed}{"\t"}{.status.startTime}{"\t"}{.status.completionTime}{"\n"}{end}' 2>/dev/null); then
+  echo "UNKNOWN - kubectl get jobs failed (API unreachable or access denied?)"
+  exit 3
+fi
+mapfile -t job_lines <<< "$jobs_raw"
+
+for line in "${job_lines[@]}"; do
+  [[ -z "$line" ]] && continue
+  # Tabs become a non-whitespace separator so that read keeps empty fields in place.
+  fields=${line//$'\t'/$'\x1f'}
+  IFS=$'\x1f' read -r ns name active _succeeded failed start _completion <<< "$fields"
+  # counters missing from the output count as 0
+  active=${active:-0}
+  failed=${failed:-0}
+
+  if ! ns_allowed "$ns"; then
+    continue
+  fi
+
+  # 1.a) Jobs with failures
+  if (( failed > 0 )); then
+    problems+=("Job ${ns}/${name} failedCount=${failed}")
+    continue
+  fi
+
+  # 1.b) Active jobs running too long
+  if (( active > 0 )); then
+    if [[ -n "$start" && "$start" != "null" ]]; then
+      # convert start timestamp to epoch (GNU date); 0 means unparsable and is silently ignored
+      start_s=$(date -d "$start" +%s 2>/dev/null || echo 0)
+      if (( start_s > 0 )); then
+        age_min=$(( (now_s - start_s) / 60 ))
+        if (( age_min >= AGE_MIN )); then
+          problems+=("Job ${ns}/${name} active for ${age_min}min >= ${AGE_MIN}min")
+        fi
+      fi
+    else
+      # no start time but active >0 -> flag
+      problems+=("Job ${ns}/${name} active but no startTime recorded")
+    fi
+  fi
+done
+
+# 1.c) Recent Job warning events (type=Warning) in last RECENT_MINUTES
+if (( RECENT_MINUTES > 0 )); then
+  # Columns requested per event: namespace, involvedObject.name, lastTimestamp, reason, message.
+  # kubectl errors are ignored here (|| true): no event is inspected in that case.
+  mapfile -t event_lines < <(kubectl get events --all-namespaces --field-selector involvedObject.kind=Job,type=Warning -o custom-columns='NAMESPACE:.metadata.namespace,NAME:.involvedObject.name,LAST:.lastTimestamp,REASON:.reason,MESSAGE:.message' --no-headers 2>/dev/null || true)
+  cutoff_s=$(( now_s - RECENT_MINUTES * 60 ))
+  for ev in "${event_lines[@]}"; do
+    # Whitespace-split the row; only the first three columns are used.
+    read -r ns name last _rest <<< "$ev"
+    # Skip filtered namespaces and rows without a timestamp.
+    ns_allowed "$ns" || continue
+    [[ -n "$last" && "$last" != "<none>" ]] || continue
+
+    # An unparsable timestamp becomes epoch 0.
+    ts=$(date -d "$last" +%s 2>/dev/null || echo 0)
+    if (( ts >= cutoff_s )); then
+      problems+=("Job event Warning ${ns}/${name} at $last")
+    fi
+  done
+fi
+
+# ---------------------------
+# 2) Inspect CronJobs (optional)
+# ---------------------------
+if (( CHECK_CRON == 1 )) && (( CRON_MAX_AGE_MIN > 0 )); then
+  # One tab-separated record per CronJob: namespace, name, suspend (true/false/empty), lastScheduleTime.
+  # A failing kubectl is reported as UNKNOWN instead of being read as "no cronjobs, no problem".
+  if ! cron_raw=$(kubectl get cronjob -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.name}{"\t"}{.spec.suspend}{"\t"}{.status.lastScheduleTime}{"\n"}{end}' 2>/dev/null); then
+    echo "UNKNOWN - kubectl get cronjob failed (API unreachable or access denied?)"
+    exit 3
+  fi
+  mapfile -t cron_lines <<< "$cron_raw"
+
+  for line in "${cron_lines[@]}"; do
+    [[ -z "$line" ]] && continue
+    # Tabs become a non-whitespace separator so that read keeps empty fields in place.
+    fields=${line//$'\t'/$'\x1f'}
+    IFS=$'\x1f' read -r ns name suspend last <<< "$fields"
+
+    # Skip filtered namespaces; a suspended CronJob is not considered a problem.
+    ns_allowed "$ns" || continue
+    [[ "$suspend" == "true" ]] && continue
+
+    if [[ -z "$last" || "$last" == "null" ]]; then
+      # Never scheduled yet: warn (useful to detect misconfigured cronjobs)
+      problems+=("CronJob ${ns}/${name} has no lastScheduleTime (never scheduled?)")
+      continue
+    fi
+
+    last_s=$(date -d "$last" +%s 2>/dev/null || echo 0)
+    if (( last_s > 0 )); then
+      age_min=$(( (now_s - last_s) / 60 ))
+      if (( age_min > CRON_MAX_AGE_MIN )); then
+        problems+=("CronJob ${ns}/${name} lastSchedule ${age_min}min ago > ${CRON_MAX_AGE_MIN}min")
+      fi
+    else
+      problems+=("CronJob ${ns}/${name} lastScheduleTime unparsable: ${last}")
+    fi
+  done
+fi
+
+# ---------------------------
+# Final decision & output
+# ---------------------------
+count=${#problems[@]}
+
+if [[ $count -eq 0 ]]; then
+  echo "OK - Jobs/CronJobs checks passed"
+  exit 0
+fi
+
+# Pick the state from the thresholds (CRIT is tested first); both alert states share one message.
+if (( count >= CRIT )); then
+  rc=2; label="CRITICAL"
+elif (( count >= WARN )); then
+  rc=1; label="WARNING"
+else
+  echo "OK - ${count} problems found but below thresholds"
+  exit 0
+fi
+echo "${label} - ${count} problems found: ${problems[*]}"
+exit "$rc"
--- a/files/nrpe/check_k8s_pki_certs
+++ b/files/nrpe/check_k8s_pki_certs
@@ -0,0 +1,194 @@
+#!/usr/bin/env bash
+# check_k8s_pki_certs
+# Checks the PEM certificates under /etc/kubernetes/pki (by default) and alerts when one expires within warn_days (30 days by default).
+# Exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
+#
+# Usage:
+#  sudo /usr/lib/nagios/plugins/check_k8s_pki_certs
+#  sudo /usr/lib/nagios/plugins/check_k8s_pki_certs --path /etc/kubernetes/ssl --warn-days 30 --crit-days 7 --recursive
+#
+set -euo pipefail
+
+PKI_PATH=${PKI_PATH:-/etc/kubernetes/pki}   # directory to scan (--path)
+WARN_DAYS=${WARN_DAYS:-30}                  # WARNING when a certificate has this many days left or fewer
+CRIT_DAYS=${CRIT_DAYS:-7}                   # CRITICAL when a certificate has this many days left or fewer
+RECURSIVE=0                                 # 1 = also scan sub-directories (--recursive)
+
+print_usage() {
+  # Print the command-line help on stdout; defaults are taken from the current settings.
+  printf '%s\n' \
+    "Usage: $0 [--path PATH] [--warn-days N] [--crit-days M] [--recursive] [-h|--help]" \
+    "" \
+    "Options:" \
+    "  --path PATH        répertoire à scanner (default: $PKI_PATH)" \
+    "  --warn-days N      seuil warning en jours (default: $WARN_DAYS)" \
+    "  --crit-days M      seuil critical en jours (default: $CRIT_DAYS)" \
+    "  --recursive        scanner récursivement PATH et sous-dirs" \
+    "  -h, --help         affiche cette aide"
+}
+
+while (( $# > 0 )); do  # value options also consume the following word
+  case $1 in
+    --path)      PKI_PATH=$2;  shift 2 ;;
+    --warn-days) WARN_DAYS=$2; shift 2 ;;
+    --crit-days) CRIT_DAYS=$2; shift 2 ;;
+    --recursive) RECURSIVE=1;  shift 1 ;;
+    --help|-h)   print_usage; exit 3 ;;
+    *)           echo "Unknown arg: $1"; print_usage; exit 3 ;;
+  esac
+done
+
+# tools
+# External commands checked up front; the first one missing (in this order)
+# ends the check with UNKNOWN (3).
+required_tools=(
+  openssl
+  date
+  sed
+  awk
+  find
+)
+for tool in "${required_tools[@]}"; do
+  if ! command -v "$tool" >/dev/null 2>&1; then
+    echo "UNKNOWN - ${tool} not found"
+    exit 3
+  fi
+done
+
+# Resolve PKI_PATH through symlinks: realpath -e when available, readlink -f otherwise.
+# A failed resolution is ignored; the resolved value replaces PKI_PATH only when it
+# names an existing directory.
+if command -v realpath >/dev/null 2>&1; then
+  resolver=(realpath -e)
+else
+  resolver=(readlink -f)
+fi
+PKI_PATH_RESOLVED=$("${resolver[@]}" "$PKI_PATH" 2>/dev/null || true)
+if [[ -n "$PKI_PATH_RESOLVED" && -d "$PKI_PATH_RESOLVED" ]]; then
+  PKI_PATH="$PKI_PATH_RESOLVED"
+fi
+
+# Whatever the outcome above, the path to scan has to be a directory;
+# anything else cannot be scanned and is reported as UNKNOWN.
+if [[ ! -d "$PKI_PATH" ]]; then
+  echo "UNKNOWN - path $PKI_PATH not found or not a directory"
+  exit 3
+fi
+
+now_s=$(date +%s)
+
+# Result buckets and counters, created up front so that expanding them
+# stays safe under set -u even when nothing gets added.
+critical=() warning=()
+ok=() errors=()
+
+file_count=0 cert_count=0
+
+# find invocation: -L follows symlinks (files and directories); only regular
+# files are listed, NUL-terminated. Without --recursive the scan stops at the
+# top level of PKI_PATH.
+FIND_CMD=(find -L "$PKI_PATH")
+if (( RECURSIVE != 1 )); then
+  FIND_CMD+=(-maxdepth 1)
+fi
+FIND_CMD+=(-type f -print0)
+
+# Iterate over the files found (NUL-delimited, so any file name is handled).
+while IFS= read -r -d '' file; do
+  file_count=$((file_count+1))
+
+  # unreadable files are recorded as errors and skipped
+  if [[ ! -r "$file" ]]; then
+    errors+=("Unreadable file: $file")
+    continue
+  fi
+
+  # skip files without an X.509 PEM marker; the full marker is matched so that CSR files ("BEGIN CERTIFICATE REQUEST") are ignored
+  if ! grep -q -e '-----BEGIN CERTIFICATE-----' "$file" 2>/dev/null; then
+    continue
+  fi
+
+  # find pairs of BEGIN/END certificate line numbers using awk
+  # prints "start:end" for each certificate block
+  mapfile -t pairs < <(awk '
+    /-----BEGIN CERTIFICATE-----/ {start=NR}
+    /-----END CERTIFICATE-----/ && start { print start ":" NR; start=0 }
+  ' "$file" 2>/dev/null || true)
+
+  if [[ ${#pairs[@]} -eq 0 ]]; then
+    errors+=("No certificate block pairs found in $file")
+    continue
+  fi
+
+  for p in "${pairs[@]}"; do
+    start=${p%%:*}
+    end=${p##*:}
+    # extract block via sed (line range), send to openssl via stdin
+    cert_block=$(sed -n "${start},${end}p" "$file" 2>/dev/null || true)
+    if [[ -z "$cert_block" ]]; then
+      errors+=("Failed to extract certificate block ${start}-${end} from $file")
+      continue
+    fi
+
+    # openssl expects a file or stdin; use stdin
+    endline=$(printf '%s\n' "$cert_block" | openssl x509 -noout -enddate -in /dev/stdin 2>/dev/null) || {
+      errors+=("Failed to parse certificate block ${start}-${end} in $file with openssl")
+      continue
+    }
+    # sample endline: notAfter=Oct 27 16:15:30 2125 GMT
+    notAfter=${endline#notAfter=}
+    expiry_s=$(date -d "$notAfter" +%s 2>/dev/null) || {
+      errors+=("Cannot parse date '$notAfter' for cert in $file")
+      continue
+    }
+    days_left=$(( (expiry_s - now_s) / 86400 ))
+    subj=$(printf '%s\n' "$cert_block" | openssl x509 -noout -subject -in /dev/stdin 2>/dev/null || true)
+    subj=${subj#subject=}; subj=${subj# }   # handles both "subject= /CN=x" (OpenSSL 1.0) and "subject=CN = x" (1.1+)
+    info="${file} :: ${subj} :: expires in ${days_left}d on ${notAfter}"
+    cert_count=$((cert_count+1))
+    if (( days_left <= CRIT_DAYS )); then
+      critical+=("$info")
+    elif (( days_left <= WARN_DAYS )); then
+      warning+=("$info")
+    else
+      ok+=("$info")
+    fi
+  done
+
+done < <("${FIND_CMD[@]}")
+
+# results and exit codes
+# Checks run in this order and the first hit decides the outcome: parse errors
+# (UNKNOWN), no certificate at all (UNKNOWN), critical expiries, warning expiries.
+if (( ${#errors[@]} > 0 )); then
+  echo "UNKNOWN - parsing errors: ${errors[*]}"
+  exit 3
+fi
+
+if (( cert_count == 0 )); then
+  echo "UNKNOWN - no certificates found under $PKI_PATH"
+  exit 3
+fi
+
+n_crit=${#critical[@]}
+n_warn=${#warning[@]}
+
+if (( n_crit > 0 )); then
+  echo "CRITICAL - ${n_crit} certificate(s) expiring soon (<= ${CRIT_DAYS} days):"
+  printf '  - %s\n' "${critical[@]}"
+  if (( n_warn > 0 )); then
+    echo "WARN (additional ${n_warn} cert(s) <= ${WARN_DAYS} days):"
+    printf '  - %s\n' "${warning[@]}"
+  fi
+  exit 2
+fi
+
+if (( n_warn > 0 )); then
+  echo "WARNING - ${n_warn} certificate(s) expiring within ${WARN_DAYS} days:"
+  printf '  - %s\n' "${warning[@]}"
+  exit 1
+fi
+
+# nothing is due within the warning window
+echo "OK - ${cert_count} cert(s) checked in ${file_count} file(s), no expiry within ${WARN_DAYS} days"
+exit 0
--- a/files/nrpe/check_k8s_pod_restarts
+++ b/files/nrpe/check_k8s_pod_restarts
@@ -0,0 +1,49 @@
+#!/usr/bin/env bash
+# check_k8s_pod_restarts
+# Checks whether pods were restarted ("Killing" events) within the last X minutes.
+# Exit codes: 0=OK, 2=CRITICAL, 3=UNKNOWN
+#
+# Usage:
+#  sudo /usr/lib/nagios/plugins/check_k8s_pod_restarts [minutes]
+#
+MINUTES=${1:-5}   # look-back window in minutes (first positional argument, default 5)
+
+# Require kubectl
+if ! command -v kubectl >/dev/null 2>&1; then
+  echo "UNKNOWN - kubectl not found"
+  exit 3
+fi
+
+# cutoff as an epoch timestamp (needs GNU date); a value date cannot parse ends in UNKNOWN
+if ! cutoff=$(date -d "$MINUTES minutes ago" +%s 2>/dev/null); then
+  echo "UNKNOWN - date parsing failed (on macOS use gdate from coreutils)"
+  exit 3
+fi
+
+# Fetch the "Killing" events. kubectl pads custom-columns with spaces (not tabs),
+# so rows are split on whitespace and msg receives the whole remaining message text.
+# A failing kubectl is reported as UNKNOWN rather than as "no restarts".
+if ! events=$(kubectl get events --all-namespaces --field-selector reason=Killing -o custom-columns='NAMESPACE:.metadata.namespace,NAME:.involvedObject.name,LAST:.lastTimestamp,MESSAGE:.message' --no-headers 2>/dev/null); then
+  echo "UNKNOWN - kubectl get events failed (API unreachable or access denied?)"
+  exit 3
+fi
+matches=()
+while read -r ns pod last msg; do
+  # skip empty lines and events without a lastTimestamp
+  [[ -z "$last" || "$last" == "<none>" ]] && continue
+  # convert last timestamp to epoch (GNU date); events with an unparsable timestamp are skipped
+  ts=$(date -d "$last" +%s 2>/dev/null) || continue
+  if (( ts >= cutoff )); then
+    # store the finished output line; the message is truncated to 300 characters
+    matches+=(" - ${ns}/${pod} at ${last} : ${msg:0:300}")
+  fi
+done <<< "$events"
+
+if [[ ${#matches[@]} -eq 0 ]]; then
+  echo "OK - no pod restarts in the last ${MINUTES} minutes"
+  exit 0
+else
+  echo "CRITICAL - ${#matches[@]} pod restarts in the last ${MINUTES} minutes:"
+  printf '%s\n' "${matches[@]}"
+  exit 2
+fi
--- a/files/nrpe/check_k8s_pv_pvc
+++ b/files/nrpe/check_k8s_pv_pvc
@@ -0,0 +1,202 @@
+#!/usr/bin/env bash
+# check_k8s_pv_pvc
+# Checks the state of Kubernetes PersistentVolumes (PV) and PersistentVolumeClaims (PVC).
+# Exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
+#
+# Usage examples:
+#  sudo /usr/lib/nagios/plugins/check_k8s_pv_pvc --crit 1                        # CRITICAL if >=1 problem
+#  sudo /usr/lib/nagios/plugins/check_k8s_pv_pvc --ignore-ns kube-system        # ignore kube-system
+#  sudo /usr/lib/nagios/plugins/check_k8s_pv_pvc --pvc-age-min 10 --crit 2      # ignore PVCs younger than 10min, CRIT if >=2
+#  sudo /usr/lib/nagios/plugins/check_k8s_pv_pvc --no-pv                        # PVC checks only (both are checked by default)
+#
+set -euo pipefail
+
+# Defaults
+WARN=${WARN:-0}
+CRIT=${CRIT:-1}
+IGNORE_NS=""
+INCLUDE_NS=""
+PVC_AGE_MIN=${PVC_AGE_MIN:-5}        # in minutes: ignore PVCs created less than X minutes ago (default 5)
+CHECK_PV=1
+CHECK_PVC=1
+
+print_usage() {
+  # Print the command-line help on stdout, one printf argument per output line.
+  printf '%s\n' \
+    "Usage: $0 [options]" \
+    "Options:" \
+    "  --warn N             seuil WARN si >= N objets en erreur (default 0)" \
+    "  --crit M             seuil CRIT si >= M objets en erreur (default 1)" \
+    "  --ignore-ns a,b,c    namespaces à ignorer (comma separated)" \
+    "  --namespaces a,b     limiter aux namespaces donnés (comma separated)" \
+    "  --pvc-age-min N      ignore PVC créés il y a moins de N minutes (default 5)" \
+    "  --no-pv              disable PV checks" \
+    "  --no-pvc             disable PVC checks" \
+    "  -h, --help           affiche cette aide"
+}
+
+# Consume the argument list option by option (value options also take the next word).
+while (( $# > 0 )); do
+  case $1 in
+    --warn)        WARN=$2;        shift 2 ;;
+    --crit)        CRIT=$2;        shift 2 ;;
+    --ignore-ns)   IGNORE_NS=$2;   shift 2 ;;
+    --namespaces)  INCLUDE_NS=$2;  shift 2 ;;
+    --pvc-age-min) PVC_AGE_MIN=$2; shift 2 ;;
+    --no-pv)       CHECK_PV=0;     shift 1 ;;
+    --no-pvc)      CHECK_PVC=0;    shift 1 ;;
+    --help|-h)     print_usage; exit 3 ;;
+    *)             echo "Unknown arg: $1"; print_usage; exit 3 ;;
+  esac
+done
+
+# kubectl is mandatory: without it nothing can be queried, so answer UNKNOWN.
+command -v kubectl >/dev/null 2>&1 || {
+  echo "UNKNOWN - kubectl not found"; exit 3
+}
+
+# Namespace filters: "a,b" becomes the anchored alternation "^a$|^b$" (empty list -> empty pattern).
+ignore_pattern=""
+if [[ -n "$IGNORE_NS" ]]; then
+  IFS=',' read -ra arr <<< "$IGNORE_NS"
+  for ns in "${arr[@]}"; do
+    ignore_pattern+="|^${ns}\$"
+  done
+  ignore_pattern=${ignore_pattern:1}   # drop the leading "|"
+fi
+
+include_pattern=""
+if [[ -n "$INCLUDE_NS" ]]; then
+  IFS=',' read -ra arr2 <<< "$INCLUDE_NS"
+  for ns in "${arr2[@]}"; do
+    include_pattern+="|^${ns}\$"
+  done
+  include_pattern=${include_pattern:1}   # drop the leading "|"
+fi
+
+now_s=$(date +%s)
+
+# Initialize problems array safely (avoids an "unbound variable" error under set -u)
+problems=()
+
+# Helper: namespace filter.
+# ns_allowed NS -> returns 0 if namespace NS passes the include/ignore filters, 1 otherwise.
+# An empty pattern disables its filter. Uses bash's ERE operator instead of the obsolescent egrep.
+ns_allowed() {
+  local ns="$1"
+  # include list given: the namespace has to match it
+  if [[ -n "$include_pattern" && ! "$ns" =~ $include_pattern ]]; then
+    return 1
+  fi
+  # ignore list given: the namespace must not match it
+  if [[ -n "$ignore_pattern" && "$ns" =~ $ignore_pattern ]]; then
+    return 1
+  fi
+  return 0
+}
+
+# 1) Check PVCs
+if (( CHECK_PVC == 1 )); then
+  # One tab-separated record per PVC: namespace, name, phase, volumeName, creationTimestamp.
+  # A failing kubectl is reported as UNKNOWN instead of being read as "no PVC, no problem".
+  if ! pvc_raw=$(kubectl get pvc -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.name}{"\t"}{.status.phase}{"\t"}{.spec.volumeName}{"\t"}{.metadata.creationTimestamp}{"\n"}{end}' 2>/dev/null); then
+    echo "UNKNOWN - kubectl get pvc failed (API unreachable or access denied?)"
+    exit 3
+  fi
+  mapfile -t pvc_lines <<< "$pvc_raw"
+
+  for line in "${pvc_lines[@]}"; do
+    [[ -z "$line" ]] && continue
+    # Tabs become a non-whitespace separator so that read keeps empty fields (e.g. a missing volumeName).
+    fields=${line//$'\t'/$'\x1f'}
+    IFS=$'\x1f' read -r ns name phase vol created <<< "$fields"
+
+    # filter namespaces
+    if ! ns_allowed "$ns"; then
+      continue
+    fi
+
+    # ignore PVC newly created (to avoid noise during normal provisioning);
+    # an unparsable timestamp becomes epoch 0, i.e. "old", so the PVC is still checked
+    if [[ -n "$created" ]] && (( PVC_AGE_MIN > 0 )); then
+      created_s=$(date -d "$created" +%s 2>/dev/null || echo 0)
+      age_min=$(( (now_s - created_s) / 60 ))
+      if (( age_min < PVC_AGE_MIN )); then
+        continue
+      fi
+    fi
+
+    # Consider non-Bound phases as problematic (Pending, Lost, Failed)
+    # Bound is OK; if Bound but no volumeName -> problem
+    if [[ "$phase" != "Bound" ]]; then
+      problems+=("PVC ${ns}/${name} phase=${phase} created=${created}")
+    elif [[ -z "$vol" || "$vol" == "null" ]]; then
+      problems+=("PVC ${ns}/${name} Bound but no volumeName assigned")
+    fi
+  done
+fi
+
+# 2) Check PVs
+if (( CHECK_PV == 1 )); then
+  # One tab-separated record per PV: name, phase, capacity, claimRef namespace, claimRef name, reclaim policy.
+  # A failing kubectl is reported as UNKNOWN instead of being read as "no PV, no problem".
+  if ! pv_raw=$(kubectl get pv -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.phase}{"\t"}{.spec.capacity.storage}{"\t"}{.spec.claimRef.namespace}{"\t"}{.spec.claimRef.name}{"\t"}{.spec.persistentVolumeReclaimPolicy}{"\n"}{end}' 2>/dev/null); then
+    echo "UNKNOWN - kubectl get pv failed (API unreachable or access denied?)"
+    exit 3
+  fi
+  mapfile -t pv_lines <<< "$pv_raw"
+
+  for line in "${pv_lines[@]}"; do
+    [[ -z "$line" ]] && continue
+    # Tabs become a non-whitespace separator so that read keeps empty fields (a PV may have no claimRef).
+    fields=${line//$'\t'/$'\x1f'}
+    IFS=$'\x1f' read -r name phase _cap claim_ns claim_name reclaim <<< "$fields"
+    # a literal "null" is treated like a missing value
+    [[ "$claim_ns" == "null" ]] && claim_ns=""
+    [[ "$claim_name" == "null" ]] && claim_name=""
+
+    # PVs whose claim lives in a filtered-out namespace are never reported.
+    if [[ -n "$claim_ns" ]] && ! ns_allowed "$claim_ns"; then
+      continue
+    fi
+
+    case "$phase" in
+      Released|Failed)
+        # Reported whether or not a claimRef is present: a Released PV normally still
+        # carries the claimRef of the claim it was bound to.
+        claim_desc="no claim"
+        [[ -n "$claim_ns" ]] && claim_desc="claim ${claim_ns}/${claim_name}"
+        problems+=("PV ${name} phase=${phase} reclaim=${reclaim} (${claim_desc})")
+        ;;
+      Bound)
+        # A bound PV needs a complete claimRef, and the referenced PVC has to exist.
+        if [[ -z "$claim_ns" || -z "$claim_name" ]]; then
+          problems+=("PV ${name} Bound but missing claimRef (phase=${phase})")
+        elif ! kubectl get pvc -n "${claim_ns}" "${claim_name}" >/dev/null 2>&1; then
+          # NOTE: any kubectl error here (not only "NotFound") is reported as a missing PVC
+          problems+=("PV ${name} Bound to ${claim_ns}/${claim_name} but PVC resource not found")
+        fi
+        ;;
+      # Other phases (e.g. Available, i.e. unbound) are deliberately not treated as a problem.
+    esac
+  done
+fi
+
+count=${#problems[@]}   # number of PV/PVC findings collected above
+
+if [[ $count -eq 0 ]]; then
+  echo "OK - PV/PVC checks passed"
+  exit 0
+fi
+
+# Pick the state from the thresholds (CRIT is tested first); both alert states share one message.
+if (( count >= CRIT )); then
+  rc=2; label="CRITICAL"
+elif (( count >= WARN )); then
+  rc=1; label="WARNING"
+else
+  echo "OK - ${count} PV/PVC problems but below thresholds"
+  exit 0
+fi
+echo "${label} - ${count} PV/PVC problems: ${problems[*]}"
+exit "$rc"
--- a/files/nrpe/check_k8s_replicasets
+++ b/files/nrpe/check_k8s_replicasets
@@ -0,0 +1,135 @@
+#!/usr/bin/env bash
+# check_k8s_replicasets
+# Checks Kubernetes ReplicaSets and reports those with readyReplicas < spec.replicas
+# Exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
+#
+# Usage:
+#  sudo /usr/lib/nagios/plugins/check_k8s_replicasets [--warn N] [--crit M] [--ignore-ns ns1,ns2] [--namespaces ns1,ns2] [--age-min MINUTES]
+#
+set -euo pipefail
+
+WARN=${WARN:-0}   # number of failing ReplicaSets that triggers WARNING
+CRIT=${CRIT:-1}   # number of failing ReplicaSets that triggers CRITICAL (default 1 => a single failing RS is CRITICAL)
+IGNORE_NS=""      # comma-separated namespaces to skip (--ignore-ns)
+INCLUDE_NS=""     # comma-separated namespaces to restrict the check to (--namespaces)
+AGE_MIN=0         # minimum age in minutes before a ReplicaSet is checked (--age-min)
+
+print_usage() {
+  # Print the command-line help on stdout, one printf argument per output line.
+  printf '%s\n' \
+    "Usage: $0 [--warn N] [--crit M] [--ignore-ns ns1,ns2] [--namespaces ns1,ns2] [--age-min MINUTES]" \
+    " --warn N         : seuil warn si >=N ReplicaSets en erreur (default 0)" \
+    " --crit M         : seuil crit si >=M ReplicaSets en erreur (default 1)" \
+    " --ignore-ns LIST : comma separated namespaces to ignore (default none)" \
+    " --namespaces LIST: comma separated namespaces to check only (default all)" \
+    " --age-min N      : ignore ReplicaSets created less than N minutes ago (avoid flapping during rollout)"
+}
+
+# Consume the argument list option by option (value options also take the next word).
+while (( $# > 0 )); do
+  case $1 in
+    --warn)       WARN=$2;       shift 2 ;;
+    --crit)       CRIT=$2;       shift 2 ;;
+    --ignore-ns)  IGNORE_NS=$2;  shift 2 ;;
+    --namespaces) INCLUDE_NS=$2; shift 2 ;;
+    --age-min)    AGE_MIN=$2;    shift 2 ;;
+    --help|-h)    print_usage; exit 3 ;;
+    *)            echo "Unknown arg: $1"; print_usage; exit 3 ;;
+  esac
+done
+
+# kubectl is mandatory: without it nothing can be queried, so answer UNKNOWN.
+command -v kubectl >/dev/null 2>&1 || {
+  echo "UNKNOWN - kubectl not found"; exit 3
+}
+
+# Namespace filters: "a,b" becomes the anchored alternation "^a$|^b$" (empty list -> empty pattern).
+ignore_pattern=""
+if [[ -n "$IGNORE_NS" ]]; then
+  IFS=',' read -ra arr <<< "$IGNORE_NS"
+  for ns in "${arr[@]}"; do
+    ignore_pattern+="|^${ns}\$"
+  done
+  ignore_pattern=${ignore_pattern:1}   # drop the leading "|"
+fi
+
+include_pattern=""
+if [[ -n "$INCLUDE_NS" ]]; then
+  IFS=',' read -ra arr2 <<< "$INCLUDE_NS"
+  for ns in "${arr2[@]}"; do
+    include_pattern+="|^${ns}\$"
+  done
+  include_pattern=${include_pattern:1}   # drop the leading "|"
+fi
+
+# result collection
+# Initialize the failures array so that "${#failures[@]}" is safe when set -u is active
+failures=()
+
+# Fetch one tab-separated record per ReplicaSet:
+#   namespace, name, spec.replicas, status.readyReplicas, creationTimestamp
+# A kubectl failure is reported as UNKNOWN; swallowing it used to end in a misleading "OK".
+if ! raw=$(kubectl get rs -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.name}{"\t"}{.spec.replicas}{"\t"}{.status.readyReplicas}{"\t"}{.metadata.creationTimestamp}{"\n"}{end}' 2>/dev/null); then
+  echo "UNKNOWN - kubectl get rs failed (API unreachable or access denied?)"
+  exit 3
+fi
+mapfile -t lines <<< "$raw"
+
+now_s=$(date +%s)
+
+for line in "${lines[@]}"; do
+  # Skip empty lines (an empty kubectl answer yields exactly one)
+  [[ -z "$line" ]] && continue
+
+  # Split on tabs while keeping empty fields (a counter may be missing from the output):
+  # tabs become a non-whitespace separator, which read does not collapse.
+  fields=${line//$'\t'/$'\x1f'}
+  IFS=$'\x1f' read -r ns name desired ready created <<< "$fields"
+
+  # normalize missing counters to 0
+  desired=${desired:-0}
+  ready=${ready:-0}
+
+  # namespace filtering with bash's ERE operator (egrep is obsolescent and costs a fork)
+  if [[ -n "$include_pattern" && ! "$ns" =~ $include_pattern ]]; then
+    continue
+  fi
+  if [[ -n "$ignore_pattern" && "$ns" =~ $ignore_pattern ]]; then
+    continue
+  fi
+
+  # age filtering (skip very recent RS);
+  # an unparsable timestamp becomes epoch 0, i.e. "old", so the RS is still checked
+  if [[ -n "$created" ]] && (( AGE_MIN > 0 )); then
+    created_s=$(date -d "$created" +%s 2>/dev/null || echo 0)
+    age_min=$(( (now_s - created_s) / 60 ))
+    if (( age_min < AGE_MIN )); then
+      continue
+    fi
+  fi
+
+  # Only consider RS where desired > 0 (skip zero-scale RS)
+  if (( desired > 0 )) && (( ready < desired )); then
+    failures+=("${ns}/${name} (desired=${desired},ready=${ready})")
+  fi
+done
+
+count=${#failures[@]}
+
+# No ReplicaSet was flagged: report OK right away.
+if [[ $count -eq 0 ]]; then
+  echo "OK - all ReplicaSets report ready==desired"
+  exit 0
+fi
+
+# Pick the state from the thresholds (CRIT is tested first); both alert states share one message.
+if (( count >= CRIT )); then
+  rc=2; label="CRITICAL"
+elif (( count >= WARN )); then
+  rc=1; label="WARNING"
+else
+  echo "OK - ${count} ReplicaSets not fully ready but below thresholds"
+  exit 0
+fi
+echo "${label} - ${count} ReplicaSets not fully ready: ${failures[*]}"
+exit "$rc"
--- a/templates/nrpe.j2
+++ b/templates/nrpe.j2
@@ -72,13 +72,32 @@ command[check_docker_{{ container }}]=/usr/lib/nagios/plugins/check_docker --con
 {% endif %}

 {% if nrpe_process is defined %}
+# process
+{# One check_proc_<name> command per entry of nrpe_process; each runs check_systemd_service on that name. #}
 {% for process in nrpe_process %}
 command[check_proc_{{ process }}]=/usr/lib/nagios/plugins/check_systemd_service {{ process }}
 {% endfor %}
 {% endif %}

+{% if nrpe_kubernetes is defined or nrpe_kubernetes_manager is defined %}
+{# Kubernetes section: emitted when nrpe_kubernetes and/or nrpe_kubernetes_manager is defined. #}
+# kubernetes
 {% if nrpe_kubernetes is defined %}
+{# Node checks: check_systemd_service on kubelet, etcd and containerd. #}
+## nodes
 command[check_proc_kubelet]=/usr/lib/nagios/plugins/check_systemd_service kubelet
 command[check_proc_etcd]=/usr/lib/nagios/plugins/check_systemd_service etcd
 command[check_proc_containerd]=/usr/lib/nagios/plugins/check_systemd_service containerd
 {% endif %}
+{% if nrpe_kubernetes_manager is defined %}
+{# Control-plane checks. check_k8s_health uses check_http over TLS against port 6443, path /healthz, on the host's default IPv4 address and expects "ok" in the response. #}
+{# NOTE(review): -w 1 / -c 2 look like response-time thresholds in seconds, and --continue-after-certificate appears to matter only together with -C; confirm against the installed check_http. #}
+{# The remaining commands run their plugin through sudo. check_etcd_health also needs nrpe_kubernetes_manager_nodename to build the client certificate and key paths. #}
+## manager / control plane
+command[check_k8s_health]=/usr/lib/nagios/plugins/check_http -I {{ ansible_default_ipv4.address }} -p 6443 -S -u /healthz --continue-after-certificate -r ok -w 1 -c 2
+command[check_cilium_health]=/usr/bin/sudo /usr/lib/nagios/plugins/check_cilium_health
+command[check_coredns_health]=/usr/bin/sudo /usr/lib/nagios/plugins/check_coredns_health
+command[check_etcd_health]=/usr/bin/sudo /usr/lib/nagios/plugins/check_etcd_health --endpoints "https://{{ ansible_default_ipv4.address }}:2379" --cacert /etc/ssl/etcd/ssl/ca.pem --cert /etc/ssl/etcd/ssl/node-{{ nrpe_kubernetes_manager_nodename }}.pem --key /etc/ssl/etcd/ssl/node-{{ nrpe_kubernetes_manager_nodename }}-key.pem
+command[check_k8s_apiserver_access]=/usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_apiserver_access
+command[check_k8s_deployments]=/usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_deployments
+command[check_k8s_jobs_cronjobs]=/usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_jobs_cronjobs
+command[check_k8s_pki_certs]=/usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_pki_certs
+command[check_k8s_pv_pvc]=/usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_pv_pvc
+command[check_k8s_replicasets]=/usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_replicasets
+command[check_k8s_pod_restarts]=/usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_pod_restarts
+{% endif %}
+{% endif %}
--- a/templates/nrpe.sudoers.j2
+++ b/templates/nrpe.sudoers.j2
@@ -2,3 +2,13 @@ nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_postfix_mailqueue -w {{
 nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_exim_mailqueue -w {{ nrpe_mailq_warning }} -c {{ nrpe_mailq_critical }}
 nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_raid
 nagios ALL=(ALL) NOPASSWD: /usr/sbin/needrestart -b -l
+{# Kubernetes check plugins the nagios user may run through sudo without a password. The entries list no arguments; check_etcd_health is invoked with options from the NRPE command definition. #}
+nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_cilium_health
+nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_coredns_health
+nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_etcd_health
+nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_apiserver_access
+nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_deployments
+nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_jobs_cronjobs
+nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_pki_certs
+nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_pv_pvc
+nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_replicasets
+nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_pod_restarts