diff --git a/files/nrpe/check_cilium_health b/files/nrpe/check_cilium_health new file mode 100644 index 0000000..00f7ffb --- /dev/null +++ b/files/nrpe/check_cilium_health @@ -0,0 +1,307 @@ +#!/usr/bin/env bash +# check_cilium_health +# Vérifie la santé de Cilium (pods, daemonsets, operator) et optionnellement utilise le binaire `cilium status -o json`. +# Retour: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN +# +# Usage: +# sudo /usr/lib/nagios/plugins/check_cilium_health [--namespace N] [--label LABEL] [--warn-not-ready N] [--crit-not-ready M] [--use-cilium-cli] [--timeout SECS] +# +set -euo pipefail + +# Defaults +NAMESPACE=${NAMESPACE:-kube-system} +LABEL=${LABEL:-k8s-app=cilium} +WARN_NOT_READY=${WARN_NOT_READY:-1} +CRIT_NOT_READY=${CRIT_NOT_READY:-2} +WARN_RESTARTS=${WARN_RESTARTS:-3} +CRIT_RESTARTS=${CRIT_RESTARTS:-10} +USE_CILIUM_CLI=0 +TIMEOUT=${TIMEOUT:-10} + +print_usage() { + cat <= N pods not ready (default ${WARN_NOT_READY}) + --crit-not-ready M critical if >= M pods not ready (default ${CRIT_NOT_READY}) + --warn-restarts R warn if restartCount >= R per pod (default ${WARN_RESTARTS}) + --crit-restarts S critical if restartCount >= S per pod (default ${CRIT_RESTARTS}) + --use-cilium-cli run 'cilium status -o json' as additional check (requires cilium binary) + --timeout SECS kubectl timeout in seconds (default ${TIMEOUT}) + -h, --help show this help +EOF +} + +# Parse args +while [[ $# -gt 0 ]]; do + case "$1" in + --namespace) NAMESPACE="$2"; shift 2;; + --label) LABEL="$2"; shift 2;; + --warn-not-ready) WARN_NOT_READY="$2"; shift 2;; + --crit-not-ready) CRIT_NOT_READY="$2"; shift 2;; + --warn-restarts) WARN_RESTARTS="$2"; shift 2;; + --crit-restarts) CRIT_RESTARTS="$2"; shift 2;; + --use-cilium-cli) USE_CILIUM_CLI=1; shift 1;; + --timeout) TIMEOUT="$2"; shift 2;; + -h|--help) print_usage; exit 3;; + *) echo "Unknown arg: $1"; print_usage; exit 3;; + esac +done + +# ensure kubectl & python present +if ! command -v kubectl >/dev/null 2>&1; then + echo "UNKNOWN - kubectl not found in PATH" + exit 3 +fi +if ! command -v python3 >/dev/null 2>&1; then + echo "UNKNOWN - python3 not found in PATH (required for JSON parsing)" + exit 3 +fi + +# ---- kubeconfig handling ---- +# If KUBECONFIG is not set, try sensible defaults so sudo/nagios runs succeed. +# Priority: +# 1) env KUBECONFIG if already defined +# 2) /etc/kubernetes/admin.conf if present (common on control-planes) +# 3) /root/.kube/config if present +# 4) fallback to empty (kubectl will then try defaults and may fail) +if [[ -z "${KUBECONFIG:-}" ]]; then + if [[ -r "/etc/kubernetes/admin.conf" ]]; then + export KUBECONFIG="/etc/kubernetes/admin.conf" + elif [[ -r "/root/.kube/config" ]]; then + export KUBECONFIG="/root/.kube/config" + else + # leave unset; kubectl will attempt defaults + unset KUBECONFIG || true + fi +fi + +# Use explicit kubeconfig for kubectl invocations to avoid home/KUBECONFIG differences under sudo +if [[ -n "${KUBECONFIG:-}" ]]; then + KC="kubectl --kubeconfig=${KUBECONFIG} --request-timeout=${TIMEOUT}s" +else + KC="kubectl --request-timeout=${TIMEOUT}s" +fi + +# Helper to run python parser safely via temp file +run_python_parser() { + # $1 = input (stdin), $2 = python here-doc content (as a bash string) + local input="$1" + local pyprog="$2" + local tmp pyfile + tmp=$(mktemp) || return 1 + pyfile=$(mktemp) || { rm -f "$tmp"; return 1; } + printf '%s\n' "$pyprog" > "$pyfile" + printf '%s' "$input" | python3 "$pyfile" > "$tmp" 2>/dev/null + local rc=$? 
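+  # Descriptive note: the parser program is materialised in a temp file (rather
+  # than passed inline to python3) and its exit status is saved into rc here,
+  # before the cleanup below, so the temp-file removal cannot clobber it.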
+ rm -f "$pyfile" + if [[ $rc -ne 0 ]]; then + rm -f "$tmp" + return $rc + fi + cat "$tmp" + rm -f "$tmp" + return 0 +} + +# 1) get pods JSON robustly +set +e +pods_json=$($KC -n "$NAMESPACE" get pods -l "$LABEL" -o json 2>&1) +rc_kubectl=$? +set -e +if (( rc_kubectl != 0 )); then + echo "CRITICAL - kubectl failed to list Cilium pods: ${pods_json//$'\n'/ ' '}" + exit 2 +fi + +# 2) parse pods JSON via python (safe invocation) +pod_python_prog=$'import sys,json\ntry:\n data=json.load(sys.stdin)\nexcept Exception:\n sys.exit(1)\nitems=data.get(\"items\",[])\nfor it in items:\n name=it.get(\"metadata\",{}).get(\"name\",\"\")\n node=it.get(\"spec\",{}).get(\"nodeName\",\"\")\n phase=it.get(\"status\",{}).get(\"phase\",\"\")\n cs=it.get(\"status\",{}).get(\"containerStatuses\",[]) or []\n total_cont=len(cs)\n ready_cnt=sum(1 for c in cs if c.get(\"ready\") is True)\n restarts=sum(int(c.get(\"restartCount\",0) or 0) for c in cs)\n ready_str = f\"{ready_cnt}/{total_cont}\"\n print(f\"{name}\\t{phase}\\t{ready_str}\\t{restarts}\\t{node}\")\n' + +pod_lines=() +if pod_out=$(run_python_parser "$pods_json" "$pod_python_prog"); then + # read into array safely + IFS=$'\n' read -r -d '' -a pod_lines <<< "$(printf '%s\n' "$pod_out")" || true +fi + +# Fallback if parsing failed or empty: use simple kubectl get pods --no-headers +if [[ ${#pod_lines[@]} -eq 0 ]]; then + simple=$($KC -n "$NAMESPACE" get pods -l "$LABEL" --no-headers 2>&1 || true) + count_simple=$(printf '%s\n' "$simple" | sed '/^\s*$/d' | wc -l) + if [[ "$count_simple" -eq 0 ]]; then + echo "CRITICAL - no Cilium pods found or kubectl output unparsable. kubectl output: ${simple//$'\n'/ ' '}" + exit 2 + fi + # convert simple lines into pod_lines minimally: NAME READY ... -> parse name and READY column + while IFS= read -r l; do + [[ -z "$l" ]] && continue + name=$(echo "$l" | awk '{print $1}') + readycol=$(echo "$l" | awk '{print $2}') + if [[ "$readycol" == *"/"* ]]; then + rnum=$(echo "$readycol" | cut -d'/' -f1) + rtot=$(echo "$readycol" | cut -d'/' -f2) + else + rnum=0; rtot=0 + fi + if [[ "$rnum" == "$rtot" && "$rtot" != "0" ]]; then + phase="Running" + else + phase="NotReady" + fi + restarts=0 + node="" + pod_lines+=("${name}\t${phase}\t${rnum}/${rtot}\t${restarts}\t${node}") + done < <(printf '%s\n' "$simple") +fi + +# Now evaluate pod_lines +total_pods=0 +not_ready=0 +not_ready_list=() +high_restart_pods=() + +for line in "${pod_lines[@]}"; do + [[ -z "$line" ]] && continue + total_pods=$((total_pods+1)) + IFS=$'\t' read -r pname pphase pready prest pnode <<< "$line" + ready_num=${pready%/*} + ready_tot=${pready#*/} + ready_num=${ready_num:-0} + ready_tot=${ready_tot:-0} + if [[ "$pphase" != "Running" ]] || (( ready_num < ready_tot )); then + not_ready=$((not_ready+1)) + not_ready_list+=("${pname}:${pphase}:${pready}") + fi + prest=${prest:-0} + if (( prest >= CRIT_RESTARTS )); then + high_restart_pods+=("${pname}:${prest}:CRITICAL") + elif (( prest >= WARN_RESTARTS )); then + high_restart_pods+=("${pname}:${prest}:WARN") + fi +done + +# DaemonSet check (desired vs ready) using safe python parsing +set +e +ds_out=$($KC -n "$NAMESPACE" get ds -l "$LABEL" -o json 2>&1) +rc_ds=$? 
+set -e +ds_desired=0; ds_ready=0 +if (( rc_ds == 0 )); then + ds_python_prog=$'import sys,json\ndata=json.load(sys.stdin)\nfor it in data.get(\"items\",[]):\n s=it.get(\"status\",{})\n desired=int(s.get(\"desiredNumberScheduled\") or 0)\n ready=int(s.get(\"numberReady\") or 0)\n print(f\"{desired}\\t{ready}\")\n' + if ds_out_parsed=$(run_python_parser "$ds_out" "$ds_python_prog"); then + while IFS=$'\n' read -r d; do + [[ -z "$d" ]] && continue + ddesired=$(echo "$d" | cut -f1) + dready=$(echo "$d" | cut -f2) + ds_desired=$((ds_desired+ddesired)) + ds_ready=$((ds_ready+dready)) + done <<< "$ds_out_parsed" + fi +fi + +# cilium-operator deployment check +op_ok=1 +op_msg="" +set +e +op_json=$($KC -n "$NAMESPACE" get deploy cilium-operator -o json 2>/dev/null || true) +set -e +if [[ -n "$op_json" ]]; then + op_python_prog=$'import sys,json\ndata=json.load(sys.stdin)\nspec=data.get(\"spec\",{})\nstatus=data.get(\"status\",{})\nreplicas=int(spec.get(\"replicas\") or 1)\navailable=int(status.get(\"availableReplicas\") or 0)\nprint(f\"{replicas}\\t{available}\")\n' + if op_line=$(run_python_parser "$op_json" "$op_python_prog"); then + IFS=$'\t' read -r op_repl op_avail <<< "$op_line" + if (( op_avail < op_repl )); then + op_ok=0 + op_msg="operator available=${op_avail}/${op_repl}" + else + op_msg="operator available=${op_avail}/${op_repl}" + fi + fi +fi + +# Optional: cilium CLI +cilium_ok=1 +cilium_summary="" +if (( USE_CILIUM_CLI == 1 )); then + if ! command -v cilium >/dev/null 2>&1; then + cilium_ok=0 + cilium_summary="cilium binary not in PATH" + else + set +e + cilium_raw=$(cilium status -o json 2>&1) || true + rc_cilium=$? + set -e + if (( rc_cilium != 0 )); then + cilium_ok=0 + cilium_summary="cilium status failed: ${cilium_raw//$'\n'/ ' '}" + else + cilium_ok=1 + cilium_summary=$(printf '%s' "$cilium_raw" | tr '\n' ' ' | sed 's/ */ /g' | cut -c1-300) + fi + fi +fi + +# Compose status +code=0 +msgs=() + +if (( not_ready >= CRIT_NOT_READY )); then + code=2 + msgs+=("CRITICAL - ${not_ready}/${total_pods} pods not ready") +elif (( not_ready >= WARN_NOT_READY )); then + if (( code < 1 )); then code=1; fi + msgs+=("WARNING - ${not_ready}/${total_pods} pods not ready") +else + msgs+=("OK - ${total_pods} pods, not-ready=${not_ready}") +fi + +if (( ds_desired > 0 )) && (( ds_ready < ds_desired )); then + if (( ds_desired - ds_ready >= CRIT_NOT_READY )); then + code=2 + msgs+=("CRITICAL - daemonsets ready=${ds_ready}/${ds_desired}") + else + if (( code < 1 )); then code=1; fi + msgs+=("WARNING - daemonsets ready=${ds_ready}/${ds_desired}") + fi +fi + +if [[ -n "$op_msg" ]]; then + if (( op_ok == 0 )); then + code=2 + msgs+=("CRITICAL - ${op_msg}") + else + msgs+=("${op_msg}") + fi +fi + +if (( ${#high_restart_pods[@]} > 0 )); then + crit_restart=0; warn_restart=0 + for r in "${high_restart_pods[@]}"; do + [[ "$r" == *":CRITICAL" ]] && crit_restart=1 + [[ "$r" == *":WARN" ]] && warn_restart=1 + done + if (( crit_restart == 1 )); then + code=2 + msgs+=("CRITICAL - pods with high restart counts: ${high_restart_pods[*]}") + elif (( warn_restart == 1 )); then + if (( code < 1 )); then code=1; fi + msgs+=("WARNING - pods with elevated restarts: ${high_restart_pods[*]}") + fi +fi + +if (( USE_CILIUM_CLI == 1 )); then + if (( cilium_ok == 0 )); then + code=2 + msgs+=("CRITICAL - cilium-cli: ${cilium_summary}") + else + msgs+=("cilium-cli ok: ${cilium_summary}") + fi +fi + +if (( not_ready > 0 )); then + truncated=$(printf "%s, " "${not_ready_list[@]}" | sed 's/, $//') + msgs+=("not-ready-list: 
${truncated}") +fi + +echo "$(IFS=' ; '; echo "${msgs[*]}")" +exit "${code}" \ No newline at end of file diff --git a/files/nrpe/check_coredns_health b/files/nrpe/check_coredns_health new file mode 100644 index 0000000..4bf16ea --- /dev/null +++ b/files/nrpe/check_coredns_health @@ -0,0 +1,158 @@ +#!/usr/bin/env bash +# check_coredns_health +# Vérifie la santé de CoreDNS (endpoints + endpointslices + fallback pods) +# Retour codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN +# +# Usage: +# sudo /usr/lib/nagios/plugins/check_coredns_health [--namespace N] [--service NAME] [--label-fallback LABEL] [--kubeconfig PATH] +# +set -euo pipefail + +NAMESPACE=${NAMESPACE:-kube-system} +SERVICE_NAME=${SERVICE_NAME:-coredns} +LABEL_FALLBACK=${LABEL_FALLBACK:-k8s-app=kube-dns} +TIMEOUT=${TIMEOUT:-10} + +usage() { + cat </dev/null 2>&1; then + echo "UNKNOWN - kubectl not found" + exit 3 +fi + +# If KUBECONFIG not set, try sensible defaults so sudo/nagios runs succeed. +if [[ -z "${KUBECONFIG:-}" ]]; then + if [[ -r "/etc/kubernetes/admin.conf" ]]; then + export KUBECONFIG="/etc/kubernetes/admin.conf" + elif [[ -r "/root/.kube/config" ]]; then + export KUBECONFIG="/root/.kube/config" + fi +fi + +# Build kubectl command with explicit kubeconfig when available +if [[ -n "${KUBECONFIG:-}" ]]; then + KC=(kubectl --kubeconfig="${KUBECONFIG}" --request-timeout="${TIMEOUT}s") +else + KC=(kubectl --request-timeout="${TIMEOUT}s") +fi + +# run_kc: capture stdout only (stderr -> /dev/null) and return kubectl's exit code +run_kc() { + local out rc + out="$("${KC[@]}" "$@" 2>/dev/null)" + rc=$? + printf '%s' "$out" + return $rc +} + +# 1) try Endpoints resource +ep_out=$(run_kc -n "$NAMESPACE" get endpoints "$SERVICE_NAME" -o jsonpath='{.subsets[*].addresses[*].ip}') +rc=$? +if (( rc != 0 )); then + echo "CRITICAL - kubectl failed to get Endpoints (exit code ${rc})" + exit 2 +fi +if [[ -n "${ep_out// /}" ]]; then + echo "OK - service ${SERVICE_NAME} in ${NAMESPACE} has endpoints: $(echo "$ep_out" | tr ' ' ',')" + exit 0 +fi + +# 2) try EndpointSlices (k8s >= 1.17) +eps_out=$(run_kc -n "$NAMESPACE" get endpointslices -l "kubernetes.io/service-name=${SERVICE_NAME}" -o jsonpath='{range .items[*]}{range .endpoints[*]}{.addresses[*]}{"\n"}{end}{end}') +rc=$? +if (( rc != 0 )); then + echo "CRITICAL - kubectl failed to get EndpointSlices (exit code ${rc})" + exit 2 +fi +if [[ -n "${eps_out// /}" ]]; then + tops=$(printf '%s\n' "$eps_out" | sed '/^\s*$/d' | tr '\n' ',' | sed 's/,$//') + echo "OK - service ${SERVICE_NAME} in ${NAMESPACE} has EndpointSlices addresses: ${tops}" + exit 0 +fi + +# 3) fallback: check service selector and pods matching it +svc_out=$(run_kc -n "$NAMESPACE" get svc "$SERVICE_NAME" -o jsonpath='{range $k,$v := .spec.selector}{printf "%s=%s;" $k $v}{end}') +rc=$? +if (( rc != 0 )); then + echo "CRITICAL - kubectl failed to get Service selector (exit code ${rc})" + exit 2 +fi + +SEL="$svc_out" +if [[ -z "$SEL" ]]; then + SEL="$LABEL_FALLBACK" + SEL=${SEL//;/,} +fi +SEL=${SEL%[;,]} + +# get pods by selector +pods_out=$(run_kc -n "$NAMESPACE" get pods -l "$SEL" --no-headers -o custom-columns=READY:.status.containerStatuses[0].ready,NAME:.metadata.name) +rc=$? 
+if (( rc != 0 )); then + echo "CRITICAL - kubectl failed to list pods for selector '${SEL}' (exit code ${rc})" + exit 2 +fi + +if [[ -z "${pods_out// /}" ]]; then + # try alternative labels common for CoreDNS (k8s-app=coredns) + pods_alt=$(run_kc -n "$NAMESPACE" get pods -l k8s-app=coredns --no-headers -o custom-columns=READY:.status.containerStatuses[0].ready,NAME:.metadata.name) + rc=$? + if (( rc != 0 )); then + echo "CRITICAL - kubectl failed to list pods for fallback selector (exit code ${rc})" + exit 2 + fi + if [[ -n "${pods_alt// /}" ]]; then + pods_out="$pods_alt" + SEL="k8s-app=coredns (fallback)" + fi +fi + +if [[ -z "${pods_out// /}" ]]; then + echo "CRITICAL - service ${SERVICE_NAME} in ${NAMESPACE} has no endpoints and no pods match selector '${SEL}'" + exit 2 +fi + +# count Ready pods +not_ready_count=0 +total_count=0 +not_ready_list=() +while IFS= read -r line; do + [[ -z "$line" ]] && continue + total_count=$((total_count+1)) + ready_flag=$(echo "$line" | awk '{print $1}') + pod_name=$(echo "$line" | awk '{print $2}') + if [[ "$ready_flag" != "true" && "$ready_flag" != "True" && "$ready_flag" != "1" ]]; then + not_ready_count=$((not_ready_count+1)) + not_ready_list+=("$pod_name") + fi +done <<< "$pods_out" + +if (( total_count == 0 )); then + echo "CRITICAL - service ${SERVICE_NAME} in ${NAMESPACE} has no endpoints and no pods found for selector '${SEL}'" + exit 2 +fi + +if (( not_ready_count > 0 )); then + echo "WARNING - service ${SERVICE_NAME} in ${NAMESPACE} has no endpoints, but ${not_ready_count}/${total_count} pods matching selector '${SEL}' are not Ready: ${not_ready_list[*]}" + exit 1 +fi + +# If pods exist and are Ready but no Endpoints/EndpointSlices -> likely endpointcontroller/roles mismatch; consider OK but log it +echo "OK - service ${SERVICE_NAME} in ${NAMESPACE} has no Endpoints resource but ${total_count} pods matching selector '${SEL}' are Ready (EndpointSlices absent or controller issue)" +exit 0 \ No newline at end of file diff --git a/files/nrpe/check_etcd_health b/files/nrpe/check_etcd_health new file mode 100644 index 0000000..309a7b5 --- /dev/null +++ b/files/nrpe/check_etcd_health @@ -0,0 +1,230 @@ +#!/usr/bin/env bash +# check_etcd_health +# Verifie la santé d'etcd et (optionnel) la creation/verifieation des snapshots. +# Retourne : 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN +# +# Usage example: +# sudo /usr/lib/nagios/plugins/check_etcd_health \ +# --endpoints "https://192.168.1.41:2379,https://192.168.1.42:2379" \ +# --cacert /etc/ssl/etcd/ssl/ca.pem --cert /etc/ssl/etcd/ssl/admin.pem --key /etc/ssl/etcd/ssl/admin-key.pem \ +# --test-snapshot --snapshot-dir /var/backups/etcd --snapshot-max-age 24 +# +# Notes: +# - Par securite, execute ce script sur un master (ou via NRPE/SSH) avec un utilisateur ayant acces aux clefs. +# - --snapshot-max-age en heures (defaut 24). Mettre 0 pour desactiver la verification d'age. +# - --test-snapshot creerera un snapshot temporaire pour valider la creation + verification via `etcdctl snapshot status`. +# - Si --keep-snapshot-on-failure est active, le snapshot temporaire sera conserve en cas d'erreur pour debug. 
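+# - The connection settings can also be supplied through the standard ETCDCTL_*
+#   environment variables (ETCDCTL_ENDPOINTS, ETCDCTL_CACERT, ETCDCTL_CERT,
+#   ETCDCTL_KEY); the option parsing below falls back to them whenever the
+#   corresponding --endpoints/--cacert/--cert/--key flags are omitted.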
+ +ETCDCTL=${ETCDCTL:-/usr/local/bin/etcdctl} + +print_usage() { + cat <= N MB (default 1024) + --crit-db-mb M critique si DB >= M MB (default 1800) + --timeout SECS etcdctl timeout (default 10) + --test-snapshot tenter de creer un snapshot temporaire et verifier son status + --snapshot-dir DIR repertoire pour snapshots temporaires (default /var/backups/etcd) + --keep-snapshot-on-failure conserver le snapshot temporaire si creation echoue (default false) + --snapshot-max-age HRS verifier qu'il existe un snapshot plus recent que HRS heures (default 24). Mettre 0 pour desactiver. + -h, --help affiche cette aide +EOF +} + +# Defaults +WARN_DB_MB=${WARN_DB_MB:-1024} +CRIT_DB_MB=${CRIT_DB_MB:-1800} +TIMEOUT=${TIMEOUT:-10} +TEST_SNAPSHOT=0 +SNAPSHOT_DIR=${SNAPSHOT_DIR:-/var/backups/etcd} +KEEP_SNAPSHOT_ON_FAILURE=0 +SNAPSHOT_MAX_AGE_HOURS=${SNAPSHOT_MAX_AGE_HOURS:-24} + +# Parse args +while [[ $# -gt 0 ]]; do + case "$1" in + --endpoints) ENDPOINTS="$2"; shift 2;; + --cacert) CACERT="$2"; shift 2;; + --cert) CERT="$2"; shift 2;; + --key) KEY="$2"; shift 2;; + --warn-db-mb) WARN_DB_MB="$2"; shift 2;; + --crit-db-mb) CRIT_DB_MB="$2"; shift 2;; + --timeout) TIMEOUT="$2"; shift 2;; + --test-snapshot) TEST_SNAPSHOT=1; shift 1;; + --snapshot-dir) SNAPSHOT_DIR="$2"; shift 2;; + --keep-snapshot-on-failure) KEEP_SNAPSHOT_ON_FAILURE=1; shift 1;; + --snapshot-max-age) SNAPSHOT_MAX_AGE_HOURS="$2"; shift 2;; + -h|--help) print_usage; exit 3;; + *) echo "Unknown arg: $1"; print_usage; exit 3;; + esac +done + +# Allow env fallback (if ETCDCTL_* env vars set) +ENDPOINTS=${ENDPOINTS:-${ETCDCTL_ENDPOINTS:-}} +CACERT=${CACERT:-${ETCDCTL_CACERT:-}} +CERT=${CERT:-${ETCDCTL_CERT:-}} +KEY=${KEY:-${ETCDCTL_KEY:-}} + +if [[ -z "${ENDPOINTS:-}" || -z "${CACERT:-}" || -z "${CERT:-}" || -z "${KEY:-}" ]]; then + echo "UNKNOWN - missing required args/certs" + print_usage + exit 3 +fi + +if [[ ! -x "$ETCDCTL" ]]; then + echo "UNKNOWN - etcdctl not found at $ETCDCTL" + exit 3 +fi + +if [[ ! -r "$CACERT" || ! -r "$CERT" || ! 
-r "$KEY" ]]; then + echo "CRITICAL - cannot read certificate files (permissions?)" + echo "CACERT=$CACERT CERT=$CERT KEY=$KEY" + exit 2 +fi + +export ETCDCTL_API=3 + +# 1) endpoint status check +OUT=$("$ETCDCTL" --command-timeout="${TIMEOUT}s" --endpoints="${ENDPOINTS}" --cacert="$CACERT" --cert="$CERT" --key="$KEY" endpoint status 2>&1) || { + echo "CRITICAL - etcdctl endpoint status failed: $OUT" + exit 2 +} + +leaders=0 +total=0 +max_db_mb=0 +while IFS= read -r line; do + line=${line//$'\r'/} + [[ -z "$line" ]] && continue + total=$((total+1)) + IFS=',' read -r endpoint id version dbsize isLeader isLearner memberCount rest <<<"$line" + isLeader=$(echo "${isLeader:-}" | tr -d ' ' | tr '[:upper:]' '[:lower:]') + if [[ "$isLeader" == "true" ]]; then leaders=$((leaders+1)); fi + db_mb=0 + if [[ -n "${dbsize:-}" ]]; then + dbsize=$(echo "$dbsize" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//') + num=$(echo "$dbsize" | awk '{print $1}' 2>/dev/null || echo "") + unit=$(echo "$dbsize" | awk '{print $2}' 2>/dev/null || echo "") + if [[ "$num" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then + case "${unit^^}" in + B) db_mb=$(( num / 1024 / 1024 )) ;; + KB) db_mb=$(( num / 1024 )) ;; + MB) db_mb=$(printf "%.0f" "$num") ;; + GB) db_mb=$(( num * 1024 )) ;; + *) db_mb=$(printf "%.0f" "$num") ;; + esac + fi + fi + if (( db_mb > max_db_mb )); then max_db_mb=$db_mb; fi +done <<< "$OUT" + +if (( total == 0 )); then + echo "CRITICAL - no endpoints returned by etcdctl" + exit 2 +fi +if (( leaders == 0 )); then + echo "CRITICAL - no leader found among $total endpoints; detail: $OUT" + exit 2 +fi +if (( leaders > 1 )); then + echo "WARNING - multiple leaders detected: $leaders (possible split-brain); detail: $OUT" + exit 1 +fi +if (( max_db_mb >= CRIT_DB_MB )); then + echo "CRITICAL - etcd DB size ${max_db_mb}MB >= ${CRIT_DB_MB}MB" + exit 2 +fi +if (( max_db_mb >= WARN_DB_MB )); then + echo "WARNING - etcd DB size ${max_db_mb}MB >= ${WARN_DB_MB}MB" + exit 1 +fi + +# 2) Verification of recent snapshot files (optional, default 24h) +SNAP_CHECK_MSG="" +if [[ -n "$SNAPSHOT_MAX_AGE_HOURS" ]]; then + # SNAPSHOT_MAX_AGE_HOURS == 0 -> disabled + if (( SNAPSHOT_MAX_AGE_HOURS > 0 )); then + mkdir -p "$SNAPSHOT_DIR" 2>/dev/null || { + echo "CRITICAL - cannot create/access snapshot dir $SNAPSHOT_DIR" + exit 2 + } + latest_snapshot=$(ls -1t "$SNAPSHOT_DIR"/snapshot-*.db 2>/dev/null | head -n1 || true) + if [[ -z "$latest_snapshot" ]]; then + SNAP_CHECK_MSG="no snapshot files found in $SNAPSHOT_DIR" + echo "CRITICAL - $SNAP_CHECK_MSG (no snapshots)" + exit 2 + else + now_s=$(date +%s) + snap_mtime_s=$(stat -c %Y "$latest_snapshot") + age_s=$(( now_s - snap_mtime_s )) + age_h=$(( age_s / 3600 )) + if (( age_h > SNAPSHOT_MAX_AGE_HOURS )); then + SNAP_CHECK_MSG="latest snapshot $latest_snapshot is ${age_h}h old (> ${SNAPSHOT_MAX_AGE_HOURS}h)" + echo "CRITICAL - $SNAP_CHECK_MSG" + exit 2 + else + SNAP_CHECK_MSG="latest snapshot $latest_snapshot is ${age_h}h old (<= ${SNAPSHOT_MAX_AGE_HOURS}h)" + fi + fi + fi +fi + +# 3) Optional: test snapshot creation and status +SNAP_TEST_MSG="" +if (( TEST_SNAPSHOT == 1 )); then + mkdir -p "$SNAPSHOT_DIR" 2>/dev/null || { + echo "CRITICAL - cannot create/access snapshot dir $SNAPSHOT_DIR" + exit 2 + } + if [[ ! -w "$SNAPSHOT_DIR" ]]; then + echo "CRITICAL - snapshot dir not writable: $SNAPSHOT_DIR" + exit 2 + fi + + SNAPFILE=$(mktemp "${SNAPSHOT_DIR}/snapshot-XXXXXX.db") || { + echo "CRITICAL - mktemp failed in $SNAPSHOT_DIR" + exit 2 + } + + cleanup() { + rc=$? 
+ if [[ $rc -eq 0 ]]; then + rm -f "$SNAPFILE" 2>/dev/null || true + else + if [[ $KEEP_SNAPSHOT_ON_FAILURE -eq 0 ]]; then + rm -f "$SNAPFILE" 2>/dev/null || true + else + echo "NOTICE - snapshot kept at $SNAPFILE for debugging" + fi + fi + return $rc + } + trap 'cleanup' EXIT + + SAVE_OUT=$("$ETCDCTL" --command-timeout="${TIMEOUT}s" --endpoints="${ENDPOINTS}" --cacert="$CACERT" --cert="$CERT" --key="$KEY" snapshot save "$SNAPFILE" 2>&1) || { + echo "CRITICAL - snapshot save failed: $SAVE_OUT" + exit 2 + } + + STATUS_OUT=$("$ETCDCTL" snapshot status "$SNAPFILE" 2>&1) || { + echo "CRITICAL - snapshot status failed: $STATUS_OUT" + exit 2 + } + + # If we reach here, creation+status ok + SNAP_TEST_MSG="snapshot test ok: $SNAPFILE ; status: $(echo "$STATUS_OUT" | tr '\n' ' ' | sed 's/ */ /g')" + # cleanup will remove the snapshot (unless KEEP_SNAPSHOT_ON_FAILURE and rc != 0) +fi + +# Compose final message +MSG="OK - $total endpoints checked, leaders=$leaders, max_db=${max_db_mb}MB" +if [[ -n "$SNAP_CHECK_MSG" ]]; then + MSG="$MSG ; $SNAP_CHECK_MSG" +fi +if [[ -n "$SNAP_TEST_MSG" ]]; then + MSG="$MSG ; $SNAP_TEST_MSG" +fi + +echo "$MSG" +exit 0 \ No newline at end of file diff --git a/files/nrpe/check_k8s_apiserver_access b/files/nrpe/check_k8s_apiserver_access new file mode 100644 index 0000000..6693e35 --- /dev/null +++ b/files/nrpe/check_k8s_apiserver_access @@ -0,0 +1,214 @@ +#!/usr/bin/env bash +# check_k8s_apiserver_access +# Vérifie le nombre de réponses HTTP 403 dans les logs de kube-apiserver. +# Retour codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN +# +# Par défaut: utilise journalctl -u kube-apiserver --since="${WINDOW} minutes ago" +# Option --kubectl : utilise "kubectl logs" sur les pods correspondant au sélecteur. +# +# Usage examples: +# sudo /usr/lib/nagios/plugins/check_k8s_apiserver_access --window 5 --warn 10 --crit 50 +# sudo /usr/lib/nagios/plugins/check_k8s_apiserver_access --kubectl --selector 'k8s-app=kube-apiserver' --window 10 --crit 100 +# +set -euo pipefail + +PROG_NAME=$(basename "$0") + +# Defaults +WINDOW_MINUTES=5 +WARN_THRESHOLD=10 +CRIT_THRESHOLD=50 +USE_KUBECTL=0 +KUBECTL_NAMESPACE="kube-system" +KUBECTL_SELECTOR="" # if empty, we'll try -l component=kube-apiserver or label provided +JOURNAL_UNIT="kube-apiserver" # systemd unit name; adapt if different +PATTERN='' # optional custom grep regex +TOP_N=5 # number of top offenders to show + +print_help() { + cat <= N -> WARNING (default: ${WARN_THRESHOLD}) + --crit N CRIT threshold: count >= N -> CRITICAL (default: ${CRIT_THRESHOLD}) + --kubectl Use 'kubectl logs' on apiserver pods instead of journalctl + --namespace NS Namespace for kubectl logs (default: ${KUBECTL_NAMESPACE}) + --selector SEL Label selector for kubectl logs (e.g. 
"component=kube-apiserver" or "k8s-app=kube-apiserver") + --unit UNIT systemd unit for journalctl (default: ${JOURNAL_UNIT}) + --pattern REGEX custom grep regex to detect 403 entries (overrides built-in heuristics) + --top N show top N request lines causing 403 (default ${TOP_N}) + -h, --help show this help + +Examples: + # check last 5 minutes using journalctl + sudo ./check_apiserver_403.sh --window 5 --warn 20 --crit 50 + + # check last 10 minutes using kubectl logs for apiserver static-pods + sudo ./check_apiserver_403.sh --kubectl --namespace kube-system --selector 'k8s-app=kube-apiserver' --window 10 --crit 100 +EOF +} + +# Parse args +while [[ $# -gt 0 ]]; do + case "$1" in + --window) WINDOW_MINUTES="$2"; shift 2;; + --warn) WARN_THRESHOLD="$2"; shift 2;; + --crit) CRIT_THRESHOLD="$2"; shift 2;; + --kubectl) USE_KUBECTL=1; shift 1;; + --namespace) KUBECTL_NAMESPACE="$2"; shift 2;; + --selector) KUBECTL_SELECTOR="$2"; shift 2;; + --unit) JOURNAL_UNIT="$2"; shift 2;; + --pattern) PATTERN="$2"; shift 2;; + --top) TOP_N="$2"; shift 2;; + -h|--help) print_help; exit 3;; + *) echo "Unknown argument: $1"; print_help; exit 3;; + esac +done + +# Validate numeric args +if ! [[ "$WINDOW_MINUTES" =~ ^[0-9]+$ ]]; then echo "UNKNOWN - invalid --window"; exit 3; fi +if ! [[ "$WARN_THRESHOLD" =~ ^[0-9]+$ ]]; then echo "UNKNOWN - invalid --warn"; exit 3; fi +if ! [[ "$CRIT_THRESHOLD" =~ ^[0-9]+$ ]]; then echo "UNKNOWN - invalid --crit"; exit 3; fi +if ! [[ "$TOP_N" =~ ^[0-9]+$ ]]; then echo "UNKNOWN - invalid --top"; exit 3; fi + +# Build detection regex if not provided +if [[ -z "$PATTERN" ]]; then + # heuristics: try to match common apiserver log patterns that indicate a 403/Forbidden + # examples: "\" 403 ", "code=403", "403 Forbidden", "Forbidden" combined with "Denied" etc. + PATTERN='(" 403 |\" 403 |code=403|403 Forbidden|Forbidden|\"Reason=Forbidden\"|\"message=.*Forbidden)' + + # note: portable grep -E will accept that pattern +fi + +# Grab logs +get_logs_journal() { + # Use journalctl if available + if ! command -v journalctl >/dev/null 2>&1; then + echo "ERROR_NO_JOURNAL" 1>&2 + return 1 + fi + # We use --no-pager; use unit name. If unit not present, journalctl returns non-zero. + # Example: journalctl -u kube-apiserver --since "5 minutes ago" + journalctl -u "${JOURNAL_UNIT}" --since="${WINDOW_MINUTES} minutes ago" --no-pager 2>/dev/null || return 1 +} + +get_logs_kubectl() { + if ! 
command -v kubectl >/dev/null 2>&1; then + echo "ERROR_NO_KUBECTL" 1>&2 + return 1 + fi + # If no selector given try common selectors + sel="${KUBECTL_SELECTOR}" + if [[ -z "$sel" ]]; then + # try common labels + for try in 'component=kube-apiserver' 'k8s-app=kube-apiserver' 'tier=control-plane' ''; do + if [[ -z "$try" ]]; then + sel="" + break + fi + # test if any pods match + count=$(kubectl -n "${KUBECTL_NAMESPACE}" get pods -l "${try}" --no-headers 2>/dev/null | wc -l || echo 0) + if [[ "$count" -gt 0 ]]; then + sel="${try}" + break + fi + done + fi + + if [[ -z "$sel" ]]; then + # fallback: get all pods in namespace and try to find apiserver in name + pods=$(kubectl -n "${KUBECTL_NAMESPACE}" get pods --no-headers -o custom-columns=':metadata.name' 2>/dev/null || true) + if [[ -z "$pods" ]]; then + return 1 + fi + # build selector as empty and we'll filter by name + # collect logs from pods whose name contains "apiserver" + out="" + while IFS= read -r p; do + [[ -z "$p" ]] && continue + if echo "$p" | grep -qi 'apiserver'; then + out="${out}$(kubectl -n ${KUBECTL_NAMESPACE} logs --since=${WINDOW_MINUTES}m ${p} --all-containers 2>/dev/null || true)$'\n'" + fi + done <<< "$pods" + printf '%s' "$out" + return 0 + else + # gather logs from all pods matching selector + podnames=$(kubectl -n "${KUBECTL_NAMESPACE}" get pods -l "${sel}" -o custom-columns=':metadata.name' --no-headers 2>/dev/null || true) + if [[ -z "$podnames" ]]; then + return 1 + fi + out="" + while IFS= read -r p; do + [[ -z "$p" ]] && continue + out="${out}$(kubectl -n ${KUBECTL_NAMESPACE} logs --since=${WINDOW_MINUTES}m ${p} --all-containers 2>/dev/null || true)$'\n'" + done <<< "$podnames" + printf '%s' "$out" + return 0 + fi +} + +# retrieve logs into variable LOGS +LOGS="" +if (( USE_KUBECTL == 1 )); then + if ! LOGS=$(get_logs_kubectl); then + echo "CRITICAL - failed to collect logs via kubectl (check KUBECONFIG, namespace/selector, permissions)" + exit 2 + fi +else + if ! LOGS=$(get_logs_journal); then + echo "CRITICAL - failed to collect logs via journalctl for unit '${JOURNAL_UNIT}' (check unit name/permissions)" + exit 2 + fi +fi + +# If logs empty -> OK (no traffic) BUT treat with UNKNOWN if we expected logs +if [[ -z "$LOGS" ]]; then + echo "OK - no apiserver logs found in the last ${WINDOW_MINUTES}m (count=0)" + exit 0 +fi + +# Count matches of 403 using grep -E (case-insensitive) +# Use printf to pass LOGS safely to grep +count_403=$(printf '%s\n' "$LOGS" | grep -E -i -c "$PATTERN" || true) +count_403=${count_403:-0} + +# Optionally extract top request lines that caused 403 +# Try to extract HTTP method + path if present, otherwise use whole line truncated +top_requests=$(printf '%s\n' "$LOGS" | grep -E -i "$PATTERN" || true) +if [[ -n "$top_requests" ]]; then + # try to extract method+path like: "GET /api/..." or GET /api/... 
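+  # Aggregated lines come out as "COUNT METHOD /path" (uniq -c prepends the
+  # occurrence count); only the TOP_N most frequent entries are reported.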
+ top_paths=$(printf '%s\n' "$top_requests" | grep -oE '(GET|POST|PUT|DELETE|PATCH) [^" ]+' | sed 's/"$//' | sort | uniq -c | sort -rn | head -n "${TOP_N}" || true) + if [[ -z "$top_paths" ]]; then + # fallback: show most frequent truncated lines + top_paths=$(printf '%s\n' "$top_requests" | sed 's/^[[:space:]]*//; s/[[:space:]]\+/ /g' | cut -c1-200 | sort | uniq -c | sort -rn | head -n "${TOP_N}" || true) + fi +else + top_paths="" +fi + +# Decide severity +if (( count_403 >= CRIT_THRESHOLD )); then + status=2 + state="CRITICAL" +elif (( count_403 >= WARN_THRESHOLD )); then + status=1 + state="WARNING" +else + status=0 + state="OK" +fi + +# Build message +msg="${state} - ${count_403} occurrences of 403 in last ${WINDOW_MINUTES}m (warn=${WARN_THRESHOLD},crit=${CRIT_THRESHOLD})" + +# Append top paths if present +if [[ -n "$top_paths" ]]; then + msg="${msg} ; top=${TOP_N}: $(printf '%s' "$top_paths" | tr '\n' '|' | sed 's/|$//')" +fi + +# Print and exit +echo "$msg" +exit $status \ No newline at end of file diff --git a/files/nrpe/check_k8s_deployments b/files/nrpe/check_k8s_deployments new file mode 100644 index 0000000..6575e2f --- /dev/null +++ b/files/nrpe/check_k8s_deployments @@ -0,0 +1,138 @@ +#!/usr/bin/env bash +# check_k8s_deployments +# Vérifie les Deployments Kubernetes: availableReplicas < spec.replicas +# Retour: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN +# +# Usage: +# sudo /usr/lib/nagios/plugins/check_k8s_deployments [--warn N] [--crit M] [--ignore-ns ns1,ns2] [--namespaces ns1,ns2] [--age-min MINUTES] +# +# Exemples: +# sudo /usr/lib/nagios/plugins/check_k8s_deployments --crit 1 +# sudo /usr/lib/nagios/plugins/check_k8s_deployments --ignore-ns kube-system,monitoring +# +set -euo pipefail + +WARN=${WARN:-0} # nombre de deploys en erreur pour WARNING +CRIT=${CRIT:-1} # nombre de deploys en erreur pour CRITICAL par défaut (1 => tout problème -> CRITICAL) +IGNORE_NS="" +INCLUDE_NS="" +AGE_MIN=0 + +print_usage() { + cat <=N déploiements en erreur (default 0) + --crit M : seuil crit si >=M déploiements en erreur (default 1) + --ignore-ns LIST : comma separated namespaces to ignore (default none) + --namespaces LIST: comma separated namespaces to check only (default all) + --age-min N : ignore deployments created less than N minutes ago (avoid flapping during rollout) +EOF +} + +# parse args +while [[ $# -gt 0 ]]; do + case "$1" in + --warn) WARN="$2"; shift 2;; + --crit) CRIT="$2"; shift 2;; + --ignore-ns) IGNORE_NS="$2"; shift 2;; + --namespaces) INCLUDE_NS="$2"; shift 2;; + --age-min) AGE_MIN="$2"; shift 2;; + -h|--help) print_usage; exit 3;; + *) echo "Unknown arg: $1"; print_usage; exit 3;; + esac +done + +if ! 
command -v kubectl >/dev/null 2>&1; then + echo "UNKNOWN - kubectl not found" + exit 3 +fi + +# Build filter for namespace inclusion/exclusion +ignore_pattern="" +if [[ -n "$IGNORE_NS" ]]; then + IFS=',' read -ra arr <<< "$IGNORE_NS" + for ns in "${arr[@]}"; do + ignore_pattern="${ignore_pattern}|^${ns}\$" + done + # remove leading | + ignore_pattern="${ignore_pattern#|}" +fi + +include_pattern="" +if [[ -n "$INCLUDE_NS" ]]; then + IFS=',' read -ra arr2 <<< "$INCLUDE_NS" + for ns in "${arr2[@]}"; do + include_pattern="${include_pattern}|^${ns}\$" + done + include_pattern="${include_pattern#|}" +fi + +# result collection +# Initialize failures array to avoid "variable sans liaison" when running with set -u +failures=() + +# get list: namespace, name, desired, available, creationTimestamp +mapfile -t lines < <(kubectl get deploy -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.name}{"\t"}{.spec.replicas}{"\t"}{.status.availableReplicas}{"\t"}{.metadata.creationTimestamp}{"\n"}{end}' 2>/dev/null || true) + +now_s=$(date +%s) + +for line in "${lines[@]}"; do + # skip empty lines + [[ -z "${line}" ]] && continue + + ns=$(echo "$line" | awk -F'\t' '{print $1}') + name=$(echo "$line" | awk -F'\t' '{print $2}') + desired=$(echo "$line" | awk -F'\t' '{print $3}') + available=$(echo "$line" | awk -F'\t' '{print $4}') + created=$(echo "$line" | awk -F'\t' '{print $5}') + + # normalize + desired=${desired:-0} + available=${available:-0} + + # namespace filtering + if [[ -n "$include_pattern" ]]; then + if ! echo "$ns" | egrep -q "$include_pattern"; then + continue + fi + fi + if [[ -n "$ignore_pattern" ]]; then + if echo "$ns" | egrep -q "$ignore_pattern"; then + continue + fi + fi + + # age filtering + if [[ -n "$created" && "$AGE_MIN" -gt 0 ]]; then + # convert to epoch + created_s=$(date -d "$created" +%s 2>/dev/null || echo 0) + age_min=$(( (now_s - created_s) / 60 )) + if (( age_min < AGE_MIN )); then + # skip new deployments (they might be still rolling out) + continue + fi + fi + + if (( available < desired )); then + failures+=("${ns}/${name} (desired=${desired},available=${available})") + fi +done + +count=${#failures[@]} + +if (( count == 0 )); then + echo "OK - all deployments report desired==available" + exit 0 +fi + +# Decide severity +if (( count >= CRIT )); then + echo "CRITICAL - ${count} deployments not available: ${failures[*]}" + exit 2 +elif (( count >= WARN )); then + echo "WARNING - ${count} deployments not available: ${failures[*]}" + exit 1 +else + echo "OK - ${count} deployments not available but below thresholds" + exit 0 +fi \ No newline at end of file diff --git a/files/nrpe/check_k8s_jobs_cronjobs b/files/nrpe/check_k8s_jobs_cronjobs new file mode 100644 index 0000000..9b90a6f --- /dev/null +++ b/files/nrpe/check_k8s_jobs_cronjobs @@ -0,0 +1,232 @@ +#!/usr/bin/env bash +# check_k8s_jobs_cronjobs +# Vérifie l'état des Kubernetes Jobs et CronJobs. 
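+# (Checks Kubernetes Jobs and CronJobs: failed Jobs, active Jobs running too
+# long, recent Warning events on Jobs, and CronJobs whose last schedule is
+# older than the configured maximum.)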
+# Exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN +# +# Fonctions principales : +# - détecte Jobs avec des échecs (.status.failed > 0) ou des Jobs "actifs" trop vieux +# - recherche d'événements récents (type=Warning) liés aux Jobs dans les X dernières minutes +# - vérifie pour les CronJobs que lastScheduleTime n'est pas trop ancien (configurable) si non suspendu +# +# Usage (exemples) : +# sudo /usr/lib/nagios/plugins/check_k8s_jobs_cronjobs --crit 1 --recent-minutes 5 +# sudo /usr/lib/nagios/plugins/check_k8s_jobs_cronjobs --ignore-ns kube-system --cron-max-age 120 +# +set -euo pipefail + +# Defaults +WARN=${WARN:-0} +CRIT=${CRIT:-1} +IGNORE_NS="" +INCLUDE_NS="" +AGE_MIN=${AGE_MIN:-60} +RECENT_MINUTES=${RECENT_MINUTES:-5} +CHECK_CRON=1 +CRON_MAX_AGE_MIN=${CRON_MAX_AGE_MIN:-60} + +print_usage() { + cat <= N objets en erreur (default 0) + --crit M seuil CRIT si >= M objets en erreur (default 1) + --ignore-ns ns1,ns2 namespaces à ignorer + --namespaces ns1,ns2 limiter aux namespaces donnés (comma separated) + --age-min MINUTES considérer un job "actif" normal si démarré moins de MINUTES (default 60) + --recent-minutes MIN chercher événements de Job (Warning) dans les MIN dernières minutes (default 5) + --check-cron activer la vérification des CronJobs (default ON) + --cron-max-age MINUTES si lastScheduleTime > MINUTES => alerter (default 60). Mettre 0 pour désactiver. + -h, --help : affiche l'aide +EOF +} + +# Parse args +while [[ $# -gt 0 ]]; do + case "$1" in + --warn) WARN="$2"; shift 2;; + --crit) CRIT="$2"; shift 2;; + --ignore-ns) IGNORE_NS="$2"; shift 2;; + --namespaces) INCLUDE_NS="$2"; shift 2;; + --age-min) AGE_MIN="$2"; shift 2;; + --recent-minutes) RECENT_MINUTES="$2"; shift 2;; + --no-cron) CHECK_CRON=0; shift 1;; + --cron-max-age) CRON_MAX_AGE_MIN="$2"; shift 2;; + -h|--help) print_usage; exit 3;; + *) echo "Unknown arg: $1"; print_usage; exit 3;; + esac +done + +if ! command -v kubectl >/dev/null 2>&1; then + echo "UNKNOWN - kubectl not found" + exit 3 +fi + +# Build namespace filters (regex) +ignore_pattern="" +if [[ -n "$IGNORE_NS" ]]; then + IFS=',' read -ra arr <<< "$IGNORE_NS" + for ns in "${arr[@]}"; do + ignore_pattern="${ignore_pattern}|^${ns}\$" + done + ignore_pattern="${ignore_pattern#|}" +fi + +include_pattern="" +if [[ -n "$INCLUDE_NS" ]]; then + IFS=',' read -ra arr2 <<< "$INCLUDE_NS" + for ns in "${arr2[@]}"; do + include_pattern="${include_pattern}|^${ns}\$" + done + include_pattern="${include_pattern#|}" +fi + +ns_allowed() { + local ns="$1" + if [[ -n "$include_pattern" ]]; then + if ! 
echo "$ns" | egrep -q "$include_pattern"; then + return 1 + fi + fi + if [[ -n "$ignore_pattern" ]]; then + if echo "$ns" | egrep -q "$ignore_pattern"; then + return 1 + fi + fi + return 0 +} + +now_s=$(date +%s) + +# Initialize problems array safely +problems=() + +# --------------------------- +# 1) Inspect Jobs +# --------------------------- +# Fields: namespace, name, active, succeeded, failed, startTime, completionTime +mapfile -t job_lines < <(kubectl get jobs -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.name}{"\t"}{.status.active}{"\t"}{.status.succeeded}{"\t"}{.status.failed}{"\t"}{.status.startTime}{"\t"}{.status.completionTime}{"\n"}{end}' 2>/dev/null || true) + +for line in "${job_lines[@]}"; do + ns=$(echo "$line" | awk -F'\t' '{print $1}') + name=$(echo "$line" | awk -F'\t' '{print $2}') + active=$(echo "$line" | awk -F'\t' '{print $3}') + succeeded=$(echo "$line" | awk -F'\t' '{print $4}') + failed=$(echo "$line" | awk -F'\t' '{print $5}') + start=$(echo "$line" | awk -F'\t' '{print $6}') + completion=$(echo "$line" | awk -F'\t' '{print $7}') + + # defaults + active=${active:-0} + succeeded=${succeeded:-0} + failed=${failed:-0} + + if ! ns_allowed "$ns"; then + continue + fi + + # 1.a) Jobs with failures + if (( failed > 0 )); then + problems+=("Job ${ns}/${name} failedCount=${failed}") + continue + fi + + # 1.b) Active jobs running too long + if (( active > 0 )); then + if [[ -n "$start" && "$start" != "null" ]]; then + # convert start timestamp to epoch (GNU date) + start_s=$(date -d "$start" +%s 2>/dev/null || echo 0) + if (( start_s > 0 )); then + age_min=$(( (now_s - start_s) / 60 )) + if (( age_min >= AGE_MIN )); then + problems+=("Job ${ns}/${name} active for ${age_min}min >= ${AGE_MIN}min") + fi + fi + else + # no start time but active >0 -> flag + problems+=("Job ${ns}/${name} active but no startTime recorded") + fi + fi +done + +# 1.c) Recent Job warning events (type=Warning) in last RECENT_MINUTES +if (( RECENT_MINUTES > 0 )); then + # get events for Jobs (type Warning) with fields: namespace, involvedObject.name, lastTimestamp, reason, message + mapfile -t event_lines < <(kubectl get events --all-namespaces --field-selector involvedObject.kind=Job,type=Warning -o custom-columns='NAMESPACE:.metadata.namespace,NAME:.involvedObject.name,LAST:.lastTimestamp,REASON:.reason,MESSAGE:.message' --no-headers 2>/dev/null || true) + cutoff_s=$(( now_s - RECENT_MINUTES * 60 )) + for ev in "${event_lines[@]}"; do + ns=$(echo "$ev" | awk '{print $1}') + name=$(echo "$ev" | awk '{print $2}') + last=$(echo "$ev" | awk '{print $3}') + if ! ns_allowed "$ns"; then + continue + fi + if [[ -n "$last" && "$last" != "" ]]; then + ts=$(date -d "$last" +%s 2>/dev/null || echo 0) + if (( ts >= cutoff_s )); then + problems+=("Job event Warning ${ns}/${name} at $last") + fi + fi + done +fi + +# --------------------------- +# 2) Inspect CronJobs (optionnel) +# --------------------------- +if (( CHECK_CRON == 1 )) && (( CRON_MAX_AGE_MIN > 0 )); then + # Fields: namespace, name, suspend (true/false/null), lastScheduleTime + mapfile -t cron_lines < <(kubectl get cronjob -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.name}{"\t"}{.spec.suspend}{"\t"}{.status.lastScheduleTime}{"\n"}{end}' 2>/dev/null || true) + + for line in "${cron_lines[@]}"; do + ns=$(echo "$line" | awk -F'\t' '{print $1}') + name=$(echo "$line" | awk -F'\t' '{print $2}') + suspend=$(echo "$line" | awk -F'\t' '{print $3}') + last=$(echo "$line" | awk -F'\t' '{print $4}') + + if ! 
ns_allowed "$ns"; then + continue + fi + + # If suspended, do not consider as problem + if [[ "$suspend" == "true" ]]; then + continue + fi + + if [[ -z "$last" || "$last" == "null" ]]; then + # Never scheduled yet: warn (useful to detect misconfigured cronjobs) + problems+=("CronJob ${ns}/${name} has no lastScheduleTime (never scheduled?)") + continue + fi + + last_s=$(date -d "$last" +%s 2>/dev/null || echo 0) + if (( last_s > 0 )); then + age_min=$(( (now_s - last_s) / 60 )) + if (( age_min > CRON_MAX_AGE_MIN )); then + problems+=("CronJob ${ns}/${name} lastSchedule ${age_min}min ago > ${CRON_MAX_AGE_MIN}min") + fi + else + problems+=("CronJob ${ns}/${name} lastScheduleTime unparsable: ${last}") + fi + done +fi + +# --------------------------- +# Final decision & output +# --------------------------- +count=${#problems[@]} + +if (( count == 0 )); then + echo "OK - Jobs/CronJobs checks passed" + exit 0 +fi + +# Severity decision +if (( count >= CRIT )); then + echo "CRITICAL - ${count} problems found: ${problems[*]}" + exit 2 +elif (( count >= WARN )); then + echo "WARNING - ${count} problems found: ${problems[*]}" + exit 1 +else + echo "OK - ${count} problems found but below thresholds" + exit 0 +fi \ No newline at end of file diff --git a/files/nrpe/check_k8s_pki_certs b/files/nrpe/check_k8s_pki_certs new file mode 100644 index 0000000..d62da68 --- /dev/null +++ b/files/nrpe/check_k8s_pki_certs @@ -0,0 +1,194 @@ +#!/usr/bin/env bash +# check_k8s_pki_certs +# Vérifie les certificats PEM sous /etc/kubernetes/pki (par défaut) et alerte si expiration <= warn_days (30j par défaut). +# Exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN +# +# Usage: +# sudo /usr/lib/nagios/plugins/check_k8s_pki_certs +# sudo /usr/lib/nagios/plugins/check_k8s_pki_certs --path /etc/kubernetes/ssl --warn-days 30 --crit-days 7 --recursive +# +set -euo pipefail + +PKI_PATH=${PKI_PATH:-/etc/kubernetes/pki} +WARN_DAYS=${WARN_DAYS:-30} +CRIT_DAYS=${CRIT_DAYS:-7} +RECURSIVE=0 + +print_usage() { + cat </dev/null 2>&1; then + echo "UNKNOWN - openssl not found" + exit 3 +fi +if ! command -v date >/dev/null 2>&1; then + echo "UNKNOWN - date not found" + exit 3 +fi +if ! command -v sed >/dev/null 2>&1; then + echo "UNKNOWN - sed not found" + exit 3 +fi +if ! command -v awk >/dev/null 2>&1; then + echo "UNKNOWN - awk not found" + exit 3 +fi +if ! command -v find >/dev/null 2>&1; then + echo "UNKNOWN - find not found" + exit 3 +fi + +# resolve symlink target (realpath or readlink -f) +if command -v realpath >/dev/null 2>&1; then + PKI_PATH_RESOLVED=$(realpath -e "$PKI_PATH" 2>/dev/null || true) +else + PKI_PATH_RESOLVED=$(readlink -f "$PKI_PATH" 2>/dev/null || true) +fi +if [[ -n "$PKI_PATH_RESOLVED" && -d "$PKI_PATH_RESOLVED" ]]; then + PKI_PATH="$PKI_PATH_RESOLVED" +fi + +if [[ ! -d "$PKI_PATH" ]]; then + echo "UNKNOWN - path $PKI_PATH not found or not a directory" + exit 3 +fi + +now_s=$(date +%s) + +# Initialize arrays explicitly to avoid "variable sans liaison" with set -u +critical=() +warning=() +ok=() +errors=() + +file_count=0 +cert_count=0 + +# build find command: follow symlinks (-L) so that symlinked directories/files are handled +if [[ $RECURSIVE -eq 1 ]]; then + FIND_CMD=(find -L "$PKI_PATH" -type f -print0) +else + FIND_CMD=(find -L "$PKI_PATH" -maxdepth 1 -type f -print0) +fi + +# iterate files found +while IFS= read -r -d '' file; do + file_count=$((file_count+1)) + + # skip unreadable files + if [[ ! 
-r "$file" ]]; then + errors+=("Unreadable file: $file") + continue + fi + + # skip files without PEM marker + if ! grep -q "BEGIN CERTIFICATE" "$file" 2>/dev/null; then + continue + fi + + # find pairs of BEGIN/END certificate line numbers robustly using awk + # prints "start:end" for each certificate block + mapfile -t pairs < <(awk ' + /BEGIN CERTIFICATE/ {start=NR} + /END CERTIFICATE/ && start { print start ":" NR; start=0 } + ' "$file" 2>/dev/null || true) + + if [[ ${#pairs[@]} -eq 0 ]]; then + errors+=("No certificate block pairs found in $file") + continue + fi + + for p in "${pairs[@]}"; do + start=${p%%:*} + end=${p##*:} + # extract block via sed (line range), send to openssl via stdin + cert_block=$(sed -n "${start},${end}p" "$file" 2>/dev/null || true) + if [[ -z "$cert_block" ]]; then + errors+=("Failed to extract certificate block ${start}-${end} from $file") + continue + fi + + # openssl expects a file or stdin; use stdin + endline=$(printf '%s\n' "$cert_block" | openssl x509 -noout -enddate -in /dev/stdin 2>/dev/null) || { + errors+=("Failed to parse certificate block ${start}-${end} in $file with openssl") + continue + } + # sample endline: notAfter=Oct 27 16:15:30 2125 GMT + notAfter=${endline#notAfter=} + expiry_s=$(date -d "$notAfter" +%s 2>/dev/null) || { + errors+=("Cannot parse date '$notAfter' for cert in $file") + continue + } + days_left=$(( (expiry_s - now_s) / 86400 )) + subj=$(printf '%s\n' "$cert_block" | openssl x509 -noout -subject -in /dev/stdin 2>/dev/null || true) + subj=${subj#subject= } + info="${file} :: ${subj} :: expires in ${days_left}d on ${notAfter}" + cert_count=$((cert_count+1)) + if (( days_left <= CRIT_DAYS )); then + critical+=("$info") + elif (( days_left <= WARN_DAYS )); then + warning+=("$info") + else + ok+=("$info") + fi + done + +done < <("${FIND_CMD[@]}") + +# results and exit codes +if [[ ${#errors[@]} -gt 0 ]]; then + echo "UNKNOWN - parsing errors: ${errors[*]}" + exit 3 +fi + +if (( cert_count == 0 )); then + echo "UNKNOWN - no certificates found under $PKI_PATH" + exit 3 +fi + +if (( ${#critical[@]} > 0 )); then + echo "CRITICAL - ${#critical[@]} certificate(s) expiring soon (<= ${CRIT_DAYS} days):" + for c in "${critical[@]}"; do + echo " - $c" + done + if (( ${#warning[@]} > 0 )); then + echo "WARN (additional ${#warning[@]} cert(s) <= ${WARN_DAYS} days):" + for w in "${warning[@]}"; do + echo " - $w" + done + fi + exit 2 +fi + +if (( ${#warning[@]} > 0 )); then + echo "WARNING - ${#warning[@]} certificate(s) expiring within ${WARN_DAYS} days:" + for w in "${warning[@]}"; do + echo " - $w" + done + exit 1 +fi + +echo "OK - ${cert_count} cert(s) checked in ${file_count} file(s), no expiry within ${WARN_DAYS} days" +exit 0 \ No newline at end of file diff --git a/files/nrpe/check_k8s_pod_restarts b/files/nrpe/check_k8s_pod_restarts new file mode 100644 index 0000000..73aedca --- /dev/null +++ b/files/nrpe/check_k8s_pod_restarts @@ -0,0 +1,49 @@ +#!/usr/bin/env bash +# check_k8s_pod_restarts +# Vérifie s'il y a eu des redémarrages de pods (événements "Killing") dans les X dernières minutes. +# Retour: 0=OK, 2=CRITICAL, 3=UNKNOWN +# +# Usage: +# sudo /usr/lib/nagios/plugins/check_k8s_pod_restarts [minutes] +# +MINUTES=${1:-5} + +# Require kubectl +if ! command -v kubectl >/dev/null 2>&1; then + echo "UNKNOWN - kubectl not found" + exit 3 +fi + +# cutoff en epoch (GNU date) +if ! 
cutoff=$(date -d "$MINUTES minutes ago" +%s 2>/dev/null); then + echo "UNKNOWN - date parsing failed (on macOS use gdate from coreutils)" + exit 3 +fi + +matches=() +while IFS=$'\t' read -r ns pod last msg; do + # skip empty lines + [[ -z "$last" ]] && continue + # convert last timestamp to epoch (works with GNU date; handles timezone/fractions) + if ! ts=$(date -d "$last" +%s 2>/dev/null); then + # if parsing fails, skip the event + continue + fi + if (( ts >= cutoff )); then + # safe message truncation + shortmsg=$(echo "$msg" | tr '\n' ' ' | cut -c1-300) + matches+=("$ns\t$pod\t$last\t$shortmsg") + fi +done < <(kubectl get events --all-namespaces --field-selector reason=Killing -o custom-columns='NAMESPACE:.metadata.namespace,NAME:.involvedObject.name,LAST:.lastTimestamp,MESSAGE:.message' --no-headers 2>/dev/null || true) + +if [[ ${#matches[@]} -eq 0 ]]; then + echo "OK - no pod restarts in the last ${MINUTES} minutes" + exit 0 +else + echo "CRITICAL - ${#matches[@]} pod restarts in the last ${MINUTES} minutes:" + for m in "${matches[@]}"; do + IFS=$'\t' read -r ns pod last shortmsg <<< "$m" + echo " - ${ns}/${pod} at ${last} : ${shortmsg}" + done + exit 2 +fi \ No newline at end of file diff --git a/files/nrpe/check_k8s_pv_pvc b/files/nrpe/check_k8s_pv_pvc new file mode 100644 index 0000000..5d27dc3 --- /dev/null +++ b/files/nrpe/check_k8s_pv_pvc @@ -0,0 +1,202 @@ +#!/usr/bin/env bash +# check_k8s_pv_pvc +# Vérifie l'état des PersistentVolumes (PV) et PersistentVolumeClaims (PVC) Kubernetes. +# Exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN +# +# Usage examples: +# sudo /usr/lib/nagios/plugins/check_k8s_pv_pvc --crit 1 # CRITICAL si >=1 problème +# sudo /usr/lib/nagios/plugins/check_k8s_pv_pvc --ignore-ns kube-system # ignorer kube-system +# sudo /usr/lib/nagios/plugins/check_k8s_pv_pvc --pvc-age-min 10 --crit 2 # ignorer PVC récents <10min, CRIT si >=2 +# sudo /usr/lib/nagios/plugins/check_k8s_pv_pvc --check-pv --check-pvc # (par défaut les 2 sont vérifiés) +# +set -euo pipefail + +# Defaults +WARN=${WARN:-0} +CRIT=${CRIT:-1} +IGNORE_NS="" +INCLUDE_NS="" +PVC_AGE_MIN=${PVC_AGE_MIN:-5} # en minutes : ignore PVC créés il y a moins de X minutes (défaut 5) +CHECK_PV=1 +CHECK_PVC=1 + +print_usage() { + cat <= N objets en erreur (default 0) + --crit M seuil CRIT si >= M objets en erreur (default 1) + --ignore-ns a,b,c namespaces à ignorer (comma separated) + --namespaces a,b limiter aux namespaces donnés (comma separated) + --pvc-age-min N ignore PVC créés il y a moins de N minutes (default 5) + --no-pv disable PV checks + --no-pvc disable PVC checks + -h, --help affiche cette aide +EOF +} + +# Parse args +while [[ $# -gt 0 ]]; do + case "$1" in + --warn) WARN="$2"; shift 2;; + --crit) CRIT="$2"; shift 2;; + --ignore-ns) IGNORE_NS="$2"; shift 2;; + --namespaces) INCLUDE_NS="$2"; shift 2;; + --pvc-age-min) PVC_AGE_MIN="$2"; shift 2;; + --no-pv) CHECK_PV=0; shift 1;; + --no-pvc) CHECK_PVC=0; shift 1;; + -h|--help) print_usage; exit 3;; + *) echo "Unknown arg: $1"; print_usage; exit 3;; + esac +done + +if ! 
command -v kubectl >/dev/null 2>&1; then
+  echo "UNKNOWN - kubectl not found"
+  exit 3
+fi
+
+# Build namespace filters
+ignore_pattern=""
+if [[ -n "$IGNORE_NS" ]]; then
+  IFS=',' read -ra arr <<< "$IGNORE_NS"
+  for ns in "${arr[@]}"; do
+    ignore_pattern="${ignore_pattern}|^${ns}\$"
+  done
+  ignore_pattern="${ignore_pattern#|}"
+fi
+
+include_pattern=""
+if [[ -n "$INCLUDE_NS" ]]; then
+  IFS=',' read -ra arr2 <<< "$INCLUDE_NS"
+  for ns in "${arr2[@]}"; do
+    include_pattern="${include_pattern}|^${ns}\$"
+  done
+  include_pattern="${include_pattern#|}"
+fi
+
+now_s=$(date +%s)
+
+# Initialize problems array explicitly (avoids an unbound-variable error under set -u)
+problems=()
+
+# Helper: namespace filter
+ns_allowed() {
+  local ns="$1"
+  if [[ -n "$include_pattern" ]]; then
+    if ! echo "$ns" | grep -Eq "$include_pattern"; then
+      return 1
+    fi
+  fi
+  if [[ -n "$ignore_pattern" ]]; then
+    if echo "$ns" | grep -Eq "$ignore_pattern"; then
+      return 1
+    fi
+  fi
+  return 0
+}
+
+# 1) Check PVCs
+if (( CHECK_PVC == 1 )); then
+  # gather: namespace, name, phase, volumeName, creationTimestamp
+  mapfile -t pvc_lines < <(kubectl get pvc -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.name}{"\t"}{.status.phase}{"\t"}{.spec.volumeName}{"\t"}{.metadata.creationTimestamp}{"\n"}{end}' 2>/dev/null || true)
+
+  for line in "${pvc_lines[@]}"; do
+    ns=$(echo "$line" | awk -F'\t' '{print $1}')
+    name=$(echo "$line" | awk -F'\t' '{print $2}')
+    phase=$(echo "$line" | awk -F'\t' '{print $3}')
+    vol=$(echo "$line" | awk -F'\t' '{print $4}')
+    created=$(echo "$line" | awk -F'\t' '{print $5}')
+
+    # filter namespaces
+    if ! ns_allowed "$ns"; then
+      continue
+    fi
+
+    # Ignore recently created PVCs (to avoid noise during normal provisioning)
+    if [[ -n "$created" && "$PVC_AGE_MIN" -gt 0 ]]; then
+      created_s=$(date -d "$created" +%s 2>/dev/null || echo 0)
+      age_min=$(( (now_s - created_s) / 60 ))
+      if (( age_min < PVC_AGE_MIN )); then
+        continue
+      fi
+    fi
+
+    # Consider non-Bound phases as problematic (Pending, Lost, Failed)
+    # Bound is OK; a Bound PVC without a volumeName is still a problem
+    if [[ "$phase" != "Bound" ]]; then
+      problems+=("PVC ${ns}/${name} phase=${phase} created=${created}")
+      continue
+    fi
+
+    if [[ -z "$vol" || "$vol" == "null" ]]; then
+      problems+=("PVC ${ns}/${name} Bound but no volumeName assigned")
+      continue
+    fi
+  done
+fi
+
+# 2) Check PVs
+if (( CHECK_PV == 1 )); then
+  # gather: name, phase, capacity.storage, claimRef.namespace, claimRef.name, reclaimPolicy
+  mapfile -t pv_lines < <(kubectl get pv -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.phase}{"\t"}{.spec.capacity.storage}{"\t"}{.spec.claimRef.namespace}{"\t"}{.spec.claimRef.name}{"\t"}{.spec.persistentVolumeReclaimPolicy}{"\n"}{end}' 2>/dev/null || true)
+
+  for line in "${pv_lines[@]}"; do
+    name=$(echo "$line" | awk -F'\t' '{print $1}')
+    phase=$(echo "$line" | awk -F'\t' '{print $2}')
+    cap=$(echo "$line" | awk -F'\t' '{print $3}')
+    claim_ns=$(echo "$line" | awk -F'\t' '{print $4}')
+    claim_name=$(echo "$line" | awk -F'\t' '{print $5}')
+    reclaim=$(echo "$line" | awk -F'\t' '{print $6}')
+
+    # If the PV is bound, apply the namespace filter to its claim (only report if the claim namespace is allowed)
+    if [[ -n "$claim_ns" && "$claim_ns" != "null" ]]; then
+      if ! ns_allowed "$claim_ns"; then
+        continue
+      fi
+    else
+      # claim_ns empty => PV not bound to a claim
+      # Consider phases indicating issues: Released, Failed
+      if [[ "$phase" == "Released" || "$phase" == "Failed" ]]; then
+        problems+=("PV ${name} phase=${phase} reclaim=${reclaim} (no claim)")
+        continue
+      fi
+      # Optionally, consider an Available PV without a claim as possibly orphaned:
+      # uncomment the next lines to treat Available PVs as a warning/problem
+      # if [[ "$phase" == "Available" ]]; then
+      #   problems+=("PV ${name} is Available (unbound) capacity=${cap} reclaim=${reclaim}")
+      # fi
+    fi
+
+    # If Bound, make sure the claim can actually be found (partial sanity check)
+    if [[ "$phase" == "Bound" ]]; then
+      if [[ -z "$claim_ns" || -z "$claim_name" || "$claim_ns" == "null" || "$claim_name" == "null" ]]; then
+        problems+=("PV ${name} Bound but missing claimRef (phase=${phase})")
+        continue
+      fi
+      # verify the claim exists (PVs whose claim namespace was filtered out were skipped earlier)
+      if ! kubectl get pvc -n "${claim_ns}" "${claim_name}" >/dev/null 2>&1; then
+        problems+=("PV ${name} Bound to ${claim_ns}/${claim_name} but PVC resource not found")
+      fi
+    fi
+  done
+fi
+
+count=${#problems[@]}
+
+if (( count == 0 )); then
+  echo "OK - PV/PVC checks passed"
+  exit 0
+fi
+
+# Severity decision
+if (( count >= CRIT )); then
+  echo "CRITICAL - ${count} PV/PVC problems: ${problems[*]}"
+  exit 2
+elif (( count >= WARN )); then
+  echo "WARNING - ${count} PV/PVC problems: ${problems[*]}"
+  exit 1
+else
+  echo "OK - ${count} PV/PVC problems but below thresholds"
+  exit 0
+fi
\ No newline at end of file
diff --git a/files/nrpe/check_k8s_replicasets b/files/nrpe/check_k8s_replicasets
new file mode 100644
index 0000000..4b075ae
--- /dev/null
+++ b/files/nrpe/check_k8s_replicasets
@@ -0,0 +1,135 @@
+#!/usr/bin/env bash
+# check_k8s_replicasets
+# Checks Kubernetes ReplicaSets: readyReplicas < spec.replicas
+# Return: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
+#
+# Usage:
+# sudo /usr/lib/nagios/plugins/check_k8s_replicasets [--warn N] [--crit M] [--ignore-ns ns1,ns2] [--namespaces ns1,ns2] [--age-min MINUTES]
+#
+set -euo pipefail
+
+WARN=${WARN:-0}   # number of failing ReplicaSets that triggers WARNING
+CRIT=${CRIT:-1}   # number of failing ReplicaSets that triggers CRITICAL (default 1 => a single failing RS is CRITICAL)
+IGNORE_NS=""
+INCLUDE_NS=""
+AGE_MIN=0
+
+print_usage() {
+  cat <<EOF
+Usage: $0 [--warn N] [--crit M] [--ignore-ns ns1,ns2] [--namespaces ns1,ns2] [--age-min MINUTES]
+  --warn N         : warning threshold if >= N ReplicaSets are not fully ready (default 0)
+  --crit M         : critical threshold if >= M ReplicaSets are not fully ready (default 1)
+  --ignore-ns LIST : comma-separated namespaces to ignore (default none)
+  --namespaces LIST: comma-separated namespaces to check only (default all)
+  --age-min N      : ignore ReplicaSets created less than N minutes ago (avoid flapping during rollout)
+EOF
+}
+
+# parse args
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --warn) WARN="$2"; shift 2;;
+    --crit) CRIT="$2"; shift 2;;
+    --ignore-ns) IGNORE_NS="$2"; shift 2;;
+    --namespaces) INCLUDE_NS="$2"; shift 2;;
+    --age-min) AGE_MIN="$2"; shift 2;;
+    -h|--help) print_usage; exit 3;;
+    *) echo "Unknown arg: $1"; print_usage; exit 3;;
+  esac
+done
+
+if ! command -v kubectl >/dev/null 2>&1; then
+  echo "UNKNOWN - kubectl not found"
+  exit 3
+fi
+
+# Build filter for namespace inclusion/exclusion (regex)
+ignore_pattern=""
+if [[ -n "$IGNORE_NS" ]]; then
+  IFS=',' read -ra arr <<< "$IGNORE_NS"
+  for ns in "${arr[@]}"; do
+    ignore_pattern="${ignore_pattern}|^${ns}\$"
+  done
+  ignore_pattern="${ignore_pattern#|}"
+fi
+
+include_pattern=""
+if [[ -n "$INCLUDE_NS" ]]; then
+  IFS=',' read -ra arr2 <<< "$INCLUDE_NS"
+  for ns in "${arr2[@]}"; do
+    include_pattern="${include_pattern}|^${ns}\$"
+  done
+  include_pattern="${include_pattern#|}"
+fi
+
+# Initialize failures array to avoid an unbound-variable error when set -u is active
+failures=()
+
+# Collect ReplicaSets: namespace, name, desired(spec.replicas), ready(status.readyReplicas), creationTimestamp
+# If fields are missing, jsonpath returns nothing -> we normalize later
+mapfile -t lines < <(kubectl get rs -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.name}{"\t"}{.spec.replicas}{"\t"}{.status.readyReplicas}{"\t"}{.metadata.creationTimestamp}{"\n"}{end}' 2>/dev/null || true)
+
+now_s=$(date +%s)
+
+for line in "${lines[@]}"; do
+  # Skip empty lines if any
+  [[ -z "$line" ]] && continue
+
+  ns=$(echo "$line" | awk -F'\t' '{print $1}')
+  name=$(echo "$line" | awk -F'\t' '{print $2}')
+  desired=$(echo "$line" | awk -F'\t' '{print $3}')
+  ready=$(echo "$line" | awk -F'\t' '{print $4}')
+  created=$(echo "$line" | awk -F'\t' '{print $5}')
+
+  # normalize numeric values
+  desired=${desired:-0}
+  ready=${ready:-0}
+
+  # namespace filtering
+  if [[ -n "$include_pattern" ]]; then
+    if ! echo "$ns" | grep -Eq "$include_pattern"; then
+      continue
+    fi
+  fi
+  if [[ -n "$ignore_pattern" ]]; then
+    if echo "$ns" | grep -Eq "$ignore_pattern"; then
+      continue
+    fi
+  fi
+
+  # age filtering (skip very recent RS)
+  if [[ -n "$created" && "$AGE_MIN" -gt 0 ]]; then
+    # convert to epoch; if conversion fails, fall back to 0 so the RS is not skipped
+    created_s=$(date -d "$created" +%s 2>/dev/null || echo 0)
+    age_min=$(( (now_s - created_s) / 60 ))
+    if (( age_min < AGE_MIN )); then
+      continue
+    fi
+  fi
+
+  # Only consider RS where desired > 0 (skip zero-scale RS)
+  if (( desired > 0 )) && (( ready < desired )); then
+    failures+=("${ns}/${name} (desired=${desired},ready=${ready})")
+  fi
+done
+
+count=${#failures[@]}
+
+# No failures: everything is ready
+if (( count == 0 )); then
+  echo "OK - all ReplicaSets report ready==desired"
+  exit 0
+fi
+
+# Determine severity based on thresholds
+if (( count >= CRIT )); then
+  echo "CRITICAL - ${count} ReplicaSets not fully ready: ${failures[*]}"
+  exit 2
+elif (( count >= WARN )); then
+  echo "WARNING - ${count} ReplicaSets not fully ready: ${failures[*]}"
+  exit 1
+else
+  echo "OK - ${count} ReplicaSets not fully ready but below thresholds"
+  exit 0
+fi
\ No newline at end of file
diff --git a/templates/nrpe.j2 b/templates/nrpe.j2
index d761e33..3b55e91 100644
--- a/templates/nrpe.j2
+++ b/templates/nrpe.j2
@@ -72,13 +72,32 @@ command[check_docker_{{ container }}]=/usr/lib/nagios/plugins/check_docker --con
 {% endif %}
 
 {% if nrpe_process is defined %}
+# process
 {% for process in nrpe_process %}
 command[check_proc_{{ process }}]=/usr/lib/nagios/plugins/check_systemd_service {{ process }}
 {% endfor %}
 {% endif %}
 
+{% if nrpe_kubernetes is defined or nrpe_kubernetes_manager is defined %}
+# kubernetes
 {% if nrpe_kubernetes is defined %}
+## nodes
 command[check_proc_kubelet]=/usr/lib/nagios/plugins/check_systemd_service kubelet
 command[check_proc_etcd]=/usr/lib/nagios/plugins/check_systemd_service etcd
 command[check_proc_containerd]=/usr/lib/nagios/plugins/check_systemd_service containerd
 {% endif %}
+{% if nrpe_kubernetes_manager is defined %}
+## manager / control plane
+command[check_k8s_health]=/usr/lib/nagios/plugins/check_http -I {{ ansible_default_ipv4.address }} -p 6443 -S -u /healthz --continue-after-certificate -r ok -w 1 -c 2
+command[check_cilium_health]=/usr/bin/sudo /usr/lib/nagios/plugins/check_cilium_health
+command[check_coredns_health]=/usr/bin/sudo /usr/lib/nagios/plugins/check_coredns_health
+command[check_etcd_health]=/usr/bin/sudo /usr/lib/nagios/plugins/check_etcd_health --endpoints "https://{{ ansible_default_ipv4.address }}:2379" --cacert /etc/ssl/etcd/ssl/ca.pem --cert /etc/ssl/etcd/ssl/node-{{ nrpe_kubernetes_manager_nodename }}.pem --key /etc/ssl/etcd/ssl/node-{{ nrpe_kubernetes_manager_nodename }}-key.pem
+command[check_k8s_apiserver_access]=/usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_apiserver_access
+command[check_k8s_deployments]=/usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_deployments
+command[check_k8s_jobs_cronjobs]=/usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_jobs_cronjobs
+command[check_k8s_pki_certs]=/usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_pki_certs
+command[check_k8s_pv_pvc]=/usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_pv_pvc
+command[check_k8s_replicasets]=/usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_replicasets
+command[check_k8s_pod_restarts]=/usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_pod_restarts
+{% endif %}
+{% endif %}
\ No newline at end of file
diff --git a/templates/nrpe.sudoers.j2 b/templates/nrpe.sudoers.j2
index d10cdd3..86e4cf0 100644
--- a/templates/nrpe.sudoers.j2
+++ b/templates/nrpe.sudoers.j2
@@ -2,3 +2,13 @@ nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_postfix_mailqueue -w {{
 nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_exim_mailqueue -w {{ nrpe_mailq_warning }} -c {{ nrpe_mailq_critical }}
 nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_raid
 nagios ALL=(ALL) NOPASSWD: /usr/sbin/needrestart -b -l
+nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_cilium_health
+nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_coredns_health
+nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_etcd_health
+nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_apiserver_access
+nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_deployments
+nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_jobs_cronjobs
+nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_pki_certs
+nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_pv_pvc
+nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_replicasets
+nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_pod_restarts
\ No newline at end of file
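
A quick way to sanity-check the plumbing introduced by this patch (sudoers entry, NRPE command definition and plugin exit codes) is a manual smoke test on a control-plane node, sketched below; the plugin paths and option names come from the files added above, while the monitoring-server address (192.0.2.10) and the check_nrpe location are only illustrative assumptions.

# On the monitored host: run one plugin as the nagios user (this exercises the NOPASSWD sudoers entry)
sudo -u nagios /usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_replicasets --warn 1 --crit 3 --ignore-ns kube-system --age-min 10
echo "exit code: $?"   # 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN

# From the Nagios server: query the same command through NRPE
# (host address and check_nrpe path are assumptions for illustration)
/usr/lib/nagios/plugins/check_nrpe -H 192.0.2.10 -c check_k8s_replicasets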