add k8s check & config

This commit is contained in:
Ludovic Cartier
2025-11-24 08:38:24 +01:00
parent 0045a21479
commit 1730b93c3f
12 changed files with 1888 additions and 0 deletions

View File

@@ -0,0 +1,307 @@
#!/usr/bin/env bash
# check_cilium_health
# Checks Cilium health (pods, daemonsets, operator) and optionally runs the
# `cilium status -o json` binary as an additional probe.
# Exit codes (Nagios convention): 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
#
# Usage:
# sudo /usr/lib/nagios/plugins/check_cilium_health [--namespace N] [--label LABEL] [--warn-not-ready N] [--crit-not-ready M] [--use-cilium-cli] [--timeout SECS]
#
set -euo pipefail
# Defaults -- each may be preseeded via the environment; CLI flags parsed
# below take precedence.
NAMESPACE=${NAMESPACE:-kube-system}
LABEL=${LABEL:-k8s-app=cilium}
WARN_NOT_READY=${WARN_NOT_READY:-1}
CRIT_NOT_READY=${CRIT_NOT_READY:-2}
WARN_RESTARTS=${WARN_RESTARTS:-3}
CRIT_RESTARTS=${CRIT_RESTARTS:-10}
USE_CILIUM_CLI=0
TIMEOUT=${TIMEOUT:-10}
# Print CLI usage to stdout; the heredoc expands the current default values.
print_usage() {
cat <<EOF
Usage: $0 [options]
Options:
--namespace N namespace (default: kube-system)
--label LABEL pod label selector (default: "k8s-app=cilium")
--warn-not-ready N warn if >= N pods not ready (default ${WARN_NOT_READY})
--crit-not-ready M critical if >= M pods not ready (default ${CRIT_NOT_READY})
--warn-restarts R warn if restartCount >= R per pod (default ${WARN_RESTARTS})
--crit-restarts S critical if restartCount >= S per pod (default ${CRIT_RESTARTS})
--use-cilium-cli run 'cilium status -o json' as additional check (requires cilium binary)
--timeout SECS kubectl timeout in seconds (default ${TIMEOUT})
-h, --help show this help
EOF
}
# Parse command-line arguments; --help and unknown flags exit 3 (UNKNOWN).
while [[ $# -gt 0 ]]; do
case "$1" in
--namespace) NAMESPACE="$2"; shift 2;;
--label) LABEL="$2"; shift 2;;
--warn-not-ready) WARN_NOT_READY="$2"; shift 2;;
--crit-not-ready) CRIT_NOT_READY="$2"; shift 2;;
--warn-restarts) WARN_RESTARTS="$2"; shift 2;;
--crit-restarts) CRIT_RESTARTS="$2"; shift 2;;
--use-cilium-cli) USE_CILIUM_CLI=1; shift 1;;
--timeout) TIMEOUT="$2"; shift 2;;
-h|--help) print_usage; exit 3;;
*) echo "Unknown arg: $1"; print_usage; exit 3;;
esac
done
# Hard requirements: kubectl for the API calls, python3 for JSON parsing.
if ! command -v kubectl >/dev/null 2>&1; then
echo "UNKNOWN - kubectl not found in PATH"
exit 3
fi
if ! command -v python3 >/dev/null 2>&1; then
echo "UNKNOWN - python3 not found in PATH (required for JSON parsing)"
exit 3
fi
# ---- kubeconfig handling ----
# If KUBECONFIG is not set, try sensible defaults so sudo/nagios runs succeed.
# Priority:
# 1) env KUBECONFIG if already defined
# 2) /etc/kubernetes/admin.conf if present (common on control-planes)
# 3) /root/.kube/config if present
# 4) fallback to empty (kubectl will then try defaults and may fail)
if [[ -z "${KUBECONFIG:-}" ]]; then
if [[ -r "/etc/kubernetes/admin.conf" ]]; then
export KUBECONFIG="/etc/kubernetes/admin.conf"
elif [[ -r "/root/.kube/config" ]]; then
export KUBECONFIG="/root/.kube/config"
else
# leave unset; kubectl will attempt defaults
unset KUBECONFIG || true
fi
fi
# Use explicit kubeconfig for kubectl invocations to avoid home/KUBECONFIG differences under sudo.
# NOTE(review): KC is a plain string expanded unquoted later ($KC ...); this
# relies on word splitting and breaks if the kubeconfig path contains spaces.
# The sibling check_coredns_health builds an array for the same purpose.
if [[ -n "${KUBECONFIG:-}" ]]; then
KC="kubectl --kubeconfig=${KUBECONFIG} --request-timeout=${TIMEOUT}s"
else
KC="kubectl --request-timeout=${TIMEOUT}s"
fi
# run_python_parser INPUT PYPROG
# Feed INPUT to python3 running PYPROG and print the parser's stdout.
# Returns python3's exit status; the parser's stderr is discarded.
# Simplified from the previous temp-file version: `python3 -c` takes the
# program directly, so there are no temp files to leak if the script is
# killed mid-parse, and the observable contract is unchanged (all callers
# capture the output via command substitution).
run_python_parser() {
  local input="$1"
  local pyprog="$2"
  local out rc=0
  # `|| rc=$?` keeps `set -e` from aborting the whole script on parse errors.
  out=$(printf '%s' "$input" | python3 -c "$pyprog" 2>/dev/null) || rc=$?
  if (( rc != 0 )); then
    return "$rc"
  fi
  printf '%s\n' "$out"
  return 0
}
# 1) Fetch the Cilium pod list as JSON (stderr captured for diagnostics).
set +e
pods_json=$($KC -n "$NAMESPACE" get pods -l "$LABEL" -o json 2>&1)
rc_kubectl=$?
set -e
if (( rc_kubectl != 0 )); then
  # Collapse newlines so the Nagios status stays on a single line.
  # BUG FIX: the old replacement ` ' '` inserted literal quote characters.
  echo "CRITICAL - kubectl failed to list Cilium pods: ${pods_json//$'\n'/ }"
  exit 2
fi
# 2) Parse the pod JSON with python; one TSV record per pod:
#    name<TAB>phase<TAB>ready/total<TAB>restarts<TAB>node
pod_python_prog=$'import sys,json\ntry:\n data=json.load(sys.stdin)\nexcept Exception:\n sys.exit(1)\nitems=data.get(\"items\",[])\nfor it in items:\n name=it.get(\"metadata\",{}).get(\"name\",\"<noname>\")\n node=it.get(\"spec\",{}).get(\"nodeName\",\"\")\n phase=it.get(\"status\",{}).get(\"phase\",\"\")\n cs=it.get(\"status\",{}).get(\"containerStatuses\",[]) or []\n total_cont=len(cs)\n ready_cnt=sum(1 for c in cs if c.get(\"ready\") is True)\n restarts=sum(int(c.get(\"restartCount\",0) or 0) for c in cs)\n ready_str = f\"{ready_cnt}/{total_cont}\"\n print(f\"{name}\\t{phase}\\t{ready_str}\\t{restarts}\\t{node}\")\n'
pod_lines=()
if pod_out=$(run_python_parser "$pods_json" "$pod_python_prog"); then
  # mapfile splits on newlines without the read -d '' contortions.
  mapfile -t pod_lines <<< "$pod_out"
fi
# Fallback when JSON parsing failed or yielded nothing: plain kubectl columns.
if [[ ${#pod_lines[@]} -eq 0 || -z "${pod_lines[0]:-}" ]]; then
  pod_lines=()
  simple=$($KC -n "$NAMESPACE" get pods -l "$LABEL" --no-headers 2>&1 || true)
  count_simple=$(printf '%s\n' "$simple" | sed '/^\s*$/d' | wc -l)
  if [[ "$count_simple" -eq 0 ]]; then
    echo "CRITICAL - no Cilium pods found or kubectl output unparsable. kubectl output: ${simple//$'\n'/ }"
    exit 2
  fi
  # Derive minimal TSV records from the "NAME READY ..." columns.
  while IFS= read -r l; do
    [[ -z "$l" ]] && continue
    name=$(awk '{print $1}' <<< "$l")
    readycol=$(awk '{print $2}' <<< "$l")
    if [[ "$readycol" == */* ]]; then
      rnum=${readycol%%/*}
      rtot=${readycol##*/}
    else
      rnum=0; rtot=0
    fi
    if [[ "$rnum" == "$rtot" && "$rtot" != "0" ]]; then
      phase="Running"
    else
      phase="NotReady"
    fi
    # BUG FIX: fields must be joined with a real TAB ($'\t'); the previous
    # quoted "\t" produced the two literal characters backslash-t, so the
    # later IFS=$'\t' read never split these records.
    pod_lines+=("${name}"$'\t'"${phase}"$'\t'"${rnum}/${rtot}"$'\t'"0"$'\t'"")
  done < <(printf '%s\n' "$simple")
fi
# Evaluate each pod record and tally not-ready pods plus pods with elevated
# restart counts. Records are expected as TAB-separated fields:
# name, phase, ready x/y, restarts, node.
total_pods=0
not_ready=0
not_ready_list=()
high_restart_pods=()
for line in "${pod_lines[@]}"; do
[[ -z "$line" ]] && continue
total_pods=$((total_pods+1))
# NOTE(review): assumes fields are joined with real TAB characters.
IFS=$'\t' read -r pname pphase pready prest pnode <<< "$line"
ready_num=${pready%/*}
ready_tot=${pready#*/}
ready_num=${ready_num:-0}
ready_tot=${ready_tot:-0}
# A pod counts as not ready when its phase is not Running or fewer
# containers are ready than exist in the pod.
if [[ "$pphase" != "Running" ]] || (( ready_num < ready_tot )); then
not_ready=$((not_ready+1))
not_ready_list+=("${pname}:${pphase}:${pready}")
fi
prest=${prest:-0}
# Classify restart counts against the per-pod thresholds.
if (( prest >= CRIT_RESTARTS )); then
high_restart_pods+=("${pname}:${prest}:CRITICAL")
elif (( prest >= WARN_RESTARTS )); then
high_restart_pods+=("${pname}:${prest}:WARN")
fi
done
# DaemonSet check: sum desiredNumberScheduled vs numberReady across all
# daemonsets matching the label (parsed safely via python).
set +e
ds_out=$($KC -n "$NAMESPACE" get ds -l "$LABEL" -o json 2>&1)
rc_ds=$?
set -e
ds_desired=0; ds_ready=0
if (( rc_ds == 0 )); then
# Emits one "desired<TAB>ready" line per daemonset.
ds_python_prog=$'import sys,json\ndata=json.load(sys.stdin)\nfor it in data.get(\"items\",[]):\n s=it.get(\"status\",{})\n desired=int(s.get(\"desiredNumberScheduled\") or 0)\n ready=int(s.get(\"numberReady\") or 0)\n print(f\"{desired}\\t{ready}\")\n'
if ds_out_parsed=$(run_python_parser "$ds_out" "$ds_python_prog"); then
while IFS=$'\n' read -r d; do
[[ -z "$d" ]] && continue
ddesired=$(echo "$d" | cut -f1)
dready=$(echo "$d" | cut -f2)
ds_desired=$((ds_desired+ddesired))
ds_ready=$((ds_ready+dready))
done <<< "$ds_out_parsed"
fi
fi
# cilium-operator Deployment: compare availableReplicas to spec.replicas.
# stderr is discarded: a missing deployment leaves op_json empty and the
# whole check is silently skipped (op_msg stays empty).
op_ok=1
op_msg=""
set +e
op_json=$($KC -n "$NAMESPACE" get deploy cilium-operator -o json 2>/dev/null || true)
set -e
if [[ -n "$op_json" ]]; then
# Prints "replicas<TAB>available"; spec.replicas defaults to 1 when unset.
op_python_prog=$'import sys,json\ndata=json.load(sys.stdin)\nspec=data.get(\"spec\",{})\nstatus=data.get(\"status\",{})\nreplicas=int(spec.get(\"replicas\") or 1)\navailable=int(status.get(\"availableReplicas\") or 0)\nprint(f\"{replicas}\\t{available}\")\n'
if op_line=$(run_python_parser "$op_json" "$op_python_prog"); then
IFS=$'\t' read -r op_repl op_avail <<< "$op_line"
if (( op_avail < op_repl )); then
op_ok=0
op_msg="operator available=${op_avail}/${op_repl}"
else
op_msg="operator available=${op_avail}/${op_repl}"
fi
fi
fi
# Optional additional probe via the cilium CLI itself (--use-cilium-cli).
cilium_ok=1
cilium_summary=""
if (( USE_CILIUM_CLI == 1 )); then
  if ! command -v cilium >/dev/null 2>&1; then
    cilium_ok=0
    cilium_summary="cilium binary not in PATH"
  else
    set +e
    cilium_raw=$(cilium status -o json 2>&1)
    # BUG FIX: the previous `|| true` on the assignment forced $? to 0,
    # so rc_cilium was always 0 and the failure branch was unreachable.
    rc_cilium=$?
    set -e
    if (( rc_cilium != 0 )); then
      cilium_ok=0
      cilium_summary="cilium status failed: ${cilium_raw//$'\n'/ }"
    else
      cilium_ok=1
      # Flatten and truncate to keep the Nagios status line short.
      # BUG FIX: `sed 's/ */ /g'` matches zero-length runs and inserts a
      # space between every character; `tr -s ' '` squeezes runs instead.
      cilium_summary=$(printf '%s' "$cilium_raw" | tr '\n' ' ' | tr -s ' ' | cut -c1-300)
    fi
  fi
fi
# ---- Compose the final Nagios status ---------------------------------------
# code holds the worst severity seen (0=OK, 1=WARNING, 2=CRITICAL);
# msgs collects the message fragments joined with " ; " at the end.
code=0
msgs=()
if (( not_ready >= CRIT_NOT_READY )); then
  code=2
  msgs+=("CRITICAL - ${not_ready}/${total_pods} pods not ready")
elif (( not_ready >= WARN_NOT_READY )); then
  if (( code < 1 )); then code=1; fi
  msgs+=("WARNING - ${not_ready}/${total_pods} pods not ready")
else
  msgs+=("OK - ${total_pods} pods, not-ready=${not_ready}")
fi
# DaemonSet shortfall reuses the pod-count thresholds for severity.
if (( ds_desired > 0 )) && (( ds_ready < ds_desired )); then
  if (( ds_desired - ds_ready >= CRIT_NOT_READY )); then
    code=2
    msgs+=("CRITICAL - daemonsets ready=${ds_ready}/${ds_desired}")
  else
    if (( code < 1 )); then code=1; fi
    msgs+=("WARNING - daemonsets ready=${ds_ready}/${ds_desired}")
  fi
fi
# Operator result (op_msg is empty when the deployment was not found).
if [[ -n "$op_msg" ]]; then
  if (( op_ok == 0 )); then
    code=2
    msgs+=("CRITICAL - ${op_msg}")
  else
    msgs+=("${op_msg}")
  fi
fi
# Restart counts: CRITICAL dominates WARN.
if (( ${#high_restart_pods[@]} > 0 )); then
  crit_restart=0; warn_restart=0
  for r in "${high_restart_pods[@]}"; do
    [[ "$r" == *":CRITICAL" ]] && crit_restart=1
    [[ "$r" == *":WARN" ]] && warn_restart=1
  done
  if (( crit_restart == 1 )); then
    code=2
    msgs+=("CRITICAL - pods with high restart counts: ${high_restart_pods[*]}")
  elif (( warn_restart == 1 )); then
    if (( code < 1 )); then code=1; fi
    msgs+=("WARNING - pods with elevated restarts: ${high_restart_pods[*]}")
  fi
fi
if (( USE_CILIUM_CLI == 1 )); then
  if (( cilium_ok == 0 )); then
    code=2
    msgs+=("CRITICAL - cilium-cli: ${cilium_summary}")
  else
    msgs+=("cilium-cli ok: ${cilium_summary}")
  fi
fi
if (( not_ready > 0 )); then
  truncated=$(printf "%s, " "${not_ready_list[@]}" | sed 's/, $//')
  msgs+=("not-ready-list: ${truncated}")
fi
# Join fragments with " ; ".  BUG FIX: the previous `IFS=' ; '` trick joined
# ${msgs[*]} with only the FIRST character of IFS (a bare space).
joined=$(printf '%s ; ' "${msgs[@]}")
echo "${joined% ; }"
exit "${code}"

View File

@@ -0,0 +1,158 @@
#!/usr/bin/env bash
# check_coredns_health
# Checks CoreDNS health (Endpoints + EndpointSlices + pod fallback).
# Exit codes (Nagios): 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
#
# Usage:
# sudo /usr/lib/nagios/plugins/check_coredns_health [--namespace N] [--service NAME] [--label-fallback LABEL] [--kubeconfig PATH]
#
set -euo pipefail
# Defaults; overridable from the environment or via the CLI flags below.
NAMESPACE=${NAMESPACE:-kube-system}
SERVICE_NAME=${SERVICE_NAME:-coredns}
LABEL_FALLBACK=${LABEL_FALLBACK:-k8s-app=kube-dns}
TIMEOUT=${TIMEOUT:-10}
# Print usage to stdout (expands current defaults).
usage() {
cat <<EOF
Usage: $0 [--namespace N] [--service NAME] [--label-fallback LABEL] [--kubeconfig PATH]
Defaults: namespace=$NAMESPACE service=$SERVICE_NAME
EOF
}
# Parse command-line arguments; --help and unknown flags exit 3 (UNKNOWN).
while [[ $# -gt 0 ]]; do
case "$1" in
--namespace) NAMESPACE="$2"; shift 2;;
--service) SERVICE_NAME="$2"; shift 2;;
--label-fallback) LABEL_FALLBACK="$2"; shift 2;;
--kubeconfig) export KUBECONFIG="$2"; shift 2;;
-h|--help) usage; exit 3;;
*) echo "Unknown arg: $1"; usage; exit 3;;
esac
done
if ! command -v kubectl >/dev/null 2>&1; then
echo "UNKNOWN - kubectl not found"
exit 3
fi
# If KUBECONFIG not set, try sensible defaults so sudo/nagios runs succeed.
if [[ -z "${KUBECONFIG:-}" ]]; then
if [[ -r "/etc/kubernetes/admin.conf" ]]; then
export KUBECONFIG="/etc/kubernetes/admin.conf"
elif [[ -r "/root/.kube/config" ]]; then
export KUBECONFIG="/root/.kube/config"
fi
fi
# Build the kubectl command as an array so a kubeconfig path containing
# spaces survives quoting when expanded as "${KC[@]}".
if [[ -n "${KUBECONFIG:-}" ]]; then
KC=(kubectl --kubeconfig="${KUBECONFIG}" --request-timeout="${TIMEOUT}s")
else
KC=(kubectl --request-timeout="${TIMEOUT}s")
fi
# run_kc ARGS...: run kubectl (from the KC array) with stderr discarded.
# Prints the captured stdout and returns kubectl's exit status.
# BUG FIX: under `set -e` a bare `out="$(...)"` that fails aborted the whole
# script before the status could be returned; `|| rc=$?` captures it safely.
# Callers must likewise use `val=$(run_kc ...) || rc=$?`.
run_kc() {
  local out rc=0
  out="$("${KC[@]}" "$@" 2>/dev/null)" || rc=$?
  printf '%s' "$out"
  return "$rc"
}
# 1) Endpoints resource: any listed address means the service is served.
# BUG FIX: under `set -e`, `var=$(run_kc ...)` followed by `rc=$?` was dead
# code -- a kubectl failure aborted the script (with kubectl's exit code,
# not 2) before the CRITICAL branch could run. `|| rc=$?` captures it.
rc=0
ep_out=$(run_kc -n "$NAMESPACE" get endpoints "$SERVICE_NAME" -o jsonpath='{.subsets[*].addresses[*].ip}') || rc=$?
if (( rc != 0 )); then
  echo "CRITICAL - kubectl failed to get Endpoints (exit code ${rc})"
  exit 2
fi
if [[ -n "${ep_out// /}" ]]; then
  echo "OK - service ${SERVICE_NAME} in ${NAMESPACE} has endpoints: $(echo "$ep_out" | tr ' ' ',')"
  exit 0
fi
# 2) EndpointSlices (k8s >= 1.17): one address list per line.
rc=0
eps_out=$(run_kc -n "$NAMESPACE" get endpointslices -l "kubernetes.io/service-name=${SERVICE_NAME}" -o jsonpath='{range .items[*]}{range .endpoints[*]}{.addresses[*]}{"\n"}{end}{end}') || rc=$?
if (( rc != 0 )); then
  echo "CRITICAL - kubectl failed to get EndpointSlices (exit code ${rc})"
  exit 2
fi
if [[ -n "${eps_out// /}" ]]; then
  tops=$(printf '%s\n' "$eps_out" | sed '/^\s*$/d' | tr '\n' ',' | sed 's/,$//')
  echo "OK - service ${SERVICE_NAME} in ${NAMESPACE} has EndpointSlices addresses: ${tops}"
  exit 0
fi
# 3) Fallback: derive the service's pod selector and inspect matching pods.
# BUG FIX: `{range $k,$v := ...}` is go-template syntax which kubectl's
# jsonpath printer rejects; use -o go-template for map iteration.
# Same `|| rc=$?` pattern as above so failures survive `set -e`.
rc=0
svc_out=$(run_kc -n "$NAMESPACE" get svc "$SERVICE_NAME" -o go-template='{{range $k, $v := .spec.selector}}{{$k}}={{$v}};{{end}}') || rc=$?
if (( rc != 0 )); then
  echo "CRITICAL - kubectl failed to get Service selector (exit code ${rc})"
  exit 2
fi
SEL="$svc_out"
if [[ -z "$SEL" ]]; then
  SEL="$LABEL_FALLBACK"
fi
# kubectl -l wants comma separators: convert all ';' (the template emits one
# per key) and drop any trailing separator. The old code only converted the
# fallback label, so multi-key service selectors produced an invalid -l arg.
SEL=${SEL//;/,}
SEL=${SEL%[;,]}
# Pods matching the selector, as "READY NAME" lines.
rc=0
pods_out=$(run_kc -n "$NAMESPACE" get pods -l "$SEL" --no-headers -o custom-columns=READY:.status.containerStatuses[0].ready,NAME:.metadata.name) || rc=$?
if (( rc != 0 )); then
  echo "CRITICAL - kubectl failed to list pods for selector '${SEL}' (exit code ${rc})"
  exit 2
fi
if [[ -z "${pods_out// /}" ]]; then
  # Try an alternative label commonly used for CoreDNS.
  rc=0
  pods_alt=$(run_kc -n "$NAMESPACE" get pods -l k8s-app=coredns --no-headers -o custom-columns=READY:.status.containerStatuses[0].ready,NAME:.metadata.name) || rc=$?
  if (( rc != 0 )); then
    echo "CRITICAL - kubectl failed to list pods for fallback selector (exit code ${rc})"
    exit 2
  fi
  if [[ -n "${pods_alt// /}" ]]; then
    pods_out="$pods_alt"
    SEL="k8s-app=coredns (fallback)"
  fi
fi
if [[ -z "${pods_out// /}" ]]; then
  echo "CRITICAL - service ${SERVICE_NAME} in ${NAMESPACE} has no endpoints and no pods match selector '${SEL}'"
  exit 2
fi
# Count Ready pods from the "READY NAME" custom-columns output.
not_ready_count=0
total_count=0
not_ready_list=()
while IFS= read -r line; do
[[ -z "$line" ]] && continue
total_count=$((total_count+1))
ready_flag=$(echo "$line" | awk '{print $1}')
pod_name=$(echo "$line" | awk '{print $2}')
# kubectl prints the boolean as "true"; accept common variants.
if [[ "$ready_flag" != "true" && "$ready_flag" != "True" && "$ready_flag" != "1" ]]; then
not_ready_count=$((not_ready_count+1))
not_ready_list+=("$pod_name")
fi
done <<< "$pods_out"
if (( total_count == 0 )); then
echo "CRITICAL - service ${SERVICE_NAME} in ${NAMESPACE} has no endpoints and no pods found for selector '${SEL}'"
exit 2
fi
if (( not_ready_count > 0 )); then
echo "WARNING - service ${SERVICE_NAME} in ${NAMESPACE} has no endpoints, but ${not_ready_count}/${total_count} pods matching selector '${SEL}' are not Ready: ${not_ready_list[*]}"
exit 1
fi
# If pods exist and are Ready but no Endpoints/EndpointSlices -> likely endpointcontroller/roles mismatch; consider OK but log it
echo "OK - service ${SERVICE_NAME} in ${NAMESPACE} has no Endpoints resource but ${total_count} pods matching selector '${SEL}' are Ready (EndpointSlices absent or controller issue)"
exit 0

View File

@@ -0,0 +1,230 @@
#!/usr/bin/env bash
# check_etcd_health
# Checks etcd health and (optionally) snapshot creation/verification.
# Exit codes (Nagios): 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
#
# Usage example:
# sudo /usr/lib/nagios/plugins/check_etcd_health \
# --endpoints "https://192.168.1.41:2379,https://192.168.1.42:2379" \
# --cacert /etc/ssl/etcd/ssl/ca.pem --cert /etc/ssl/etcd/ssl/admin.pem --key /etc/ssl/etcd/ssl/admin-key.pem \
# --test-snapshot --snapshot-dir /var/backups/etcd --snapshot-max-age 24
#
# Notes:
# - For safety, run this script on a master (or via NRPE/SSH) with a user able to read the keys.
# - --snapshot-max-age is in hours (default 24). Set 0 to disable the age check.
# - --test-snapshot creates a temporary snapshot to validate creation + verification via `etcdctl snapshot status`.
# - With --keep-snapshot-on-failure, the temporary snapshot is kept on error for debugging.
# NOTE(review): unlike the sibling checks this script does not `set -euo
# pipefail`; every etcdctl call checks its own exit status -- confirm this
# is intentional.
ETCDCTL=${ETCDCTL:-/usr/local/bin/etcdctl}
# Print usage to stdout.
print_usage() {
cat <<EOF
Usage: $0 --endpoints ENDPOINTS --cacert CA --cert CERT --key KEY [options]
Options:
--warn-db-mb N avertissement si DB >= N MB (default 1024)
--crit-db-mb M critique si DB >= M MB (default 1800)
--timeout SECS etcdctl timeout (default 10)
--test-snapshot tenter de creer un snapshot temporaire et verifier son status
--snapshot-dir DIR repertoire pour snapshots temporaires (default /var/backups/etcd)
--keep-snapshot-on-failure conserver le snapshot temporaire si creation echoue (default false)
--snapshot-max-age HRS verifier qu'il existe un snapshot plus recent que HRS heures (default 24). Mettre 0 pour desactiver.
-h, --help affiche cette aide
EOF
}
# Defaults (environment-overridable; CLI flags below take precedence)
WARN_DB_MB=${WARN_DB_MB:-1024}
CRIT_DB_MB=${CRIT_DB_MB:-1800}
TIMEOUT=${TIMEOUT:-10}
TEST_SNAPSHOT=0
SNAPSHOT_DIR=${SNAPSHOT_DIR:-/var/backups/etcd}
KEEP_SNAPSHOT_ON_FAILURE=0
SNAPSHOT_MAX_AGE_HOURS=${SNAPSHOT_MAX_AGE_HOURS:-24}
# Parse command-line arguments; --help and unknown flags exit 3 (UNKNOWN).
while [[ $# -gt 0 ]]; do
case "$1" in
--endpoints) ENDPOINTS="$2"; shift 2;;
--cacert) CACERT="$2"; shift 2;;
--cert) CERT="$2"; shift 2;;
--key) KEY="$2"; shift 2;;
--warn-db-mb) WARN_DB_MB="$2"; shift 2;;
--crit-db-mb) CRIT_DB_MB="$2"; shift 2;;
--timeout) TIMEOUT="$2"; shift 2;;
--test-snapshot) TEST_SNAPSHOT=1; shift 1;;
--snapshot-dir) SNAPSHOT_DIR="$2"; shift 2;;
--keep-snapshot-on-failure) KEEP_SNAPSHOT_ON_FAILURE=1; shift 1;;
--snapshot-max-age) SNAPSHOT_MAX_AGE_HOURS="$2"; shift 2;;
-h|--help) print_usage; exit 3;;
*) echo "Unknown arg: $1"; print_usage; exit 3;;
esac
done
# Fall back to the standard ETCDCTL_* environment variables when the flags
# were not supplied.
ENDPOINTS=${ENDPOINTS:-${ETCDCTL_ENDPOINTS:-}}
CACERT=${CACERT:-${ETCDCTL_CACERT:-}}
CERT=${CERT:-${ETCDCTL_CERT:-}}
KEY=${KEY:-${ETCDCTL_KEY:-}}
if [[ -z "${ENDPOINTS:-}" || -z "${CACERT:-}" || -z "${CERT:-}" || -z "${KEY:-}" ]]; then
echo "UNKNOWN - missing required args/certs"
print_usage
exit 3
fi
if [[ ! -x "$ETCDCTL" ]]; then
echo "UNKNOWN - etcdctl not found at $ETCDCTL"
exit 3
fi
# Unreadable certificates are reported as CRITICAL (likely permissions).
if [[ ! -r "$CACERT" || ! -r "$CERT" || ! -r "$KEY" ]]; then
echo "CRITICAL - cannot read certificate files (permissions?)"
echo "CACERT=$CACERT CERT=$CERT KEY=$KEY"
exit 2
fi
export ETCDCTL_API=3
# 1) endpoint status: the "simple" output is one CSV line per endpoint:
#    endpoint, ID, version, db size (e.g. "25 kB"), isLeader, isLearner, ...
OUT=$("$ETCDCTL" --command-timeout="${TIMEOUT}s" --endpoints="${ENDPOINTS}" --cacert="$CACERT" --cert="$CERT" --key="$KEY" endpoint status 2>&1) || {
  echo "CRITICAL - etcdctl endpoint status failed: $OUT"
  exit 2
}
leaders=0
total=0
max_db_mb=0
while IFS= read -r line; do
  line=${line//$'\r'/}
  [[ -z "$line" ]] && continue
  total=$((total+1))
  IFS=',' read -r endpoint id version dbsize isLeader isLearner memberCount rest <<<"$line"
  isLeader=$(echo "${isLeader:-}" | tr -d ' ' | tr '[:upper:]' '[:lower:]')
  if [[ "$isLeader" == "true" ]]; then leaders=$((leaders+1)); fi
  db_mb=0
  if [[ -n "${dbsize:-}" ]]; then
    dbsize=$(echo "$dbsize" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')
    num=$(echo "$dbsize" | awk '{print $1}' 2>/dev/null || echo "")
    unit=$(echo "$dbsize" | awk '{print $2}' 2>/dev/null || echo "")
    if [[ "$num" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
      # BUG FIX: bash integer arithmetic ($(( num / 1024 ))) errors out on
      # decimal sizes such as "25.6", which the regex above explicitly
      # accepts; delegate unit conversion to awk, which handles floats.
      case "${unit^^}" in
        B)  db_mb=$(awk -v n="$num" 'BEGIN{printf "%d", n/1048576}') ;;
        KB) db_mb=$(awk -v n="$num" 'BEGIN{printf "%d", n/1024}') ;;
        MB) db_mb=$(awk -v n="$num" 'BEGIN{printf "%.0f", n}') ;;
        GB) db_mb=$(awk -v n="$num" 'BEGIN{printf "%.0f", n*1024}') ;;
        *)  db_mb=$(awk -v n="$num" 'BEGIN{printf "%.0f", n}') ;;
      esac
    fi
  fi
  # Track the largest DB across all endpoints for the threshold checks.
  if (( db_mb > max_db_mb )); then max_db_mb=$db_mb; fi
done <<< "$OUT"
# Cluster-level verdicts from the parsed endpoint data.
if (( total == 0 )); then
echo "CRITICAL - no endpoints returned by etcdctl"
exit 2
fi
if (( leaders == 0 )); then
echo "CRITICAL - no leader found among $total endpoints; detail: $OUT"
exit 2
fi
if (( leaders > 1 )); then
echo "WARNING - multiple leaders detected: $leaders (possible split-brain); detail: $OUT"
exit 1
fi
# DB size thresholds are applied to the largest DB seen on any endpoint.
if (( max_db_mb >= CRIT_DB_MB )); then
echo "CRITICAL - etcd DB size ${max_db_mb}MB >= ${CRIT_DB_MB}MB"
exit 2
fi
if (( max_db_mb >= WARN_DB_MB )); then
echo "WARNING - etcd DB size ${max_db_mb}MB >= ${WARN_DB_MB}MB"
exit 1
fi
# 2) Verify a recent snapshot file exists (optional; 0 disables the check).
SNAP_CHECK_MSG=""
if [[ -n "$SNAPSHOT_MAX_AGE_HOURS" ]]; then
  # SNAPSHOT_MAX_AGE_HOURS == 0 -> disabled
  if (( SNAPSHOT_MAX_AGE_HOURS > 0 )); then
    mkdir -p "$SNAPSHOT_DIR" 2>/dev/null || {
      echo "CRITICAL - cannot create/access snapshot dir $SNAPSHOT_DIR"
      exit 2
    }
    # Newest snapshot by mtime. Parsing `ls -1t` output breaks on unusual
    # filenames; find -printf with a numeric sort is robust.
    latest_snapshot=$(find "$SNAPSHOT_DIR" -maxdepth 1 -name 'snapshot-*.db' -printf '%T@\t%p\n' 2>/dev/null | sort -rn | head -n1 | cut -f2-)
    if [[ -z "$latest_snapshot" ]]; then
      SNAP_CHECK_MSG="no snapshot files found in $SNAPSHOT_DIR"
      echo "CRITICAL - $SNAP_CHECK_MSG (no snapshots)"
      exit 2
    else
      now_s=$(date +%s)
      snap_mtime_s=$(stat -c %Y "$latest_snapshot")
      age_s=$(( now_s - snap_mtime_s ))
      age_h=$(( age_s / 3600 ))
      if (( age_h > SNAPSHOT_MAX_AGE_HOURS )); then
        SNAP_CHECK_MSG="latest snapshot $latest_snapshot is ${age_h}h old (> ${SNAPSHOT_MAX_AGE_HOURS}h)"
        echo "CRITICAL - $SNAP_CHECK_MSG"
        exit 2
      else
        SNAP_CHECK_MSG="latest snapshot $latest_snapshot is ${age_h}h old (<= ${SNAPSHOT_MAX_AGE_HOURS}h)"
      fi
    fi
  fi
fi
# 3) Optional: exercise snapshot creation end-to-end and verify the result.
SNAP_TEST_MSG=""
if (( TEST_SNAPSHOT == 1 )); then
  mkdir -p "$SNAPSHOT_DIR" 2>/dev/null || {
    echo "CRITICAL - cannot create/access snapshot dir $SNAPSHOT_DIR"
    exit 2
  }
  if [[ ! -w "$SNAPSHOT_DIR" ]]; then
    echo "CRITICAL - snapshot dir not writable: $SNAPSHOT_DIR"
    exit 2
  fi
  SNAPFILE=$(mktemp "${SNAPSHOT_DIR}/snapshot-XXXXXX.db") || {
    echo "CRITICAL - mktemp failed in $SNAPSHOT_DIR"
    exit 2
  }
  # EXIT trap: remove the temporary snapshot on success; on failure keep it
  # only when --keep-snapshot-on-failure was requested.
  cleanup() {
    rc=$?
    if [[ $rc -eq 0 ]]; then
      rm -f "$SNAPFILE" 2>/dev/null || true
    else
      if [[ $KEEP_SNAPSHOT_ON_FAILURE -eq 0 ]]; then
        rm -f "$SNAPFILE" 2>/dev/null || true
      else
        echo "NOTICE - snapshot kept at $SNAPFILE for debugging"
      fi
    fi
    return $rc
  }
  trap 'cleanup' EXIT
  SAVE_OUT=$("$ETCDCTL" --command-timeout="${TIMEOUT}s" --endpoints="${ENDPOINTS}" --cacert="$CACERT" --cert="$CERT" --key="$KEY" snapshot save "$SNAPFILE" 2>&1) || {
    echo "CRITICAL - snapshot save failed: $SAVE_OUT"
    exit 2
  }
  STATUS_OUT=$("$ETCDCTL" snapshot status "$SNAPFILE" 2>&1) || {
    echo "CRITICAL - snapshot status failed: $STATUS_OUT"
    exit 2
  }
  # Flatten the status output onto one line.  BUG FIX: `sed 's/ */ /g'`
  # matches zero-length runs and inserts a space between every character;
  # `tr -s ' '` squeezes repeated spaces instead.
  SNAP_TEST_MSG="snapshot test ok: $SNAPFILE ; status: $(echo "$STATUS_OUT" | tr '\n' ' ' | tr -s ' ')"
  # cleanup (EXIT trap) removes the snapshot unless kept for debugging
fi
# Assemble the final OK line, appending the optional snapshot details.
MSG="OK - $total endpoints checked, leaders=$leaders, max_db=${max_db_mb}MB"
for extra in "$SNAP_CHECK_MSG" "$SNAP_TEST_MSG"; do
  if [[ -n "$extra" ]]; then
    MSG+=" ; $extra"
  fi
done
echo "$MSG"
exit 0

View File

@@ -0,0 +1,214 @@
#!/usr/bin/env bash
# check_k8s_apiserver_access
# Counts HTTP 403 responses in the kube-apiserver logs over a time window.
# Exit codes (Nagios): 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
#
# Default: uses journalctl -u kube-apiserver --since="${WINDOW} minutes ago"
# Option --kubectl : uses "kubectl logs" on the pods matching the selector.
#
# Usage examples:
# sudo /usr/lib/nagios/plugins/check_k8s_apiserver_access --window 5 --warn 10 --crit 50
# sudo /usr/lib/nagios/plugins/check_k8s_apiserver_access --kubectl --selector 'k8s-app=kube-apiserver' --window 10 --crit 100
#
set -euo pipefail
PROG_NAME=$(basename "$0")
# Defaults (overridden by the CLI flags parsed below)
WINDOW_MINUTES=5
WARN_THRESHOLD=10
CRIT_THRESHOLD=50
USE_KUBECTL=0
KUBECTL_NAMESPACE="kube-system"
KUBECTL_SELECTOR="" # if empty, we'll try -l component=kube-apiserver or label provided
JOURNAL_UNIT="kube-apiserver" # systemd unit name; adapt if different
PATTERN='' # optional custom grep regex
TOP_N=5 # number of top offenders to show
# Print help text to stdout; the heredoc expands the current defaults.
print_help() {
cat <<EOF
$PROG_NAME - check apiserver 403 rate in logs
Options:
--window N Window in minutes to look back (default: ${WINDOW_MINUTES})
--warn N WARN threshold: count >= N -> WARNING (default: ${WARN_THRESHOLD})
--crit N CRIT threshold: count >= N -> CRITICAL (default: ${CRIT_THRESHOLD})
--kubectl Use 'kubectl logs' on apiserver pods instead of journalctl
--namespace NS Namespace for kubectl logs (default: ${KUBECTL_NAMESPACE})
--selector SEL Label selector for kubectl logs (e.g. "component=kube-apiserver" or "k8s-app=kube-apiserver")
--unit UNIT systemd unit for journalctl (default: ${JOURNAL_UNIT})
--pattern REGEX custom grep regex to detect 403 entries (overrides built-in heuristics)
--top N show top N request lines causing 403 (default ${TOP_N})
-h, --help show this help
Examples:
# check last 5 minutes using journalctl
sudo ./check_apiserver_403.sh --window 5 --warn 20 --crit 50
# check last 10 minutes using kubectl logs for apiserver static-pods
sudo ./check_apiserver_403.sh --kubectl --namespace kube-system --selector 'k8s-app=kube-apiserver' --window 10 --crit 100
EOF
}
# Parse command-line arguments; --help and unknown flags exit 3 (UNKNOWN).
while [[ $# -gt 0 ]]; do
case "$1" in
--window) WINDOW_MINUTES="$2"; shift 2;;
--warn) WARN_THRESHOLD="$2"; shift 2;;
--crit) CRIT_THRESHOLD="$2"; shift 2;;
--kubectl) USE_KUBECTL=1; shift 1;;
--namespace) KUBECTL_NAMESPACE="$2"; shift 2;;
--selector) KUBECTL_SELECTOR="$2"; shift 2;;
--unit) JOURNAL_UNIT="$2"; shift 2;;
--pattern) PATTERN="$2"; shift 2;;
--top) TOP_N="$2"; shift 2;;
-h|--help) print_help; exit 3;;
*) echo "Unknown argument: $1"; print_help; exit 3;;
esac
done
# Validate numeric args
if ! [[ "$WINDOW_MINUTES" =~ ^[0-9]+$ ]]; then echo "UNKNOWN - invalid --window"; exit 3; fi
if ! [[ "$WARN_THRESHOLD" =~ ^[0-9]+$ ]]; then echo "UNKNOWN - invalid --warn"; exit 3; fi
if ! [[ "$CRIT_THRESHOLD" =~ ^[0-9]+$ ]]; then echo "UNKNOWN - invalid --crit"; exit 3; fi
if ! [[ "$TOP_N" =~ ^[0-9]+$ ]]; then echo "UNKNOWN - invalid --top"; exit 3; fi
# Build detection regex if not provided
if [[ -z "$PATTERN" ]]; then
# heuristics: try to match common apiserver log patterns that indicate a 403/Forbidden
# examples: "\" 403 ", "code=403", "403 Forbidden", "Forbidden" combined with "Denied" etc.
# NOTE(review): the bare `Forbidden` alternative also matches any log line
# containing the word; supply --pattern if this over-counts in practice.
PATTERN='(" 403 |\" 403 |code=403|403 Forbidden|Forbidden|\"Reason=Forbidden\"|\"message=.*Forbidden)'
# note: portable grep -E will accept that pattern
fi
# ---- log collection ----
# Collect apiserver logs from the systemd journal for JOURNAL_UNIT over the
# configured window. Prints raw log lines; returns 1 when journalctl is
# missing or fails (e.g. unknown unit).
get_logs_journal() {
  command -v journalctl >/dev/null 2>&1 || {
    echo "ERROR_NO_JOURNAL" 1>&2
    return 1
  }
  # --no-pager keeps output streaming; journalctl fails for unknown units.
  journalctl -u "${JOURNAL_UNIT}" --since="${WINDOW_MINUTES} minutes ago" --no-pager 2>/dev/null || return 1
}
# Collect apiserver logs via `kubectl logs`.
# Tries the configured selector, then common control-plane labels, and as a
# last resort any pod in the namespace whose name contains "apiserver".
# Prints the concatenated logs; returns 1 when nothing could be collected.
get_logs_kubectl() {
  if ! command -v kubectl >/dev/null 2>&1; then
    echo "ERROR_NO_KUBECTL" 1>&2
    return 1
  fi
  local sel="${KUBECTL_SELECTOR}"
  if [[ -z "$sel" ]]; then
    # Probe common control-plane labels until one matches pods.
    local try count
    for try in 'component=kube-apiserver' 'k8s-app=kube-apiserver' 'tier=control-plane' ''; do
      if [[ -z "$try" ]]; then
        sel=""
        break
      fi
      count=$(kubectl -n "${KUBECTL_NAMESPACE}" get pods -l "${try}" --no-headers 2>/dev/null | wc -l || echo 0)
      if [[ "$count" -gt 0 ]]; then
        sel="${try}"
        break
      fi
    done
  fi
  local out="" p
  if [[ -z "$sel" ]]; then
    # Fallback: filter pods by name containing "apiserver" (case-insensitive,
    # matching the old `grep -qi`).
    local pods
    pods=$(kubectl -n "${KUBECTL_NAMESPACE}" get pods --no-headers -o custom-columns=':metadata.name' 2>/dev/null || true)
    [[ -z "$pods" ]] && return 1
    while IFS= read -r p; do
      [[ -z "$p" ]] && continue
      if [[ "${p,,}" == *apiserver* ]]; then
        # BUG FIX: $'\n' inside double quotes is NOT a newline (it appended
        # the four literal characters $, ', \, n); append a real newline
        # outside the double quotes. Expansions are also quoted now.
        out+="$(kubectl -n "${KUBECTL_NAMESPACE}" logs --since="${WINDOW_MINUTES}m" "${p}" --all-containers 2>/dev/null || true)"$'\n'
      fi
    done <<< "$pods"
    printf '%s' "$out"
    return 0
  fi
  # Gather logs from every pod matching the selector.
  local podnames
  podnames=$(kubectl -n "${KUBECTL_NAMESPACE}" get pods -l "${sel}" -o custom-columns=':metadata.name' --no-headers 2>/dev/null || true)
  [[ -z "$podnames" ]] && return 1
  while IFS= read -r p; do
    [[ -z "$p" ]] && continue
    out+="$(kubectl -n "${KUBECTL_NAMESPACE}" logs --since="${WINDOW_MINUTES}m" "${p}" --all-containers 2>/dev/null || true)"$'\n'
  done <<< "$podnames"
  printf '%s' "$out"
  return 0
}
# Retrieve the raw log text into LOGS using the selected backend.
LOGS=""
if (( USE_KUBECTL == 1 )); then
if ! LOGS=$(get_logs_kubectl); then
echo "CRITICAL - failed to collect logs via kubectl (check KUBECONFIG, namespace/selector, permissions)"
exit 2
fi
else
if ! LOGS=$(get_logs_journal); then
echo "CRITICAL - failed to collect logs via journalctl for unit '${JOURNAL_UNIT}' (check unit name/permissions)"
exit 2
fi
fi
# If logs empty -> OK (no traffic) BUT treat with UNKNOWN if we expected logs
if [[ -z "$LOGS" ]]; then
echo "OK - no apiserver logs found in the last ${WINDOW_MINUTES}m (count=0)"
exit 0
fi
# Count matches of 403 using grep -E (case-insensitive).
# `|| true` guards pipefail: grep exits 1 when nothing matches but still
# prints the count "0".
count_403=$(printf '%s\n' "$LOGS" | grep -E -i -c "$PATTERN" || true)
count_403=${count_403:-0}
# Optionally extract top request lines that caused 403
# Try to extract HTTP method + path if present, otherwise use whole line truncated
top_requests=$(printf '%s\n' "$LOGS" | grep -E -i "$PATTERN" || true)
if [[ -n "$top_requests" ]]; then
# try to extract method+path like: "GET /api/..." or GET /api/...
top_paths=$(printf '%s\n' "$top_requests" | grep -oE '(GET|POST|PUT|DELETE|PATCH) [^" ]+' | sed 's/"$//' | sort | uniq -c | sort -rn | head -n "${TOP_N}" || true)
if [[ -z "$top_paths" ]]; then
# fallback: show most frequent truncated lines
top_paths=$(printf '%s\n' "$top_requests" | sed 's/^[[:space:]]*//; s/[[:space:]]\+/ /g' | cut -c1-200 | sort | uniq -c | sort -rn | head -n "${TOP_N}" || true)
fi
else
top_paths=""
fi
# Map the 403 count to a Nagios state; both thresholds are inclusive (>=).
status=0
state="OK"
if (( count_403 >= CRIT_THRESHOLD )); then
  status=2
  state="CRITICAL"
elif (( count_403 >= WARN_THRESHOLD )); then
  status=1
  state="WARNING"
fi
# Single status line; the top offenders (when any) are folded onto it with
# '|' separators, dropping the trailing separator.
msg="${state} - ${count_403} occurrences of 403 in last ${WINDOW_MINUTES}m (warn=${WARN_THRESHOLD},crit=${CRIT_THRESHOLD})"
if [[ -n "$top_paths" ]]; then
  joined=$(printf '%s' "$top_paths" | tr '\n' '|')
  msg+=" ; top=${TOP_N}: ${joined%|}"
fi
echo "$msg"
exit $status

View File

@@ -0,0 +1,138 @@
#!/usr/bin/env bash
# check_k8s_deployments
# Nagios/NRPE plugin: flag Deployments whose status.availableReplicas is
# lower than spec.replicas.
# Exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
#
# Usage:
#   sudo /usr/lib/nagios/plugins/check_k8s_deployments [--warn N] [--crit M] [--ignore-ns ns1,ns2] [--namespaces ns1,ns2] [--age-min MINUTES]
#
# Examples:
#   sudo /usr/lib/nagios/plugins/check_k8s_deployments --crit 1
#   sudo /usr/lib/nagios/plugins/check_k8s_deployments --ignore-ns kube-system,monitoring
#
set -euo pipefail

WARN=${WARN:-0}   # WARNING when >= WARN deployments are degraded
CRIT=${CRIT:-1}   # CRITICAL when >= CRIT deployments are degraded (default: any)
IGNORE_NS=""
INCLUDE_NS=""
AGE_MIN=0

print_usage() {
  cat <<EOF
Usage: $0 [--warn N] [--crit M] [--ignore-ns ns1,ns2] [--namespaces ns1,ns2] [--age-min MINUTES]
--warn N : seuil warn si >=N déploiements en erreur (default 0)
--crit M : seuil crit si >=M déploiements en erreur (default 1)
--ignore-ns LIST : comma separated namespaces to ignore (default none)
--namespaces LIST: comma separated namespaces to check only (default all)
--age-min N : ignore deployments created less than N minutes ago (avoid flapping during rollout)
EOF
}

# Parse command line arguments.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --warn) WARN="$2"; shift 2;;
    --crit) CRIT="$2"; shift 2;;
    --ignore-ns) IGNORE_NS="$2"; shift 2;;
    --namespaces) INCLUDE_NS="$2"; shift 2;;
    --age-min) AGE_MIN="$2"; shift 2;;
    -h|--help) print_usage; exit 3;;
    *) echo "Unknown arg: $1"; print_usage; exit 3;;
  esac
done

if ! command -v kubectl >/dev/null 2>&1; then
  echo "UNKNOWN - kubectl not found"
  exit 3
fi

# Build an anchored alternation regex ("^a$|^b$") from a comma separated
# namespace list; prints nothing for an empty list.
build_ns_regex() {
  local csv=$1 regex="" ns
  local -a parts=()
  [[ -z "$csv" ]] && return 0
  IFS=',' read -ra parts <<< "$csv"
  for ns in "${parts[@]}"; do
    regex="${regex}|^${ns}\$"
  done
  printf '%s' "${regex#|}"
}
ignore_pattern=$(build_ns_regex "$IGNORE_NS")
include_pattern=$(build_ns_regex "$INCLUDE_NS")

# Collect: namespace, name, desired, available, creationTimestamp (tab separated).
failures=()
mapfile -t lines < <(kubectl get deploy -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.name}{"\t"}{.spec.replicas}{"\t"}{.status.availableReplicas}{"\t"}{.metadata.creationTimestamp}{"\n"}{end}' 2>/dev/null || true)
now_s=$(date +%s)

for line in "${lines[@]}"; do
  # skip empty lines
  [[ -z "${line}" ]] && continue
  # One read replaces the five awk processes previously spawned per line.
  IFS=$'\t' read -r ns name desired available created <<< "$line"
  # normalize missing jsonpath fields to 0
  desired=${desired:-0}
  available=${available:-0}
  # Namespace filtering with bash regex matching; 'egrep' is obsolescent in
  # GNU grep and spawned a process per deployment.
  if [[ -n "$include_pattern" && ! "$ns" =~ $include_pattern ]]; then
    continue
  fi
  if [[ -n "$ignore_pattern" && "$ns" =~ $ignore_pattern ]]; then
    continue
  fi
  # Skip deployments younger than AGE_MIN minutes (likely still rolling out).
  if [[ -n "$created" ]] && (( AGE_MIN > 0 )); then
    # unparsable timestamps yield epoch 0, i.e. "very old" => never skipped
    created_s=$(date -d "$created" +%s 2>/dev/null || echo 0)
    age_min=$(( (now_s - created_s) / 60 ))
    if (( age_min < AGE_MIN )); then
      continue
    fi
  fi
  if (( available < desired )); then
    failures+=("${ns}/${name} (desired=${desired},available=${available})")
  fi
done

count=${#failures[@]}
if (( count == 0 )); then
  echo "OK - all deployments report desired==available"
  exit 0
fi
# Severity decision against thresholds.
if (( count >= CRIT )); then
  echo "CRITICAL - ${count} deployments not available: ${failures[*]}"
  exit 2
elif (( count >= WARN )); then
  echo "WARNING - ${count} deployments not available: ${failures[*]}"
  exit 1
else
  echo "OK - ${count} deployments not available but below thresholds"
  exit 0
fi

View File

@@ -0,0 +1,232 @@
#!/usr/bin/env bash
# check_k8s_jobs_cronjobs
# Nagios/NRPE plugin: check the health of Kubernetes Jobs and CronJobs.
# Exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
#
# Checks performed:
#   - Jobs with failures (.status.failed > 0) or active Jobs running longer
#     than --age-min minutes
#   - recent Warning events attached to Jobs (last --recent-minutes)
#   - CronJobs (unless suspended) whose lastScheduleTime is older than
#     --cron-max-age minutes
#
# Usage (examples):
#   sudo /usr/lib/nagios/plugins/check_k8s_jobs_cronjobs --crit 1 --recent-minutes 5
#   sudo /usr/lib/nagios/plugins/check_k8s_jobs_cronjobs --ignore-ns kube-system --cron-max-age 120
#
set -euo pipefail

# Defaults (overridable via environment)
WARN=${WARN:-0}
CRIT=${CRIT:-1}
IGNORE_NS=""
INCLUDE_NS=""
AGE_MIN=${AGE_MIN:-60}
RECENT_MINUTES=${RECENT_MINUTES:-5}
CHECK_CRON=1
CRON_MAX_AGE_MIN=${CRON_MAX_AGE_MIN:-60}

print_usage() {
  cat <<EOF
Usage: $0 [options]
Options:
--warn N seuil WARN si >= N objets en erreur (default 0)
--crit M seuil CRIT si >= M objets en erreur (default 1)
--ignore-ns ns1,ns2 namespaces à ignorer
--namespaces ns1,ns2 limiter aux namespaces donnés (comma separated)
--age-min MINUTES considérer un job "actif" normal si démarré moins de MINUTES (default 60)
--recent-minutes MIN chercher événements de Job (Warning) dans les MIN dernières minutes (default 5)
--check-cron activer la vérification des CronJobs (default ON)
--no-cron désactiver la vérification des CronJobs
--cron-max-age MINUTES si lastScheduleTime > MINUTES => alerter (default 60). Mettre 0 pour désactiver.
-h, --help : affiche l'aide
EOF
}

# Parse args
while [[ $# -gt 0 ]]; do
  case "$1" in
    --warn) WARN="$2"; shift 2;;
    --crit) CRIT="$2"; shift 2;;
    --ignore-ns) IGNORE_NS="$2"; shift 2;;
    --namespaces) INCLUDE_NS="$2"; shift 2;;
    --age-min) AGE_MIN="$2"; shift 2;;
    --recent-minutes) RECENT_MINUTES="$2"; shift 2;;
    # BUGFIX: --check-cron was documented in the usage text but not accepted
    # by the parser (it fell through to "Unknown arg" and exited 3).
    --check-cron) CHECK_CRON=1; shift 1;;
    --no-cron) CHECK_CRON=0; shift 1;;
    --cron-max-age) CRON_MAX_AGE_MIN="$2"; shift 2;;
    -h|--help) print_usage; exit 3;;
    *) echo "Unknown arg: $1"; print_usage; exit 3;;
  esac
done

if ! command -v kubectl >/dev/null 2>&1; then
  echo "UNKNOWN - kubectl not found"
  exit 3
fi

# Build an anchored alternation regex ("^a$|^b$") from a comma separated
# namespace list; prints nothing for an empty list.
build_ns_regex() {
  local csv=$1 regex="" ns
  local -a parts=()
  [[ -z "$csv" ]] && return 0
  IFS=',' read -ra parts <<< "$csv"
  for ns in "${parts[@]}"; do
    regex="${regex}|^${ns}\$"
  done
  printf '%s' "${regex#|}"
}
ignore_pattern=$(build_ns_regex "$IGNORE_NS")
include_pattern=$(build_ns_regex "$INCLUDE_NS")

# Return 0 when the namespace passes the include/ignore filters.
# Uses bash regex matching instead of the obsolescent 'egrep'.
ns_allowed() {
  local ns="$1"
  if [[ -n "$include_pattern" && ! "$ns" =~ $include_pattern ]]; then
    return 1
  fi
  if [[ -n "$ignore_pattern" && "$ns" =~ $ignore_pattern ]]; then
    return 1
  fi
  return 0
}

now_s=$(date +%s)
# Initialize problems array explicitly (required with set -u on older bash)
problems=()

# ---------------------------
# 1) Inspect Jobs
# ---------------------------
# Fields: namespace, name, active, succeeded, failed, startTime, completionTime
mapfile -t job_lines < <(kubectl get jobs -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.name}{"\t"}{.status.active}{"\t"}{.status.succeeded}{"\t"}{.status.failed}{"\t"}{.status.startTime}{"\t"}{.status.completionTime}{"\n"}{end}' 2>/dev/null || true)
for line in "${job_lines[@]}"; do
  [[ -z "$line" ]] && continue
  # One read replaces five awk processes per line.
  IFS=$'\t' read -r ns name active succeeded failed start completion <<< "$line"
  active=${active:-0}
  succeeded=${succeeded:-0}
  failed=${failed:-0}
  ns_allowed "$ns" || continue
  # 1.a) Jobs with failures
  if (( failed > 0 )); then
    problems+=("Job ${ns}/${name} failedCount=${failed}")
    continue
  fi
  # 1.b) Active jobs running too long
  if (( active > 0 )); then
    if [[ -n "$start" && "$start" != "null" ]]; then
      # convert start timestamp to epoch (GNU date); 0 on parse failure
      start_s=$(date -d "$start" +%s 2>/dev/null || echo 0)
      if (( start_s > 0 )); then
        age_min=$(( (now_s - start_s) / 60 ))
        if (( age_min >= AGE_MIN )); then
          problems+=("Job ${ns}/${name} active for ${age_min}min >= ${AGE_MIN}min")
        fi
      fi
    else
      # no start time but active > 0 -> flag
      problems+=("Job ${ns}/${name} active but no startTime recorded")
    fi
  fi
done

# 1.c) Recent Job warning events (type=Warning) in last RECENT_MINUTES
if (( RECENT_MINUTES > 0 )); then
  mapfile -t event_lines < <(kubectl get events --all-namespaces --field-selector involvedObject.kind=Job,type=Warning -o custom-columns='NAMESPACE:.metadata.namespace,NAME:.involvedObject.name,LAST:.lastTimestamp,REASON:.reason,MESSAGE:.message' --no-headers 2>/dev/null || true)
  cutoff_s=$(( now_s - RECENT_MINUTES * 60 ))
  for ev in "${event_lines[@]}"; do
    [[ -z "$ev" ]] && continue
    # custom-columns output is space padded: split on default whitespace IFS
    read -r ns name last _ <<< "$ev"
    ns_allowed "$ns" || continue
    if [[ -n "$last" && "$last" != "<none>" ]]; then
      ts=$(date -d "$last" +%s 2>/dev/null || echo 0)
      if (( ts >= cutoff_s )); then
        problems+=("Job event Warning ${ns}/${name} at $last")
      fi
    fi
  done
fi

# ---------------------------
# 2) Inspect CronJobs (optional)
# ---------------------------
if (( CHECK_CRON == 1 )) && (( CRON_MAX_AGE_MIN > 0 )); then
  # Fields: namespace, name, suspend (true/false/null), lastScheduleTime
  mapfile -t cron_lines < <(kubectl get cronjob -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.name}{"\t"}{.spec.suspend}{"\t"}{.status.lastScheduleTime}{"\n"}{end}' 2>/dev/null || true)
  for line in "${cron_lines[@]}"; do
    [[ -z "$line" ]] && continue
    IFS=$'\t' read -r ns name suspend last <<< "$line"
    ns_allowed "$ns" || continue
    # a suspended CronJob is not a problem
    [[ "$suspend" == "true" ]] && continue
    if [[ -z "$last" || "$last" == "null" ]]; then
      # Never scheduled yet: useful to detect misconfigured cronjobs
      problems+=("CronJob ${ns}/${name} has no lastScheduleTime (never scheduled?)")
      continue
    fi
    last_s=$(date -d "$last" +%s 2>/dev/null || echo 0)
    if (( last_s > 0 )); then
      age_min=$(( (now_s - last_s) / 60 ))
      if (( age_min > CRON_MAX_AGE_MIN )); then
        problems+=("CronJob ${ns}/${name} lastSchedule ${age_min}min ago > ${CRON_MAX_AGE_MIN}min")
      fi
    else
      problems+=("CronJob ${ns}/${name} lastScheduleTime unparsable: ${last}")
    fi
  done
fi

# ---------------------------
# Final decision & output
# ---------------------------
count=${#problems[@]}
if (( count == 0 )); then
  echo "OK - Jobs/CronJobs checks passed"
  exit 0
fi
# Severity decision
if (( count >= CRIT )); then
  echo "CRITICAL - ${count} problems found: ${problems[*]}"
  exit 2
elif (( count >= WARN )); then
  echo "WARNING - ${count} problems found: ${problems[*]}"
  exit 1
else
  echo "OK - ${count} problems found but below thresholds"
  exit 0
fi

View File

@@ -0,0 +1,194 @@
#!/usr/bin/env bash
# check_k8s_pki_certs
# Nagios/NRPE plugin: scan PEM certificates under /etc/kubernetes/pki (by
# default) and alert when a certificate expires within --warn-days/--crit-days.
# Exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
#
# Usage:
#   sudo /usr/lib/nagios/plugins/check_k8s_pki_certs
#   sudo /usr/lib/nagios/plugins/check_k8s_pki_certs --path /etc/kubernetes/ssl --warn-days 30 --crit-days 7 --recursive
#
set -euo pipefail

PKI_PATH=${PKI_PATH:-/etc/kubernetes/pki}
WARN_DAYS=${WARN_DAYS:-30}
CRIT_DAYS=${CRIT_DAYS:-7}
RECURSIVE=0

print_usage() {
  cat <<EOF
Usage: $0 [--path PATH] [--warn-days N] [--crit-days M] [--recursive] [-h|--help]
Options:
--path PATH répertoire à scanner (default: $PKI_PATH)
--warn-days N seuil warning en jours (default: $WARN_DAYS)
--crit-days M seuil critical en jours (default: $CRIT_DAYS)
--recursive scanner récursivement PATH et sous-dirs
-h, --help affiche cette aide
EOF
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --path) PKI_PATH="$2"; shift 2;;
    --warn-days) WARN_DAYS="$2"; shift 2;;
    --crit-days) CRIT_DAYS="$2"; shift 2;;
    --recursive) RECURSIVE=1; shift 1;;
    -h|--help) print_usage; exit 3;;
    *) echo "Unknown arg: $1"; print_usage; exit 3;;
  esac
done

# Required external tools (one loop instead of five copy/pasted checks).
for tool in openssl date sed awk find; do
  if ! command -v "$tool" >/dev/null 2>&1; then
    echo "UNKNOWN - $tool not found"
    exit 3
  fi
done

# Resolve symlinks (realpath or readlink -f) so a linked pki directory is
# scanned at its real location.
if command -v realpath >/dev/null 2>&1; then
  PKI_PATH_RESOLVED=$(realpath -e "$PKI_PATH" 2>/dev/null || true)
else
  PKI_PATH_RESOLVED=$(readlink -f "$PKI_PATH" 2>/dev/null || true)
fi
if [[ -n "$PKI_PATH_RESOLVED" && -d "$PKI_PATH_RESOLVED" ]]; then
  PKI_PATH="$PKI_PATH_RESOLVED"
fi
if [[ ! -d "$PKI_PATH" ]]; then
  echo "UNKNOWN - path $PKI_PATH not found or not a directory"
  exit 3
fi

now_s=$(date +%s)
# Initialize arrays explicitly to avoid unbound-variable errors with set -u
critical=()
warning=()
ok=()
errors=()
file_count=0
cert_count=0

# Build find command: follow symlinks (-L) so symlinked dirs/files are handled
if [[ $RECURSIVE -eq 1 ]]; then
  FIND_CMD=(find -L "$PKI_PATH" -type f -print0)
else
  FIND_CMD=(find -L "$PKI_PATH" -maxdepth 1 -type f -print0)
fi

# Iterate files found (NUL-delimited to survive any filename)
while IFS= read -r -d '' file; do
  file_count=$((file_count+1))
  # skip unreadable files
  if [[ ! -r "$file" ]]; then
    errors+=("Unreadable file: $file")
    continue
  fi
  # skip files without a PEM certificate marker (e.g. private keys)
  if ! grep -q "BEGIN CERTIFICATE" "$file" 2>/dev/null; then
    continue
  fi
  # Emit "start:end" line-number pairs for each certificate block in the file
  mapfile -t pairs < <(awk '
    /BEGIN CERTIFICATE/ {start=NR}
    /END CERTIFICATE/ && start { print start ":" NR; start=0 }
  ' "$file" 2>/dev/null || true)
  if [[ ${#pairs[@]} -eq 0 ]]; then
    errors+=("No certificate block pairs found in $file")
    continue
  fi
  for p in "${pairs[@]}"; do
    start=${p%%:*}
    end=${p##*:}
    # extract the PEM block by line range
    cert_block=$(sed -n "${start},${end}p" "$file" 2>/dev/null || true)
    if [[ -z "$cert_block" ]]; then
      errors+=("Failed to extract certificate block ${start}-${end} from $file")
      continue
    fi
    # openssl x509 reads the PEM block from stdin by default
    endline=$(printf '%s\n' "$cert_block" | openssl x509 -noout -enddate 2>/dev/null) || {
      errors+=("Failed to parse certificate block ${start}-${end} in $file with openssl")
      continue
    }
    # sample endline: notAfter=Oct 27 16:15:30 2125 GMT
    notAfter=${endline#notAfter=}
    expiry_s=$(date -d "$notAfter" +%s 2>/dev/null) || {
      errors+=("Cannot parse date '$notAfter' for cert in $file")
      continue
    }
    days_left=$(( (expiry_s - now_s) / 86400 ))
    subj=$(printf '%s\n' "$cert_block" | openssl x509 -noout -subject 2>/dev/null || true)
    # BUGFIX: OpenSSL >= 1.1 prints "subject=CN = foo" (no space after '='),
    # while 1.0 printed "subject= CN=foo". The previous ${subj#subject= }
    # only matched the legacy format and left the "subject=" prefix in the
    # output on current systems. Strip the prefix, then an optional space.
    subj=${subj#subject=}
    subj=${subj# }
    info="${file} :: ${subj} :: expires in ${days_left}d on ${notAfter}"
    cert_count=$((cert_count+1))
    if (( days_left <= CRIT_DAYS )); then
      critical+=("$info")
    elif (( days_left <= WARN_DAYS )); then
      warning+=("$info")
    else
      ok+=("$info")
    fi
  done
done < <("${FIND_CMD[@]}")

# Results and exit codes: any parse error makes the whole check UNKNOWN so a
# broken PKI file is never silently ignored.
if [[ ${#errors[@]} -gt 0 ]]; then
  echo "UNKNOWN - parsing errors: ${errors[*]}"
  exit 3
fi
if (( cert_count == 0 )); then
  echo "UNKNOWN - no certificates found under $PKI_PATH"
  exit 3
fi
if (( ${#critical[@]} > 0 )); then
  echo "CRITICAL - ${#critical[@]} certificate(s) expiring soon (<= ${CRIT_DAYS} days):"
  for c in "${critical[@]}"; do
    echo " - $c"
  done
  if (( ${#warning[@]} > 0 )); then
    echo "WARN (additional ${#warning[@]} cert(s) <= ${WARN_DAYS} days):"
    for w in "${warning[@]}"; do
      echo " - $w"
    done
  fi
  exit 2
fi
if (( ${#warning[@]} > 0 )); then
  echo "WARNING - ${#warning[@]} certificate(s) expiring within ${WARN_DAYS} days:"
  for w in "${warning[@]}"; do
    echo " - $w"
  done
  exit 1
fi
echo "OK - ${cert_count} cert(s) checked in ${file_count} file(s), no expiry within ${WARN_DAYS} days"
exit 0

View File

@@ -0,0 +1,49 @@
#!/usr/bin/env bash
# check_k8s_pod_restarts
# Nagios/NRPE plugin: alert when pods were killed ("Killing" events) within
# the last X minutes.
# Exit codes: 0=OK, 2=CRITICAL, 3=UNKNOWN
#
# Usage:
#   sudo /usr/lib/nagios/plugins/check_k8s_pod_restarts [minutes]
#
MINUTES=${1:-5}

# Require kubectl
if ! command -v kubectl >/dev/null 2>&1; then
  echo "UNKNOWN - kubectl not found"
  exit 3
fi

# Cutoff as epoch seconds (GNU date)
if ! cutoff=$(date -d "$MINUTES minutes ago" +%s 2>/dev/null); then
  echo "UNKNOWN - date parsing failed (on macOS use gdate from coreutils)"
  exit 3
fi

matches=()
# BUGFIX: kubectl's custom-columns output is padded with SPACES, not tabs.
# The previous 'IFS=$'\''\t'\'' read' put the whole line into the first field,
# leaving "last" empty, so every event was skipped and the check always
# reported OK. Split on the default whitespace IFS instead; "msg" greedily
# receives the remainder of the line.
while read -r ns pod last msg; do
  # skip blank/short lines
  [[ -z "${last:-}" ]] && continue
  # convert the event timestamp to epoch; skip events date(1) cannot parse
  if ! ts=$(date -d "$last" +%s 2>/dev/null); then
    continue
  fi
  if (( ts >= cutoff )); then
    # safe message truncation
    shortmsg=$(printf '%s' "$msg" | tr '\n' ' ' | cut -c1-300)
    # BUGFIX: use real tab characters ($'\t') as field separators; the old
    # "$ns\t$pod\t..." stored a literal backslash-t, which the later
    # IFS=$'\t' read could not split on.
    matches+=("${ns}"$'\t'"${pod}"$'\t'"${last}"$'\t'"${shortmsg}")
  fi
done < <(kubectl get events --all-namespaces --field-selector reason=Killing -o custom-columns='NAMESPACE:.metadata.namespace,NAME:.involvedObject.name,LAST:.lastTimestamp,MESSAGE:.message' --no-headers 2>/dev/null || true)

if [[ ${#matches[@]} -eq 0 ]]; then
  echo "OK - no pod restarts in the last ${MINUTES} minutes"
  exit 0
else
  echo "CRITICAL - ${#matches[@]} pod restarts in the last ${MINUTES} minutes:"
  for m in "${matches[@]}"; do
    IFS=$'\t' read -r ns pod last shortmsg <<< "$m"
    echo " - ${ns}/${pod} at ${last} : ${shortmsg}"
  done
  exit 2
fi

202
files/nrpe/check_k8s_pv_pvc Normal file
View File

@@ -0,0 +1,202 @@
#!/usr/bin/env bash
# check_k8s_pv_pvc
# Nagios/NRPE plugin: check Kubernetes PersistentVolumes (PV) and
# PersistentVolumeClaims (PVC).
# Exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
#
# Usage examples:
#   sudo /usr/lib/nagios/plugins/check_k8s_pv_pvc --crit 1               # CRITICAL si >=1 problème
#   sudo /usr/lib/nagios/plugins/check_k8s_pv_pvc --ignore-ns kube-system # ignorer kube-system
#   sudo /usr/lib/nagios/plugins/check_k8s_pv_pvc --pvc-age-min 10 --crit 2 # ignorer PVC récents <10min, CRIT si >=2
#
set -euo pipefail

# Defaults (overridable via environment)
WARN=${WARN:-0}
CRIT=${CRIT:-1}
IGNORE_NS=""
INCLUDE_NS=""
PVC_AGE_MIN=${PVC_AGE_MIN:-5}   # minutes: ignore PVCs created less than this long ago
CHECK_PV=1
CHECK_PVC=1

print_usage() {
  cat <<EOF
Usage: $0 [options]
Options:
--warn N seuil WARN si >= N objets en erreur (default 0)
--crit M seuil CRIT si >= M objets en erreur (default 1)
--ignore-ns a,b,c namespaces à ignorer (comma separated)
--namespaces a,b limiter aux namespaces donnés (comma separated)
--pvc-age-min N ignore PVC créés il y a moins de N minutes (default 5)
--no-pv disable PV checks
--no-pvc disable PVC checks
-h, --help affiche cette aide
EOF
}

# Parse args
while [[ $# -gt 0 ]]; do
  case "$1" in
    --warn) WARN="$2"; shift 2;;
    --crit) CRIT="$2"; shift 2;;
    --ignore-ns) IGNORE_NS="$2"; shift 2;;
    --namespaces) INCLUDE_NS="$2"; shift 2;;
    --pvc-age-min) PVC_AGE_MIN="$2"; shift 2;;
    --no-pv) CHECK_PV=0; shift 1;;
    --no-pvc) CHECK_PVC=0; shift 1;;
    -h|--help) print_usage; exit 3;;
    *) echo "Unknown arg: $1"; print_usage; exit 3;;
  esac
done

if ! command -v kubectl >/dev/null 2>&1; then
  echo "UNKNOWN - kubectl not found"
  exit 3
fi

# Build an anchored alternation regex ("^a$|^b$") from a comma separated
# namespace list; prints nothing for an empty list.
build_ns_regex() {
  local csv=$1 regex="" ns
  local -a parts=()
  [[ -z "$csv" ]] && return 0
  IFS=',' read -ra parts <<< "$csv"
  for ns in "${parts[@]}"; do
    regex="${regex}|^${ns}\$"
  done
  printf '%s' "${regex#|}"
}
ignore_pattern=$(build_ns_regex "$IGNORE_NS")
include_pattern=$(build_ns_regex "$INCLUDE_NS")

now_s=$(date +%s)
# Initialize problems array explicitly (required with set -u on older bash)
problems=()

# Return 0 when the namespace passes the include/ignore filters.
# Uses bash regex matching instead of the obsolescent 'egrep'.
ns_allowed() {
  local ns="$1"
  if [[ -n "$include_pattern" && ! "$ns" =~ $include_pattern ]]; then
    return 1
  fi
  if [[ -n "$ignore_pattern" && "$ns" =~ $ignore_pattern ]]; then
    return 1
  fi
  return 0
}

# 1) Check PVCs: any non-Bound phase (Pending, Lost, Failed) is a problem,
#    as is a Bound PVC without an assigned volumeName.
if (( CHECK_PVC == 1 )); then
  # gather: namespace, name, phase, volumeName, creationTimestamp
  mapfile -t pvc_lines < <(kubectl get pvc -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.name}{"\t"}{.status.phase}{"\t"}{.spec.volumeName}{"\t"}{.metadata.creationTimestamp}{"\n"}{end}' 2>/dev/null || true)
  for line in "${pvc_lines[@]}"; do
    [[ -z "$line" ]] && continue
    # One read replaces five awk processes per line.
    IFS=$'\t' read -r ns name phase vol created <<< "$line"
    ns_allowed "$ns" || continue
    # ignore freshly created PVCs to avoid noise during normal provisioning
    if [[ -n "$created" ]] && (( PVC_AGE_MIN > 0 )); then
      created_s=$(date -d "$created" +%s 2>/dev/null || echo 0)
      age_min=$(( (now_s - created_s) / 60 ))
      if (( age_min < PVC_AGE_MIN )); then
        continue
      fi
    fi
    if [[ "$phase" != "Bound" ]]; then
      problems+=("PVC ${ns}/${name} phase=${phase} created=${created}")
      continue
    fi
    if [[ -z "$vol" || "$vol" == "null" ]]; then
      problems+=("PVC ${ns}/${name} Bound but no volumeName assigned")
      continue
    fi
  done
fi

# 2) Check PVs: Released/Failed unbound PVs are problems; Bound PVs must
#    carry a resolvable claimRef.
if (( CHECK_PV == 1 )); then
  # gather: name, phase, capacity.storage, claimRef.namespace, claimRef.name, reclaimPolicy
  mapfile -t pv_lines < <(kubectl get pv -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.phase}{"\t"}{.spec.capacity.storage}{"\t"}{.spec.claimRef.namespace}{"\t"}{.spec.claimRef.name}{"\t"}{.spec.persistentVolumeReclaimPolicy}{"\n"}{end}' 2>/dev/null || true)
  for line in "${pv_lines[@]}"; do
    [[ -z "$line" ]] && continue
    IFS=$'\t' read -r name phase cap claim_ns claim_name reclaim <<< "$line"
    if [[ -n "$claim_ns" && "$claim_ns" != "null" ]]; then
      # bound PVs are only reported when their claim's namespace is allowed
      ns_allowed "$claim_ns" || continue
    else
      # no claim namespace => PV not bound to a claim
      if [[ "$phase" == "Released" || "$phase" == "Failed" ]]; then
        problems+=("PV ${name} phase=${phase} reclaim=${reclaim} (no claim)")
        continue
      fi
      # NOTE: an Available (unbound) PV could be flagged as possibly orphaned
      # (capacity in $cap); deliberately not reported to avoid false positives.
    fi
    # If bound, sanity-check the claimRef and the referenced PVC
    if [[ "$phase" == "Bound" ]]; then
      if [[ -z "$claim_ns" || -z "$claim_name" || "$claim_ns" == "null" || "$claim_name" == "null" ]]; then
        problems+=("PV ${name} Bound but missing claimRef (phase=${phase})")
        continue
      fi
      # the referenced PVC must exist (namespace-filtered PVs were skipped above)
      if ! kubectl get pvc -n "${claim_ns}" "${claim_name}" >/dev/null 2>&1; then
        problems+=("PV ${name} Bound to ${claim_ns}/${claim_name} but PVC resource not found")
      fi
    fi
  done
fi

count=${#problems[@]}
if (( count == 0 )); then
  echo "OK - PV/PVC checks passed"
  exit 0
fi
# Severity decision
if (( count >= CRIT )); then
  echo "CRITICAL - ${count} PV/PVC problems: ${problems[*]}"
  exit 2
elif (( count >= WARN )); then
  echo "WARNING - ${count} PV/PVC problems: ${problems[*]}"
  exit 1
else
  echo "OK - ${count} PV/PVC problems but below thresholds"
  exit 0
fi

View File

@@ -0,0 +1,135 @@
#!/usr/bin/env bash
# check_k8s_replicasets
# Nagios/NRPE plugin: flag ReplicaSets whose status.readyReplicas is lower
# than spec.replicas (zero-scaled ReplicaSets are ignored).
# Exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
#
# Usage:
#   sudo /usr/lib/nagios/plugins/check_k8s_replicasets [--warn N] [--crit M] [--ignore-ns ns1,ns2] [--namespaces ns1,ns2] [--age-min MINUTES]
#
set -euo pipefail

WARN=${WARN:-0}   # WARNING when >= WARN ReplicaSets are degraded
CRIT=${CRIT:-1}   # CRITICAL when >= CRIT ReplicaSets are degraded (default: any)
IGNORE_NS=""
INCLUDE_NS=""
AGE_MIN=0

print_usage() {
  cat <<EOF
Usage: $0 [--warn N] [--crit M] [--ignore-ns ns1,ns2] [--namespaces ns1,ns2] [--age-min MINUTES]
--warn N : seuil warn si >=N ReplicaSets en erreur (default 0)
--crit M : seuil crit si >=M ReplicaSets en erreur (default 1)
--ignore-ns LIST : comma separated namespaces to ignore (default none)
--namespaces LIST: comma separated namespaces to check only (default all)
--age-min N : ignore ReplicaSets created less than N minutes ago (avoid flapping during rollout)
EOF
}

# parse args
while [[ $# -gt 0 ]]; do
  case "$1" in
    --warn) WARN="$2"; shift 2;;
    --crit) CRIT="$2"; shift 2;;
    --ignore-ns) IGNORE_NS="$2"; shift 2;;
    --namespaces) INCLUDE_NS="$2"; shift 2;;
    --age-min) AGE_MIN="$2"; shift 2;;
    -h|--help) print_usage; exit 3;;
    *) echo "Unknown arg: $1"; print_usage; exit 3;;
  esac
done

if ! command -v kubectl >/dev/null 2>&1; then
  echo "UNKNOWN - kubectl not found"
  exit 3
fi

# Build an anchored alternation regex ("^a$|^b$") from a comma separated
# namespace list; prints nothing for an empty list.
build_ns_regex() {
  local csv=$1 regex="" ns
  local -a parts=()
  [[ -z "$csv" ]] && return 0
  IFS=',' read -ra parts <<< "$csv"
  for ns in "${parts[@]}"; do
    regex="${regex}|^${ns}\$"
  done
  printf '%s' "${regex#|}"
}
ignore_pattern=$(build_ns_regex "$IGNORE_NS")
include_pattern=$(build_ns_regex "$INCLUDE_NS")

# Collect: namespace, name, desired(spec.replicas), ready(status.readyReplicas), creationTimestamp
failures=()
mapfile -t lines < <(kubectl get rs -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.name}{"\t"}{.spec.replicas}{"\t"}{.status.readyReplicas}{"\t"}{.metadata.creationTimestamp}{"\n"}{end}' 2>/dev/null || true)
now_s=$(date +%s)

for line in "${lines[@]}"; do
  # skip empty lines if any
  [[ -z "$line" ]] && continue
  # One read replaces the five awk processes previously spawned per line.
  IFS=$'\t' read -r ns name desired ready created <<< "$line"
  # normalize missing jsonpath fields to 0
  desired=${desired:-0}
  ready=${ready:-0}
  # Namespace filtering with bash regex matching ('egrep' is obsolescent).
  if [[ -n "$include_pattern" && ! "$ns" =~ $include_pattern ]]; then
    continue
  fi
  if [[ -n "$ignore_pattern" && "$ns" =~ $ignore_pattern ]]; then
    continue
  fi
  # age filtering (skip very recent RS, likely mid-rollout)
  if [[ -n "$created" ]] && (( AGE_MIN > 0 )); then
    # unparsable timestamps yield epoch 0 ("very old") so the RS is not skipped
    created_s=$(date -d "$created" +%s 2>/dev/null || echo 0)
    age_min=$(( (now_s - created_s) / 60 ))
    if (( age_min < AGE_MIN )); then
      continue
    fi
  fi
  # Only consider RS that want replicas (skip zero-scale RS)
  if (( desired > 0 )) && (( ready < desired )); then
    failures+=("${ns}/${name} (desired=${desired},ready=${ready})")
  fi
done

count=${#failures[@]}
if (( count == 0 )); then
  echo "OK - all ReplicaSets report ready==desired"
  exit 0
fi
# Determine severity based on thresholds
if (( count >= CRIT )); then
  echo "CRITICAL - ${count} ReplicaSets not fully ready: ${failures[*]}"
  exit 2
elif (( count >= WARN )); then
  echo "WARNING - ${count} ReplicaSets not fully ready: ${failures[*]}"
  exit 1
else
  echo "OK - ${count} ReplicaSets not fully ready but below thresholds"
  exit 0
fi

View File

@@ -72,13 +72,32 @@ command[check_docker_{{ container }}]=/usr/lib/nagios/plugins/check_docker --con
{% endif %} {% endif %}
{% if nrpe_process is defined %} {% if nrpe_process is defined %}
# process
{% for process in nrpe_process %} {% for process in nrpe_process %}
command[check_proc_{{ process }}]=/usr/lib/nagios/plugins/check_systemd_service {{ process }} command[check_proc_{{ process }}]=/usr/lib/nagios/plugins/check_systemd_service {{ process }}
{% endfor %} {% endfor %}
{% endif %} {% endif %}
{% if nrpe_kubernetes is defined or nrpe_kubernetes_manager is defined %}
# kubernetes
{% if nrpe_kubernetes is defined %} {% if nrpe_kubernetes is defined %}
## nodes
command[check_proc_kubelet]=/usr/lib/nagios/plugins/check_systemd_service kubelet command[check_proc_kubelet]=/usr/lib/nagios/plugins/check_systemd_service kubelet
command[check_proc_etcd]=/usr/lib/nagios/plugins/check_systemd_service etcd command[check_proc_etcd]=/usr/lib/nagios/plugins/check_systemd_service etcd
command[check_proc_containerd]=/usr/lib/nagios/plugins/check_systemd_service containerd command[check_proc_containerd]=/usr/lib/nagios/plugins/check_systemd_service containerd
{% endif %} {% endif %}
{% if nrpe_kubernetes_manager is defined %}
## manager / control plane
command[check_k8s_health]=/usr/lib/nagios/plugins/check_http -I {{ ansible_default_ipv4.address }} -p 6443 -S -u /healthz --continue-after-certificate -r ok -w 1 -c 2
command[check_cilium_health]=/usr/bin/sudo /usr/lib/nagios/plugins/check_cilium_health
command[check_coredns_health]=/usr/bin/sudo /usr/lib/nagios/plugins/check_coredns_health
command[check_etcd_health]=/usr/bin/sudo /usr/lib/nagios/plugins/check_etcd_health --endpoints "https://{{ ansible_default_ipv4.address }}:2379" --cacert /etc/ssl/etcd/ssl/ca.pem --cert /etc/ssl/etcd/ssl/node-{{ nrpe_kubernetes_manager_nodename }}.pem --key /etc/ssl/etcd/ssl/node-{{ nrpe_kubernetes_manager_nodename }}-key.pem
command[check_k8s_apiserver_access]=/usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_apiserver_access
command[check_k8s_deployments]=/usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_deployments
command[check_k8s_jobs_cronjobs]=/usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_jobs_cronjobs
command[check_k8s_pki_certs]=/usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_pki_certs
command[check_k8s_pv_pvc]=/usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_pv_pvc
command[check_k8s_replicasets]=/usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_replicasets
command[check_k8s_pod_restarts]=/usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_pod_restarts
{% endif %}
{% endif %}

View File

@@ -2,3 +2,13 @@ nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_postfix_mailqueue -w {{
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_exim_mailqueue -w {{ nrpe_mailq_warning }} -c {{ nrpe_mailq_critical }} nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_exim_mailqueue -w {{ nrpe_mailq_warning }} -c {{ nrpe_mailq_critical }}
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_raid nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_raid
nagios ALL=(ALL) NOPASSWD: /usr/sbin/needrestart -b -l nagios ALL=(ALL) NOPASSWD: /usr/sbin/needrestart -b -l
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_cilium_health
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_coredns_health
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_etcd_health
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_apiserver_access
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_deployments
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_jobs_cronjobs
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_pki_certs
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_pv_pvc
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_replicasets
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_pod_restarts