add k8s check & config
This commit is contained in:
307
files/nrpe/check_cilium_health
Normal file
307
files/nrpe/check_cilium_health
Normal file
@@ -0,0 +1,307 @@
|
||||
#!/usr/bin/env bash
|
||||
# check_cilium_health
|
||||
# Vérifie la santé de Cilium (pods, daemonsets, operator) et optionnellement utilise le binaire `cilium status -o json`.
|
||||
# Retour: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
|
||||
#
|
||||
# Usage:
|
||||
# sudo /usr/lib/nagios/plugins/check_cilium_health [--namespace N] [--label LABEL] [--warn-not-ready N] [--crit-not-ready M] [--use-cilium-cli] [--timeout SECS]
|
||||
#
|
||||
# Strict mode: abort on unhandled errors, unset variables and failed pipeline stages.
set -o errexit -o nounset -o pipefail

# Tunables. Each of these may be pre-seeded through an environment variable of
# the same name; an empty or unset value falls back to the default shown here.
: "${NAMESPACE:=kube-system}"
: "${LABEL:=k8s-app=cilium}"
: "${WARN_NOT_READY:=1}"
: "${CRIT_NOT_READY:=2}"
: "${WARN_RESTARTS:=3}"
: "${CRIT_RESTARTS:=10}"
: "${TIMEOUT:=10}"
# The cilium CLI probe is opt-in; it is always initialised to "off" here.
USE_CILIUM_CLI=0
|
||||
|
||||
# Print the command-line help on stdout.
# Reads: WARN_NOT_READY, CRIT_NOT_READY, WARN_RESTARTS, CRIT_RESTARTS, TIMEOUT
#        (their current values are shown as the defaults).
print_usage() {
  printf 'Usage: %s [options]\n' "$0"
  printf '%s\n' \
    'Options:' \
    '--namespace N namespace (default: kube-system)' \
    '--label LABEL pod label selector (default: "k8s-app=cilium")' \
    "--warn-not-ready N warn if >= N pods not ready (default ${WARN_NOT_READY})" \
    "--crit-not-ready M critical if >= M pods not ready (default ${CRIT_NOT_READY})" \
    "--warn-restarts R warn if restartCount >= R per pod (default ${WARN_RESTARTS})" \
    "--crit-restarts S critical if restartCount >= S per pod (default ${CRIT_RESTARTS})" \
    "--use-cilium-cli run 'cilium status -o json' as additional check (requires cilium binary)" \
    "--timeout SECS kubectl timeout in seconds (default ${TIMEOUT})" \
    '-h, --help show this help'
}
|
||||
|
||||
# Parse args
|
||||
# Parse command-line options (they override the environment-seeded defaults).
# Options that take a value are checked for that value first: under `set -u`
# a trailing "--namespace" would otherwise abort on the unbound "$2" with exit
# status 1, which Nagios reports as WARNING instead of UNKNOWN.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --namespace | --label | --warn-not-ready | --crit-not-ready | \
      --warn-restarts | --crit-restarts | --timeout)
      if [[ $# -lt 2 ]]; then
        echo "UNKNOWN - option $1 requires a value"
        exit 3
      fi
      case "$1" in
        --namespace) NAMESPACE="$2" ;;
        --label) LABEL="$2" ;;
        --warn-not-ready) WARN_NOT_READY="$2" ;;
        --crit-not-ready) CRIT_NOT_READY="$2" ;;
        --warn-restarts) WARN_RESTARTS="$2" ;;
        --crit-restarts) CRIT_RESTARTS="$2" ;;
        --timeout) TIMEOUT="$2" ;;
      esac
      shift 2
      ;;
    --use-cilium-cli)
      USE_CILIUM_CLI=1
      shift
      ;;
    -h | --help)
      print_usage
      exit 3
      ;;
    *)
      echo "Unknown arg: $1"
      print_usage
      exit 3
      ;;
  esac
done
|
||||
|
||||
# ensure kubectl & python present
|
||||
# Required tooling: kubectl is used to query the cluster, python3 to parse the
# JSON it returns. A missing tool is reported as UNKNOWN (exit 3).
command -v kubectl >/dev/null 2>&1 || {
  echo "UNKNOWN - kubectl not found in PATH"
  exit 3
}
command -v python3 >/dev/null 2>&1 || {
  echo "UNKNOWN - python3 not found in PATH (required for JSON parsing)"
  exit 3
}
|
||||
|
||||
# ---- kubeconfig handling ----
|
||||
# If KUBECONFIG is not set, try sensible defaults so sudo/nagios runs succeed.
|
||||
# Priority:
|
||||
# 1) env KUBECONFIG if already defined
|
||||
# 2) /etc/kubernetes/admin.conf if present (common on control-planes)
|
||||
# 3) /root/.kube/config if present
|
||||
# 4) fallback to empty (kubectl will then try defaults and may fail)
|
||||
# When KUBECONFIG is unset or empty, adopt the first readable well-known
# kubeconfig; if none is readable the variable is left unset so kubectl falls
# back to its own defaults.
if [[ -z "${KUBECONFIG:-}" ]]; then
  unset KUBECONFIG || true
  for candidate in /etc/kubernetes/admin.conf /root/.kube/config; do
    if [[ -r "$candidate" ]]; then
      export KUBECONFIG="$candidate"
      break
    fi
  done
fi

# Base kubectl command line, expanded unquoted as $KC by the checks below.
# The kubeconfig is passed explicitly to avoid HOME/KUBECONFIG differences under sudo.
KC="kubectl"
if [[ -n "${KUBECONFIG:-}" ]]; then
  KC+=" --kubeconfig=${KUBECONFIG}"
fi
KC+=" --request-timeout=${TIMEOUT}s"
|
||||
|
||||
# Run a small Python program over a JSON document.
# Arguments: $1 - text fed to the program on stdin
#            $2 - Python source code
# Outputs:   the program's stdout, but only if it exited 0 (nothing on failure);
#            the program's stderr is discarded.
# Returns:   0 on success, otherwise the non-zero status of the pipeline.
# The program is passed with `python3 -c`, so no temporary files are created
# and nothing is left behind if the plugin is killed mid-run.
run_python_parser() {
  local input=$1
  local pyprog=$2
  local output
  output=$(printf '%s' "$input" | python3 -c "$pyprog" 2>/dev/null) || return
  if [[ -n "$output" ]]; then
    printf '%s\n' "$output"
  fi
  return 0
}
|
||||
|
||||
# 1) get pods JSON robustly
|
||||
# Fetch the Cilium pods as JSON. The exit status is captured explicitly so a
# kubectl failure is reported as CRITICAL instead of tripping errexit.
rc_kubectl=0
pods_json=$($KC -n "$NAMESPACE" get pods -l "$LABEL" -o json 2>&1) || rc_kubectl=$?
if [[ $rc_kubectl -ne 0 ]]; then
  # Newlines are flattened so the plugin output stays on one line.
  echo "CRITICAL - kubectl failed to list Cilium pods: ${pods_json//$'\n'/ ' '}"
  exit 2
fi
|
||||
|
||||
# 2) parse pods JSON via python (safe invocation)
|
||||
# Python program that prints one tab-separated record per pod:
#   name, phase, ready/total containers, summed restart count, node name.
pod_python_prog=$'import sys,json\ntry:\n data=json.load(sys.stdin)\nexcept Exception:\n sys.exit(1)\nitems=data.get(\"items\",[])\nfor it in items:\n name=it.get(\"metadata\",{}).get(\"name\",\"<noname>\")\n node=it.get(\"spec\",{}).get(\"nodeName\",\"\")\n phase=it.get(\"status\",{}).get(\"phase\",\"\")\n cs=it.get(\"status\",{}).get(\"containerStatuses\",[]) or []\n total_cont=len(cs)\n ready_cnt=sum(1 for c in cs if c.get(\"ready\") is True)\n restarts=sum(int(c.get(\"restartCount\",0) or 0) for c in cs)\n ready_str = f\"{ready_cnt}/{total_cont}\"\n print(f\"{name}\\t{phase}\\t{ready_str}\\t{restarts}\\t{node}\")\n'

# Collect the non-empty records; pod_lines stays empty when parsing fails.
pod_lines=()
if pod_out=$(run_python_parser "$pods_json" "$pod_python_prog"); then
  while IFS= read -r pod_row; do
    if [[ -n "$pod_row" ]]; then
      pod_lines+=("$pod_row")
    fi
  done <<< "$pod_out"
fi
|
||||
|
||||
# Fallback if parsing failed or empty: use simple kubectl get pods --no-headers
|
||||
# Fallback when JSON parsing produced nothing: build minimal pod records from
# the plain `kubectl get pods --no-headers` table.
# NOTE(review): assumes kubectl's default column order NAME READY STATUS RESTARTS AGE.
if [[ ${#pod_lines[@]} -eq 0 ]]; then
  # stderr stays in the capture so it can be shown in the CRITICAL message.
  simple=$($KC -n "$NAMESPACE" get pods -l "$LABEL" --no-headers 2>&1 || true)
  while read -r name readycol _ restartcol _; do
    # Only rows whose READY column looks like "N/M" are pods; this skips blank
    # lines and kubectl diagnostics such as "No resources found ...".
    [[ "$readycol" =~ ^([0-9]+)/([0-9]+)$ ]] || continue
    rnum=${BASH_REMATCH[1]}
    rtot=${BASH_REMATCH[2]}
    if [[ "$rnum" == "$rtot" && "$rtot" != "0" ]]; then
      phase="Running"
    else
      phase="NotReady"
    fi
    # Use the restart column when it is a plain integer, otherwise assume 0.
    if [[ "$restartcol" =~ ^[0-9]+$ ]]; then
      restarts=$restartcol
    else
      restarts=0
    fi
    node=""
    # Records must be separated by real TAB characters: the evaluation loop
    # splits them with IFS=$'\t'. ("\t" inside double quotes is NOT a tab.)
    printf -v pod_record '%s\t%s\t%s/%s\t%s\t%s' \
      "$name" "$phase" "$rnum" "$rtot" "$restarts" "$node"
    pod_lines+=("$pod_record")
  done <<< "$simple"

  if [[ ${#pod_lines[@]} -eq 0 ]]; then
    echo "CRITICAL - no Cilium pods found or kubectl output unparsable. kubectl output: ${simple//$'\n'/ ' '}"
    exit 2
  fi
fi
|
||||
|
||||
# Now evaluate pod_lines
|
||||
# Walk the tab-separated pod records and tally not-ready pods and pods whose
# restart count reaches the WARN/CRIT restart thresholds.
total_pods=0
not_ready=0
not_ready_list=()
high_restart_pods=()

for line in "${pod_lines[@]}"; do
  if [[ -z "$line" ]]; then
    continue
  fi
  total_pods=$(( total_pods + 1 ))

  IFS=$'\t' read -r pname pphase pready prest pnode <<< "$line"

  # pready has the form "ready/total"; missing parts count as 0.
  ready_num=${pready%/*}
  ready_tot=${pready#*/}
  : "${ready_num:=0}" "${ready_tot:=0}"
  if [[ "$pphase" != "Running" ]] || (( ready_num < ready_tot )); then
    not_ready=$(( not_ready + 1 ))
    not_ready_list+=("${pname}:${pphase}:${pready}")
  fi

  # Tag the pod with the most severe restart threshold it reaches, if any.
  prest=${prest:-0}
  restart_tag=""
  if (( prest >= CRIT_RESTARTS )); then
    restart_tag="CRITICAL"
  elif (( prest >= WARN_RESTARTS )); then
    restart_tag="WARN"
  fi
  if [[ -n "$restart_tag" ]]; then
    high_restart_pods+=("${pname}:${prest}:${restart_tag}")
  fi
done
|
||||
|
||||
# DaemonSet check (desired vs ready) using safe python parsing
|
||||
# Sum desired vs ready pod counts over every DaemonSet matching LABEL.
# A kubectl or parse failure leaves both totals at 0.
ds_desired=0
ds_ready=0
rc_ds=0
ds_out=$($KC -n "$NAMESPACE" get ds -l "$LABEL" -o json 2>&1) || rc_ds=$?
if (( rc_ds == 0 )); then
  # Prints "desiredNumberScheduled<TAB>numberReady" for each DaemonSet.
  ds_python_prog=$'import sys,json\ndata=json.load(sys.stdin)\nfor it in data.get(\"items\",[]):\n s=it.get(\"status\",{})\n desired=int(s.get(\"desiredNumberScheduled\") or 0)\n ready=int(s.get(\"numberReady\") or 0)\n print(f\"{desired}\\t{ready}\")\n'
  if ds_out_parsed=$(run_python_parser "$ds_out" "$ds_python_prog"); then
    while IFS=$'\t' read -r ddesired dready; do
      if [[ -z "$ddesired" ]]; then
        continue
      fi
      ds_desired=$(( ds_desired + ddesired ))
      ds_ready=$(( ds_ready + dready ))
    done <<< "$ds_out_parsed"
  fi
fi
|
||||
|
||||
# cilium-operator deployment check
|
||||
# cilium-operator Deployment: compare available replicas with the desired count.
# op_msg stays empty (and the check is skipped) when the Deployment cannot be
# fetched or its JSON cannot be parsed.
op_ok=1
op_msg=""
op_json=$($KC -n "$NAMESPACE" get deploy cilium-operator -o json 2>/dev/null || true)
if [[ -n "$op_json" ]]; then
  # Prints "replicas<TAB>availableReplicas"; a missing/zero replicas value is read as 1.
  op_python_prog=$'import sys,json\ndata=json.load(sys.stdin)\nspec=data.get(\"spec\",{})\nstatus=data.get(\"status\",{})\nreplicas=int(spec.get(\"replicas\") or 1)\navailable=int(status.get(\"availableReplicas\") or 0)\nprint(f\"{replicas}\\t{available}\")\n'
  if op_line=$(run_python_parser "$op_json" "$op_python_prog"); then
    IFS=$'\t' read -r op_repl op_avail <<< "$op_line"
    if (( op_avail < op_repl )); then
      op_ok=0
    fi
    op_msg="operator available=${op_avail}/${op_repl}"
  fi
fi
|
||||
|
||||
# Optional: cilium CLI
|
||||
# Optional probe with the cilium CLI (enabled by --use-cilium-cli).
# Sets cilium_ok (1 = healthy / not requested, 0 = failed) and cilium_summary.
cilium_ok=1
cilium_summary=""
if (( USE_CILIUM_CLI == 1 )); then
  if ! command -v cilium >/dev/null 2>&1; then
    cilium_ok=0
    cilium_summary="cilium binary not in PATH"
  else
    # Capture the real exit status: appending `|| true` to the assignment
    # would force $? to 0 and hide every failure of `cilium status`.
    rc_cilium=0
    cilium_raw=$(cilium status -o json 2>&1) || rc_cilium=$?
    if (( rc_cilium != 0 )); then
      cilium_ok=0
      cilium_summary="cilium status failed: ${cilium_raw//$'\n'/ ' '}"
    else
      # Flatten to one line, squeeze runs of blanks, keep the first 300 chars.
      cilium_summary=$(printf '%s' "$cilium_raw" | tr -s '\n' ' ' | cut -c1-300)
    fi
  fi
fi
|
||||
|
||||
# Compose status
|
||||
# Combine the individual findings into an exit code (`code`) and a list of
# message fragments (`msgs`).
code=0
msgs=()

# Raise the overall exit code to $1 unless it is already at least that severe.
raise_code() {
  if (( code < $1 )); then
    code=$1
  fi
}

# Pod readiness against the not-ready thresholds.
if (( not_ready >= CRIT_NOT_READY )); then
  raise_code 2
  msgs+=("CRITICAL - ${not_ready}/${total_pods} pods not ready")
elif (( not_ready >= WARN_NOT_READY )); then
  raise_code 1
  msgs+=("WARNING - ${not_ready}/${total_pods} pods not ready")
else
  msgs+=("OK - ${total_pods} pods, not-ready=${not_ready}")
fi

# DaemonSet shortfall; reuses the pod critical threshold for the missing count.
if (( ds_desired > 0 && ds_ready < ds_desired )); then
  if (( ds_desired - ds_ready >= CRIT_NOT_READY )); then
    raise_code 2
    msgs+=("CRITICAL - daemonsets ready=${ds_ready}/${ds_desired}")
  else
    raise_code 1
    msgs+=("WARNING - daemonsets ready=${ds_ready}/${ds_desired}")
  fi
fi

# Operator availability (only when the operator check produced a message).
if [[ -n "$op_msg" ]]; then
  if (( op_ok == 0 )); then
    raise_code 2
    msgs+=("CRITICAL - ${op_msg}")
  else
    msgs+=("${op_msg}")
  fi
fi

# Restart counts: the worst tag among the flagged pods decides the severity.
if (( ${#high_restart_pods[@]} > 0 )); then
  restart_level=0
  for entry in "${high_restart_pods[@]}"; do
    case "$entry" in
      *":CRITICAL") restart_level=2 ;;
      *":WARN") (( restart_level >= 1 )) || restart_level=1 ;;
    esac
  done
  case "$restart_level" in
    2)
      raise_code 2
      msgs+=("CRITICAL - pods with high restart counts: ${high_restart_pods[*]}")
      ;;
    1)
      raise_code 1
      msgs+=("WARNING - pods with elevated restarts: ${high_restart_pods[*]}")
      ;;
  esac
fi

# Optional cilium CLI result.
if (( USE_CILIUM_CLI == 1 )); then
  if (( cilium_ok == 0 )); then
    raise_code 2
    msgs+=("CRITICAL - cilium-cli: ${cilium_summary}")
  else
    msgs+=("cilium-cli ok: ${cilium_summary}")
  fi
fi

# Append the comma-separated list of not-ready pods.
if (( not_ready > 0 )); then
  joined=$(printf '%s, ' "${not_ready_list[@]}")
  msgs+=("not-ready-list: ${joined%, }")
fi
|
||||
|
||||
# Emit the single-line plugin output with the fragments separated by " ; ".
# "${msgs[*]}" cannot do this: it joins with only the FIRST character of IFS,
# so IFS=' ; ' would produce plain spaces.
output=""
for fragment in "${msgs[@]}"; do
  output+="${output:+ ; }${fragment}"
done
printf '%s\n' "$output"
exit "${code}"
|
||||
158
files/nrpe/check_coredns_health
Normal file
158
files/nrpe/check_coredns_health
Normal file
@@ -0,0 +1,158 @@
|
||||
#!/usr/bin/env bash
|
||||
# check_coredns_health
|
||||
# Vérifie la santé de CoreDNS (endpoints + endpointslices + fallback pods)
|
||||
# Retour codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
|
||||
#
|
||||
# Usage:
|
||||
# sudo /usr/lib/nagios/plugins/check_coredns_health [--namespace N] [--service NAME] [--label-fallback LABEL] [--kubeconfig PATH]
|
||||
#
|
||||
# Strict mode: abort on unhandled errors, unset variables and failed pipeline stages.
set -o errexit -o nounset -o pipefail

# Tunables; each may be pre-seeded through an environment variable of the same
# name, and an empty or unset value falls back to the default shown here.
: "${NAMESPACE:=kube-system}"
: "${SERVICE_NAME:=coredns}"
: "${LABEL_FALLBACK:=k8s-app=kube-dns}"
: "${TIMEOUT:=10}"
|
||||
|
||||
# Print the command-line synopsis and the current namespace/service values on stdout.
# Reads: NAMESPACE, SERVICE_NAME
usage() {
  printf 'Usage: %s [--namespace N] [--service NAME] [--label-fallback LABEL] [--kubeconfig PATH]\n' "$0"
  printf 'Defaults: namespace=%s service=%s\n' "$NAMESPACE" "$SERVICE_NAME"
}
|
||||
|
||||
# Parse command-line options.
# Options that take a value are checked for that value first: under `set -u`
# a trailing "--service" would otherwise abort on the unbound "$2" with exit
# status 1, which Nagios reports as WARNING instead of UNKNOWN.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --namespace | --service | --label-fallback | --kubeconfig)
      if [[ $# -lt 2 ]]; then
        echo "UNKNOWN - option $1 requires a value"
        exit 3
      fi
      case "$1" in
        --namespace) NAMESPACE="$2" ;;
        --service) SERVICE_NAME="$2" ;;
        --label-fallback) LABEL_FALLBACK="$2" ;;
        --kubeconfig) export KUBECONFIG="$2" ;;
      esac
      shift 2
      ;;
    -h | --help)
      usage
      exit 3
      ;;
    *)
      echo "Unknown arg: $1"
      usage
      exit 3
      ;;
  esac
done
|
||||
|
||||
# kubectl is mandatory; its absence is reported as UNKNOWN (exit 3).
command -v kubectl >/dev/null 2>&1 || {
  echo "UNKNOWN - kubectl not found"
  exit 3
}
|
||||
|
||||
# If KUBECONFIG not set, try sensible defaults so sudo/nagios runs succeed.
|
||||
# When KUBECONFIG is unset or empty, adopt the first readable well-known
# kubeconfig so that runs under sudo/nagios can reach the cluster.
if [[ -z "${KUBECONFIG:-}" ]]; then
  for candidate in /etc/kubernetes/admin.conf /root/.kube/config; do
    if [[ -r "$candidate" ]]; then
      export KUBECONFIG="$candidate"
      break
    fi
  done
fi

# kubectl command line as an array; the kubeconfig is passed explicitly when known.
KC=(kubectl)
if [[ -n "${KUBECONFIG:-}" ]]; then
  KC+=(--kubeconfig="${KUBECONFIG}")
fi
KC+=(--request-timeout="${TIMEOUT}s")
|
||||
|
||||
# run_kc: capture stdout only (stderr -> /dev/null) and return kubectl's exit code
|
||||
# Run kubectl (the KC array) with the given arguments.
# Globals:   KC (read)
# Arguments: extra kubectl arguments
# Outputs:   kubectl's stdout, without trailing newlines; stderr is discarded
# Returns:   kubectl's exit status
run_kc() {
  local captured status
  captured=$("${KC[@]}" "$@" 2>/dev/null)
  status=$?
  printf '%s' "$captured"
  return "$status"
}
|
||||
|
||||
# 1) try Endpoints resource
|
||||
# 1) Endpoints resource: OK as soon as the service has at least one address.
# The status is captured with `|| rc=$?`: under `set -e` a bare failing
# assignment aborts the script before the CRITICAL report below can run.
rc=0
ep_out=$(run_kc -n "$NAMESPACE" get endpoints "$SERVICE_NAME" -o jsonpath='{.subsets[*].addresses[*].ip}') || rc=$?
if (( rc != 0 )); then
  echo "CRITICAL - kubectl failed to get Endpoints (exit code ${rc})"
  exit 2
fi
if [[ -n "${ep_out// /}" ]]; then
  echo "OK - service ${SERVICE_NAME} in ${NAMESPACE} has endpoints: ${ep_out// /,}"
  exit 0
fi
|
||||
|
||||
# 2) try EndpointSlices (k8s >= 1.17)
|
||||
# 2) EndpointSlices labelled for the service: OK if any address is listed.
# The status is captured with `|| rc=$?`: under `set -e` a bare failing
# assignment aborts the script before the CRITICAL report below can run.
rc=0
eps_out=$(run_kc -n "$NAMESPACE" get endpointslices -l "kubernetes.io/service-name=${SERVICE_NAME}" -o jsonpath='{range .items[*]}{range .endpoints[*]}{.addresses[*]}{"\n"}{end}{end}') || rc=$?
if (( rc != 0 )); then
  echo "CRITICAL - kubectl failed to get EndpointSlices (exit code ${rc})"
  exit 2
fi
if [[ -n "${eps_out// /}" ]]; then
  # Drop blank lines and join the remaining address lines with commas.
  tops=$(printf '%s\n' "$eps_out" | sed '/^\s*$/d' | tr '\n' ',' | sed 's/,$//')
  echo "OK - service ${SERVICE_NAME} in ${NAMESPACE} has EndpointSlices addresses: ${tops}"
  exit 0
fi
|
||||
|
||||
# 3) fallback: check service selector and pods matching it
|
||||
# 3) Fallback: read the Service's label selector as "k1=v1,k2=v2,".
# `range $k, $v := ...` is Go-template syntax, so it must be passed with
# -o go-template; kubectl's JSONPath has no variables or printf.
# The status is captured with `|| rc=$?` so a failure is reported instead of
# tripping errexit.
rc=0
svc_out=$(run_kc -n "$NAMESPACE" get svc "$SERVICE_NAME" -o go-template='{{range $k, $v := .spec.selector}}{{$k}}={{$v}},{{end}}') || rc=$?
if (( rc != 0 )); then
  echo "CRITICAL - kubectl failed to get Service selector (exit code ${rc})"
  exit 2
fi

# Use the configured fallback label when the Service has no selector, then
# normalise to a comma-separated selector without a trailing separator
# (a fallback written with ';' separators is accepted too).
SEL=${svc_out:-$LABEL_FALLBACK}
SEL=${SEL//;/,}
SEL=${SEL%,}
|
||||
|
||||
# get pods by selector
|
||||
# List pods matching the selector as "READY NAME" rows (READY is the ready flag
# of the first container only). The column spec is kept in a quoted variable so
# the "[0]" can never be glob-expanded.
# Statuses are captured with `|| rc=$?`: under `set -e` a bare failing
# assignment aborts the script before the CRITICAL reports below can run.
pod_columns='custom-columns=READY:.status.containerStatuses[0].ready,NAME:.metadata.name'

rc=0
pods_out=$(run_kc -n "$NAMESPACE" get pods -l "$SEL" --no-headers -o "$pod_columns") || rc=$?
if (( rc != 0 )); then
  echo "CRITICAL - kubectl failed to list pods for selector '${SEL}' (exit code ${rc})"
  exit 2
fi

if [[ -z "${pods_out// /}" ]]; then
  # Nothing matched: retry with the other label commonly used for CoreDNS.
  rc=0
  pods_alt=$(run_kc -n "$NAMESPACE" get pods -l k8s-app=coredns --no-headers -o "$pod_columns") || rc=$?
  if (( rc != 0 )); then
    echo "CRITICAL - kubectl failed to list pods for fallback selector (exit code ${rc})"
    exit 2
  fi
  if [[ -n "${pods_alt// /}" ]]; then
    pods_out="$pods_alt"
    SEL="k8s-app=coredns (fallback)"
  fi
fi

if [[ -z "${pods_out// /}" ]]; then
  echo "CRITICAL - service ${SERVICE_NAME} in ${NAMESPACE} has no endpoints and no pods match selector '${SEL}'"
  exit 2
fi
|
||||
|
||||
# count Ready pods
|
||||
# Count the listed pods and those whose ready flag is not true, then report.
not_ready_count=0
total_count=0
not_ready_list=()
while IFS= read -r line; do
  if [[ -z "$line" ]]; then
    continue
  fi
  total_count=$(( total_count + 1 ))
  # Row layout: first field = ready flag, second field = pod name.
  read -r ready_flag pod_name _ <<< "$line"
  case "$ready_flag" in
    true | True | 1) ;;
    *)
      not_ready_count=$(( not_ready_count + 1 ))
      not_ready_list+=("$pod_name")
      ;;
  esac
done <<< "$pods_out"

if (( total_count == 0 )); then
  echo "CRITICAL - service ${SERVICE_NAME} in ${NAMESPACE} has no endpoints and no pods found for selector '${SEL}'"
  exit 2
fi

if (( not_ready_count > 0 )); then
  echo "WARNING - service ${SERVICE_NAME} in ${NAMESPACE} has no endpoints, but ${not_ready_count}/${total_count} pods matching selector '${SEL}' are not Ready: ${not_ready_list[*]}"
  exit 1
fi

# Pods exist and are Ready although no Endpoints/EndpointSlices addresses were
# found; this is deliberately reported as OK, with the anomaly in the message.
echo "OK - service ${SERVICE_NAME} in ${NAMESPACE} has no Endpoints resource but ${total_count} pods matching selector '${SEL}' are Ready (EndpointSlices absent or controller issue)"
exit 0
|
||||
230
files/nrpe/check_etcd_health
Normal file
230
files/nrpe/check_etcd_health
Normal file
@@ -0,0 +1,230 @@
|
||||
#!/usr/bin/env bash
|
||||
# check_etcd_health
|
||||
# Verifie la santé d'etcd et (optionnel) la creation/verification des snapshots.
|
||||
# Retourne : 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
|
||||
#
|
||||
# Usage example:
|
||||
# sudo /usr/lib/nagios/plugins/check_etcd_health \
|
||||
# --endpoints "https://192.168.1.41:2379,https://192.168.1.42:2379" \
|
||||
# --cacert /etc/ssl/etcd/ssl/ca.pem --cert /etc/ssl/etcd/ssl/admin.pem --key /etc/ssl/etcd/ssl/admin-key.pem \
|
||||
# --test-snapshot --snapshot-dir /var/backups/etcd --snapshot-max-age 24
|
||||
#
|
||||
# Notes:
|
||||
# - Par securite, execute ce script sur un master (ou via NRPE/SSH) avec un utilisateur ayant acces aux clefs.
|
||||
# - --snapshot-max-age en heures (defaut 24). Mettre 0 pour desactiver la verification d'age.
|
||||
# - --test-snapshot creera un snapshot temporaire pour valider la creation + verification via `etcdctl snapshot status`.
|
||||
# - Si --keep-snapshot-on-failure est active, le snapshot temporaire sera conserve en cas d'erreur pour debug.
|
||||
|
||||
# Path to the etcdctl binary; may be overridden through the ETCDCTL environment variable.
: "${ETCDCTL:=/usr/local/bin/etcdctl}"
|
||||
|
||||
# Print the command-line help on stdout.
print_usage() {
  printf 'Usage: %s --endpoints ENDPOINTS --cacert CA --cert CERT --key KEY [options]\n' "$0"
  printf '%s\n' \
    'Options:' \
    '--warn-db-mb N avertissement si DB >= N MB (default 1024)' \
    '--crit-db-mb M critique si DB >= M MB (default 1800)' \
    '--timeout SECS etcdctl timeout (default 10)' \
    '--test-snapshot tenter de creer un snapshot temporaire et verifier son status' \
    '--snapshot-dir DIR repertoire pour snapshots temporaires (default /var/backups/etcd)' \
    '--keep-snapshot-on-failure conserver le snapshot temporaire si creation echoue (default false)' \
    "--snapshot-max-age HRS verifier qu'il existe un snapshot plus recent que HRS heures (default 24). Mettre 0 pour desactiver." \
    '-h, --help affiche cette aide'
}
|
||||
|
||||
# Defaults
|
||||
# Tunables that may be pre-seeded through an environment variable of the same
# name; an empty or unset value falls back to the default shown here.
: "${WARN_DB_MB:=1024}"
: "${CRIT_DB_MB:=1800}"
: "${TIMEOUT:=10}"
: "${SNAPSHOT_DIR:=/var/backups/etcd}"
: "${SNAPSHOT_MAX_AGE_HOURS:=24}"
# Flags that are always initialised to "off" here.
TEST_SNAPSHOT=0
KEEP_SNAPSHOT_ON_FAILURE=0
|
||||
|
||||
# Parse args
|
||||
# Parse command-line options.
# Options that take a value are checked for that value first. Without the
# check, a trailing "--endpoints" makes `shift 2` fail without shifting
# anything, so the loop would never terminate.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --endpoints | --cacert | --cert | --key | --warn-db-mb | --crit-db-mb | \
      --timeout | --snapshot-dir | --snapshot-max-age)
      if [[ $# -lt 2 ]]; then
        echo "UNKNOWN - option $1 requires a value"
        exit 3
      fi
      case "$1" in
        --endpoints) ENDPOINTS="$2" ;;
        --cacert) CACERT="$2" ;;
        --cert) CERT="$2" ;;
        --key) KEY="$2" ;;
        --warn-db-mb) WARN_DB_MB="$2" ;;
        --crit-db-mb) CRIT_DB_MB="$2" ;;
        --timeout) TIMEOUT="$2" ;;
        --snapshot-dir) SNAPSHOT_DIR="$2" ;;
        --snapshot-max-age) SNAPSHOT_MAX_AGE_HOURS="$2" ;;
      esac
      shift 2
      ;;
    --test-snapshot)
      TEST_SNAPSHOT=1
      shift
      ;;
    --keep-snapshot-on-failure)
      KEEP_SNAPSHOT_ON_FAILURE=1
      shift
      ;;
    -h | --help)
      print_usage
      exit 3
      ;;
    *)
      echo "Unknown arg: $1"
      print_usage
      exit 3
      ;;
  esac
done
|
||||
|
||||
# Allow env fallback (if ETCDCTL_* env vars set)
|
||||
# Anything not given on the command line falls back to the matching ETCDCTL_* variable.
ENDPOINTS=${ENDPOINTS:-${ETCDCTL_ENDPOINTS:-}}
CACERT=${CACERT:-${ETCDCTL_CACERT:-}}
CERT=${CERT:-${ETCDCTL_CERT:-}}
KEY=${KEY:-${ETCDCTL_KEY:-}}

# All four connection settings are mandatory.
for required in "$ENDPOINTS" "$CACERT" "$CERT" "$KEY"; do
  if [[ -z "$required" ]]; then
    echo "UNKNOWN - missing required args/certs"
    print_usage
    exit 3
  fi
done

if [[ ! -x "$ETCDCTL" ]]; then
  echo "UNKNOWN - etcdctl not found at $ETCDCTL"
  exit 3
fi

# Every TLS file must be readable by the user running the check.
for pem in "$CACERT" "$CERT" "$KEY"; do
  if [[ ! -r "$pem" ]]; then
    echo "CRITICAL - cannot read certificate files (permissions?)"
    echo "CACERT=$CACERT CERT=$CERT KEY=$KEY"
    exit 2
  fi
done

export ETCDCTL_API=3
|
||||
|
||||
# 1) endpoint status check
|
||||
# Query `etcdctl endpoint status`; OUT receives stdout and stderr combined.
etcd_status_cmd=(
  "$ETCDCTL"
  --command-timeout="${TIMEOUT}s"
  --endpoints="${ENDPOINTS}"
  --cacert="$CACERT"
  --cert="$CERT"
  --key="$KEY"
  endpoint status
)
if ! OUT=$("${etcd_status_cmd[@]}" 2>&1); then
  echo "CRITICAL - etcdctl endpoint status failed: $OUT"
  exit 2
fi
|
||||
|
||||
# Convert a human-readable size such as "25 kB" or "1.5 GB" to whole megabytes.
# Arguments: $1 - "<number> <unit>" (surrounding blanks allowed)
# Outputs:   the rounded size in MB; 0 when the number is missing or malformed.
# Units are case-insensitive and 1024-based; an unknown or missing unit is
# treated as MB. awk does the arithmetic because the number may be fractional
# and shell arithmetic is integer-only.
_etcd_dbsize_to_mb() {
  local num unit
  read -r num unit _ <<< "$1"
  if [[ ! "${num:-}" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
    echo 0
    return 0
  fi
  LC_ALL=C awk -v n="$num" -v u="${unit^^}" 'BEGIN {
    if (u == "B") n = n / 1048576
    else if (u == "KB") n = n / 1024
    else if (u == "GB") n = n * 1024
    else if (u == "TB") n = n * 1048576
    printf "%.0f\n", n
  }'
}

# Walk the comma-separated status lines in OUT: count endpoints and leaders,
# and remember the largest DB size seen.
# NOTE(review): assumes field 4 is the DB size and field 5 the leader flag.
leaders=0
total=0
max_db_mb=0
while IFS= read -r line; do
  line=${line//$'\r'/}
  [[ -z "$line" ]] && continue
  total=$(( total + 1 ))
  IFS=',' read -r _ _ _ dbsize isLeader _ <<< "$line"
  isLeader=${isLeader//[[:space:]]/}
  if [[ "${isLeader,,}" == "true" ]]; then
    leaders=$(( leaders + 1 ))
  fi
  db_mb=$(_etcd_dbsize_to_mb "${dbsize:-}")
  if (( db_mb > max_db_mb )); then
    max_db_mb=$db_mb
  fi
done <<< "$OUT"
|
||||
|
||||
# Print a one-line plugin result and terminate with the given Nagios exit code.
nagios_exit() {
  local status=$1
  shift
  echo "$*"
  exit "$status"
}

# Checks run in this order; the first one that fires ends the plugin.
if (( total == 0 )); then
  nagios_exit 2 "CRITICAL - no endpoints returned by etcdctl"
elif (( leaders == 0 )); then
  nagios_exit 2 "CRITICAL - no leader found among $total endpoints; detail: $OUT"
elif (( leaders > 1 )); then
  nagios_exit 1 "WARNING - multiple leaders detected: $leaders (possible split-brain); detail: $OUT"
elif (( max_db_mb >= CRIT_DB_MB )); then
  nagios_exit 2 "CRITICAL - etcd DB size ${max_db_mb}MB >= ${CRIT_DB_MB}MB"
elif (( max_db_mb >= WARN_DB_MB )); then
  nagios_exit 1 "WARNING - etcd DB size ${max_db_mb}MB >= ${WARN_DB_MB}MB"
fi
|
||||
|
||||
# 2) Verification of recent snapshot files (optional, default 24h)
|
||||
# 2) Freshness of the newest snapshot-*.db file in SNAPSHOT_DIR.
# Skipped when SNAPSHOT_MAX_AGE_HOURS is empty or 0. On success SNAP_CHECK_MSG
# describes the newest snapshot; a missing or too-old snapshot is CRITICAL.
SNAP_CHECK_MSG=""
if [[ -n "${SNAPSHOT_MAX_AGE_HOURS:-}" ]]; then
  # A non-numeric value would silently evaluate to 0 and disable the check.
  if [[ ! "$SNAPSHOT_MAX_AGE_HOURS" =~ ^[0-9]+$ ]]; then
    echo "UNKNOWN - snapshot max age must be a non-negative integer (got '${SNAPSHOT_MAX_AGE_HOURS}')"
    exit 3
  fi
  # 10# forces base 10 so a value such as "08" is not read as invalid octal.
  max_age_s=$(( 10#$SNAPSHOT_MAX_AGE_HOURS * 3600 ))
  if (( max_age_s > 0 )); then
    mkdir -p "$SNAPSHOT_DIR" 2>/dev/null || {
      echo "CRITICAL - cannot create/access snapshot dir $SNAPSHOT_DIR"
      exit 2
    }
    # Pick the most recently modified snapshot with a glob and -nt rather
    # than by parsing `ls` output.
    latest_snapshot=""
    for candidate in "$SNAPSHOT_DIR"/snapshot-*.db; do
      [[ -e "$candidate" ]] || continue
      if [[ -z "$latest_snapshot" || "$candidate" -nt "$latest_snapshot" ]]; then
        latest_snapshot=$candidate
      fi
    done
    if [[ -z "$latest_snapshot" ]]; then
      SNAP_CHECK_MSG="no snapshot files found in $SNAPSHOT_DIR"
      echo "CRITICAL - $SNAP_CHECK_MSG (no snapshots)"
      exit 2
    fi
    now_s=$(date +%s)
    snap_mtime_s=$(stat -c %Y "$latest_snapshot")
    age_s=$(( now_s - snap_mtime_s ))
    age_txt="$(( age_s / 3600 ))h$(( (age_s % 3600) / 60 ))m"
    # Compare in seconds: truncating to whole hours first would accept a
    # snapshot that is up to 59 minutes past the limit.
    if (( age_s > max_age_s )); then
      SNAP_CHECK_MSG="latest snapshot $latest_snapshot is ${age_txt} old (> ${SNAPSHOT_MAX_AGE_HOURS}h)"
      echo "CRITICAL - $SNAP_CHECK_MSG"
      exit 2
    fi
    SNAP_CHECK_MSG="latest snapshot $latest_snapshot is ${age_txt} old (<= ${SNAPSHOT_MAX_AGE_HOURS}h)"
  fi
fi
|
||||
|
||||
# 3) Optional: test snapshot creation and status
|
||||
# 3) Optional end-to-end test: save a temporary snapshot and validate it with
# `etcdctl snapshot status`. On success SNAP_TEST_MSG describes the result.
SNAP_TEST_MSG=""
if (( TEST_SNAPSHOT == 1 )); then
  mkdir -p "$SNAPSHOT_DIR" 2>/dev/null || {
    echo "CRITICAL - cannot create/access snapshot dir $SNAPSHOT_DIR"
    exit 2
  }
  if [[ ! -w "$SNAPSHOT_DIR" ]]; then
    echo "CRITICAL - snapshot dir not writable: $SNAPSHOT_DIR"
    exit 2
  fi

  # The temporary file deliberately does NOT match "snapshot-*.db": a test
  # file kept after a failure (or orphaned by a kill) must never be mistaken
  # for a real backup by the snapshot freshness check.
  SNAPFILE=$(mktemp "${SNAPSHOT_DIR}/healthcheck-XXXXXX.db") || {
    echo "CRITICAL - mktemp failed in $SNAPSHOT_DIR"
    exit 2
  }

  # EXIT handler: remove the temporary snapshot unless the plugin is exiting
  # with a failure and --keep-snapshot-on-failure was requested.
  cleanup() {
    local rc=$?
    if [[ $rc -ne 0 && $KEEP_SNAPSHOT_ON_FAILURE -ne 0 ]]; then
      echo "NOTICE - snapshot kept at $SNAPFILE for debugging"
    else
      rm -f "$SNAPFILE" 2>/dev/null || true
    fi
    return $rc
  }
  trap 'cleanup' EXIT

  SAVE_OUT=$("$ETCDCTL" --command-timeout="${TIMEOUT}s" --endpoints="${ENDPOINTS}" --cacert="$CACERT" --cert="$CERT" --key="$KEY" snapshot save "$SNAPFILE" 2>&1) || {
    echo "CRITICAL - snapshot save failed: $SAVE_OUT"
    exit 2
  }

  STATUS_OUT=$("$ETCDCTL" snapshot status "$SNAPFILE" 2>&1) || {
    echo "CRITICAL - snapshot status failed: $STATUS_OUT"
    exit 2
  }

  # Creation and verification succeeded. Flatten the status to one line and
  # squeeze runs of blanks.
  SNAP_TEST_MSG="snapshot test ok: $SNAPFILE ; status: $(printf '%s' "$STATUS_OUT" | tr -s '\n' ' ')"
fi
|
||||
|
||||
# Compose final message
|
||||
# All checks passed: print the OK summary, followed by whichever snapshot
# messages are non-empty.
MSG="OK - $total endpoints checked, leaders=$leaders, max_db=${max_db_mb}MB"
for extra in "$SNAP_CHECK_MSG" "$SNAP_TEST_MSG"; do
  if [[ -n "$extra" ]]; then
    MSG+=" ; $extra"
  fi
done

printf '%s\n' "$MSG"
exit 0
|
||||
214
files/nrpe/check_k8s_apiserver_access
Normal file
214
files/nrpe/check_k8s_apiserver_access
Normal file
@@ -0,0 +1,214 @@
|
||||
#!/usr/bin/env bash
# check_k8s_apiserver_access
# Counts the HTTP 403 responses found in the kube-apiserver logs.
# Exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
#
# Default: reads journalctl -u kube-apiserver --since="${WINDOW} minutes ago"
# Option --kubectl: reads "kubectl logs" of the pods matching the selector.
#
# Usage examples:
#   sudo /usr/lib/nagios/plugins/check_k8s_apiserver_access --window 5 --warn 10 --crit 50
#   sudo /usr/lib/nagios/plugins/check_k8s_apiserver_access --kubectl --selector 'k8s-app=kube-apiserver' --window 10 --crit 100
#
set -euo pipefail

PROG_NAME=$(basename "$0")

# Defaults
WINDOW_MINUTES=5
WARN_THRESHOLD=10
CRIT_THRESHOLD=50
USE_KUBECTL=0
KUBECTL_NAMESPACE="kube-system"
KUBECTL_SELECTOR=""            # if empty, we'll try -l component=kube-apiserver or label provided
JOURNAL_UNIT="kube-apiserver"  # systemd unit name; adapt if different
PATTERN=''                     # optional custom grep regex
TOP_N=5                        # number of top offenders to show
|
||||
|
||||
# print_help: write the option summary to stdout.
# Globals read: PROG_NAME and the current defaults (WINDOW_MINUTES,
# WARN_THRESHOLD, CRIT_THRESHOLD, KUBECTL_NAMESPACE, JOURNAL_UNIT, TOP_N).
# The examples use PROG_NAME so they always name this script (they used to
# reference a non-existent check_apiserver_403.sh).
print_help() {
  cat <<EOF
$PROG_NAME - check apiserver 403 rate in logs

Options:
--window N Window in minutes to look back (default: ${WINDOW_MINUTES})
--warn N WARN threshold: count >= N -> WARNING (default: ${WARN_THRESHOLD})
--crit N CRIT threshold: count >= N -> CRITICAL (default: ${CRIT_THRESHOLD})
--kubectl Use 'kubectl logs' on apiserver pods instead of journalctl
--namespace NS Namespace for kubectl logs (default: ${KUBECTL_NAMESPACE})
--selector SEL Label selector for kubectl logs (e.g. "component=kube-apiserver" or "k8s-app=kube-apiserver")
--unit UNIT systemd unit for journalctl (default: ${JOURNAL_UNIT})
--pattern REGEX custom grep regex to detect 403 entries (overrides built-in heuristics)
--top N show top N request lines causing 403 (default ${TOP_N})
-h, --help show this help

Examples:
# check last 5 minutes using journalctl
sudo ./${PROG_NAME} --window 5 --warn 20 --crit 50

# check last 10 minutes using kubectl logs for apiserver static-pods
sudo ./${PROG_NAME} --kubectl --namespace kube-system --selector 'k8s-app=kube-apiserver' --window 10 --crit 100
EOF
}
|
||||
|
||||
# Parse args

# require_value OPT REMAINING
# Abort with UNKNOWN (exit 3) when option OPT is the last word on the command
# line, i.e. its value is missing. Without this guard "$2" is unset, `set -u`
# kills the script with status 1, and NRPE reports that as WARNING.
require_value() {
  if (( $2 < 2 )); then
    echo "UNKNOWN - option $1 requires a value"
    exit 3
  fi
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --window)    require_value "$1" "$#"; WINDOW_MINUTES="$2"; shift 2 ;;
    --warn)      require_value "$1" "$#"; WARN_THRESHOLD="$2"; shift 2 ;;
    --crit)      require_value "$1" "$#"; CRIT_THRESHOLD="$2"; shift 2 ;;
    --kubectl)   USE_KUBECTL=1; shift 1 ;;
    --namespace) require_value "$1" "$#"; KUBECTL_NAMESPACE="$2"; shift 2 ;;
    --selector)  require_value "$1" "$#"; KUBECTL_SELECTOR="$2"; shift 2 ;;
    --unit)      require_value "$1" "$#"; JOURNAL_UNIT="$2"; shift 2 ;;
    --pattern)   require_value "$1" "$#"; PATTERN="$2"; shift 2 ;;
    --top)       require_value "$1" "$#"; TOP_N="$2"; shift 2 ;;
    # Help exits 3 (UNKNOWN) so that a misconfigured command is never read as OK.
    -h|--help)   print_help; exit 3 ;;
    *)           echo "Unknown argument: $1"; print_help; exit 3 ;;
  esac
done
|
||||
|
||||
# Validate numeric args
# Each value must be a plain non-negative decimal integer. It is then re-read
# in base 10: a zero-padded value such as "08" passes the regex but would later
# be rejected as invalid octal inside (( ... )), making the comparison fail
# and the check fall through to OK.
for opt_var in window:WINDOW_MINUTES warn:WARN_THRESHOLD crit:CRIT_THRESHOLD top:TOP_N; do
  opt_name=${opt_var%%:*}
  var_name=${opt_var#*:}
  if ! [[ "${!var_name}" =~ ^[0-9]+$ ]]; then
    echo "UNKNOWN - invalid --${opt_name}"
    exit 3
  fi
  printf -v "$var_name" '%d' "$((10#${!var_name}))"
done
unset opt_var opt_name var_name

# Build detection regex if not provided
if [[ -z "$PATTERN" ]]; then
  # heuristics: try to match common apiserver log patterns that indicate a 403/Forbidden
  # examples: "\" 403 ", "code=403", "403 Forbidden", "Forbidden" combined with "Denied" etc.
  # NOTE(review): the bare "Forbidden" alternative matches any line containing
  # that word (the search is case-insensitive), so it subsumes several of the
  # more specific alternatives — confirm this breadth is intended.
  PATTERN='(" 403 |\" 403 |code=403|403 Forbidden|Forbidden|\"Reason=Forbidden\"|\"message=.*Forbidden)'
fi
|
||||
|
||||
# Grab logs

# get_logs_journal: print the journal of systemd unit JOURNAL_UNIT for the
# last WINDOW_MINUTES minutes on stdout.
# Globals read: JOURNAL_UNIT, WINDOW_MINUTES
# Returns: 1 when journalctl is not available (ERROR_NO_JOURNAL is written to
#          stderr) or when journalctl exits non-zero; journalctl's status (0)
#          otherwise.
get_logs_journal() {
  if ! command -v journalctl >/dev/null 2>&1; then
    echo "ERROR_NO_JOURNAL" >&2
    return 1
  fi

  # journalctl's own diagnostics are discarded; callers only look at the status.
  # Example: journalctl -u kube-apiserver --since="5 minutes ago" --no-pager
  journalctl -u "${JOURNAL_UNIT}" --since="${WINDOW_MINUTES} minutes ago" --no-pager 2>/dev/null || return 1
}
|
||||
|
||||
# get_logs_kubectl: print the recent logs of the kube-apiserver pods on stdout.
# Globals read: KUBECTL_NAMESPACE, KUBECTL_SELECTOR, WINDOW_MINUTES
# Pod discovery, in order:
#   1. KUBECTL_SELECTOR when it is set;
#   2. otherwise the first of a few common label selectors that matches at
#      least one pod;
#   3. otherwise every pod of the namespace whose name contains "apiserver"
#      (case-insensitive).
# Returns: 1 when kubectl is missing (ERROR_NO_KUBECTL on stderr) or when no
#          pod name can be listed; 0 otherwise. A failing "kubectl logs" on an
#          individual pod is ignored (best effort).
get_logs_kubectl() {
  local sel candidate matched pods p log

  if ! command -v kubectl >/dev/null 2>&1; then
    echo "ERROR_NO_KUBECTL" >&2
    return 1
  fi

  sel="${KUBECTL_SELECTOR}"
  if [[ -z "$sel" ]]; then
    for candidate in 'component=kube-apiserver' 'k8s-app=kube-apiserver' 'tier=control-plane'; do
      # wc always prints a number, even when kubectl fails, so no fallback
      # value is appended (the old `|| echo 0` could produce "0\n0").
      matched=$(kubectl -n "${KUBECTL_NAMESPACE}" get pods -l "${candidate}" --no-headers 2>/dev/null | wc -l) || true
      if (( matched > 0 )); then
        sel="${candidate}"
        break
      fi
    done
  fi

  if [[ -n "$sel" ]]; then
    pods=$(kubectl -n "${KUBECTL_NAMESPACE}" get pods -l "${sel}" -o custom-columns=':metadata.name' --no-headers 2>/dev/null || true)
  else
    # fallback: list every pod of the namespace and filter by name below
    pods=$(kubectl -n "${KUBECTL_NAMESPACE}" get pods --no-headers -o custom-columns=':metadata.name' 2>/dev/null || true)
  fi
  if [[ -z "$pods" ]]; then
    return 1
  fi

  while IFS= read -r p; do
    [[ -z "$p" ]] && continue
    if [[ -z "$sel" ]] && ! grep -qi 'apiserver' <<< "$p"; then
      continue
    fi
    log=$(kubectl -n "${KUBECTL_NAMESPACE}" logs --since="${WINDOW_MINUTES}m" "${p}" --all-containers 2>/dev/null || true)
    # Emit each pod's log terminated by a real newline. The previous
    # "...$'\n'" sat inside double quotes, where $'\n' is literal text, so the
    # last line of one pod was glued to the first line of the next.
    if [[ -n "$log" ]]; then
      printf '%s\n' "$log"
    fi
  done <<< "$pods"

  return 0
}
|
||||
|
||||
# retrieve logs into variable LOGS
LOGS=""
if (( USE_KUBECTL == 1 )); then
  if ! LOGS=$(get_logs_kubectl); then
    echo "CRITICAL - failed to collect logs via kubectl (check KUBECONFIG, namespace/selector, permissions)"
    exit 2
  fi
else
  if ! LOGS=$(get_logs_journal); then
    echo "CRITICAL - failed to collect logs via journalctl for unit '${JOURNAL_UNIT}' (check unit name/permissions)"
    exit 2
  fi
fi

# An empty log window is reported as OK with a zero count.
if [[ -z "$LOGS" ]]; then
  echo "OK - no apiserver logs found in the last ${WINDOW_MINUTES}m (count=0)"
  exit 0
fi

# Count matching lines with grep -E (case-insensitive).
# grep exits 1 for "no match" and >1 for a real error (e.g. an invalid
# --pattern). Only the former may be read as "zero occurrences"; the latter
# used to be swallowed and reported as OK. `-e` protects a pattern that starts
# with '-'.
grep_rc=0
count_403=$(printf '%s\n' "$LOGS" | grep -E -i -c -e "$PATTERN") || grep_rc=$?
if (( grep_rc > 1 )); then
  echo "UNKNOWN - grep could not apply the detection pattern (check --pattern)"
  exit 3
fi
count_403=${count_403:-0}

# Extract the most frequent requests behind the matches: HTTP method + path
# when present, otherwise the whole line truncated to 200 characters.
top_paths=""
top_requests=$(printf '%s\n' "$LOGS" | grep -E -i -e "$PATTERN" || true)
if [[ -n "$top_requests" ]]; then
  # `|| true`: head may close the pipe early, which pipefail would report.
  top_paths=$(printf '%s\n' "$top_requests" \
    | grep -oE '(GET|POST|PUT|DELETE|PATCH) [^" ]+' \
    | sort | uniq -c | sort -rn | head -n "${TOP_N}" || true)
  if [[ -z "$top_paths" ]]; then
    # fallback: show most frequent truncated lines
    top_paths=$(printf '%s\n' "$top_requests" \
      | sed -E 's/^[[:space:]]*//; s/[[:space:]]+/ /g' \
      | cut -c1-200 \
      | sort | uniq -c | sort -rn | head -n "${TOP_N}" || true)
  fi
fi

# Decide severity
if (( count_403 >= CRIT_THRESHOLD )); then
  status=2
  state="CRITICAL"
elif (( count_403 >= WARN_THRESHOLD )); then
  status=1
  state="WARNING"
else
  status=0
  state="OK"
fi

# Build message
msg="${state} - ${count_403} occurrences of 403 in last ${WINDOW_MINUTES}m (warn=${WARN_THRESHOLD},crit=${CRIT_THRESHOLD})"

# Append top paths if present (one line, entries separated by '|')
if [[ -n "$top_paths" ]]; then
  msg="${msg} ; top=${TOP_N}: $(printf '%s' "$top_paths" | tr '\n' '|' | sed 's/|$//')"
fi

# Print and exit
echo "$msg"
exit "$status"
|
||||
138
files/nrpe/check_k8s_deployments
Normal file
138
files/nrpe/check_k8s_deployments
Normal file
@@ -0,0 +1,138 @@
|
||||
#!/usr/bin/env bash
# check_k8s_deployments
# Checks Kubernetes Deployments: flags those with availableReplicas < spec.replicas.
# Exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
#
# Usage:
#   sudo /usr/lib/nagios/plugins/check_k8s_deployments [--warn N] [--crit M] [--ignore-ns ns1,ns2] [--namespaces ns1,ns2] [--age-min MINUTES]
#
# Examples:
#   sudo /usr/lib/nagios/plugins/check_k8s_deployments --crit 1
#   sudo /usr/lib/nagios/plugins/check_k8s_deployments --ignore-ns kube-system,monitoring
#
set -euo pipefail

WARN=${WARN:-0}   # number of failing deployments that triggers WARNING
CRIT=${CRIT:-1}   # number of failing deployments that triggers CRITICAL (1 => any problem is CRITICAL)
IGNORE_NS=""
INCLUDE_NS=""
AGE_MIN=0
|
||||
|
||||
# print_usage: write the option summary of check_k8s_deployments to stdout.
print_usage() {
  cat <<EOF
Usage: $0 [--warn N] [--crit M] [--ignore-ns ns1,ns2] [--namespaces ns1,ns2] [--age-min MINUTES]
--warn N : seuil warn si >=N déploiements en erreur (default 0)
--crit M : seuil crit si >=M déploiements en erreur (default 1)
--ignore-ns LIST : comma separated namespaces to ignore (default none)
--namespaces LIST: comma separated namespaces to check only (default all)
--age-min N : ignore deployments created less than N minutes ago (avoid flapping during rollout)
EOF
}
|
||||
|
||||
# parse args

# require_value OPT REMAINING
# Abort with UNKNOWN (exit 3) when option OPT is the last word on the command
# line. Otherwise "$2" is unset, `set -u` ends the script with status 1 and
# NRPE reports WARNING for what is really a usage error.
require_value() {
  if (( $2 < 2 )); then
    echo "UNKNOWN - option $1 requires a value"
    exit 3
  fi
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --warn)       require_value "$1" "$#"; WARN="$2"; shift 2 ;;
    --crit)       require_value "$1" "$#"; CRIT="$2"; shift 2 ;;
    --ignore-ns)  require_value "$1" "$#"; IGNORE_NS="$2"; shift 2 ;;
    --namespaces) require_value "$1" "$#"; INCLUDE_NS="$2"; shift 2 ;;
    --age-min)    require_value "$1" "$#"; AGE_MIN="$2"; shift 2 ;;
    -h|--help)    print_usage; exit 3 ;;
    *)            echo "Unknown arg: $1"; print_usage; exit 3 ;;
  esac
done
|
||||
|
||||
if ! command -v kubectl >/dev/null 2>&1; then
  echo "UNKNOWN - kubectl not found"
  exit 3
fi

# WARN, CRIT and AGE_MIN are used in arithmetic below: reject anything that is
# not a plain non-negative integer, then re-read in base 10 so that "08" is
# not taken for invalid octal.
for check in "warn=$WARN" "crit=$CRIT" "age-min=$AGE_MIN"; do
  if ! [[ "${check#*=}" =~ ^[0-9]+$ ]]; then
    echo "UNKNOWN - invalid --${check%%=*}: ${check#*=}"
    exit 3
  fi
done
WARN=$((10#$WARN))
CRIT=$((10#$CRIT))
AGE_MIN=$((10#$AGE_MIN))

# Build the namespace filters: "a,b" becomes the extended regex "^a$|^b$".
ignore_pattern=""
if [[ -n "$IGNORE_NS" ]]; then
  IFS=',' read -ra arr <<< "$IGNORE_NS"
  for ns in "${arr[@]}"; do
    ignore_pattern="${ignore_pattern}|^${ns}\$"
  done
  ignore_pattern="${ignore_pattern#|}"   # remove leading |
fi

include_pattern=""
if [[ -n "$INCLUDE_NS" ]]; then
  IFS=',' read -ra arr2 <<< "$INCLUDE_NS"
  for ns in "${arr2[@]}"; do
    include_pattern="${include_pattern}|^${ns}\$"
  done
  include_pattern="${include_pattern#|}"
fi

# Result collection. Initialised explicitly so that ${#failures[@]} is valid
# under `set -u` even when nothing is appended.
failures=()

# One row per deployment: namespace|name|desired|available|creationTimestamp.
# '|' (not a tab) is the separator because `read` collapses consecutive
# whitespace separators, which would shift the columns whenever
# availableReplicas is absent.
# A kubectl failure must not look like "no deployment has a problem".
if ! deploy_rows=$(kubectl get deploy -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"|"}{.metadata.name}{"|"}{.spec.replicas}{"|"}{.status.availableReplicas}{"|"}{.metadata.creationTimestamp}{"\n"}{end}' 2>/dev/null); then
  echo "UNKNOWN - kubectl get deploy failed (check KUBECONFIG, API server, permissions)"
  exit 3
fi

now_s=$(date +%s)

while IFS='|' read -r ns name desired available created; do
  # skip empty lines
  [[ -z "$ns" ]] && continue

  # normalize: an absent count means 0
  desired=${desired:-0}
  available=${available:-0}

  # namespace filtering
  if [[ -n "$include_pattern" && ! "$ns" =~ $include_pattern ]]; then
    continue
  fi
  if [[ -n "$ignore_pattern" && "$ns" =~ $ignore_pattern ]]; then
    continue
  fi

  # age filtering: skip deployments younger than AGE_MIN minutes (they may
  # still be rolling out). An unparsable timestamp counts as epoch 0, i.e. old.
  if [[ -n "$created" ]] && (( AGE_MIN > 0 )); then
    created_s=$(date -d "$created" +%s 2>/dev/null || echo 0)
    if (( (now_s - created_s) / 60 < AGE_MIN )); then
      continue
    fi
  fi

  if (( available < desired )); then
    failures+=("${ns}/${name} (desired=${desired},available=${available})")
  fi
done <<< "$deploy_rows"

count=${#failures[@]}

if (( count == 0 )); then
  echo "OK - all deployments report desired==available"
  exit 0
fi

# Decide severity
if (( count >= CRIT )); then
  echo "CRITICAL - ${count} deployments not available: ${failures[*]}"
  exit 2
elif (( count >= WARN )); then
  echo "WARNING - ${count} deployments not available: ${failures[*]}"
  exit 1
else
  echo "OK - ${count} deployments not available but below thresholds"
  exit 0
fi
|
||||
232
files/nrpe/check_k8s_jobs_cronjobs
Normal file
232
files/nrpe/check_k8s_jobs_cronjobs
Normal file
@@ -0,0 +1,232 @@
|
||||
#!/usr/bin/env bash
# check_k8s_jobs_cronjobs
# Checks the state of Kubernetes Jobs and CronJobs.
# Exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
#
# Main checks:
#   - Jobs with failures (.status.failed > 0) or "active" Jobs that are too old
#   - recent events (type=Warning) attached to Jobs within the last X minutes
#   - for CronJobs that are not suspended: lastScheduleTime must not be older
#     than a configurable age
#
# Usage (examples):
#   sudo /usr/lib/nagios/plugins/check_k8s_jobs_cronjobs --crit 1 --recent-minutes 5
#   sudo /usr/lib/nagios/plugins/check_k8s_jobs_cronjobs --ignore-ns kube-system --cron-max-age 120
#
set -euo pipefail

# Defaults
WARN=${WARN:-0}
CRIT=${CRIT:-1}
IGNORE_NS=""
INCLUDE_NS=""
AGE_MIN=${AGE_MIN:-60}
RECENT_MINUTES=${RECENT_MINUTES:-5}
CHECK_CRON=1
CRON_MAX_AGE_MIN=${CRON_MAX_AGE_MIN:-60}
|
||||
|
||||
# print_usage: write the option summary of check_k8s_jobs_cronjobs to stdout.
# The CronJob switch is documented as --no-cron, which is the spelling the
# argument parser accepts (the help used to advertise --check-cron).
print_usage() {
  cat <<EOF
Usage: $0 [options]
Options:
--warn N seuil WARN si >= N objets en erreur (default 0)
--crit M seuil CRIT si >= M objets en erreur (default 1)
--ignore-ns ns1,ns2 namespaces à ignorer
--namespaces ns1,ns2 limiter aux namespaces donnés (comma separated)
--age-min MINUTES considérer un job "actif" normal si démarré moins de MINUTES (default 60)
--recent-minutes MIN chercher événements de Job (Warning) dans les MIN dernières minutes (default 5)
--no-cron désactiver la vérification des CronJobs (activée par défaut)
--cron-max-age MINUTES si lastScheduleTime > MINUTES => alerter (default 60). Mettre 0 pour désactiver.
-h, --help : affiche l'aide
EOF
}
|
||||
|
||||
# Parse args

# require_value OPT REMAINING
# Abort with UNKNOWN (exit 3) when option OPT is the last word on the command
# line. Otherwise "$2" is unset, `set -u` ends the script with status 1 and
# NRPE reports WARNING for what is really a usage error.
require_value() {
  if (( $2 < 2 )); then
    echo "UNKNOWN - option $1 requires a value"
    exit 3
  fi
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --warn)           require_value "$1" "$#"; WARN="$2"; shift 2 ;;
    --crit)           require_value "$1" "$#"; CRIT="$2"; shift 2 ;;
    --ignore-ns)      require_value "$1" "$#"; IGNORE_NS="$2"; shift 2 ;;
    --namespaces)     require_value "$1" "$#"; INCLUDE_NS="$2"; shift 2 ;;
    --age-min)        require_value "$1" "$#"; AGE_MIN="$2"; shift 2 ;;
    --recent-minutes) require_value "$1" "$#"; RECENT_MINUTES="$2"; shift 2 ;;
    # --check-cron is accepted as an explicit "on" (CronJob checks are on by
    # default), so that a command line using that spelling is not rejected.
    --check-cron)     CHECK_CRON=1; shift 1 ;;
    --no-cron)        CHECK_CRON=0; shift 1 ;;
    --cron-max-age)   require_value "$1" "$#"; CRON_MAX_AGE_MIN="$2"; shift 2 ;;
    -h|--help)        print_usage; exit 3 ;;
    *)                echo "Unknown arg: $1"; print_usage; exit 3 ;;
  esac
done
|
||||
|
||||
if ! command -v kubectl >/dev/null 2>&1; then
  echo "UNKNOWN - kubectl not found"
  exit 3
fi

# Every numeric option ends up in an arithmetic context: reject anything that
# is not a plain non-negative integer (under `set -u` a word such as "abc"
# would be evaluated as an unset variable and abort the script with status 1),
# then re-read in base 10 so that "08" is not taken for invalid octal.
for check in "warn=$WARN" "crit=$CRIT" "age-min=$AGE_MIN" \
             "recent-minutes=$RECENT_MINUTES" "cron-max-age=$CRON_MAX_AGE_MIN"; do
  if ! [[ "${check#*=}" =~ ^[0-9]+$ ]]; then
    echo "UNKNOWN - invalid --${check%%=*}: ${check#*=}"
    exit 3
  fi
done
WARN=$((10#$WARN))
CRIT=$((10#$CRIT))
AGE_MIN=$((10#$AGE_MIN))
RECENT_MINUTES=$((10#$RECENT_MINUTES))
CRON_MAX_AGE_MIN=$((10#$CRON_MAX_AGE_MIN))

# Build namespace filters: "a,b" becomes the extended regex "^a$|^b$".
ignore_pattern=""
if [[ -n "$IGNORE_NS" ]]; then
  IFS=',' read -ra arr <<< "$IGNORE_NS"
  for ns in "${arr[@]}"; do
    ignore_pattern="${ignore_pattern}|^${ns}\$"
  done
  ignore_pattern="${ignore_pattern#|}"
fi

include_pattern=""
if [[ -n "$INCLUDE_NS" ]]; then
  IFS=',' read -ra arr2 <<< "$INCLUDE_NS"
  for ns in "${arr2[@]}"; do
    include_pattern="${include_pattern}|^${ns}\$"
  done
  include_pattern="${include_pattern#|}"
fi
|
||||
|
||||
# ns_allowed NS: decide whether namespace NS has to be checked.
# Globals read: include_pattern, ignore_pattern (extended regexes, may be empty)
# Returns: 0 when NS matches include_pattern (or include_pattern is empty) and
#          does not match ignore_pattern; 1 otherwise.
# Uses the shell's own regex matching instead of piping into the obsolescent
# `egrep`, which also saves two processes per inspected object.
ns_allowed() {
  local ns="$1"

  if [[ -n "$include_pattern" && ! "$ns" =~ $include_pattern ]]; then
    return 1
  fi
  if [[ -n "$ignore_pattern" && "$ns" =~ $ignore_pattern ]]; then
    return 1
  fi
  return 0
}
|
||||
|
||||
now_s=$(date +%s)

# Initialised explicitly so that ${#problems[@]} is valid under `set -u`.
problems=()

# ---------------------------
# 1) Inspect Jobs
# ---------------------------
# One row per Job: namespace|name|active|succeeded|failed|startTime|completionTime.
# '|' (not a tab) is the separator because `read` collapses consecutive
# whitespace separators, which would shift the columns whenever a status field
# is absent.
# A kubectl failure must not look like "no Job has a problem".
if ! job_rows=$(kubectl get jobs -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"|"}{.metadata.name}{"|"}{.status.active}{"|"}{.status.succeeded}{"|"}{.status.failed}{"|"}{.status.startTime}{"|"}{.status.completionTime}{"\n"}{end}' 2>/dev/null); then
  echo "UNKNOWN - kubectl get jobs failed (check KUBECONFIG, API server, permissions)"
  exit 3
fi

while IFS='|' read -r ns name active _succeeded failed start _completion; do
  [[ -z "$ns" ]] && continue

  # defaults: an absent counter means 0
  active=${active:-0}
  failed=${failed:-0}

  if ! ns_allowed "$ns"; then
    continue
  fi

  # 1.a) Jobs with failures
  # NOTE(review): a Job whose pods failed and later succeeded on retry is
  # still flagged here — confirm that is intended.
  if (( failed > 0 )); then
    problems+=("Job ${ns}/${name} failedCount=${failed}")
    continue
  fi

  # 1.b) Active jobs running too long
  if (( active > 0 )); then
    if [[ -n "$start" && "$start" != "null" ]]; then
      # convert start timestamp to epoch (GNU date); 0 means "unparsable"
      start_s=$(date -d "$start" +%s 2>/dev/null || echo 0)
      if (( start_s > 0 )); then
        age_min=$(( (now_s - start_s) / 60 ))
        if (( age_min >= AGE_MIN )); then
          problems+=("Job ${ns}/${name} active for ${age_min}min >= ${AGE_MIN}min")
        fi
      fi
    else
      # no start time but active > 0 -> flag
      problems+=("Job ${ns}/${name} active but no startTime recorded")
    fi
  fi
done <<< "$job_rows"

# 1.c) Recent Job warning events (type=Warning) in the last RECENT_MINUTES
if (( RECENT_MINUTES > 0 )); then
  # Columns are space-padded: namespace, involved object name, lastTimestamp,
  # then reason and message (not used here).
  if ! event_rows=$(kubectl get events --all-namespaces --field-selector involvedObject.kind=Job,type=Warning -o custom-columns='NAMESPACE:.metadata.namespace,NAME:.involvedObject.name,LAST:.lastTimestamp,REASON:.reason,MESSAGE:.message' --no-headers 2>/dev/null); then
    echo "UNKNOWN - kubectl get events failed (check KUBECONFIG, API server, permissions)"
    exit 3
  fi
  cutoff_s=$(( now_s - RECENT_MINUTES * 60 ))

  while read -r ns name last _rest; do
    [[ -z "$ns" ]] && continue
    if ! ns_allowed "$ns"; then
      continue
    fi
    if [[ -n "$last" && "$last" != "<none>" ]]; then
      ts=$(date -d "$last" +%s 2>/dev/null || echo 0)
      if (( ts >= cutoff_s )); then
        problems+=("Job event Warning ${ns}/${name} at $last")
      fi
    fi
  done <<< "$event_rows"
fi

# ---------------------------
# 2) Inspect CronJobs (optional)
# ---------------------------
if (( CHECK_CRON == 1 )) && (( CRON_MAX_AGE_MIN > 0 )); then
  # One row per CronJob: namespace|name|suspend|lastScheduleTime
  if ! cron_rows=$(kubectl get cronjob -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"|"}{.metadata.name}{"|"}{.spec.suspend}{"|"}{.status.lastScheduleTime}{"\n"}{end}' 2>/dev/null); then
    echo "UNKNOWN - kubectl get cronjob failed (check KUBECONFIG, API server, permissions)"
    exit 3
  fi

  while IFS='|' read -r ns name suspend last; do
    [[ -z "$ns" ]] && continue

    if ! ns_allowed "$ns"; then
      continue
    fi

    # A suspended CronJob is never a problem
    if [[ "$suspend" == "true" ]]; then
      continue
    fi

    if [[ -z "$last" || "$last" == "null" ]]; then
      # Never scheduled yet: flag it (useful to detect misconfigured cronjobs)
      problems+=("CronJob ${ns}/${name} has no lastScheduleTime (never scheduled?)")
      continue
    fi

    # NOTE(review): the age limit is the same for every CronJob, whatever its
    # schedule — a daily job exceeds the default of 60 minutes most of the day.
    last_s=$(date -d "$last" +%s 2>/dev/null || echo 0)
    if (( last_s > 0 )); then
      age_min=$(( (now_s - last_s) / 60 ))
      if (( age_min > CRON_MAX_AGE_MIN )); then
        problems+=("CronJob ${ns}/${name} lastSchedule ${age_min}min ago > ${CRON_MAX_AGE_MIN}min")
      fi
    else
      problems+=("CronJob ${ns}/${name} lastScheduleTime unparsable: ${last}")
    fi
  done <<< "$cron_rows"
fi

# ---------------------------
# Final decision & output
# ---------------------------
count=${#problems[@]}

if (( count == 0 )); then
  echo "OK - Jobs/CronJobs checks passed"
  exit 0
fi

# Severity decision
if (( count >= CRIT )); then
  echo "CRITICAL - ${count} problems found: ${problems[*]}"
  exit 2
elif (( count >= WARN )); then
  echo "WARNING - ${count} problems found: ${problems[*]}"
  exit 1
else
  echo "OK - ${count} problems found but below thresholds"
  exit 0
fi
|
||||
194
files/nrpe/check_k8s_pki_certs
Normal file
194
files/nrpe/check_k8s_pki_certs
Normal file
@@ -0,0 +1,194 @@
|
||||
#!/usr/bin/env bash
# check_k8s_pki_certs
# Checks the PEM certificates under /etc/kubernetes/pki (by default) and alerts
# when one expires within warn_days (30 days by default).
# Exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
#
# Usage:
#   sudo /usr/lib/nagios/plugins/check_k8s_pki_certs
#   sudo /usr/lib/nagios/plugins/check_k8s_pki_certs --path /etc/kubernetes/ssl --warn-days 30 --crit-days 7 --recursive
#
set -euo pipefail

PKI_PATH=${PKI_PATH:-/etc/kubernetes/pki}
WARN_DAYS=${WARN_DAYS:-30}
CRIT_DAYS=${CRIT_DAYS:-7}
RECURSIVE=0
|
||||
|
||||
# print_usage: write the option summary of check_k8s_pki_certs to stdout.
# Globals read: PKI_PATH, WARN_DAYS, CRIT_DAYS (shown as the current defaults).
print_usage() {
  cat <<EOF
Usage: $0 [--path PATH] [--warn-days N] [--crit-days M] [--recursive] [-h|--help]

Options:
--path PATH répertoire à scanner (default: $PKI_PATH)
--warn-days N seuil warning en jours (default: $WARN_DAYS)
--crit-days M seuil critical en jours (default: $CRIT_DAYS)
--recursive scanner récursivement PATH et sous-dirs
-h, --help affiche cette aide
EOF
}
|
||||
|
||||
# require_value OPT REMAINING
# Abort with UNKNOWN (exit 3) when option OPT is the last word on the command
# line. Otherwise "$2" is unset, `set -u` ends the script with status 1 and
# NRPE reports WARNING for what is really a usage error.
require_value() {
  if (( $2 < 2 )); then
    echo "UNKNOWN - option $1 requires a value"
    exit 3
  fi
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --path)      require_value "$1" "$#"; PKI_PATH="$2"; shift 2 ;;
    --warn-days) require_value "$1" "$#"; WARN_DAYS="$2"; shift 2 ;;
    --crit-days) require_value "$1" "$#"; CRIT_DAYS="$2"; shift 2 ;;
    --recursive) RECURSIVE=1; shift 1 ;;
    -h|--help)   print_usage; exit 3 ;;
    *)           echo "Unknown arg: $1"; print_usage; exit 3 ;;
  esac
done
|
||||
|
||||
# tools
for tool in openssl date sed awk find; do
  if ! command -v "$tool" >/dev/null 2>&1; then
    echo "UNKNOWN - $tool not found"
    exit 3
  fi
done

# The day thresholds are used in arithmetic below: reject anything that is not
# a plain non-negative integer, then re-read in base 10 ("08" is not octal).
for check in "warn-days=$WARN_DAYS" "crit-days=$CRIT_DAYS"; do
  if ! [[ "${check#*=}" =~ ^[0-9]+$ ]]; then
    echo "UNKNOWN - invalid --${check%%=*}: ${check#*=}"
    exit 3
  fi
done
WARN_DAYS=$((10#$WARN_DAYS))
CRIT_DAYS=$((10#$CRIT_DAYS))

# resolve symlink target (realpath or readlink -f)
if command -v realpath >/dev/null 2>&1; then
  PKI_PATH_RESOLVED=$(realpath -e "$PKI_PATH" 2>/dev/null || true)
else
  PKI_PATH_RESOLVED=$(readlink -f "$PKI_PATH" 2>/dev/null || true)
fi
if [[ -n "$PKI_PATH_RESOLVED" && -d "$PKI_PATH_RESOLVED" ]]; then
  PKI_PATH="$PKI_PATH_RESOLVED"
fi

if [[ ! -d "$PKI_PATH" ]]; then
  echo "UNKNOWN - path $PKI_PATH not found or not a directory"
  exit 3
fi

now_s=$(date +%s)

# Initialised explicitly so that ${#array[@]} is valid under `set -u`.
critical=()
warning=()
ok=()
errors=()

file_count=0
cert_count=0

# build find command: follow symlinks (-L) so that symlinked directories/files are handled
if [[ $RECURSIVE -eq 1 ]]; then
  FIND_CMD=(find -L "$PKI_PATH" -type f -print0)
else
  FIND_CMD=(find -L "$PKI_PATH" -maxdepth 1 -type f -print0)
fi

# iterate files found (NUL-delimited, so any file name is handled)
while IFS= read -r -d '' file; do
  file_count=$((file_count+1))

  # unreadable files are recorded as errors
  if [[ ! -r "$file" ]]; then
    errors+=("Unreadable file: $file")
    continue
  fi

  # skip files without PEM marker (keys, configs, ...)
  if ! grep -q "BEGIN CERTIFICATE" "$file" 2>/dev/null; then
    continue
  fi

  # A file may hold several certificates: awk prints "start:end" line numbers
  # for every BEGIN/END CERTIFICATE pair.
  mapfile -t pairs < <(awk '
    /BEGIN CERTIFICATE/ { start = NR }
    /END CERTIFICATE/ && start { print start ":" NR; start = 0 }
  ' "$file" 2>/dev/null || true)

  if [[ ${#pairs[@]} -eq 0 ]]; then
    errors+=("No certificate block pairs found in $file")
    continue
  fi

  for p in "${pairs[@]}"; do
    start=${p%%:*}
    end=${p##*:}

    # extract the block by line range; it is fed to openssl on stdin
    cert_block=$(sed -n "${start},${end}p" "$file" 2>/dev/null || true)
    if [[ -z "$cert_block" ]]; then
      errors+=("Failed to extract certificate block ${start}-${end} from $file")
      continue
    fi

    endline=$(printf '%s\n' "$cert_block" | openssl x509 -noout -enddate 2>/dev/null) || {
      errors+=("Failed to parse certificate block ${start}-${end} in $file with openssl")
      continue
    }

    # sample endline: notAfter=Oct 27 16:15:30 2125 GMT
    notAfter=${endline#notAfter=}
    expiry_s=$(date -d "$notAfter" +%s 2>/dev/null) || {
      errors+=("Cannot parse date '$notAfter' for cert in $file")
      continue
    }
    days_left=$(( (expiry_s - now_s) / 86400 ))

    subj=$(printf '%s\n' "$cert_block" | openssl x509 -noout -subject 2>/dev/null || true)
    # The prefix is "subject= ..." or "subject=..." depending on the OpenSSL
    # version: strip the keyword first, then one optional blank.
    subj=${subj#subject=}
    subj=${subj# }

    info="${file} :: ${subj} :: expires in ${days_left}d on ${notAfter}"
    cert_count=$((cert_count+1))

    if (( days_left <= CRIT_DAYS )); then
      critical+=("$info")
    elif (( days_left <= WARN_DAYS )); then
      warning+=("$info")
    else
      ok+=("$info")
    fi
  done
done < <("${FIND_CMD[@]}")

# results and exit codes
# CRITICAL comes first: a certificate about to expire must not be hidden
# behind an unrelated unreadable or unparsable file.
if (( ${#critical[@]} > 0 )); then
  echo "CRITICAL - ${#critical[@]} certificate(s) expiring soon (<= ${CRIT_DAYS} days):"
  for c in "${critical[@]}"; do
    echo " - $c"
  done
  if (( ${#warning[@]} > 0 )); then
    echo "WARN (additional ${#warning[@]} cert(s) <= ${WARN_DAYS} days):"
    for w in "${warning[@]}"; do
      echo " - $w"
    done
  fi
  if (( ${#errors[@]} > 0 )); then
    echo "UNKNOWN (additional ${#errors[@]} parsing error(s)): ${errors[*]}"
  fi
  exit 2
fi

if (( ${#errors[@]} > 0 )); then
  echo "UNKNOWN - parsing errors: ${errors[*]}"
  exit 3
fi

if (( cert_count == 0 )); then
  echo "UNKNOWN - no certificates found under $PKI_PATH"
  exit 3
fi

if (( ${#warning[@]} > 0 )); then
  echo "WARNING - ${#warning[@]} certificate(s) expiring within ${WARN_DAYS} days:"
  for w in "${warning[@]}"; do
    echo " - $w"
  done
  exit 1
fi

echo "OK - ${cert_count} cert(s) checked in ${file_count} file(s), no expiry within ${WARN_DAYS} days"
exit 0
|
||||
49
files/nrpe/check_k8s_pod_restarts
Normal file
49
files/nrpe/check_k8s_pod_restarts
Normal file
@@ -0,0 +1,49 @@
|
||||
#!/usr/bin/env bash
# check_k8s_pod_restarts
# Reports the "Killing" events recorded during the last X minutes
# (used as the signal for pod restarts).
# Exit codes: 0=OK, 2=CRITICAL, 3=UNKNOWN
#
# Usage:
#   sudo /usr/lib/nagios/plugins/check_k8s_pod_restarts [minutes]
#
set -euo pipefail

MINUTES=${1:-5}

if ! [[ "$MINUTES" =~ ^[0-9]+$ ]]; then
  echo "UNKNOWN - invalid minutes value: $MINUTES"
  exit 3
fi

# Require kubectl
if ! command -v kubectl >/dev/null 2>&1; then
  echo "UNKNOWN - kubectl not found"
  exit 3
fi

# cutoff as epoch seconds (GNU date)
if ! cutoff=$(date -d "$MINUTES minutes ago" +%s 2>/dev/null); then
  echo "UNKNOWN - date parsing failed (on macOS use gdate from coreutils)"
  exit 3
fi

# A kubectl failure must not look like "no event": report UNKNOWN instead.
if ! events=$(kubectl get events --all-namespaces --field-selector reason=Killing -o custom-columns='NAMESPACE:.metadata.namespace,NAME:.involvedObject.name,LAST:.lastTimestamp,MESSAGE:.message' --no-headers 2>/dev/null); then
  echo "UNKNOWN - kubectl get events failed (check KUBECONFIG, API server, permissions)"
  exit 3
fi

matches=()
# custom-columns output is padded with spaces, not tab-separated, so the rows
# are split on whitespace: the first three words are namespace, object name
# and timestamp, and `msg` receives the rest of the line. (Reading with a tab
# IFS put the whole row into the first variable, left the timestamp empty and
# skipped every event.)
while read -r ns pod last msg; do
  # skip empty lines
  [[ -z "$last" ]] && continue
  # an unparsable timestamp (e.g. "<none>") skips the event
  ts=$(date -d "$last" +%s 2>/dev/null) || continue
  if (( ts >= cutoff )); then
    # the message is truncated to 300 characters
    matches+=("${ns}/${pod} at ${last} : ${msg:0:300}")
  fi
done <<< "$events"

if [[ ${#matches[@]} -eq 0 ]]; then
  echo "OK - no pod restarts in the last ${MINUTES} minutes"
  exit 0
fi

echo "CRITICAL - ${#matches[@]} pod restarts in the last ${MINUTES} minutes:"
for m in "${matches[@]}"; do
  echo " - ${m}"
done
exit 2
|
||||
202
files/nrpe/check_k8s_pv_pvc
Normal file
202
files/nrpe/check_k8s_pv_pvc
Normal file
@@ -0,0 +1,202 @@
|
||||
#!/usr/bin/env bash
|
||||
# check_k8s_pv_pvc
|
||||
# Vérifie l'état des PersistentVolumes (PV) et PersistentVolumeClaims (PVC) Kubernetes.
|
||||
# Exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
|
||||
#
|
||||
# Usage examples:
|
||||
# sudo /usr/lib/nagios/plugins/check_k8s_pv_pvc --crit 1 # CRITICAL si >=1 problème
|
||||
# sudo /usr/lib/nagios/plugins/check_k8s_pv_pvc --ignore-ns kube-system # ignorer kube-system
|
||||
# sudo /usr/lib/nagios/plugins/check_k8s_pv_pvc --pvc-age-min 10 --crit 2 # ignorer PVC récents <10min, CRIT si >=2
|
||||
# sudo /usr/lib/nagios/plugins/check_k8s_pv_pvc --check-pv --check-pvc # (par défaut les 2 sont vérifiés)
|
||||
#
|
||||
set -o errexit -o nounset -o pipefail

# Defaults
# Thresholds and the PVC grace period may be preset through the environment;
# an unset or empty value falls back to the default shown here.
: "${WARN:=0}"
: "${CRIT:=1}"
: "${PVC_AGE_MIN:=5}"   # minutes: PVCs younger than this are ignored
IGNORE_NS="" INCLUDE_NS=""
CHECK_PV=1 CHECK_PVC=1
|
||||
|
||||
# Print the command-line help to stdout, one option per line.
print_usage() {
  printf '%s\n' \
    "Usage: $0 [options]" \
    'Options:' \
    '--warn N seuil WARN si >= N objets en erreur (default 0)' \
    '--crit M seuil CRIT si >= M objets en erreur (default 1)' \
    '--ignore-ns a,b,c namespaces à ignorer (comma separated)' \
    '--namespaces a,b limiter aux namespaces donnés (comma separated)' \
    '--pvc-age-min N ignore PVC créés il y a moins de N minutes (default 5)' \
    '--no-pv disable PV checks' \
    '--no-pvc disable PVC checks' \
    '-h, --help affiche cette aide'
}
|
||||
|
||||
# Parse args

#######################################
# Parse the command line into the global settings.
# Globals:   WARN, CRIT, IGNORE_NS, INCLUDE_NS, PVC_AGE_MIN, CHECK_PV,
#            CHECK_PVC (written)
# Arguments: the script's command-line arguments
# Outputs:   an UNKNOWN message (or the usage text) on stdout for bad input
# Returns:   0 on success; exits 3 (UNKNOWN) on any usage error
#######################################
parse_args() {
  local opt label value
  while [[ $# -gt 0 ]]; do
    opt=$1
    case "$opt" in
      --warn|--crit|--ignore-ns|--namespaces|--pvc-age-min)
        # Without this guard a trailing option made "$2" trip `set -u`, and
        # the shell's exit status 1 was reported to Nagios as WARNING.
        if [[ $# -lt 2 ]]; then
          echo "UNKNOWN - option ${opt} requires a value"
          exit 3
        fi
        case "$opt" in
          --warn) WARN=$2 ;;
          --crit) CRIT=$2 ;;
          --ignore-ns) IGNORE_NS=$2 ;;
          --namespaces) INCLUDE_NS=$2 ;;
          --pvc-age-min) PVC_AGE_MIN=$2 ;;
        esac
        shift 2
        ;;
      --no-pv) CHECK_PV=0; shift ;;
      --no-pvc) CHECK_PVC=0; shift ;;
      -h|--help) print_usage; exit 3 ;;
      *) echo "Unknown arg: $opt"; print_usage; exit 3 ;;
    esac
  done

  # The thresholds are used in arithmetic later on; a non-numeric value would
  # abort the script there with status 1 (WARNING) instead of UNKNOWN.
  # A variable that is not set at all is treated as 0.
  for label in WARN CRIT PVC_AGE_MIN; do
    value=${!label-0}
    if ! [[ "$value" =~ ^[0-9]+$ ]]; then
      echo "UNKNOWN - ${label} must be a non-negative integer (got '${value}')"
      exit 3
    fi
  done
}

parse_args "$@"
|
||||
|
||||
# Nothing can be checked without kubectl on PATH: report UNKNOWN and stop.
command -v kubectl >/dev/null 2>&1 || {
  echo "UNKNOWN - kubectl not found"
  exit 3
}
|
||||
|
||||
# Build namespace filters

#######################################
# Turn a comma-separated namespace list into an anchored ERE alternation,
# e.g. "kube-system,dev" -> "^kube-system$|^dev$".
# Whitespace around each entry is trimmed and empty entries are dropped, so
# "a, b" and "a,,b" behave like "a,b".
# Arguments: $1 - comma-separated namespace names (may be empty)
# Outputs:   the pattern on stdout (nothing for an empty list)
#######################################
build_ns_pattern() {
  local csv=$1 pattern="" entry
  local -a entries=()
  IFS=',' read -ra entries <<< "$csv"
  # The ${arr[@]+...} form keeps an empty array from tripping `set -u` on
  # older bash releases.
  for entry in ${entries[@]+"${entries[@]}"}; do
    entry="${entry#"${entry%%[![:space:]]*}"}"   # strip leading whitespace
    entry="${entry%"${entry##*[![:space:]]}"}"   # strip trailing whitespace
    [[ -n "$entry" ]] || continue
    pattern+="|^${entry}\$"
  done
  printf '%s' "${pattern#|}"
}

ignore_pattern=$(build_ns_pattern "${IGNORE_NS:-}")
include_pattern=$(build_ns_pattern "${INCLUDE_NS:-}")

# Reference time for age computations.
now_s=$(date +%s)

# Start with an empty array so ${#problems[@]} is valid under `set -u`.
problems=()
|
||||
|
||||
# Helper: namespace filter
#######################################
# Decide whether a namespace passes the include/ignore filters.
# Globals:   include_pattern, ignore_pattern (read; ERE patterns, may be empty)
# Arguments: $1 - namespace name
# Returns:   0 if the namespace should be checked, 1 if it is filtered out
#######################################
ns_allowed() {
  local ns="$1"
  local include=${include_pattern:-} ignore=${ignore_pattern:-}

  # Bash's built-in ERE match replaces `echo | egrep -q`: egrep is
  # obsolescent (recent GNU grep prints a warning on every call) and the
  # pipeline cost two processes per namespace.
  if [[ -n "$include" && ! "$ns" =~ $include ]]; then
    return 1
  fi
  if [[ -n "$ignore" && "$ns" =~ $ignore ]]; then
    return 1
  fi
  return 0
}
|
||||
|
||||
# 1) Check PVCs

#######################################
# Append one entry to `problems` for every PVC that is not healthy:
# any phase other than Bound, or Bound without a volumeName.
# Globals:   problems (appended), now_s, PVC_AGE_MIN (read)
# Calls:     ns_allowed (namespace filter), kubectl, date (GNU -d)
# Outputs:   an UNKNOWN message on stdout if the PVC list cannot be fetched
# Returns:   0 on success; exits 3 (UNKNOWN) if kubectl fails
#######################################
check_pvcs() {
  # Tab is IFS whitespace, so `read` would merge the two tabs around an empty
  # volumeName (normal for Pending PVCs). A non-whitespace separator keeps
  # every column in place.
  local sep=$'\x1f'
  local listing line ns name phase vol created created_s age_min
  # columns: namespace, name, phase, volumeName, creationTimestamp
  local jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.name}{"\t"}{.status.phase}{"\t"}{.spec.volumeName}{"\t"}{.metadata.creationTimestamp}{"\n"}{end}'

  # A failing kubectl used to yield an empty list and therefore a false OK.
  if ! listing=$(kubectl get pvc -A -o jsonpath="$jsonpath" 2>/dev/null); then
    echo "UNKNOWN - kubectl get pvc failed"
    exit 3
  fi

  while IFS= read -r line; do
    [[ -n "$line" ]] || continue
    IFS="$sep" read -r ns name phase vol created <<< "${line//$'\t'/$sep}"

    # filter namespaces
    if ! ns_allowed "$ns"; then
      continue
    fi

    # Ignore freshly created PVCs to avoid noise during normal provisioning.
    # If the timestamp cannot be parsed, created_s is 0 and the PVC is
    # treated as old, i.e. it is still checked.
    if [[ -n "$created" ]] && (( PVC_AGE_MIN > 0 )); then
      created_s=$(date -d "$created" +%s 2>/dev/null || echo 0)
      age_min=$(( (now_s - created_s) / 60 ))
      if (( age_min < PVC_AGE_MIN )); then
        continue
      fi
    fi

    # Any non-Bound phase (Pending, Lost, ...) is a problem.
    if [[ "$phase" != "Bound" ]]; then
      problems+=("PVC ${ns}/${name} phase=${phase} created=${created}")
      continue
    fi

    # Bound is fine only if a volume is actually attached to the claim.
    if [[ -z "$vol" || "$vol" == "null" ]]; then
      problems+=("PVC ${ns}/${name} Bound but no volumeName assigned")
    fi
  done <<< "$listing"
}

if (( CHECK_PVC == 1 )); then
  check_pvcs
fi
|
||||
|
||||
# 2) Check PVs

#######################################
# Append one entry to `problems` for every PersistentVolume that is
#   - in phase Released or Failed (with or without a claimRef),
#   - Bound without a complete claimRef, or
#   - Bound to a PVC that no longer exists.
# PVs whose claim lives in a filtered-out namespace are skipped.
# Available PVs without a claim are deliberately not reported; add a branch
# for "Available" below if unbound volumes should count as a problem.
# Globals:   problems (appended)
# Calls:     ns_allowed (namespace filter), kubectl
# Outputs:   an UNKNOWN message on stdout if a kubectl query fails
# Returns:   0 on success; exits 3 (UNKNOWN) if kubectl fails
#######################################
check_pvs() {
  # Non-whitespace separator so that empty claimRef columns are preserved
  # (tabs are IFS whitespace and would be merged by `read`).
  local sep=$'\x1f'
  local listing line name phase claim_ns claim_name reclaim found
  # columns: name, phase, claimRef.namespace, claimRef.name, reclaimPolicy
  local jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.phase}{"\t"}{.spec.claimRef.namespace}{"\t"}{.spec.claimRef.name}{"\t"}{.spec.persistentVolumeReclaimPolicy}{"\n"}{end}'

  # A failing kubectl used to yield an empty list and therefore a false OK.
  if ! listing=$(kubectl get pv -o jsonpath="$jsonpath" 2>/dev/null); then
    echo "UNKNOWN - kubectl get pv failed"
    exit 3
  fi

  while IFS= read -r line; do
    [[ -n "$line" ]] || continue
    IFS="$sep" read -r name phase claim_ns claim_name reclaim <<< "${line//$'\t'/$sep}"

    # Normalise a literal "null" to "no value".
    if [[ "$claim_ns" == "null" ]]; then claim_ns=""; fi
    if [[ "$claim_name" == "null" ]]; then claim_name=""; fi

    # Only report PVs whose claim namespace passes the filter.
    if [[ -n "$claim_ns" ]] && ! ns_allowed "$claim_ns"; then
      continue
    fi

    case "$phase" in
      Released|Failed)
        # A Released PV normally keeps the claimRef of the deleted PVC, so
        # this must not depend on the claimRef being empty.
        if [[ -n "$claim_ns" ]]; then
          problems+=("PV ${name} phase=${phase} reclaim=${reclaim} (claim ${claim_ns}/${claim_name})")
        else
          problems+=("PV ${name} phase=${phase} reclaim=${reclaim} (no claim)")
        fi
        ;;
      Bound)
        if [[ -z "$claim_ns" || -z "$claim_name" ]]; then
          problems+=("PV ${name} Bound but missing claimRef (phase=${phase})")
          continue
        fi
        # --ignore-not-found makes "PVC is gone" (empty output, status 0)
        # distinguishable from "kubectl could not answer" (non-zero status).
        # stdin is detached so the lookup cannot consume the loop's input.
        if ! found=$(kubectl get pvc -n "$claim_ns" "$claim_name" \
          --ignore-not-found -o name 2>/dev/null </dev/null); then
          echo "UNKNOWN - kubectl get pvc ${claim_ns}/${claim_name} failed"
          exit 3
        fi
        if [[ -z "$found" ]]; then
          problems+=("PV ${name} Bound to ${claim_ns}/${claim_name} but PVC resource not found")
        fi
        ;;
    esac
  done <<< "$listing"
}

if (( CHECK_PV == 1 )); then
  check_pvs
fi
|
||||
|
||||
count=${#problems[@]}

# Nothing recorded: plain OK.
if (( count == 0 )); then
  echo "OK - PV/PVC checks passed"
  exit 0
fi

# Map the number of problems onto a Nagios state; CRIT is tested before WARN.
summary="${count} PV/PVC problems"
if (( count >= CRIT )); then
  state=2
elif (( count >= WARN )); then
  state=1
else
  state=0
fi

case "$state" in
  2) echo "CRITICAL - ${summary}: ${problems[*]}" ;;
  1) echo "WARNING - ${summary}: ${problems[*]}" ;;
  0) echo "OK - ${summary} but below thresholds" ;;
esac
exit "$state"
|
||||
135
files/nrpe/check_k8s_replicasets
Normal file
135
files/nrpe/check_k8s_replicasets
Normal file
@@ -0,0 +1,135 @@
|
||||
#!/usr/bin/env bash
|
||||
# check_k8s_replicasets
|
||||
# Vérifie les ReplicaSets Kubernetes : readyReplicas < spec.replicas
|
||||
# Retour: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
|
||||
#
|
||||
# Usage:
|
||||
# sudo /usr/lib/nagios/plugins/check_k8s_replicasets [--warn N] [--crit M] [--ignore-ns ns1,ns2] [--namespaces ns1,ns2] [--age-min MINUTES]
|
||||
#
|
||||
set -o errexit -o nounset -o pipefail

# Thresholds may be preset through the environment; an unset or empty value
# falls back to the default.
: "${WARN:=0}"   # number of failing ReplicaSets that triggers WARNING
: "${CRIT:=1}"   # number of failing ReplicaSets that triggers CRITICAL (1 => a single one is enough)
IGNORE_NS="" INCLUDE_NS=""
AGE_MIN=0
|
||||
|
||||
# Print the command-line help to stdout, one option per line.
print_usage() {
  printf '%s\n' \
    "Usage: $0 [--warn N] [--crit M] [--ignore-ns ns1,ns2] [--namespaces ns1,ns2] [--age-min MINUTES]" \
    '--warn N : seuil warn si >=N ReplicaSets en erreur (default 0)' \
    '--crit M : seuil crit si >=M ReplicaSets en erreur (default 1)' \
    '--ignore-ns LIST : comma separated namespaces to ignore (default none)' \
    '--namespaces LIST: comma separated namespaces to check only (default all)' \
    '--age-min N : ignore ReplicaSets created less than N minutes ago (avoid flapping during rollout)'
}
|
||||
|
||||
# parse args

#######################################
# Parse the command line into the global settings.
# Globals:   WARN, CRIT, IGNORE_NS, INCLUDE_NS, AGE_MIN (written)
# Arguments: the script's command-line arguments
# Outputs:   an UNKNOWN message (or the usage text) on stdout for bad input
# Returns:   0 on success; exits 3 (UNKNOWN) on any usage error
#######################################
parse_args() {
  local opt label value
  while [[ $# -gt 0 ]]; do
    opt=$1
    case "$opt" in
      --warn|--crit|--ignore-ns|--namespaces|--age-min)
        # Without this guard a trailing option made "$2" trip `set -u`, and
        # the shell's exit status 1 was reported to Nagios as WARNING.
        if [[ $# -lt 2 ]]; then
          echo "UNKNOWN - option ${opt} requires a value"
          exit 3
        fi
        case "$opt" in
          --warn) WARN=$2 ;;
          --crit) CRIT=$2 ;;
          --ignore-ns) IGNORE_NS=$2 ;;
          --namespaces) INCLUDE_NS=$2 ;;
          --age-min) AGE_MIN=$2 ;;
        esac
        shift 2
        ;;
      -h|--help) print_usage; exit 3 ;;
      *) echo "Unknown arg: $opt"; print_usage; exit 3 ;;
    esac
  done

  # The thresholds are used in arithmetic later on; a non-numeric value would
  # abort the script there with status 1 (WARNING) instead of UNKNOWN.
  # A variable that is not set at all is treated as 0.
  for label in WARN CRIT AGE_MIN; do
    value=${!label-0}
    if ! [[ "$value" =~ ^[0-9]+$ ]]; then
      echo "UNKNOWN - ${label} must be a non-negative integer (got '${value}')"
      exit 3
    fi
  done
}

parse_args "$@"
|
||||
|
||||
# Nothing can be checked without kubectl on PATH: report UNKNOWN and stop.
command -v kubectl >/dev/null 2>&1 || {
  echo "UNKNOWN - kubectl not found"
  exit 3
}
|
||||
|
||||
# Build filter for namespace inclusion/exclusion (regex)

#######################################
# Turn a comma-separated namespace list into an anchored ERE alternation,
# e.g. "kube-system,dev" -> "^kube-system$|^dev$".
# Whitespace around each entry is trimmed and empty entries are dropped, so
# "a, b" and "a,,b" behave like "a,b".
# Arguments: $1 - comma-separated namespace names (may be empty)
# Outputs:   the pattern on stdout (nothing for an empty list)
#######################################
build_ns_pattern() {
  local csv=$1 pattern="" entry
  local -a entries=()
  IFS=',' read -ra entries <<< "$csv"
  # The ${arr[@]+...} form keeps an empty array from tripping `set -u` on
  # older bash releases.
  for entry in ${entries[@]+"${entries[@]}"}; do
    entry="${entry#"${entry%%[![:space:]]*}"}"   # strip leading whitespace
    entry="${entry%"${entry##*[![:space:]]}"}"   # strip trailing whitespace
    [[ -n "$entry" ]] || continue
    pattern+="|^${entry}\$"
  done
  printf '%s' "${pattern#|}"
}

ignore_pattern=$(build_ns_pattern "${IGNORE_NS:-}")
include_pattern=$(build_ns_pattern "${INCLUDE_NS:-}")

# Start with an empty array so ${#failures[@]} is valid under `set -u`.
failures=()
|
||||
|
||||
# Collect ReplicaSets: namespace, name, desired (spec.replicas),
# ready (status.readyReplicas), creationTimestamp.
# A field that is absent from the object comes back empty and is normalised
# to 0 further down.
rs_jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.name}{"\t"}{.spec.replicas}{"\t"}{.status.readyReplicas}{"\t"}{.metadata.creationTimestamp}{"\n"}{end}'

# A failing kubectl used to yield an empty list and therefore a false
# "all ReplicaSets ready"; report UNKNOWN instead.
if ! rs_listing=$(kubectl get rs -A -o jsonpath="$rs_jsonpath" 2>/dev/null); then
  echo "UNKNOWN - kubectl get rs failed"
  exit 3
fi

now_s=$(date +%s)

# Tab is IFS whitespace, so `read` would merge the tabs around an empty
# readyReplicas column and shift the timestamp into it. Splitting on a
# non-whitespace separator keeps every column in place.
sep=$'\x1f'

while IFS= read -r line; do
  # Skip empty lines if any
  [[ -n "$line" ]] || continue

  IFS="$sep" read -r ns name desired ready created <<< "${line//$'\t'/$sep}"

  # normalize numeric values
  desired=${desired:-0}
  ready=${ready:-0}

  # Namespace filtering with bash's built-in ERE match (egrep is obsolescent
  # and recent GNU grep warns on every call).
  if [[ -n "$include_pattern" && ! "$ns" =~ $include_pattern ]]; then
    continue
  fi
  if [[ -n "$ignore_pattern" && "$ns" =~ $ignore_pattern ]]; then
    continue
  fi

  # Age filtering (skip very recent ReplicaSets). If the timestamp cannot be
  # parsed, created_s is 0, the age is huge and the ReplicaSet is not skipped.
  if [[ -n "$created" ]] && (( AGE_MIN > 0 )); then
    created_s=$(date -d "$created" +%s 2>/dev/null || echo 0)
    age_min=$(( (now_s - created_s) / 60 ))
    if (( age_min < AGE_MIN )); then
      continue
    fi
  fi

  # Only consider ReplicaSets that want at least one replica (zero-scale
  # ReplicaSets are skipped).
  if (( desired > 0 && ready < desired )); then
    failures+=("${ns}/${name} (desired=${desired},ready=${ready})")
  fi
done <<< "$rs_listing"
|
||||
|
||||
count=${#failures[@]}

# No ReplicaSet was recorded as failing: plain OK.
if (( count == 0 )); then
  echo "OK - all ReplicaSets report ready==desired"
  exit 0
fi

# Map the number of failures onto a Nagios state; CRIT is tested before WARN.
summary="${count} ReplicaSets not fully ready"
if (( count >= CRIT )); then
  state=2
elif (( count >= WARN )); then
  state=1
else
  state=0
fi

case "$state" in
  2) echo "CRITICAL - ${summary}: ${failures[*]}" ;;
  1) echo "WARNING - ${summary}: ${failures[*]}" ;;
  0) echo "OK - ${summary} but below thresholds" ;;
esac
exit "$state"
|
||||
@@ -72,13 +72,32 @@ command[check_docker_{{ container }}]=/usr/lib/nagios/plugins/check_docker --con
|
||||
{% endif %}
|
||||
|
||||
{% if nrpe_process is defined %}
|
||||
# process
|
||||
{% for process in nrpe_process %}
|
||||
command[check_proc_{{ process }}]=/usr/lib/nagios/plugins/check_systemd_service {{ process }}
|
||||
{% endfor %}
|
||||
{% endif %}
|
||||
|
||||
{% if nrpe_kubernetes is defined or nrpe_kubernetes_manager is defined %}
|
||||
# kubernetes
|
||||
{% if nrpe_kubernetes is defined %}
|
||||
## nodes
|
||||
command[check_proc_kubelet]=/usr/lib/nagios/plugins/check_systemd_service kubelet
|
||||
command[check_proc_etcd]=/usr/lib/nagios/plugins/check_systemd_service etcd
|
||||
command[check_proc_containerd]=/usr/lib/nagios/plugins/check_systemd_service containerd
|
||||
{% endif %}
|
||||
{% if nrpe_kubernetes_manager is defined %}
|
||||
## manager / control plane
|
||||
command[check_k8s_health]=/usr/lib/nagios/plugins/check_http -I {{ ansible_default_ipv4.address }} -p 6443 -S -u /healthz --continue-after-certificate -r ok -w 1 -c 2
|
||||
command[check_cilium_health]=/usr/bin/sudo /usr/lib/nagios/plugins/check_cilium_health
|
||||
command[check_coredns_health]=/usr/bin/sudo /usr/lib/nagios/plugins/check_coredns_health
|
||||
command[check_etcd_health]=/usr/bin/sudo /usr/lib/nagios/plugins/check_etcd_health --endpoints "https://{{ ansible_default_ipv4.address }}:2379" --cacert /etc/ssl/etcd/ssl/ca.pem --cert /etc/ssl/etcd/ssl/node-{{ nrpe_kubernetes_manager_nodename }}.pem --key /etc/ssl/etcd/ssl/node-{{ nrpe_kubernetes_manager_nodename }}-key.pem
|
||||
command[check_k8s_apiserver_access]=/usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_apiserver_access
|
||||
command[check_k8s_deployments]=/usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_deployments
|
||||
command[check_k8s_jobs_cronjobs]=/usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_jobs_cronjobs
|
||||
command[check_k8s_pki_certs]=/usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_pki_certs
|
||||
command[check_k8s_pv_pvc]=/usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_pv_pvc
|
||||
command[check_k8s_replicasets]=/usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_replicasets
|
||||
command[check_k8s_pod_restarts]=/usr/bin/sudo /usr/lib/nagios/plugins/check_k8s_pod_restarts
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
@@ -2,3 +2,13 @@ nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_postfix_mailqueue -w {{
|
||||
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_exim_mailqueue -w {{ nrpe_mailq_warning }} -c {{ nrpe_mailq_critical }}
|
||||
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_raid
|
||||
nagios ALL=(ALL) NOPASSWD: /usr/sbin/needrestart -b -l
|
||||
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_cilium_health
|
||||
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_coredns_health
|
||||
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_etcd_health
|
||||
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_apiserver_access
|
||||
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_deployments
|
||||
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_jobs_cronjobs
|
||||
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_pki_certs
|
||||
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_pv_pvc
|
||||
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_replicasets
|
||||
nagios ALL=(ALL) NOPASSWD: /usr/lib/nagios/plugins/check_k8s_pod_restarts
|
||||
Reference in New Issue
Block a user