You've already forked nrpe
158 lines
5.2 KiB
Bash
Executable File
158 lines
5.2 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
# check_coredns_health
|
|
# Checks the health of CoreDNS (Endpoints + EndpointSlices + pod fallback)
|
|
# Exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
|
|
#
|
|
# Usage:
|
|
# sudo /usr/lib/nagios/plugins/check_coredns_health [--namespace N] [--service NAME] [--label-fallback LABEL] [--kubeconfig PATH]
|
|
#
|
|
# Strict mode: abort on command errors, on unset variables and on a failure
# in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Tunables. A non-empty value inherited from the environment wins; otherwise
# the default given here is assigned.
: "${NAMESPACE:=kube-system}"
: "${SERVICE_NAME:=coredns}"
: "${LABEL_FALLBACK:=k8s-app=kube-dns}"
: "${TIMEOUT:=10}"
|
|
|
|
# usage: print the command-line synopsis followed by the effective
# namespace/service defaults.
# Globals: NAMESPACE (read), SERVICE_NAME (read)
# Outputs: two lines on stdout
usage() {
  printf 'Usage: %s [--namespace N] [--service NAME] [--label-fallback LABEL] [--kubeconfig PATH]\n' "$0"
  printf 'Defaults: namespace=%s service=%s\n' "$NAMESPACE" "$SERVICE_NAME"
}
|
|
|
|
# parse_args: apply the command-line options to the script settings.
# Globals:   NAMESPACE, SERVICE_NAME, LABEL_FALLBACK (written),
#            KUBECONFIG (written and exported)
# Arguments: the script's command-line arguments
# Outputs:   a diagnostic on stdout for invalid input (plus the usage text
#            for -h/--help and unknown arguments)
# Exits:     3 (UNKNOWN) on -h/--help, on an unknown argument, or when an
#            option is given without its value
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      --namespace | --service | --label-fallback | --kubeconfig)
        # Without this guard a trailing option would dereference an unbound
        # $2 under `set -u` and abort with a status that is not UNKNOWN.
        if (( $# < 2 )); then
          echo "UNKNOWN - option $1 requires a value"
          exit 3
        fi
        case "$1" in
          --namespace) NAMESPACE="$2" ;;
          --service) SERVICE_NAME="$2" ;;
          --label-fallback) LABEL_FALLBACK="$2" ;;
          --kubeconfig) export KUBECONFIG="$2" ;;
        esac
        shift 2
        ;;
      -h | --help)
        usage
        exit 3
        ;;
      *)
        echo "Unknown arg: $1"
        usage
        exit 3
        ;;
    esac
  done
}

parse_args "$@"
|
|
|
|
# Nothing can be checked without kubectl: report UNKNOWN (3).
command -v kubectl >/dev/null 2>&1 || {
  echo "UNKNOWN - kubectl not found"
  exit 3
}
|
|
|
|
# When KUBECONFIG is unset or empty, adopt the first readable well-known
# kubeconfig so that runs under sudo/nagios still succeed.
if [[ -z "${KUBECONFIG:-}" ]]; then
  for candidate in /etc/kubernetes/admin.conf /root/.kube/config; do
    if [[ -r "$candidate" ]]; then
      export KUBECONFIG="$candidate"
      break
    fi
  done
  unset candidate
fi
|
|
|
|
# build_kc: assemble the kubectl command line used by every query.
#
# The kubeconfig is passed explicitly when KUBECONFIG names a single file.
# kubectl's --kubeconfig flag accepts one path only, whereas the KUBECONFIG
# environment variable may carry a colon-separated list; a list is therefore
# not forwarded through the flag and is left for kubectl to read from the
# environment.
# Globals: KUBECONFIG (read), TIMEOUT (read), KC (written, array)
build_kc() {
  KC=(kubectl)
  if [[ -n "${KUBECONFIG:-}" && "${KUBECONFIG}" != *:* ]]; then
    KC+=(--kubeconfig="${KUBECONFIG}")
  fi
  KC+=(--request-timeout="${TIMEOUT}s")
}

build_kc
|
|
|
|
# run_kc: run kubectl (the KC array) with the given arguments.
# Globals:   KC (read)
# Arguments: arguments appended to the kubectl command line
# Outputs:   kubectl's stdout (trailing newlines stripped); kubectl's stderr
#            is discarded on purpose so it cannot pollute the plugin output
# Returns:   kubectl's exit status
run_kc() {
  local out rc=0
  # `|| rc=$?` records the status without tripping errexit, so the captured
  # output is still printed and the status returned when kubectl fails.
  out="$("${KC[@]}" "$@" 2>/dev/null)" || rc=$?
  printf '%s' "$out"
  return "$rc"
}
|
|
|
|
# 1) try the Endpoints resource: any listed address means the service is
# backed, which is reported as OK.
# The status is captured with `|| rc=$?`: under `set -e` a failing
# `var=$(...)` assignment would otherwise end the script before the CRITICAL
# message below could be printed.
rc=0
ep_out=$(run_kc -n "$NAMESPACE" get endpoints "$SERVICE_NAME" \
  -o jsonpath='{.subsets[*].addresses[*].ip}') || rc=$?
if (( rc != 0 )); then
  echo "CRITICAL - kubectl failed to get Endpoints (exit code ${rc})"
  exit 2
fi
if [[ -n "${ep_out// /}" ]]; then
  # addresses are space-separated in the jsonpath output; show them as a list
  echo "OK - service ${SERVICE_NAME} in ${NAMESPACE} has endpoints: ${ep_out// /,}"
  exit 0
fi
|
|
|
|
# 2) try EndpointSlices (k8s >= 1.17), selected through the
# kubernetes.io/service-name label; one output line per endpoint.
# The status is captured with `|| rc=$?`: under `set -e` a failing
# `var=$(...)` assignment would otherwise end the script before the CRITICAL
# message below could be printed.
rc=0
eps_out=$(run_kc -n "$NAMESPACE" get endpointslices \
  -l "kubernetes.io/service-name=${SERVICE_NAME}" \
  -o jsonpath='{range .items[*]}{range .endpoints[*]}{.addresses[*]}{"\n"}{end}{end}') || rc=$?
if (( rc != 0 )); then
  echo "CRITICAL - kubectl failed to get EndpointSlices (exit code ${rc})"
  exit 2
fi
if [[ -n "${eps_out// /}" ]]; then
  # drop blank lines, then join the remaining lines with commas
  tops=$(printf '%s\n' "$eps_out" | sed '/^[[:space:]]*$/d' | tr '\n' ',' | sed 's/,$//')
  echo "OK - service ${SERVICE_NAME} in ${NAMESPACE} has EndpointSlices addresses: ${tops}"
  exit 0
fi
|
|
|
|
# 3) fallback: read the Service's selector and turn it into a label selector
# for the pod queries.
# `range $k, $v := ...` and `printf` are Go template constructs, not kubectl
# JSONPath, so the query is issued with -o go-template; it renders the
# selector map as "key=value," pairs (nothing when there is no selector).
# The status is captured with `|| rc=$?` so that `set -e` cannot end the
# script before the CRITICAL message is printed.
rc=0
svc_out=$(run_kc -n "$NAMESPACE" get svc "$SERVICE_NAME" \
  -o go-template='{{range $k, $v := .spec.selector}}{{printf "%s=%s," $k $v}}{{end}}') || rc=$?
if (( rc != 0 )); then
  echo "CRITICAL - kubectl failed to get Service selector (exit code ${rc})"
  exit 2
fi

# A Service without selector falls back to the configured label.
SEL="${svc_out:-$LABEL_FALLBACK}"
# `kubectl -l` wants comma-separated requirements: normalise ';' separators
# whatever the origin of the selector, then drop the trailing separator.
SEL=${SEL//;/,}
SEL=${SEL%,}
|
|
|
|
# List the pods matching the selector, one "READY NAME" line per pod, where
# READY is the ready flag of the pod's first container status.
# The column spec is quoted so that "[0]" can never be taken as a glob, and
# the status is captured with `|| rc=$?` so that `set -e` cannot end the
# script before the CRITICAL message is printed.
rc=0
pods_out=$(run_kc -n "$NAMESPACE" get pods -l "$SEL" --no-headers \
  -o 'custom-columns=READY:.status.containerStatuses[0].ready,NAME:.metadata.name') || rc=$?
if (( rc != 0 )); then
  echo "CRITICAL - kubectl failed to list pods for selector '${SEL}' (exit code ${rc})"
  exit 2
fi
|
|
|
|
# No pod matched the selector: retry with the other label commonly carried
# by CoreDNS pods (k8s-app=coredns) and, if that matches, use its result.
if [[ -z "${pods_out// /}" ]]; then
  # `|| rc=$?` keeps `set -e` from ending the script on a kubectl failure
  # before the CRITICAL message is printed; the column spec is quoted so
  # that "[0]" can never be taken as a glob.
  rc=0
  pods_alt=$(run_kc -n "$NAMESPACE" get pods -l k8s-app=coredns --no-headers \
    -o 'custom-columns=READY:.status.containerStatuses[0].ready,NAME:.metadata.name') || rc=$?
  if (( rc != 0 )); then
    echo "CRITICAL - kubectl failed to list pods for fallback selector (exit code ${rc})"
    exit 2
  fi
  if [[ -n "${pods_alt// /}" ]]; then
    pods_out="$pods_alt"
    # SEL is only used in messages from here on
    SEL="k8s-app=coredns (fallback)"
  fi
fi
|
|
|
|
# Still no pod after both attempts: report CRITICAL (2).
[[ -n "${pods_out// /}" ]] || {
  echo "CRITICAL - service ${SERVICE_NAME} in ${NAMESPACE} has no endpoints and no pods match selector '${SEL}'"
  exit 2
}
|
|
|
|
# Tally the listed pods and collect the names of those whose first field is
# not a "ready" value (true / True / 1).
total_count=0
not_ready_count=0
not_ready_list=()
while IFS= read -r row; do
  if [[ -n "$row" ]]; then
    total_count=$((total_count + 1))
    # split the row on whitespace: first field = ready flag, second = name
    read -r state name _ <<< "$row"
    case "$state" in
      true | True | 1) ;;
      *)
        not_ready_count=$((not_ready_count + 1))
        not_ready_list+=("$name")
        ;;
    esac
  fi
done <<< "$pods_out"
|
|
|
|
# Final verdict from the pod tally:
#   no pod counted        -> CRITICAL (2)
#   some pod not Ready    -> WARNING (1)
#   every pod Ready       -> OK (0)
if (( total_count == 0 )); then
  echo "CRITICAL - service ${SERVICE_NAME} in ${NAMESPACE} has no endpoints and no pods found for selector '${SEL}'"
  exit 2
elif (( not_ready_count > 0 )); then
  echo "WARNING - service ${SERVICE_NAME} in ${NAMESPACE} has no endpoints, but ${not_ready_count}/${total_count} pods matching selector '${SEL}' are not Ready: ${not_ready_list[*]}"
  exit 1
else
  # Pods exist and are Ready but no Endpoints/EndpointSlices were found ->
  # likely an endpoint controller/roles mismatch; treated as OK, with the
  # situation spelled out in the message.
  echo "OK - service ${SERVICE_NAME} in ${NAMESPACE} has no Endpoints resource but ${total_count} pods matching selector '${SEL}' are Ready (EndpointSlices absent or controller issue)"
  exit 0
fi