Files
nrpe/files/nrpe/check_coredns_health
T
2025-12-31 15:17:51 +01:00

158 lines
5.2 KiB
Bash
Executable File

#!/usr/bin/env bash
# check_coredns_health
# Checks CoreDNS health (Endpoints, then EndpointSlices, then a pod-readiness fallback)
# Return codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
#
# Usage:
# sudo /usr/lib/nagios/plugins/check_coredns_health [--namespace N] [--service NAME] [--label-fallback LABEL] [--kubeconfig PATH]
#
# Strict mode: abort on unhandled errors, on unset variables, and on a failure
# in any stage of a pipeline.
set -o errexit -o nounset -o pipefail
# Tunables. Each keeps a non-empty value inherited from the environment and
# otherwise falls back to the default shown here.
: "${NAMESPACE:=kube-system}"
: "${SERVICE_NAME:=coredns}"
: "${LABEL_FALLBACK:=k8s-app=kube-dns}"
: "${TIMEOUT:=10}"
# usage
# Prints the two-line help text (synopsis, then the current namespace/service
# defaults) to stdout.
# Globals: NAMESPACE, SERVICE_NAME (read)
usage() {
  printf '%s\n' \
    "Usage: $0 [--namespace N] [--service NAME] [--label-fallback LABEL] [--kubeconfig PATH]" \
    "Defaults: namespace=$NAMESPACE service=$SERVICE_NAME"
}
# require_opt_value OPT ARGC
# Guard for options that take a value. ARGC is the number of positional
# parameters still to be parsed, counting OPT itself. When no value follows
# OPT, prints an UNKNOWN status line and exits 3. Without this guard the
# expansion of "$2" trips `set -u`, and the script dies with a bash error and
# exit status 1, which Nagios reads as WARNING.
require_opt_value() {
  if (( $2 < 2 )); then
    echo "UNKNOWN - option $1 requires a value"
    exit 3
  fi
}
# Command-line options override the environment-provided defaults.
# --help and unrecognised arguments both exit 3 (UNKNOWN).
while [[ $# -gt 0 ]]; do
  case "$1" in
    --namespace)
      require_opt_value "$1" "$#"
      NAMESPACE="$2"
      shift 2
      ;;
    --service)
      require_opt_value "$1" "$#"
      SERVICE_NAME="$2"
      shift 2
      ;;
    --label-fallback)
      require_opt_value "$1" "$#"
      LABEL_FALLBACK="$2"
      shift 2
      ;;
    --kubeconfig)
      require_opt_value "$1" "$#"
      export KUBECONFIG="$2"
      shift 2
      ;;
    -h|--help)
      usage
      exit 3
      ;;
    *)
      echo "Unknown arg: $1"
      usage
      exit 3
      ;;
  esac
done
# kubectl is a hard requirement: without it the check reports UNKNOWN.
command -v kubectl >/dev/null 2>&1 || {
  echo "UNKNOWN - kubectl not found"
  exit 3
}
# When KUBECONFIG is unset or empty, use the first readable default so that
# runs under sudo/nagios still find a config: admin.conf first, then root's
# own config. The second test is skipped once the first has set KUBECONFIG.
if [[ -z "${KUBECONFIG:-}" && -r "/etc/kubernetes/admin.conf" ]]; then
  export KUBECONFIG="/etc/kubernetes/admin.conf"
fi
if [[ -z "${KUBECONFIG:-}" && -r "/root/.kube/config" ]]; then
  export KUBECONFIG="/root/.kube/config"
fi
# KC holds the kubectl invocation prefix: the binary, an explicit --kubeconfig
# when one is known, and the per-request timeout in seconds.
KC=(kubectl)
if [[ -n "${KUBECONFIG:-}" ]]; then
  KC+=(--kubeconfig="${KUBECONFIG}")
fi
KC+=(--request-timeout="${TIMEOUT}s")
# run_kc ARGS...
# Runs the kubectl command prefix stored in the KC array with ARGS appended.
# Globals: KC (read)
# Outputs: the command's stdout, with trailing newlines stripped by the
#          command substitution; stderr is discarded so kubectl diagnostics
#          never end up in the plugin's status line.
# Returns: the command's exit status.
run_kc() {
  local out rc=0
  # Capture the status with `|| rc=$?`. A bare `out=$(...)` followed by
  # `rc=$?` aborts under errexit before anything is printed or returned when
  # this function is called outside a command substitution.
  out="$("${KC[@]}" "$@" 2>/dev/null)" || rc=$?
  printf '%s' "$out"
  return "$rc"
}
# 1) Endpoints resource: the jsonpath selects .subsets[*].addresses[*].ip,
#    printed space-separated.
# The exit status is captured with `|| rc=$?`. Under `set -e`, a bare
# `var=$(cmd)` followed by `rc=$?` aborts the script on failure with kubectl's
# own exit code and no output, so the CRITICAL branch below was unreachable.
rc=0
ep_out=$(run_kc -n "$NAMESPACE" get endpoints "$SERVICE_NAME" -o jsonpath='{.subsets[*].addresses[*].ip}') || rc=$?
if (( rc != 0 )); then
  echo "CRITICAL - kubectl failed to get Endpoints (exit code ${rc})"
  exit 2
fi
if [[ -n "${ep_out// /}" ]]; then
  # Space-separated IPs -> comma-separated list for the status line.
  echo "OK - service ${SERVICE_NAME} in ${NAMESPACE} has endpoints: ${ep_out// /,}"
  exit 0
fi
# 2) EndpointSlices (k8s >= 1.17) labelled with the service name.
# Only endpoints whose conditions.ready is true are counted. Listing every
# endpoint would report OK when all backends are unready and would keep the
# pod-readiness fallback below from ever running.
# NOTE(review): an endpoint with an unset `ready` condition is skipped by this
# filter; confirm that is acceptable for the clusters being monitored.
# The exit status is captured with `|| rc=$?` because under `set -e` a bare
# `var=$(cmd)` followed by `rc=$?` aborts before the CRITICAL line is printed.
rc=0
eps_out=$(run_kc -n "$NAMESPACE" get endpointslices -l "kubernetes.io/service-name=${SERVICE_NAME}" -o jsonpath='{.items[*].endpoints[?(@.conditions.ready==true)].addresses[*]}') || rc=$?
if (( rc != 0 )); then
  echo "CRITICAL - kubectl failed to get EndpointSlices (exit code ${rc})"
  exit 2
fi
# Split the whitespace-separated addresses into an array; empty output gives
# an empty array.
eps_addrs=()
read -r -a eps_addrs <<< "${eps_out//$'\n'/ }"
if (( ${#eps_addrs[@]} > 0 )); then
  # Join with commas inside a subshell so the IFS change does not leak.
  tops=$(IFS=,; printf '%s' "${eps_addrs[*]}")
  echo "OK - service ${SERVICE_NAME} in ${NAMESPACE} has EndpointSlices addresses: ${tops}"
  exit 0
fi
# 3) Fallback: read the Service's selector and list the pods matching it.
# The selector is rendered as "k1=v1,k2=v2," with a Go template. The
# `range $k,$v := ...` / `printf` constructs are Go-template syntax, which
# kubectl's `-o jsonpath` output does not support.
# Every kubectl status below is captured with `|| rc=$?`. Under `set -e`, a
# bare `var=$(cmd)` followed by `rc=$?` aborts the script before the CRITICAL
# message can be printed.
rc=0
svc_out=$(run_kc -n "$NAMESPACE" get svc "$SERVICE_NAME" -o go-template='{{range $k, $v := .spec.selector}}{{printf "%s=%s," $k $v}}{{end}}') || rc=$?
if (( rc != 0 )); then
  echo "CRITICAL - kubectl failed to get Service selector (exit code ${rc})"
  exit 2
fi
# A Service without a selector falls back to the configured label.
SEL=${svc_out:-$LABEL_FALLBACK}
# Label selectors are comma-separated: accept ';' as a separator in the
# fallback label and drop the trailing separator left by the template.
SEL=${SEL//;/,}
SEL=${SEL%,}
# Quoted so the shell can never glob-expand the "[0]" subscript.
pod_columns='custom-columns=READY:.status.containerStatuses[0].ready,NAME:.metadata.name'
# get pods by selector
rc=0
pods_out=$(run_kc -n "$NAMESPACE" get pods -l "$SEL" --no-headers -o "$pod_columns") || rc=$?
if (( rc != 0 )); then
  echo "CRITICAL - kubectl failed to list pods for selector '${SEL}' (exit code ${rc})"
  exit 2
fi
if [[ -z "${pods_out// /}" ]]; then
  # try alternative labels common for CoreDNS (k8s-app=coredns)
  rc=0
  pods_alt=$(run_kc -n "$NAMESPACE" get pods -l k8s-app=coredns --no-headers -o "$pod_columns") || rc=$?
  if (( rc != 0 )); then
    echo "CRITICAL - kubectl failed to list pods for fallback selector (exit code ${rc})"
    exit 2
  fi
  if [[ -n "${pods_alt// /}" ]]; then
    pods_out="$pods_alt"
    SEL="k8s-app=coredns (fallback)"
  fi
fi
if [[ -z "${pods_out// /}" ]]; then
  echo "CRITICAL - service ${SERVICE_NAME} in ${NAMESPACE} has no endpoints and no pods match selector '${SEL}'"
  exit 2
fi
# Tally the pods listed in pods_out. Each line is "<ready-flag> <pod-name>",
# and a pod counts as Ready only when the flag is true, True or 1.
seen=0
unready=0
unready_pods=()
while IFS= read -r row; do
  if [[ -z "$row" ]]; then
    continue
  fi
  seen=$((seen + 1))
  # Split on whitespace: first field is the flag, second the pod name.
  read -r flag pod _ <<< "$row"
  case "$flag" in
    true|True|1) ;;
    *)
      unready=$((unready + 1))
      unready_pods+=("$pod")
      ;;
  esac
done <<< "$pods_out"
# Verdict: no pods -> CRITICAL, some unready -> WARNING, all ready -> OK.
if (( seen == 0 )); then
  echo "CRITICAL - service ${SERVICE_NAME} in ${NAMESPACE} has no endpoints and no pods found for selector '${SEL}'"
  exit 2
elif (( unready > 0 )); then
  echo "WARNING - service ${SERVICE_NAME} in ${NAMESPACE} has no endpoints, but ${unready}/${seen} pods matching selector '${SEL}' are not Ready: ${unready_pods[*]}"
  exit 1
fi
# Ready pods but neither Endpoints nor EndpointSlices: likely an endpoint
# controller or RBAC mismatch. This is reported as OK, with the anomaly
# spelled out in the status text.
echo "OK - service ${SERVICE_NAME} in ${NAMESPACE} has no Endpoints resource but ${seen} pods matching selector '${SEL}' are Ready (EndpointSlices absent or controller issue)"
exit 0