#!/usr/bin/env bash
# check_coredns_health
# Checks CoreDNS health (Endpoints + EndpointSlices + pod fallback)
# Exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
#
# Usage:
#  sudo /usr/lib/nagios/plugins/check_coredns_health [--namespace N] [--service NAME] [--label-fallback LABEL] [--kubeconfig PATH]
#
# Strict mode: abort on unhandled errors, on unset variables, and on a
# failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Tunables: each one keeps a non-empty value inherited from the environment
# and otherwise falls back to the default given here.
: "${NAMESPACE:=kube-system}"
: "${SERVICE_NAME:=coredns}"
: "${LABEL_FALLBACK:=k8s-app=kube-dns}"
: "${TIMEOUT:=10}"

#######################################
# Print the command-line synopsis and the current namespace/service defaults.
# Globals:   NAMESPACE (read), SERVICE_NAME (read)
# Arguments: none
# Outputs:   two lines on stdout
#######################################
usage() {
  printf '%s\n' \
    "Usage: $0 [--namespace N] [--service NAME] [--label-fallback LABEL] [--kubeconfig PATH]" \
    "Defaults: namespace=$NAMESPACE service=$SERVICE_NAME"
}

#######################################
# Abort with UNKNOWN (3) when an option that needs a value is the last word
# on the command line. Without this guard, expanding "$2" under `set -u`
# kills the script with an "unbound variable" error and exit status 1,
# which a monitoring system reads as WARNING.
# Arguments: $1 - option name, $2 - number of remaining positional params
# Outputs:   an UNKNOWN line on stdout when the value is missing
#######################################
require_value() {
  if (( $2 < 2 )); then
    echo "UNKNOWN - option $1 requires a value"
    exit 3
  fi
}

# Command-line options override the environment-provided defaults.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --namespace)
      require_value "$1" "$#"
      NAMESPACE="$2"
      shift 2
      ;;
    --service)
      require_value "$1" "$#"
      SERVICE_NAME="$2"
      shift 2
      ;;
    --label-fallback)
      require_value "$1" "$#"
      LABEL_FALLBACK="$2"
      shift 2
      ;;
    --kubeconfig)
      require_value "$1" "$#"
      export KUBECONFIG="$2"
      shift 2
      ;;
    -h|--help)
      # Help is not a health result, so it is reported as UNKNOWN.
      usage
      exit 3
      ;;
    *)
      echo "Unknown arg: $1"
      usage
      exit 3
      ;;
  esac
done

# Nothing can be checked without kubectl on PATH: report UNKNOWN and stop.
command -v kubectl >/dev/null 2>&1 || {
  echo "UNKNOWN - kubectl not found"
  exit 3
}

# When no kubeconfig was supplied, pick the first readable well-known file so
# that runs under sudo/nagios still find cluster credentials. If neither is
# readable, KUBECONFIG stays unset.
if [[ -z "${KUBECONFIG:-}" ]]; then
  for kubeconfig_candidate in "/etc/kubernetes/admin.conf" "/root/.kube/config"; do
    if [[ -r "$kubeconfig_candidate" ]]; then
      export KUBECONFIG="$kubeconfig_candidate"
      break
    fi
  done
fi

# Assemble the kubectl invocation as an array: the binary, an explicit
# --kubeconfig when one is known, and the per-request timeout in seconds.
KC=(kubectl)
if [[ -n "${KUBECONFIG:-}" ]]; then
  KC+=(--kubeconfig="${KUBECONFIG}")
fi
KC+=(--request-timeout="${TIMEOUT}s")

#######################################
# Run kubectl (the KC array) with the given arguments.
# stderr is discarded on purpose so that only kubectl's stdout reaches the
# caller; failures are signalled through the return status alone.
# Globals:   KC (read)
# Arguments: "$@" - arguments appended to the kubectl command line
# Outputs:   kubectl's stdout (trailing newlines stripped) on stdout
# Returns:   kubectl's exit status
#######################################
run_kc() {
  local out rc=0
  # Capture the status with `||`: a bare failing assignment would abort the
  # function under errexit before the output is printed or the code returned.
  out="$("${KC[@]}" "$@" 2>/dev/null)" || rc=$?
  printf '%s' "$out"
  return "$rc"
}

# 1) Endpoints resource: collect the IPs listed under subsets[].addresses.
# The status is captured with `|| rc=$?` because under `set -e` a bare failing
# assignment would terminate the script right here, with kubectl's own exit
# code and no output, before the CRITICAL branch below could run.
rc=0
ep_out=$(run_kc -n "$NAMESPACE" get endpoints "$SERVICE_NAME" -o jsonpath='{.subsets[*].addresses[*].ip}') || rc=$?
if (( rc != 0 )); then
  echo "CRITICAL - kubectl failed to get Endpoints (exit code ${rc})"
  exit 2
fi
# Any non-blank output means at least one address is registered.
if [[ -n "${ep_out// /}" ]]; then
  echo "OK - service ${SERVICE_NAME} in ${NAMESPACE} has endpoints: ${ep_out// /,}"
  exit 0
fi

# 2) EndpointSlices (k8s >= 1.17) labelled for this service.
# Only endpoints whose conditions.ready is true are listed: without the
# filter, slices that contain nothing but not-ready endpoints would be
# reported as OK. Endpoints that fail the filter fall through to the pod
# inspection in step 3.
# The status is captured with `|| rc=$?` so that `set -e` cannot end the
# script silently before the CRITICAL branch runs.
rc=0
eps_out=$(run_kc -n "$NAMESPACE" get endpointslices -l "kubernetes.io/service-name=${SERVICE_NAME}" -o jsonpath='{range .items[*].endpoints[?(@.conditions.ready==true)]}{.addresses[*]}{"\n"}{end}') || rc=$?
if (( rc != 0 )); then
  echo "CRITICAL - kubectl failed to get EndpointSlices (exit code ${rc})"
  exit 2
fi
# Split the output on any whitespace so blank lines and multi-address
# endpoints are handled uniformly, then join the addresses with commas.
eps_addrs=()
read -r -a eps_addrs <<< "${eps_out//$'\n'/ }"
if (( ${#eps_addrs[@]} > 0 )); then
  tops=$(IFS=,; printf '%s' "${eps_addrs[*]}")
  echo "OK - service ${SERVICE_NAME} in ${NAMESPACE} has EndpointSlices addresses: ${tops}"
  exit 0
fi

# 3) Fallback: no address was found above, so read the Service's pod selector
#    and list the pods that match it.
# The selector map is rendered with a Go template: key/value iteration and
# printf are Go-template constructs, not part of kubectl's JSONPath syntax.
# Every kubectl status is captured with `|| rc=$?` so that `set -e` cannot end
# the script silently before the matching CRITICAL branch runs.
rc=0
svc_out=$(run_kc -n "$NAMESPACE" get svc "$SERVICE_NAME" -o go-template='{{range $k, $v := .spec.selector}}{{printf "%s=%s," $k $v}}{{end}}') || rc=$?
if (( rc != 0 )); then
  echo "CRITICAL - kubectl failed to get Service selector (exit code ${rc})"
  exit 2
fi

# A Service without a selector yields an empty string: use the fallback label.
SEL="${svc_out:-$LABEL_FALLBACK}"
# `kubectl -l` takes comma-separated requirements; ';' is accepted as an
# alternative separator, and one trailing separator is dropped.
SEL=${SEL//;/,}
SEL=${SEL%,}

# Quoted so the shell never treats "[0]" as a glob pattern.
pod_columns='custom-columns=READY:.status.containerStatuses[0].ready,NAME:.metadata.name'

# List the pods matching the selector as "<ready> <name>" lines.
rc=0
pods_out=$(run_kc -n "$NAMESPACE" get pods -l "$SEL" --no-headers -o "$pod_columns") || rc=$?
if (( rc != 0 )); then
  echo "CRITICAL - kubectl failed to list pods for selector '${SEL}' (exit code ${rc})"
  exit 2
fi

if [[ -z "${pods_out// /}" ]]; then
  # Nothing matched: try the other label commonly used for CoreDNS pods.
  rc=0
  pods_alt=$(run_kc -n "$NAMESPACE" get pods -l k8s-app=coredns --no-headers -o "$pod_columns") || rc=$?
  if (( rc != 0 )); then
    echo "CRITICAL - kubectl failed to list pods for fallback selector (exit code ${rc})"
    exit 2
  fi
  if [[ -n "${pods_alt// /}" ]]; then
    pods_out="$pods_alt"
    SEL="k8s-app=coredns (fallback)"
  fi
fi

if [[ -z "${pods_out// /}" ]]; then
  echo "CRITICAL - service ${SERVICE_NAME} in ${NAMESPACE} has no endpoints and no pods match selector '${SEL}'"
  exit 2
fi

# Tally the pod listing: every non-empty line is "<ready-flag> <pod-name>".
total_count=0
not_ready_count=0
not_ready_list=()
while IFS= read -r pod_line; do
  if [[ -z "$pod_line" ]]; then
    continue
  fi
  total_count=$((total_count + 1))
  # First whitespace-separated field is the flag, second is the pod name.
  read -r ready_flag pod_name _ <<< "$pod_line"
  case "$ready_flag" in
    true | True | 1)
      ;;
    *)
      not_ready_count=$((not_ready_count + 1))
      not_ready_list+=("$pod_name")
      ;;
  esac
done <<< "$pods_out"

# Verdict, most severe first: no pods at all, then some pods not Ready.
if (( total_count == 0 )); then
  echo "CRITICAL - service ${SERVICE_NAME} in ${NAMESPACE} has no endpoints and no pods found for selector '${SEL}'"
  exit 2
elif (( not_ready_count > 0 )); then
  echo "WARNING - service ${SERVICE_NAME} in ${NAMESPACE} has no endpoints, but ${not_ready_count}/${total_count} pods matching selector '${SEL}' are not Ready: ${not_ready_list[*]}"
  exit 1
fi

# Pods exist and are all Ready although no Endpoints/EndpointSlices address
# was found (likely an endpoint controller or RBAC mismatch): reported as OK,
# with the anomaly spelled out in the message.
echo "OK - service ${SERVICE_NAME} in ${NAMESPACE} has no Endpoints resource but ${total_count} pods matching selector '${SEL}' are Ready (EndpointSlices absent or controller issue)"
exit 0