#!/usr/bin/env bash
# check_cilium_health
# Checks the health of Cilium (pods, daemonsets, operator) and can optionally run the `cilium status -o json` binary as an extra check.
# Retour: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
#
# Usage:
#  sudo /usr/lib/nagios/plugins/check_cilium_health [--namespace N] [--label LABEL] [--warn-not-ready N] [--crit-not-ready M] [--warn-restarts R] [--crit-restarts S] [--use-cilium-cli] [--timeout SECS]
#
# Strict mode: stop on command errors, on unset variables and on a failure
# in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Tunables. Every value except USE_CILIUM_CLI may be preset through the
# environment; the fallback is applied when the variable is unset or empty.
: "${NAMESPACE:=kube-system}"
: "${LABEL:=k8s-app=cilium}"
: "${WARN_NOT_READY:=1}"
: "${CRIT_NOT_READY:=2}"
: "${WARN_RESTARTS:=3}"
: "${CRIT_RESTARTS:=10}"
: "${TIMEOUT:=10}"
# The cilium CLI check is opt-in and always starts disabled.
USE_CILIUM_CLI=0

# Print the command-line help to stdout.
# The thresholds and timeout shown are the current values of the
# corresponding variables at the time of the call.
print_usage() {
  local -a help_lines=(
    "Usage: $0 [options]"
    "Options:"
    "  --namespace N          namespace (default: kube-system)"
    "  --label LABEL          pod label selector (default: \"k8s-app=cilium\")"
    "  --warn-not-ready N     warn if >= N pods not ready (default ${WARN_NOT_READY})"
    "  --crit-not-ready M     critical if >= M pods not ready (default ${CRIT_NOT_READY})"
    "  --warn-restarts R      warn if restartCount >= R per pod (default ${WARN_RESTARTS})"
    "  --crit-restarts S      critical if restartCount >= S per pod (default ${CRIT_RESTARTS})"
    "  --use-cilium-cli       run 'cilium status -o json' as additional check (requires cilium binary)"
    "  --timeout SECS         kubectl timeout in seconds (default ${TIMEOUT})"
    "  -h, --help             show this help"
  )
  printf '%s\n' "${help_lines[@]}"
}

# Parse args
#
# Option values are checked before they are used. Without the check, a
# trailing option that lacks its value makes "$2" an unbound variable under
# `set -u`; the script then aborts with exit status 1, which Nagios reads as
# WARNING. Numeric options are validated here as well so that a typo is
# reported as UNKNOWN instead of breaking the arithmetic comparisons later.

# Exit 3 (UNKNOWN) unless option $1 is followed by a value.
_require_value() {
  if (( $# < 2 )); then
    echo "UNKNOWN - option $1 requires a value"
    exit 3
  fi
}

# Exit 3 (UNKNOWN) unless option $1 is followed by a non-negative integer.
_require_uint() {
  _require_value "$@"
  if [[ ! "$2" =~ ^[0-9]+$ ]]; then
    echo "UNKNOWN - option $1 expects a non-negative integer, got '$2'"
    exit 3
  fi
}

# Apply the command-line options given as arguments to the global settings
# (NAMESPACE, LABEL, the four thresholds, USE_CILIUM_CLI, TIMEOUT).
# Exits 3 on --help, on an unknown option, or on a missing/invalid value.
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      --namespace) _require_value "$@"; NAMESPACE="$2"; shift 2 ;;
      --label) _require_value "$@"; LABEL="$2"; shift 2 ;;
      # 10# forces base 10 so that a value such as "08" is not read as octal.
      --warn-not-ready) _require_uint "$@"; WARN_NOT_READY=$((10#$2)); shift 2 ;;
      --crit-not-ready) _require_uint "$@"; CRIT_NOT_READY=$((10#$2)); shift 2 ;;
      --warn-restarts) _require_uint "$@"; WARN_RESTARTS=$((10#$2)); shift 2 ;;
      --crit-restarts) _require_uint "$@"; CRIT_RESTARTS=$((10#$2)); shift 2 ;;
      --use-cilium-cli) USE_CILIUM_CLI=1; shift 1 ;;
      --timeout) _require_uint "$@"; TIMEOUT=$((10#$2)); shift 2 ;;
      -h|--help) print_usage; exit 3 ;;
      *) echo "Unknown arg: $1"; print_usage; exit 3 ;;
    esac
  done
}

parse_args "$@"

# Both kubectl and python3 are mandatory; report UNKNOWN (3) and stop as soon
# as one of them cannot be resolved by the shell.
command -v kubectl >/dev/null 2>&1 || {
  echo "UNKNOWN - kubectl not found in PATH"
  exit 3
}
command -v python3 >/dev/null 2>&1 || {
  echo "UNKNOWN - python3 not found in PATH (required for JSON parsing)"
  exit 3
}

# ---- kubeconfig handling ----
# If KUBECONFIG is not set, try sensible defaults so sudo/nagios runs succeed.
# Priority:
# 1) env KUBECONFIG if already defined
# 2) /etc/kubernetes/admin.conf if present (common on control-planes)
# 3) /root/.kube/config if present
# 4) fallback to empty (kubectl will then try defaults and may fail)
if [[ -z "${KUBECONFIG:-}" ]]; then
  if [[ -r "/etc/kubernetes/admin.conf" ]]; then
    KUBECONFIG="/etc/kubernetes/admin.conf"
  elif [[ -r "/root/.kube/config" ]]; then
    KUBECONFIG="/root/.kube/config"
  else
    # leave unset; kubectl will attempt defaults
    unset KUBECONFIG || true
  fi
fi

# Pass the kubeconfig to kubectl through the exported environment variable
# instead of a --kubeconfig flag. KC is expanded unquoted by its users, so a
# path containing whitespace embedded in KC would be split into several
# words; and KUBECONFIG may hold a colon-separated list of files, which the
# single-file --kubeconfig flag does not accept.
if [[ -n "${KUBECONFIG:-}" ]]; then
  export KUBECONFIG
fi
KC="kubectl --request-timeout=${TIMEOUT}s"

# Run a small Python program over some input text.
#
# Arguments:
#   $1 - text fed to the program on stdin
#   $2 - Python source code to execute
# Outputs:
#   the program's stdout, only when it succeeded; its stderr is discarded
# Returns:
#   0 on success, otherwise the non-zero status of the pipeline
#
# The program is passed with `-c` instead of being written to a temporary
# file: running a script file puts the file's directory (a world-writable
# temp dir) first on sys.path, so a planted json.py there would be imported.
# `-I` (isolated mode) additionally keeps the current directory and the
# user's site-packages off sys.path and ignores PYTHON* environment
# variables. Nothing is written to disk, so nothing can be left behind when
# the caller runs under `set -e`.
run_python_parser() {
  local input="$1"
  local pyprog="$2"
  local out rc=0
  out=$(printf '%s' "$input" | python3 -I -c "$pyprog" 2>/dev/null) || rc=$?
  if (( rc != 0 )); then
    return "$rc"
  fi
  # Command substitution stripped the trailing newline; restore it, but emit
  # nothing at all when the program printed nothing.
  if [[ -n "$out" ]]; then
    printf '%s\n' "$out"
  fi
  return 0
}

# 1) get pods JSON robustly
# stdout (the JSON document) and stderr (diagnostics) are captured
# separately: kubectl may print warnings on stderr even when it succeeds,
# and mixing them into the JSON would make the document unparsable.
rc_kubectl=0
kubectl_err_file=$(mktemp) || { echo "UNKNOWN - cannot create temporary file"; exit 3; }
pods_json=$($KC -n "$NAMESPACE" get pods -l "$LABEL" -o json 2>"$kubectl_err_file") || rc_kubectl=$?
kubectl_err=$(<"$kubectl_err_file") || kubectl_err=""
rm -f -- "$kubectl_err_file"
if (( rc_kubectl != 0 )); then
  # Prefer the diagnostics; fall back to stdout when stderr was empty.
  kubectl_err=${kubectl_err:-$pods_json}
  echo "CRITICAL - kubectl failed to list Cilium pods: ${kubectl_err//$'\n'/ ' '}"
  exit 2
fi

# 2) parse pods JSON via python (safe invocation)
# The program prints one line per pod:
#   name<TAB>phase<TAB>ready/total<TAB>restarts<TAB>node
# and exits 1 when stdin is not valid JSON.
pod_python_prog=$'import sys,json\ntry:\n    data=json.load(sys.stdin)\nexcept Exception:\n    sys.exit(1)\nitems=data.get(\"items\",[])\nfor it in items:\n    name=it.get(\"metadata\",{}).get(\"name\",\"<noname>\")\n    node=it.get(\"spec\",{}).get(\"nodeName\",\"\")\n    phase=it.get(\"status\",{}).get(\"phase\",\"\")\n    cs=it.get(\"status\",{}).get(\"containerStatuses\",[]) or []\n    total_cont=len(cs)\n    ready_cnt=sum(1 for c in cs if c.get(\"ready\") is True)\n    restarts=sum(int(c.get(\"restartCount\",0) or 0) for c in cs)\n    ready_str = f\"{ready_cnt}/{total_cont}\"\n    print(f\"{name}\\t{phase}\\t{ready_str}\\t{restarts}\\t{node}\")\n'

# Collect every non-empty output line; the array stays empty when the parser
# fails or prints nothing.
pod_lines=()
if pod_out=$(run_python_parser "$pods_json" "$pod_python_prog"); then
  while IFS= read -r pod_row; do
    if [[ -n "$pod_row" ]]; then
      pod_lines+=("$pod_row")
    fi
  done <<< "$pod_out"
fi

# Fallback if parsing failed or empty: use simple kubectl get pods --no-headers
if [[ ${#pod_lines[@]} -eq 0 ]]; then
  # stderr is kept in the capture so it can be shown in the error message
  # below; the row filter in the loop keeps it out of the pod list.
  simple=$($KC -n "$NAMESPACE" get pods -l "$LABEL" --no-headers 2>&1 || true)
  # Columns of the plain listing: NAME READY STATUS RESTARTS AGE
  while read -r name readycol _ restarts _; do
    # Only rows whose READY column looks like "n/m" are pods. This skips
    # blank lines and kubectl diagnostics such as "No resources found ...",
    # which would otherwise be counted as a pod.
    [[ "$readycol" =~ ^[0-9]+/[0-9]+$ ]] || continue
    rnum=${readycol%/*}
    rtot=${readycol#*/}
    if [[ "$rnum" == "$rtot" && "$rtot" != "0" ]]; then
      phase="Running"
    else
      phase="NotReady"
    fi
    # Use the RESTARTS column when it is a plain number, otherwise 0.
    [[ "$restarts" =~ ^[0-9]+$ ]] || restarts=0
    node=""
    # Join with real TAB characters: the evaluation step splits each entry
    # on tabs, so a literal backslash-t would leave the row unsplit.
    printf -v pod_row '%s\t%s\t%s/%s\t%s\t%s' "$name" "$phase" "$rnum" "$rtot" "$restarts" "$node"
    pod_lines+=("$pod_row")
  done <<< "$simple"
  if [[ ${#pod_lines[@]} -eq 0 ]]; then
    echo "CRITICAL - no Cilium pods found or kubectl output unparsable. kubectl output: ${simple//$'\n'/ ' '}"
    exit 2
  fi
fi

# Now evaluate pod_lines
# Each entry is split on tabs into: name, phase, ready/total, restarts, node.
# Results: total_pods, not_ready, not_ready_list and high_restart_pods.
total_pods=0
not_ready=0
not_ready_list=()
high_restart_pods=()

for line in "${pod_lines[@]}"; do
  if [[ -z "$line" ]]; then
    continue
  fi
  total_pods=$((total_pods + 1))
  IFS=$'\t' read -r pname pphase pready prest pnode <<< "$line"

  # Split the "ready/total" pair; an empty half counts as 0.
  ready_num=${pready%/*}
  ready_tot=${pready#*/}
  : "${ready_num:=0}" "${ready_tot:=0}"

  # A pod is unhealthy when it is not Running or has containers not ready.
  pod_unhealthy=0
  if [[ "$pphase" != "Running" ]]; then
    pod_unhealthy=1
  elif (( ready_num < ready_tot )); then
    pod_unhealthy=1
  fi
  if (( pod_unhealthy )); then
    not_ready=$((not_ready + 1))
    not_ready_list+=("${pname}:${pphase}:${pready}")
  fi

  # Tag the pod with the highest restart threshold it reaches, if any.
  : "${prest:=0}"
  restart_level=""
  if (( prest >= CRIT_RESTARTS )); then
    restart_level="CRITICAL"
  elif (( prest >= WARN_RESTARTS )); then
    restart_level="WARN"
  fi
  if [[ -n "$restart_level" ]]; then
    high_restart_pods+=("${pname}:${prest}:${restart_level}")
  fi
done

# DaemonSet check (desired vs ready) using safe python parsing
# ds_desired / ds_ready accumulate the totals over every matching DaemonSet
# and stay at 0 when kubectl fails or its output cannot be parsed.
ds_desired=0
ds_ready=0
rc_ds=0
ds_out=$($KC -n "$NAMESPACE" get ds -l "$LABEL" -o json 2>&1) || rc_ds=$?
if (( rc_ds == 0 )); then
  # Prints "desired<TAB>ready" for each DaemonSet.
  ds_python_prog=$'import sys,json\ndata=json.load(sys.stdin)\nfor it in data.get(\"items\",[]):\n    s=it.get(\"status\",{})\n    desired=int(s.get(\"desiredNumberScheduled\") or 0)\n    ready=int(s.get(\"numberReady\") or 0)\n    print(f\"{desired}\\t{ready}\")\n'
  if ds_out_parsed=$(run_python_parser "$ds_out" "$ds_python_prog"); then
    while IFS=$'\t' read -r ddesired dready; do
      if [[ -z "$ddesired" ]]; then
        continue
      fi
      ds_desired=$((ds_desired + ddesired))
      ds_ready=$((ds_ready + dready))
    done <<< "$ds_out_parsed"
  fi
fi

# cilium-operator deployment check
# op_msg stays empty when the deployment cannot be fetched or parsed; op_ok
# drops to 0 only when fewer replicas are available than requested.
op_ok=1
op_msg=""
op_json=$($KC -n "$NAMESPACE" get deploy cilium-operator -o json 2>/dev/null) || true
if [[ -n "$op_json" ]]; then
  # Prints "replicas<TAB>availableReplicas".
  op_python_prog=$'import sys,json\ndata=json.load(sys.stdin)\nspec=data.get(\"spec\",{})\nstatus=data.get(\"status\",{})\nreplicas=int(spec.get(\"replicas\") or 1)\navailable=int(status.get(\"availableReplicas\") or 0)\nprint(f\"{replicas}\\t{available}\")\n'
  if op_line=$(run_python_parser "$op_json" "$op_python_prog"); then
    IFS=$'\t' read -r op_repl op_avail <<< "$op_line"
    # The message text is the same whether or not the operator is healthy.
    op_msg="operator available=${op_avail}/${op_repl}"
    if (( op_avail < op_repl )); then
      op_ok=0
    fi
  fi
fi

# Optional: cilium CLI
cilium_ok=1
cilium_summary=""

# Run `cilium status -o json` and record the outcome.
# Globals written:
#   cilium_ok      - 1 when the command succeeded, 0 when it failed or the
#                    binary is missing
#   cilium_summary - failure reason, or the output flattened to one line and
#                    cut to 300 characters
check_cilium_cli() {
  local raw rc=0
  if ! command -v cilium >/dev/null 2>&1; then
    cilium_ok=0
    cilium_summary="cilium binary not in PATH"
    return 0
  fi
  # Capture the status with `|| rc=$?`. Appending `|| true` to the
  # assignment would reset $? to 0 and hide every failure of the command.
  raw=$(cilium status -o json 2>&1) || rc=$?
  if (( rc != 0 )); then
    cilium_ok=0
    cilium_summary="cilium status failed: ${raw//$'\n'/ ' '}"
  else
    cilium_ok=1
    cilium_summary=$(printf '%s' "$raw" | tr '\n' ' ' | sed 's/  */ /g' | cut -c1-300)
  fi
}

if (( USE_CILIUM_CLI == 1 )); then
  check_cilium_cli
fi

# Compose status
# `code` is the worst Nagios state found so far (0=OK, 1=WARNING,
# 2=CRITICAL); `msgs` collects one message per check, in order.
code=0
msgs=()

# Raise the overall state to $1 unless it is already at least that severe.
_raise_state() {
  if (( code < $1 )); then
    code=$1
  fi
}

# Pod readiness against the not-ready thresholds.
pods_ratio="${not_ready}/${total_pods}"
if (( not_ready >= CRIT_NOT_READY )); then
  _raise_state 2
  msgs+=("CRITICAL - ${pods_ratio} pods not ready")
elif (( not_ready >= WARN_NOT_READY )); then
  _raise_state 1
  msgs+=("WARNING - ${pods_ratio} pods not ready")
else
  msgs+=("OK - ${total_pods} pods, not-ready=${not_ready}")
fi

# DaemonSets: the shortfall is judged against the critical pod threshold.
ds_missing=$((ds_desired - ds_ready))
if (( ds_desired > 0 && ds_missing > 0 )); then
  if (( ds_missing >= CRIT_NOT_READY )); then
    _raise_state 2
    msgs+=("CRITICAL - daemonsets ready=${ds_ready}/${ds_desired}")
  else
    _raise_state 1
    msgs+=("WARNING - daemonsets ready=${ds_ready}/${ds_desired}")
  fi
fi

# Operator: reported only when its deployment could be inspected.
if [[ -n "$op_msg" ]]; then
  if (( op_ok == 0 )); then
    _raise_state 2
    msgs+=("CRITICAL - ${op_msg}")
  else
    msgs+=("${op_msg}")
  fi
fi

# Restart counts: one critical entry is enough to make the check critical.
if (( ${#high_restart_pods[@]} > 0 )); then
  crit_restart=0
  warn_restart=0
  for r in "${high_restart_pods[@]}"; do
    case "$r" in
      *":CRITICAL") crit_restart=1 ;;
      *":WARN") warn_restart=1 ;;
    esac
  done
  if (( crit_restart == 1 )); then
    _raise_state 2
    msgs+=("CRITICAL - pods with high restart counts: ${high_restart_pods[*]}")
  elif (( warn_restart == 1 )); then
    _raise_state 1
    msgs+=("WARNING - pods with elevated restarts: ${high_restart_pods[*]}")
  fi
fi

# Optional cilium CLI result.
if (( USE_CILIUM_CLI == 1 )); then
  if (( cilium_ok == 0 )); then
    _raise_state 2
    msgs+=("CRITICAL - cilium-cli: ${cilium_summary}")
  else
    msgs+=("cilium-cli ok: ${cilium_summary}")
  fi
fi

# Append the comma-separated list of pods that are not ready.
if (( not_ready > 0 )); then
  printf -v truncated '%s, ' "${not_ready_list[@]}"
  truncated=${truncated%, }
  msgs+=("not-ready-list: ${truncated}")
fi

# Print all messages on one line separated by " ; " and exit with the
# overall state. "${msgs[*]}" cannot produce that separator: it joins with
# only the first character of IFS, which would be a single space here.
printf -v status_line '%s ; ' "${msgs[@]}"
printf '%s\n' "${status_line% ; }"
exit "${code}"