#!/usr/bin/env bash
# check_k8s_deployments
# Checks Kubernetes Deployments and reports those where status.availableReplicas < spec.replicas
# Exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
#
# Usage:
#  sudo /usr/lib/nagios/plugins/check_k8s_deployments [--warn N] [--crit M] [--ignore-ns ns1,ns2] [--namespaces ns1,ns2] [--age-min MINUTES]
#
# Examples:
#  sudo /usr/lib/nagios/plugins/check_k8s_deployments --crit 1
#  sudo /usr/lib/nagios/plugins/check_k8s_deployments --ignore-ns kube-system,monitoring
#
# Strict mode: abort on unhandled errors, on unset variables and on
# failures anywhere in a pipeline.
set -o errexit -o nounset -o pipefail

# Thresholds, overridable through the environment:
#   WARN - number of failing deployments that triggers WARNING
#   CRIT - number of failing deployments that triggers CRITICAL
#          (default 1, i.e. any failing deployment is CRITICAL)
: "${WARN:=0}"
: "${CRIT:=1}"

# Comma separated namespace lists; empty means "no filter".
IGNORE_NS=""
INCLUDE_NS=""

# Minimum deployment age in minutes; 0 disables the age filter.
AGE_MIN=0

# Print the command-line synopsis and the option summary on stdout.
# The script name shown in the synopsis is taken from $0.
print_usage() {
  printf '%s\n' \
    "Usage: $0 [--warn N] [--crit M] [--ignore-ns ns1,ns2] [--namespaces ns1,ns2] [--age-min MINUTES]" \
    ' --warn N         : seuil warn si >=N déploiements en erreur (default 0)' \
    ' --crit M         : seuil crit si >=M déploiements en erreur (default 1)' \
    ' --ignore-ns LIST : comma separated namespaces to ignore (default none)' \
    ' --namespaces LIST: comma separated namespaces to check only (default all)' \
    ' --age-min N      : ignore deployments created less than N minutes ago (avoid flapping during rollout)'
}

# --- Command-line parsing -------------------------------------------------

# require_value OPTION ARGC
# Exit with UNKNOWN (3) when OPTION is the last word on the command line.
# Without this guard, reading "$2" would trip `set -u` and abort the plugin
# with a bash error instead of a proper plugin status.
require_value() {
  if (( $2 < 2 )); then
    echo "UNKNOWN - option $1 requires a value"
    exit 3
  fi
}

# require_uint OPTION VALUE
# Exit with UNKNOWN (3) unless VALUE consists only of decimal digits.
# The thresholds and the age are later used in arithmetic contexts, where
# anything else would raise a shell error.
require_uint() {
  if [[ ! "$2" =~ ^[0-9]+$ ]]; then
    echo "UNKNOWN - option $1 expects a non-negative integer, got '$2'"
    exit 3
  fi
}

# parse_args ARGS...
# Reads the options and stores their values in the globals
# WARN, CRIT, IGNORE_NS, INCLUDE_NS and AGE_MIN.
# Exits 3 on --help, on an unknown option, on a missing value and on a
# non-numeric threshold/age.
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      --warn)
        require_value "$1" "$#"
        require_uint "$1" "$2"
        # 10# forces base 10 so that "08" is not parsed as invalid octal.
        WARN=$((10#$2))
        shift 2
        ;;
      --crit)
        require_value "$1" "$#"
        require_uint "$1" "$2"
        CRIT=$((10#$2))
        shift 2
        ;;
      --ignore-ns)
        require_value "$1" "$#"
        IGNORE_NS="$2"
        shift 2
        ;;
      --namespaces)
        require_value "$1" "$#"
        INCLUDE_NS="$2"
        shift 2
        ;;
      --age-min)
        require_value "$1" "$#"
        require_uint "$1" "$2"
        AGE_MIN=$((10#$2))
        shift 2
        ;;
      -h|--help)
        print_usage
        exit 3
        ;;
      *)
        echo "Unknown arg: $1"
        print_usage
        exit 3
        ;;
    esac
  done
}

parse_args "$@"

# Everything below shells out to kubectl; if it cannot be resolved the
# check reports UNKNOWN (exit 3).
command -v kubectl >/dev/null 2>&1 || {
  echo "UNKNOWN - kubectl not found"
  exit 3
}

# Build the regex filters used for namespace inclusion/exclusion.

# build_ns_pattern LIST
# Turn a comma separated namespace list into an alternation of anchored
# patterns, e.g. "a,b" -> "^a$|^b$", written to stdout without a trailing
# newline. An empty LIST yields an empty pattern (meaning "no filter").
# Whitespace around each entry is trimmed so that "a, b" works as expected
# instead of producing "^ b$", which can never match.
# Entries are inserted into the regex as-is (no escaping).
build_ns_pattern() {
  local list=$1
  local pattern="" entry
  local -a entries=()

  [[ -n "$list" ]] || return 0

  IFS=',' read -ra entries <<< "$list"
  for entry in "${entries[@]}"; do
    # Strip leading, then trailing, whitespace.
    entry="${entry#"${entry%%[![:space:]]*}"}"
    entry="${entry%"${entry##*[![:space:]]}"}"
    # Prepend "|" only when the pattern already holds an alternative.
    pattern+="${pattern:+|}^${entry}\$"
  done

  printf '%s' "$pattern"
}

ignore_pattern=$(build_ns_pattern "$IGNORE_NS")
include_pattern=$(build_ns_pattern "$INCLUDE_NS")

# Result collection.
# The array is initialised up front so that expanding it is safe under
# `set -u` even when no deployment fails.
failures=()

# jsonpath template: one line per Deployment with the tab separated fields
# namespace, name, spec.replicas, status.availableReplicas, creationTimestamp.
deploy_jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.name}{"\t"}{.spec.replicas}{"\t"}{.status.availableReplicas}{"\t"}{.metadata.creationTimestamp}{"\n"}{end}'

# A failing kubectl (unreachable API server, bad credentials, ...) must not
# be mistaken for "no deployment is failing": report UNKNOWN instead of
# silently continuing with an empty list.
kubectl_rc=0
deploy_rows=$(kubectl get deploy -A -o jsonpath="$deploy_jsonpath" 2>/dev/null) || kubectl_rc=$?
if (( kubectl_rc != 0 )); then
  echo "UNKNOWN - kubectl get deploy failed (exit code ${kubectl_rc})"
  exit 3
fi

now_s=$(date +%s)

# A column can be empty (the available count is defaulted to 0 below).
# Tab counts as IFS whitespace, so `read` would merge two adjacent tabs and
# shift the following columns; splitting on a non-whitespace separator
# keeps empty columns in place.
tab=$'\t'
field_sep=$'\x1f'

while IFS= read -r row; do
  # skip empty lines
  [[ -n "$row" ]] || continue

  IFS="$field_sep" read -r ns name desired available created \
    <<< "${row//$tab/$field_sep}"

  # normalize: a missing count is treated as 0
  desired=${desired:-0}
  available=${available:-0}

  # namespace filtering: keep only included namespaces, then drop ignored ones
  if [[ -n "$include_pattern" ]] && ! [[ "$ns" =~ $include_pattern ]]; then
    continue
  fi
  if [[ -n "$ignore_pattern" ]] && [[ "$ns" =~ $ignore_pattern ]]; then
    continue
  fi

  # age filtering
  if [[ -n "$created" ]] && (( AGE_MIN > 0 )); then
    # Convert to epoch seconds; an unparsable timestamp becomes 0, which
    # makes the deployment look old, so it is still checked.
    created_s=$(date -d "$created" +%s 2>/dev/null || echo 0)
    age_min=$(( (now_s - created_s) / 60 ))
    if (( age_min < AGE_MIN )); then
      # skip new deployments (they might be still rolling out)
      continue
    fi
  fi

  if (( available < desired )); then
    failures+=("${ns}/${name} (desired=${desired},available=${available})")
  fi
done <<< "$deploy_rows"

# Translate the number of failing deployments into the plugin status line
# and exit code. CRIT is evaluated before WARN, so CRITICAL wins when both
# thresholds are reached.
failed_total=${#failures[@]}

if (( failed_total == 0 )); then
  summary="OK - all deployments report desired==available"
  exit_code=0
elif (( failed_total >= CRIT )); then
  summary="CRITICAL - ${failed_total} deployments not available: ${failures[*]}"
  exit_code=2
elif (( failed_total >= WARN )); then
  summary="WARNING - ${failed_total} deployments not available: ${failures[*]}"
  exit_code=1
else
  # Some deployments are failing, but fewer than either threshold.
  summary="OK - ${failed_total} deployments not available but below thresholds"
  exit_code=0
fi

printf '%s\n' "$summary"
exit "$exit_code"