#!/usr/bin/env bash
# check_k8s_jobs_cronjobs
# Vérifie l'état des Kubernetes Jobs et CronJobs.
# Exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
#
# Fonctions principales :
#  - détecte Jobs avec des échecs (.status.failed > 0) ou des Jobs "actifs" trop vieux
#  - recherche d'événements récents (type=Warning) liés aux Jobs dans les X dernières minutes
#  - vérifie pour les CronJobs que lastScheduleTime n'est pas trop ancien (configurable) si non suspendu
#
# Usage (exemples) :
#  sudo /usr/lib/nagios/plugins/check_k8s_jobs_cronjobs --crit 1 --recent-minutes 5
#  sudo /usr/lib/nagios/plugins/check_k8s_jobs_cronjobs --ignore-ns kube-system --cron-max-age 120
#
set -euo pipefail

# Default settings.
# Thresholds and time windows keep a non-empty value already present in the
# environment; otherwise the built-in default is assigned. Command-line
# options parsed further down override all of them.
: "${WARN:=0}"              # problem count at which WARNING is raised
: "${CRIT:=1}"              # problem count at which CRITICAL is raised
: "${AGE_MIN:=60}"          # minutes an active Job may run before being flagged
: "${RECENT_MINUTES:=5}"    # look-back window for Job warning events
: "${CRON_MAX_AGE_MIN:=60}" # maximum age in minutes of a CronJob's last schedule

# Namespace filters and the CronJob switch always start from fixed values.
IGNORE_NS=''
INCLUDE_NS=''
CHECK_CRON=1

# Print the command-line help on stdout.
# $0 is expanded in the here-document so the usage line shows the path the
# plugin was invoked with. The CronJob entry documents --no-cron, which is the
# switch the option parser implements (the check is on by default).
print_usage() {
  cat <<EOF
Usage: $0 [options]
Options:
  --warn N                seuil WARN si >= N objets en erreur (default 0)
  --crit M                seuil CRIT si >= M objets en erreur (default 1)
  --ignore-ns ns1,ns2     namespaces à ignorer
  --namespaces ns1,ns2    limiter aux namespaces donnés (comma separated)
  --age-min MINUTES       considérer un job "actif" normal si démarré moins de MINUTES (default 60)
  --recent-minutes MIN    chercher événements de Job (Warning) dans les MIN dernières minutes (default 5)
  --no-cron               désactiver la vérification des CronJobs (activée par défaut)
  --cron-max-age MINUTES  si lastScheduleTime > MINUTES => alerter (default 60). Mettre 0 pour désactiver.
  -h, --help              : affiche l'aide
EOF
}

# Parse args
#
# Every usage error ends the plugin with exit code 3 (UNKNOWN) and a one-line
# message, so that a typo on the command line is never reported to the
# monitoring system as WARNING/CRITICAL.

# Exit 3 unless option $1 is followed by a value.
# $2 is the number of arguments still to parse, the option itself included.
require_value() {
  if (( $2 < 2 )); then
    echo "UNKNOWN - option $1 requires a value"
    exit 3
  fi
}

# Check that every numeric setting holds a non-negative integer and rewrite it
# in plain base 10. Settings that are unset or empty are left untouched.
validate_numeric_settings() {
  local var val
  for var in WARN CRIT AGE_MIN RECENT_MINUTES CRON_MAX_AGE_MIN; do
    val=${!var-}
    [[ -n "$val" ]] || continue
    if [[ ! "$val" =~ ^[0-9]+$ ]]; then
      echo "UNKNOWN - ${var} must be a non-negative integer (got '${val}')"
      exit 3
    fi
    # Force base 10: a leading zero (e.g. 08) would otherwise be read as an
    # invalid octal number by the arithmetic tests that use these settings.
    printf -v "$var" '%d' "$((10#$val))"
  done
}

# Apply the options given in "$@" to the global settings.
# --check-cron explicitly enables the CronJob check (already the default);
# --no-cron disables it. -h/--help and unknown options print the usage text
# and exit 3.
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      --warn)           require_value "$1" "$#"; WARN="$2"; shift 2 ;;
      --crit)           require_value "$1" "$#"; CRIT="$2"; shift 2 ;;
      --ignore-ns)      require_value "$1" "$#"; IGNORE_NS="$2"; shift 2 ;;
      --namespaces)     require_value "$1" "$#"; INCLUDE_NS="$2"; shift 2 ;;
      --age-min)        require_value "$1" "$#"; AGE_MIN="$2"; shift 2 ;;
      --recent-minutes) require_value "$1" "$#"; RECENT_MINUTES="$2"; shift 2 ;;
      --cron-max-age)   require_value "$1" "$#"; CRON_MAX_AGE_MIN="$2"; shift 2 ;;
      --check-cron)     CHECK_CRON=1; shift ;;
      --no-cron)        CHECK_CRON=0; shift ;;
      -h|--help)        print_usage; exit 3 ;;
      *)                echo "Unknown arg: $1"; print_usage; exit 3 ;;
    esac
  done
  # Covers values taken from the environment as well as from the options.
  validate_numeric_settings
}

parse_args "$@"

# Every check in this plugin goes through kubectl; when it cannot be found
# the only meaningful status is UNKNOWN.
command -v kubectl >/dev/null 2>&1 || {
  echo "UNKNOWN - kubectl not found"
  exit 3
}

# Build namespace filters (regex)

# Turn the comma-separated namespace list $1 into a single alternation of
# anchored extended regular expressions ("^ns1$|^ns2$"), printed on stdout
# without a trailing newline.
# Whitespace around an entry is trimmed and empty entries are dropped, so
# "a, b," produces the same pattern as "a,b". Entries are inserted verbatim
# (not escaped), so an entry may itself be a regular expression.
build_ns_pattern() {
  local list="$1" pattern="" entry
  local -a entries=()
  IFS=',' read -ra entries <<<"$list"
  # The ${arr[@]+...} form keeps an empty array from tripping `set -u`.
  for entry in ${entries[@]+"${entries[@]}"}; do
    entry="${entry#"${entry%%[![:space:]]*}"}" # strip leading whitespace
    entry="${entry%"${entry##*[![:space:]]}"}" # strip trailing whitespace
    [[ -n "$entry" ]] || continue
    pattern+="${pattern:+|}^${entry}\$"
  done
  printf '%s' "$pattern"
}

# Namespaces to skip; an empty pattern means "skip nothing".
ignore_pattern=""
if [[ -n "${IGNORE_NS:-}" ]]; then
  ignore_pattern=$(build_ns_pattern "$IGNORE_NS")
fi

# Namespaces to keep; an empty pattern means "keep everything".
include_pattern=""
if [[ -n "${INCLUDE_NS:-}" ]]; then
  include_pattern=$(build_ns_pattern "$INCLUDE_NS")
fi

# Tell whether namespace $1 passes the namespace filters.
# Globals read: include_pattern, ignore_pattern (extended regular expressions;
#               an empty pattern disables the corresponding filter).
# Returns 0 when the namespace must be inspected, 1 when it is filtered out.
#
# Matching uses bash's own [[ =~ ]] (extended regex syntax) rather than
# piping through egrep: no process is forked per call, a namespace value that
# looks like an echo option cannot be mangled, and the "egrep is obsolescent"
# warning printed by recent GNU grep is avoided.
ns_allowed() {
  local ns="$1"
  if [[ -n "$include_pattern" ]]; then
    # A pattern that fails to compile also rejects the namespace.
    [[ "$ns" =~ $include_pattern ]] || return 1
  fi
  if [[ -n "$ignore_pattern" ]]; then
    if [[ "$ns" =~ $ignore_pattern ]]; then
      return 1
    fi
  fi
  return 0
}

# Current time in epoch seconds, taken once so that every age computed during
# this run is measured against the same instant.
now_s="$(date '+%s')"

# List of problem descriptions collected by the checks; starts out empty.
declare -a problems=()

# ---------------------------
# 1) Inspect Jobs
# ---------------------------
# kubectl prints one tab-separated line per Job:
#   namespace, name, active, failed, startTime
# (only the fields the checks below use are requested).
#
# The output is captured before being parsed so that a failing kubectl call
# is reported as UNKNOWN instead of being read as "there are no Jobs", which
# would end in a false OK.
if ! jobs_raw=$(kubectl get jobs -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.name}{"\t"}{.status.active}{"\t"}{.status.failed}{"\t"}{.status.startTime}{"\n"}{end}' 2>/dev/null); then
  echo "UNKNOWN - kubectl get jobs failed"
  exit 3
fi

# Tab counts as IFS whitespace, so `read` would merge consecutive tabs and
# shift the columns whenever a field is empty. Replacing tabs with a
# non-whitespace separator keeps empty fields in place, and a single `read`
# replaces one awk process per field.
field_sep=$'\x1f'

while IFS= read -r line; do
  [[ -n "$line" ]] || continue
  fields=${line//$'\t'/$field_sep}
  IFS="$field_sep" read -r ns name active failed start _ <<<"$fields"

  # Counters missing from the Job status come back empty; count them as 0.
  active=${active:-0}
  failed=${failed:-0}

  ns_allowed "$ns" || continue

  # 1.a) Jobs with failures
  if (( failed > 0 )); then
    problems+=("Job ${ns}/${name} failedCount=${failed}")
    continue
  fi

  # 1.b) Active jobs running too long
  (( active > 0 )) || continue

  if [[ -z "$start" || "$start" == "null" ]]; then
    # Active pods but no recorded start time: flag it.
    problems+=("Job ${ns}/${name} active but no startTime recorded")
    continue
  fi

  # Convert the start timestamp to epoch seconds (GNU date). 0 stands for a
  # timestamp that could not be parsed; the age check is then skipped.
  start_s=$(date -d "$start" +%s 2>/dev/null || echo 0)
  if (( start_s > 0 )); then
    age_min=$(( (now_s - start_s) / 60 ))
    if (( age_min >= AGE_MIN )); then
      problems+=("Job ${ns}/${name} active for ${age_min}min >= ${AGE_MIN}min")
    fi
  fi
done <<<"$jobs_raw"

# 1.c) Recent Job warning events (type=Warning) in last RECENT_MINUTES

# Examine one line of the event listing and record a problem when the event is
# recent enough.
# Arguments: $1 - whitespace-separated line: namespace, Job name, lastTimestamp
#            $2 - cutoff in epoch seconds; older events are ignored
# Globals:   problems (appended to); ns_allowed is used for namespace filtering
# Always returns 0, so it is safe to call under `set -e`.
check_job_event_line() {
  local ev="$1" cutoff="$2"
  local ev_ns ev_name ev_last ts
  # Default IFS splitting matches the column layout printed by kubectl.
  read -r ev_ns ev_name ev_last _ <<<"$ev"
  [[ -n "$ev_ns" ]] || return 0
  ns_allowed "$ev_ns" || return 0
  # kubectl prints <none> when the event carries no lastTimestamp.
  [[ -n "$ev_last" && "$ev_last" != "<none>" ]] || return 0
  # An unparsable timestamp becomes 0 and therefore never passes the cutoff.
  ts=$(date -d "$ev_last" +%s 2>/dev/null) || ts=0
  if (( ts >= cutoff )); then
    problems+=("Job event Warning ${ev_ns}/${ev_name} at $ev_last")
  fi
  return 0
}

if (( RECENT_MINUTES > 0 )); then
  # Only the three columns that are parsed are requested: the free-text
  # reason/message columns are not needed and could add lines that do not
  # follow the column layout.
  # A failing kubectl call is reported as UNKNOWN rather than being read as
  # "no warning events".
  if ! events_raw=$(kubectl get events --all-namespaces --field-selector involvedObject.kind=Job,type=Warning -o custom-columns='NAMESPACE:.metadata.namespace,NAME:.involvedObject.name,LAST:.lastTimestamp' --no-headers 2>/dev/null); then
    echo "UNKNOWN - kubectl get events failed"
    exit 3
  fi
  cutoff_s=$(( now_s - RECENT_MINUTES * 60 ))
  while IFS= read -r ev; do
    check_job_event_line "$ev" "$cutoff_s"
  done <<<"$events_raw"
fi

# ---------------------------
# 2) Inspect CronJobs (optional)
# ---------------------------

# Examine one line of the CronJob listing and record a problem when the
# CronJob looks stuck.
# Arguments: $1 - tab-separated line: namespace, name, suspend, lastScheduleTime
# Globals:   now_s, CRON_MAX_AGE_MIN (read); problems (appended to);
#            ns_allowed is used for namespace filtering
# Always returns 0, so it is safe to call under `set -e`.
check_cronjob_line() {
  local line="$1"
  local sep=$'\x1f' fields cj_ns cj_name cj_suspend cj_last last_s age_min
  [[ -n "$line" ]] || return 0

  # Tab counts as IFS whitespace, so `read` would merge consecutive tabs and
  # shift the columns when a field is empty. Splitting on a non-whitespace
  # separator keeps empty fields in place.
  fields=${line//$'\t'/$sep}
  IFS="$sep" read -r cj_ns cj_name cj_suspend cj_last _ <<<"$fields"

  ns_allowed "$cj_ns" || return 0

  # If suspended, do not consider as problem
  [[ "$cj_suspend" != "true" ]] || return 0

  if [[ -z "$cj_last" || "$cj_last" == "null" ]]; then
    # Never scheduled yet: report it (useful to detect misconfigured cronjobs)
    problems+=("CronJob ${cj_ns}/${cj_name} has no lastScheduleTime (never scheduled?)")
    return 0
  fi

  # 0 stands for a timestamp GNU date could not parse.
  last_s=$(date -d "$cj_last" +%s 2>/dev/null) || last_s=0
  if (( last_s > 0 )); then
    age_min=$(( (now_s - last_s) / 60 ))
    if (( age_min > CRON_MAX_AGE_MIN )); then
      problems+=("CronJob ${cj_ns}/${cj_name} lastSchedule ${age_min}min ago > ${CRON_MAX_AGE_MIN}min")
    fi
  else
    problems+=("CronJob ${cj_ns}/${cj_name} lastScheduleTime unparsable: ${cj_last}")
  fi
  return 0
}

if (( CHECK_CRON == 1 )) && (( CRON_MAX_AGE_MIN > 0 )); then
  # Fields: namespace, name, suspend (true/false/empty), lastScheduleTime.
  # A failing kubectl call is reported as UNKNOWN rather than being read as
  # "there are no CronJobs".
  if ! cron_raw=$(kubectl get cronjob -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.name}{"\t"}{.spec.suspend}{"\t"}{.status.lastScheduleTime}{"\n"}{end}' 2>/dev/null); then
    echo "UNKNOWN - kubectl get cronjob failed"
    exit 3
  fi

  while IFS= read -r line; do
    check_cronjob_line "$line"
  done <<<"$cron_raw"
fi

# ---------------------------
# Final decision & output
# ---------------------------
# The number of collected problems is compared with the CRIT threshold first,
# then with WARN; the plugin prints one status line and exits with the
# matching code (0=OK, 1=WARNING, 2=CRITICAL).
total=${#problems[@]}

# Nothing was recorded: plain OK.
(( total > 0 )) || {
  echo "OK - Jobs/CronJobs checks passed"
  exit 0
}

# Severity decision
if (( total >= CRIT )); then
  verdict="CRITICAL"
  rc=2
elif (( total >= WARN )); then
  verdict="WARNING"
  rc=1
else
  # Problems exist but neither threshold is reached.
  echo "OK - ${total} problems found but below thresholds"
  exit 0
fi

echo "${verdict} - ${total} problems found: ${problems[*]}"
exit "$rc"