#!/usr/bin/env bash
# check_k8s_jobs_cronjobs
# Nagios-style plugin that checks the state of Kubernetes Jobs and CronJobs.
# Exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
#
# Main features:
#  - detects Jobs with failures (.status.failed > 0) and "active" Jobs that
#    have been running for too long
#  - looks for recent events (type=Warning) attached to Jobs within the last
#    X minutes
#  - for CronJobs that are not suspended, checks that lastScheduleTime is not
#    too old (configurable)
#
# Environment variables WARN, CRIT, AGE_MIN, RECENT_MINUTES and
# CRON_MAX_AGE_MIN provide defaults that the command-line options override.
#
# Requires: kubectl, GNU date (for `date -d`).
#
# Usage (examples):
#   sudo /usr/lib/nagios/plugins/check_k8s_jobs_cronjobs --crit 1 --recent-minutes 5
#   sudo /usr/lib/nagios/plugins/check_k8s_jobs_cronjobs --ignore-ns kube-system --cron-max-age 120
#
set -euo pipefail

# Defaults
WARN=${WARN:-0}
CRIT=${CRIT:-1}
IGNORE_NS=""
INCLUDE_NS=""
AGE_MIN=${AGE_MIN:-60}
RECENT_MINUTES=${RECENT_MINUTES:-5}
CHECK_CRON=1
CRON_MAX_AGE_MIN=${CRON_MAX_AGE_MIN:-60}

# Print the help text on stdout.
print_usage() {
  cat <<EOF
Usage: $0 [options]
  --warn N               seuil WARN si >= N objets en erreur (default 0)
  --crit M               seuil CRIT si >= M objets en erreur (default 1)
  --ignore-ns ns1,ns2    namespaces à ignorer
  --namespaces ns1,ns2   limiter aux namespaces donnés (comma separated)
  --age-min MINUTES      considérer un job "actif" normal si démarré moins de MINUTES (default 60)
  --recent-minutes MIN   chercher événements de Job (Warning) dans les MIN dernières minutes (default 5)
  --check-cron           activer la vérification des CronJobs (default ON)
  --no-cron              désactiver la vérification des CronJobs
  --cron-max-age MINUTES si lastScheduleTime > MINUTES => alerter (default 60). Mettre 0 pour désactiver.
  -h, --help : affiche l'aide
EOF
}

# Print an UNKNOWN status line on stdout and exit 3.
# Arguments: message words.
unknown() {
  echo "UNKNOWN - $*"
  exit 3
}

# Abort with UNKNOWN when an option that takes a value is the last argument.
# Without this, `set -u` would kill the script with exit status 1, which the
# monitoring system would read as WARNING.
# Arguments: the remaining command-line arguments ("$@" of the caller).
require_value() {
  (( $# >= 2 )) || unknown "option $1 requires a value"
}

# Convert a timestamp understood by GNU `date -d` to epoch seconds.
# Outputs: epoch seconds, or 0 when the timestamp cannot be parsed.
to_epoch() {
  date -d "$1" +%s 2>/dev/null || echo 0
}

# Turn a comma separated namespace list into an anchored ERE alternation,
# e.g. "a,b" -> "^a$|^b$". Empty entries are skipped.
# Arguments: $1 - comma separated list
# Outputs:   the pattern on stdout (empty if the list has no entries)
build_ns_pattern() {
  local csv=$1 pattern="" entry
  local -a entries=()
  IFS=',' read -ra entries <<< "$csv"
  for entry in "${entries[@]}"; do
    [[ -n "$entry" ]] || continue
    pattern+="|^${entry}\$"
  done
  printf '%s' "${pattern#|}"
}

# Parse args
while [[ $# -gt 0 ]]; do
  case "$1" in
    --warn)           require_value "$@"; WARN="$2"; shift 2 ;;
    --crit)           require_value "$@"; CRIT="$2"; shift 2 ;;
    --ignore-ns)      require_value "$@"; IGNORE_NS="$2"; shift 2 ;;
    --namespaces)     require_value "$@"; INCLUDE_NS="$2"; shift 2 ;;
    --age-min)        require_value "$@"; AGE_MIN="$2"; shift 2 ;;
    --recent-minutes) require_value "$@"; RECENT_MINUTES="$2"; shift 2 ;;
    --check-cron)     CHECK_CRON=1; shift 1 ;;
    --no-cron)        CHECK_CRON=0; shift 1 ;;
    --cron-max-age)   require_value "$@"; CRON_MAX_AGE_MIN="$2"; shift 2 ;;
    -h|--help)        print_usage; exit 3 ;;
    *)                echo "Unknown arg: $1"; print_usage; exit 3 ;;
  esac
done

# Every threshold is later used inside (( ... )); reject anything that is not
# a plain non-negative integer and normalise leading zeros so that e.g. "08"
# is not interpreted as an invalid octal number.
for opt_name in WARN CRIT AGE_MIN RECENT_MINUTES CRON_MAX_AGE_MIN; do
  opt_value=${!opt_name}
  if [[ ! "$opt_value" =~ ^[0-9]+$ ]]; then
    unknown "${opt_name} must be a non-negative integer (got '${opt_value}')"
  fi
  printf -v "$opt_name" '%d' "$((10#$opt_value))"
done

if ! command -v kubectl >/dev/null 2>&1; then
  echo "UNKNOWN - kubectl not found"
  exit 3
fi

# Build namespace filters (regex)
ignore_pattern=""
if [[ -n "$IGNORE_NS" ]]; then
  ignore_pattern=$(build_ns_pattern "$IGNORE_NS")
fi

include_pattern=""
if [[ -n "$INCLUDE_NS" ]]; then
  include_pattern=$(build_ns_pattern "$INCLUDE_NS")
fi

# Decide whether a namespace must be inspected.
# Globals:   include_pattern, ignore_pattern (read)
# Arguments: $1 - namespace name
# Returns:   0 if the namespace passes the include filter (when one is set)
#            and is not matched by the ignore filter; 1 otherwise.
ns_allowed() {
  local ns="$1"
  if [[ -n "$include_pattern" ]]; then
    # The right-hand side is deliberately unquoted so it is used as a regex.
    if ! [[ "$ns" =~ $include_pattern ]]; then
      return 1
    fi
  fi
  if [[ -n "$ignore_pattern" ]]; then
    if [[ "$ns" =~ $ignore_pattern ]]; then
      return 1
    fi
  fi
  return 0
}

now_s=$(date +%s)

# Human-readable description of every problem found.
problems=()

# ---------------------------
# 1) Inspect Jobs
# ---------------------------
# Fields: namespace, name, active, succeeded, failed, startTime, completionTime
# A kubectl failure must not be reported as "OK": stop with UNKNOWN instead.
if ! jobs_raw=$(kubectl get jobs -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.name}{"\t"}{.status.active}{"\t"}{.status.succeeded}{"\t"}{.status.failed}{"\t"}{.status.startTime}{"\t"}{.status.completionTime}{"\n"}{end}' 2>/dev/null); then
  unknown "kubectl get jobs failed"
fi

while IFS= read -r line; do
  [[ -n "$line" ]] || continue

  # Tab is an IFS-whitespace character, so `read` would merge consecutive
  # tabs and lose empty fields. Map tabs to '|' first; the fields split here
  # are not expected to contain that character.
  IFS='|' read -r ns name active _ failed start _ <<< "${line//$'\t'/|}"

  # Absent counters are rendered as empty strings: treat them as 0.
  [[ "$active" =~ ^[0-9]+$ ]] || active=0
  [[ "$failed" =~ ^[0-9]+$ ]] || failed=0

  ns_allowed "$ns" || continue

  # 1.a) Jobs with failures
  if (( failed > 0 )); then
    problems+=("Job ${ns}/${name} failedCount=${failed}")
    continue
  fi

  # 1.b) Active jobs running too long
  if (( active > 0 )); then
    if [[ -n "$start" && "$start" != "null" ]]; then
      start_s=$(to_epoch "$start")
      # An unparsable start time (0) is skipped, not reported.
      if (( start_s > 0 )); then
        age_min=$(( (now_s - start_s) / 60 ))
        if (( age_min >= AGE_MIN )); then
          problems+=("Job ${ns}/${name} active for ${age_min}min >= ${AGE_MIN}min")
        fi
      fi
    else
      # no start time but active > 0 -> flag
      problems+=("Job ${ns}/${name} active but no startTime recorded")
    fi
  fi
done <<< "$jobs_raw"

# 1.c) Recent Job warning events (type=Warning) in last RECENT_MINUTES
if (( RECENT_MINUTES > 0 )); then
  # Columns: namespace, involvedObject.name, lastTimestamp, reason, message
  if ! events_raw=$(kubectl get events --all-namespaces --field-selector involvedObject.kind=Job,type=Warning -o custom-columns='NAMESPACE:.metadata.namespace,NAME:.involvedObject.name,LAST:.lastTimestamp,REASON:.reason,MESSAGE:.message' --no-headers 2>/dev/null); then
    unknown "kubectl get events failed"
  fi

  cutoff_s=$(( now_s - RECENT_MINUTES * 60 ))
  while IFS= read -r ev; do
    [[ -n "$ev" ]] || continue

    # Only the first three whitespace separated columns are needed; the
    # rest of the line goes to the throw-away variable.
    read -r ns name last _ <<< "$ev"

    ns_allowed "$ns" || continue

    # custom-columns prints "<none>" for a missing value.
    if [[ -n "$last" && "$last" != "<none>" ]]; then
      ts=$(to_epoch "$last")
      if (( ts >= cutoff_s )); then
        problems+=("Job event Warning ${ns}/${name} at $last")
      fi
    fi
  done <<< "$events_raw"
fi

# ---------------------------
# 2) Inspect CronJobs (optional)
# ---------------------------
if (( CHECK_CRON == 1 )) && (( CRON_MAX_AGE_MIN > 0 )); then
  # Fields: namespace, name, suspend (true/false/empty), lastScheduleTime
  if ! cron_raw=$(kubectl get cronjob -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.name}{"\t"}{.spec.suspend}{"\t"}{.status.lastScheduleTime}{"\n"}{end}' 2>/dev/null); then
    unknown "kubectl get cronjob failed"
  fi

  while IFS= read -r line; do
    [[ -n "$line" ]] || continue

    IFS='|' read -r ns name suspend last <<< "${line//$'\t'/|}"

    ns_allowed "$ns" || continue

    # If suspended, do not consider as problem
    if [[ "$suspend" == "true" ]]; then
      continue
    fi

    if [[ -z "$last" || "$last" == "null" ]]; then
      # Never scheduled yet: report it (useful to detect misconfigured cronjobs)
      problems+=("CronJob ${ns}/${name} has no lastScheduleTime (never scheduled?)")
      continue
    fi

    last_s=$(to_epoch "$last")
    if (( last_s > 0 )); then
      age_min=$(( (now_s - last_s) / 60 ))
      if (( age_min > CRON_MAX_AGE_MIN )); then
        problems+=("CronJob ${ns}/${name} lastSchedule ${age_min}min ago > ${CRON_MAX_AGE_MIN}min")
      fi
    else
      problems+=("CronJob ${ns}/${name} lastScheduleTime unparsable: ${last}")
    fi
  done <<< "$cron_raw"
fi

# ---------------------------
# Final decision & output
# ---------------------------
count=${#problems[@]}
if (( count == 0 )); then
  echo "OK - Jobs/CronJobs checks passed"
  exit 0
fi

# Severity decision: CRIT is tested first, so it wins when both thresholds
# are reached.
if (( count >= CRIT )); then
  echo "CRITICAL - ${count} problems found: ${problems[*]}"
  exit 2
elif (( count >= WARN )); then
  echo "WARNING - ${count} problems found: ${problems[*]}"
  exit 1
else
  echo "OK - ${count} problems found but below thresholds"
  exit 0
fi