Files
nrpe/files/nrpe/check_k8s_jobs_cronjobs
T
2025-12-31 15:17:51 +01:00

232 lines
7.5 KiB
Bash
Executable File

#!/usr/bin/env bash
# check_k8s_jobs_cronjobs
# Vérifie l'état des Kubernetes Jobs et CronJobs.
# Exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
#
# Fonctions principales :
# - détecte Jobs avec des échecs (.status.failed > 0) ou des Jobs "actifs" trop vieux
# - recherche d'événements récents (type=Warning) liés aux Jobs dans les X dernières minutes
# - vérifie pour les CronJobs que lastScheduleTime n'est pas trop ancien (configurable) si non suspendu
#
# Usage (exemples) :
# sudo /usr/lib/nagios/plugins/check_k8s_jobs_cronjobs --crit 1 --recent-minutes 5
# sudo /usr/lib/nagios/plugins/check_k8s_jobs_cronjobs --ignore-ns kube-system --cron-max-age 120
#
# Strict mode: abort on unhandled errors, on use of unset variables, and on
# failures anywhere inside a pipeline.
set -o errexit -o nounset -o pipefail

# Defaults
# Numeric settings keep a non-empty value inherited from the environment and
# otherwise fall back to the built-in default.
: "${WARN:=0}"
: "${CRIT:=1}"
: "${AGE_MIN:=60}"
: "${RECENT_MINUTES:=5}"
: "${CRON_MAX_AGE_MIN:=60}"
# These settings always start from a fixed value, whatever the environment holds.
IGNORE_NS=""
INCLUDE_NS=""
CHECK_CRON=1
# print_usage
# Print the command-line help on stdout.
# The help text is a runtime string and is kept in its original language.
# It lists --no-cron, which is the switch the option parser actually accepts
# to turn the CronJob checks off (they are enabled by default).
print_usage() {
  cat <<EOF
Usage: $0 [options]
Options:
--warn N seuil WARN si >= N objets en erreur (default 0)
--crit M seuil CRIT si >= M objets en erreur (default 1)
--ignore-ns ns1,ns2 namespaces à ignorer
--namespaces ns1,ns2 limiter aux namespaces donnés (comma separated)
--age-min MINUTES considérer un job "actif" normal si démarré moins de MINUTES (default 60)
--recent-minutes MIN chercher événements de Job (Warning) dans les MIN dernières minutes (default 5)
--no-cron désactiver la vérification des CronJobs (activée par défaut)
--cron-max-age MINUTES si lastScheduleTime > MINUTES => alerter (default 60). Mettre 0 pour désactiver.
-h, --help : affiche l'aide
EOF
}
# Parse args
# Every option that takes a value is checked for the presence of that value
# first: under `set -u` a missing "$2" would otherwise abort the script with
# exit status 1, which a Nagios-style caller interprets as WARNING.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --warn | --crit | --ignore-ns | --namespaces | --age-min | --recent-minutes | --cron-max-age)
      if [[ $# -lt 2 ]]; then
        echo "UNKNOWN - option $1 requires a value"
        exit 3
      fi
      case "$1" in
        --warn) WARN="$2" ;;
        --crit) CRIT="$2" ;;
        --ignore-ns) IGNORE_NS="$2" ;;
        --namespaces) INCLUDE_NS="$2" ;;
        --age-min) AGE_MIN="$2" ;;
        --recent-minutes) RECENT_MINUTES="$2" ;;
        --cron-max-age) CRON_MAX_AGE_MIN="$2" ;;
      esac
      shift 2
      ;;
    # Cron checking is already the default; --check-cron is accepted for
    # compatibility so that passing it explicitly is not an error.
    --check-cron)
      CHECK_CRON=1
      shift
      ;;
    --no-cron)
      CHECK_CRON=0
      shift
      ;;
    -h | --help)
      print_usage
      exit 3
      ;;
    *)
      echo "Unknown arg: $1"
      print_usage
      exit 3
      ;;
  esac
done

# All numeric settings (whether they came from the command line or from the
# environment) are later used inside (( )) arithmetic, so reject anything
# that is not a plain non-negative integer and report it as UNKNOWN.
for opt_name in WARN CRIT AGE_MIN RECENT_MINUTES CRON_MAX_AGE_MIN; do
  if [[ ! "${!opt_name}" =~ ^[0-9]+$ ]]; then
    echo "UNKNOWN - ${opt_name} must be a non-negative integer (got '${!opt_name}')"
    exit 3
  fi
  # Force base 10 so that a value such as "08" is not parsed as invalid octal.
  printf -v "$opt_name" '%d' "$((10#${!opt_name}))"
done
# Preflight: without a kubectl binary on PATH nothing can be checked, so
# report UNKNOWN right away.
command -v kubectl >/dev/null 2>&1 || {
  echo "UNKNOWN - kubectl not found"
  exit 3
}
# Build namespace filters (regex)

# _build_ns_pattern LIST
# Turn a comma-separated namespace list into an anchored ERE alternation
# ("ns1,ns2" -> "^ns1$|^ns2$") written to stdout.
# Whitespace around each entry is trimmed and empty entries are skipped, so
# "a, b" or a trailing comma do not produce alternatives that can never match
# a namespace (or that match the empty string).
# Arguments: $1 - comma-separated list (may be empty)
# Outputs:   the pattern, or nothing when the list has no usable entry
# Returns:   0 always
_build_ns_pattern() {
  local list="${1-}" pattern="" entry
  local -a entries=()
  [[ -n "$list" ]] || return 0
  IFS=',' read -ra entries <<< "$list"
  for entry in "${entries[@]}"; do
    # Strip leading, then trailing, whitespace.
    entry="${entry#"${entry%%[![:space:]]*}"}"
    entry="${entry%"${entry##*[![:space:]]}"}"
    [[ -n "$entry" ]] || continue
    pattern="${pattern:+${pattern}|}^${entry}\$"
  done
  printf '%s' "$pattern"
  return 0
}

# Namespaces matching ignore_pattern are skipped; when include_pattern is
# non-empty only namespaces matching it are considered.
ignore_pattern=$(_build_ns_pattern "${IGNORE_NS}")
include_pattern=$(_build_ns_pattern "${INCLUDE_NS}")
# ns_allowed NAMESPACE
# Decide whether a namespace passes the include/ignore filters.
# Uses bash's builtin ERE matching instead of piping through `egrep`, which
# avoids two forks per call and the "egrep is obsolescent" warning printed by
# recent GNU grep releases.
# Globals:   include_pattern (read) - ERE; when non-empty the namespace must match
#            ignore_pattern  (read) - ERE; when non-empty the namespace must not match
# Arguments: $1 - namespace name
# Returns:   0 if the namespace should be inspected, 1 if it is filtered out
ns_allowed() {
  local ns="${1-}"
  # `! [[ ... ]]` (rather than `[[ ! ... ]]`) keeps an invalid include pattern
  # behaving as "nothing matches", i.e. the namespace is rejected.
  if [[ -n "$include_pattern" ]] && ! [[ "$ns" =~ $include_pattern ]]; then
    return 1
  fi
  if [[ -n "$ignore_pattern" ]] && [[ "$ns" =~ $ignore_pattern ]]; then
    return 1
  fi
  return 0
}
# Reference time (epoch seconds) used by every age computation below.
now_s=$(date +%s)
# Initialize problems array safely: one human-readable entry per issue found.
problems=()

# ---------------------------
# 1) Inspect Jobs
# ---------------------------
# One line per Job, tab-separated.
# Fields: namespace, name, active, succeeded, failed, startTime, completionTime
# A failing kubectl call must not be mistaken for "there are no Jobs" (which
# would end in an OK result), so it is reported as UNKNOWN.
if ! jobs_raw=$(kubectl get jobs -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.name}{"\t"}{.status.active}{"\t"}{.status.succeeded}{"\t"}{.status.failed}{"\t"}{.status.startTime}{"\t"}{.status.completionTime}{"\n"}{end}' 2>/dev/null); then
  echo "UNKNOWN - kubectl get jobs failed"
  exit 3
fi

# Read line by line instead of looping over an array: an empty result then
# needs no array expansion (which older bash rejects under `set -u`).
while IFS= read -r line; do
  [[ -n "$line" ]] || continue

  # Tab is an IFS *whitespace* character, so `read` would merge consecutive
  # tabs and silently drop empty fields. Swap tabs for '|' (never part of the
  # values handled here) to keep every column in place.
  fields=${line//$'\t'/|}
  IFS='|' read -r ns name active _ failed start _ <<< "$fields"

  # defaults: absent status counters are printed as empty strings
  active=${active:-0}
  failed=${failed:-0}

  if ! ns_allowed "$ns"; then
    continue
  fi

  # 1.a) Jobs with failures
  if ((failed > 0)); then
    problems+=("Job ${ns}/${name} failedCount=${failed}")
    continue
  fi

  # 1.b) Active jobs running too long
  if ((active > 0)); then
    if [[ -n "$start" && "$start" != "null" ]]; then
      # convert start timestamp to epoch (GNU date); 0 means "could not parse"
      # and such a Job is left alone.
      start_s=$(date -d "$start" +%s 2>/dev/null || echo 0)
      if ((start_s > 0)); then
        age_min=$(((now_s - start_s) / 60))
        if ((age_min >= AGE_MIN)); then
          problems+=("Job ${ns}/${name} active for ${age_min}min >= ${AGE_MIN}min")
        fi
      fi
    else
      # no start time but active >0 -> flag
      problems+=("Job ${ns}/${name} active but no startTime recorded")
    fi
  fi
done <<< "$jobs_raw"
# 1.c) Recent Job warning events (type=Warning) in last RECENT_MINUTES
# Setting RECENT_MINUTES to 0 skips this section entirely.
if ((RECENT_MINUTES > 0)); then
  # get events for Jobs (type Warning) with columns:
  # namespace, involvedObject.name, lastTimestamp, reason, message
  # A failing kubectl call is reported as UNKNOWN instead of being treated as
  # "no events".
  if ! events_raw=$(kubectl get events --all-namespaces --field-selector involvedObject.kind=Job,type=Warning -o custom-columns='NAMESPACE:.metadata.namespace,NAME:.involvedObject.name,LAST:.lastTimestamp,REASON:.reason,MESSAGE:.message' --no-headers 2>/dev/null); then
    echo "UNKNOWN - kubectl get events failed"
    exit 3
  fi

  # Events whose timestamp is older than this epoch value are ignored.
  cutoff_s=$((now_s - RECENT_MINUTES * 60))

  while IFS= read -r ev; do
    [[ -n "$ev" ]] || continue

    # Columns are whitespace-separated; only the first three are needed, the
    # rest (reason and free-text message) goes into the throwaway variable.
    IFS=$' \t' read -r ns name last _ <<< "$ev"

    if ! ns_allowed "$ns"; then
      continue
    fi

    # "<none>" is what the custom-columns output shows for a missing value.
    if [[ -n "$last" && "$last" != "<none>" ]]; then
      # An unparsable timestamp becomes 0 and therefore never passes the cutoff.
      ts=$(date -d "$last" +%s 2>/dev/null || echo 0)
      if ((ts >= cutoff_s)); then
        problems+=("Job event Warning ${ns}/${name} at $last")
      fi
    fi
  done <<< "$events_raw"
fi
# ---------------------------
# 2) Inspect CronJobs (optional)
# ---------------------------
# Skipped when CHECK_CRON is 0 or when CRON_MAX_AGE_MIN is 0.
if ((CHECK_CRON == 1)) && ((CRON_MAX_AGE_MIN > 0)); then
  # One line per CronJob, tab-separated.
  # Fields: namespace, name, suspend (true/false/null), lastScheduleTime
  # A failing kubectl call is reported as UNKNOWN instead of being treated as
  # "there are no CronJobs".
  if ! cron_raw=$(kubectl get cronjob -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.name}{"\t"}{.spec.suspend}{"\t"}{.status.lastScheduleTime}{"\n"}{end}' 2>/dev/null); then
    echo "UNKNOWN - kubectl get cronjob failed"
    exit 3
  fi

  while IFS= read -r line; do
    [[ -n "$line" ]] || continue

    # Tab is an IFS whitespace character, so `read` would collapse empty
    # columns; split on '|' instead to keep each field in its position.
    fields=${line//$'\t'/|}
    IFS='|' read -r ns name suspend last <<< "$fields"

    if ! ns_allowed "$ns"; then
      continue
    fi

    # If suspended, do not consider as problem
    if [[ "$suspend" == "true" ]]; then
      continue
    fi

    if [[ -z "$last" || "$last" == "null" ]]; then
      # Never scheduled yet: warn (useful to detect misconfigured cronjobs)
      problems+=("CronJob ${ns}/${name} has no lastScheduleTime (never scheduled?)")
      continue
    fi

    # convert to epoch (GNU date); 0 means the timestamp could not be parsed
    last_s=$(date -d "$last" +%s 2>/dev/null || echo 0)
    if ((last_s > 0)); then
      age_min=$(((now_s - last_s) / 60))
      if ((age_min > CRON_MAX_AGE_MIN)); then
        problems+=("CronJob ${ns}/${name} lastSchedule ${age_min}min ago > ${CRON_MAX_AGE_MIN}min")
      fi
    else
      problems+=("CronJob ${ns}/${name} lastScheduleTime unparsable: ${last}")
    fi
  done <<< "$cron_raw"
fi
# ---------------------------
# Final decision & output
# ---------------------------
# Exit status follows the plugin convention: 0=OK, 1=WARNING, 2=CRITICAL.
count=${#problems[@]}

# Nothing collected: plain OK.
((count > 0)) || {
  echo "OK - Jobs/CronJobs checks passed"
  exit 0
}

# Severity decision: the CRIT threshold is tested first, then WARN; a count
# below both still ends as OK.
if ((count >= CRIT)); then
  label="CRITICAL"
  rc=2
elif ((count >= WARN)); then
  label="WARNING"
  rc=1
else
  echo "OK - ${count} problems found but below thresholds"
  exit 0
fi

echo "${label} - ${count} problems found: ${problems[*]}"
exit "$rc"