#!/usr/bin/env bash
# check_k8s_jobs_cronjobs
# Checks the state of Kubernetes Jobs and CronJobs.
# Exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
#
# Main features:
#   - detects Jobs with failures (.status.failed > 0) or "active" Jobs that
#     have been running for too long
#   - looks for recent events (type=Warning) attached to Jobs within the
#     last X minutes
#   - for CronJobs that are not suspended, checks that lastScheduleTime is
#     not too old (configurable)
#
# Usage (examples):
#   sudo /usr/lib/nagios/plugins/check_k8s_jobs_cronjobs --crit 1 --recent-minutes 5
#   sudo /usr/lib/nagios/plugins/check_k8s_jobs_cronjobs --ignore-ns kube-system --cron-max-age 120
#
set -euo pipefail

# Defaults. The numeric thresholds can be pre-seeded from the environment;
# command-line options override them later.
WARN=${WARN:-0}                          # WARNING when >= WARN problems
CRIT=${CRIT:-1}                          # CRITICAL when >= CRIT problems
IGNORE_NS=""                             # comma-separated namespaces to skip
INCLUDE_NS=""                            # comma-separated namespaces to keep
AGE_MIN=${AGE_MIN:-60}                   # max age (minutes) of an active Job
RECENT_MINUTES=${RECENT_MINUTES:-5}      # look-back window for Warning events
CHECK_CRON=1                             # 1 = inspect CronJobs, 0 = skip them
CRON_MAX_AGE_MIN=${CRON_MAX_AGE_MIN:-60} # max age (minutes) of lastScheduleTime

#######################################
# Print the command-line help to stdout.
# The here-doc delimiter is unquoted, so $0 expands to the invoked path.
# Arguments: none
# Outputs:   usage text on stdout
#######################################
print_usage() {
  cat <<EOF
Usage: $0 [options]
Options:
--warn N seuil WARN si >= N objets en erreur (default 0)
--crit M seuil CRIT si >= M objets en erreur (default 1)
--ignore-ns ns1,ns2 namespaces à ignorer
--namespaces ns1,ns2 limiter aux namespaces donnés (comma separated)
--age-min MINUTES considérer un job "actif" normal si démarré moins de MINUTES (default 60)
--recent-minutes MIN chercher événements de Job (Warning) dans les MIN dernières minutes (default 5)
--check-cron activer la vérification des CronJobs (default ON)
--no-cron désactiver la vérification des CronJobs
--cron-max-age MINUTES si lastScheduleTime > MINUTES => alerter (default 60). Mettre 0 pour désactiver.
-h, --help : affiche l'aide
EOF
}

# Parse args
#######################################
# Parse the command line into the option globals.
# Globals:   WARN, CRIT, IGNORE_NS, INCLUDE_NS, AGE_MIN, RECENT_MINUTES,
#            CHECK_CRON, CRON_MAX_AGE_MIN (written)
# Arguments: the script's positional parameters
# Outputs:   a one-line message on stdout when the arguments are invalid
# Exits:     3 (UNKNOWN) on --help, unknown option, missing option value
#            or non-numeric threshold
#######################################
parse_args() {
  local opt var
  while (( $# > 0 )); do
    opt="$1"
    case "$opt" in
      --warn|--crit|--ignore-ns|--namespaces|--age-min|--recent-minutes|--cron-max-age)
        # Without this guard "$2" is unbound under `set -u` and the script
        # dies with status 1, which the monitoring system reads as WARNING.
        if (( $# < 2 )); then
          echo "UNKNOWN - option ${opt} requires a value"
          exit 3
        fi
        case "$opt" in
          --warn) WARN="$2" ;;
          --crit) CRIT="$2" ;;
          --ignore-ns) IGNORE_NS="$2" ;;
          --namespaces) INCLUDE_NS="$2" ;;
          --age-min) AGE_MIN="$2" ;;
          --recent-minutes) RECENT_MINUTES="$2" ;;
          --cron-max-age) CRON_MAX_AGE_MIN="$2" ;;
        esac
        shift 2
        ;;
      --no-cron)
        CHECK_CRON=0
        shift
        ;;
      --check-cron)
        # Advertised by the help text; explicit form of the default.
        CHECK_CRON=1
        shift
        ;;
      -h|--help)
        print_usage
        exit 3
        ;;
      *)
        echo "Unknown arg: $1"
        print_usage
        exit 3
        ;;
    esac
  done

  # Thresholds are used in (( )) later on; reject anything that is not a
  # plain non-negative integer, whether it came from argv or the environment.
  for var in WARN CRIT AGE_MIN RECENT_MINUTES CRON_MAX_AGE_MIN; do
    [[ -n "${!var-}" ]] || continue
    if [[ ! "${!var}" =~ ^[0-9]+$ ]]; then
      echo "UNKNOWN - ${var} must be a non-negative integer (got '${!var}')"
      exit 3
    fi
    # Force base 10 so that values such as "08" are not read as bad octal.
    printf -v "$var" '%s' "$(( 10#${!var} ))"
  done
}

parse_args "$@"

# Every check below shells out to kubectl; report UNKNOWN right away when it
# is not on PATH.
if ! command -v kubectl >/dev/null 2>&1; then
  echo "UNKNOWN - kubectl not found"
  exit 3
fi

# Build namespace filters (regex)
#######################################
# Turn a comma-separated namespace list into an anchored ERE alternation,
# e.g. "a, b" -> "^a$|^b$".
# Surrounding whitespace is trimmed and empty entries are skipped, so
# "kube-system, default" filters both namespaces.
# Entries are inserted into the regex as-is (regex metacharacters are not
# escaped).
# Arguments: $1 - comma-separated list (may be empty)
# Outputs:   the pattern on stdout (empty when the list is empty)
#######################################
build_ns_pattern() {
  local csv="${1-}" pattern="" entry
  local -a entries=()
  [[ -n "$csv" ]] || return 0
  IFS=',' read -ra entries <<< "$csv"
  for entry in "${entries[@]}"; do
    entry="${entry#"${entry%%[![:space:]]*}"}" # strip leading whitespace
    entry="${entry%"${entry##*[![:space:]]}"}" # strip trailing whitespace
    [[ -n "$entry" ]] || continue
    pattern="${pattern:+${pattern}|}^${entry}\$"
  done
  printf '%s' "$pattern"
}

ignore_pattern=$(build_ns_pattern "${IGNORE_NS-}")
include_pattern=$(build_ns_pattern "${INCLUDE_NS-}")

#######################################
# Decide whether a namespace passes the include/ignore filters.
# Uses the shell's own ERE matching instead of `echo | egrep`: no forks per
# call, and no "egrep is obsolescent" warning from recent GNU grep.
# Globals:   include_pattern, ignore_pattern (read; empty = filter disabled)
# Arguments: $1 - namespace name
# Returns:   0 when the namespace must be checked, 1 when it is filtered out
#######################################
ns_allowed() {
  local ns="$1"
  # An include list, when given, is a whitelist.
  if [[ -n "$include_pattern" && ! "$ns" =~ $include_pattern ]]; then
    return 1
  fi
  # The ignore list is applied even to namespaces that were included.
  if [[ -n "$ignore_pattern" && "$ns" =~ $ignore_pattern ]]; then
    return 1
  fi
  return 0
}

# Reference "now" (epoch seconds) for every age computation below.
now_s=$(date +%s)

# Initialize problems array safely: one human-readable entry per finding.
problems=()

# ---------------------------
# 1) Inspect Jobs
# ---------------------------
# Fields: namespace, name, active, succeeded, failed, startTime, completionTime
# A kubectl failure must not be mistaken for "no jobs": report UNKNOWN
# instead of silently ending up with an empty list and an OK status.
if ! jobs_raw=$(kubectl get jobs -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.name}{"\t"}{.status.active}{"\t"}{.status.succeeded}{"\t"}{.status.failed}{"\t"}{.status.startTime}{"\t"}{.status.completionTime}{"\n"}{end}' 2>/dev/null); then
  echo "UNKNOWN - kubectl get jobs failed"
  exit 3
fi

job_lines=()
if [[ -n "$jobs_raw" ]]; then
  mapfile -t job_lines <<< "$jobs_raw"
fi

# ${arr[@]+...} keeps `set -u` happy with an empty array on bash < 4.4.
for line in ${job_lines[@]+"${job_lines[@]}"}; do
  [[ -n "$line" ]] || continue

  # Tab is an IFS *whitespace* character, so `read` would merge consecutive
  # tabs and lose empty fields; swap tabs for '|' first.
  # '|' is not expected in any of these fields (names, counters, timestamps).
  # The succeeded and completionTime columns are not used and go to "_".
  IFS='|' read -r ns name active _ failed start _ <<< "${line//$'\t'/|}"

  # defaults: counters are absent from the status until they are non-zero
  active=${active:-0}
  failed=${failed:-0}

  if ! ns_allowed "$ns"; then
    continue
  fi

  # 1.a) Jobs with failures
  if (( failed > 0 )); then
    problems+=("Job ${ns}/${name} failedCount=${failed}")
    continue
  fi

  # 1.b) Active jobs running too long
  if (( active > 0 )); then
    if [[ -n "$start" && "$start" != "null" ]]; then
      # convert start timestamp to epoch (GNU date); 0 means "unparsable"
      start_s=$(date -d "$start" +%s 2>/dev/null || echo 0)
      if (( start_s > 0 )); then
        age_min=$(( (now_s - start_s) / 60 ))
        if (( age_min >= AGE_MIN )); then
          problems+=("Job ${ns}/${name} active for ${age_min}min >= ${AGE_MIN}min")
        fi
      fi
    else
      # no start time but active >0 -> flag
      problems+=("Job ${ns}/${name} active but no startTime recorded")
    fi
  fi
done

# 1.c) Recent Job warning events (type=Warning) in last RECENT_MINUTES
# Set RECENT_MINUTES to 0 to skip this check entirely.
if (( RECENT_MINUTES > 0 )); then
  # get events for Jobs (type Warning) with fields:
  # namespace, involvedObject.name, lastTimestamp, reason, message
  # A failing query is reported as UNKNOWN rather than treated as "no events".
  if ! events_raw=$(kubectl get events --all-namespaces \
    --field-selector involvedObject.kind=Job,type=Warning \
    -o custom-columns='NAMESPACE:.metadata.namespace,NAME:.involvedObject.name,LAST:.lastTimestamp,REASON:.reason,MESSAGE:.message' \
    --no-headers 2>/dev/null); then
    echo "UNKNOWN - kubectl get events failed"
    exit 3
  fi

  event_lines=()
  if [[ -n "$events_raw" ]]; then
    mapfile -t event_lines <<< "$events_raw"
  fi

  cutoff_s=$(( now_s - RECENT_MINUTES * 60 ))

  # ${arr[@]+...} keeps `set -u` happy with an empty array on bash < 4.4.
  for ev in ${event_lines[@]+"${event_lines[@]}"}; do
    [[ -n "$ev" ]] || continue

    # Columns are whitespace-aligned; only the first three are needed and the
    # rest of the line (reason, message) is discarded into "_".
    read -r ns name last _ <<< "$ev"

    if ! ns_allowed "$ns"; then
      continue
    fi

    # custom-columns prints "<none>" when lastTimestamp is missing
    if [[ -n "$last" && "$last" != "<none>" ]]; then
      ts=$(date -d "$last" +%s 2>/dev/null || echo 0)
      if (( ts >= cutoff_s )); then
        problems+=("Job event Warning ${ns}/${name} at $last")
      fi
    fi
  done
fi

# ---------------------------
# 2) Inspect CronJobs (optional)
# ---------------------------
# Skipped when CHECK_CRON is 0 (--no-cron) or the maximum age is 0.
if (( CHECK_CRON == 1 )) && (( CRON_MAX_AGE_MIN > 0 )); then
  # Fields: namespace, name, suspend (true/false/null), lastScheduleTime
  # A failing query is reported as UNKNOWN rather than treated as "no cronjobs".
  if ! cron_raw=$(kubectl get cronjob -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.name}{"\t"}{.spec.suspend}{"\t"}{.status.lastScheduleTime}{"\n"}{end}' 2>/dev/null); then
    echo "UNKNOWN - kubectl get cronjob failed"
    exit 3
  fi

  cron_lines=()
  if [[ -n "$cron_raw" ]]; then
    mapfile -t cron_lines <<< "$cron_raw"
  fi

  # ${arr[@]+...} keeps `set -u` happy with an empty array on bash < 4.4.
  for line in ${cron_lines[@]+"${cron_lines[@]}"}; do
    [[ -n "$line" ]] || continue

    # Tab is an IFS whitespace character and would swallow empty fields, so
    # split on '|' after substituting the tabs.
    IFS='|' read -r ns name suspend last <<< "${line//$'\t'/|}"

    if ! ns_allowed "$ns"; then
      continue
    fi

    # If suspended, do not consider as problem
    if [[ "$suspend" == "true" ]]; then
      continue
    fi

    if [[ -z "$last" || "$last" == "null" ]]; then
      # Never scheduled yet: warn (useful to detect misconfigured cronjobs)
      problems+=("CronJob ${ns}/${name} has no lastScheduleTime (never scheduled?)")
      continue
    fi

    # 0 means GNU date could not parse the timestamp
    last_s=$(date -d "$last" +%s 2>/dev/null || echo 0)
    if (( last_s > 0 )); then
      age_min=$(( (now_s - last_s) / 60 ))
      if (( age_min > CRON_MAX_AGE_MIN )); then
        problems+=("CronJob ${ns}/${name} lastSchedule ${age_min}min ago > ${CRON_MAX_AGE_MIN}min")
      fi
    else
      problems+=("CronJob ${ns}/${name} lastScheduleTime unparsable: ${last}")
    fi
  done
fi

# ---------------------------
# Final decision & output
# ---------------------------
# One status line on stdout, then exit with the matching plugin code.
count=${#problems[@]}

if (( count == 0 )); then
  echo "OK - Jobs/CronJobs checks passed"
  exit 0
fi

# Severity decision: CRIT is tested first, so it wins when both thresholds
# are reached.
if (( count >= CRIT )); then
  echo "CRITICAL - ${count} problems found: ${problems[*]}"
  exit 2
elif (( count >= WARN )); then
  echo "WARNING - ${count} problems found: ${problems[*]}"
  exit 1
else
  # Problems exist but both thresholds are higher than the count.
  echo "OK - ${count} problems found but below thresholds"
  exit 0
fi