#!/usr/bin/env bash
# check_k8s_apiserver_access
# Checks the number of HTTP 403 responses in the kube-apiserver logs.
# Return codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
#
# Default: uses journalctl -u kube-apiserver --since="${WINDOW_MINUTES} minutes ago"
# Option --kubectl: uses "kubectl logs" on the pods matching the selector.
#
# Usage examples:
#   sudo /usr/lib/nagios/plugins/check_k8s_apiserver_access --window 5 --warn 10 --crit 50
#   sudo /usr/lib/nagios/plugins/check_k8s_apiserver_access --kubectl --selector 'k8s-app=kube-apiserver' --window 10 --crit 100
#
# Strict mode: abort on errors, on unset variables and on failing pipeline stages.
set -o errexit -o nounset -o pipefail

# Name of this script as invoked (directory part removed); shown in the help text.
PROG_NAME="$(basename "$0")"

# Built-in settings. Every one of them can be replaced from the command line.
WINDOW_MINUTES="5"                # look-back window, in minutes
WARN_THRESHOLD="10"               # match count that raises WARNING
CRIT_THRESHOLD="50"               # match count that raises CRITICAL
TOP_N="5"                         # how many top offenders are listed in the message
PATTERN=""                        # custom grep regex; empty selects the built-in one

# Log source selection.
USE_KUBECTL="0"                   # 1 = read logs with kubectl, 0 = read them with journalctl
JOURNAL_UNIT="kube-apiserver"     # systemd unit queried by journalctl; adapt if different
KUBECTL_NAMESPACE="kube-system"   # namespace searched for apiserver pods
KUBECTL_SELECTOR=""               # label selector; empty means common labels are tried

#######################################
# Print the usage text to stdout.
# Globals (read): PROG_NAME, WINDOW_MINUTES, WARN_THRESHOLD, CRIT_THRESHOLD,
#                 KUBECTL_NAMESPACE, JOURNAL_UNIT, TOP_N
# The "default" values shown are whatever those globals hold when the
# function is called. The examples use PROG_NAME so they always name the
# script that is actually being run.
#######################################
print_help() {
  cat <<EOF
$PROG_NAME - check apiserver 403 rate in logs

Options:
  --window N           Window in minutes to look back (default: ${WINDOW_MINUTES})
  --warn N             WARN threshold: count >= N -> WARNING (default: ${WARN_THRESHOLD})
  --crit N             CRIT threshold: count >= N -> CRITICAL (default: ${CRIT_THRESHOLD})
  --kubectl            Use 'kubectl logs' on apiserver pods instead of journalctl
  --namespace NS       Namespace for kubectl logs (default: ${KUBECTL_NAMESPACE})
  --selector SEL       Label selector for kubectl logs (e.g. "component=kube-apiserver" or "k8s-app=kube-apiserver")
  --unit UNIT          systemd unit for journalctl (default: ${JOURNAL_UNIT})
  --pattern REGEX      custom grep regex to detect 403 entries (overrides built-in heuristics)
  --top N              show top N request lines causing 403 (default ${TOP_N})
  -h, --help           show this help

Examples:
  # check last 5 minutes using journalctl
  sudo ./${PROG_NAME} --window 5 --warn 20 --crit 50

  # check last 10 minutes using kubectl logs for apiserver static-pods
  sudo ./${PROG_NAME} --kubectl --namespace kube-system --selector 'k8s-app=kube-apiserver' --window 10 --crit 100
EOF
}

# Parse args
#######################################
# Apply command-line options to the configuration globals.
# Globals (written): WINDOW_MINUTES, WARN_THRESHOLD, CRIT_THRESHOLD,
#                    USE_KUBECTL, KUBECTL_NAMESPACE, KUBECTL_SELECTOR,
#                    JOURNAL_UNIT, PATTERN, TOP_N
# Arguments: the script's command-line arguments
# Exits 3 (UNKNOWN) on --help, on an unknown argument, and when an option
# that takes a value is the last word on the command line. Without the
# last check, reading the missing "$2" under "set -u" would abort the
# script with status 1, which a monitoring system reads as WARNING.
#######################################
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      --window | --warn | --crit | --namespace | --selector | --unit | --pattern | --top)
        if (( $# < 2 )); then
          echo "UNKNOWN - option $1 requires a value"
          exit 3
        fi
        case "$1" in
          --window) WINDOW_MINUTES="$2" ;;
          --warn) WARN_THRESHOLD="$2" ;;
          --crit) CRIT_THRESHOLD="$2" ;;
          --namespace) KUBECTL_NAMESPACE="$2" ;;
          --selector) KUBECTL_SELECTOR="$2" ;;
          --unit) JOURNAL_UNIT="$2" ;;
          --pattern) PATTERN="$2" ;;
          --top) TOP_N="$2" ;;
        esac
        shift 2
        ;;
      --kubectl)
        USE_KUBECTL=1
        shift
        ;;
      -h | --help)
        print_help
        exit 3
        ;;
      *)
        echo "Unknown argument: $1"
        print_help
        exit 3
        ;;
    esac
  done
}
parse_args "$@"

# Validate numeric args
# Each value must consist of decimal digits only; anything else ends the
# check with UNKNOWN (3).
if ! [[ "$WINDOW_MINUTES" =~ ^[0-9]+$ ]]; then echo "UNKNOWN - invalid --window"; exit 3; fi
if ! [[ "$WARN_THRESHOLD" =~ ^[0-9]+$ ]]; then echo "UNKNOWN - invalid --warn"; exit 3; fi
if ! [[ "$CRIT_THRESHOLD" =~ ^[0-9]+$ ]]; then echo "UNKNOWN - invalid --crit"; exit 3; fi
if ! [[ "$TOP_N" =~ ^[0-9]+$ ]]; then echo "UNKNOWN - invalid --top"; exit 3; fi

# Re-read the accepted values in base 10. Bash arithmetic treats a leading
# zero as octal, so a zero-padded value such as "08" would make the later
# (( count >= threshold )) tests fail with an error, and "010" would be
# compared as 8. The 10# prefix forces decimal and drops the padding.
WINDOW_MINUTES=$((10#$WINDOW_MINUTES))
WARN_THRESHOLD=$((10#$WARN_THRESHOLD))
CRIT_THRESHOLD=$((10#$CRIT_THRESHOLD))
TOP_N=$((10#$TOP_N))

# Build detection regex if not provided
if [[ -z "$PATTERN" ]]; then
  # Heuristics for log lines that indicate a 403/Forbidden response. The
  # pattern is applied with "grep -E -i".
  # Examples of what is matched: '" 403 ', 'code=403', '403 Forbidden',
  # any line containing "Forbidden".
  #
  # The value is single-quoted, so double quotes need no backslash. A
  # backslash before '"' is not a defined ERE escape and makes recent GNU
  # grep print "stray \" warnings on stderr.
  #
  # The bare "Forbidden" alternative already covers the more specific
  # Forbidden variants; they are kept to document the intended shapes.
  PATTERN='(" 403 |code=403|403 Forbidden|Forbidden|"Reason=Forbidden"|"message=.*Forbidden)'
  # NOTE(review): if the apiserver writes the status code in another form
  # (for example a key=value field other than code=), extend the pattern or
  # pass --pattern — confirm against real log lines.
fi

# Grab logs
#######################################
# Write the journal entries of JOURNAL_UNIT from the last WINDOW_MINUTES
# minutes to stdout.
# Globals (read): JOURNAL_UNIT, WINDOW_MINUTES
# Outputs: journalctl's stdout; its stderr is discarded. Writes
#          ERROR_NO_JOURNAL to stderr when journalctl cannot be found.
# Returns: 0 when journalctl succeeded, 1 when it is missing or failed.
#######################################
get_logs_journal() {
  command -v journalctl >/dev/null 2>&1 || {
    echo "ERROR_NO_JOURNAL" >&2
    return 1
  }

  # e.g. journalctl -u kube-apiserver "--since=5 minutes ago" --no-pager
  local -a journal_args=(
    -u "${JOURNAL_UNIT}"
    "--since=${WINDOW_MINUTES} minutes ago"
    --no-pager
  )

  # Any journalctl failure is reported to the caller as plain status 1.
  if journalctl "${journal_args[@]}" 2>/dev/null; then
    return 0
  fi
  return 1
}

#######################################
# Collect recent apiserver logs through "kubectl logs".
# Globals (read): KUBECTL_NAMESPACE, KUBECTL_SELECTOR, WINDOW_MINUTES
# Pod selection:
#   1. KUBECTL_SELECTOR when it is set;
#   2. otherwise the first of a few common label selectors that matches
#      at least one pod in the namespace;
#   3. otherwise every pod in the namespace whose name contains
#      "apiserver" (case-insensitive).
# Outputs: the logs of each selected pod on stdout, one pod after the
#          other, each ending with a newline. Writes ERROR_NO_KUBECTL to
#          stderr when kubectl cannot be found.
# Returns: 0 when "kubectl logs" succeeded for at least one pod (the logs
#          themselves may be empty); 1 when kubectl is missing, no pod was
#          selected, or every "kubectl logs" call failed.
#######################################
get_logs_kubectl() {
  local selector candidate pod_names pod_name pod_logs
  local fetched=0

  if ! command -v kubectl >/dev/null 2>&1; then
    echo "ERROR_NO_KUBECTL" 1>&2
    return 1
  fi

  # If no selector was given, try common ones and keep the first that
  # lists at least one pod. The listing itself is the test: an empty
  # result (or a kubectl failure) means "no match".
  # NOTE(review): 'tier=control-plane' may select pods other than the
  # apiserver — confirm the labels used in the target cluster.
  selector="${KUBECTL_SELECTOR}"
  if [[ -z "$selector" ]]; then
    for candidate in 'component=kube-apiserver' 'k8s-app=kube-apiserver' 'tier=control-plane'; do
      if [[ -n "$(kubectl -n "${KUBECTL_NAMESPACE}" get pods -l "${candidate}" --no-headers 2>/dev/null || true)" ]]; then
        selector="${candidate}"
        break
      fi
    done
  fi

  # Resolve the pod names, by label or (fallback) by name.
  if [[ -n "$selector" ]]; then
    pod_names=$(kubectl -n "${KUBECTL_NAMESPACE}" get pods -l "${selector}" \
      -o custom-columns=':metadata.name' --no-headers 2>/dev/null || true)
  else
    pod_names=$(kubectl -n "${KUBECTL_NAMESPACE}" get pods --no-headers \
      -o custom-columns=':metadata.name' 2>/dev/null | grep -i 'apiserver' || true)
  fi
  if [[ -z "$pod_names" ]]; then
    return 1
  fi

  # Fetch each pod's logs. A single failing pod is skipped (best effort),
  # but at least one fetch has to succeed, otherwise an unreachable or
  # unauthorized cluster would look like "no 403 at all".
  while IFS= read -r pod_name; do
    [[ -z "$pod_name" ]] && continue
    if pod_logs=$(kubectl -n "${KUBECTL_NAMESPACE}" logs "--since=${WINDOW_MINUTES}m" \
      "${pod_name}" --all-containers 2>/dev/null); then
      fetched=$((fetched + 1))
      if [[ -n "$pod_logs" ]]; then
        # Command substitution stripped the trailing newline; put one back
        # so the next pod's first line does not join this pod's last line.
        printf '%s\n' "$pod_logs"
      fi
    fi
  done <<< "$pod_names"

  if (( fetched == 0 )); then
    return 1
  fi
  return 0
}

# Fill LOGS from the selected backend: journalctl unless --kubectl was given.
# A backend that reports failure ends the check with CRITICAL (2).
LOGS=""
if (( USE_KUBECTL != 1 )); then
  LOGS=$(get_logs_journal) || {
    echo "CRITICAL - failed to collect logs via journalctl for unit '${JOURNAL_UNIT}' (check unit name/permissions)"
    exit 2
  }
else
  LOGS=$(get_logs_kubectl) || {
    echo "CRITICAL - failed to collect logs via kubectl (check KUBECONFIG, namespace/selector, permissions)"
    exit 2
  }
fi

# The backend succeeded but returned no text at all: report OK with a
# zero count and stop here.
[[ -n "$LOGS" ]] || {
  echo "OK - no apiserver logs found in the last ${WINDOW_MINUTES}m (count=0)"
  exit 0
}

#######################################
# Count the lines of a text that match an extended regex, ignoring case.
# Arguments: $1 - text to search, $2 - regex for "grep -E"
# Outputs:   the number of matching lines on stdout
# Returns:   0 on success (zero matches included), 2 when grep itself
#            failed, e.g. because the regex is invalid
#######################################
count_pattern_matches() {
  local text=$1 regex=$2
  local matches rc=0
  # -e keeps a regex that starts with "-" from being parsed as an option.
  # grep exits 1 for "no match" and >1 for a real error; only the latter
  # is a failure here.
  matches=$(grep -E -i -c -e "$regex" <<< "$text") || rc=$?
  if (( rc > 1 )); then
    return 2
  fi
  printf '%s\n' "${matches:-0}"
}

#######################################
# Rank the most frequent requests among already-matched log lines.
# Arguments: $1 - matching log lines, $2 - maximum number of entries
# Outputs:   "uniq -c" style lines (count, then text), most frequent
#            first. Entries are "METHOD path" pairs when any can be
#            extracted; otherwise whole lines with whitespace squeezed
#            and cut to 200 characters.
#######################################
top_offenders() {
  local lines=$1 limit=$2
  local ranked
  # "|| true": head may close the pipe early, which pipefail would report.
  ranked=$(printf '%s\n' "$lines" \
    | grep -oE '(GET|POST|PUT|DELETE|PATCH) [^" ]+' \
    | sort | uniq -c | sort -rn | head -n "$limit" || true)
  if [[ -z "$ranked" ]]; then
    ranked=$(printf '%s\n' "$lines" \
      | sed 's/^[[:space:]]*//; s/[[:space:]]\+/ /g' \
      | cut -c1-200 | sort | uniq -c | sort -rn | head -n "$limit" || true)
  fi
  printf '%s' "$ranked"
}

# Count matches of 403 using grep -E (case-insensitive).
# A grep failure must not be read as "zero matches": that would turn a
# mistyped --pattern into a permanent OK, so it is reported as UNKNOWN.
if ! count_403=$(count_pattern_matches "$LOGS" "$PATTERN"); then
  echo "UNKNOWN - grep could not apply the detection pattern (invalid --pattern regex?)"
  exit 3
fi

# Extract the top request lines that caused a 403, if there are any.
top_requests=$(grep -E -i -e "$PATTERN" <<< "$LOGS" || true)
if [[ -n "$top_requests" ]]; then
  top_paths=$(top_offenders "$top_requests" "$TOP_N")
else
  top_paths=""
fi

# Map the match count to a plugin state. Start from OK and raise the
# state for each threshold the count reaches; CRITICAL is tested last so
# it wins when both thresholds are reached.
status=0
state="OK"
if (( count_403 >= WARN_THRESHOLD )); then
  status=1
  state="WARNING"
fi
if (( count_403 >= CRIT_THRESHOLD )); then
  status=2
  state="CRITICAL"
fi

# Build message
msg="${state} - ${count_403} occurrences of 403 in last ${WINDOW_MINUTES}m (warn=${WARN_THRESHOLD},crit=${CRIT_THRESHOLD})"

# Append top paths if present.
# In plugin output the "|" character starts the performance-data section,
# so it must not be used as a list separator or be copied from log text.
# Entries are therefore joined with "," and any "|" inside them is
# replaced by a space. The left padding added by "uniq -c" is removed.
if [[ -n "$top_paths" ]]; then
  top_summary=$(printf '%s\n' "$top_paths" \
    | sed 's/^[[:space:]]*//' \
    | tr '|' ' ' \
    | tr '\n' ',')
  msg="${msg} ; top=${TOP_N}: ${top_summary%,}"
fi

# Print and exit
printf '%s\n' "$msg"
exit "$status"