#!/usr/bin/env bash
# Emergency rail-deviation responder. Invoked by apricot-rail-watchdog when
# a rail excursion is detected. Goal: reduce power demand for N seconds to
# let the rail recover, then restore.
#
# Argv (from watchdog): <chip> <val_mV> <baseline_mV> <delta_mV> <src_ts>
#
# Actions:
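#   1. Drop the power cap of every GPU reported by nvidia-smi to
#      GPU_LIMIT_SAFE (default 250W).
#   2. Pin the CPU governor of every writable CPU to GOVERNOR_SAFE
#      (default "powersave").
#   3. Hold for HOLD_SECONDS (default 60); a trigger that arrives during the
#      hold extends the deadline instead of starting a second run.
#   4. Restore the prior values if we recorded them.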
#
# Requires root (nvidia-smi -pl, writing to /sys/devices/system/cpu/...).
# Intended to run as a root-side systemd unit triggered via a fifo or via
# sudoers allowlist for the lilith user — install.sh sets this up.

# A pipeline fails when any of its stages fails, not only the last one.
set -o pipefail

# Tunables: a non-empty value inherited from the environment wins, otherwise
# the default shown here is used.
GPU_LIMIT_SAFE="${GPU_LIMIT_SAFE:-250}"
HOLD_SECONDS="${HOLD_SECONDS:-60}"
STATE_DIR="${STATE_DIR:-/run/apricot-rail-mitigate}"
GOVERNOR_SAFE="${GOVERNOR_SAFE:-powersave}"

# Directory holding the lock file and the saved prior state.
mkdir -p "$STATE_DIR"
LOCK="${STATE_DIR}/active.lock"
STAMP=$(date --iso-8601=ns)

# Print one timestamped log line on stdout.
# Arguments: the message words; they are joined into a single string.
log() {
    local ts
    ts=$(date --iso-8601=ns)
    printf '[%s] apricot-rail-mitigate: %s\n' "$ts" "$*"
}

# Single-flight: only one invocation performs the mitigation. A trigger that
# arrives while another run holds the lock just pushes that run's deadline out.
deadline=$(( $(date +%s) + HOLD_SECONDS ))

# With noclobber the redirect refuses to overwrite an existing regular file,
# so testing for the lock and creating it happen in one step. Two triggers
# arriving together can therefore not both become the holder. The subshell
# keeps noclobber from leaking into the rest of the script.
lock_acquired=0
if ( set -o noclobber; printf '%s\n' "$deadline" > "$LOCK" ) 2>/dev/null; then
    lock_acquired=1
fi

if (( ! lock_acquired )) && [[ -f "$LOCK" ]]; then
    # Already mitigating: bump the deadline. The new value is written to a
    # temp file and renamed over the lock, so a reader of the lock sees either
    # the old or the new deadline and never a truncated file.
    lock_tmp="$LOCK.$$.tmp"
    if printf '%s\n' "$deadline" > "$lock_tmp" && mv -f "$lock_tmp" "$LOCK"; then
        log "already mitigating, extending deadline to $(date -d "@$deadline" --iso-8601=s) (trigger=$*)"
        exit 0
    fi
    rm -f "$lock_tmp"
    log "already mitigating, but could not extend deadline in $LOCK (trigger=$*)"
    exit 1
fi

if (( ! lock_acquired )); then
    # The lock could not be created and does not exist either (for example an
    # unwritable STATE_DIR). Carry on best-effort, as before, but say so.
    log "could not create $LOCK; continuing without single-flight protection"
fi
log "engage trigger=$* hold=${HOLD_SECONDS}s gpu_limit=${GPU_LIMIT_SAFE}W governor=${GOVERNOR_SAFE}"

# --- capture prior state -------------------------------------------------
# GPU power limits as "index,limit" CSV rows with all spaces removed. The
# value stays empty when nvidia-smi prints nothing (its stderr is discarded).
PRIOR_GPU=$(
    nvidia-smi --query-gpu=index,power.limit --format=csv,noheader,nounits 2>/dev/null \
        | tr -d ' '
)
echo "$PRIOR_GPU" > "$STATE_DIR/prior_gpu"

# CPU governor: taken from the first scaling_governor file that is readable
# and can actually be read; later files are not consulted.
PRIOR_GOV=""
for gov_file in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; do
    if [[ ! -r "$gov_file" ]]; then
        continue
    fi
    if PRIOR_GOV="$(cat "$gov_file")"; then
        break
    fi
done
echo "$PRIOR_GOV" > "$STATE_DIR/prior_gov"

# --- apply safe state ----------------------------------------------------
# Cap every GPU index found in $PRIOR_GPU at GPU_LIMIT_SAFE. A failure is
# logged together with nvidia-smi's own output, so a mitigation that did not
# take effect is visible in the log instead of passing silently.
gpu_seen=0
while IFS=, read -r idx _; do
    [[ "$idx" =~ ^[0-9]+$ ]] || continue
    gpu_seen=$(( gpu_seen + 1 ))
    if smi_out=$(nvidia-smi -i "$idx" -pl "$GPU_LIMIT_SAFE" 2>&1); then
        log "gpu $idx -> ${GPU_LIMIT_SAFE}W"
    else
        # Newlines are flattened so the failure stays on one log line.
        log "gpu $idx: FAILED to set ${GPU_LIMIT_SAFE}W: ${smi_out//$'\n'/ }"
    fi
done <<< "$PRIOR_GPU"
if (( gpu_seen == 0 )); then
    log "no GPU indices captured; GPU power caps left unchanged"
fi

# Write GOVERNOR_SAFE to every writable scaling_governor file and count the
# outcomes, so the log reflects what actually happened.
gov_ok=0
gov_bad=0
for g in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; do
    [ -w "$g" ] || continue
    if { echo "$GOVERNOR_SAFE" > "$g"; } 2>/dev/null; then
        gov_ok=$(( gov_ok + 1 ))
    else
        gov_bad=$(( gov_bad + 1 ))
    fi
done
if (( gov_ok > 0 )); then
    log "cpu governor -> $GOVERNOR_SAFE (prior=$PRIOR_GOV)"
fi
if (( gov_bad > 0 )); then
    log "cpu governor: FAILED to set $GOVERNOR_SAFE on $gov_bad cpu(s)"
elif (( gov_ok == 0 )); then
    log "cpu governor unchanged: no writable scaling_governor found (prior=$PRIOR_GOV)"
fi

# --- hold, honoring deadline bumps, then restore -------------------------
# Set once the restore has started so it never runs twice: it is called
# explicitly after the hold and again from the EXIT trap.
restore_done=0
# PID of the background sleep used for the hold; empty when none is running.
hold_sleep_pid=""

# Put back the GPU power limits saved in $STATE_DIR/prior_gpu and the CPU
# governor saved in $PRIOR_GOV, then remove the lock and log "disengage".
# Globals: STATE_DIR, PRIOR_GOV, LOCK, hold_sleep_pid (read);
#          restore_done (written).
restore_prior_state() {
    (( restore_done )) && return 0
    restore_done=1
    # Let the restore run to completion even if another signal arrives.
    trap '' HUP INT TERM

    # Stop a hold sleep that is still running because a signal cut it short.
    if [[ -n "$hold_sleep_pid" ]]; then
        kill "$hold_sleep_pid" 2>/dev/null
    fi

    local idx prior_w smi_out gov_file gov_ok=0 gov_bad=0

    if [[ -r "$STATE_DIR/prior_gpu" ]]; then
        while IFS=, read -r idx prior_w; do
            [[ "$idx" =~ ^[0-9]+$ ]] || continue
            # Drop the fractional part of the saved limit (e.g. 350.00 -> 350).
            prior_w="${prior_w%.*}"
            [[ -n "$prior_w" ]] || continue
            if smi_out=$(nvidia-smi -i "$idx" -pl "$prior_w" 2>&1); then
                log "gpu $idx -> ${prior_w}W (restored)"
            else
                # Newlines are flattened so the failure stays on one log line.
                log "gpu $idx: FAILED to restore ${prior_w}W: ${smi_out//$'\n'/ }"
            fi
        done < "$STATE_DIR/prior_gpu"
    else
        log "gpu restore skipped: $STATE_DIR/prior_gpu is not readable"
    fi

    if [[ -n "$PRIOR_GOV" ]]; then
        for gov_file in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; do
            [ -w "$gov_file" ] || continue
            if { echo "$PRIOR_GOV" > "$gov_file"; } 2>/dev/null; then
                gov_ok=$(( gov_ok + 1 ))
            else
                gov_bad=$(( gov_bad + 1 ))
            fi
        done
        if (( gov_ok > 0 )); then
            log "cpu governor -> $PRIOR_GOV (restored)"
        fi
        if (( gov_bad > 0 )); then
            log "cpu governor: FAILED to restore $PRIOR_GOV on $gov_bad cpu(s)"
        elif (( gov_ok == 0 )); then
            log "cpu governor not restored: no writable scaling_governor found"
        fi
    fi

    rm -f "$LOCK"
    log "disengage"
}

# From here on the safe state is applied, so every exit path must undo it.
# Without this, a signal during the hold leaves the GPUs capped, the governor
# pinned and the lock in place, and later triggers only extend that lock.
trap restore_prior_state EXIT
trap 'log "caught SIGHUP, restoring now"; exit 129' HUP
trap 'log "caught SIGINT, restoring now"; exit 130' INT
trap 'log "caught SIGTERM, restoring now"; exit 143' TERM

# Start from the deadline this run wrote; the lock file may move it later.
target=${deadline:-0}
[[ "$target" =~ ^[0-9]{1,18}$ ]] || target=0
while true; do
    # A missing or unreadable lock ends the hold immediately.
    raw_deadline=$(cat "$LOCK" 2>/dev/null) || break
    raw_deadline=${raw_deadline//[[:space:]]/}
    # Accept only plain digits. Empty or garbage content keeps the last good
    # deadline, and the file content is never evaluated as an expression.
    if [[ "$raw_deadline" =~ ^[0-9]{1,18}$ ]]; then
        target=$(( 10#$raw_deadline ))
    fi
    now=$(date +%s)
    (( now >= target )) && break
    # Sleep in the background and wait on it: the wait builtin returns as soon
    # as a trapped signal arrives, so the traps above run without delay.
    sleep $(( target - now )) &
    hold_sleep_pid=$!
    wait "$hold_sleep_pid"
    hold_sleep_pid=""
done

# --- restore -------------------------------------------------------------
restore_prior_state
