apricot-health/scripts/apricot-rail-mitigate
Natalie dafbabee41 feat(@packages/apricot-health): add power-fault monitoring and mitigation tools
Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
2026-04-17 23:18:47 -07:00

92 lines
3.2 KiB
Bash
Executable file

#!/usr/bin/env bash
# Emergency rail-deviation responder. Invoked by apricot-rail-watchdog when
# a rail excursion is detected. Goal: reduce power demand for N seconds to
# let the rail recover, then restore.
#
# Argv (from watchdog): <chip> <val_mV> <baseline_mV> <delta_mV> <src_ts>
#
# Actions:
# 1. Drop the power cap of every GPU nvidia-smi reports to GPU_LIMIT_SAFE (default 250W).
# 2. Pin CPU governor to "powersave".
# 3. Hold for HOLD_SECONDS (default 60); a re-trigger during the hold pushes the deadline out.
# 4. Restore prior values if we recorded them.
#
# Requires root (nvidia-smi -pl, writing to /sys/devices/system/cpu/...).
# Intended to run as a root-side systemd unit triggered via a fifo or via
# sudoers allowlist for the lilith user — install.sh sets this up.
set -o pipefail

# log MESSAGE...
#   Print one line on stdout of the form
#     [<ISO-8601 timestamp, ns precision>] apricot-rail-mitigate: <MESSAGE...>
#   The arguments are joined via "$*".
log() { printf '[%s] apricot-rail-mitigate: %s\n' "$(date --iso-8601=ns)" "$*"; }

# Tunables. A value already present in the environment wins; the default is
# used only when the variable is unset or empty.
: "${GPU_LIMIT_SAFE:=250}"                    # watts handed to `nvidia-smi -pl`
: "${HOLD_SECONDS:=60}"                       # added to "now" to form the hold deadline
: "${STATE_DIR:=/run/apricot-rail-mitigate}"  # lock file and prior-state files live here
: "${GOVERNOR_SAFE:=powersave}"               # value written to each scaling_governor

# normalize_hold_seconds
#   Force HOLD_SECONDS to a plain base-10 non-negative integer.
#   HOLD_SECONDS is later used inside $(( )), where a leading zero would be
#   parsed as octal ("090" is an error, "060" is 48) and arbitrary text would
#   be evaluated as an arithmetic expression. Anything that is not 1-9 decimal
#   digits is rejected and replaced by the default of 60, with a log line.
normalize_hold_seconds() {
  if [[ "$HOLD_SECONDS" =~ ^[0-9]{1,9}$ ]]; then
    HOLD_SECONDS=$(( 10#$HOLD_SECONDS ))
  else
    log "ignoring invalid HOLD_SECONDS='${HOLD_SECONDS}', using 60"
    HOLD_SECONDS=60
  fi
}
normalize_hold_seconds

mkdir -p "$STATE_DIR"
# Start-of-run timestamp; not referenced in the visible part of the script.
STAMP=$(date --iso-8601=ns)
# Marks an active mitigation; its content is the epoch-seconds hold deadline.
LOCK="$STATE_DIR/active.lock"
# Single-flight: exactly one invocation owns the mitigation; any other
# invocation that arrives while it is active only pushes the deadline stored
# in $LOCK further out and exits.

# try_acquire_lock DEADLINE
#   Create $LOCK containing DEADLINE, failing if the file already exists.
#   noclobber makes the redirection refuse an existing file, so two
#   invocations racing here cannot both succeed. Runs in a subshell so the
#   option does not leak into the rest of the script.
#   Returns 0 when this process created the lock, non-zero otherwise.
try_acquire_lock() {
  ( set -o noclobber; printf '%s\n' "$1" > "$LOCK" ) 2>/dev/null
}

# lock_is_stale NOW
#   Returns 0 when $LOCK holds a numeric deadline that passed more than 60
#   seconds before NOW (epoch seconds). A live owner removes the lock shortly
#   after its deadline, so a long-expired lock is taken to be an orphan (owner
#   killed, or a bump that re-created the file after the owner removed it).
#   Unreadable, empty, or non-numeric content is NOT treated as stale: it may
#   be a lock that is being written right now.
lock_is_stale() {
  local recorded
  local -r grace=60
  recorded=$(cat "$LOCK" 2>/dev/null) || return 1
  [[ "$recorded" =~ ^[0-9]{1,18}$ ]] || return 1
  (( $1 > 10#$recorded + grace ))
}

now=$(date +%s)
deadline=$(( now + HOLD_SECONDS ))

# Clear an orphaned lock so it cannot suppress mitigation indefinitely.
# NOTE(review): two invocations that both judge the same lock stale can still
# race between this rm and the acquire below; the window is limited to
# stale-lock recovery.
if [[ -f "$LOCK" ]] && lock_is_stale "$now"; then
  log "removing stale lock $LOCK (deadline passed more than 60s ago)"
  rm -f "$LOCK"
fi

if ! try_acquire_lock "$deadline"; then
  if [[ -f "$LOCK" ]]; then
    # Someone else owns the mitigation. Replace the deadline via rename so a
    # concurrent reader never observes a truncated, empty file.
    if printf '%s\n' "$deadline" > "$LOCK.$$" && mv -f "$LOCK.$$" "$LOCK"; then
      log "already mitigating, extending deadline to $(date -d "@$deadline" --iso-8601=s) (trigger=$*)"
    else
      rm -f "$LOCK.$$"
      log "already mitigating, but could not extend deadline in $LOCK (trigger=$*)"
    fi
    exit 0
  fi
  # The lock neither exists nor can be created (e.g. state dir missing or
  # read-only). Keep going, as before, rather than skipping the mitigation.
  log "warning: cannot create $LOCK; continuing without single-flight protection"
fi
log "engage trigger=$* hold=${HOLD_SECONDS}s gpu_limit=${GPU_LIMIT_SAFE}W governor=${GOVERNOR_SAFE}"
# --- capture prior state -------------------------------------------------
# Snapshot each GPU's current power limit as "index,limit" lines with all
# spaces removed. The snapshot is kept in PRIOR_GPU and also written to
# $STATE_DIR/prior_gpu. nvidia-smi's stderr is discarded; if it prints
# nothing, the snapshot is empty.
PRIOR_GPU=$(
  nvidia-smi --query-gpu=index,power.limit --format=csv,noheader,nounits 2>/dev/null \
    | tr -d ' '
)
echo "$PRIOR_GPU" > "$STATE_DIR/prior_gpu"

# Remember the governor of the first CPU whose scaling_governor can be read;
# that single value is what gets written back to every CPU afterwards.
PRIOR_GOV=""
for gov_file in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; do
  if [[ -r "$gov_file" ]] && PRIOR_GOV=$(cat "$gov_file"); then
    break
  fi
done
echo "$PRIOR_GOV" > "$STATE_DIR/prior_gov"
# --- apply safe state ----------------------------------------------------

# cap_gpu_power LIMIT_W
#   stdin: "index,anything" lines (the PRIOR_GPU snapshot).
#   Sets the power limit of every listed GPU to LIMIT_W via `nvidia-smi -pl`.
#   Lines whose first field is not a plain integer are skipped. Each success
#   and each failure is logged, so a cap that did not take effect is visible.
#   Returns 0 if at least one GPU was capped, 1 otherwise.
cap_gpu_power() {
  local limit=$1 idx capped=0
  while IFS=, read -r idx _; do
    [[ "$idx" =~ ^[0-9]+$ ]] || continue
    if nvidia-smi -i "$idx" -pl "$limit" >/dev/null 2>&1; then
      log "gpu $idx -> ${limit}W"
      capped=$(( capped + 1 ))
    else
      log "warning: gpu $idx: failed to set power limit to ${limit}W"
    fi
  done
  (( capped > 0 ))
}

# apply_governor GOVERNOR FILE...
#   Writes GOVERNOR to every FILE that is writable; files that are missing or
#   not writable are skipped, and individual write errors are tolerated.
#   Returns 0 if at least one write succeeded, 1 otherwise.
apply_governor() {
  local gov=$1 g written=0
  shift
  for g in "$@"; do
    [[ -w "$g" ]] || continue
    if echo "$gov" > "$g" 2>/dev/null; then
      written=$(( written + 1 ))
    fi
  done
  (( written > 0 ))
}

cap_gpu_power "$GPU_LIMIT_SAFE" <<< "$PRIOR_GPU" \
  || log "warning: no GPU power cap was applied"

# An unmatched glob is passed through literally and then skipped by the
# writability check inside apply_governor.
if apply_governor "$GOVERNOR_SAFE" /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; then
  log "cpu governor -> $GOVERNOR_SAFE (prior=$PRIOR_GOV)"
else
  log "warning: cpu governor unchanged, no scaling_governor accepted '$GOVERNOR_SAFE' (prior=$PRIOR_GOV)"
fi
# --- hold, honoring deadline bumps --------------------------------------

# read_deadline
#   Print the epoch-seconds deadline stored in $LOCK as a base-10 integer.
#   Prints 0 (which ends the hold) when the file cannot be read. Empty or
#   non-numeric content is retried a few times with a short pause, because a
#   writer that truncates and rewrites the file can be caught mid-update; if
#   it is still unusable after the retries, 0 is printed as well. Without
#   this check such content would break the arithmetic below and hand sleep
#   a negative duration, turning the loop into a busy spin.
read_deadline() {
  local raw
  for _ in 1 2 3; do
    raw=$(cat "$LOCK" 2>/dev/null) || break
    if [[ "$raw" =~ ^[0-9]{1,18}$ ]]; then
      echo "$(( 10#$raw ))"
      return 0
    fi
    sleep 0.05
  done
  echo 0
}

# Sleep until the current deadline, then re-read it: another invocation may
# have pushed it out in the meantime. Leave once the deadline is reached.
while true; do
  now=$(date +%s)
  target=$(read_deadline)
  (( now >= target )) && break
  sleep "$(( target - now ))"
done
# --- restore -------------------------------------------------------------

# restore_gpu_power
#   stdin: "index,watts" lines as saved in $STATE_DIR/prior_gpu.
#   Sets each listed GPU back to its saved power limit via `nvidia-smi -pl`.
#   Any fractional part of the saved value is dropped first. Lines whose index
#   is not a plain integer are skipped silently; a saved limit that is not
#   numeric after truncation (e.g. "[N/A]") is skipped with a warning. Each
#   success and each failure is logged, so a GPU left at the safe cap is
#   visible.
restore_gpu_power() {
  local idx prior_w
  while IFS=, read -r idx prior_w; do
    [[ "$idx" =~ ^[0-9]+$ ]] || continue
    prior_w="${prior_w%.*}"
    if [[ ! "$prior_w" =~ ^[0-9]+$ ]]; then
      log "warning: gpu $idx: no usable prior power limit ('$prior_w'), left at current cap"
      continue
    fi
    if nvidia-smi -i "$idx" -pl "$prior_w" >/dev/null 2>&1; then
      log "gpu $idx -> ${prior_w}W (restored)"
    else
      log "warning: gpu $idx: failed to restore power limit to ${prior_w}W"
    fi
  done
}

# restore_governor GOVERNOR FILE...
#   Writes GOVERNOR to every FILE that is writable; other files are skipped
#   and individual write errors are tolerated.
#   Returns 0 if at least one write succeeded, 1 otherwise.
restore_governor() {
  local gov=$1 g written=0
  shift
  for g in "$@"; do
    [[ -w "$g" ]] || continue
    if echo "$gov" > "$g" 2>/dev/null; then
      written=$(( written + 1 ))
    fi
  done
  (( written > 0 ))
}

if [[ -r "$STATE_DIR/prior_gpu" ]]; then
  restore_gpu_power < "$STATE_DIR/prior_gpu"
else
  log "warning: $STATE_DIR/prior_gpu not readable, GPU power limits not restored"
fi

# Only touch the governor when a prior value was captured.
if [[ -n "$PRIOR_GOV" ]]; then
  if restore_governor "$PRIOR_GOV" /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; then
    log "cpu governor -> $PRIOR_GOV (restored)"
  else
    log "warning: cpu governor not restored to $PRIOR_GOV (no scaling_governor accepted it)"
  fi
fi

# Dropping the lock lets the next trigger start a fresh mitigation.
rm -f "$LOCK"
log "disengage"