#!/usr/bin/env bash
# Continuously appends power/thermal/voltage state to $LOG so that the last
# fractions of a second before a hard reset survive the crash.
#
# Env overrides:
#   LOG               output path (default ~/apricot-crash.log)
#   INTERVAL          sample period in seconds (default 0.1 = 10 Hz)
#   GPU_SAMPLE_EVERY  run nvidia-smi only on every Nth sample (default 10)
#   SENSOR_CHIPS      regex of hwmon name(s) to capture; must match the whole
#                     name (default k10temp|nvme|it8628|nct6.*|w83.*)

set -o pipefail

# Echo $1 when it is a positive decimal integer without leading zeros (bash
# arithmetic would reject "08" as invalid octal); otherwise warn on stderr,
# naming the setting $3, and echo the fallback $2.
_positive_int_or() {
    local value=$1 fallback=$2 setting=${3:-value}
    if [[ "$value" =~ ^[1-9][0-9]*$ ]]; then
        printf '%s\n' "$value"
        return 0
    fi
    printf 'warning: %s=%q is not a positive integer; using %s\n' \
        "$setting" "$value" "$fallback" >&2
    printf '%s\n' "$fallback"
}

# Runtime settings; each one can be overridden from the environment.
LOG="${LOG:-${HOME}/apricot-crash.log}"
INTERVAL="${INTERVAL:-0.1}"
# nvidia-smi is slow; only invoke every Nth iter. The sampling loop uses this
# value as a modulus, so 0 or a non-integer would raise an arithmetic error on
# every iteration — fall back to the default instead.
GPU_SAMPLE_EVERY=$(_positive_int_or "${GPU_SAMPLE_EVERY:-10}" 10 GPU_SAMPLE_EVERY)
SENSOR_CHIPS="${SENSOR_CHIPS:-k10temp|nvme|it8628|nct6.*|w83.*}"

# Mark the start of this run in the log, recording the effective settings so
# the samples that follow can be interpreted later.
printf '=== session start %s (pid=%s interval=%ss gpu_every=%s chips=%s) ===\n' \
    "$(date --iso-8601=ns)" "$$" "$INTERVAL" "$GPU_SAMPLE_EVERY" "$SENSOR_CHIPS" >> "$LOG"

# Cache of hwmon directories whose chip name matches SENSOR_CHIPS, so the
# sampling loop does not have to re-scan and re-match every chip per sample.
# The loop re-runs refresh_hwmons every few seconds to pick up chips that
# appear or disappear.
declare -a HWMONS

# Rebuild HWMONS from scratch.
#   Globals:   SENSOR_CHIPS (read), HWMONS (overwritten)
#   Arguments: $1 - directory to scan (optional, default /sys/class/hwmon)
refresh_hwmons() {
    local root=${1:-/sys/class/hwmon}
    # Keep the loop variables local so a refresh never clobbers same-named
    # variables in the caller.
    local dir chip
    HWMONS=()
    for dir in "$root"/hwmon*; do
        # An unmatched glob stays literal; the -d test discards it.
        [[ -d "$dir" && -r "$dir/name" ]] || continue
        chip=$(<"$dir/name")    # bash builtin — no fork
        # SENSOR_CHIPS is left unquoted on purpose so it acts as a regex; the
        # anchors require it to match the whole chip name.
        [[ "$chip" =~ ^(${SENSOR_CHIPS})$ ]] || continue
        HWMONS+=("$dir")
    done
}
# Sampling loop. Everything printed inside the loop is appended to $LOG via
# the redirection on `done`. Two kinds of line are written:
#   <ts> gpu <nvidia-smi csv row>
#   <ts> sensor <chip>/<hwmonN> <label>=<raw value>
refresh_hwmons
last_refresh=$SECONDS
last_sync=$SECONDS
iter=0

while :; do
    ts=$(date --iso-8601=ns)

    # GPU telemetry — skip most iterations because nvidia-smi startup is
    # ~300-500ms, which would cap the loop at ~2 Hz otherwise. A non-positive
    # GPU_SAMPLE_EVERY skips GPU sampling instead of raising a
    # division-by-zero error on every pass (&& short-circuits the modulus).
    if (( GPU_SAMPLE_EVERY > 0 && iter % GPU_SAMPLE_EVERY == 0 )); then
        while IFS= read -r gpu_line; do
            printf '%s gpu %s\n' "$ts" "$gpu_line"
        done < <(nvidia-smi \
            --query-gpu=index,temperature.gpu,power.draw,clocks.gr,clocks.mem,pstate,utilization.gpu,memory.used \
            --format=csv,noheader,nounits 2>/dev/null)
    fi
    iter=$(( iter + 1 ))

    # Platform sensors — use $(<file) bash builtin everywhere to avoid
    # fork+exec per-read. With ~60 sensor files that's the difference
    # between ~600ms per iteration and <20ms.
    for h in "${HWMONS[@]}"; do
        # The chip may have vanished since the last refresh.
        [ -r "$h/name" ] || continue
        name=$(<"$h/name")
        hb=${h##*/}
        for inp in "$h"/temp*_input "$h"/in*_input "$h"/fan*_input "$h"/curr*_input; do
            # Also filters out glob patterns that matched nothing.
            [ -r "$inp" ] || continue
            n=${inp##*/}; n=${n%_input}
            # Prefer the driver-provided label; fall back to the channel name.
            label_file="$h/${n}_label"
            if [ -r "$label_file" ]; then
                label=$(<"$label_file")
            else
                label="$n"
            fi
            raw=$(<"$inp")
            printf '%s sensor %s/%s %s=%s\n' "$ts" "$name" "$hb" "$label" "$raw"
        done
    done

    # Refresh hwmon list every ~5s in case modules load/unload.
    if (( SECONDS - last_refresh > 5 )); then
        refresh_hwmons
        last_refresh=$SECONDS
    fi

    # Fsync at most once per second regardless of sample rate (amortized).
    # Keyed on the shell's SECONDS counter rather than a digit of the
    # timestamp, so the sync cannot be skipped when the loop period drifts
    # and does not fire repeatedly within one second at high sample rates.
    # Best effort: a failing sync must not stop the logger.
    if (( SECONDS != last_sync )); then
        sync "$LOG" 2>/dev/null || true
        last_sync=$SECONDS
    fi

    sleep "$INTERVAL"
done >> "$LOG"
