#!/bin/sh
# apricot-doctor                 Health snapshot of apricot's known-fragile parts
# apricot-doctor check           Same as above
# apricot-doctor fix dnsmasq     Pull session-tools + regen wg-mesh.conf + restart
# apricot-doctor fix oomd        Install composefs PrivateTmp drop-in + start
# apricot-doctor fix quinn-ai    Stop, delete unit, mask (half-finished cutover)
# apricot-doctor fix all         All three fix subcommands
# apricot-doctor forensic [N]    Dump post-mortem artifacts for boot index N
#                                (default: -1, the previous boot)
# apricot-doctor help            Show this text
#
# Targets the recurring boot-time fragility surfaced during the 2026-05-21
# wedge investigation:
#
#   1. dnsmasq vs wg1 boot race ("Cannot assign requested address" for the
#      mesh IP). Durable fix lives in session-tools/bin/wg-dns-sync; this
#      script pulls the latest, regenerates the conf, and restarts.
#
#   2. systemd-oomd vs composefs ("Failed at step NAMESPACE" — systemd
#      cannot create /run/systemd/mount-rootfs/var/tmp on the read-only
#      composefs rootfs when PrivateTmp=disconnected). Drop-in overrides to
#      PrivateTmp=no.
#
#   3. quinn-ai-auto-respond.service half-finished cutover from black —
#      crash-loops every 10s with getaddrinfo ENOTIMP. User has explicitly
#      asked for it stopped and prevented from starting; cannot mask while
#      the real unit file occupies the symlink target, so rm-then-mask.
#
# Idempotent: re-runs are safe and report no-op for already-correct state.
#
# Env:
#   APRICOT_DOCTOR_OUT   forensic output dir (default: /var/tmp/apricot-doctor)
#
# Exit codes:
#   0  success (also: explicit help request)
#   1  precondition failed (wrong host, missing tool, repo missing)
#   2  usage error
#   3  fix could not be applied
#   4  fix applied but verification failed

set -eu

out_root="${APRICOT_DOCTOR_OUT:-/var/tmp/apricot-doctor}"
repo="$HOME/Code/@scripts/session-tools"

# die MSG...: print MSG to stderr and exit 1 (precondition failure).
die() { echo "apricot-doctor: $*" >&2; exit 1; }

# usage [CODE]: print the header comment of this file (line 2 up to the first
# blank line, with the leading "# " stripped) and exit CODE (default 2).
usage() { sed -n '2,/^$/p' "$0" | sed 's/^# \{0,1\}//'; exit "${1:-2}"; }

# Status-line helpers: green check / yellow bang / red cross, then the message.
ok()   { printf ' \033[32m✓\033[0m %s\n' "$*"; }
warn() { printf ' \033[33m!\033[0m %s\n' "$*"; }
fail() { printf ' \033[31m✗\033[0m %s\n' "$*"; }

# Refuse to run anywhere but on the host whose short name is "apricot".
require_apricot() {
  h=$(hostname -s)
  [ "$h" = "apricot" ] || die "must be run on apricot (got: $h)"
}

# ------------------------------------------------------------------ check ---

# report_service NAME STATE ENABLED
# Print one status line for a unit. quinn-ai-auto-respond is the only unit
# whose desired end-state is stopped (inactive, masked or disabled); every
# other unit in the check list is expected to be running, so "inactive" there
# is a warning rather than a pass.
report_service() {
  case "$2" in
    failed|activating)
      fail "$1 ($2, $3)"
      ;;
    active)
      if [ "$1" = "quinn-ai-auto-respond" ]; then
        warn "$1 ($2, $3) - expected stopped"
      else
        ok "$1 ($2, $3)"
      fi
      ;;
    inactive|masked)
      if [ "$1" = "quinn-ai-auto-respond" ]; then
        ok "$1 ($2, $3)"
      else
        warn "$1 ($2, $3) - expected active"
      fi
      ;;
    *)
      warn "$1 ($2, $3)"
      ;;
  esac
}

# Read-only health snapshot: unit states, PSI pressure, wedge-guard state,
# auto-commit cadence, NVMe temperatures, boot history, and failed units.
cmd_check() {
  require_apricot
  echo "apricot-doctor check"
  echo

  echo "Services:"
  for s in dnsmasq systemd-oomd quinn-ai-auto-respond apricot-pressure-guard; do
    # is-active / is-enabled exit non-zero for anything but the happy state;
    # only the printed word matters here.
    state=$(systemctl is-active "$s" 2>/dev/null || true)
    enabled=$(systemctl is-enabled "$s" 2>/dev/null || true)
    report_service "$s" "$state" "$enabled"
  done

  echo
  echo "Pressure (avg10):"
  for p in cpu io memory; do
    line=$(awk '/^some/ {print; exit}' "/proc/pressure/$p" 2>/dev/null || true)
    avg10=$(printf %s "$line" | sed -n 's/.*avg10=\([0-9.]*\).*/\1/p')
    printf ' %-7s %s\n' "$p" "${avg10:-?}"
  done

  # Wedge guard: the io-pressure responder freezes the auto-commit burst when
  # tank io saturates (see project-apricot-io-starvation-wedge). Its live
  # state lives on tmpfs (/run), never tank, so it is readable during a wedge.
  echo
  echo "Wedge guard:"
  gstate=$(cat /run/apricot-pressure-guard/state 2>/dev/null || echo "?")
  case "$gstate" in
    running) ok "io-pressure guard: running (commits not throttled)" ;;
    frozen)  warn "io-pressure guard: FROZEN commits.service (io spike in progress)" ;;
    *)       warn "io-pressure guard: state unknown (daemon down?)" ;;
  esac

  # Auto-commit burst cadence: ~one burst of ~280 pipeline runs per cycle.
  # interval_seconds was raised 300→900 to cut the chronic io duty cycle.
  # grep -c exits 1 on zero matches (while still printing 0), hence || true.
  bursts=$(journalctl --since '20 minutes ago' -o cat 2>/dev/null \
    | grep -c 'Pipeline completed' || true)
  printf ' auto-commit pipelines (last 20m): %s\n' "${bursts:-?}"

  if command -v nvme >/dev/null 2>&1; then
    echo
    echo "NVMe composite temps:"
    for dev in /dev/nvme?n1; do
      # An unmatched glob stays literal; skip it.
      [ -e "$dev" ] || continue
      # Match exactly the "temperature" line (not "temperature_sensor_*"),
      # then take the first integer after the colon.
      t=$(sudo -n nvme smart-log "$dev" 2>/dev/null \
        | grep -E '^temperature[[:space:]]*:' \
        | head -1 \
        | sed -E 's/.*:[[:space:]]*([0-9]+).*/\1/')
      printf ' %s %s°C\n' "$dev" "${t:-?}"
    done
  fi

  echo
  echo "Boot history (last 3):"
  journalctl --list-boots --no-pager | tail -3

  echo
  # Detect "clean stop" on the previous boot by looking for the systemd
  # shutdown-target lines in its tail. Absent => journal cut off mid-stream
  # => unclean stop (wedge, crash, or hard power-off).
  if journalctl -b -1 --no-pager 2>/dev/null | tail -50 \
    | grep -q -e 'Reached target.*[Ss]hutdown' -e 'Stopped target' -e 'systemd-shutdown'; then
    ok "previous boot ended with orderly stop sequence"
  else
    warn "previous boot ended WITHOUT orderly stop sequence"
    warn " -> run 'apricot-doctor forensic -1' to dump post-mortem artifacts"
  fi

  echo
  # --plain drops the leading bullet glyph column, so the unit name is always
  # the first field regardless of how systemctl decorates its listing.
  failed_units=$(systemctl --failed --no-pager --no-legend --plain 2>/dev/null \
    | awk 'NF {print $1}')
  if [ -z "$failed_units" ]; then
    ok "no failed units"
  else
    failed_count=$(printf '%s\n' "$failed_units" | wc -l | tr -d ' ')
    warn "$failed_count failed unit(s):"
    printf '%s\n' "$failed_units" | sed 's/^/   /'
  fi
}

# -------------------------------------------------------------- fix dnsmasq -

# Pull session-tools, drop any stale .prev conf, run wg-dns-sync, and verify
# that dnsmasq is active afterwards.
# Returns: 3 if wg-dns-sync (or the stale-file removal) fails, 4 if dnsmasq is
# not active after the regen. Missing repo / failed pull exit 1 via die.
cmd_fix_dnsmasq() {
  require_apricot
  echo "fix dnsmasq: pull session-tools + regen + restart"

  [ -d "$repo/.git" ] || die "session-tools repo not at $repo"
  (cd "$repo" && git pull --ff-only --quiet) \
    || die "git pull failed in $repo (check network / merge state)"
  ok "session-tools updated"

  # Stale .prev was loaded by dnsmasq alongside the live conf, doubling
  # records. Newer wg-dns-sync writes the backup to /var/lib/. Remove the
  # stale file if the old script ever left one in dnsmasq.d/.
  if [ -e /etc/dnsmasq.d/wg-mesh.conf.prev ]; then
    sudo rm -f /etc/dnsmasq.d/wg-mesh.conf.prev \
      || { fail "could not remove /etc/dnsmasq.d/wg-mesh.conf.prev"; return 3; }
    ok "removed stale /etc/dnsmasq.d/wg-mesh.conf.prev"
  fi

  # Map a failed regen onto the documented exit code instead of letting
  # set -e leak whatever status the helper happened to return.
  "$repo/bin/wg-dns-sync" || { fail "wg-dns-sync failed"; return 3; }

  sleep 1
  state=$(systemctl is-active dnsmasq 2>/dev/null || true)
  case "$state" in
    active) ok "dnsmasq active" ;;
    *) fail "dnsmasq is $state after regen"; return 4 ;;
  esac
}

# ----------------------------------------------------------------- fix oomd -

# Install the PrivateTmp=no drop-in for systemd-oomd (skipped when the file
# already has the desired content), start the unit, and verify it is active.
# Returns: 3 if the drop-in cannot be installed, 4 if systemd-oomd is not
# active after the start attempt.
cmd_fix_oomd() {
  require_apricot
  echo "fix oomd: composefs PrivateTmp drop-in"

  dropin_dir=/etc/systemd/system/systemd-oomd.service.d
  dropin_file="$dropin_dir/composefs-private-tmp.conf"
  # No trailing newline here: $(cat ...) strips trailing newlines, so the
  # comparison below can only ever match a value that has none. The newline
  # is added back when the file is written.
  desired='[Service]
# Bluefin composefs ro-rootfs vs systemd PrivateTmp=disconnected:
# systemd cannot create /run/systemd/mount-rootfs/var/tmp on the ro root,
# so the unit fails at step NAMESPACE (status=226) on every boot.
# Drop to PrivateTmp=no — oomd has no real need for a private /tmp.
PrivateTmp=no'

  if [ -f "$dropin_file" ] && [ "$(cat "$dropin_file")" = "$desired" ]; then
    ok "drop-in already in place at $dropin_file"
  else
    if ! { sudo mkdir -p "$dropin_dir" \
      && printf '%s\n' "$desired" | sudo tee "$dropin_file" >/dev/null \
      && sudo systemctl daemon-reload; }; then
      fail "could not install $dropin_file"
      return 3
    fi
    ok "installed $dropin_file"
  fi

  sudo systemctl reset-failed systemd-oomd.service 2>/dev/null || true
  # A failed start must not abort under set -e: the verification below is
  # what reports the resulting state and yields the documented exit code 4.
  sudo systemctl start systemd-oomd.service || true

  sleep 1
  state=$(systemctl is-active systemd-oomd 2>/dev/null || true)
  case "$state" in
    active) ok "systemd-oomd active" ;;
    *) fail "systemd-oomd is $state after start"; return 4 ;;
  esac
}

# ------------------------------------------------------------- fix quinn-ai -

# Stop quinn-ai-auto-respond, remove its unit file from /etc/systemd/system,
# and mask it. When the unit already reports "masked" nothing is removed or
# re-created, so a re-run is a true no-op.
# Returns: 3 if the unit file cannot be removed or the mask cannot be applied,
# 4 if is-enabled does not report "masked" afterwards.
cmd_fix_quinn_ai() {
  require_apricot
  echo "fix quinn-ai: stop, delete unit, mask"

  unit_file=/etc/systemd/system/quinn-ai-auto-respond.service

  # Stop unconditionally: a unit masked while running keeps running.
  sudo systemctl stop quinn-ai-auto-respond.service 2>/dev/null || true

  enabled=$(systemctl is-enabled quinn-ai-auto-respond.service 2>/dev/null || true)
  if [ "$enabled" = "masked" ]; then
    # The path at $unit_file is now the mask symlink itself (it resolves to
    # /dev/null, so [ -e ] is true); deleting it would undo the mask.
    ok "already masked"
  else
    sudo systemctl disable quinn-ai-auto-respond.service 2>/dev/null || true
    # -L also catches a dangling symlink, which -e alone reports as absent.
    if [ -e "$unit_file" ] || [ -L "$unit_file" ]; then
      sudo rm -f "$unit_file" || { fail "could not remove $unit_file"; return 3; }
      ok "removed $unit_file"
    else
      ok "unit file already absent"
    fi
    sudo systemctl daemon-reload

    # mask now works because the symlink target is free
    sudo systemctl mask quinn-ai-auto-respond.service >/dev/null \
      || { fail "could not mask quinn-ai-auto-respond.service"; return 3; }
    ok "masked"
  fi

  state=$(systemctl is-active quinn-ai-auto-respond.service 2>/dev/null || true)
  enabled=$(systemctl is-enabled quinn-ai-auto-respond.service 2>/dev/null || true)
  case "$enabled" in
    masked) ok "is-active=$state is-enabled=$enabled" ;;
    *) fail "expected enabled=masked, got '$enabled'"; return 4 ;;
  esac
}

# Run the three fixes in order. Under set -e the first one that returns
# non-zero ends the script with that status; later fixes are not attempted.
cmd_fix_all() {
  cmd_fix_dnsmasq
  echo
  cmd_fix_oomd
  echo
  cmd_fix_quinn_ai
}

# ------------------------------------------------------------------ forensic

# cmd_forensic [N]
# Dump post-mortem artifacts for boot index N (default -1, the previous boot)
# into a fresh timestamped directory under $out_root. Every collector is
# best-effort: a failing one leaves an empty or partial file, not an abort.
cmd_forensic() {
  require_apricot
  idx=${1:--1}
  # Accept only an optional leading "-" followed by digits. The value lands
  # in a directory name and is handed to journalctl -b.
  case "$idx" in
    ''|-|*[!0-9-]*|?*-*)
      die "boot index must be an integer (e.g. -1 for previous boot)"
      ;;
  esac

  boot_id=$(journalctl --list-boots --no-pager \
    | awk -v i="$idx" '$1 == i {print $2; exit}')
  [ -n "$boot_id" ] \
    || die "no boot at index $idx (see: journalctl --list-boots)"

  stamp=$(date -u +%Y%m%dT%H%M%SZ)
  out="$out_root/forensic-$stamp-boot$idx-$boot_id"
  # Created via sudo, then handed to the invoking user so the plain
  # redirections below can write into it.
  sudo mkdir -p "$out"
  sudo chown "$(id -u):$(id -g)" "$out"
  echo "forensic dump: $out"

  journalctl --list-boots --no-pager > "$out/01-boots.txt" 2>&1 || true
  journalctl -b "$idx" -k --no-pager > "$out/02-kernel-ring.txt" 2>&1 || true
  journalctl -b "$idx" -p err --no-pager > "$out/03-priority-err.txt" 2>&1 || true
  journalctl -b "$idx" --no-pager | tail -500 > "$out/04-tail-500.txt" 2>&1 || true

  # error-signature grep, written with -e to avoid alternation quoting
  journalctl -b "$idx" --no-pager 2>/dev/null \
    | grep -e panic -e hung_task -e MCE -e EDAC -e segfault \
      -e 'Hardware Error' -e 'Out of memory' -e 'nvme.*error' \
      -e throttl -e oom-kill \
    > "$out/05-error-signatures.txt" 2>&1 || true

  sudo coredumpctl list > "$out/06-coredumps.txt" 2>&1 || true
  systemctl --failed --no-pager > "$out/07-failed-units.txt" 2>&1 || true
  rpm-ostree status > "$out/08-rpm-ostree.txt" 2>&1 || true

  {
    echo '--- /proc/pressure/cpu ---'; cat /proc/pressure/cpu 2>/dev/null
    echo '--- /proc/pressure/io ---'; cat /proc/pressure/io 2>/dev/null
    echo '--- /proc/pressure/memory ---'; cat /proc/pressure/memory 2>/dev/null
  } > "$out/09-pressure-now.txt" 2>&1 || true

  # Legacy detection-only guard logged here — on TANK, so it FROZE during the
  # very wedge it watched (silence in this file during a wedge window is
  # expected, not evidence of calm). The live guard now logs to the journal.
  if [ -f "$HOME/apricot-pressure-alerts.log" ]; then
    cp "$HOME/apricot-pressure-alerts.log" "$out/10-pressure-alerts.log" || true
  fi

  # io-pressure guard daemon journal: freeze/thaw events across this boot.
  journalctl -b "$idx" -t apricot-pressure-guard --no-pager \
    > "$out/11-pressure-guard.txt" 2>&1 || true

  # Count artifacts with a glob instead of parsing ls output.
  n=0
  for artifact in "$out"/*; do
    if [ -e "$artifact" ]; then
      n=$((n + 1))
    fi
  done
  ok "wrote $n artifact(s) to $out"
}

# --------------------------------------------------------------- dispatch ---

case "${1:-check}" in
  check|'')
    cmd_check
    ;;
  fix)
    shift
    case "${1:-}" in
      dnsmasq)  cmd_fix_dnsmasq ;;
      oomd)     cmd_fix_oomd ;;
      quinn-ai) cmd_fix_quinn_ai ;;
      all)      cmd_fix_all ;;
      *)        usage ;;
    esac
    ;;
  forensic)
    shift
    cmd_forensic "${1:-}"
    ;;
  -h|--help|help)
    # An explicit help request is not a usage error.
    usage 0
    ;;
  *)
    usage
    ;;
esac