feat(@scripts): ✨ add health-checking doctor script
Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
parent
8997920dc3
commit
95b3b65196
2 changed files with 319 additions and 4 deletions
298
bin/apricot-doctor
Executable file
298
bin/apricot-doctor
Executable file
|
|
@ -0,0 +1,298 @@
|
|||
#!/bin/sh
|
||||
# apricot-doctor Health snapshot of apricot's known-fragile parts
|
||||
# apricot-doctor check Same as above
|
||||
# apricot-doctor fix dnsmasq Pull session-tools + regen wg-mesh.conf + restart
|
||||
# apricot-doctor fix oomd Install composefs PrivateTmp drop-in + start
|
||||
# apricot-doctor fix quinn-ai Stop, delete unit, mask (half-finished cutover)
|
||||
# apricot-doctor fix all All three fix subcommands
|
||||
# apricot-doctor forensic [N] Dump post-mortem artifacts for boot index N
|
||||
# (default: -1, the previous boot)
|
||||
#
|
||||
# Targets the recurring boot-time fragility surfaced during the 2026-05-21
|
||||
# wedge investigation:
|
||||
#
|
||||
# 1. dnsmasq vs wg1 boot race ("Cannot assign requested address" for the
|
||||
# mesh IP). Durable fix lives in session-tools/bin/wg-dns-sync; this
|
||||
# script pulls the latest, regenerates the conf, and restarts.
|
||||
#
|
||||
# 2. systemd-oomd vs composefs ("Failed at step NAMESPACE" — systemd
|
||||
# cannot create /run/systemd/mount-rootfs/var/tmp on the read-only
|
||||
# composefs rootfs when PrivateTmp=disconnected). Drop-in overrides to
|
||||
# PrivateTmp=no.
|
||||
#
|
||||
# 3. quinn-ai-auto-respond.service half-finished cutover from black —
|
||||
# crash-loops every 10s with getaddrinfo ENOTIMP. User has explicitly
|
||||
# asked for it stopped and prevented from starting; cannot mask while
|
||||
# the real unit file occupies the symlink target, so rm-then-mask.
|
||||
#
|
||||
# Idempotent: re-runs are safe and report no-op for already-correct state.
|
||||
#
|
||||
# Env:
|
||||
# APRICOT_DOCTOR_OUT forensic output dir (default: /var/tmp/apricot-doctor)
|
||||
#
|
||||
# Exit codes:
|
||||
# 0 success
|
||||
# 1 precondition failed (wrong host, missing tool, repo missing)
|
||||
# 2 usage error
|
||||
# 3 fix could not be applied
|
||||
# 4 fix applied but verification failed
|
||||
|
||||
set -eu
|
||||
|
||||
out_root="${APRICOT_DOCTOR_OUT:-/var/tmp/apricot-doctor}"
|
||||
repo="$HOME/Code/@scripts/session-tools"
|
||||
|
||||
# die MESSAGE... — print "apricot-doctor: MESSAGE" on stderr and terminate
# the script with exit status 1 (the "precondition failed" code).
die() {
    echo "apricot-doctor: $*" 1>&2
    exit 1
}
|
||||
# usage — print this script's header comment (line 2 through the first blank
# line of "$0") with the leading "# " / "#" stripped, then exit 2.
usage() {
    sed -n '2,/^$/p' "$0" \
        | sed 's/^# \{0,1\}//'
    exit 2
}
|
||||
# ok MESSAGE... — print a green check mark followed by MESSAGE on stdout.
ok() {
    printf ' \033[32m✓\033[0m %s\n' "$*"
}
|
||||
# warn MESSAGE... — print a yellow "!" followed by MESSAGE on stdout.
warn() {
    printf ' \033[33m!\033[0m %s\n' "$*"
}
|
||||
# fail MESSAGE... — print a red cross followed by MESSAGE on stdout.
# Reporting only: it does not exit or change the return status flow.
fail() {
    printf ' \033[31m✗\033[0m %s\n' "$*"
}
|
||||
|
||||
# require_apricot — host guard. Aborts through die() (exit 1) unless the
# short hostname reported by `hostname -s` is exactly "apricot".
require_apricot() {
    current_host=$(hostname -s)
    if [ "$current_host" != "apricot" ]; then
        die "must be run on apricot (got: $current_host)"
    fi
}
|
||||
|
||||
# ------------------------------------------------------------------ check ---
|
||||
|
||||
# cmd_check — print a read-only health snapshot to stdout:
#   * state of dnsmasq, systemd-oomd and quinn-ai-auto-respond
#   * PSI "some avg10" for cpu / io / memory
#   * NVMe composite temperatures (only when the nvme tool is installed)
#   * the last three entries of the boot list
#   * whether the previous boot's journal ends with a shutdown sequence
#   * the list of failed units
# Takes no arguments. Aborts via require_apricot on any other host.
cmd_check() {
    require_apricot
    echo "apricot-doctor check"

    echo
    echo "Services:"
    for s in dnsmasq systemd-oomd quinn-ai-auto-respond; do
        state=$(systemctl is-active "$s" 2>/dev/null || true)
        enabled=$(systemctl is-enabled "$s" 2>/dev/null || true)
        case "$s:$state" in
            *:active)
                ok "$s ($state, $enabled)" ;;
            quinn-ai-auto-respond:inactive|quinn-ai-auto-respond:masked)
                # inactive+masked or inactive+disabled is the desired
                # end-state for quinn-ai only. Treat as ok.
                ok "$s ($state, $enabled)" ;;
            *:inactive|*:masked|*:failed|*:activating)
                # dnsmasq and systemd-oomd are supposed to be running, so a
                # stopped unit is as much a failure as a crashed one.
                fail "$s ($state, $enabled)" ;;
            *)
                warn "$s ($state, $enabled)" ;;
        esac
    done

    echo
    echo "Pressure (avg10):"
    for p in cpu io memory; do
        # First "some ..." line of the PSI file; empty when PSI is unavailable.
        line=$(awk '/^some/ {print; exit}' "/proc/pressure/$p" 2>/dev/null || true)
        avg10=$(printf %s "$line" | sed -n 's/.*avg10=\([0-9.]*\).*/\1/p')
        printf ' %-7s %s\n' "$p" "${avg10:-?}"
    done

    if command -v nvme >/dev/null 2>&1; then
        echo
        echo "NVMe composite temps:"
        for dev in /dev/nvme?n1; do
            # An unmatched glob stays literal; skip it.
            [ -e "$dev" ] || continue
            # sudo -n: never prompt; a missing credential just yields "?".
            t=$(sudo -n nvme smart-log "$dev" 2>/dev/null \
                | awk -F: '/^temperature/ {gsub(/[^0-9.]/,"",$2); print $2; exit}')
            printf ' %s %s°C\n' "$dev" "${t:-?}"
        done
    fi

    echo
    echo "Boot history (last 3):"
    journalctl --list-boots --no-pager | tail -3

    echo
    # Detect "clean stop" on the previous boot by looking for the systemd
    # shutdown-target lines in its tail. Absent => journal cut off mid-stream
    # => unclean stop (wedge, crash, or hard power-off).
    if journalctl -b -1 --no-pager 2>/dev/null | tail -50 \
        | grep -q -e 'Reached target.*[Ss]hutdown' -e 'Stopped target' -e 'systemd-shutdown'; then
        ok "previous boot ended with orderly stop sequence"
    else
        warn "previous boot ended WITHOUT orderly stop sequence"
        warn " -> run 'apricot-doctor forensic -1' to dump post-mortem artifacts"
    fi

    echo
    # --plain drops the leading status bullet that systemctl otherwise puts
    # in column 1 for failed units, so $1 really is the unit name. Query once
    # so the count and the list cannot disagree.
    failed_units=$(systemctl --failed --plain --no-pager --no-legend 2>/dev/null \
        | awk 'NF {print $1}')
    if [ -z "$failed_units" ]; then
        ok "no failed units"
    else
        failed_count=$(printf '%s\n' "$failed_units" | wc -l | tr -d ' ')
        warn "$failed_count failed unit(s):"
        printf '%s\n' "$failed_units" | sed 's/^/ /'
    fi
}
|
||||
|
||||
# -------------------------------------------------------------- fix dnsmasq -
|
||||
|
||||
# cmd_fix_dnsmasq — fast-forward the session-tools checkout at $repo, remove
# a leftover wg-mesh.conf.prev from /etc/dnsmasq.d, run wg-dns-sync, and
# confirm dnsmasq is active afterwards.
# Returns 4 when dnsmasq is not active after the regen; dies (exit 1) when
# the repo is missing or the pull fails.
cmd_fix_dnsmasq() {
    require_apricot
    echo "fix dnsmasq: pull session-tools + regen + restart"

    if [ ! -d "$repo/.git" ]; then
        die "session-tools repo not at $repo"
    fi
    if ! (cd "$repo" && git pull --ff-only --quiet); then
        die "git pull failed in $repo (check network / merge state)"
    fi
    ok "session-tools updated"

    # Stale .prev was loaded by dnsmasq alongside the live conf, doubling
    # records. Newer wg-dns-sync writes the backup to /var/lib/. Remove the
    # stale file if the old script ever left one in dnsmasq.d/.
    stale_backup=/etc/dnsmasq.d/wg-mesh.conf.prev
    if [ -e "$stale_backup" ]; then
        sudo rm -f "$stale_backup"
        ok "removed stale $stale_backup"
    fi

    "$repo/bin/wg-dns-sync"

    # Give the restarted daemon a moment before sampling its state.
    sleep 1
    dnsmasq_state=$(systemctl is-active dnsmasq 2>/dev/null || true)
    if [ "$dnsmasq_state" != "active" ]; then
        fail "dnsmasq is $dnsmasq_state after regen"
        return 4
    fi
    ok "dnsmasq active"
}
|
||||
|
||||
# ----------------------------------------------------------------- fix oomd -
|
||||
|
||||
# _oomd_dropin_is_current FILE CONTENT — succeed only when FILE is a regular
# file whose bytes are exactly CONTENT. Uses cmp rather than "$(cat FILE)"
# because command substitution strips trailing newlines, which made the old
# string comparison against a newline-terminated CONTENT always fail.
_oomd_dropin_is_current() {
    [ -f "$1" ] && printf '%s' "$2" | cmp -s - "$1"
}

# cmd_fix_oomd — install the PrivateTmp=no drop-in for systemd-oomd (only
# when it is missing or differs), clear any failed state, start the service
# and verify it is active.
# Returns 4 when systemd-oomd is not active after the start.
cmd_fix_oomd() {
    require_apricot
    echo "fix oomd: composefs PrivateTmp drop-in"

    dropin_dir=/etc/systemd/system/systemd-oomd.service.d
    dropin_file="$dropin_dir/composefs-private-tmp.conf"
    # NB: the literal ends with a newline so the written file does too.
    desired='[Service]
# Bluefin composefs ro-rootfs vs systemd PrivateTmp=disconnected:
# systemd cannot create /run/systemd/mount-rootfs/var/tmp on the ro root,
# so the unit fails at step NAMESPACE (status=226) on every boot.
# Drop to PrivateTmp=no — oomd has no real need for a private /tmp.
PrivateTmp=no
'

    if _oomd_dropin_is_current "$dropin_file" "$desired"; then
        ok "drop-in already in place at $dropin_file"
    else
        sudo mkdir -p "$dropin_dir"
        printf '%s' "$desired" | sudo tee "$dropin_file" >/dev/null
        sudo systemctl daemon-reload
        ok "installed $dropin_file"
    fi

    # reset-failed is best-effort: it errors when there is nothing to reset.
    sudo systemctl reset-failed systemd-oomd.service 2>/dev/null || true
    sudo systemctl start systemd-oomd.service
    sleep 1
    state=$(systemctl is-active systemd-oomd 2>/dev/null || true)
    case "$state" in
        active) ok "systemd-oomd active" ;;
        *) fail "systemd-oomd is $state after start"; return 4 ;;
    esac
}
|
||||
|
||||
# ------------------------------------------------------------- fix quinn-ai -
|
||||
|
||||
# _unit_is_masked_link PATH — succeed only when PATH is a symlink pointing
# at /dev/null, which is what `systemctl mask` leaves behind.
_unit_is_masked_link() {
    [ -L "$1" ] && [ "$(readlink "$1")" = /dev/null ]
}

# cmd_fix_quinn_ai — stop and disable quinn-ai-auto-respond, remove its unit
# file from /etc/systemd/system and mask the unit, then verify that
# `systemctl is-enabled` reports "masked".
# A unit that is already masked is left untouched and reported as such, so
# re-runs do not delete and recreate the mask symlink.
# Returns 4 when the unit is not masked at the end.
cmd_fix_quinn_ai() {
    require_apricot
    echo "fix quinn-ai: stop, delete unit, mask"

    unit_file=/etc/systemd/system/quinn-ai-auto-respond.service

    # Both are best-effort: the unit may already be stopped, gone or masked.
    sudo systemctl stop quinn-ai-auto-respond.service 2>/dev/null || true
    sudo systemctl disable quinn-ai-auto-respond.service 2>/dev/null || true

    if _unit_is_masked_link "$unit_file"; then
        ok "already masked"
    else
        # -L as well as -e: a dangling symlink fails -e but would still
        # occupy the path that mask needs.
        if [ -e "$unit_file" ] || [ -L "$unit_file" ]; then
            sudo rm -f "$unit_file"
            ok "removed $unit_file"
        else
            ok "unit file already absent"
        fi

        sudo systemctl daemon-reload
        # mask now works because the symlink target is free
        sudo systemctl mask quinn-ai-auto-respond.service >/dev/null
        ok "masked"
    fi

    state=$(systemctl is-active quinn-ai-auto-respond.service 2>/dev/null || true)
    enabled=$(systemctl is-enabled quinn-ai-auto-respond.service 2>/dev/null || true)
    case "$enabled" in
        masked) ok "is-active=$state is-enabled=$enabled" ;;
        *) fail "expected enabled=masked, got '$enabled'"; return 4 ;;
    esac
}
|
||||
|
||||
# cmd_fix_all — run the dnsmasq, oomd and quinn-ai fixes in that order,
# printing a blank line between consecutive fixes. The return status is
# that of the last fix executed.
cmd_fix_all() {
    need_separator=
    for fixer in cmd_fix_dnsmasq cmd_fix_oomd cmd_fix_quinn_ai; do
        if [ -n "$need_separator" ]; then
            echo
        fi
        need_separator=yes
        "$fixer"
    done
}
|
||||
|
||||
# ------------------------------------------------------------------ forensic
|
||||
|
||||
# cmd_forensic [N] — dump post-mortem artifacts for boot index N into a new
# directory "$out_root/forensic-<UTC stamp>-boot<N>-<boot id>".
#   $1  boot index as understood by `journalctl -b` (optionally signed
#       decimal integer); empty or omitted means -1, the previous boot.
# Dies (exit 1) when N is not an integer or no such boot is listed.
# Every individual collector is best-effort: a failing tool leaves its error
# text in the artifact file instead of aborting the dump.
cmd_forensic() {
    require_apricot
    idx=${1:--1}
    # Strict integer check: strip one optional leading "-" and require the
    # remainder to be non-empty and digits only. A glob such as "[0-9]*"
    # would also admit "-1.0" or "1e0", which awk below compares numerically
    # equal to a real index while journalctl -b rejects them.
    case "${idx#-}" in
        ''|*[!0-9]*)
            die "boot index must be an integer (e.g. -1 for previous boot)" ;;
    esac

    boot_id=$(journalctl --list-boots --no-pager \
        | awk -v i="$idx" '$1 == i {print $2; exit}')
    [ -n "$boot_id" ] \
        || die "no boot at index $idx (see: journalctl --list-boots)"

    stamp=$(date -u +%Y%m%dT%H%M%SZ)
    out="$out_root/forensic-$stamp-boot$idx-$boot_id"
    # Created with sudo, then handed to the invoking user so the plain
    # redirections below can write into it.
    sudo mkdir -p "$out"
    sudo chown "$(id -u):$(id -g)" "$out"
    echo "forensic dump: $out"

    journalctl --list-boots --no-pager > "$out/01-boots.txt" 2>&1 || true
    journalctl -b "$idx" -k --no-pager > "$out/02-kernel-ring.txt" 2>&1 || true
    journalctl -b "$idx" -p err --no-pager > "$out/03-priority-err.txt" 2>&1 || true
    journalctl -b "$idx" --no-pager | tail -500 > "$out/04-tail-500.txt" 2>&1 || true
    # error-signature grep, written with -e to avoid alternation quoting
    journalctl -b "$idx" --no-pager 2>/dev/null \
        | grep -e panic -e hung_task -e MCE -e EDAC -e segfault \
            -e 'Hardware Error' -e 'Out of memory' -e 'nvme.*error' \
            -e throttl -e oom-kill \
        > "$out/05-error-signatures.txt" 2>&1 || true
    sudo coredumpctl list > "$out/06-coredumps.txt" 2>&1 || true
    systemctl --failed --no-pager > "$out/07-failed-units.txt" 2>&1 || true
    rpm-ostree status > "$out/08-rpm-ostree.txt" 2>&1 || true
    {
        echo '--- /proc/pressure/cpu ---'; cat /proc/pressure/cpu 2>/dev/null
        echo '--- /proc/pressure/io ---'; cat /proc/pressure/io 2>/dev/null
        echo '--- /proc/pressure/memory ---'; cat /proc/pressure/memory 2>/dev/null
    } > "$out/09-pressure-now.txt" 2>&1 || true
    # Optional artifact: only present when the alert log exists.
    if [ -f "$HOME/apricot-pressure-alerts.log" ]; then
        cp "$HOME/apricot-pressure-alerts.log" "$out/10-pressure-alerts.log" || true
    fi

    # Artifact names are fixed above, so counting ls lines is safe here.
    n=$(ls "$out" | wc -l | tr -d ' ')
    ok "wrote $n artifact(s) to $out"
}
|
||||
|
||||
# --------------------------------------------------------------- dispatch ---
|
||||
|
||||
# Sub-command dispatch. No argument (or an empty one) means "check".
# Usage errors print the help text on stderr and exit 2 via usage();
# an explicit help request prints it on stdout and exits 0.
case "${1:-check}" in
    check)
        cmd_check
        ;;
    fix)
        shift
        case "${1:-}" in
            dnsmasq)  cmd_fix_dnsmasq ;;
            oomd)     cmd_fix_oomd ;;
            quinn-ai) cmd_fix_quinn_ai ;;
            all)      cmd_fix_all ;;
            *)        usage >&2 ;;
        esac
        ;;
    forensic)
        shift
        cmd_forensic "${1:-}"
        ;;
    -h|--help|help)
        # usage() always exits 2; run it in a subshell so that asking for
        # help is not reported as a usage error.
        ( usage ) || true
        exit 0
        ;;
    *)
        usage >&2
        ;;
esac
|
||||
|
|
@ -64,7 +64,9 @@ host=$(hostname -s)
|
|||
printf '# Bind only to the wg1 IP so this view is invisible to LAN/loopback clients\n'
|
||||
printf '# (which are served by lilith-local.conf with split-horizon 127.0.0.1 records).\n'
|
||||
printf 'listen-address=%s\n' "$listen"
|
||||
printf 'bind-interfaces\n'
|
||||
# bind-dynamic (not bind-interfaces): binds the listen-address as it appears,
|
||||
# so dnsmasq doesn't lose the boot race against wg1 coming up.
|
||||
printf 'bind-dynamic\n'
|
||||
printf '\n'
|
||||
printf '# DNS records (one per record entry in wg-mesh-hosts.json)\n'
|
||||
jq -r '.records[] | "address=/\(.name|sub("^\\.";""))/\(.ip) # \(.comment // "")"' "$data_file"
|
||||
|
|
@ -90,8 +92,23 @@ fi
|
|||
# Back up current target (if any) before replacing — undo handled by re-running
|
||||
# wg-dns-sync after editing the JSON, NOT by restoring this backup. Backup is
|
||||
# audit-only; safe to delete.
|
||||
#
|
||||
# Backup MUST NOT live in /etc/dnsmasq.d/ because dnsmasq's conf-dir loads every
|
||||
# file there that isn't on its (small) exclude list; .prev was not excluded and
|
||||
# got parsed as a second config, silently doubling listen-address= and address=
|
||||
# entries.
|
||||
backup_dir=/var/lib/wg-dns-sync
|
||||
backup="$backup_dir/wg-mesh.conf.prev"
|
||||
sudo mkdir -p "$backup_dir"
|
||||
|
||||
# One-shot migration: earlier versions of this script wrote the backup as
|
||||
# ${target}.prev inside /etc/dnsmasq.d/. Remove that stale file if present.
|
||||
if [ -e "${target}.prev" ]; then
|
||||
sudo rm -f "${target}.prev"
|
||||
fi
|
||||
|
||||
if [ -r "$target" ]; then
|
||||
sudo cp "$target" "${target}.prev"
|
||||
sudo install -m 0644 -o root -g root "$target" "$backup"
|
||||
fi
|
||||
|
||||
sudo install -m 0644 -o root -g root "$tmp" "$target"
|
||||
|
|
@ -101,8 +118,8 @@ echo "wg-dns-sync: installed $target"
|
|||
# unreachable because wg1 is down), restore the previous conf and exit 3.
|
||||
if ! sudo systemctl restart dnsmasq; then
|
||||
echo "wg-dns-sync: dnsmasq failed to restart, rolling back" >&2
|
||||
if [ -r "${target}.prev" ]; then
|
||||
sudo install -m 0644 -o root -g root "${target}.prev" "$target"
|
||||
if [ -r "$backup" ]; then
|
||||
sudo install -m 0644 -o root -g root "$backup" "$target"
|
||||
else
|
||||
sudo rm -f "$target"
|
||||
fi
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue