commit 03e47fc4df3ae77f12e9f976c2cfd3cf535f9b01 Author: Natalie Date: Tue Jun 9 19:53:08 2026 -0700 feat(@tools/net-tools): ✨ add mesh/lan tooling with host renderers Co-Authored-By: Lilith Autocommit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2c8b1b0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +# Secrets live outside this repo (see wireguard-vpn-tray ~/.wireguard/) +*.key +wg*.conf +.env +__pycache__/ +*.pyc +# Volatile discovered state (current LAN IPs) — written by the daemon, not source. +data/lan-state.json diff --git a/README.md b/README.md new file mode 100644 index 0000000..4ea3b4a --- /dev/null +++ b/README.md @@ -0,0 +1,79 @@ +# net-tools + +Mesh/LAN tooling for the four-host **wg1 mesh** + home LAN, built around one +source of truth ([`data/mesh-hosts.json`](data/mesh-hosts.json)). + +Components: +- **`bin/`** — renderers that project the source of truth onto each device: + `host-apply` (ssh config), `mesh-hosts-render` (`/etc/hosts`), `wg-dns-sync` + (apricot's mesh dnsmasq). +- **[`smart-lan-router/`](smart-lan-router/)** — the policy-routing daemon that + makes the LAN "smart": the laptop automatically uses the 5ms LAN path to home + hosts when home, and the WireGuard tunnel when away — identity-gated so it + never routes to a stranger at the same RFC1918 IP. (The home gateway is a dumb + Xfinity box with no API; the intelligence lives here, on the client.) + +Everything that needs a host address, MAC, or identity probe derives from one +file: [`data/mesh-hosts.json`](data/mesh-hosts.json). Never hardcode a mesh IP, +MAC, or identity URL anywhere else — add it here and regenerate. 
+ +## The four hosts — fruit family encodes machine class + +| Class | Canonical | Old alias | LAN | WG mesh | Public | +|-------|-----------|-----------|-----|---------|--------| +| GPU compute (stone fruit) | **apricot** | — | `10.0.0.116` | `10.9.0.2` | — | +| CPU / storage (pome) | **pear** | `black` | `10.0.0.11` | `10.9.0.4` | — | +| laptop (vegetable) | **fennel** | `plum` | *roams* | `10.9.0.3` | — | +| cloud hub (citrus) | **yuzu** | `vps`,`quinn-vps` | — | `10.9.0.1` | `89.127.233.145` | + +Names are mid-migration (**alias-first**): the source of truth declares the fruit +name canonical with the old name as an alias, and every renderer emits **both**, +so `pear.wg` *and* `black.wg` resolve during the transition. Live infra (forge +URL, NFS, ssh) still uses old names until the gated cutovers land — see +[`docs/topology.md`](docs/topology.md#fleet-rename). + +## Naming: two views, one rule + +The suffix is authoritative — a name is never ambiguous: + +- **`.wg`** → mesh IP (`10.9.0.x`). Works anywhere the tunnel is up. +- **`.lan`** → LAN IP (`10.0.0.x`). Home network only. + +(The old `*.local` scheme is **retired** — the platform moved to real `.com` +domains and infra to `.lan`. net-tools carries no `.local` records.) + +## Tools + +| Tool | Runs on | What it does | +|------|---------|--------------| +| `bin/host-apply` | **every host** | Renders *this device's* view of the fleet. Detects which host it is, then writes a managed ssh-config block (`~/.ssh/config`) with per-vantage `HostName`s: `public` > `.lan` (if this host reaches the LAN) > `.wg`. `--whoami`/`--ssh-print`/`--ssh-diff`/`--ssh-apply`. The hosts leg is `mesh-hosts-render`. | +| `smart-lan-router/smart-lan-router.py` | **fennel** (laptop) | LaunchDaemon. Detects HOME (default gateway's MAC == `lan.gateway_mac`) and switches the home `/24`: **HOME → route `10.0.0.0/24` via the LAN interface** (direct, ~5ms); **AWAY → via the wg mesh** (home reachable through the tunnel). 
One subnet route, normal ARP — drift-immune (any DHCP IP works) and free of the self-MAC bug. `--status` to inspect. Supersedes the old per-host `/32` pinner *and* the `wg-route-watchdog`. | +| `bin/wg-dns-sync` | **apricot** | Renders `mesh-hosts.json` → `/etc/dnsmasq.d/wg-mesh.conf` (host `.wg` + `.lan` records on `10.9.0.2:53`, for wg clients with `DNS=10.9.0.2`). Idempotent; `--dry-run`. | +| `bin/mesh-hosts-render` | any (esp. **fennel**) | Renders a static `/etc/hosts` block for roaming clients. `--print`/`--diff`/`--install`. | +| `smart-lan-router/` | **fennel** | `com.lilith.smart-lan-router.plist` (launchd) + `install-smart-router.sh` (installs it, retires the old loose copies). | + +All tools locate `data/mesh-hosts.json` by resolving their own symlink chain and +walking up to the repo, so they work whether run from the repo or a PATH symlink. + +## Install + +```sh +./install.sh # symlink bin/* into ~/bin or ~/.local/bin +sudo smart-lan-router/install-smart-router.sh # install + start the LaunchDaemon (fennel only) +``` + +## Changing addresses / hosts + +1. Edit [`data/mesh-hosts.json`](data/mesh-hosts.json). +2. apricot: `sudo wg-dns-sync` · roaming clients: `sudo mesh-hosts-render --install`. +3. The daemon re-reads the file each cycle — no restart needed. + +Never hand-edit `/etc/dnsmasq.d/wg-mesh.conf` or the managed `/etc/hosts` block — +both are generated and overwritten on the next run. + +## Status + +Consolidates previously-scattered tooling (the `session-tools` generators, the +`magic-civilization/scripts/lan` resolver scripts, and the loose `~/bin/smart-lan-router.py` +daemon) into one repo. Pending gated cutovers (apricot DNS, the fleet rename, +retiring originals) are in [`docs/topology.md`](docs/topology.md#migration). 
diff --git a/bin/host-apply b/bin/host-apply new file mode 100755 index 0000000..bab063a --- /dev/null +++ b/bin/host-apply @@ -0,0 +1,151 @@ +#!/bin/sh +# host-apply — render THIS device's view of the fleet from data/mesh-hosts.json. +# +# Unlike the other renderers (which emit a uniform artifact), host-apply detects +# which host it runs on and computes addresses from THAT device's vantage point: +# +# ssh HostName for self→target = +# target.public if the target has a public IP (robust, always up) +# target.lan elif target has a LAN IP AND self can reach the LAN +# (self has a LAN IP, or self is the roaming laptop — +# the wg tunnel routes the LAN /24, and the +# smart-lan-router daemon makes it direct when home) +# target.wg else (mesh-only) +# +# It writes a single managed block (Host → HostName/User) to the +# invoking user's ~/.ssh/config, placed at the TOP so it wins first-match over +# any hand-maintained stanzas. Old names are kept as Host aliases (alias-first). +# +# Self is identified by matching the box's hostname/short-name or any local IPv4 +# (incl. the wg IP) against hosts[].{name,aliases,lan,wg}. +# +# Usage: +# host-apply # --ssh-print : print this device's ssh block (default) +# host-apply --ssh-diff # diff against current ~/.ssh/config +# host-apply --ssh-apply # splice/replace the managed block (backs up first) +# host-apply --whoami # just print which host this device resolves to +# +# Companion (run separately, needs root): `mesh-hosts-render --install` writes +# this device's /etc/hosts view (the .wg/.lan names). Together they cover a +# device's ssh + hosts views from the one source of truth. 
+ +set -eu + +mode=ssh-print +case "${1:-}" in + ""|--ssh-print) mode=ssh-print ;; + --ssh-diff) mode=ssh-diff ;; + --ssh-apply) mode=ssh-apply ;; + --whoami) mode=whoami ;; + *) echo "host-apply: unknown arg '$1'" >&2; exit 1 ;; +esac + +BEGIN='# >>> net-tools fleet (managed by host-apply) — do not edit by hand' +END='# <<< net-tools fleet' +SSH_CONFIG="$HOME/.ssh/config" + +# --- locate data file (symlink-resolving walk) --------------------------------- +self_path=$0 +while [ -L "$self_path" ]; do + link=$(readlink "$self_path") + case $link in /*) self_path=$link ;; *) self_path=$(dirname "$self_path")/$link ;; esac +done +root=$(cd "$(dirname "$self_path")" && pwd) +while [ "$root" != "/" ] && [ ! -f "$root/data/mesh-hosts.json" ]; do root=$(dirname "$root"); done +data_file="$root/data/mesh-hosts.json" +[ -f "$data_file" ] || { echo "host-apply: cannot locate data/mesh-hosts.json" >&2; exit 1; } +command -v jq >/dev/null || { echo "host-apply: jq not installed" >&2; exit 1; } + +# Overlay: current LAN IPs discovered by the daemon (data/lan-state.json, a +# {name: ip} map) override the static `lan` seed, so ssh tracks DHCP drift. +overlay='{}' +state_file="$root/data/lan-state.json" +if [ -f "$state_file" ] && jq -e . "$state_file" >/dev/null 2>&1; then + overlay=$(cat "$state_file") +fi + +# --- identify self ------------------------------------------------------------- +short=$(hostname 2>/dev/null | cut -d. -f1) +[ -n "$short" ] || short=$(uname -n | cut -d. -f1) +if command -v ip >/dev/null 2>&1; then + local_ips=$(ip -o -4 addr show 2>/dev/null | awk '{print $4}' | cut -d/ -f1) +else + local_ips=$(ifconfig 2>/dev/null | awk '/inet /{print $2}') +fi +ips_json=$(printf '%s\n' $local_ips | jq -R . | jq -s .) + +self=$(jq -r --arg h "$short" --argjson ips "$ips_json" ' + [ .hosts[] + | . 
as $x + | select( ($x.name == $h) + or ($x.aliases | index($h)) + or ($x.lan != null and ($ips | index($x.lan))) + or ($ips | index($x.wg)) ) + | $x.name ] | first // empty +' "$data_file") + +[ -n "$self" ] || { echo "host-apply: could not identify this host (short=$short, ips=$local_ips) in mesh-hosts.json" >&2; exit 1; } + +if [ "$mode" = "whoami" ]; then + echo "$self" + exit 0 +fi + +# self_reaches_lan: a host with its own LAN IP, or the roaming laptop (tunnel +# routes 10.0.0.0/24; the daemon makes it direct when home). +reachlan=$(jq -r --arg s "$self" ' + .hosts[] | select(.name == $s) + | ((.lan != null) or (.class == "laptop")) +' "$data_file") + +# --- render this device's ssh block -------------------------------------------- +render_block() { + printf '%s\n' "$BEGIN" + printf '# rendered for: %s (vantage: %s)\n' "$self" \ + "$( [ "$reachlan" = "true" ] && echo 'LAN-capable → prefer .lan' || echo 'mesh-only → prefer .wg' )" + jq -r --arg s "$self" --argjson reachlan "$reachlan" --argjson ov "$overlay" ' + .hosts[] + | select(.name != $s) + | . as $h + | (($ov[$h.name]) // $h.lan) as $lan + | ( $h.public + // (if $reachlan and $lan != null then $lan else null end) + // $h.wg ) as $addr + | "\nHost \(([$h.name] + $h.aliases) | join(" "))\n HostName \($addr)\n User \($h.ssh_user // "lilith")" + ' "$data_file" + printf '\n%s\n' "$END" +} + +block=$(render_block) + +if [ "$mode" = "ssh-print" ]; then + printf '%s\n' "$block" + exit 0 +fi + +# Strip any existing managed block, then prepend the fresh one (top = wins). 
+current="" +[ -f "$SSH_CONFIG" ] && current=$(cat "$SSH_CONFIG") +stripped=$(printf '%s\n' "$current" | awk -v b="$BEGIN" -v e="$END" ' + $0 == b { skip = 1 } skip != 1 { print } $0 == e { skip = 0 }' | awk 'NF { seen = 1 } seen { print }')  # drop leading blank lines (the previous run's separator) so re-runs are byte-stable instead of growing the file +new=$(printf '%s\n\n%s\n' "$block" "$stripped") + +if [ "$mode" = "ssh-diff" ]; then + if command -v diff >/dev/null 2>&1; then + printf '%s\n' "$new" | diff -u "$( [ -f "$SSH_CONFIG" ] && printf '%s' "$SSH_CONFIG" || printf '%s' /dev/null )" - || true  # no ssh config yet: diff against /dev/null + else + printf '%s\n' "$new" + fi + exit 0 +fi + +# --ssh-apply +if [ -f "$SSH_CONFIG" ] && printf '%s\n' "$new" | cmp -s - "$SSH_CONFIG"; then + echo "host-apply: $SSH_CONFIG already up to date for $self" + exit 0 +fi +mkdir -p "$HOME/.ssh"; chmod 700 "$HOME/.ssh" +[ -f "$SSH_CONFIG" ] && cp "$SSH_CONFIG" "$SSH_CONFIG.netbak" +printf '%s\n' "$new" > "$SSH_CONFIG" +chmod 600 "$SSH_CONFIG" +echo "host-apply: wrote $self's fleet block to $SSH_CONFIG (backup: $SSH_CONFIG.netbak)" diff --git a/bin/mesh-hosts-render b/bin/mesh-hosts-render new file mode 100755 index 0000000..a36fc78 --- /dev/null +++ b/bin/mesh-hosts-render @@ -0,0 +1,142 @@ +#!/bin/sh +# mesh-hosts-render — render a static /etc/hosts block for the wg1 mesh from +# data/mesh-hosts.json, and optionally splice it into /etc/hosts. +# +# For ROAMING / DNS-less clients (plum off-LAN, or any host before dnsmasq is +# reachable). On-LAN hosts get the same names from dnsmasq; this static block is +# the fallback that always works while the tunnel is up. +# +# Emits a marked, idempotently-replaceable block: +# # >>> mesh-hosts (managed by smart-lan-router/bin/mesh-hosts-render) +# 10.9.0.2 apricot.wg apricot +# ... +# # <<< mesh-hosts +# +# .wg names map to mesh IPs (always reachable via tunnel). .lan names map to LAN +# IPs (home network only) for hosts that have a stable LAN IP. Roaming hosts +# (lan == null) get no .lan record. The bare host name aliases the .wg view. 
+# +# Usage: +# mesh-hosts-render # print the block to stdout (default; safe) +# mesh-hosts-render --install # splice/replace the block in /etc/hosts (needs root) +# mesh-hosts-render --diff # show what --install would change, no write +# +# Exit codes: +# 0 success (printed, or installed, or already up to date) +# 1 missing dependency / unlocatable or invalid JSON +# 2 --install needs root but it isn't available + +set -eu + +mode=print +case "${1:-}" in + ""|--print) mode=print ;; + --install) mode=install ;; + --diff) mode=diff ;; + *) echo "mesh-hosts-render: unknown arg '$1' (use --print|--install|--diff)" >&2; exit 1 ;; +esac + +BEGIN='# >>> mesh-hosts (managed by smart-lan-router/bin/mesh-hosts-render)' +END='# <<< mesh-hosts' +# Legacy block this tool replaces — stripped on install so its stale (drifted) +# host entries don't shadow ours in first-match resolution. +LEGACY_BEGIN='# >>> LAN hosts — managed by setup-lan-dns.sh' +LEGACY_END='# <<< LAN hosts' +HOSTS_FILE=/etc/hosts + +# --- locate data file, surviving symlink invocation ---------------------------- +self=$0 +while [ -L "$self" ]; do + link=$(readlink "$self") + case $link in + /*) self=$link ;; + *) self=$(dirname "$self")/$link ;; + esac +done +root=$(cd "$(dirname "$self")" && pwd) +while [ "$root" != "/" ] && [ ! -f "$root/data/mesh-hosts.json" ]; do + root=$(dirname "$root") +done +data_file="$root/data/mesh-hosts.json" +[ -f "$data_file" ] || { echo "mesh-hosts-render: cannot locate data/mesh-hosts.json (from $self)" >&2; exit 1; } + +command -v jq >/dev/null || { echo "mesh-hosts-render: jq not installed" >&2; exit 1; } +jq empty "$data_file" || { echo "mesh-hosts-render: invalid JSON in $data_file" >&2; exit 1; } + +# Overlay: current LAN IPs discovered by the daemon (data/lan-state.json, a +# {name: ip} map) override the static `lan` seed, so records track DHCP drift. +overlay='{}' +state_file="$root/data/lan-state.json" +if [ -f "$state_file" ] && jq -e . 
"$state_file" >/dev/null 2>&1; then + overlay=$(cat "$state_file") +fi + +render_block() { + printf '%s\n' "$BEGIN" + printf '# Auto-generated from smart-lan-router/data/mesh-hosts.json — re-run to update.\n' + printf '# .wg = mesh IP (anywhere via tunnel) · .lan = LAN IP (home network only)\n' + # Mesh (.wg) records + bare-name alias to the mesh view. + jq -r ' + .hosts[] + | . as $h + | "\($h.wg)\t" + (([($h.name + ".wg"), $h.name] + (($h.aliases // []) | map(. + ".wg") + .)) | join(" ")) + ' "$data_file" + # LAN (.lan) records for the name AND every alias (matching wg-dns-sync) — current discovered IP (overlay) wins over the static seed. + jq -r --argjson ov "$overlay" ' + .hosts[] + | . as $h + | (($ov[$h.name]) // $h.lan) as $lan + | select($lan != null) + | "\($lan)\t" + (([$h.name] + ($h.aliases // [])) | map(. + ".lan") | join(" ")) + ' "$data_file" + printf '%s\n' "$END" +} + +block=$(render_block) + +if [ "$mode" = "print" ]; then + printf '%s\n' "$block" + exit 0 +fi + +# Compute the new /etc/hosts: our block first, then everything that was outside +# the markers (any existing managed block is removed and re-emitted at the top). +current=$(cat "$HOSTS_FILE" 2>/dev/null || true) +stripped=$(printf '%s\n' "$current" | awk -v b="$BEGIN" -v e="$END" ' + $0 == b { skip = 1 } + skip != 1 { print } + $0 == e { skip = 0 } +') +# Trim leading AND trailing blank lines from the stripped body — the leading one is the separator a previous run wrote; keeping it grows the file by a line per run and defeats the up-to-date check. +stripped=$(printf '%s\n' "$stripped" | awk 'NF { if (!f) f = NR; p = NR } { l[NR] = $0 } END { if (f) for (i = f; i <= p; i++) print l[i] }') +# PREPEND our block so its records win /etc/hosts first-match resolution over any +# other (e.g. a stale setup-lan-dns block that still lists a drifted apricot.lan). 
+new=$(printf '%s\n\n%s\n' "$block" "$stripped") + +if [ "$mode" = "diff" ]; then + if command -v diff >/dev/null 2>&1; then + printf '%s\n' "$new" | diff -u "$HOSTS_FILE" - || true + else + printf '%s\n' "$new" + fi + exit 0 +fi + +# --install +if printf '%s\n' "$new" | cmp -s - "$HOSTS_FILE"; then + echo "mesh-hosts-render: $HOSTS_FILE already up to date" + exit 0 +fi + +SUDO= +if [ "$(id -u)" -ne 0 ]; then + if command -v sudo >/dev/null 2>&1; then + SUDO="sudo" + else + echo "mesh-hosts-render: --install needs root" >&2 + exit 2 + fi +fi + +printf '%s\n' "$new" | $SUDO tee "$HOSTS_FILE" >/dev/null +echo "mesh-hosts-render: updated $HOSTS_FILE" diff --git a/bin/wg-dns-sync b/bin/wg-dns-sync new file mode 100755 index 0000000..f5e18eb --- /dev/null +++ b/bin/wg-dns-sync @@ -0,0 +1,159 @@ +#!/bin/sh +# wg-dns-sync — render dnsmasq records for the wg1 mesh from data/mesh-hosts.json +# and (re-)install them to /etc/dnsmasq.d/wg-mesh.conf on the local host. +# +# Source of truth: data/mesh-hosts.json (located by walking up from this script, +# resolving symlinks first — so it works when invoked via a +# PATH symlink in ~/.local/bin, not only from the repo dir). +# Output file: /etc/dnsmasq.d/wg-mesh.conf +# Daemon: dnsmasq.service (restarted only if conf changed) +# Runs on: apricot (the mesh DNS host). Harmless no-op elsewhere if the +# listen address isn't local to the box, but only apricot should +# install this. +# +# Renders the host records (both views) into one conf, from hosts[]: +# 1. .wg -> mesh IP (10.9.0.x) +# 2. .lan -> LAN IP (10.0.0.x) (hosts that have a lan IP) +# (The old *.local platform service records are RETIRED — platform uses .com, +# infra uses .lan — and are no longer rendered here.) 
+# +# Why a separate conf file (not editing the platform's own dnsmasq confs): +# the platform's loopback Traefik uses split-horizon 127.0.0.1 records for its +# own vhosts; mesh clients (phones on DNS=10.9.0.2) need the LAN/mesh IP, so +# this writes a SECOND conf bound only to the wg1 listen address. +# +# Idempotent: re-run is a no-op if the rendered conf matches what's installed. +# +# Usage: +# wg-dns-sync # render + install + restart dnsmasq if changed +# wg-dns-sync --dry-run # print rendered conf, no install +# +# Exit codes: +# 0 success (or unchanged no-op) +# 1 missing dependency (jq) / invalid or unlocatable JSON +# 2 sudo required but not available non-interactively +# 3 dnsmasq failed to start after install (rolled back) + +set -eu + +dry_run=0 +[ "${1:-}" = "--dry-run" ] && dry_run=1 + +# --- locate the repo + data file, surviving symlink invocation ----------------- +# Resolve $0's symlink chain portably (macOS has no `readlink -f`), then walk up +# to the directory that actually contains data/mesh-hosts.json. +self=$0 +while [ -L "$self" ]; do + link=$(readlink "$self") + case $link in + /*) self=$link ;; + *) self=$(dirname "$self")/$link ;; + esac +done +root=$(cd "$(dirname "$self")" && pwd) +while [ "$root" != "/" ] && [ ! 
-f "$root/data/mesh-hosts.json" ]; do + root=$(dirname "$root") +done +data_file="$root/data/mesh-hosts.json" +[ -f "$data_file" ] || { echo "wg-dns-sync: cannot locate data/mesh-hosts.json (from $self)" >&2; exit 1; } + +target=/etc/dnsmasq.d/wg-mesh.conf + +command -v jq >/dev/null || { echo "wg-dns-sync: jq not installed" >&2; exit 1; } +jq empty "$data_file" || { echo "wg-dns-sync: invalid JSON in $data_file" >&2; exit 1; } + +listen=$(jq -r '.dnsmasq.listen_address // empty' "$data_file") +[ -n "$listen" ] || { echo "wg-dns-sync: missing .dnsmasq.listen_address" >&2; exit 1; } + +# --- render -------------------------------------------------------------------- +tmp=$(mktemp "${TMPDIR:-/tmp}/wg-mesh.conf.XXXXXX") +trap 'rm -f "$tmp"' EXIT + +if command -v sha256sum >/dev/null 2>&1; then + data_sha=$(sha256sum "$data_file" | awk '{print $1}') +else + data_sha=$(shasum -a 256 "$data_file" | awk '{print $1}') +fi +when=$(date -u +%Y-%m-%dT%H:%M:%SZ) +host=$(hostname -s 2>/dev/null || hostname) + +{ + printf '# Generated by smart-lan-router/bin/wg-dns-sync — DO NOT EDIT MANUALLY\n' + printf '# To change records: edit smart-lan-router/data/mesh-hosts.json and re-run this script.\n' + printf '# rendered_at: %s\n' "$when" + printf '# rendered_on: %s\n' "$host" + printf '# source_sha256: %s\n' "$data_sha" + printf '\n' + printf '# Bind only to the wg1 IP so this view is invisible to LAN/loopback clients\n' + printf '# (which lilith-local.conf serves with split-horizon 127.0.0.1 records).\n' + printf 'listen-address=%s\n' "$listen" + # bind-dynamic (not bind-interfaces): binds the listen-address as it appears, + # so dnsmasq does not lose the boot race against wg1 coming up. + printf 'bind-dynamic\n' + printf '\n' + printf '# === Mesh host records (.wg -> mesh IP) — from hosts[] ===\n' + jq -r ' + .hosts[] + | . 
as $h + | ([$h.name] + ($h.aliases // []))[] + | "address=/\(.).wg/\($h.wg) # \($h.role)" + ' "$data_file" + printf '\n' + printf '# === LAN host records (.lan -> LAN IP) — from hosts[] with a lan IP ===\n' + jq -r ' + .hosts[] + | select(.lan != null) + | . as $h + | ([$h.name] + ($h.aliases // []))[] + | "address=/\(.).lan/\($h.lan) # \($h.role)" + ' "$data_file" +} > "$tmp" + +if [ "$dry_run" -eq 1 ]; then + cat "$tmp" + exit 0 +fi + +# --- install (idempotent: the volatile "# rendered_at:" stamp is ignored when comparing, else every run looks changed and restarts dnsmasq) --- +if [ -f "$target" ] && [ "$(grep -v '^# rendered_at: ' "$tmp")" = "$(grep -v '^# rendered_at: ' "$target" 2>/dev/null)" ]; then + echo "wg-dns-sync: $target already up to date" + exit 0 +fi + +SUDO= +if [ "$(id -u)" -ne 0 ]; then + if command -v sudo >/dev/null 2>&1 && sudo -n true 2>/dev/null; then + SUDO="sudo" + else + echo "wg-dns-sync: need root to write $target (run with sudo)" >&2 + exit 2 + fi +fi + +# Back up the current conf so we can roll back if dnsmasq rejects the new one. +backup="" +if [ -f "$target" ]; then + backup=/var/lib/wg-mesh.conf.prev + $SUDO cp "$target" "$backup" +fi + +$SUDO cp "$tmp" "$target" +echo "wg-dns-sync: installed $target" + +if command -v systemctl >/dev/null 2>&1; then + if $SUDO systemctl restart dnsmasq 2>/dev/null; then + echo "wg-dns-sync: dnsmasq restarted" + else + echo "wg-dns-sync: dnsmasq failed to restart — rolling back" >&2 + if [ -n "$backup" ]; then + $SUDO cp "$backup" "$target" + $SUDO systemctl restart dnsmasq || true + else + $SUDO rm -f "$target" + $SUDO systemctl restart dnsmasq || true + fi + exit 3 + fi +else + echo "wg-dns-sync: systemctl not found — reload dnsmasq manually" >&2 +fi diff --git a/data/mesh-hosts.json b/data/mesh-hosts.json new file mode 100644 index 0000000..7e8bfc2 --- /dev/null +++ b/data/mesh-hosts.json @@ -0,0 +1,88 @@ +{ + "_purpose": "Single source of truth for the wg1 mesh + LAN: the four hosts, their addresses on each path, the MAC + L7 identity probe the smart-lan-router daemon uses, and the DNS records apricot's dnsmasq serves. 
Everything that needs a host address derives from here — never hardcode mesh IPs, MACs, or identity URLs elsewhere.", + "_schema": { + "hosts[].name": "Canonical name = fruit family encodes machine class (gpu=stone fruit, cpu=pome, cloud=citrus, laptop=vegetable).", + "hosts[].aliases": "Old names, kept working during the alias-first rename. Renderers emit a record for name AND every alias.", + "hosts[].class": "gpu | cpu | cloud | laptop.", + "hosts[].wg/lan/public": "wg = mesh IP (10.9.0.0/24); lan = home LAN IP (10.0.0.0/24, null if roaming/no LAN leg); public = internet IP (null if none).", + "hosts[].mac": "LAN interface MAC — used by the daemon to pin the ARP entry so the /32 LAN route actually carries traffic (null = unknown, no pin).", + "hosts[].identity": "L7 identity probe so the daemon never routes to a stranger at the same RFC1918 IP. {url ('{ip}' substituted), markers (all must appear)}. null = not a routing target.", + "two_views": "'.wg' ALWAYS = mesh IP (anywhere via tunnel); '.lan' = LAN IP (home network only). Suffix is authoritative. (The old '.local' platform scheme is retired — platform uses .com, infra uses .lan.)", + "daemon_targets": "smart-lan-router.py routes hosts where lan AND identity are both set, excluding the host it runs on." + }, + "_consumers": ["bin/host-apply", "bin/wg-dns-sync", "bin/mesh-hosts-render", "smart-lan-router/smart-lan-router.py"], + "mesh": { + "interface": "wg1", + "cidr": "10.9.0.0/24", + "hub": "yuzu", + "hub_endpoint": "89.127.233.145:51820", + "dns_host": "apricot", + "dns_listen": "10.9.0.2:53" + }, + "lan": { + "cidr": "10.0.0.0/24", + "dns_host": "pear", + "dns_listen": "10.0.0.11:53", + "gateway": "10.0.0.1", + "gateway_mac": "c4:4f:d5:5a:61:6f", + "gateway_note": "Xfinity broadband gateway. gateway_mac is the home-LAN fingerprint: the smart-lan-router daemon treats the laptop as 'home' only when the default gateway on the LAN interface has this MAC — distinguishes the real home LAN from any visited 10.0.0.0/24 network. 
DHCP reservations only via xFi/web UI, no scriptable API." + }, + "hosts": [ + { + "name": "apricot", + "aliases": [], + "class": "gpu", + "role": "Threadripper GPU compute — LLM serving, quinn dev, claude rc units, mesh DNS (dnsmasq 10.9.0.2:53)", + "os": "linux", + "ssh_user": "lilith", + "wg": "10.9.0.2", + "lan": "10.0.0.116", + "public": null, + "mac": "b4:2e:99:35:24:c5", + "identity": { "url": "http://{ip}:8200/health", "markers": ["llama_service_available"] } + }, + { + "name": "pear", + "aliases": ["black"], + "class": "cpu", + "role": "Threadripper CPU/storage — Forgejo, Verdaccio, LAN DNS (dnsmasq 10.0.0.11:53), NFS/media", + "os": "linux", + "ssh_user": "lilith", + "wg": "10.9.0.4", + "lan": "10.0.0.11", + "public": null, + "mac": "b4:2e:99:30:a2:9a", + "identity": { "url": "http://{ip}:3000/api/v1/version", "markers": ["version"] } + }, + { + "name": "fennel", + "aliases": ["plum"], + "class": "laptop", + "role": "MacBook Air M2 — roams (no fixed LAN IP), mesh client, runs the smart-lan-router daemon", + "os": "darwin", + "ssh_user": "natalie", + "wg": "10.9.0.3", + "lan": null, + "public": null, + "mac": "74:a6:cd:d4:b0:39", + "identity": null + }, + { + "name": "yuzu", + "aliases": ["vps", "quinn-vps"], + "class": "cloud", + "role": "1984 Hosting (Iceland) — WireGuard mesh hub, quinn production", + "os": "linux", + "ssh_user": "root", + "wg": "10.9.0.1", + "lan": null, + "public": "89.127.233.145", + "mac": null, + "identity": null + } + ], + "dnsmasq": { + "_note": "Mesh DNS served by apricot's dnsmasq (bound 127.0.0.1 + 10.9.0.2), written to /etc/dnsmasq.d/wg-mesh.conf by bin/wg-dns-sync. Consumed by wg clients that set DNS=10.9.0.2 (phones). Renders the host .wg + .lan records from hosts[] — NOT platform service records. 
The old *.local platform domains are RETIRED (platform uses .com; infra uses .lan); they are deliberately NOT carried here.", + "listen_address": "127.0.0.1,10.9.0.2" + } +} diff --git a/docs/topology.md b/docs/topology.md new file mode 100644 index 0000000..f7cc8ec --- /dev/null +++ b/docs/topology.md @@ -0,0 +1,167 @@ +# Mesh topology + +## Networks + +``` + ┌─────────────────────────────────────────────┐ + │ vps (quinn-vps) — 1984 Hosting, Iceland │ + │ WireGuard hub wg 10.9.0.1 │ + │ public 89.127.233.145:51820 │ + └───────────────┬─────────────────────────────┘ + │ wg1 tunnel (AllowedIPs 10.9.0.0/24, 10.0.0.0/24) + ┌───────────────────┼───────────────────┐ + │ │ │ + ┌──────┴──────┐ ┌──────┴──────┐ ┌──────┴──────┐ + │ apricot │ │ black │ │ plum │ + │ wg 10.9.0.2 │ │ wg 10.9.0.4 │ │ wg 10.9.0.3 │ + │ lan 10.0.0. │─────│ lan 10.0.0. │ │ macOS, │ + │ 116 │ LAN │ 11 │ │ roams │ + │ mesh DNS │ │ LAN DNS │ │ (DHCP) │ + └─────────────┘ └─────────────┘ └─────────────┘ + apricot + black share the home LAN (10.0.0.0/24); + plum joins it only when physically home, else routes via the hub. +``` + +- **Mesh `10.9.0.0/24`** — full WireGuard overlay via the Iceland hub. Every host + reaches every other by `.wg` while the tunnel is up. +- **LAN `10.0.0.0/24`** — apricot + black, plus plum when home. The tunnel also + routes this /24, so `10.0.0.x` works off-LAN through the hub (higher latency). + +## DNS responsibilities — and how `.wg` actually resolves + +Two delivery paths, and they serve different consumers. This distinction is +load-bearing (a config that *renders* a record is not the same as a client that +can *resolve* it): + +- **apricot** runs dnsmasq bound to `10.9.0.2:53` (the mesh view). Serves the + host `.wg` + `.lan` records from `mesh-hosts.json`, written by `wg-dns-sync`. + **These records are consumed only by clients whose WireGuard config sets + `DNS=10.9.0.2` — i.e. 
phones.** The named hosts (apricot/pear/fennel) do *not* + point their resolver at `10.9.0.2`, so for them dnsmasq does not answer. +- **For the named hosts, `.wg`/`.lan` is delivered by the static `/etc/hosts` + block** from `mesh-hosts-render --install`. Run it on every host that must + resolve a peer's name. (Verified: before any install, `dscacheutil -q host -a + name apricot.wg` on fennel returns nothing.) +- **fennel** roams off-LAN where dnsmasq is unreachable, so the static + `/etc/hosts` block is its only resolution path then. + +The old `*.local` platform scheme is **retired** (platform → `.com`, infra → +`.lan`); net-tools renders no `.local`. + +## Reachability matrix + +| from ↓ \ to → | apricot | pear | fennel | yuzu | +|---|---|---|---|---| +| **apricot** | — | `.lan` ✦ · `.wg` | `fennel.wg` only | `.wg` | +| **pear** | `.lan` ✦ · `.wg` | — | `fennel.wg` only | `.wg` | +| **fennel** | `.lan` ✦ · `.wg` ⚑ | `.lan` ✦ · `.wg` ⚑ | — | `.wg` only | +| **yuzu** | `.wg` only | `.wg` only | `fennel.wg` only | — | + +✦ preferred when co-located on the home LAN · ⚑ plum falls back to `.wg` when it +roams · **plum and vps are only ever reachable inbound via `.wg`** (plum has no +stable LAN IP; vps has no LAN leg). + +`.wg` in this matrix is resolved via each host's static `/etc/hosts` block +(`mesh-hosts-render --install`), **not** via dnsmasq — see DNS responsibilities +above. The dnsmasq `.wg` records are the phones-only path. So the matrix holds +only once the static block is installed on apricot, black, and plum. + +## Hub IP note + +plum's live `wg1.conf` endpoint is `89.127.233.145:51820`. An older +`magic-civilization/scripts/lan/README.md` also lists `93.95.231.174` for the +Iceland hub — treat that as stale/secondary unless confirmed against the hub's +own WireGuard config. `mesh-hosts.json` records only the live `.145`. + +## Smart routing daemon (fennel) + +`smart-lan-router/smart-lan-router.py` runs as a root LaunchDaemon on the laptop. 
+ +**The problem it solves:** the wg config's `AllowedIPs` includes `10.0.0.0/24`, so +the tunnel installs a route capturing the *entire* home LAN. While home, traffic +to home hosts hairpins through the Iceland hub (~350ms) instead of going out the +LAN interface (~5ms). (Measured: apricot 351ms via tunnel → 5.6ms via en0.) + +**What it does, each cycle:** +1. **Detect location** — read the default route's gateway + interface. It's HOME + iff the gateway is `lan.gateway` *and* its ARP MAC == `lan.gateway_mac` (the + home gateway's fingerprint). The MAC check is what distinguishes the real home + LAN from a visited café network that also happens to use `10.0.0.0/24`. +2. **Switch the subnet route** — HOME → `route 10.0.0.0/24` via the LAN interface + (direct); AWAY → via the wg mesh interface (so home stays reachable through the + tunnel). Re-asserted every cycle, because `wg-quick` re-adds the tunnel `/24` + on reconnect. + +**Why a subnet route, not per-host `/32` pins** (the old design): a `/32 +-interface` route on macOS creates a *self-MAC* ARP entry that blackholes the +host. A subnet route uses normal ARP, so every home host — at whatever DHCP +address it currently holds — just works. This is **drift-immune** (apricot moving +`.116→.118` needs no config change) and free of the self-MAC bug. `--status` +prints location + current route. + +It re-reads `mesh-hosts.json` each cycle; a bad read keeps last-good and never +tears down routing (`KeepAlive` root daemon over an autocommit-written repo). + +**Supersedes** both the old per-host identity-probe pinner *and* the +`wg-route-watchdog` system daemon (which unconditionally forced `10.0.0.0/24` +through the tunnel — the home branch is the new, smarter behavior; the away +branch preserves the watchdog's original purpose). The watchdog was retired +(`/Library/LaunchDaemons/com.natalie.wg-route-watchdog.plist` + +`/usr/local/sbin/wg-route-watchdog.sh` removed). 
+ +## Fleet rename + +Names follow **fruit family = machine class** (apricot=GPU, pear=CPU/storage, +yuzu=cloud, fennel=laptop), executed **alias-first**: `mesh-hosts.json` sets the +fruit name canonical with the old name in `aliases[]`, and every renderer emits +both (`pear.wg`+`black.wg`, `forge.pear.lan`+`forge.black.lan`). Nothing +breaks on day one. Irreversible cutovers are separately gated: OS hostname +(`hostnamectl`/`scutil` — also fixes plum's stale `plum.voyager.nasty.sh`), the +Forgejo URL, black's NFS export host, ssh stanzas, and the reference sweep +(memory, CLAUDE.md, MCP ssh-by-name). Never retire an old name until every +consumer resolves the new one. `apricot` is unchanged. + +## Migration + +This repo replaces tooling scattered across four places: + +| Was | Now | Status | +|-----|-----|--------| +| `session-tools/data/wg-mesh-hosts.json` | `data/mesh-hosts.json` (expanded: `.wg` view, hosts[], mac, identity, fruit names) | ✅ here | +| `session-tools/bin/wg-dns-sync` | `bin/wg-dns-sync` (robust symlink path resolution) | ✅ here + fixed | +| `magic-civilization/scripts/lan/subscribe-black-dns.sh` | — (retired: `*.local` scheme is dead) | ✅ removed | +| `setup-lan-dns.sh` (not in ~/Code — drifted) | `bin/mesh-hosts-render` | ✅ replaced | +| — (new) | `bin/host-apply` (per-device ssh view) | ✅ here | +| `~/bin/smart-lan-router.py` (loose) | `smart-lan-router/smart-lan-router.py` (JSON-driven, self-heal) | ✅ here + fixed | +| `~/{install-smart-router.sh,com.lilith…plist}` (loose) | `smart-lan-router/` | ✅ here | + +**Pending — gated on greenlight (these touch live DNS on apricot):** + +1. Re-clone/pull this repo on **apricot** and run `./install.sh`. +2. Run `sudo wg-dns-sync` on apricot from this repo; verify dnsmasq still serves + (`dig @10.9.0.2 quinn.apricot.lan`, `dig @10.9.0.2 apricot.wg`). +3. 
Update the two session-tools consumers that call the old path by absolute + reference — `bin/apricot-doctor` (`"$repo/bin/wg-dns-sync"`) and + `bin/quinn-phone-bootstrap` (`ssh apricot 'cd …/session-tools && sudo + bin/wg-dns-sync'`) — to the new repo path. +4. Run `sudo mesh-hosts-render --install` on **apricot, black, and plum** (every + host that must resolve a peer's `.wg` name — dnsmasq only answers `.wg` for + phones with `DNS=10.9.0.2`). Then on plum retire the old `setup-lan-dns.sh` + static block and `/etc/resolver/{apricot,black}.lan`. +5. **fennel (laptop):** `sudo smart-lan-router/install-smart-router.sh` reinstalls the + LaunchDaemon pointed at the repo path and retires the loose `~/bin/smart-lan-router.py`, + `~/install-smart-router.sh`, `~/com.lilith.smart-lan-router.plist`. Verify + `route -n get 10.0.0.11` → `interface: en0` (not utun*). +6. Only after apricot is verified on the new path: remove the originals from + `session-tools` and `magic-civilization/scripts/lan`, and push. +7. **Fleet rename cutovers** (each independently, after the above): ssh stanzas → + OS hostname → Forgejo `forge.pear.lan` vhost → NFS export → reference sweep. + See [Fleet rename](#fleet-rename). + +Do not delete the originals in the same change that adds this repo — every host +still running the old path needs to re-install first. + +> **Blocked right now:** the laptop's LAN to pear/apricot is degraded by the very +> stale self-MAC ARP entry the daemon now self-heals (`10.0.0.11 → fennel's own +> MAC, permanent`). Clear it (`sudo arp -d 10.0.0.11`) and reinstall the daemon +> (step 5) to restore the LAN fast-path before attempting any remote cutover. diff --git a/install.sh b/install.sh new file mode 100755 index 0000000..9ab4140 --- /dev/null +++ b/install.sh @@ -0,0 +1,30 @@ +#!/bin/sh +# install.sh — symlink smart-lan-router/bin/* into the first PATH dir that +# exists ($HOME/bin, then $HOME/.local/bin). Idempotent. 
+# +# The tools locate data/mesh-hosts.json by resolving their own symlink chain and +# walking up to the repo, so running them via the installed symlink works the +# same as running them from the repo dir. + +set -eu + +repo_dir=$(cd "$(dirname "$0")" && pwd) + +target="" +for candidate in "$HOME/bin" "$HOME/.local/bin"; do + if [ -d "$candidate" ]; then target="$candidate"; break; fi +done +[ -n "$target" ] || { mkdir -p "$HOME/.local/bin"; target="$HOME/.local/bin"; } + +echo "installing into $target" +for src in "$repo_dir"/bin/*; do + [ -f "$src" ] || continue + name=$(basename "$src") + link="$target/$name" + if [ -e "$link" ] && [ ! -L "$link" ]; then + echo "skip: $link exists and is not a symlink — leaving alone" >&2 + continue + fi + ln -sfn "$src" "$link" + echo "ok: $name -> $src" +done diff --git a/smart-lan-router/com.lilith.smart-lan-router.plist b/smart-lan-router/com.lilith.smart-lan-router.plist new file mode 100644 index 0000000..d9f48c5 --- /dev/null +++ b/smart-lan-router/com.lilith.smart-lan-router.plist @@ -0,0 +1,35 @@ + + + + + Label + com.lilith.smart-lan-router + + ProgramArguments + + /usr/bin/python3 + /Users/natalie/Code/@projects/@tools/net-tools/smart-lan-router/smart-lan-router.py + + + RunAtLoad + + + KeepAlive + + + EnvironmentVariables + + PYTHONUNBUFFERED + 1 + + + StandardOutPath + /var/log/lilith-smart-lan-router.log + + StandardErrorPath + /var/log/lilith-smart-lan-router.log + + ThrottleInterval + 30 + + diff --git a/smart-lan-router/install-smart-router.sh b/smart-lan-router/install-smart-router.sh new file mode 100755 index 0000000..52a380d --- /dev/null +++ b/smart-lan-router/install-smart-router.sh @@ -0,0 +1,62 @@ +#!/bin/bash +set -e + +if [ "$EUID" -ne 0 ]; then + echo "Usage: sudo $0" >&2 + echo "This script must be run as root." >&2 + exit 1 +fi + +# Source of truth: the plist shipped in this repo (alongside this script). 
+REPO_DIR="$(cd "$(dirname "$0")" && pwd)" +PLIST_SRC="$REPO_DIR/com.lilith.smart-lan-router.plist" +PLIST_DST="/Library/LaunchDaemons/com.lilith.smart-lan-router.plist" +OLD_PLIST="/Library/LaunchDaemons/com.lilith.direct-lan-routes.plist" +STATE_FILE="/var/db/lilith-smart-lan-router.json" +LOG_FILE="/var/log/lilith-smart-lan-router.log" + +echo "==> Removing earlier broken direct-lan-routes hack" +launchctl bootout system "$OLD_PLIST" 2>/dev/null || true +rm -f "$OLD_PLIST" +rm -f /Users/natalie/com.lilith.direct-lan-routes.plist \ + /Users/natalie/bin/ensure-lan-routes.sh \ + /Users/natalie/fix-lan.sh \ + /Users/natalie/unfix-lan.sh + +echo "==> Retiring the loose pre-repo copies (now canonical in this repo)" +rm -f /Users/natalie/bin/smart-lan-router.py \ + /Users/natalie/com.lilith.smart-lan-router.plist \ + /Users/natalie/install-smart-router.sh + +echo "==> Cleaning up leftover manual host routes" +route -n delete -host 10.0.0.11 2>/dev/null || true +route -n delete -host 10.0.0.116 2>/dev/null || true + +echo "==> Installing new smart-lan-router LaunchDaemon" +install -o root -g wheel -m 644 "$PLIST_SRC" /Library/LaunchDaemons/ +launchctl bootout system "$PLIST_DST" 2>/dev/null || true +launchctl bootstrap system "$PLIST_DST" +launchctl enable system/com.lilith.smart-lan-router + +echo "==> Preparing /var/db and /var/log artifacts" +mkdir -p /var/db +touch "$STATE_FILE" +chown root:wheel "$STATE_FILE" +chmod 644 "$STATE_FILE" +touch "$LOG_FILE" +chown root:wheel "$LOG_FILE" + +echo "==> Verifying daemon state" +launchctl print system/com.lilith.smart-lan-router | grep -E 'state|last exit code' || true + +echo "==> Waiting 35s for first probe cycle..." +sleep 35 + +echo "==> Recent log output:" +tail -10 "$LOG_FILE" || true + +echo "==> Route state (want: en0, not utun*):" +route -n get 10.0.0.11 2>&1 | awk '/interface:/{print " pear (black) via:", $2}' +route -n get 10.0.0.116 2>&1 | awk '/interface:/{print " apricot via:", $2}' + +echo "==> Done." 
#!/usr/bin/env python3
"""smart-lan-router — home-aware LAN routing for the laptop (fennel).

File: smart-lan-router/smart-lan-router.py

The wg mesh installs a route for the whole home LAN subnet (10.0.0.0/24) via the
WireGuard tunnel, whose hub is in Iceland. So when the laptop is *physically on
its home LAN*, traffic to home hosts still hairpins through Iceland (~350ms)
instead of leaving the local interface (~5ms).

This daemon fixes that by toggling the home-subnet route based on where we are:

  HOME — the default gateway (on the LAN interface) has the home gateway's MAC:
         route the home subnet over the physical LAN interface → direct, ~5ms
  AWAY — any other network:
         route the home subnet over the wg mesh interface → home via the tunnel

Why a subnet route, not per-host /32 pins: a /32 `-interface` route on macOS
creates a self-MAC ARP entry that blackholes the host (the bug that plagued the
old design); a *subnet* route uses normal ARP, so every home host — at whatever
DHCP address it currently holds — just works. No per-host config, no IP-drift
breakage, no self-MAC bug.

Home is identified by the gateway's MAC (`lan.gateway_mac` in mesh-hosts.json),
not merely "the LAN iface has a 10.0.0.x address" — so a visited café network
that also uses 10.0.0.0/24 is correctly treated as AWAY and we never route home
traffic to strangers.

It re-asserts every cycle (wg-quick re-adds the tunnel /24 on reconnect, which
would clobber the home override). On a bad config read it keeps the last-good
settings and never tears down routing. Requires root (route changes).
"""

from __future__ import annotations

import argparse
import ipaddress
import json
import logging
import os
import re
import signal
import subprocess
import sys
import time
from dataclasses import dataclass

PROBE_INTERVAL_SEC = 20
SUBPROCESS_TIMEOUT_SEC = 8

logger = logging.getLogger("smart-lan-router")

# A MAC as `arp` prints it: six colon-separated octets of ONE OR TWO hex digits.
# Every MAC read from arp or from config goes through norm_mac() before it is
# compared, so "c4:4f:d5:a:61:6f" and "C4:4F:D5:0A:61:6F" are the same address.
_MAC_PATTERN = r"(?:[0-9a-f]{1,2}:){5}[0-9a-f]{1,2}"
_ARP_MAC_RE = re.compile(r"at (" + _MAC_PATTERN + r")", re.IGNORECASE)
_ARP_ENTRY_RE = re.compile(
    r"\((\d+\.\d+\.\d+\.\d+)\) at (" + _MAC_PATTERN + r")\s", re.IGNORECASE
)
_INET_RE = re.compile(r"inet (\d+\.\d+\.\d+\.\d+)")


@dataclass(frozen=True)
class Config:
    lan_cidr: str      # e.g. "10.0.0.0/24"
    gateway: str       # e.g. "10.0.0.1"
    gateway_mac: str   # home-LAN fingerprint, normalized, e.g. "c4:4f:d5:5a:61:6f"
    mesh_cidr: str     # e.g. "10.9.0.0/24" — to locate the wg interface


# ---------------------------------------------------------------------------
# Source of truth
# ---------------------------------------------------------------------------

def find_data_file() -> str:
    """Return the path of data/mesh-hosts.json for this checkout.

    Resolves this script's own symlink chain, then walks up the directory tree
    until a `data/mesh-hosts.json` is found.

    Raises:
        FileNotFoundError: no ancestor directory contains the data file.
    """
    self_path = os.path.abspath(__file__)
    while os.path.islink(self_path):
        link = os.readlink(self_path)
        self_path = link if os.path.isabs(link) else os.path.join(os.path.dirname(self_path), link)
    root = os.path.dirname(self_path)
    while root != "/" and not os.path.isfile(os.path.join(root, "data", "mesh-hosts.json")):
        root = os.path.dirname(root)
    data_file = os.path.join(root, "data", "mesh-hosts.json")
    if not os.path.isfile(data_file):
        raise FileNotFoundError(f"cannot locate data/mesh-hosts.json from {self_path}")
    return data_file


def config_from_dict(data: dict) -> Config:
    """Build a Config from the parsed mesh-hosts.json document.

    Raises:
        ValueError: the document is not an object, or `lan.gateway_mac` is
            missing or not a string.
        KeyError: `lan.cidr`, `lan.gateway` or `mesh.cidr` is missing.
    """
    if not isinstance(data, dict):
        raise ValueError("mesh-hosts.json must contain a JSON object")
    lan = data.get("lan") or {}
    mesh = data.get("mesh") or {}
    gw_mac = lan.get("gateway_mac")
    if not gw_mac:
        raise ValueError("mesh-hosts.json lan.gateway_mac is required (home-LAN fingerprint)")
    if not isinstance(gw_mac, str):
        raise ValueError("mesh-hosts.json lan.gateway_mac must be a string")
    return Config(
        lan_cidr=lan["cidr"],
        gateway=lan["gateway"],
        # Normalize (not just lower-case) so it compares equal to arp_mac().
        gateway_mac=norm_mac(gw_mac),
        mesh_cidr=mesh["cidr"],
    )


def load_config(data_file: str) -> Config:
    """Read `data_file` and return its Config (see config_from_dict for errors)."""
    with open(data_file, encoding="utf-8") as fh:
        return config_from_dict(json.load(fh))


# ---------------------------------------------------------------------------
# Subprocess + network helpers (absolute paths for launchd safety)
# ---------------------------------------------------------------------------

def _run(argv: list[str], timeout: float = SUBPROCESS_TIMEOUT_SEC) -> tuple[int, str, str]:
    """Run `argv` and return (returncode, stdout, stderr); never raises.

    A timeout is reported as rc 124 and a missing binary as rc 127.
    """
    try:
        p = subprocess.run(argv, capture_output=True, text=True, timeout=timeout, check=False)
        return p.returncode, p.stdout, p.stderr
    except subprocess.TimeoutExpired:
        return 124, "", f"timeout after {timeout}s"
    except FileNotFoundError as e:
        return 127, "", str(e)


def parse_default_route(route_output: str) -> tuple[str | None, str | None]:
    """Extract (gateway, interface) from `route -n get …` output.

    Fields that are absent, or present without a value, come back as None.
    """
    gw = iface = None
    for line in route_output.splitlines():
        fields = line.split()
        if len(fields) < 2:
            continue
        if fields[0].startswith("gateway:"):
            gw = fields[1]
        elif fields[0].startswith("interface:"):
            iface = fields[1]
    return gw, iface


def default_route() -> tuple[str | None, str | None]:
    """(gateway_ip, interface) of the current default route."""
    _, out, _ = _run(["/sbin/route", "-n", "get", "default"])
    return parse_default_route(out)


def parse_arp_mac(arp_output: str) -> str | None:
    """Return the first MAC in `arp -n <ip>` output, normalized, or None."""
    m = _ARP_MAC_RE.search(arp_output)
    return norm_mac(m.group(1)) if m else None


def arp_mac(ip: str) -> str | None:
    """Normalized MAC the ARP table holds for `ip`, or None if unresolved."""
    _, out, _ = _run(["/usr/sbin/arp", "-n", ip])
    return parse_arp_mac(out)


def iface_for_cidr(ifconfig_output: str, cidr: str) -> str | None:
    """Name of the first interface in `ifconfig` output with an IPv4 address in `cidr`."""
    net = ipaddress.ip_network(cidr, strict=False)
    cur = None
    for line in ifconfig_output.splitlines():
        if line and not line[0].isspace():
            # Unindented line = interface header, e.g. "utun4: flags=…".
            cur = line.split(":")[0]
            continue
        m = _INET_RE.search(line)
        if m and ipaddress.ip_address(m.group(1)) in net:
            return cur
    return None


def iface_in_cidr(cidr: str) -> str | None:
    """The local interface carrying an address inside `cidr` (i.e. the wg iface)."""
    _, out, _ = _run(["/sbin/ifconfig"])
    return iface_for_cidr(out, cidr)


def probe_address(cidr: str) -> str:
    """A high host address inside `cidr`, used to ask which route covers the subnet.

    Offset 250 for a /24 or larger; clamped to the last host for smaller subnets
    so the probe never falls outside the network being asked about.
    """
    net = ipaddress.ip_network(cidr, strict=False)
    offset = min(250, max(net.num_addresses - 2, 0))
    return str(net.network_address + offset)


def subnet_route_iface(cidr: str) -> str | None:
    """Which interface the LAN subnet route currently points at. Probes a high
    host address that won't carry a /32 override."""
    _, out, _ = _run(["/sbin/route", "-n", "get", probe_address(cidr)])
    for line in out.splitlines():
        fields = line.split()
        if len(fields) >= 2 and fields[0].startswith("interface:"):
            return fields[1]
    return None


def set_subnet_route(cidr: str, iface: str) -> bool:
    """Point the `cidr` route at `iface`; returns True on success.

    Tries `route change` first and falls back to delete + add when there is no
    existing route to change.
    """
    rc, _, _ = _run(["/sbin/route", "-n", "change", cidr, "-interface", iface])
    if rc == 0:
        return True
    _run(["/sbin/route", "-n", "delete", cidr])
    rc, _, err = _run(["/sbin/route", "-n", "add", cidr, "-interface", iface])
    if rc != 0:
        logger.error("failed to route %s via %s: %s", cidr, iface, err.strip())
        return False
    return True


def is_home(cfg: Config, gw: str | None, gwif: str | None) -> bool:
    """True iff the default gateway is the home gateway IP *and* has its MAC."""
    return bool(gw and gwif and gw == cfg.gateway and arp_mac(gw) == cfg.gateway_mac)


# ---------------------------------------------------------------------------
# Discovery — map each LAN host to its CURRENT IP by MAC, keep ssh + hosts synced
# ---------------------------------------------------------------------------
# The host's MAC is stable; its DHCP IP drifts. ARP gives IP↔MAC, so an ARP
# lookup finds each host wherever it currently sits — no per-host config, no
# reservations. The discovered IPs are written to data/lan-state.json (a
# {name: ip} overlay) which mesh-hosts-render (/etc/hosts) and host-apply
# (~/.ssh/config) merge over the static seed, so both stay in sync with reality.

SWEEP_MIN_INTERVAL_SEC = 300  # don't ARP-sweep the /24 more often than this
SWEEP_MAX_HOSTS = 1024        # refuse to sweep a subnet larger than this (config-typo guard)
SWEEP_BATCH_SIZE = 80         # concurrent pings per batch


def norm_mac(mac: str) -> str:
    """Canonical MAC form: lower-case, every octet zero-padded to two digits.

    Input that does not parse as colon-separated hex is returned lower-cased
    (empty string for None) rather than raising.
    """
    try:
        return ":".join("%02x" % int(o, 16) for o in mac.split(":"))
    except (ValueError, AttributeError):
        return (mac or "").lower()


def parse_arp_table(arp_output: str) -> dict[str, str]:
    """mac -> ip from `arp -a -n` output; unresolved entries are skipped.

    When several IPs share one MAC, the last one listed wins.
    """
    table: dict[str, str] = {}
    for line in arp_output.splitlines():
        m = _ARP_ENTRY_RE.search(line)
        if m:
            table[norm_mac(m.group(2))] = m.group(1)
    return table


def arp_table() -> dict[str, str]:
    """mac -> ip from the current ARP neighbour table."""
    _, out, _ = _run(["/usr/sbin/arp", "-a", "-n"])
    return parse_arp_table(out)


def _reap(procs: list[subprocess.Popen]) -> None:
    """Wait for each ping; kill AND reap any that overrun, so none is left a zombie."""
    for p in procs:
        try:
            p.wait(timeout=3)
        except subprocess.TimeoutExpired:
            p.kill()
            p.wait()


def ping_sweep(cidr: str) -> None:
    """Briefly ping every host in the subnet to populate the ARP table."""
    net = ipaddress.ip_network(cidr, strict=False)
    if net.num_addresses > SWEEP_MAX_HOSTS:
        logger.warning("not ping-sweeping %s: %d addresses exceeds the %d-host cap",
                       cidr, net.num_addresses, SWEEP_MAX_HOSTS)
        return
    batch: list[subprocess.Popen] = []
    try:
        for host in net.hosts():
            batch.append(subprocess.Popen(["/sbin/ping", "-c1", "-t1", str(host)],
                                          stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL))
            if len(batch) >= SWEEP_BATCH_SIZE:
                _reap(batch)
                batch = []
    except OSError as e:
        # e.g. ping binary missing, or out of process slots — a sweep is best-effort.
        logger.warning("ping sweep of %s aborted: %s", cidr, e)
    finally:
        _reap(batch)


def lan_hosts_from_dict(data: dict) -> list[tuple[str, str]]:
    """(name, normalized-mac) for every `hosts[]` entry with both a `lan` seed and a `mac`."""
    return [
        (h["name"], norm_mac(h["mac"]))
        for h in data.get("hosts", [])
        if h.get("lan") is not None and h.get("mac")
    ]


def lan_hosts_with_mac(data_file: str) -> list[tuple[str, str]]:
    """(name, normalized-mac) for hosts that have a LAN IP seed AND a MAC — i.e.
    the discoverable home peers. Self (the laptop, lan=null) is naturally excluded."""
    with open(data_file, encoding="utf-8") as fh:
        return lan_hosts_from_dict(json.load(fh))


def discover(cfg: Config, hosts: list[tuple[str, str]], ctx: dict) -> dict[str, str]:
    """Return {name: current_ip} for every host whose MAC is in the ARP table.

    If any wanted MAC is missing, ping-sweeps the LAN once (at most every
    SWEEP_MIN_INTERVAL_SEC, tracked in ctx["last_sweep"]) and re-reads ARP.
    """
    want = {mac for _, mac in hosts}
    arp = arp_table()
    if want - set(arp):  # some host not in ARP — refresh, rate-limited
        now = time.time()
        if now - ctx.get("last_sweep", 0) >= SWEEP_MIN_INTERVAL_SEC:
            ctx["last_sweep"] = now
            ping_sweep(cfg.lan_cidr)
            arp = arp_table()
    return {name: arp[mac] for name, mac in hosts if mac in arp}


def sync_names(repo_root: str, discovered: dict[str, str], console_user: str | None) -> bool:
    """Merge discovered IPs into data/lan-state.json and, on change, regenerate
    /etc/hosts and the console user's ~/.ssh/config. Returns True if it changed."""
    state_path = os.path.join(repo_root, "data", "lan-state.json")
    old: dict[str, str] = {}
    if os.path.isfile(state_path):
        try:
            with open(state_path, encoding="utf-8") as fh:
                loaded = json.load(fh)
        except (json.JSONDecodeError, OSError):
            loaded = {}
        # A state file holding anything but an object is treated as empty.
        old = loaded if isinstance(loaded, dict) else {}
    new = dict(old)
    new.update(discovered)  # keep last-known for hosts not seen this cycle
    if new == old:
        return False
    # Write-then-rename so a reader never sees a half-written overlay.
    tmp = state_path + ".tmp"
    with open(tmp, "w", encoding="utf-8") as fh:
        json.dump(new, fh, indent=2, sort_keys=True)
    os.replace(tmp, state_path)
    os.chmod(state_path, 0o644)
    # Regenerate the two views from the new overlay.
    rc, _, err = _run([os.path.join(repo_root, "bin", "mesh-hosts-render"), "--install"])
    if rc != 0:
        logger.warning("mesh-hosts-render --install failed (rc=%d): %s", rc, err.strip())
    if console_user and console_user != "root":
        rc, _, err = _run(["/usr/bin/sudo", "-u", console_user,
                           os.path.join(repo_root, "bin", "host-apply"), "--ssh-apply"])
        if rc != 0:
            logger.warning("host-apply --ssh-apply failed for %s (rc=%d): %s",
                           console_user, rc, err.strip())
    logger.info("names synced → %s", ", ".join(f"{k}={v}" for k, v in sorted(new.items())))
    return True


def console_user() -> str | None:
    """Owner of /dev/console (the logged-in GUI user), or None if that is root/unknown."""
    rc, out, _ = _run(["/usr/bin/stat", "-f%Su", "/dev/console"])
    u = out.strip()
    return u if rc == 0 and u and u != "root" else None


# ---------------------------------------------------------------------------
# Reconcile
# ---------------------------------------------------------------------------

def reconcile(cfg: Config, ctx: dict) -> None:
    """Run one cycle: decide HOME/AWAY, assert the subnet route, sync names at home.

    `ctx` carries state across cycles: "last_state", "repo_root",
    "console_user", "lan_hosts" and "last_sweep".
    """
    gw, gwif = default_route()
    home = is_home(cfg, gw, gwif)

    if home:
        desired = gwif  # the physical LAN interface
        state = f"HOME via {gwif}"
    else:
        desired = iface_in_cidr(cfg.mesh_cidr)  # the wg mesh interface
        state = f"AWAY via {desired}" if desired else "AWAY (no wg iface)"

    if not desired:
        if ctx["last_state"] != state:
            logger.warning("away and no wg interface up — leaving %s route untouched", cfg.lan_cidr)
        ctx["last_state"] = state
        return

    # 1. Route switch (the home/away decision).
    current = subnet_route_iface(cfg.lan_cidr)
    if current != desired:
        if set_subnet_route(cfg.lan_cidr, desired):
            logger.info("%s → routing %s via %s (was %s)", state, cfg.lan_cidr, desired, current)
    elif ctx["last_state"] != state:
        logger.info("%s → %s already via %s", state, cfg.lan_cidr, desired)
    ctx["last_state"] = state

    # 2. Keep ssh + hosts in sync — only meaningful at home (need the LAN to ARP).
    if home and ctx["lan_hosts"]:
        discovered = discover(cfg, ctx["lan_hosts"], ctx)
        if discovered:
            # Re-resolve the console user here: a RunAtLoad daemon usually starts
            # before anyone has logged in, so the value captured at startup is
            # often None. Fall back to the last known user if nobody is logged in.
            ctx["console_user"] = console_user() or ctx.get("console_user")
            sync_names(ctx["repo_root"], discovered, ctx["console_user"])


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main(argv: list[str] | None = None) -> int:
    """CLI / daemon entry point. Returns the process exit code."""
    ap = argparse.ArgumentParser(description="Home-aware LAN routing (source of truth: data/mesh-hosts.json)")
    ap.add_argument("--once", action="store_true", help="run one cycle and exit")
    ap.add_argument("--interval", type=int, default=PROBE_INTERVAL_SEC)
    ap.add_argument("--status", action="store_true", help="print home/away + current LAN route and exit (no changes)")
    args = ap.parse_args(argv)
    # A zero/negative interval would skip the sleep entirely and spin the loop.
    interval = max(1, args.interval)

    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s",
                        datefmt="%Y-%m-%d %H:%M:%S")

    try:
        data_file = find_data_file()
        cfg = load_config(data_file)
    except (FileNotFoundError, ValueError, KeyError) as e:
        logger.error("config: %s", e)
        return 1

    if args.status:
        gw, gwif = default_route()
        gw_mac = arp_mac(gw) if gw else None
        home = bool(gw and gwif and gw == cfg.gateway and gw_mac == cfg.gateway_mac)
        print(f"location : {'HOME' if home else 'AWAY'}")
        print(f"gateway : {gw} on {gwif} (mac {gw_mac if gw else '?'}; home mac {cfg.gateway_mac})")
        print(f"{cfg.lan_cidr} currently via: {subnet_route_iface(cfg.lan_cidr)}")
        print(f"wg interface: {iface_in_cidr(cfg.mesh_cidr)}")
        sp = os.path.join(os.path.dirname(data_file), "lan-state.json")
        if os.path.isfile(sp):
            try:
                with open(sp, encoding="utf-8") as fh:
                    print(f"discovered LAN IPs: {json.load(fh)}")
            except (json.JSONDecodeError, OSError) as e:
                print(f"discovered LAN IPs: (unreadable: {e})")
        else:
            print("discovered LAN IPs: (none yet)")
        return 0

    if os.geteuid() != 0:
        logger.error("must run as root (route changes require it)")
        return 1

    logger.info("started — interval=%ds lan=%s home-gw=%s/%s mesh=%s",
                interval, cfg.lan_cidr, cfg.gateway, cfg.gateway_mac, cfg.mesh_cidr)

    stop = [False]

    def _sig(signum, _f):
        logger.info("signal %d — exiting after cycle", signum)
        stop[0] = True

    signal.signal(signal.SIGTERM, _sig)
    signal.signal(signal.SIGINT, _sig)

    repo_root = os.path.dirname(os.path.dirname(data_file))
    cu = console_user()
    logger.info("name-sync: repo=%s console-user=%s", repo_root, cu or "(none)")
    ctx: dict = {
        "last_state": None,
        "repo_root": repo_root,
        "console_user": cu,
        "lan_hosts": lan_hosts_with_mac(data_file),
        "last_sweep": 0.0,
    }
    last_cfg = cfg
    while True:
        # Top-level boundary of a KeepAlive daemon: log and carry on with the
        # last-good config rather than dying over a half-written data file.
        try:
            last_cfg = load_config(find_data_file())
            ctx["lan_hosts"] = lan_hosts_with_mac(data_file)
        except Exception:
            logger.exception("config reload failed — using last-good")
        try:
            reconcile(last_cfg, ctx)
        except Exception:
            logger.exception("reconcile failed")
        if args.once or stop[0]:
            return 0
        # Sleep in 1s steps so a SIGTERM is honoured promptly.
        slept = 0
        while slept < interval and not stop[0]:
            time.sleep(1)
            slept += 1


if __name__ == "__main__":
    sys.exit(main())