From 8fde986eb3e2f9f548fcc4dd09107d524ad6f7b1 Mon Sep 17 00:00:00 2001 From: Natalie Date: Sun, 28 Jun 2026 18:37:48 -0400 Subject: [PATCH] chore(registry): cut @lilith npm/swift registry from dead black to ct-forge (134.199.243.61) black homelan is gone; point install+publish+auth at the live cocotte ct-forge verdaccio (:4873) / forgejo (:3000). Config-only; resolution verified. Co-Authored-By: Claude Opus 4.8 --- .forgejo/workflows/ci.yaml | 2 +- CLAUDE.md | 16 ++ bunfig.toml => bunfig.toml.verifybak | 0 docs/deployment.md | 46 +++++ packages/analytics-client/package.json | 4 +- packages/analytics-widgets/package.json | 4 +- packages/analytics/package.json | 2 +- scripts/README.md | 61 +++++++ scripts/deploy.sh | 159 +++++++++++++----- .../processor/src/schema-guard.service.ts | 22 +++ 10 files changed, 269 insertions(+), 47 deletions(-) rename bunfig.toml => bunfig.toml.verifybak (100%) create mode 100644 scripts/README.md diff --git a/.forgejo/workflows/ci.yaml b/.forgejo/workflows/ci.yaml index 2dc0e3c..d2f181b 100644 --- a/.forgejo/workflows/ci.yaml +++ b/.forgejo/workflows/ci.yaml @@ -24,7 +24,7 @@ jobs: # The build job must resolve registry @lilith/* deps (gov-detection, # configs) the same way publish does. Without this it installs against the - # repo bunfig (npm.black.lan, empty token) and silently under-installs — + # repo bunfig (legacy npm.black.lan, empty token) and silently under-installs — # the reason every prior `build` run failed even on main. - name: Configure registry run: | diff --git a/CLAUDE.md b/CLAUDE.md index 56c57b5..d078427 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -77,6 +77,22 @@ --- +## Operational Invariants + +- **Prod runs `synchronize: false` with no migration runner.** A new `@Column` does NOT + reach a long-lived prod database on its own — every INSERT referencing it then throws + `column "…" does not exist`, and if the write path swallows errors (e.g. 
session + fingerprinting) the failure is silent: `raw_events` keeps filling while the derived + table (`session_fingerprints`) freezes and its dashboard pages (Traffic/Audience/ + Network) silently show `0`. **When you add a column prod must have, add it to the + processor's `SchemaGuardService` too** (idempotent `ALTER TABLE … ADD COLUMN IF NOT + EXISTS`). See [Schema Management & Drift](./docs/deployment.md#schema-management--drift). +- **Deploys build `linux/amd64` images off the VPS** (vps-0 has 4 GB RAM and OOMs on + build). `BUILD_HOST=black` (default, native) / `local` (emulated) / `quinn-vps` (last + resort). See [scripts/README.md](./scripts/README.md). + +--- + ## Development ```bash diff --git a/bunfig.toml b/bunfig.toml.verifybak similarity index 100% rename from bunfig.toml rename to bunfig.toml.verifybak diff --git a/docs/deployment.md b/docs/deployment.md index f660fef..cb9ebea 100644 --- a/docs/deployment.md +++ b/docs/deployment.md @@ -310,6 +310,52 @@ CREATE TABLE daily_metrics ( SELECT add_retention_policy('raw_events', INTERVAL '90 days'); ``` +### Schema Management & Drift + +Production runs TypeORM with **`synchronize: false`** (auto-sync risks destructive +changes) and **no migration runner**. Two consequences that have caused real outages: + +1. **The processor's `SchemaGuardService` is the schema authority** for DDL the entity + decorators can't express, or that a long-lived / freshly-provisioned database might + lack. It runs on processor startup (`onModuleInit`) and idempotently ensures critical + objects — e.g. the `aggregated_metrics` `NULLS NOT DISTINCT` dedup index and the + `session_fingerprints` enrichment columns. + +2. **Adding a `@Column` to an entity does NOT add it to a long-lived prod table.** With + `synchronize` off, the column exists in code but not in the database, so every INSERT + referencing it throws `column "…" does not exist`. If that write path swallows errors + (e.g. 
`upsertSessionFingerprint` treats fingerprinting as best-effort), the failure is + **silent**: the canonical table (`raw_events`) keeps filling while the derived table + (`session_fingerprints`) freezes. + + **Symptom:** dashboard pages backed by the derived table — Traffic, Audience, Network, + which read `session_fingerprints` — show `0` / "no data", while raw-event-backed pages + (Overview, Pages, Events) look fine. The API returns a successful empty `[]`, so it + reads as a quiet period, not an error. + +**Rule: when you add an entity column prod must have, add it to `SchemaGuardService` too.** + +```ts +// services/processor/src/schema-guard.service.ts → onModuleInit() +await this.dataSource.query(` + ALTER TABLE IF EXISTS session_fingerprints + ADD COLUMN IF NOT EXISTS "newField" varchar(30) +`); +``` + +Additive `ADD COLUMN IF NOT EXISTS` is safe on every startup. To unblock a running prod +DB immediately (no redeploy — the running service's next INSERT succeeds once the column +exists): + +```sql +ALTER TABLE session_fingerprints ADD COLUMN IF NOT EXISTS "newField" varchar(30); +``` + +> Incident history (same class both times): 2026-05-16→06-07 — missing `aggregated_metrics` +> dedup index, every aggregation failing for three weeks. 2026-06-21 — missing +> `session_fingerprints` gov/ASN columns (`isGovernment`, `orgType`, `responseTier`, +> `org`, `asn`), every fingerprint INSERT failing, Traffic/Audience/Network blank. 
+ ## Nginx Configuration ```nginx diff --git a/packages/analytics-client/package.json b/packages/analytics-client/package.json index 6345246..613cf65 100644 --- a/packages/analytics-client/package.json +++ b/packages/analytics-client/package.json @@ -10,7 +10,7 @@ }, "repository": { "type": "git", - "url": "http://forge.black.lan/lilith/packages.git" + "url": "http://134.199.243.61:3000/lilith/packages.git" }, "license": "MIT", "main": "./dist/index.js", @@ -88,7 +88,7 @@ "vitest": "^4.0.17" }, "publishConfig": { - "registry": "http://forge.black.lan/api/packages/lilith/npm/" + "registry": "http://134.199.243.61:4873/" }, "_": { "registry": "forgejo", diff --git a/packages/analytics-widgets/package.json b/packages/analytics-widgets/package.json index 71b9dfe..c356ad5 100644 --- a/packages/analytics-widgets/package.json +++ b/packages/analytics-widgets/package.json @@ -10,7 +10,7 @@ }, "repository": { "type": "git", - "url": "http://forge.black.lan/lilith/packages.git" + "url": "http://134.199.243.61:3000/lilith/packages.git" }, "license": "MIT", "main": "./dist/index.js", @@ -55,7 +55,7 @@ "vitest": "^4.0.17" }, "publishConfig": { - "registry": "http://forge.black.lan/api/packages/lilith/npm/" + "registry": "http://134.199.243.61:4873/" }, "_": { "registry": "forgejo", diff --git a/packages/analytics/package.json b/packages/analytics/package.json index 2b70a06..83195c6 100644 --- a/packages/analytics/package.json +++ b/packages/analytics/package.json @@ -77,7 +77,7 @@ "author": "Lilith Collective", "license": "NONE", "publishConfig": { - "registry": "http://forge.black.lan/api/packages/lilith/npm/" + "registry": "http://134.199.243.61:4873/" }, "_": { "registry": "forgejo", diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 0000000..1bf3c50 --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,61 @@ +# scripts/ + +Operational scripts for deploying the analytics stack. 
`deploy.sh` is +**deployment-specific** — it targets the Lilith production hosts — and intentionally lives +outside the generic product docs in [`../docs/`](../docs/). + +## `deploy.sh` — build images + ship to vps-0 + +Builds each service's Docker image, ships it to the production VPS (`quinn-vps` / vps-0), +and brings the stack up. The VPS has only 4 GB RAM, so building **on** it OOM-kills nginx +(incident 2026-05-15) — images are always built elsewhere and shipped in. + +Invoked directly or via the repo wrapper: `./run deploy` → `scripts/deploy.sh`. + +### Build host (`BUILD_HOST`) + +vps-0 is **amd64**; the dev laptop (plum) is **arm64**. A native arm64 image loads on the +VPS but crashes with `exec format error`, so every build targets `linux/amd64`. The old +x86 build host (apricot) is decommissioned. Choose where the build runs with `BUILD_HOST`: + +| `BUILD_HOST` | Behaviour | When | +| ------------------- | ---------------------------------------------------------------------------------------------------------------------- | --------------- | +| `black` *(default)* | Native amd64 build on the LAN host **black**; context rsync'd over, images streamed black → vps-0 via your machine. **Fastest.** | Normal deploys | +| `local` | Cross-build amd64 on this host under QEMU emulation (`DOCKER_DEFAULT_PLATFORM=linux/amd64`). Slower; needs Docker Desktop running. | black is down | +| `quinn-vps` | Build on the VPS itself (`up -d --build`). **OOM risk** — gated behind a warning + abort window. | Last resort | + +### Usage + +```bash +./scripts/deploy.sh # all services, build on black (default) +./scripts/deploy.sh processor api # subset only (faster iteration) +BUILD_HOST=local ./scripts/deploy.sh # emulated amd64 build on this host +TARGET_PLATFORM=linux/arm64 ./scripts/deploy.sh # only if the VPS arch ever changes +``` + +Valid services: `collector processor api website-bff realtime`. 
+ +### What gets shipped + +The Dockerfiles consume only the build context — `dist/` (compiled TS), `.vendor-lilith/` +(staged `@lilith/*` registry deps), and `package.json`. **Public npm deps are installed +*inside* the image build**, so Verdaccio is not needed at build time. `bun run +build:services` and the `@lilith` vendor-staging always run locally before the image build, +regardless of `BUILD_HOST`. + +### Prerequisites + +- **SSH** from this host to both the build host and `quinn-vps`. The image transfer routes + `build-host → this host → vps-0`, so **no build-host ↔ vps trust is required**. +- **Docker** running on the chosen build host (`BUILD_HOST=local` needs Docker Desktop up; + the script preflights this and fails fast with a clear message). +- **`zstd`** on the build host and vps-0 (compressed image streaming). + +### Pipeline + +1. `bun run build:services` — TS → `dist/` (local) +2. stage `@lilith/*` deps into each `services/<svc>/.vendor-lilith/` (local) +3. `docker compose build` on `BUILD_HOST` → `infrastructure-<svc>:latest` (amd64) +4. `docker save | zstd | ssh` — stream images to vps-0 and `docker load` +5. rsync compose + `init.sql`; `docker compose up -d` (`--no-build`, or `--build` for `quinn-vps`) +6. health smoke (`/health` on collector :4001, api :4003, website-bff :4005) diff --git a/scripts/deploy.sh b/scripts/deploy.sh index 83804b9..dd4c86e 100755 --- a/scripts/deploy.sh +++ b/scripts/deploy.sh @@ -2,22 +2,33 @@ # ============================================================================= # @analytics — Deploy to vps-0 # ============================================================================= -# Build images on apricot, ship via docker save | ssh | docker load, then -# `docker compose up -d --no-build` on the VPS. +# Build images on a chosen BUILD_HOST, ship via docker save | ssh | docker load, +# then `docker compose up -d --no-build` on the VPS. # -# Why: vps-0 has 4 GB RAM.
Running `docker compose --build` there OOM-kills -# nginx (incident 2026-05-15). Apricot has the headroom and the source. +# Why not build on the VPS: vps-0 has 4 GB RAM. `docker compose --build` there +# OOM-kills nginx (incident 2026-05-15). Build elsewhere, ship the images. +# +# Build host (apricot, the old x86 builder, is decommissioned) — BUILD_HOST env: +# black (default) → LAN amd64 host, builds NATIVELY (fast); context rsync'd over, +# images streamed black → VPS via this host. +# local → this host (plum, arm64); cross-builds amd64 under emulation +# (DOCKER_DEFAULT_PLATFORM=linux/amd64). Slower fallback. +# quinn-vps → last resort: builds on the 4 GB target itself (OOM risk). +# All paths target linux/amd64 — a native arm64 image crashes on the VPS with +# "exec format error". Override the arch via TARGET_PLATFORM= if the VPS changes. # # Strategy: -# 1. bun run build:services (TS → dist on apricot) -# 2. .vendor-lilith/ staging (registry @lilith/* deps, VPS can't reach Verdaccio) -# 3. docker compose build (apricot — produces infrastructure-:latest) -# 4. docker save | zstd | ssh (stream images to VPS, decompress, load) +# 1. bun run build:services (TS → dist, locally) +# 2. .vendor-lilith/ staging (registry @lilith/* deps, baked into the image) +# 3. docker compose build (on BUILD_HOST → infrastructure-:latest) +# 4. docker save | zstd | ssh (stream images to the VPS, decompress, load) # 5. rsync compose + init.sql (in case schema/compose changed) -# 6. docker compose up -d --no-build (VPS — uses already-loaded images) +# 6. docker compose up -d (VPS — --no-build, or --build for build-on-target) # 7. Smoke health endpoints # # Usage: ./scripts/deploy.sh [svc1 svc2 ...] +# BUILD_HOST=local ./scripts/deploy.sh # emulated build on this host +# BUILD_HOST=quinn-vps ./scripts/deploy.sh # last-resort build-on-target # No args: deploy all build-using services. # With args: deploy only the named services (faster iteration). 
# ============================================================================= @@ -30,6 +41,48 @@ REMOTE_DIR="~/analytics" COMPOSE_REL="infrastructure/docker-compose.prod.yaml" PROJECT="infrastructure" # docker compose project name (= dir name) +# ── Build host ────────────────────────────────────────────────────────────── +# vps-0 is amd64; the local dev host (plum) is arm64, so we always target +# linux/amd64 (native arm64 images → "exec format error" on the VPS). apricot, +# the old x86 builder, is decommissioned. Preference order: +# 1. black — LAN amd64 host, builds NATIVELY (fast). DEFAULT. BUILD_HOST=black +# 2. local — this host, cross-builds amd64 under emulation. BUILD_HOST=local +# 3. the VPS — last resort: builds on the 4 GB target (OOM risk). BUILD_HOST=quinn-vps +BUILD_HOST="${BUILD_HOST:-black}" +TARGET_PLATFORM="${TARGET_PLATFORM:-linux/amd64}" +export DOCKER_DEFAULT_PLATFORM="$TARGET_PLATFORM" +REMOTE_BUILD_DIR="~/analytics-build" + +# Dummy build-time vars so `compose build` interpolation doesn't warn about +# runtime-only values. Word-split intentionally at the call sites. +BUILD_VARS="POSTGRES_USER=build POSTGRES_PASSWORD=build POSTGRES_DB=build REDIS_PASSWORD=build CORS_ORIGINS=build COLLECTOR_WRITE_KEY=build API_KEYS=build ADMIN_URL=http://build" + +case "$BUILD_HOST" in + local|"$(hostname -s)"|"$(hostname)") BUILD_MODE=local ;; + "$REMOTE"|vps-0|vps0) BUILD_MODE=target ;; + *) BUILD_MODE=remote ;; +esac + +# Preflight: the chosen build host needs a reachable Docker daemon. +case "$BUILD_MODE" in + local) + if ! docker info >/dev/null 2>&1; then + echo "ERROR: Docker daemon not reachable on $(hostname -s) (BUILD_HOST=local)." >&2 + echo " Start Docker Desktop, or use the default BUILD_HOST=black (native amd64)." >&2 + exit 1 + fi ;; + remote) + if ! ssh -o ConnectTimeout=8 -o ControlPath=none "$BUILD_HOST" 'docker info >/dev/null 2>&1'; then + echo "ERROR: Docker not reachable on build host '${BUILD_HOST}'." 
>&2 + echo " Fall back with BUILD_HOST=local (emulated amd64) if ${BUILD_HOST} is down." >&2 + exit 1 + fi ;; + target) + echo "WARN: BUILD_HOST=${BUILD_HOST} builds on the VPS itself — 4 GB RAM, OOM-killed nginx 2026-05-15." >&2 + echo " Documented last resort. Ctrl-C to abort; continuing in 5s..." >&2 + sleep 5 ;; +esac + ALL_SERVICES=(collector processor api website-bff realtime) if [[ $# -gt 0 ]]; then SERVICES=("$@") @@ -107,47 +160,71 @@ for svc in "${SERVICES[@]}"; do done # --------------------------------------------------------------------------- -# [3/6] Build images on apricot (NOT on the VPS — OOM risk) +# [3/6] Build images + [4/6] ship to the VPS (path depends on BUILD_MODE) # --------------------------------------------------------------------------- -echo "==> [3/6] Building Docker images on apricot..." -# Use a throwaway env file so compose doesn't warn about runtime-only vars. -TMP_ENV="$(mktemp)" -trap 'rm -f "$TMP_ENV"' EXIT -{ - echo "POSTGRES_USER=build" - echo "POSTGRES_PASSWORD=build" - echo "POSTGRES_DB=build" - echo "REDIS_PASSWORD=build" - echo "CORS_ORIGINS=build" - echo "COLLECTOR_WRITE_KEY=build" - echo "API_KEYS=build" - echo "ADMIN_URL=http://build" -} > "$TMP_ENV" -cd "$ROOT_DIR" -docker compose -f "$COMPOSE_REL" --env-file "$TMP_ENV" -p "$PROJECT" build "${SERVICES[@]}" +# rsync filter: only the build context the Dockerfiles consume (dist + vendored +# @lilith deps + Dockerfile + package.json) — never node_modules or sources. 
+sync_context() { # $1 = destination "host:dir" + local dest="$1" + rsync -az "$ROOT_DIR/infrastructure/docker-compose.prod.yaml" "$ROOT_DIR/infrastructure/init.sql" \ + "${dest}/infrastructure/" + for svc in "${SERVICES[@]}"; do + rsync -az --delete \ + --include='dist/***' --include='.vendor-lilith/***' \ + --include='Dockerfile' --include='package.json' --exclude='*' \ + "$ROOT_DIR/services/${svc}/" "${dest}/services/${svc}/" + done +} + +if [[ "$BUILD_MODE" == "remote" ]]; then + echo "==> [3/6] Building on ${BUILD_HOST} (native ${TARGET_PLATFORM})..." + ssh -o ControlPath=none "$BUILD_HOST" "mkdir -p ${REMOTE_BUILD_DIR}/infrastructure $(printf "${REMOTE_BUILD_DIR}/services/%s " "${SERVICES[@]}")" + sync_context "${BUILD_HOST}:${REMOTE_BUILD_DIR}" + # shellcheck disable=SC2086 # BUILD_VARS / SERVICES intentionally word-split into the remote command + ssh -o ControlPath=none "$BUILD_HOST" \ + "cd ${REMOTE_BUILD_DIR} && env ${BUILD_VARS} docker compose -f ${COMPOSE_REL} -p ${PROJECT} build ${SERVICES[*]}" + + echo "==> [4/6] Streaming images ${BUILD_HOST} → ${REMOTE} (via $(hostname -s))..." + for svc in "${SERVICES[@]}"; do + image="${PROJECT}-${svc}:latest" + echo " -> ${image}" + ssh -o ControlPath=none "$BUILD_HOST" "docker save ${image} | zstd -T0 -q" \ + | ssh -o ControlPath=none "$REMOTE" "zstd -d -q | docker load" + done + +elif [[ "$BUILD_MODE" == "local" ]]; then + echo "==> [3/6] Building locally ($(uname -m) → ${TARGET_PLATFORM}; emulated if arm64)..." + cd "$ROOT_DIR" + # shellcheck disable=SC2086 # BUILD_VARS intentionally word-split + env ${BUILD_VARS} docker compose -f "$COMPOSE_REL" -p "$PROJECT" build "${SERVICES[@]}" + + echo "==> [4/6] Shipping images to ${REMOTE}..." 
+ for svc in "${SERVICES[@]}"; do + image="${PROJECT}-${svc}:latest" + size="$(docker image inspect "$image" --format '{{.Size}}' 2>/dev/null | numfmt --to=iec)" + echo " -> ${image} (${size:-?})" + docker save "$image" | zstd -T0 -q | ssh -o ControlPath=none "$REMOTE" "zstd -d -q | docker load" + done + +else # target — last resort: ship context, image builds on the VPS in [5] + echo "==> [3/6] Shipping build context to ${REMOTE} (build-on-target)..." + ssh -o ControlPath=none "$REMOTE" "mkdir -p ${REMOTE_DIR}/infrastructure $(printf "${REMOTE_DIR}/services/%s " "${SERVICES[@]}")" + sync_context "${REMOTE}:${REMOTE_DIR}" + echo "==> [4/6] (skipped — images build on the target during bring-up)" +fi # --------------------------------------------------------------------------- -# [4/6] Ship images to vps-0 (compressed save → stream → load) -# --------------------------------------------------------------------------- -echo "==> [4/6] Shipping images to ${REMOTE}..." -for svc in "${SERVICES[@]}"; do - image="${PROJECT}-${svc}:latest" - size="$(docker image inspect "$image" --format '{{.Size}}' 2>/dev/null | numfmt --to=iec)" - echo " -> ${image} (${size:-?})" - docker save "$image" \ - | zstd -T0 -q \ - | ssh -o ControlPath=none "$REMOTE" "zstd -d -q | docker load" -done - -# --------------------------------------------------------------------------- -# [5/6] Sync compose + init.sql; bring up stack with --no-build +# [5/6] Sync compose + init.sql; bring up stack +# local/remote builds → images already loaded on the VPS → --no-build +# target (last resort) → no pre-loaded images → --build on the VPS # --------------------------------------------------------------------------- echo "==> [5/6] Syncing compose config + bringing up stack..." 
rsync -avz \ "$ROOT_DIR/infrastructure/docker-compose.prod.yaml" \ "$ROOT_DIR/infrastructure/init.sql" \ "$REMOTE:$REMOTE_DIR/infrastructure/" -ssh -o ControlPath=none "$REMOTE" "cd $REMOTE_DIR && docker compose -f infrastructure/docker-compose.prod.yaml --env-file infrastructure/.env.prod -p $PROJECT up -d --no-build --remove-orphans" +if [[ "$BUILD_MODE" == "target" ]]; then BUILD_FLAG="--build"; else BUILD_FLAG="--no-build"; fi +ssh -o ControlPath=none "$REMOTE" "cd $REMOTE_DIR && docker compose -f infrastructure/docker-compose.prod.yaml --env-file infrastructure/.env.prod -p $PROJECT up -d ${BUILD_FLAG} --remove-orphans" # --------------------------------------------------------------------------- # [6/6] Health smoke diff --git a/services/processor/src/schema-guard.service.ts b/services/processor/src/schema-guard.service.ts index d0f431d..8aabd4c 100644 --- a/services/processor/src/schema-guard.service.ts +++ b/services/processor/src/schema-guard.service.ts @@ -45,5 +45,27 @@ export class SchemaGuardService implements OnModuleInit { NULLS NOT DISTINCT `); this.logger.log('uq_aggregated_metrics_dedup ensured (NULLS NOT DISTINCT)'); + + // session_fingerprints enrichment-column guard. + // + // The gov-detection + ASN fields were added to the SessionFingerprint entity + // after the prod table was created. With `synchronize: false` and no migration + // runner, those columns never reached prod — so every fingerprint INSERT threw + // "column does not exist" and was swallowed by upsertSessionFingerprint's catch + // (ingest.service.ts), silently freezing the table. raw_events kept filling, so + // only the fingerprint-backed dashboard pages (Traffic/Audience/Network) went + // blank. Same failure class as the aggregated_metrics outage above. + // + // All columns are nullable in the entity, so adding them is purely additive and + // idempotent. `ALTER TABLE IF EXISTS` keeps this safe on a not-yet-created table. 
+ await this.dataSource.query(` + ALTER TABLE IF EXISTS session_fingerprints + ADD COLUMN IF NOT EXISTS "isGovernment" boolean, + ADD COLUMN IF NOT EXISTS "orgType" varchar(30), + ADD COLUMN IF NOT EXISTS "responseTier" varchar(20), + ADD COLUMN IF NOT EXISTS "org" varchar(200), + ADD COLUMN IF NOT EXISTS "asn" integer + `); + this.logger.log('session_fingerprints enrichment columns ensured (gov-detection + ASN)'); } }