Revert "feat: local macOS say fallback when remote Chatterbox is unreachable"

This reverts commit 44a80003b0.
2026-06-28 07:33:30 -04:00 · 2026-06-28 07:33:30 -04:00 · 96a38d884c
commit 96a38d884c
parent 44a80003b0
2 changed files with 34 additions and 110 deletions
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
  "name": "@lilith/speech-synthesis-mcp",
-  "version": "1.1.0",
+  "version": "1.0.1",
  "description": "MCP server for the Chatterbox TTS speech-synthesis service",
  "type": "module",
  "main": "./dist/index.js",
--- a/src/tools/synthesis.ts
+++ b/src/tools/synthesis.ts
@@ -3,7 +3,7 @@ import { writeFileSync, readFileSync, existsSync } from 'fs';
 import { randomUUID } from 'crypto';
 import { tmpdir, homedir } from 'os';
 import { join } from 'path';
-import { rawFetch, BASE_URL } from '../client';
+import { rawFetch } from '../client';
 import type { ToolEntry, ContentBlock } from '../types';
 import { jsonContent } from '../types';

@@ -31,74 +31,6 @@ const PLAYBACK_SSH_OPTS =
  process.env['SPEECH_PLAYBACK_SSH_OPTS'] ??
  '-o BatchMode=yes -o ServerAliveInterval=15 -o ServerAliveCountMax=4';

-// Local fallback: when the remote Chatterbox service is unreachable (e.g. the
-// GPU host is offline or the mesh link is down), synthesize on the MCP host
-// itself using macOS `say`. Lower fidelity, but it always works without a GPU
-// and keeps spoken notifications flowing. macOS-only (no `say` on Linux).
-//
-//   SPEECH_FALLBACK=off          # disable local fallback entirely
-//   SPEECH_FALLBACK_VOICE=<name> # macOS voice (e.g. "Samantha"); default system voice
-//   SPEECH_FALLBACK_RATE=<wpm>   # speaking rate in words/min (e.g. 180)
-//   SPEECH_PRIMARY_ATTEMPTS=<n>  # remote submit retries before failover (default 10)
-const SAY_BIN = '/usr/bin/say';
-const FALLBACK_ENABLED = process.env['SPEECH_FALLBACK'] !== 'off';
-const FALLBACK_VOICE = process.env['SPEECH_FALLBACK_VOICE'];
-const FALLBACK_RATE = process.env['SPEECH_FALLBACK_RATE'];
-const PRIMARY_ATTEMPTS = (() => {
-  const n = Number(process.env['SPEECH_PRIMARY_ATTEMPTS']);
-  return Number.isInteger(n) && n > 0 ? n : 10;
-})();
-
-function fallbackAvailable(): boolean {
-  return FALLBACK_ENABLED && IS_MACOS && existsSync(SAY_BIN);
-}
-
-function isNetworkError(err: unknown): boolean {
-  const message = err instanceof Error ? err.message : String(err);
-  return message.includes('Failed to fetch') || message.includes('TTS service unavailable');
-}
-
-// Generate speech locally via macOS `say` into an AIFF file (afplay-native).
-// Strips Chatterbox inline tags like [laugh] since `say` would read them aloud.
-function speakFallback(text: string): string {
-  const clean = text.replace(/\[[^\]]*\]/g, ' ').replace(/\s+/g, ' ').trim() || text;
-  const outFile = join(tmpdir(), `speech-fallback-${randomUUID()}.aiff`);
-  const sayArgs = ['-o', outFile];
-  if (FALLBACK_VOICE) sayArgs.push('-v', FALLBACK_VOICE);
-  if (FALLBACK_RATE) sayArgs.push('-r', FALLBACK_RATE);
-  sayArgs.push(clean);
-
-  const result = spawnSync(SAY_BIN, sayArgs, { encoding: 'utf8', timeout: 30000 });
-  if (result.status !== 0) {
-    const detail = (result.stderr ?? '').trim() || `exit ${result.status ?? -1}`;
-    throw new Error(`Local fallback TTS (say) failed: ${detail}`);
-  }
-  return outFile;
-}
-
-// Play a synthesized audio file in the background, cleaning up afterwards.
-// Routes through the same playback machinery as primary synthesis: stream over
-// ssh to PLAYBACK_HOST if set, else afplay (macOS) / pw-play under flock (Linux).
-function playAudioFile(file: string): void {
-  let playCmd: string;
-  if (PLAYBACK_HOST) {
-    const remote =
-      'f=$(mktemp -t splay.XXXXXX) && ' +
-      `mv "$f" "$f.wav" && f="$f.wav" && ` +
-      `cat > "$f" && ${PLAYBACK_PLAYER} "$f"; rm -f "$f"`;
-    const remoteEsc = remote.replace(/'/g, `'\\''`);
-    playCmd =
-      `cat ${file} | ssh ${PLAYBACK_SSH_OPTS} ${PLAYBACK_HOST} '${remoteEsc}'; ` +
-      `rm -f ${file}`;
-  } else if (IS_MACOS) {
-    playCmd = `${AUDIO_PLAYER} ${file}; rm -f ${file}`;
-  } else {
-    playCmd = `flock ${NOTIFY_LOCK} -c "${AUDIO_PLAYER} ${file}; rm -f ${file}"`;
-  }
-  const shell = spawn('/bin/bash', ['-c', playCmd], { detached: true, stdio: 'ignore' });
-  shell.unref();
-}
-
 interface Personality {
  voice_id: string | null;
  exaggeration: number;
@@ -163,7 +95,7 @@ async function rawFetchWithRetry<T>(
  throw lastError ?? new Error('TTS service unavailable');
 }

-async function submitAndPoll(body: Record<string, unknown>, attempts: number = 10): Promise<{
+async function submitAndPoll(body: Record<string, unknown>): Promise<{
  audio_base64: string;
  format: string;
  sample_rate: number;
@@ -174,7 +106,6 @@ async function submitAndPoll(body: Record<string, unknown>, attempts: number = 1
  const submitted = await rawFetchWithRetry<{ job_id: string; status: string; queue_position: number }>(
    '/jobs',
    { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify(body) },
-    attempts,
  );

  const { job_id: jobId } = submitted;
@@ -197,7 +128,7 @@ export function synthesisTools(): ToolEntry[] {
      definition: {
        name: 'synthesize',
        description:
-          'Synthesize speech from text using Chatterbox TTS. Plays automatically with cross-session queueing (no overlapping speech). Fire-and-forget: returns immediately while audio plays. Choose a personality to control voice character and emotional delivery. If the remote Chatterbox service is unreachable, automatically falls back to local macOS `say` so notifications still play (response reports engine: "local-fallback").',
+          'Synthesize speech from text using Chatterbox TTS. Plays automatically with cross-session queueing (no overlapping speech). Fire-and-forget: returns immediately while audio plays. Choose a personality to control voice character and emotional delivery.',
        inputSchema: {
          type: 'object' as const,
          properties: {
@@ -228,45 +159,45 @@ export function synthesisTools(): ToolEntry[] {
        if (personality.exaggeration !== undefined) body['exaggeration'] = personality.exaggeration;
        if (personality.cfg_weight !== undefined) body['cfg_weight'] = personality.cfg_weight;

-        let result: Awaited<ReturnType<typeof submitAndPoll>>;
-        try {
-          result = await submitAndPoll(body, PRIMARY_ATTEMPTS);
-        } catch (err) {
-          // Remote Chatterbox unreachable — fall back to local macOS `say`
-          // so notifications still get spoken. Only for network failures;
-          // a real synthesis error (job 'failed') is surfaced as-is.
-          if (fallbackAvailable() && isNetworkError(err)) {
-            const file = speakFallback(args['text'] as string);
-            playAudioFile(file);
-            return [
-              {
-                type: 'text',
-                text: JSON.stringify({
-                  queued: true,
-                  engine: 'local-fallback',
-                  fallback_reason: `Chatterbox unreachable at ${BASE_URL}; spoke via macOS say`,
-                  voice: FALLBACK_VOICE ?? '(system default)',
-                  personality: personalityName,
-                }, null, 2),
-              },
-            ];
-          }
-          throw err;
-        }
+        const result = await submitAndPoll(body);

        const audioBuffer = Buffer.from(result.audio_base64, 'base64');
        const tmpFile = join(tmpdir(), `speech-notify-${randomUUID()}.wav`);
        writeFileSync(tmpFile, audioBuffer);

-        // Play audio in the background then clean up (see playAudioFile).
-        playAudioFile(tmpFile);
+        // Spawn background process: play audio then cleanup
+        // Linux: flock serializes across sessions to prevent overlapping speech
+        // macOS: afplay blocks until done; flock unavailable but overlap unlikely (5-min nag interval)
+        // Remote: stream wav over ssh to PLAYBACK_HOST, where it's written to
+        //   a remote tmp file and afplayed (afplay can't read from a pipe).
+        let playCmd: string;
+        if (PLAYBACK_HOST) {
+          const remote =
+            'f=$(mktemp -t splay.XXXXXX) && ' +
+            `mv "$f" "$f.wav" && f="$f.wav" && ` +
+            `cat > "$f" && ${PLAYBACK_PLAYER} "$f"; rm -f "$f"`;
+          // Single-quote-escape the remote command for safe embedding.
+          const remoteEsc = remote.replace(/'/g, `'\\''`);
+          playCmd =
+            `cat ${tmpFile} | ssh ${PLAYBACK_SSH_OPTS} ${PLAYBACK_HOST} '${remoteEsc}'; ` +
+            `rm -f ${tmpFile}`;
+        } else if (IS_MACOS) {
+          playCmd = `${AUDIO_PLAYER} ${tmpFile}; rm -f ${tmpFile}`;
+        } else {
+          playCmd = `flock ${NOTIFY_LOCK} -c "${AUDIO_PLAYER} ${tmpFile}; rm -f ${tmpFile}"`;
+        }
+        const shell = spawn(
+          '/bin/bash',
+          ['-c', playCmd],
+          { detached: true, stdio: 'ignore' },
+        );
+        shell.unref();

        return [
          {
            type: 'text',
            text: JSON.stringify({
              queued: true,
-              engine: 'chatterbox',
              personality: personalityName,
              estimated_duration_seconds: result.duration_seconds,
              text_processed: result.text_processed,
@@ -322,15 +253,8 @@ export function synthesisTools(): ToolEntry[] {
        },
      },
      handler: async (): Promise<ContentBlock[]> => {
-        try {
        const result = await rawFetch<{ ready: boolean }>('/ready');
        return [{ type: 'text', text: result.ready ? 'Model is loaded and ready.' : 'Model is NOT loaded (idle-stopped). First notify call will wake it — expect ~10s delay.' }];
-        } catch (err) {
-          if (isNetworkError(err) && fallbackAvailable()) {
-            return [{ type: 'text', text: `Remote Chatterbox unreachable at ${BASE_URL}. Local fallback (macOS say) is available — synthesize will speak locally.` }];
-          }
-          throw err;
-        }
      },
    },
  ];