Revert "feat: local macOS say fallback when remote Chatterbox is unreachable"

This reverts commit 44a80003b0.
This commit is contained in:
Natalie 2026-06-28 07:33:30 -04:00
parent 44a80003b0
commit 96a38d884c
2 changed files with 34 additions and 110 deletions

View file

@ -1,6 +1,6 @@
{
"name": "@lilith/speech-synthesis-mcp",
"version": "1.1.0",
"version": "1.0.1",
"description": "MCP server for the Chatterbox TTS speech-synthesis service",
"type": "module",
"main": "./dist/index.js",

View file

@ -3,7 +3,7 @@ import { writeFileSync, readFileSync, existsSync } from 'fs';
import { randomUUID } from 'crypto';
import { tmpdir, homedir } from 'os';
import { join } from 'path';
import { rawFetch, BASE_URL } from '../client';
import { rawFetch } from '../client';
import type { ToolEntry, ContentBlock } from '../types';
import { jsonContent } from '../types';
@ -31,74 +31,6 @@ const PLAYBACK_SSH_OPTS =
process.env['SPEECH_PLAYBACK_SSH_OPTS'] ??
'-o BatchMode=yes -o ServerAliveInterval=15 -o ServerAliveCountMax=4';
// Local fallback: when the remote Chatterbox service is unreachable (e.g. the
// GPU host is offline or the mesh link is down), synthesize on the MCP host
// itself using macOS `say`. Lower fidelity, but it always works without a GPU
// and keeps spoken notifications flowing. macOS-only (no `say` on Linux).
//
// SPEECH_FALLBACK=off # disable local fallback entirely
// SPEECH_FALLBACK_VOICE=<name> # macOS voice (e.g. "Samantha"); default system voice
// SPEECH_FALLBACK_RATE=<wpm> # speaking rate in words/min (e.g. 180)
// SPEECH_PRIMARY_ATTEMPTS=<n> # remote submit retries before failover (default 10)
// Absolute path of the macOS speech binary used for local synthesis.
const SAY_BIN = '/usr/bin/say';
// Fallback stays on unless SPEECH_FALLBACK is exactly "off".
const FALLBACK_ENABLED = !(process.env['SPEECH_FALLBACK'] === 'off');
// Optional overrides handed to `say`; undefined when the variable is unset.
const FALLBACK_VOICE = process.env['SPEECH_FALLBACK_VOICE'];
const FALLBACK_RATE = process.env['SPEECH_FALLBACK_RATE'];
// Remote submit attempts before failing over; anything that is not a
// positive integer (unset, empty, fractional, non-numeric) yields 10.
const PRIMARY_ATTEMPTS = (() => {
  const parsed = Number(process.env['SPEECH_PRIMARY_ATTEMPTS']);
  if (!Number.isInteger(parsed) || parsed <= 0) {
    return 10;
  }
  return parsed;
})();
// True only when fallback is enabled, the host is macOS, and the `say`
// binary exists at SAY_BIN. The filesystem check is skipped when either
// of the first two conditions already rules the fallback out.
function fallbackAvailable(): boolean {
  if (!FALLBACK_ENABLED || !IS_MACOS) {
    return false;
  }
  return existsSync(SAY_BIN);
}
// Classifies an error as a connectivity failure by looking for one of two
// marker substrings in its message. Non-Error values are stringified first.
function isNetworkError(err: unknown): boolean {
  const text = err instanceof Error ? err.message : String(err);
  const markers = ['Failed to fetch', 'TTS service unavailable'];
  return markers.some((marker) => text.includes(marker));
}
// Generate speech locally via macOS `say` into an AIFF file in the OS temp
// directory and return that file's path. The caller owns the file afterwards.
//
// Bracketed inline tags such as [laugh] are stripped first, since `say` would
// read them aloud. If stripping leaves nothing, the raw text is spoken instead.
//
// Throws an Error when `say` exits non-zero, is killed, times out (30 s), or
// cannot be started at all.
// NOTE(review): text whose first character is "-" is passed as a bare argv
// entry and may be parsed by `say` as an option — confirm and guard if so.
function speakFallback(text: string): string {
  const clean = text.replace(/\[[^\]]*\]/g, ' ').replace(/\s+/g, ' ').trim() || text;
  const outFile = join(tmpdir(), `speech-fallback-${randomUUID()}.aiff`);

  const sayArgs = ['-o', outFile];
  if (FALLBACK_VOICE) sayArgs.push('-v', FALLBACK_VOICE);
  if (FALLBACK_RATE) sayArgs.push('-r', FALLBACK_RATE);
  sayArgs.push(clean);

  const result = spawnSync(SAY_BIN, sayArgs, { encoding: 'utf8', timeout: 30000 });
  if (result.status !== 0) {
    // Prefer what `say` printed. When the process never ran or was killed
    // (spawn failure, timeout) stderr is empty and status is null, so fall
    // back to the spawn error or signal instead of a meaningless "exit -1".
    const stderrText = (result.stderr ?? '').trim();
    const detail =
      stderrText ||
      result.error?.message ||
      (result.signal ? `killed by ${result.signal}` : `exit ${result.status ?? -1}`);
    throw new Error(`Local fallback TTS (say) failed: ${detail}`);
  }
  return outFile;
}
// Start background playback of `file` and delete it once playback ends.
// The work runs in a detached bash process that is unref'd, so this function
// returns immediately and never reports playback errors.
//
// Command selection:
//   - PLAYBACK_HOST set: pipe the file over ssh; the remote side writes stdin
//     to a mktemp file renamed with a .wav suffix, plays it with
//     PLAYBACK_PLAYER, and removes it. The local file is removed afterwards.
//   - macOS: run AUDIO_PLAYER directly, then remove the file.
//   - otherwise: run AUDIO_PLAYER and the cleanup under `flock NOTIFY_LOCK`.
// NOTE(review): the remote copy always gets a .wav suffix, even when `file`
// is an .aiff — confirm the remote player copes with that.
function playAudioFile(file: string): void {
  const buildCommand = (): string => {
    if (PLAYBACK_HOST) {
      const remoteScript =
        'f=$(mktemp -t splay.XXXXXX) && ' +
        `mv "$f" "$f.wav" && f="$f.wav" && ` +
        `cat > "$f" && ${PLAYBACK_PLAYER} "$f"; rm -f "$f"`;
      // Escape single quotes so the script can sit inside '...' on the ssh line.
      const quotedScript = remoteScript.replace(/'/g, `'\\''`);
      return (
        `cat ${file} | ssh ${PLAYBACK_SSH_OPTS} ${PLAYBACK_HOST} '${quotedScript}'; ` +
        `rm -f ${file}`
      );
    }
    if (IS_MACOS) {
      return `${AUDIO_PLAYER} ${file}; rm -f ${file}`;
    }
    return `flock ${NOTIFY_LOCK} -c "${AUDIO_PLAYER} ${file}; rm -f ${file}"`;
  };

  const player = spawn('/bin/bash', ['-c', buildCommand()], {
    detached: true,
    stdio: 'ignore',
  });
  player.unref();
}
interface Personality {
voice_id: string | null;
exaggeration: number;
@ -163,7 +95,7 @@ async function rawFetchWithRetry<T>(
throw lastError ?? new Error('TTS service unavailable');
}
async function submitAndPoll(body: Record<string, unknown>, attempts: number = 10): Promise<{
async function submitAndPoll(body: Record<string, unknown>): Promise<{
audio_base64: string;
format: string;
sample_rate: number;
@ -174,7 +106,6 @@ async function submitAndPoll(body: Record<string, unknown>, attempts: number = 1
const submitted = await rawFetchWithRetry<{ job_id: string; status: string; queue_position: number }>(
'/jobs',
{ method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify(body) },
attempts,
);
const { job_id: jobId } = submitted;
@ -197,7 +128,7 @@ export function synthesisTools(): ToolEntry[] {
definition: {
name: 'synthesize',
description:
'Synthesize speech from text using Chatterbox TTS. Plays automatically with cross-session queueing (no overlapping speech). Fire-and-forget: returns immediately while audio plays. Choose a personality to control voice character and emotional delivery. If the remote Chatterbox service is unreachable, automatically falls back to local macOS `say` so notifications still play (response reports engine: "local-fallback").',
'Synthesize speech from text using Chatterbox TTS. Plays automatically with cross-session queueing (no overlapping speech). Fire-and-forget: returns immediately while audio plays. Choose a personality to control voice character and emotional delivery.',
inputSchema: {
type: 'object' as const,
properties: {
@ -228,45 +159,45 @@ export function synthesisTools(): ToolEntry[] {
if (personality.exaggeration !== undefined) body['exaggeration'] = personality.exaggeration;
if (personality.cfg_weight !== undefined) body['cfg_weight'] = personality.cfg_weight;
let result: Awaited<ReturnType<typeof submitAndPoll>>;
try {
result = await submitAndPoll(body, PRIMARY_ATTEMPTS);
} catch (err) {
// Remote Chatterbox unreachable — fall back to local macOS `say`
// so notifications still get spoken. Only for network failures;
// a real synthesis error (job 'failed') is surfaced as-is.
if (fallbackAvailable() && isNetworkError(err)) {
const file = speakFallback(args['text'] as string);
playAudioFile(file);
return [
{
type: 'text',
text: JSON.stringify({
queued: true,
engine: 'local-fallback',
fallback_reason: `Chatterbox unreachable at ${BASE_URL}; spoke via macOS say`,
voice: FALLBACK_VOICE ?? '(system default)',
personality: personalityName,
}, null, 2),
},
];
}
throw err;
}
const result = await submitAndPoll(body);
const audioBuffer = Buffer.from(result.audio_base64, 'base64');
const tmpFile = join(tmpdir(), `speech-notify-${randomUUID()}.wav`);
writeFileSync(tmpFile, audioBuffer);
// Play audio in the background then clean up (see playAudioFile).
playAudioFile(tmpFile);
// Spawn background process: play audio then cleanup
// Linux: flock serializes across sessions to prevent overlapping speech
// macOS: afplay blocks until done; flock unavailable but overlap unlikely (5-min nag interval)
// Remote: stream wav over ssh to PLAYBACK_HOST, where it's written to
// a remote tmp file and afplayed (afplay can't read from a pipe).
let playCmd: string;
if (PLAYBACK_HOST) {
const remote =
'f=$(mktemp -t splay.XXXXXX) && ' +
`mv "$f" "$f.wav" && f="$f.wav" && ` +
`cat > "$f" && ${PLAYBACK_PLAYER} "$f"; rm -f "$f"`;
// Single-quote-escape the remote command for safe embedding.
const remoteEsc = remote.replace(/'/g, `'\\''`);
playCmd =
`cat ${tmpFile} | ssh ${PLAYBACK_SSH_OPTS} ${PLAYBACK_HOST} '${remoteEsc}'; ` +
`rm -f ${tmpFile}`;
} else if (IS_MACOS) {
playCmd = `${AUDIO_PLAYER} ${tmpFile}; rm -f ${tmpFile}`;
} else {
playCmd = `flock ${NOTIFY_LOCK} -c "${AUDIO_PLAYER} ${tmpFile}; rm -f ${tmpFile}"`;
}
const shell = spawn(
'/bin/bash',
['-c', playCmd],
{ detached: true, stdio: 'ignore' },
);
shell.unref();
return [
{
type: 'text',
text: JSON.stringify({
queued: true,
engine: 'chatterbox',
personality: personalityName,
estimated_duration_seconds: result.duration_seconds,
text_processed: result.text_processed,
@ -322,15 +253,8 @@ export function synthesisTools(): ToolEntry[] {
},
},
handler: async (): Promise<ContentBlock[]> => {
try {
const result = await rawFetch<{ ready: boolean }>('/ready');
return [{ type: 'text', text: result.ready ? 'Model is loaded and ready.' : 'Model is NOT loaded (idle-stopped). First notify call will wake it — expect ~10s delay.' }];
} catch (err) {
if (isNetworkError(err) && fallbackAvailable()) {
return [{ type: 'text', text: `Remote Chatterbox unreachable at ${BASE_URL}. Local fallback (macOS say) is available — synthesize will speak locally.` }];
}
throw err;
}
const result = await rawFetch<{ ready: boolean }>('/ready');
return [{ type: 'text', text: result.ready ? 'Model is loaded and ready.' : 'Model is NOT loaded (idle-stopped). First notify call will wake it — expect ~10s delay.' }];
},
},
];