Revert "feat: local macOS say fallback when remote Chatterbox is unreachable"
This reverts commit 44a80003b0.
This commit is contained in:
parent
44a80003b0
commit
96a38d884c
2 changed files with 34 additions and 110 deletions
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "@lilith/speech-synthesis-mcp",
|
||||
"version": "1.1.0",
|
||||
"version": "1.0.1",
|
||||
"description": "MCP server for the Chatterbox TTS speech-synthesis service",
|
||||
"type": "module",
|
||||
"main": "./dist/index.js",
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ import { writeFileSync, readFileSync, existsSync } from 'fs';
|
|||
import { randomUUID } from 'crypto';
|
||||
import { tmpdir, homedir } from 'os';
|
||||
import { join } from 'path';
|
||||
import { rawFetch, BASE_URL } from '../client';
|
||||
import { rawFetch } from '../client';
|
||||
import type { ToolEntry, ContentBlock } from '../types';
|
||||
import { jsonContent } from '../types';
|
||||
|
||||
|
|
@ -31,74 +31,6 @@ const PLAYBACK_SSH_OPTS =
|
|||
process.env['SPEECH_PLAYBACK_SSH_OPTS'] ??
|
||||
'-o BatchMode=yes -o ServerAliveInterval=15 -o ServerAliveCountMax=4';
|
||||
|
||||
// Local fallback: when the remote Chatterbox service is unreachable (e.g. the
|
||||
// GPU host is offline or the mesh link is down), synthesize on the MCP host
|
||||
// itself using macOS `say`. Lower fidelity, but it always works without a GPU
|
||||
// and keeps spoken notifications flowing. macOS-only (no `say` on Linux).
|
||||
//
|
||||
// SPEECH_FALLBACK=off # disable local fallback entirely
|
||||
// SPEECH_FALLBACK_VOICE=<name> # macOS voice (e.g. "Samantha"); default system voice
|
||||
// SPEECH_FALLBACK_RATE=<wpm> # speaking rate in words/min (e.g. 180)
|
||||
// SPEECH_PRIMARY_ATTEMPTS=<n> # remote submit retries before failover (default 10)
|
||||
// Absolute path of the macOS text-to-speech binary used for local fallback.
const SAY_BIN = '/usr/bin/say';

// Fallback is on unless SPEECH_FALLBACK is set to exactly "off".
const FALLBACK_ENABLED = process.env['SPEECH_FALLBACK'] !== 'off';

// Optional `say` voice name and speaking rate; undefined when the variable is unset.
const FALLBACK_VOICE = process.env['SPEECH_FALLBACK_VOICE'];
const FALLBACK_RATE = process.env['SPEECH_FALLBACK_RATE'];

// Number of remote submit attempts before failing over. Anything that does not
// parse to a positive integer (unset, empty, non-numeric, zero, negative,
// fractional) yields the default of 10.
const PRIMARY_ATTEMPTS = ((raw: string | undefined): number => {
  const parsed = Number(raw);
  if (!Number.isInteger(parsed)) return 10;
  return parsed > 0 ? parsed : 10;
})(process.env['SPEECH_PRIMARY_ATTEMPTS']);
|
||||
|
||||
/**
 * Reports whether local `say` fallback can be used right now.
 *
 * @returns true only when fallback is enabled, the host is macOS, and the
 *          `say` binary exists at SAY_BIN.
 */
function fallbackAvailable(): boolean {
  if (!FALLBACK_ENABLED) return false;
  if (!IS_MACOS) return false;
  // Checked last so the filesystem is only touched when the cheap flags pass.
  return existsSync(SAY_BIN);
}
|
||||
|
||||
/**
 * Decides whether a thrown value looks like a connectivity failure (as opposed
 * to an error reported by the service itself).
 *
 * Matches, in order:
 *  - the message markers 'Failed to fetch' and 'TTS service unavailable';
 *  - 'fetch failed', the message Node's built-in fetch uses when the request
 *    never reached the server;
 *  - a socket-level error code on `err.cause` (where Node's fetch puts the
 *    underlying system error).
 *
 * @param err  Any thrown value; non-Error values are stringified.
 * @returns true when the value matches one of the markers above.
 */
function isNetworkError(err: unknown): boolean {
  const message = err instanceof Error ? err.message : String(err);
  const messageMarkers = ['Failed to fetch', 'fetch failed', 'TTS service unavailable'];
  if (messageMarkers.some((marker) => message.includes(marker))) return true;

  // Fall back to the underlying system error code, if one is attached.
  const cause: unknown = err instanceof Error ? (err as { cause?: unknown }).cause : undefined;
  if (typeof cause !== 'object' || cause === null) return false;
  const code = (cause as { code?: unknown }).code;
  const socketCodes = ['ECONNREFUSED', 'ECONNRESET', 'ENOTFOUND', 'EHOSTUNREACH', 'ENETUNREACH', 'ETIMEDOUT', 'EAI_AGAIN'];
  return typeof code === 'string' && socketCodes.includes(code);
}
|
||||
|
||||
/**
 * Synthesizes `text` locally with macOS `say`, writing an AIFF file into the
 * OS temp directory.
 *
 * Bracketed inline tags such as `[laugh]` are stripped first so `say` does not
 * read them aloud; if stripping leaves nothing, the raw text is spoken instead.
 *
 * @param text  The text to speak.
 * @returns Path of the generated `.aiff` file. The caller owns the file.
 * @throws Error when `say` cannot be started, is killed (including by the 30s
 *         timeout), or exits non-zero.
 */
function speakFallback(text: string): string {
  const clean = text.replace(/\[[^\]]*\]/g, ' ').replace(/\s+/g, ' ').trim() || text;
  const outFile = join(tmpdir(), `speech-fallback-${randomUUID()}.aiff`);

  const sayArgs = ['-o', outFile];
  if (FALLBACK_VOICE) sayArgs.push('-v', FALLBACK_VOICE);
  if (FALLBACK_RATE) sayArgs.push('-r', FALLBACK_RATE);
  // NOTE(review): text that begins with '-' may be parsed by `say` as an
  // option rather than as speech — confirm and guard if needed.
  sayArgs.push(clean);

  const result = spawnSync(SAY_BIN, sayArgs, { encoding: 'utf8', timeout: 30000 });
  if (result.error || result.status !== 0) {
    // On spawn failure or timeout, status is null and stderr is empty, so
    // prefer the spawn error or the terminating signal over a bare "exit -1".
    const detail =
      (result.stderr ?? '').trim() ||
      result.error?.message ||
      (result.signal ? `killed by ${result.signal}` : `exit ${result.status ?? -1}`);
    throw new Error(`Local fallback TTS (say) failed: ${detail}`);
  }
  return outFile;
}
|
||||
|
||||
/**
 * Plays an audio file in a detached background shell and deletes the local
 * file afterwards. Returns immediately; playback is not awaited.
 *
 * Routing:
 *  - PLAYBACK_HOST set: the file is piped over ssh; the remote side writes it
 *    to a temp file renamed to `.wav`, plays it with PLAYBACK_PLAYER, and
 *    removes it.
 *  - macOS: AUDIO_PLAYER is run directly on the file.
 *  - otherwise: AUDIO_PLAYER is run under `flock NOTIFY_LOCK`.
 *
 * NOTE(review): the remote temp file always gets a `.wav` suffix, even when the
 * input is the AIFF produced by local fallback — confirm the remote player
 * copes with that.
 *
 * @param file  Path of the audio file to play; removed once the command ends.
 */
function playAudioFile(file: string): void {
  const cleanup = `rm -f ${file}`;

  const buildCommand = (): string => {
    if (PLAYBACK_HOST) {
      const remoteScript = [
        'f=$(mktemp -t splay.XXXXXX) && ',
        `mv "$f" "$f.wav" && f="$f.wav" && `,
        `cat > "$f" && ${PLAYBACK_PLAYER} "$f"; rm -f "$f"`,
      ].join('');
      // Escape single quotes so the script can sit inside a '...' argument.
      const quotedScript = remoteScript.replace(/'/g, `'\\''`);
      return `cat ${file} | ssh ${PLAYBACK_SSH_OPTS} ${PLAYBACK_HOST} '${quotedScript}'; ${cleanup}`;
    }

    const playThenRemove = `${AUDIO_PLAYER} ${file}; ${cleanup}`;
    return IS_MACOS ? playThenRemove : `flock ${NOTIFY_LOCK} -c "${playThenRemove}"`;
  };

  const child = spawn('/bin/bash', ['-c', buildCommand()], { detached: true, stdio: 'ignore' });
  // Let the parent exit without waiting for playback to finish.
  child.unref();
}
|
||||
|
||||
interface Personality {
|
||||
voice_id: string | null;
|
||||
exaggeration: number;
|
||||
|
|
@ -163,7 +95,7 @@ async function rawFetchWithRetry<T>(
|
|||
throw lastError ?? new Error('TTS service unavailable');
|
||||
}
|
||||
|
||||
async function submitAndPoll(body: Record<string, unknown>, attempts: number = 10): Promise<{
|
||||
async function submitAndPoll(body: Record<string, unknown>): Promise<{
|
||||
audio_base64: string;
|
||||
format: string;
|
||||
sample_rate: number;
|
||||
|
|
@ -174,7 +106,6 @@ async function submitAndPoll(body: Record<string, unknown>, attempts: number = 1
|
|||
const submitted = await rawFetchWithRetry<{ job_id: string; status: string; queue_position: number }>(
|
||||
'/jobs',
|
||||
{ method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify(body) },
|
||||
attempts,
|
||||
);
|
||||
|
||||
const { job_id: jobId } = submitted;
|
||||
|
|
@ -197,7 +128,7 @@ export function synthesisTools(): ToolEntry[] {
|
|||
definition: {
|
||||
name: 'synthesize',
|
||||
description:
|
||||
'Synthesize speech from text using Chatterbox TTS. Plays automatically with cross-session queueing (no overlapping speech). Fire-and-forget: returns immediately while audio plays. Choose a personality to control voice character and emotional delivery. If the remote Chatterbox service is unreachable, automatically falls back to local macOS `say` so notifications still play (response reports engine: "local-fallback").',
|
||||
'Synthesize speech from text using Chatterbox TTS. Plays automatically with cross-session queueing (no overlapping speech). Fire-and-forget: returns immediately while audio plays. Choose a personality to control voice character and emotional delivery.',
|
||||
inputSchema: {
|
||||
type: 'object' as const,
|
||||
properties: {
|
||||
|
|
@ -228,45 +159,45 @@ export function synthesisTools(): ToolEntry[] {
|
|||
if (personality.exaggeration !== undefined) body['exaggeration'] = personality.exaggeration;
|
||||
if (personality.cfg_weight !== undefined) body['cfg_weight'] = personality.cfg_weight;
|
||||
|
||||
let result: Awaited<ReturnType<typeof submitAndPoll>>;
|
||||
try {
|
||||
result = await submitAndPoll(body, PRIMARY_ATTEMPTS);
|
||||
} catch (err) {
|
||||
// Remote Chatterbox unreachable — fall back to local macOS `say`
|
||||
// so notifications still get spoken. Only for network failures;
|
||||
// a real synthesis error (job 'failed') is surfaced as-is.
|
||||
if (fallbackAvailable() && isNetworkError(err)) {
|
||||
const file = speakFallback(args['text'] as string);
|
||||
playAudioFile(file);
|
||||
return [
|
||||
{
|
||||
type: 'text',
|
||||
text: JSON.stringify({
|
||||
queued: true,
|
||||
engine: 'local-fallback',
|
||||
fallback_reason: `Chatterbox unreachable at ${BASE_URL}; spoke via macOS say`,
|
||||
voice: FALLBACK_VOICE ?? '(system default)',
|
||||
personality: personalityName,
|
||||
}, null, 2),
|
||||
},
|
||||
];
|
||||
}
|
||||
throw err;
|
||||
}
|
||||
const result = await submitAndPoll(body);
|
||||
|
||||
const audioBuffer = Buffer.from(result.audio_base64, 'base64');
|
||||
const tmpFile = join(tmpdir(), `speech-notify-${randomUUID()}.wav`);
|
||||
writeFileSync(tmpFile, audioBuffer);
|
||||
|
||||
// Play audio in the background then clean up (see playAudioFile).
|
||||
playAudioFile(tmpFile);
|
||||
// Spawn background process: play audio then cleanup
|
||||
// Linux: flock serializes across sessions to prevent overlapping speech
|
||||
// macOS: afplay blocks until done; flock unavailable but overlap unlikely (5-min nag interval)
|
||||
// Remote: stream wav over ssh to PLAYBACK_HOST, where it's written to
|
||||
// a remote tmp file and afplayed (afplay can't read from a pipe).
|
||||
let playCmd: string;
|
||||
if (PLAYBACK_HOST) {
|
||||
const remote =
|
||||
'f=$(mktemp -t splay.XXXXXX) && ' +
|
||||
`mv "$f" "$f.wav" && f="$f.wav" && ` +
|
||||
`cat > "$f" && ${PLAYBACK_PLAYER} "$f"; rm -f "$f"`;
|
||||
// Single-quote-escape the remote command for safe embedding.
|
||||
const remoteEsc = remote.replace(/'/g, `'\\''`);
|
||||
playCmd =
|
||||
`cat ${tmpFile} | ssh ${PLAYBACK_SSH_OPTS} ${PLAYBACK_HOST} '${remoteEsc}'; ` +
|
||||
`rm -f ${tmpFile}`;
|
||||
} else if (IS_MACOS) {
|
||||
playCmd = `${AUDIO_PLAYER} ${tmpFile}; rm -f ${tmpFile}`;
|
||||
} else {
|
||||
playCmd = `flock ${NOTIFY_LOCK} -c "${AUDIO_PLAYER} ${tmpFile}; rm -f ${tmpFile}"`;
|
||||
}
|
||||
const shell = spawn(
|
||||
'/bin/bash',
|
||||
['-c', playCmd],
|
||||
{ detached: true, stdio: 'ignore' },
|
||||
);
|
||||
shell.unref();
|
||||
|
||||
return [
|
||||
{
|
||||
type: 'text',
|
||||
text: JSON.stringify({
|
||||
queued: true,
|
||||
engine: 'chatterbox',
|
||||
personality: personalityName,
|
||||
estimated_duration_seconds: result.duration_seconds,
|
||||
text_processed: result.text_processed,
|
||||
|
|
@ -322,15 +253,8 @@ export function synthesisTools(): ToolEntry[] {
|
|||
},
|
||||
},
|
||||
handler: async (): Promise<ContentBlock[]> => {
|
||||
try {
|
||||
const result = await rawFetch<{ ready: boolean }>('/ready');
|
||||
return [{ type: 'text', text: result.ready ? 'Model is loaded and ready.' : 'Model is NOT loaded (idle-stopped). First notify call will wake it — expect ~10s delay.' }];
|
||||
} catch (err) {
|
||||
if (isNetworkError(err) && fallbackAvailable()) {
|
||||
return [{ type: 'text', text: `Remote Chatterbox unreachable at ${BASE_URL}. Local fallback (macOS say) is available — synthesize will speak locally.` }];
|
||||
}
|
||||
throw err;
|
||||
}
|
||||
},
|
||||
},
|
||||
];
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue