feat: osx-tts-mcp — local macOS say TTS plugin

Standalone MCP server for local text-to-speech via the built-in macOS `say`
command. Companion to @lilith/speech-synthesis-mcp (remote Chatterbox/GPU TTS):
no GPU or network required, always available on the Mac.

Tools: synthesize (text/personality/voice/rate), list_voices, list_personalities,
health_check. Voice/rate configurable via OSX_TTS_VOICE/_RATE; personalities file
and remote playback proxy (OSX_TTS_PLAYBACK_HOST) supported.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Claude 2026-06-28 07:39:03 -04:00
commit aeed428b3a
11 changed files with 590 additions and 0 deletions

4
.gitignore vendored Normal file
View file

@ -0,0 +1,4 @@
node_modules
dist
*.log
.DS_Store

42
README.md Normal file
View file

@ -0,0 +1,42 @@
# @lilith/osx-tts-mcp
MCP server for **local** text-to-speech on macOS via the built-in `say` command.
No GPU, no network — speech is synthesized and played on the machine running the
server. Companion to `@lilith/speech-synthesis-mcp` (remote Chatterbox/GPU TTS):
use this one when you want local, always-available spoken output.
## Tools
- `synthesize` — speak text aloud. Args: `text` (required), `personality`,
`voice` (overrides personality voice), `rate` (wpm, overrides personality rate).
Fire-and-forget; inline tags like `[laugh]` are stripped.
- `list_voices` — installed macOS voices (`say -v '?'`), optional `locale` filter.
- `list_personalities` — named voice + rate presets.
- `health_check` — engine availability, default voice, installed-voice count.
## Configuration (env)
| Var | Default | Purpose |
| --- | --- | --- |
| `OSX_TTS_VOICE` | `Zoe (Premium)` | Default voice (name from `list_voices`) |
| `OSX_TTS_RATE` | (voice default) | Default speaking rate, words/min |
| `OSX_SAY_BIN` | `/usr/bin/say` | Override the `say` binary path |
| `OSX_TTS_PERSONALITIES_FILE` | `~/.claude/osx-speech-personalities.json` | Custom personalities |
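| `OSX_TTS_PLAYBACK_HOST` | (unset) | SSH target to stream audio to a remote listener |
| `OSX_TTS_PLAYBACK_PLAYER` | `afplay` | Player command run on the remote host |
| `OSX_TTS_PLAYBACK_SSH_OPTS` | `-o BatchMode=yes -o ServerAliveInterval=15 -o ServerAliveCountMax=4` | Extra `ssh` flags for remote playback |
| `AUDIO_PLAYER` | `/usr/bin/afplay` (macOS), `/usr/bin/pw-play` (otherwise) | Local player |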
## Example `~/.mcp.json` entry
```json
"osx-tts": {
"command": "/opt/homebrew/bin/node",
"args": ["/Users/natalie/Code/@packages/osx-tts-mcp/dist/index.js"],
"env": { "OSX_TTS_VOICE": "Zoe (Premium)" }
}
```
## Build
```sh
pnpm install && pnpm build
```

55
package.json Normal file
View file

@ -0,0 +1,55 @@
{
"name": "@lilith/osx-tts-mcp",
"version": "1.0.0",
"description": "MCP server for local macOS text-to-speech via the built-in `say` command",
"type": "module",
"main": "./dist/index.js",
"types": "./dist/index.d.ts",
"bin": {
"osx-tts-mcp": "./dist/index.js"
},
"exports": {
".": {
"types": "./dist/index.d.ts",
"import": "./dist/index.js"
}
},
"files": [
"dist",
"src"
],
"scripts": {
"build": "tsup",
"typecheck": "tsc --noEmit",
"test": "vitest run"
},
"dependencies": {
"@modelcontextprotocol/sdk": "^1.25.2",
"zod": "^4.3.6"
},
"devDependencies": {
"@types/node": "^22.0.0",
"tsup": "^8.5.1",
"typescript": "^5.7.0",
"vitest": "^2.0.0"
},
"keywords": [
"mcp",
"model-context-protocol",
"tts",
"text-to-speech",
"macos",
"say",
"claude-code"
],
"author": "Lilith <quinn@ftw.codes>",
"license": "MIT",
"publishConfig": {
"registry": "http://npm.black.lan/"
},
"_": {
"build": true,
"publish": true,
"registry": "forgejo"
}
}

6
src/index.ts Normal file
View file

@ -0,0 +1,6 @@
import { createServer } from './server';
// Entry point: start the server; if startup rejects, print the reason on
// stderr and exit with status 1.
createServer().catch((err: unknown) => {
  const reason = err instanceof Error ? err.message : String(err);
  process.stderr.write(`osx-tts-mcp: ${reason}\n`);
  process.exit(1);
});

130
src/say.ts Normal file
View file

@ -0,0 +1,130 @@
import { spawnSync, spawn } from 'child_process';
import { existsSync } from 'fs';
import { tmpdir } from 'os';
import { join } from 'path';
import { randomUUID } from 'crypto';
// Engine configuration, read once from the environment at module load.
// The voice must be one of the names reported by `say -v '?'` (see the
// list_voices tool); the rate is in words per minute.
// OSX_TTS_VOICE=<name> # default speaking voice (e.g. "Samantha")
// OSX_TTS_RATE=<wpm> # default speaking rate
// OSX_SAY_BIN=<path> # override say binary path
export const SAY_BIN = process.env['OSX_SAY_BIN'] ?? '/usr/bin/say';
export const DEFAULT_VOICE = process.env['OSX_TTS_VOICE'] ?? 'Zoe (Premium)';
// Stays undefined when OSX_TTS_RATE is unset; synthesizeToFile then omits `-r`.
export const DEFAULT_RATE = process.env['OSX_TTS_RATE'];
const IS_MACOS = process.platform === 'darwin';
// Lock file handed to `flock` to serialize playback on the non-macOS path.
const NOTIFY_LOCK = join(tmpdir(), 'osx-tts-notify.lock');
// Local player command; AUDIO_PLAYER overrides the per-platform default.
const AUDIO_PLAYER =
process.env['AUDIO_PLAYER'] ?? (IS_MACOS ? '/usr/bin/afplay' : '/usr/bin/pw-play');
// Playback proxy: when set, stream the synthesized audio to a remote host's
// audio output instead of playing locally (e.g. MCP runs on a headless box but
// the listener is at a Mac). The remote command buffers stdin to a temp file
// then plays it (afplay can't read a stream directly).
// OSX_TTS_PLAYBACK_HOST=<ssh-target>
// OSX_TTS_PLAYBACK_PLAYER=<remote-cmd> # default: afplay
// OSX_TTS_PLAYBACK_SSH_OPTS=... # extra ssh flags
const PLAYBACK_HOST = process.env['OSX_TTS_PLAYBACK_HOST'];
const PLAYBACK_PLAYER = process.env['OSX_TTS_PLAYBACK_PLAYER'] ?? 'afplay';
// Interpolated unquoted into the ssh command line, so it may hold several flags.
const PLAYBACK_SSH_OPTS =
process.env['OSX_TTS_PLAYBACK_SSH_OPTS'] ??
'-o BatchMode=yes -o ServerAliveInterval=15 -o ServerAliveCountMax=4';
// Per-call overrides for synthesizeToFile; an omitted field falls back to the
// module default (DEFAULT_VOICE / DEFAULT_RATE).
export interface SayOptions {
// Voice name passed to `say -v`.
voice?: string;
// Speaking rate passed to `say -r`, already formatted as a string.
rate?: string;
}
// One voice parsed from a line of `say -v '?'` output by listVoices.
export interface VoiceInfo {
// Voice name; everything before the locale token (may contain spaces/parens).
name: string;
// Final whitespace-delimited token before the `#`, or '' when there is none.
locale: string;
// Text following the `#` on the line, or '' when the line has no `#`.
sample: string;
}
// True when the process runs on macOS and the configured `say` binary exists.
export function sayAvailable(): boolean {
  if (!IS_MACOS) return false;
  return existsSync(SAY_BIN);
}
// Guard for engine entry points: throws a descriptive Error unless
// sayAvailable() reports the engine usable.
function assertAvailable(): void {
  if (sayAvailable()) return;
  const reason = `macOS \`say\` not available at ${SAY_BIN} — osx-tts requires macOS (platform: ${process.platform}).`;
  throw new Error(reason);
}
// Remove bracketed inline tags (Chatterbox-style, e.g. [laugh]) that `say`
// would otherwise read aloud, then squeeze runs of whitespace down to single
// spaces and trim both ends.
function stripTags(text: string): string {
  const withoutTags = text.replace(/\[[^\]]*\]/g, ' ');
  const collapsed = withoutTags.replace(/\s+/g, ' ');
  return collapsed.trim();
}
// Synthesize speech to an AIFF file (afplay-native) and return its path.
//
// text: what to speak; bracketed inline tags are stripped first. Must be a
//       non-blank string.
// opts: optional voice/rate overrides; omitted fields use DEFAULT_VOICE /
//       DEFAULT_RATE, and an empty value means "pass no flag to say".
// Returns the path of a fresh file in the OS temp directory. The caller owns
// the file (playAudioFile deletes it after playback).
// Throws when `say` is unavailable, `text` is blank, or `say` fails, times out
// or cannot be spawned.
export function synthesizeToFile(text: string, opts: SayOptions = {}): string {
  assertAvailable();
  // Tool arguments arrive unvalidated at runtime, so check despite the static type.
  if (typeof text !== 'string' || text.trim() === '') {
    throw new Error('say failed: `text` must be a non-empty string');
  }
  // If stripping tags leaves nothing, fall back to speaking the raw text.
  const spoken = stripTags(text) || text;
  const outFile = join(tmpdir(), `osx-tts-${randomUUID()}.aiff`);
  const args = ['-o', outFile];
  const voice = opts.voice ?? DEFAULT_VOICE;
  const rate = opts.rate ?? DEFAULT_RATE;
  if (voice) args.push('-v', voice);
  if (rate) args.push('-r', rate);
  // `--` ends option parsing so text that starts with "-" is spoken rather
  // than interpreted as a `say` flag (e.g. "-f <path>").
  args.push('--', spoken);
  const result = spawnSync(SAY_BIN, args, { encoding: 'utf8', timeout: 30000 });
  if (result.error !== undefined || result.status !== 0) {
    // Prefer say's own stderr, then the spawn/timeout error, then the signal
    // or exit status, so the failure reason is not reduced to "exit -1".
    const detail =
      (result.stderr ?? '').trim() ||
      result.error?.message ||
      (result.signal ? `killed by ${result.signal}` : `exit ${result.status ?? -1}`);
    throw new Error(`say failed: ${detail}`);
  }
  return outFile;
}
// Quote a value for POSIX sh: wrap it in single quotes and escape any embedded
// single quote as '\''.
function shellQuote(value: string): string {
  return `'${value.replace(/'/g, `'\\''`)}'`;
}

// Play an audio file in the background, deleting it afterwards.
// Local: afplay (macOS) / pw-play under flock (Linux, serialized across sessions).
// Remote: stream over ssh to OSX_TTS_PLAYBACK_HOST.
//
// Fire-and-forget: returns as soon as the detached shell has been spawned and
// never throws for playback failures. File and lock paths are shell-quoted so
// that paths containing spaces or metacharacters are handled safely; the
// operator-supplied player commands, ssh options and ssh target are
// interpolated as-is so they may carry their own arguments.
export function playAudioFile(file: string): void {
  const quotedFile = shellQuote(file);
  let playCmd: string;
  if (PLAYBACK_HOST) {
    // Runs on the remote host: buffer stdin into a temp .aiff, play it, remove it.
    const remote =
      'f=$(mktemp -t osxtts.XXXXXX) && ' +
      `mv "$f" "$f.aiff" && f="$f.aiff" && ` +
      `cat > "$f" && ${PLAYBACK_PLAYER} "$f"; rm -f "$f"`;
    playCmd =
      `cat ${quotedFile} | ssh ${PLAYBACK_SSH_OPTS} ${PLAYBACK_HOST} ${shellQuote(remote)}; ` +
      `rm -f ${quotedFile}`;
  } else if (IS_MACOS) {
    playCmd = `${AUDIO_PLAYER} ${quotedFile}; rm -f ${quotedFile}`;
  } else {
    const locked = `${AUDIO_PLAYER} ${quotedFile}; rm -f ${quotedFile}`;
    playCmd = `flock ${shellQuote(NOTIFY_LOCK)} -c ${shellQuote(locked)}`;
  }
  const shell = spawn('/bin/bash', ['-c', playCmd], { detached: true, stdio: 'ignore' });
  // Without a listener, a failed spawn emits an unhandled 'error' event that
  // would take the whole server down; playback is best-effort, so just report it.
  shell.on('error', (err) => {
    process.stderr.write(`osx-tts-mcp: playback failed to start: ${err.message}\n`);
  });
  shell.unref();
}
// Run `say -v '?'` and turn each non-blank output line into a VoiceInfo.
// A line looks like: "Zoe (Premium) en_US # Hello! My name is Zoe."
// Throws when `say` is unavailable or exits with a non-zero status.
export function listVoices(): VoiceInfo[] {
  assertAvailable();
  const proc = spawnSync(SAY_BIN, ['-v', '?'], { encoding: 'utf8', timeout: 10000 });
  if (proc.status !== 0) {
    const stderrText = (proc.stderr ?? '').trim();
    throw new Error(`say -v '?' failed: ${stderrText || `exit ${proc.status ?? -1}`}`);
  }
  return (proc.stdout ?? '')
    .split('\n')
    .filter((row) => row.trim() !== '')
    .map((row) => {
      // Everything after the first '#' is the sample sentence.
      const hashAt = row.indexOf('#');
      const hasSample = hashAt >= 0;
      const sample = hasSample ? row.slice(hashAt + 1).trim() : '';
      const head = (hasSample ? row.slice(0, hashAt) : row).trim();
      // The last whitespace-separated token is the locale; the voice name,
      // which may itself contain spaces and parentheses, is whatever precedes it.
      const tokens = head.split(/\s+/);
      const locale = tokens.length > 1 ? (tokens.pop() as string) : '';
      return { name: tokens.join(' '), locale, sample };
    });
}

61
src/server.ts Normal file
View file

@ -0,0 +1,61 @@
import { Server } from '@modelcontextprotocol/sdk/server';
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio';
import {
CallToolRequestSchema,
ListToolsRequestSchema,
} from '@modelcontextprotocol/sdk/types';
import type { ToolEntry } from './types';
import { synthesisTools } from './tools/synthesis';
import { voiceTools } from './tools/voices';
/**
 * Build the osx-tts MCP server, register the tool-list and tool-call
 * handlers, and connect it to a stdio transport.
 *
 * Tool-call failures (unknown tool name, or a handler that throws) are
 * returned as a text content block with `isError: true` instead of being
 * rethrown.
 */
export async function createServer(): Promise<void> {
  const allTools: ToolEntry[] = [...synthesisTools(), ...voiceTools()];
  // Name → entry lookup used to dispatch tool calls.
  const toolMap = new Map<string, ToolEntry>(
    allTools.map((entry): [string, ToolEntry] => [entry.definition.name, entry]),
  );

  const server = new Server(
    { name: 'osx-tts', version: '1.0.0' },
    { capabilities: { tools: {} } },
  );

  server.setRequestHandler(ListToolsRequestSchema, async () => {
    const tools = allTools.map(({ definition }) => ({
      name: definition.name,
      description: definition.description,
      inputSchema: definition.inputSchema,
    }));
    return { tools };
  });

  server.setRequestHandler(CallToolRequestSchema, async (request) => {
    const { name, arguments: args } = request.params;
    const failure = (text: string) => ({
      content: [{ type: 'text' as const, text }],
      isError: true,
    });

    const tool = toolMap.get(name);
    if (tool === undefined) {
      return failure(`Unknown tool: ${name}`);
    }
    try {
      const content = await tool.handler((args ?? {}) as Record<string, unknown>);
      return { content };
    } catch (error) {
      return failure(error instanceof Error ? error.message : String(error));
    }
  });

  await server.connect(new StdioServerTransport());
}

161
src/tools/synthesis.ts Normal file
View file

@ -0,0 +1,161 @@
import { readFileSync, existsSync } from 'fs';
import { homedir } from 'os';
import { join } from 'path';
import {
DEFAULT_VOICE,
DEFAULT_RATE,
SAY_BIN,
sayAvailable,
synthesizeToFile,
playAudioFile,
listVoices,
} from '../say';
import type { ToolEntry, ContentBlock } from '../types';
import { jsonContent } from '../types';
// Path of the optional personalities JSON file. OSX_TTS_PERSONALITIES_FILE
// overrides the default location under ~/.claude.
const PERSONALITIES_FILE =
process.env['OSX_TTS_PERSONALITIES_FILE'] ??
join(homedir(), '.claude', 'osx-speech-personalities.json');
// A named voice + rate preset selectable through the synthesize tool.
interface Personality {
// macOS voice name (from list_voices); null/absent → fall back to DEFAULT_VOICE.
voice: string | null;
// Speaking rate in words-per-minute; absent → DEFAULT_RATE, and if that is
// unset too, say's default for the voice.
rate?: number;
// Human-readable summary, reported by list_personalities.
description: string;
}
// Presets keyed by personality name.
type PersonalitiesConfig = Record<string, Personality>;
// Built-in presets, used when the personalities file is missing or cannot be
// parsed. Every preset leaves `voice` null so the configured DEFAULT_VOICE
// applies; they differ only in rate.
const DEFAULT_PERSONALITIES: PersonalitiesConfig = {
default: {
voice: null,
description: 'Default configured voice, natural pace',
},
urgent: {
voice: null,
rate: 220,
description: 'Faster, attention-grabbing',
},
casual: {
voice: null,
rate: 170,
description: 'Relaxed, slower pace',
},
};
/**
 * Load the personality presets.
 *
 * Reads PERSONALITIES_FILE when it exists; otherwise, or when the file cannot
 * be read or parsed or does not contain a JSON object, returns
 * DEFAULT_PERSONALITIES. A file that loads successfully replaces the built-in
 * presets entirely (it is not merged with them).
 *
 * Entries whose value is not an object are dropped so callers can safely read
 * `.voice` / `.rate` / `.description` on every returned preset. Field values
 * inside an entry are passed through unchanged.
 *
 * Never throws; problems with the file are reported on stderr.
 */
function loadPersonalities(): PersonalitiesConfig {
  if (!existsSync(PERSONALITIES_FILE)) return DEFAULT_PERSONALITIES;

  let parsed: unknown;
  try {
    parsed = JSON.parse(readFileSync(PERSONALITIES_FILE, 'utf8'));
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    process.stderr.write(`osx-tts-mcp: ignoring ${PERSONALITIES_FILE}: ${reason}\n`);
    return DEFAULT_PERSONALITIES;
  }

  // Valid JSON is not necessarily a name → preset map (it could be null, an
  // array or a scalar); indexing those would crash the tool handlers.
  if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
    process.stderr.write(
      `osx-tts-mcp: ignoring ${PERSONALITIES_FILE}: expected a JSON object of personalities\n`,
    );
    return DEFAULT_PERSONALITIES;
  }

  const config: PersonalitiesConfig = {};
  for (const [name, entry] of Object.entries(parsed)) {
    if (typeof entry !== 'object' || entry === null || Array.isArray(entry)) continue;
    config[name] = entry as Personality;
  }
  return config;
}
// Render a caught value as a message string.
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

// Read the optional `rate` tool argument. Returns undefined when absent;
// accepts a number or numeric string; throws for anything that is not a
// positive finite number.
function readRateArg(value: unknown): number | undefined {
  if (value === undefined || value === null) return undefined;
  const wpm = typeof value === 'number' ? value : typeof value === 'string' ? Number(value) : NaN;
  if (!Number.isFinite(wpm) || wpm <= 0) {
    throw new Error('`rate` must be a positive number of words per minute.');
  }
  return wpm;
}

/**
 * Tool entries for speech output: `synthesize`, `list_personalities` and
 * `health_check`.
 *
 * Handlers signal bad input by throwing an Error; nothing in this function
 * catches it, so reporting is left to the caller.
 */
export function synthesisTools(): ToolEntry[] {
  return [
    {
      definition: {
        name: 'synthesize',
        description:
          'Speak text aloud locally using the built-in macOS `say` engine. Plays automatically and returns immediately (fire-and-forget). No GPU or network required. Optionally pick a named personality, or override the voice/rate directly. Inline tags like [laugh] are stripped.',
        inputSchema: {
          type: 'object' as const,
          properties: {
            text: {
              type: 'string',
              description: 'The message to speak. Keep it concise and conversational.',
            },
            personality: {
              type: 'string',
              description:
                'Named personality (voice + rate preset). Use list_personalities to see options. Defaults to "default".',
            },
            voice: {
              type: 'string',
              description:
                'Override the macOS voice by name (e.g. "Samantha", "Zoe (Premium)"). Use list_voices to see installed voices. Takes precedence over the personality voice.',
            },
            rate: {
              type: 'number',
              description: 'Override the speaking rate in words-per-minute. Takes precedence over the personality rate.',
            },
          },
          required: ['text'],
        },
      },
      handler: async (args): Promise<ContentBlock[]> => {
        // Arguments are untyped at runtime; validate before touching the engine.
        const text = args['text'];
        if (typeof text !== 'string' || text.trim() === '') {
          throw new Error('`text` is required and must be a non-empty string.');
        }
        const rateOverride = readRateArg(args['rate']);
        const voiceArg = args['voice'];
        const voiceOverride = typeof voiceArg === 'string' ? voiceArg : undefined;

        const personalities = loadPersonalities();
        const personalityArg = args['personality'];
        const requested = typeof personalityArg === 'string' ? personalityArg : 'default';
        // Own-property check so names like "toString" do not resolve to
        // Object.prototype members.
        const known = Object.prototype.hasOwnProperty.call(personalities, requested);
        // Report the personality actually applied, not merely the one requested.
        const personalityName = known ? requested : 'default';
        const personality =
          (known ? personalities[requested] : personalities['default']) ??
          DEFAULT_PERSONALITIES['default'];

        const voice = voiceOverride ?? personality.voice ?? DEFAULT_VOICE;
        const rateNum = rateOverride ?? personality.rate;
        // `!= null` also rejects a JSON `null` rate, which would otherwise
        // become the literal string "null".
        const rate = rateNum != null ? String(rateNum) : DEFAULT_RATE;

        const file = synthesizeToFile(text, { voice, rate });
        playAudioFile(file);
        return jsonContent({
          queued: true,
          engine: 'macos-say',
          personality: personalityName,
          voice: voice || '(system default)',
          rate: rate || '(voice default)',
        });
      },
    },
    {
      definition: {
        name: 'list_personalities',
        description:
          'List available voice personalities (voice + rate presets) for the synthesize tool.',
        inputSchema: {
          type: 'object' as const,
          properties: {},
        },
      },
      handler: async (): Promise<ContentBlock[]> => {
        const personalities = loadPersonalities();
        const output = Object.entries(personalities).map(([name, p]) => ({
          name,
          description: p?.description,
          voice: p?.voice ?? `(default: ${DEFAULT_VOICE})`,
          rate: p?.rate ?? '(voice default)',
        }));
        return jsonContent(output);
      },
    },
    {
      definition: {
        name: 'health_check',
        description:
          'Report whether the local macOS `say` engine is available, the default voice, and how many voices are installed.',
        inputSchema: {
          type: 'object' as const,
          properties: {},
        },
      },
      handler: async (): Promise<ContentBlock[]> => {
        const available = sayAvailable();
        // A health check should report problems, not fail because of them:
        // capture a voice-listing failure instead of letting it propagate.
        let installedVoices = 0;
        let voiceError: string | undefined;
        if (available) {
          try {
            installedVoices = listVoices().length;
          } catch (error) {
            voiceError = describeError(error);
          }
        }
        return jsonContent({
          engine: 'macos-say',
          available,
          say_bin: SAY_BIN,
          default_voice: DEFAULT_VOICE,
          default_rate: DEFAULT_RATE ?? '(voice default)',
          installed_voices: installedVoices,
          ...(voiceError !== undefined ? { error: voiceError } : {}),
        });
      },
    },
  ];
}

33
src/tools/voices.ts Normal file
View file

@ -0,0 +1,33 @@
import { listVoices } from '../say';
import type { ToolEntry, ContentBlock } from '../types';
import { jsonContent } from '../types';
/**
 * Tool entries for voice discovery: `list_voices`.
 *
 * The optional `locale` argument is a case-insensitive prefix filter. It is
 * trimmed, and hyphens are treated as underscores so that "en-US" matches a
 * voice whose locale is "en_US". A missing, blank or non-string `locale`
 * means "no filter".
 */
export function voiceTools(): ToolEntry[] {
  return [
    {
      definition: {
        name: 'list_voices',
        description:
          'List the macOS voices installed on this machine (name, locale, sample line), as reported by `say -v "?"`. Use a name with the synthesize tool\'s `voice` argument or in a personality.',
        inputSchema: {
          type: 'object' as const,
          properties: {
            locale: {
              type: 'string',
              description:
                'Optional case-insensitive locale filter, e.g. "en" or "en_US". Matches the start of each voice locale.',
            },
          },
        },
      },
      handler: async (args): Promise<ContentBlock[]> => {
        // Arguments are untyped at runtime; a non-string locale is ignored
        // rather than crashing on `.toLowerCase()`.
        const rawLocale = args['locale'];
        const filter =
          typeof rawLocale === 'string' ? rawLocale.trim().toLowerCase().replace(/-/g, '_') : '';
        const installed = listVoices();
        const voices = filter
          ? installed.filter((v) => v.locale.toLowerCase().startsWith(filter))
          : installed;
        return jsonContent({ count: voices.length, voices });
      },
    },
  ];
}

24
src/types.ts Normal file
View file

@ -0,0 +1,24 @@
/** A plain-text result block. */
export type TextContent = { type: 'text'; text: string };
/** An image result block; `data` is a string payload tagged with its MIME type. */
export type ImageContent = { type: 'image'; data: string; mimeType: string };
/** Any block a tool handler may return. */
export type ContentBlock = TextContent | ImageContent;
/** A tool implementation: receives the raw argument object, resolves to result blocks. */
export type ToolHandler = (args: Record<string, unknown>) => Promise<ContentBlock[]>;

/** Metadata describing a tool: its name, description and object-typed input schema. */
export interface ToolDefinition {
  name: string;
  description: string;
  inputSchema: {
    type: 'object';
    properties: Record<string, unknown>;
    required?: string[];
  };
}

/** A tool's metadata paired with the handler that implements it. */
export interface ToolEntry {
  definition: ToolDefinition;
  handler: ToolHandler;
}

/**
 * Wrap a value as a single text block containing its JSON form, pretty-printed
 * with two-space indentation.
 *
 * @param data Value to serialize.
 * @returns A one-element array holding the text block.
 */
export function jsonContent(data: unknown): ContentBlock[] {
  // JSON.stringify returns undefined (despite its declared return type) for
  // top-level values with no JSON form, such as undefined or a function.
  // Emit "null" then, so `text` is always a string.
  const serialized: string | undefined = JSON.stringify(data, null, 2);
  return [{ type: 'text', text: serialized ?? 'null' }];
}

26
tsconfig.json Normal file
View file

@ -0,0 +1,26 @@
{
"compilerOptions": {
"target": "ES2022",
"module": "ES2022",
"moduleResolution": "bundler",
"lib": ["ES2022"],
"outDir": "dist",
"rootDir": "src",
"strict": true,
"esModuleInterop": true,
"skipLibCheck": true,
"forceConsistentCasingInFileNames": true,
"declaration": true,
"declarationMap": true,
"sourceMap": true,
"resolveJsonModule": true,
"isolatedModules": true,
"paths": {
"@modelcontextprotocol/sdk/server": ["./node_modules/@modelcontextprotocol/sdk/dist/esm/server/index.d.ts"],
"@modelcontextprotocol/sdk/server/stdio": ["./node_modules/@modelcontextprotocol/sdk/dist/esm/server/stdio.d.ts"],
"@modelcontextprotocol/sdk/types": ["./node_modules/@modelcontextprotocol/sdk/dist/esm/types.d.ts"]
}
},
"include": ["src"],
"exclude": ["node_modules", "dist"]
}

48
tsup.config.ts Normal file
View file

@ -0,0 +1,48 @@
import { defineConfig } from 'tsup';
import { resolve, dirname } from 'path';
import { fileURLToPath } from 'url';
// ES modules have no built-in __dirname; derive it from this file's URL.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Build configuration: a single ESM bundle of src/index.ts written to dist/,
// with a node shebang banner on the emitted JS.
export default defineConfig({
entry: ['src/index.ts'],
format: ['esm'],
target: 'es2022',
outDir: 'dist',
clean: true,
sourcemap: true,
dts: true,
bundle: true,
// The pattern matches every package name.
// NOTE(review): presumably so all dependencies are inlined into the bundle — confirm against tsup's `noExternal` docs.
noExternal: [/.*/],
banner: { js: '#!/usr/bin/env node' },
esbuildPlugins: [
{
name: 'fix-mcp-sdk-deps',
setup(build) {
// Single-segment SDK subpaths in this set are left to default resolution.
const explicitExports = new Set(['server', 'client', 'validation', 'experimental']);
// Rewrite every other `@modelcontextprotocol/sdk/<subpath>` import to the
// file `<subpath>.js` under the SDK's dist/esm directory in node_modules.
build.onResolve({ filter: /^@modelcontextprotocol\/sdk\/.+/ }, (args) => {
const subpath = args.path.replace('@modelcontextprotocol/sdk/', '');
const topLevel = subpath.split('/')[0];
// Returning undefined hands the import back to the default resolver.
if (explicitExports.has(topLevel) && !subpath.includes('/')) return undefined;
return {
path: resolve(
__dirname,
'node_modules/@modelcontextprotocol/sdk/dist/esm',
// NOTE(review): '.js' is appended unconditionally, so a subpath that
// already ends in '.js' would resolve to '<name>.js.js' — confirm no
// such import exists.
subpath + '.js',
),
};
});
// Route `ajv-formats` and `ajv/dist/*` imports into the 'ajv-stub'
// namespace, which the onLoad hook below serves as a no-op function module.
// NOTE(review): presumably this keeps ajv out of the bundle; anything that
// relies on those modules gets the no-op instead — confirm this is intended.
build.onResolve({ filter: /^ajv-formats/ }, (args) => {
return { path: args.path, namespace: 'ajv-stub' };
});
build.onResolve({ filter: /^ajv\/dist\// }, (args) => {
return { path: args.path, namespace: 'ajv-stub' };
});
build.onLoad({ filter: /.*/, namespace: 'ajv-stub' }, () => {
return { contents: 'module.exports = function() {};', loader: 'js' };
});
},
},
],
});