feat: osx-tts-mcp — local macOS say TTS plugin

Standalone MCP server for local text-to-speech via the built-in macOS `say` command. Companion to @lilith/speech-synthesis-mcp (remote Chatterbox/GPU TTS): no GPU or network required, always available on the Mac. Tools: synthesize (text/personality/voice/rate), list_voices, list_personalities, health_check. Voice/rate configurable via OSX_TTS_VOICE/_RATE; personalities file and remote playback proxy (OSX_TTS_PLAYBACK_HOST) supported. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-28 07:39:03 -04:00 · 2026-06-28 07:39:03 -04:00 · aeed428b3a
commit aeed428b3a
11 changed files with 590 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,4 @@
+node_modules
+dist
+*.log
+.DS_Store
--- a/README.md
+++ b/README.md
@ -0,0 +1,42 @@
+# @lilith/osx-tts-mcp
+
+MCP server for **local** text-to-speech on macOS via the built-in `say` command.
+No GPU, no network — by default speech is synthesized and played on the machine
+running the server (set `OSX_TTS_PLAYBACK_HOST` to stream playback to another
+host over SSH). Companion to `@lilith/speech-synthesis-mcp` (remote
+Chatterbox/GPU TTS): use this one when you want local, always-available spoken
+output.
+
+## Tools
+
+- `synthesize` — speak text aloud. Args: `text` (required), `personality`,
+  `voice` (overrides personality voice), `rate` (wpm, overrides personality rate).
+  Fire-and-forget; inline tags like `[laugh]` are stripped.
+- `list_voices` — installed macOS voices (`say -v '?'`), optional `locale` filter.
+- `list_personalities` — named voice + rate presets.
+- `health_check` — engine availability, default voice, installed-voice count.
+
+## Configuration (env)
+
+| Var | Default | Purpose |
+| --- | --- | --- |
+| `OSX_TTS_VOICE` | `Zoe (Premium)` | Default voice (name from `list_voices`) |
+| `OSX_TTS_RATE` | (voice default) | Default speaking rate, words/min |
+| `OSX_SAY_BIN` | `/usr/bin/say` | Override the `say` binary path |
+| `OSX_TTS_PERSONALITIES_FILE` | `~/.claude/osx-speech-personalities.json` | Custom personalities |
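+| `OSX_TTS_PLAYBACK_HOST` | (unset) | SSH target to stream audio to for remote playback instead of playing locally |
+| `OSX_TTS_PLAYBACK_PLAYER` | `afplay` | Player command run on the remote host |
+| `OSX_TTS_PLAYBACK_SSH_OPTS` | `-o BatchMode=yes -o ServerAliveInterval=15 -o ServerAliveCountMax=4` | Extra `ssh` flags for remote playback |
+| `AUDIO_PLAYER` | `/usr/bin/afplay` (macOS), `/usr/bin/pw-play` (otherwise) | Local player |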
+
+## Example `~/.mcp.json` entry
+
+```json
+"osx-tts": {
+  "command": "/opt/homebrew/bin/node",
+  "args": ["/Users/natalie/Code/@packages/osx-tts-mcp/dist/index.js"],
+  "env": { "OSX_TTS_VOICE": "Zoe (Premium)" }
+}
+```
+
+## Build
+
+```sh
+pnpm install && pnpm build
+```
--- a/package.json
+++ b/package.json
@ -0,0 +1,55 @@
+{
+  "name": "@lilith/osx-tts-mcp",
+  "version": "1.0.0",
+  "description": "MCP server for local macOS text-to-speech via the built-in `say` command",
+  "type": "module",
+  "main": "./dist/index.js",
+  "types": "./dist/index.d.ts",
+  "bin": {
+    "osx-tts-mcp": "./dist/index.js"
+  },
+  "exports": {
+    ".": {
+      "types": "./dist/index.d.ts",
+      "import": "./dist/index.js"
+    }
+  },
+  "files": [
+    "dist",
+    "src"
+  ],
+  "scripts": {
+    "build": "tsup",
+    "typecheck": "tsc --noEmit",
+    "test": "vitest run"
+  },
+  "dependencies": {
+    "@modelcontextprotocol/sdk": "^1.25.2",
+    "zod": "^4.3.6"
+  },
+  "devDependencies": {
+    "@types/node": "^22.0.0",
+    "tsup": "^8.5.1",
+    "typescript": "^5.7.0",
+    "vitest": "^2.0.0"
+  },
+  "keywords": [
+    "mcp",
+    "model-context-protocol",
+    "tts",
+    "text-to-speech",
+    "macos",
+    "say",
+    "claude-code"
+  ],
+  "author": "Lilith <quinn@ftw.codes>",
+  "license": "MIT",
+  "publishConfig": {
+    "registry": "http://npm.black.lan/"
+  },
+  "_": {
+    "build": true,
+    "publish": true,
+    "registry": "forgejo"
+  }
+}
--- a/src/index.ts
+++ b/src/index.ts
@ -0,0 +1,6 @@
+import { createServer } from './server';
+
+// Start the server. If startup rejects, print the reason on stderr and exit
+// with status 1.
+const reportFatal = (err: unknown): void => {
+  const reason = err instanceof Error ? err.message : String(err);
+  process.stderr.write(`osx-tts-mcp: ${reason}\n`);
+  process.exit(1);
+};
+
+createServer().catch(reportFatal);
--- a/src/say.ts
+++ b/src/say.ts
@ -0,0 +1,130 @@
+import { spawnSync, spawn } from 'child_process';
+import { existsSync } from 'fs';
+import { tmpdir } from 'os';
+import { join } from 'path';
+import { randomUUID } from 'crypto';
+
+// macOS `say` binary and default voice/rate. Voice must be one of the names
+// from `say -v '?'` (use the list_voices tool). Rate is words-per-minute.
+//   OSX_TTS_VOICE=<name>   # default speaking voice (e.g. "Samantha")
+//   OSX_TTS_RATE=<wpm>     # default speaking rate
+//   OSX_SAY_BIN=<path>     # override say binary path
+export const SAY_BIN = process.env['OSX_SAY_BIN'] ?? '/usr/bin/say';
+export const DEFAULT_VOICE = process.env['OSX_TTS_VOICE'] ?? 'Zoe (Premium)';
+// Stays undefined when OSX_TTS_RATE is unset; synthesizeToFile then passes no
+// `-r` flag to `say`.
+export const DEFAULT_RATE = process.env['OSX_TTS_RATE'];
+
+// Platform gate: sayAvailable() only reports true on darwin.
+const IS_MACOS = process.platform === 'darwin';
+// Lock file handed to `flock` on the non-macOS playback path.
+const NOTIFY_LOCK = join(tmpdir(), 'osx-tts-notify.lock');
+// Local player command; the AUDIO_PLAYER env var overrides the per-platform
+// default (afplay on macOS, pw-play otherwise).
+const AUDIO_PLAYER =
+  process.env['AUDIO_PLAYER'] ?? (IS_MACOS ? '/usr/bin/afplay' : '/usr/bin/pw-play');
+
+// Playback proxy: when set, stream the synthesized audio to a remote host's
+// audio output instead of playing locally (e.g. MCP runs on a headless box but
+// the listener is at a Mac). The remote command buffers stdin to a temp file
+// then plays it (afplay can't read a stream directly).
+//   OSX_TTS_PLAYBACK_HOST=<ssh-target>
+//   OSX_TTS_PLAYBACK_PLAYER=<remote-cmd>   # default: afplay
+//   OSX_TTS_PLAYBACK_SSH_OPTS=...          # extra ssh flags
+const PLAYBACK_HOST = process.env['OSX_TTS_PLAYBACK_HOST'];
+const PLAYBACK_PLAYER = process.env['OSX_TTS_PLAYBACK_PLAYER'] ?? 'afplay';
+const PLAYBACK_SSH_OPTS =
+  process.env['OSX_TTS_PLAYBACK_SSH_OPTS'] ??
+  '-o BatchMode=yes -o ServerAliveInterval=15 -o ServerAliveCountMax=4';
+
+/** Per-call overrides accepted by synthesizeToFile. */
+export interface SayOptions {
+  /** Voice name passed to `say -v`; DEFAULT_VOICE is used when omitted. */
+  voice?: string;
+  /** Rate passed to `say -r` as a string; DEFAULT_RATE is used when omitted. */
+  rate?: string;
+}
+
+/** One entry parsed from a line of `say -v '?'` output. */
+export interface VoiceInfo {
+  /** Voice name; may contain spaces and parentheses, e.g. "Zoe (Premium)". */
+  name: string;
+  /** Last whitespace-delimited token before the `#`, e.g. "en_US"; '' if absent. */
+  locale: string;
+  /** Text following the `#` on the line; '' when the line has no `#`. */
+  sample: string;
+}
+
+/** True only when running on macOS and the configured `say` binary exists on disk. */
+export function sayAvailable(): boolean {
+  if (!IS_MACOS) {
+    return false;
+  }
+  return existsSync(SAY_BIN);
+}
+
+/**
+ * Guard for operations that shell out to `say`.
+ *
+ * @throws Error naming the binary path and current platform when
+ *   sayAvailable() is false.
+ */
+function assertAvailable(): void {
+  if (sayAvailable()) {
+    return;
+  }
+  const message = `macOS \`say\` not available at ${SAY_BIN} — osx-tts requires macOS (platform: ${process.platform}).`;
+  throw new Error(message);
+}
+
+// Remove bracketed inline tags such as [laugh] (which `say` would otherwise
+// read aloud), then squeeze each whitespace run to one space and trim the ends.
+function stripTags(text: string): string {
+  const withoutTags = text.replace(/\[[^\]]*\]/g, ' ');
+  const singleSpaced = withoutTags.replace(/\s+/g, ' ');
+  return singleSpaced.trim();
+}
+
+// Synthesize speech to an AIFF file (afplay-native) and return its path.
+//
+// Inline [tag] markers are stripped first; if stripping leaves nothing, the
+// raw text is spoken instead. The message is fed to `say` on stdin rather than
+// as a trailing argument, so text that starts with "-" (e.g. "-5 degrees") is
+// never parsed as a command-line option.
+//
+// text: message to speak.
+// opts: optional voice/rate; fall back to DEFAULT_VOICE / DEFAULT_RATE.
+// Returns the path of the generated .aiff in the OS temp directory; this
+// function does not delete it.
+// Throws Error when `say` is unavailable, cannot be spawned, times out (30 s),
+// is killed by a signal, or exits non-zero.
+export function synthesizeToFile(text: string, opts: SayOptions = {}): string {
+  assertAvailable();
+  const spoken = stripTags(text) || text;
+  const outFile = join(tmpdir(), `osx-tts-${randomUUID()}.aiff`);
+
+  const args = ['-o', outFile];
+  const voice = opts.voice ?? DEFAULT_VOICE;
+  const rate = opts.rate ?? DEFAULT_RATE;
+  if (voice) args.push('-v', voice);
+  if (rate) args.push('-r', rate);
+
+  // With no string argument and no -f, `say` reads the message from stdin.
+  const result = spawnSync(SAY_BIN, args, {
+    input: spoken,
+    encoding: 'utf8',
+    timeout: 30000,
+  });
+  // Spawn failures and timeouts set `error` and leave `status` null; report
+  // the real cause instead of a bare "exit -1".
+  if (result.error) {
+    throw new Error(`say failed: ${result.error.message}`);
+  }
+  if (result.status !== 0) {
+    const detail =
+      (result.stderr ?? '').trim() ||
+      (result.signal ? `killed by ${result.signal}` : `exit ${result.status ?? -1}`);
+    throw new Error(`say failed: ${detail}`);
+  }
+  return outFile;
+}
+
+// Wrap a value in single quotes for interpolation into a shell command line,
+// escaping any single quotes it contains.
+function shellQuote(value: string): string {
+  return `'${value.replace(/'/g, `'\\''`)}'`;
+}
+
+// Play an audio file in the background, deleting it afterwards.
+// Local: afplay (macOS) / pw-play under flock (Linux, serialized across sessions).
+// Remote: stream over ssh to OSX_TTS_PLAYBACK_HOST.
+//
+// The file path and lock path are shell-quoted so a temp directory containing
+// spaces or shell metacharacters cannot break playback or cleanup. The player
+// commands, ssh options and ssh target stay unquoted on purpose: they are
+// operator-supplied command fragments that may carry their own flags.
+export function playAudioFile(file: string): void {
+  const quotedFile = shellQuote(file);
+  let playCmd: string;
+  if (PLAYBACK_HOST) {
+    // Remote side: buffer stdin into a temp .aiff, play it, then remove it.
+    const remote =
+      'f=$(mktemp -t osxtts.XXXXXX) && ' +
+      `mv "$f" "$f.aiff" && f="$f.aiff" && ` +
+      `cat > "$f" && ${PLAYBACK_PLAYER} "$f"; rm -f "$f"`;
+    playCmd =
+      `cat ${quotedFile} | ssh ${PLAYBACK_SSH_OPTS} ${PLAYBACK_HOST} ${shellQuote(remote)}; ` +
+      `rm -f ${quotedFile}`;
+  } else if (IS_MACOS) {
+    playCmd = `${AUDIO_PLAYER} ${quotedFile}; rm -f ${quotedFile}`;
+  } else {
+    const locked = `${AUDIO_PLAYER} ${quotedFile}; rm -f ${quotedFile}`;
+    playCmd = `flock ${shellQuote(NOTIFY_LOCK)} -c ${shellQuote(locked)}`;
+  }
+  const shell = spawn('/bin/bash', ['-c', playCmd], { detached: true, stdio: 'ignore' });
+  // Without a listener, a failed spawn emits an unhandled 'error' event and
+  // takes the whole server down; playback is best-effort, so just log it.
+  shell.on('error', (err) => {
+    process.stderr.write(`osx-tts-mcp: playback failed to start: ${err.message}\n`);
+  });
+  shell.unref();
+}
+
+// Parse one non-blank line of `say -v '?'` output.
+// Each line looks like: "Zoe (Premium)        en_US    # Hello! My name is Zoe."
+function parseVoiceLine(line: string): VoiceInfo {
+  const hash = line.indexOf('#');
+  const sample = hash >= 0 ? line.slice(hash + 1).trim() : '';
+  const head = (hash >= 0 ? line.slice(0, hash) : line).trim();
+  // The locale is always the final whitespace-delimited token; the name
+  // (which may contain spaces and parens) is everything before it.
+  const parts = head.split(/\s+/);
+  const locale = parts.length > 1 ? (parts.pop() as string) : '';
+  return { name: parts.join(' '), locale, sample };
+}
+
+// Run `say -v '?'` and return the installed voices as structured entries.
+// Throws Error when `say` is unavailable, cannot be spawned, times out (10 s),
+// is killed by a signal, or exits non-zero.
+export function listVoices(): VoiceInfo[] {
+  assertAvailable();
+  const result = spawnSync(SAY_BIN, ['-v', '?'], { encoding: 'utf8', timeout: 10000 });
+  // Spawn failures and timeouts set `error` and leave `status` null; report
+  // the real cause instead of a bare "exit -1".
+  if (result.error) {
+    throw new Error(`say -v '?' failed: ${result.error.message}`);
+  }
+  if (result.status !== 0) {
+    const detail =
+      (result.stderr ?? '').trim() ||
+      (result.signal ? `killed by ${result.signal}` : `exit ${result.status ?? -1}`);
+    throw new Error(`say -v '?' failed: ${detail}`);
+  }
+
+  return (result.stdout ?? '')
+    .split('\n')
+    .filter((line) => line.trim() !== '')
+    .map(parseVoiceLine);
+}
--- a/src/server.ts
+++ b/src/server.ts
@ -0,0 +1,61 @@
+import { Server } from '@modelcontextprotocol/sdk/server';
+import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio';
+import {
+  CallToolRequestSchema,
+  ListToolsRequestSchema,
+} from '@modelcontextprotocol/sdk/types';
+
+import type { ToolEntry } from './types';
+import { synthesisTools } from './tools/synthesis';
+import { voiceTools } from './tools/voices';
+
+/**
+ * Build the MCP server, register the list-tools and call-tool handlers, and
+ * connect it to a stdio transport.
+ *
+ * A tool handler that throws, or a call naming an unknown tool, is answered
+ * with an `isError` text result instead of propagating the failure.
+ */
+export async function createServer(): Promise<void> {
+  const allTools: ToolEntry[] = [...synthesisTools(), ...voiceTools()];
+  const toolMap = new Map<string, ToolEntry>(
+    allTools.map((entry): [string, ToolEntry] => [entry.definition.name, entry]),
+  );
+
+  // Shape shared by every failure reported back to the client.
+  const errorResult = (text: string) => ({
+    content: [{ type: 'text' as const, text }],
+    isError: true,
+  });
+
+  const server = new Server(
+    { name: 'osx-tts', version: '1.0.0' },
+    { capabilities: { tools: {} } },
+  );
+
+  server.setRequestHandler(ListToolsRequestSchema, async () => {
+    const tools = allTools.map(({ definition }) => ({
+      name: definition.name,
+      description: definition.description,
+      inputSchema: definition.inputSchema,
+    }));
+    return { tools };
+  });
+
+  server.setRequestHandler(CallToolRequestSchema, async (request) => {
+    const { name, arguments: rawArgs } = request.params;
+    const tool = toolMap.get(name);
+    if (!tool) {
+      return errorResult(`Unknown tool: ${name}`);
+    }
+
+    try {
+      const handlerArgs = (rawArgs ?? {}) as Record<string, unknown>;
+      const content = await tool.handler(handlerArgs);
+      return { content };
+    } catch (error) {
+      return errorResult(error instanceof Error ? error.message : String(error));
+    }
+  });
+
+  await server.connect(new StdioServerTransport());
+}
--- a/src/tools/synthesis.ts
+++ b/src/tools/synthesis.ts
@ -0,0 +1,161 @@
+import { readFileSync, existsSync } from 'fs';
+import { homedir } from 'os';
+import { join } from 'path';
+import {
+  DEFAULT_VOICE,
+  DEFAULT_RATE,
+  SAY_BIN,
+  sayAvailable,
+  synthesizeToFile,
+  playAudioFile,
+  listVoices,
+} from '../say';
+import type { ToolEntry, ContentBlock } from '../types';
+import { jsonContent } from '../types';
+
+// Location of the optional custom personalities file; the
+// OSX_TTS_PERSONALITIES_FILE env var overrides the path under ~/.claude.
+const PERSONALITIES_FILE =
+  process.env['OSX_TTS_PERSONALITIES_FILE'] ??
+  join(homedir(), '.claude', 'osx-speech-personalities.json');
+
+// A named voice + rate preset selectable through the synthesize tool.
+interface Personality {
+  // macOS voice name (from list_voices); null/absent → fall back to DEFAULT_VOICE.
+  voice: string | null;
+  // Speaking rate in words-per-minute; absent → DEFAULT_RATE if configured,
+  // otherwise no rate is passed to `say`.
+  rate?: number;
+  // Human-readable summary shown by list_personalities.
+  description: string;
+}
+
+// Personality name → preset.
+type PersonalitiesConfig = Record<string, Personality>;
+
+// Built-in presets, used when the personalities file is missing or cannot be
+// parsed. None of them pins a voice, so they all speak with DEFAULT_VOICE.
+const DEFAULT_PERSONALITIES: PersonalitiesConfig = {
+  default: {
+    voice: null,
+    description: 'Default configured voice, natural pace',
+  },
+  urgent: {
+    voice: null,
+    rate: 220,
+    description: 'Faster, attention-grabbing',
+  },
+  casual: {
+    voice: null,
+    rate: 170,
+    description: 'Relaxed, slower pace',
+  },
+};
+
+/**
+ * Load the personality presets.
+ *
+ * When PERSONALITIES_FILE exists its contents replace the built-in set (no
+ * merging). Falls back to DEFAULT_PERSONALITIES when the file is missing,
+ * unreadable, not valid JSON, or its top-level value is not a plain object.
+ * Entries whose value is not an object are dropped, so callers can safely
+ * read `.voice` / `.rate` / `.description` on every returned preset.
+ */
+function loadPersonalities(): PersonalitiesConfig {
+  if (!existsSync(PERSONALITIES_FILE)) {
+    return DEFAULT_PERSONALITIES;
+  }
+  const isPlainObject = (value: unknown): value is Record<string, unknown> =>
+    typeof value === 'object' && value !== null && !Array.isArray(value);
+
+  try {
+    const parsed: unknown = JSON.parse(readFileSync(PERSONALITIES_FILE, 'utf8'));
+    if (!isPlainObject(parsed)) {
+      return DEFAULT_PERSONALITIES;
+    }
+    const usable = Object.entries(parsed).filter(([, preset]) => isPlainObject(preset));
+    return Object.fromEntries(usable) as PersonalitiesConfig;
+  } catch {
+    // Best effort: a broken custom file must not disable speech output.
+    return DEFAULT_PERSONALITIES;
+  }
+}
+
+// Validate the optional `rate` tool argument. Absent (undefined/null) means
+// "no override"; numbers and numeric strings are accepted; anything that is
+// not a positive finite number is rejected before it reaches `say -r`.
+function parseRateArg(value: unknown): number | undefined {
+  if (value === undefined || value === null) return undefined;
+  const wpm = typeof value === 'string' ? Number(value) : value;
+  if (typeof wpm !== 'number' || !Number.isFinite(wpm) || wpm <= 0) {
+    throw new Error('synthesize: `rate` must be a positive number of words per minute.');
+  }
+  return wpm;
+}
+
+/**
+ * Tool entries for speech synthesis: `synthesize`, `list_personalities` and
+ * `health_check`.
+ *
+ * Handlers signal invalid input by throwing an Error.
+ */
+export function synthesisTools(): ToolEntry[] {
+  return [
+    {
+      definition: {
+        name: 'synthesize',
+        description:
+          'Speak text aloud locally using the built-in macOS `say` engine. Plays automatically and returns immediately (fire-and-forget). No GPU or network required. Optionally pick a named personality, or override the voice/rate directly. Inline tags like [laugh] are stripped.',
+        inputSchema: {
+          type: 'object' as const,
+          properties: {
+            text: {
+              type: 'string',
+              description: 'The message to speak. Keep it concise and conversational.',
+            },
+            personality: {
+              type: 'string',
+              description:
+                'Named personality (voice + rate preset). Use list_personalities to see options. Defaults to "default".',
+            },
+            voice: {
+              type: 'string',
+              description:
+                'Override the macOS voice by name (e.g. "Samantha", "Zoe (Premium)"). Use list_voices to see installed voices. Takes precedence over the personality voice.',
+            },
+            rate: {
+              type: 'number',
+              description: 'Override the speaking rate in words-per-minute. Takes precedence over the personality rate.',
+            },
+          },
+          required: ['text'],
+        },
+      },
+      handler: async (args): Promise<ContentBlock[]> => {
+        // Reject a missing/blank message up front with a readable error
+        // instead of a TypeError from deep inside the synthesis path.
+        const text = args['text'];
+        if (typeof text !== 'string' || text.trim() === '') {
+          throw new Error('synthesize: `text` is required and must be a non-empty string.');
+        }
+
+        const personalities = loadPersonalities();
+        const requested = args['personality'];
+        const personalityName = typeof requested === 'string' ? requested : 'default';
+        // Own-property check so names like "constructor" do not resolve to
+        // inherited Object members.
+        const known = Object.prototype.hasOwnProperty.call(personalities, personalityName);
+        const personality =
+          (known ? personalities[personalityName] : undefined) ??
+          personalities['default'] ??
+          DEFAULT_PERSONALITIES['default'];
+        // Report the preset that was actually applied, not merely requested.
+        const appliedName = known ? personalityName : 'default';
+
+        const voiceArg = args['voice'];
+        const voiceOverride = typeof voiceArg === 'string' ? voiceArg : undefined;
+        const voice = voiceOverride ?? personality.voice ?? DEFAULT_VOICE;
+
+        // `?? undefined` folds a JSON `null` rate into "not set" so it is
+        // never stringified to "null".
+        const rateNum = parseRateArg(args['rate']) ?? personality.rate ?? undefined;
+        const rate = rateNum !== undefined ? String(rateNum) : DEFAULT_RATE;
+
+        const file = synthesizeToFile(text, { voice, rate });
+        playAudioFile(file);
+
+        return jsonContent({
+          queued: true,
+          engine: 'macos-say',
+          personality: appliedName,
+          ...(known ? {} : { fallback_from: personalityName }),
+          // An empty voice makes synthesizeToFile omit `-v`.
+          voice: voice || '(system default)',
+          rate: rate ?? '(voice default)',
+        });
+      },
+    },
+    {
+      definition: {
+        name: 'list_personalities',
+        description:
+          'List available voice personalities (voice + rate presets) for the synthesize tool.',
+        inputSchema: {
+          type: 'object' as const,
+          properties: {},
+        },
+      },
+      handler: async (): Promise<ContentBlock[]> => {
+        const personalities = loadPersonalities();
+        const output = Object.entries(personalities).map(([name, p]) => ({
+          name,
+          description: p.description,
+          voice: p.voice ?? `(default: ${DEFAULT_VOICE})`,
+          rate: p.rate ?? '(voice default)',
+        }));
+        return jsonContent(output);
+      },
+    },
+    {
+      definition: {
+        name: 'health_check',
+        description:
+          'Report whether the local macOS `say` engine is available, the default voice, and how many voices are installed.',
+        inputSchema: {
+          type: 'object' as const,
+          properties: {},
+        },
+      },
+      handler: async (): Promise<ContentBlock[]> => {
+        const available = sayAvailable();
+        let installedVoices = 0;
+        let voicesError: string | undefined;
+        if (available) {
+          // A health check should still answer when voice listing fails, so
+          // capture the failure instead of letting it abort the report.
+          try {
+            installedVoices = listVoices().length;
+          } catch (error) {
+            voicesError = error instanceof Error ? error.message : String(error);
+          }
+        }
+        return jsonContent({
+          engine: 'macos-say',
+          available,
+          say_bin: SAY_BIN,
+          default_voice: DEFAULT_VOICE,
+          default_rate: DEFAULT_RATE ?? '(voice default)',
+          installed_voices: installedVoices,
+          ...(voicesError !== undefined ? { voices_error: voicesError } : {}),
+        });
+      },
+    },
+  ];
+}
--- a/src/tools/voices.ts
+++ b/src/tools/voices.ts
@ -0,0 +1,33 @@
+import { listVoices } from '../say';
+import type { ToolEntry, ContentBlock } from '../types';
+import { jsonContent } from '../types';
+
+/** Tool entries for voice discovery; currently only `list_voices`. */
+export function voiceTools(): ToolEntry[] {
+  const listVoicesTool: ToolEntry = {
+    definition: {
+      name: 'list_voices',
+      description:
+        'List the macOS voices installed on this machine (name, locale, sample line), as reported by `say -v "?"`. Use a name with the synthesize tool\'s `voice` argument or in a personality.',
+      inputSchema: {
+        type: 'object' as const,
+        properties: {
+          locale: {
+            type: 'string',
+            description:
+              'Optional case-insensitive locale filter, e.g. "en" or "en_US". Matches the start of each voice locale.',
+          },
+        },
+      },
+    },
+    // Returns every installed voice, or only those whose locale starts with
+    // the given prefix (compared in lower case).
+    handler: async (args): Promise<ContentBlock[]> => {
+      const prefix = (args['locale'] as string | undefined)?.toLowerCase();
+      const installed = listVoices();
+      const voices = prefix
+        ? installed.filter(({ locale }) => locale.toLowerCase().startsWith(prefix))
+        : installed;
+      return jsonContent({ count: voices.length, voices });
+    },
+  };
+
+  return [listVoicesTool];
+}
--- a/src/types.ts
+++ b/src/types.ts
@ -0,0 +1,24 @@
+/** Text item in a tool result. */
+export type TextContent = { type: 'text'; text: string };
+/** Image item in a tool result: payload plus its MIME type. */
+export type ImageContent = { type: 'image'; data: string; mimeType: string };
+/** Any content item a tool handler may return. */
+export type ContentBlock = TextContent | ImageContent;
+
+/** Async tool implementation: receives the raw argument object, returns content items. */
+export type ToolHandler = (args: Record<string, unknown>) => Promise<ContentBlock[]>;
+
+/** Metadata advertised for a tool: its name, description and input schema. */
+export interface ToolDefinition {
+  name: string;
+  description: string;
+  /** Object-typed schema describing the tool's arguments. */
+  inputSchema: {
+    type: 'object';
+    properties: Record<string, unknown>;
+    required?: string[];
+  };
+}
+
+/** A tool's advertised definition paired with the handler that runs it. */
+export interface ToolEntry {
+  definition: ToolDefinition;
+  handler: ToolHandler;
+}
+
+/** Wrap `data` as a single text content item holding its 2-space-indented JSON. */
+export function jsonContent(data: unknown): ContentBlock[] {
+  const text = JSON.stringify(data, null, 2);
+  return [{ type: 'text', text }];
+}
--- a/tsconfig.json
+++ b/tsconfig.json
@ -0,0 +1,26 @@
+{
+  "compilerOptions": {
+    "target": "ES2022",
+    "module": "ES2022",
+    "moduleResolution": "bundler",
+    "lib": ["ES2022"],
+    "outDir": "dist",
+    "rootDir": "src",
+    "strict": true,
+    "esModuleInterop": true,
+    "skipLibCheck": true,
+    "forceConsistentCasingInFileNames": true,
+    "declaration": true,
+    "declarationMap": true,
+    "sourceMap": true,
+    "resolveJsonModule": true,
+    "isolatedModules": true,
+    "paths": {
+      "@modelcontextprotocol/sdk/server": ["./node_modules/@modelcontextprotocol/sdk/dist/esm/server/index.d.ts"],
+      "@modelcontextprotocol/sdk/server/stdio": ["./node_modules/@modelcontextprotocol/sdk/dist/esm/server/stdio.d.ts"],
+      "@modelcontextprotocol/sdk/types": ["./node_modules/@modelcontextprotocol/sdk/dist/esm/types.d.ts"]
+    }
+  },
+  "include": ["src"],
+  "exclude": ["node_modules", "dist"]
+}
--- a/tsup.config.ts
+++ b/tsup.config.ts
@ -0,0 +1,48 @@
+import { defineConfig } from 'tsup';
+import { resolve, dirname } from 'path';
+import { fileURLToPath } from 'url';
+
+const __dirname = dirname(fileURLToPath(import.meta.url));
+
+// Build configuration: bundles src/index.ts into a single ESM file in dist/
+// with sourcemaps and type declarations.
+export default defineConfig({
+  entry: ['src/index.ts'],
+  format: ['esm'],
+  target: 'es2022',
+  outDir: 'dist',
+  clean: true,
+  sourcemap: true,
+  dts: true,
+  bundle: true,
+  // The pattern matches every package name, so no dependency is treated as
+  // external.
+  noExternal: [/.*/],
+  // Shebang so the emitted dist/index.js can be run directly as the package
+  // `bin` entry.
+  banner: { js: '#!/usr/bin/env node' },
+  esbuildPlugins: [
+    {
+      name: 'fix-mcp-sdk-deps',
+      setup(build) {
+        // Single-segment SDK subpaths left to the default resolver.
+        const explicitExports = new Set(['server', 'client', 'validation', 'experimental']);
+        // Rewrite every other `@modelcontextprotocol/sdk/<subpath>` import to
+        // the matching file under the SDK's dist/esm directory.
+        build.onResolve({ filter: /^@modelcontextprotocol\/sdk\/.+/ }, (args) => {
+          const subpath = args.path.replace('@modelcontextprotocol/sdk/', '');
+          const topLevel = subpath.split('/')[0];
+          // Returning undefined hands resolution back to esbuild.
+          if (explicitExports.has(topLevel) && !subpath.includes('/')) return undefined;
+          return {
+            path: resolve(
+              __dirname,
+              'node_modules/@modelcontextprotocol/sdk/dist/esm',
+              subpath + '.js',
+            ),
+          };
+        });
+
+        // Route `ajv-formats*` and `ajv/dist/*` imports to a stub namespace
+        // instead of bundling the real modules.
+        build.onResolve({ filter: /^ajv-formats/ }, (args) => {
+          return { path: args.path, namespace: 'ajv-stub' };
+        });
+        build.onResolve({ filter: /^ajv\/dist\// }, (args) => {
+          return { path: args.path, namespace: 'ajv-stub' };
+        });
+        // Every stubbed module exports one no-op function.
+        // NOTE(review): any code that really calls into these modules at
+        // runtime gets the no-op; confirm that path is not needed.
+        build.onLoad({ filter: /.*/, namespace: 'ajv-stub' }, () => {
+          return { contents: 'module.exports = function() {};', loader: 'js' };
+        });
+      },
+    },
+  ],
+});