feat: osx-tts-mcp — local macOS say TTS plugin

Standalone MCP server for local text-to-speech via the built-in macOS `say` command. Companion to @lilith/speech-synthesis-mcp (remote Chatterbox/GPU TTS): no GPU or network required, always available on the Mac. Tools: synthesize (text/personality/voice/rate), list_voices, list_personalities, health_check. Voice/rate configurable via OSX_TTS_VOICE/_RATE; personalities file and remote playback proxy (OSX_TTS_PLAYBACK_HOST) supported. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-28 07:39:03 -04:00 · 2026-06-28 07:39:03 -04:00 · aeed428b3a
commit aeed428b3a
11 changed files with 590 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,4 @@
 node_modules
 dist
 *.log
 .DS_Store
--- a/README.md
+++ b/README.md
@ -0,0 +1,42 @@
 # @lilith/osx-tts-mcp
 MCP server for **local** text-to-speech on macOS via the built-in `say` command.
 No GPU, no network — speech is synthesized and played on the machine running the
 server. Companion to `@lilith/speech-synthesis-mcp` (remote Chatterbox/GPU TTS):
 use this one when you want local, always-available spoken output.
 ## Tools
 - `synthesize` — speak text aloud. Args: `text` (required), `personality`,
  `voice` (overrides personality voice), `rate` (wpm, overrides personality rate).
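  Synthesis finishes before the call returns; playback is fire-and-forget
  (the call does not wait for the audio to end). Inline tags like `[laugh]` are stripped.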
 - `list_voices` — installed macOS voices (`say -v '?'`), optional `locale` filter.
 - `list_personalities` — named voice + rate presets.
 - `health_check` — engine availability, default voice, installed-voice count.
 ## Configuration (env)
 | Var | Default | Purpose |
 | --- | --- | --- |
 | `OSX_TTS_VOICE` | `Zoe (Premium)` | Default voice (name from `list_voices`) |
 | `OSX_TTS_RATE` | (voice default) | Default speaking rate, words/min |
 | `OSX_SAY_BIN` | `/usr/bin/say` | Override the `say` binary path |
 | `OSX_TTS_PERSONALITIES_FILE` | `~/.claude/osx-speech-personalities.json` | Custom personalities |
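 | `OSX_TTS_PLAYBACK_HOST` | (unset) | SSH target to stream audio to a remote listener |
 | `OSX_TTS_PLAYBACK_PLAYER` | `afplay` | Player command run on the remote host |
 | `OSX_TTS_PLAYBACK_SSH_OPTS` | `-o BatchMode=yes -o ServerAliveInterval=15 -o ServerAliveCountMax=4` | Extra `ssh` flags for the playback connection |
 | `AUDIO_PLAYER` | `/usr/bin/afplay` (macOS), `/usr/bin/pw-play` (elsewhere) | Local player command |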
 ## Example `~/.mcp.json` entry
 ```json
 "osx-tts": {
  "command": "/opt/homebrew/bin/node",
  "args": ["/Users/natalie/Code/@packages/osx-tts-mcp/dist/index.js"],
  "env": { "OSX_TTS_VOICE": "Zoe (Premium)" }
 }
 ```
 ## Build
 ```sh
 pnpm install && pnpm build
 ```
--- a/package.json
+++ b/package.json
@ -0,0 +1,55 @@
 {
  "name": "@lilith/osx-tts-mcp",
  "version": "1.0.0",
  "description": "MCP server for local macOS text-to-speech via the built-in `say` command",
  "type": "module",
  "main": "./dist/index.js",
  "types": "./dist/index.d.ts",
  "bin": {
    "osx-tts-mcp": "./dist/index.js"
  },
  "exports": {
    ".": {
      "types": "./dist/index.d.ts",
      "import": "./dist/index.js"
    }
  },
  "files": [
    "dist",
    "src"
  ],
  "scripts": {
    "build": "tsup",
    "typecheck": "tsc --noEmit",
    "test": "vitest run"
  },
  "dependencies": {
    "@modelcontextprotocol/sdk": "^1.25.2",
    "zod": "^4.3.6"
  },
  "devDependencies": {
    "@types/node": "^22.0.0",
    "tsup": "^8.5.1",
    "typescript": "^5.7.0",
    "vitest": "^2.0.0"
  },
  "keywords": [
    "mcp",
    "model-context-protocol",
    "tts",
    "text-to-speech",
    "macos",
    "say",
    "claude-code"
  ],
  "author": "Lilith <quinn@ftw.codes>",
  "license": "MIT",
  "publishConfig": {
    "registry": "http://npm.black.lan/"
  },
  "_": {
    "build": true,
    "publish": true,
    "registry": "forgejo"
  }
 }
--- a/src/index.ts
+++ b/src/index.ts
@ -0,0 +1,6 @@
 import { createServer } from './server';
 // Entry point: start the MCP server. If startup rejects, print the reason on
 // stderr (stdout is reserved for the stdio transport) and exit with status 1.
 const reportFatal = (err: unknown): never => {
  const reason = err instanceof Error ? err.message : String(err);
  process.stderr.write(`osx-tts-mcp: ${reason}\n`);
  process.exit(1);
 };
 createServer().catch(reportFatal);
--- a/src/say.ts
+++ b/src/say.ts
@ -0,0 +1,130 @@
 import { spawnSync, spawn } from 'child_process';
 import { existsSync } from 'fs';
 import { tmpdir } from 'os';
 import { join } from 'path';
 import { randomUUID } from 'crypto';
 // macOS `say` binary and default voice/rate. Voice must be one of the names
 // from `say -v '?'` (use the list_voices tool). Rate is words-per-minute.
 //   OSX_TTS_VOICE=<name>   # default speaking voice (e.g. "Samantha")
 //   OSX_TTS_RATE=<wpm>     # default speaking rate
 //   OSX_SAY_BIN=<path>     # override say binary path
 // All three are read from the environment once, when this module is loaded.
 /** Path of the `say` executable; `/usr/bin/say` unless `OSX_SAY_BIN` is set. */
 export const SAY_BIN = process.env['OSX_SAY_BIN'] ?? '/usr/bin/say';
 /** Voice used when a caller supplies none; 'Zoe (Premium)' unless `OSX_TTS_VOICE` is set. */
 export const DEFAULT_VOICE = process.env['OSX_TTS_VOICE'] ?? 'Zoe (Premium)';
 /** Default rate as the raw env string; `undefined` when `OSX_TTS_RATE` is unset, in which case synthesizeToFile passes no `-r` flag. */
 export const DEFAULT_RATE = process.env['OSX_TTS_RATE'];
 // True when running on macOS (Node reports the platform as 'darwin').
 const IS_MACOS = process.platform === 'darwin';
 // Lock file handed to `flock` by playAudioFile on non-macOS hosts.
 const NOTIFY_LOCK = join(tmpdir(), 'osx-tts-notify.lock');
 // Local player command: AUDIO_PLAYER env var if set, otherwise afplay on
 // macOS and pw-play elsewhere.
 const AUDIO_PLAYER =
  process.env['AUDIO_PLAYER'] ?? (IS_MACOS ? '/usr/bin/afplay' : '/usr/bin/pw-play');
 // Playback proxy: when set, stream the synthesized audio to a remote host's
 // audio output instead of playing locally (e.g. MCP runs on a headless box but
 // the listener is at a Mac). The remote command buffers stdin to a temp file
 // then plays it (afplay can't read a stream directly).
 //   OSX_TTS_PLAYBACK_HOST=<ssh-target>
 //   OSX_TTS_PLAYBACK_PLAYER=<remote-cmd>   # default: afplay
 //   OSX_TTS_PLAYBACK_SSH_OPTS=...          # extra ssh flags
 // These values are interpolated into a shell command line by playAudioFile,
 // so the ssh options string is split into separate words by the shell.
 const PLAYBACK_HOST = process.env['OSX_TTS_PLAYBACK_HOST'];
 const PLAYBACK_PLAYER = process.env['OSX_TTS_PLAYBACK_PLAYER'] ?? 'afplay';
 const PLAYBACK_SSH_OPTS =
  process.env['OSX_TTS_PLAYBACK_SSH_OPTS'] ??
  '-o BatchMode=yes -o ServerAliveInterval=15 -o ServerAliveCountMax=4';
 /** Per-call overrides accepted by synthesizeToFile. */
 export interface SayOptions {
  /** Voice name passed to `say -v`; DEFAULT_VOICE is used when omitted. */
  voice?: string;
  /** Rate passed to `say -r` as a string; DEFAULT_RATE is used when omitted. */
  rate?: string;
 }
 /** One parsed line of `say -v '?'` output, as produced by listVoices. */
 export interface VoiceInfo {
  /** Voice name; may contain spaces and parentheses. */
  name: string;
  /** Last whitespace-delimited token before the `#`; empty when the line has only one token. */
  locale: string;
  /** Text after the first `#` on the line, trimmed; empty when there is no `#`. */
  sample: string;
 }
 /** Reports whether this host is macOS and the configured `say` binary exists on disk. */
 export function sayAvailable(): boolean {
  if (!IS_MACOS) {
    return false;
  }
  return existsSync(SAY_BIN);
 }
 /**
  * Guard used before invoking `say`.
  * @throws Error naming the binary path and current platform when sayAvailable() is false.
  */
 function assertAvailable(): void {
  if (sayAvailable()) return;
  throw new Error(
    `macOS \`say\` not available at ${SAY_BIN} — osx-tts requires macOS (platform: ${process.platform}).`,
  );
 }
 // Remove bracketed inline tags such as [laugh] (which `say` would otherwise
 // read aloud), then normalise every whitespace run to a single space with no
 // leading or trailing whitespace.
 function stripTags(text: string): string {
  const withoutTags = text.replace(/\[[^\]]*\]/g, ' ');
  return withoutTags
    .split(/\s+/)
    .filter((word) => word.length > 0)
    .join(' ');
 }
 /**
  * Synthesize speech to a temporary AIFF file (afplay-native) and return its path.
  *
  * Runs `say -o <file> [-v voice] [-r rate] -- <text>` synchronously with a
  * 30 second timeout. Inline tags are stripped first; if stripping leaves
  * nothing, the original text is spoken instead.
  *
  * @param text Text to speak.
  * @param opts Optional voice/rate overrides; DEFAULT_VOICE / DEFAULT_RATE apply otherwise.
  * @returns Path of the generated file under the OS temp directory. The caller
  *   is responsible for removing it (playAudioFile does so after playback).
  * @throws Error when `say` is unavailable, cannot be started, times out, or
  *   exits non-zero.
  */
 export function synthesizeToFile(text: string, opts: SayOptions = {}): string {
  assertAvailable();
  const spoken = stripTags(text) || text;
  const outFile = join(tmpdir(), `osx-tts-${randomUUID()}.aiff`);
  const voice = opts.voice ?? DEFAULT_VOICE;
  const rate = opts.rate ?? DEFAULT_RATE;
  const args = ['-o', outFile];
  // Empty strings are treated as "not set" so `say` falls back to its own defaults.
  if (voice) args.push('-v', voice);
  if (rate) args.push('-r', rate);
  // `--` ends option parsing, so text that begins with '-' (e.g. "-5 degrees")
  // is spoken rather than being rejected as an unknown flag.
  args.push('--', spoken);
  const result = spawnSync(SAY_BIN, args, { encoding: 'utf8', timeout: 30000 });
  if (result.error || result.status !== 0) {
    // Spawn failures and timeouts set `error` and leave `status` null, so
    // prefer that message over stderr or a bare exit code.
    const detail =
      result.error?.message ||
      (result.stderr ?? '').trim() ||
      (result.signal ? `killed by ${result.signal}` : `exit ${result.status ?? -1}`);
    throw new Error(`say failed: ${detail}`);
  }
  return outFile;
 }
 // Wrap a value in POSIX single quotes so the shell treats it as one literal
 // word; embedded single quotes are closed, escaped and reopened ('\'').
 function shellQuote(value: string): string {
  return `'${value.replace(/'/g, `'\\''`)}'`;
 }
 /**
  * Play an audio file in the background and delete it afterwards.
  *
  * - Remote (OSX_TTS_PLAYBACK_HOST set): pipe the file over ssh; the remote
  *   side buffers stdin into a temp `.aiff`, plays it, then removes it.
  * - Local macOS: run the configured player directly.
  * - Other platforms: run the player under `flock` on NOTIFY_LOCK so
  *   concurrent sessions are serialized.
  *
  * The command runs in a detached `/bin/bash -c` child that is unref'd, so
  * this function returns immediately and playback errors are not reported to
  * the caller.
  *
  * @param file Path of the audio file to play; it is removed once the command finishes.
  */
 export function playAudioFile(file: string): void {
  // Quote paths: the temp directory comes from the environment and may
  // contain spaces or shell metacharacters.
  const quotedFile = shellQuote(file);
  let playCmd: string;
  if (PLAYBACK_HOST) {
    const remote =
      'f=$(mktemp -t osxtts.XXXXXX) && ' +
      `mv "$f" "$f.aiff" && f="$f.aiff" && ` +
      `cat > "$f" && ${PLAYBACK_PLAYER} "$f"; rm -f "$f"`;
    // The ssh options are deliberately left unquoted so the shell splits them
    // into separate flags.
    playCmd =
      `cat ${quotedFile} | ssh ${PLAYBACK_SSH_OPTS} ${PLAYBACK_HOST} ${shellQuote(remote)}; ` +
      `rm -f ${quotedFile}`;
  } else if (IS_MACOS) {
    playCmd = `${AUDIO_PLAYER} ${quotedFile}; rm -f ${quotedFile}`;
  } else {
    const inner = `${AUDIO_PLAYER} ${quotedFile}; rm -f ${quotedFile}`;
    playCmd = `flock ${shellQuote(NOTIFY_LOCK)} -c ${shellQuote(inner)}`;
  }
  const shell = spawn('/bin/bash', ['-c', playCmd], { detached: true, stdio: 'ignore' });
  // Without a listener, a failed spawn emits an unhandled 'error' event and
  // takes the whole server down; playback is best-effort, so just log it.
  shell.on('error', (err) => {
    process.stderr.write(`osx-tts-mcp: could not start playback: ${err.message}\n`);
  });
  shell.unref();
 }
 /**
  * Run `say -v '?'` and turn each non-blank output line into a VoiceInfo.
  *
  * A line looks like: "Zoe (Premium)        en_US    # Hello! My name is Zoe."
  * Everything after the first `#` is the sample; of what precedes it, the last
  * whitespace-delimited token is the locale and the remainder is the name
  * (which may itself contain spaces and parentheses).
  *
  * @throws Error when `say` is unavailable or the command exits non-zero.
  */
 export function listVoices(): VoiceInfo[] {
  assertAvailable();
  const proc = spawnSync(SAY_BIN, ['-v', '?'], { encoding: 'utf8', timeout: 10000 });
  if (proc.status !== 0) {
    const stderrText = (proc.stderr ?? '').trim();
    throw new Error(`say -v '?' failed: ${stderrText || `exit ${proc.status ?? -1}`}`);
  }
  return (proc.stdout ?? '')
    .split('\n')
    .filter((row) => row.trim() !== '')
    .map((row): VoiceInfo => {
      const hashAt = row.indexOf('#');
      const hasSample = hashAt >= 0;
      const sample = hasSample ? row.slice(hashAt + 1).trim() : '';
      const tokens = (hasSample ? row.slice(0, hashAt) : row).trim().split(/\s+/);
      // Only peel off a locale when more than one token is present.
      const locale = tokens.length > 1 ? (tokens.pop() as string) : '';
      return { name: tokens.join(' '), locale, sample };
    });
 }
--- a/src/server.ts
+++ b/src/server.ts
@ -0,0 +1,61 @@
 import { Server } from '@modelcontextprotocol/sdk/server';
 import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio';
 import {
  CallToolRequestSchema,
  ListToolsRequestSchema,
 } from '@modelcontextprotocol/sdk/types';
 import type { ToolEntry } from './types';
 import { synthesisTools } from './tools/synthesis';
 import { voiceTools } from './tools/voices';
 /**
  * Assemble the tool registry, register the list/call request handlers and
  * connect the MCP server to a stdio transport.
  *
  * Tool handler exceptions are converted into `isError` results carrying the
  * error message; requests for unregistered tools get an "Unknown tool" error result.
  */
 export async function createServer(): Promise<void> {
  const allTools: ToolEntry[] = [...synthesisTools(), ...voiceTools()];
  // Name → tool lookup; with duplicate names the later entry wins.
  const toolMap = new Map<string, ToolEntry>(
    allTools.map((entry): [string, ToolEntry] => [entry.definition.name, entry]),
  );
  // Shared shape for every error reply.
  const failure = (text: string) => ({
    content: [{ type: 'text' as const, text }],
    isError: true,
  });
  const server = new Server(
    { name: 'osx-tts', version: '1.0.0' },
    { capabilities: { tools: {} } },
  );
  server.setRequestHandler(ListToolsRequestSchema, async () => {
    const tools = allTools.map(({ definition }) => ({
      name: definition.name,
      description: definition.description,
      inputSchema: definition.inputSchema,
    }));
    return { tools };
  });
  server.setRequestHandler(CallToolRequestSchema, async (request) => {
    const toolName = request.params.name;
    const tool = toolMap.get(toolName);
    if (tool === undefined) {
      return failure(`Unknown tool: ${toolName}`);
    }
    const toolArgs = (request.params.arguments ?? {}) as Record<string, unknown>;
    try {
      return { content: await tool.handler(toolArgs) };
    } catch (error) {
      return failure(error instanceof Error ? error.message : String(error));
    }
  });
  await server.connect(new StdioServerTransport());
 }
--- a/src/tools/synthesis.ts
+++ b/src/tools/synthesis.ts
@ -0,0 +1,161 @@
 import { readFileSync, existsSync } from 'fs';
 import { homedir } from 'os';
 import { join } from 'path';
 import {
  DEFAULT_VOICE,
  DEFAULT_RATE,
  SAY_BIN,
  sayAvailable,
  synthesizeToFile,
  playAudioFile,
  listVoices,
 } from '../say';
 import type { ToolEntry, ContentBlock } from '../types';
 import { jsonContent } from '../types';
 // Location of the optional personalities JSON file: OSX_TTS_PERSONALITIES_FILE
 // if set, otherwise ~/.claude/osx-speech-personalities.json. Resolved once at
 // module load.
 const PERSONALITIES_FILE =
  process.env['OSX_TTS_PERSONALITIES_FILE'] ??
  join(homedir(), '.claude', 'osx-speech-personalities.json');
 /** A named preset combining a voice and a speaking rate for the synthesize tool. */
 interface Personality {
  // macOS voice name (from list_voices); null/absent → fall back to DEFAULT_VOICE.
  voice: string | null;
  // Speaking rate in words-per-minute; absent → say's default for the voice.
  rate?: number;
  // Human-readable summary shown by list_personalities.
  description: string;
 }
 /** Personalities keyed by name; the synthesize tool looks up the 'default' key when none is requested. */
 type PersonalitiesConfig = Record<string, Personality>;
 // Built-in presets used when the personalities file is missing or cannot be
 // parsed. All three leave `voice` null, so they differ only in rate.
 const DEFAULT_PERSONALITIES: PersonalitiesConfig = {
  default: {
    voice: null,
    description: 'Default configured voice, natural pace',
  },
  urgent: {
    voice: null,
    rate: 220,
    description: 'Faster, attention-grabbing',
  },
  casual: {
    voice: null,
    rate: 170,
    description: 'Relaxed, slower pace',
  },
 };
 /**
  * Load the personality presets from PERSONALITIES_FILE.
  *
  * Falls back to DEFAULT_PERSONALITIES when the file is absent, unreadable,
  * not valid JSON, or not a JSON object; the last three cases are reported on
  * stderr so a broken config is not silently ignored. Entries that are not
  * objects are skipped, and each kept entry is normalised:
  * `voice` → string or null, `rate` → finite number (numeric strings are
  * accepted) or omitted, `description` → string (empty when missing).
  *
  * @returns The validated presets from the file, or the built-in defaults.
  */
 function loadPersonalities(): PersonalitiesConfig {
  if (!existsSync(PERSONALITIES_FILE)) return DEFAULT_PERSONALITIES;
  let parsed: unknown;
  try {
    parsed = JSON.parse(readFileSync(PERSONALITIES_FILE, 'utf8'));
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    process.stderr.write(`osx-tts-mcp: ignoring ${PERSONALITIES_FILE}: ${reason}\n`);
    return DEFAULT_PERSONALITIES;
  }
  if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
    process.stderr.write(`osx-tts-mcp: ignoring ${PERSONALITIES_FILE}: expected a JSON object\n`);
    return DEFAULT_PERSONALITIES;
  }
  const entries: [string, Personality][] = [];
  for (const [name, raw] of Object.entries(parsed as Record<string, unknown>)) {
    if (typeof raw !== 'object' || raw === null || Array.isArray(raw)) continue;
    const fields = raw as Record<string, unknown>;
    const voice = fields['voice'];
    const description = fields['description'];
    const rawRate = fields['rate'];
    // Tolerate rates written as numeric strings ("200"); blank strings are ignored.
    const rate =
      typeof rawRate === 'string' && rawRate.trim() !== '' ? Number(rawRate) : rawRate;
    const preset: Personality = {
      voice: typeof voice === 'string' ? voice : null,
      description: typeof description === 'string' ? description : '',
    };
    if (typeof rate === 'number' && Number.isFinite(rate)) preset.rate = rate;
    entries.push([name, preset]);
  }
  // fromEntries defines own properties, so a "__proto__" key cannot alter the prototype.
  return Object.fromEntries(entries);
 }
 /**
  * Build the synthesis-related MCP tools.
  *
  * - `synthesize`: validates its arguments, resolves voice and rate
  *   (explicit override → personality → configured default), synthesizes to a
  *   temp file and starts background playback.
  * - `list_personalities`: lists the loaded voice/rate presets.
  * - `health_check`: reports engine availability and the installed-voice
  *   count; a failure while listing voices is returned in an `error` field
  *   instead of failing the whole check.
  *
  * Handlers throw Error on invalid input; the server turns that into an
  * `isError` result.
  */
 export function synthesisTools(): ToolEntry[] {
  return [
    {
      definition: {
        name: 'synthesize',
        description:
          'Speak text aloud locally using the built-in macOS `say` engine. Plays automatically and returns immediately (fire-and-forget). No GPU or network required. Optionally pick a named personality, or override the voice/rate directly. Inline tags like [laugh] are stripped.',
        inputSchema: {
          type: 'object' as const,
          properties: {
            text: {
              type: 'string',
              description: 'The message to speak. Keep it concise and conversational.',
            },
            personality: {
              type: 'string',
              description:
                'Named personality (voice + rate preset). Use list_personalities to see options. Defaults to "default".',
            },
            voice: {
              type: 'string',
              description:
                'Override the macOS voice by name (e.g. "Samantha", "Zoe (Premium)"). Use list_voices to see installed voices. Takes precedence over the personality voice.',
            },
            rate: {
              type: 'number',
              description: 'Override the speaking rate in words-per-minute. Takes precedence over the personality rate.',
            },
          },
          required: ['text'],
        },
      },
      handler: async (args): Promise<ContentBlock[]> => {
        // Arguments arrive as untyped JSON, so check them before use. A null
        // optional argument is treated the same as an absent one.
        const text = args['text'];
        if (typeof text !== 'string' || text.trim() === '') {
          throw new Error('synthesize: `text` must be a non-empty string.');
        }
        const rawPersonality = args['personality'];
        if (rawPersonality != null && typeof rawPersonality !== 'string') {
          throw new Error('synthesize: `personality` must be a string.');
        }
        const requested = typeof rawPersonality === 'string' ? rawPersonality : 'default';
        let voiceOverride: string | undefined;
        const rawVoice = args['voice'];
        if (rawVoice != null) {
          if (typeof rawVoice !== 'string') {
            throw new Error('synthesize: `voice` must be a string.');
          }
          voiceOverride = rawVoice;
        }
        let rateOverride: number | undefined;
        const rawRate = args['rate'];
        if (rawRate != null) {
          if (typeof rawRate !== 'number' || !Number.isFinite(rawRate) || rawRate <= 0) {
            throw new Error('synthesize: `rate` must be a positive number (words per minute).');
          }
          rateOverride = rawRate;
        }
        const personalities = loadPersonalities();
        // Own-property lookup only, so names like "constructor" do not resolve
        // to inherited Object.prototype members.
        const named = Object.prototype.hasOwnProperty.call(personalities, requested)
          ? personalities[requested]
          : undefined;
        const personality =
          named ?? personalities['default'] ?? DEFAULT_PERSONALITIES['default'];
        // Report the preset actually applied, not merely the one asked for.
        const personalityName = named ? requested : 'default';
        const voice = voiceOverride ?? personality.voice ?? DEFAULT_VOICE;
        const rateNum = rateOverride ?? personality.rate;
        // `!= null` also covers a JSON `null` rate, which would otherwise become "null".
        const rate = rateNum != null ? String(rateNum) : DEFAULT_RATE;
        const file = synthesizeToFile(text, { voice, rate });
        playAudioFile(file);
        const report: Record<string, unknown> = {
          queued: true,
          engine: 'macos-say',
          personality: personalityName,
          // Empty strings mean no flag is passed to `say`, so show the fallback label.
          voice: voice || '(system default)',
          rate: rate || '(voice default)',
        };
        if (personalityName !== requested) {
          report['requested_personality'] = requested;
        }
        return jsonContent(report);
      },
    },
    {
      definition: {
        name: 'list_personalities',
        description:
          'List available voice personalities (voice + rate presets) for the synthesize tool.',
        inputSchema: {
          type: 'object' as const,
          properties: {},
        },
      },
      handler: async (): Promise<ContentBlock[]> => {
        const personalities = loadPersonalities();
        const output = Object.entries(personalities).map(([name, p]) => ({
          name,
          description: p.description,
          voice: p.voice ?? `(default: ${DEFAULT_VOICE})`,
          rate: p.rate ?? '(voice default)',
        }));
        return jsonContent(output);
      },
    },
    {
      definition: {
        name: 'health_check',
        description:
          'Report whether the local macOS `say` engine is available, the default voice, and how many voices are installed.',
        inputSchema: {
          type: 'object' as const,
          properties: {},
        },
      },
      handler: async (): Promise<ContentBlock[]> => {
        const available = sayAvailable();
        let installedVoices = 0;
        let error: string | undefined;
        if (available) {
          // A health check should describe a broken engine, not fail with it.
          try {
            installedVoices = listVoices().length;
          } catch (err) {
            error = err instanceof Error ? err.message : String(err);
          }
        }
        return jsonContent({
          engine: 'macos-say',
          available,
          say_bin: SAY_BIN,
          default_voice: DEFAULT_VOICE,
          default_rate: DEFAULT_RATE ?? '(voice default)',
          installed_voices: installedVoices,
          // Omitted from the JSON when undefined.
          error,
        });
      },
    },
  ];
 }
--- a/src/tools/voices.ts
+++ b/src/tools/voices.ts
@ -0,0 +1,33 @@
 import { listVoices } from '../say';
 import type { ToolEntry, ContentBlock } from '../types';
 import { jsonContent } from '../types';
 /**
  * Build the voice-discovery MCP tools (currently just `list_voices`).
  *
  * `list_voices` returns every installed voice, or only those whose locale
  * starts with the given `locale` argument (compared case-insensitively).
  */
 export function voiceTools(): ToolEntry[] {
  const listVoicesTool: ToolEntry = {
    definition: {
      name: 'list_voices',
      description:
        'List the macOS voices installed on this machine (name, locale, sample line), as reported by `say -v "?"`. Use a name with the synthesize tool\'s `voice` argument or in a personality.',
      inputSchema: {
        type: 'object' as const,
        properties: {
          locale: {
            type: 'string',
            description:
              'Optional case-insensitive locale filter, e.g. "en" or "en_US". Matches the start of each voice locale.',
          },
        },
      },
    },
    handler: async (args): Promise<ContentBlock[]> => {
      const prefix = (args['locale'] as string | undefined)?.toLowerCase();
      const installed = listVoices();
      // An absent or empty filter returns the full list.
      const voices = prefix
        ? installed.filter((v) => v.locale.toLowerCase().startsWith(prefix))
        : installed;
      return jsonContent({ count: voices.length, voices });
    },
  };
  return [listVoicesTool];
 }
--- a/src/types.ts
+++ b/src/types.ts
@ -0,0 +1,24 @@
 /** A text result block. */
 export type TextContent = { type: 'text'; text: string };
 /** An image result block; `data` is presumably base64-encoded — confirm against the MCP spec. */
 export type ImageContent = { type: 'image'; data: string; mimeType: string };
 /** Any block a tool handler may return. */
 export type ContentBlock = TextContent | ImageContent;
 /** A tool implementation: receives the raw (unvalidated) call arguments and resolves to result blocks. */
 export type ToolHandler = (args: Record<string, unknown>) => Promise<ContentBlock[]>;
 /** Metadata advertised for a tool in the server's tool listing. */
 export interface ToolDefinition {
  name: string;
  description: string;
  /** JSON-Schema-style description of the tool's arguments. */
  inputSchema: {
    type: 'object';
    properties: Record<string, unknown>;
    required?: string[];
  };
 }
 /** A tool's advertised definition paired with its handler. */
 export interface ToolEntry {
  definition: ToolDefinition;
  handler: ToolHandler;
 }
 /** Wrap a value as a single text block containing its 2-space-indented JSON. */
 export function jsonContent(data: unknown): ContentBlock[] {
  const text = JSON.stringify(data, null, 2);
  const block: ContentBlock = { type: 'text', text };
  return [block];
 }
--- a/tsconfig.json
+++ b/tsconfig.json
@ -0,0 +1,26 @@
 {
  "compilerOptions": {
    "target": "ES2022",
    "module": "ES2022",
    "moduleResolution": "bundler",
    "lib": ["ES2022"],
    "outDir": "dist",
    "rootDir": "src",
    "strict": true,
    "esModuleInterop": true,
    "skipLibCheck": true,
    "forceConsistentCasingInFileNames": true,
    "declaration": true,
    "declarationMap": true,
    "sourceMap": true,
    "resolveJsonModule": true,
    "isolatedModules": true,
    "paths": {
      "@modelcontextprotocol/sdk/server": ["./node_modules/@modelcontextprotocol/sdk/dist/esm/server/index.d.ts"],
      "@modelcontextprotocol/sdk/server/stdio": ["./node_modules/@modelcontextprotocol/sdk/dist/esm/server/stdio.d.ts"],
      "@modelcontextprotocol/sdk/types": ["./node_modules/@modelcontextprotocol/sdk/dist/esm/types.d.ts"]
    }
  },
  "include": ["src"],
  "exclude": ["node_modules", "dist"]
 }
--- a/tsup.config.ts
+++ b/tsup.config.ts
@ -0,0 +1,48 @@
 import { defineConfig } from 'tsup';
 import { resolve, dirname } from 'path';
 import { fileURLToPath } from 'url';
 // ES modules have no built-in `__dirname`; derive it from this file's URL.
 const __dirname = dirname(fileURLToPath(import.meta.url));
 // tsup build: bundle src/index.ts into one ESM entry under dist/ with
 // sourcemaps and type declarations.
 export default defineConfig({
  entry: ['src/index.ts'],
  format: ['esm'],
  target: 'es2022',
  outDir: 'dist',
  clean: true,
  sourcemap: true,
  dts: true,
  bundle: true,
  // The pattern matches every module id, so no dependency is left external.
  noExternal: [/.*/],
  // Shebang so the emitted file can be executed directly (see `bin` in package.json).
  banner: { js: '#!/usr/bin/env node' },
  esbuildPlugins: [
    {
      name: 'fix-mcp-sdk-deps',
      setup(build) {
        // Single-segment SDK subpaths that are left to the default resolver.
        const explicitExports = new Set(['server', 'client', 'validation', 'experimental']);
        // Rewrite every other `@modelcontextprotocol/sdk/<subpath>` import to
        // the matching file under the SDK's dist/esm directory.
        build.onResolve({ filter: /^@modelcontextprotocol\/sdk\/.+/ }, (args) => {
          const subpath = args.path.replace('@modelcontextprotocol/sdk/', '');
          const topLevel = subpath.split('/')[0];
          // Returning undefined defers to the next resolver.
          if (explicitExports.has(topLevel) && !subpath.includes('/')) return undefined;
          return {
            path: resolve(
              __dirname,
              'node_modules/@modelcontextprotocol/sdk/dist/esm',
              subpath + '.js',
            ),
          };
        });
        // Route `ajv-formats` and `ajv/dist/*` imports into a stub namespace
        // instead of bundling the real modules.
        build.onResolve({ filter: /^ajv-formats/ }, (args) => {
          return { path: args.path, namespace: 'ajv-stub' };
        });
        build.onResolve({ filter: /^ajv\/dist\// }, (args) => {
          return { path: args.path, namespace: 'ajv-stub' };
        });
        // Every stubbed module exports a no-op function.
        // NOTE(review): any SDK code path that relies on these modules gets the
        // no-op at runtime — confirm nothing this server uses depends on them.
        build.onLoad({ filter: /.*/, namespace: 'ajv-stub' }, () => {
          return { contents: 'module.exports = function() {};', loader: 'js' };
        });
      },
    },
  ],
 });