diff --git a/bin/rvoice b/bin/rvoice
new file mode 100755
index 0000000..1065ebf
--- /dev/null
+++ b/bin/rvoice
@@ -0,0 +1,224 @@
+#!/bin/sh
+# rvoice — push-to-talk dictation for remote rclaude sessions.
+#
+# Designed for the case where claude runs on another host (apricot) and the
+# mic + keyboard are on the local Mac. /voice doesn't work over ssh because
+# the claude binary tries to open the *remote* host's microphone. This
+# helper records locally, transcribes via Groq Whisper (no local RAM hit),
+# and injects the transcript into the active remote tmux session via
+# `tmux send-keys` over ssh.
+#
+# Usage:
+#   rvoice start     Begin recording (called by Hammerspoon on key-down)
+#   rvoice stop      Stop, transcribe, inject (called by Hammerspoon on key-up)
+#   rvoice cancel    Stop without transcribing (key-up after very short hold)
+#   rvoice target    Resolve and echo `HOST\tSESSION` for the
+#                    active iTerm2 tab (debug)
+#   rvoice log       Show the last 50 lines of the log (debug)
+#
+# Config: $XDG_CONFIG_HOME/rvoice/config (sourced; same convention as rclaude).
+# Required env: GROQ_API_KEY. Optional:
+#   RVOICE_MODEL=whisper-large-v3-turbo   (default; very fast)
+#   RVOICE_HOST=apricot.lan               (overrides iTerm2 detection)
+#   RVOICE_SESSION=claude-natalie-...     (overrides iTerm2 detection)
+#   RVOICE_AUTOSEND=1                     (append Enter; default 0)
+#   RVOICE_MIN_MS=200                     (ignore taps shorter than this)
+#   RVOICE_MAX_S=60                       (hard cap on recording length)
+#
+# State lives in $TMPDIR/rvoice/ — one recording at a time.
+
+set -eu
+
+CONF_DIR=${XDG_CONFIG_HOME:-$HOME/.config}/rvoice
+[ -r "$CONF_DIR/config" ] && . "$CONF_DIR/config"
+
+MODEL=${RVOICE_MODEL:-whisper-large-v3-turbo}
+AUTOSEND=${RVOICE_AUTOSEND:-0}
+MIN_MS=${RVOICE_MIN_MS:-200}
+MAX_S=${RVOICE_MAX_S:-60}
+
+STATE_DIR=${TMPDIR:-/tmp}/rvoice
+mkdir -p "$STATE_DIR"
+PID_FILE=$STATE_DIR/ffmpeg.pid
+WAV_FILE=$STATE_DIR/recording.wav
+START_FILE=$STATE_DIR/start-ms
+LOG_FILE=$STATE_DIR/log
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+log() { printf '[rvoice %s] %s\n' "$(date +%H:%M:%S)" "$*" >> "$LOG_FILE"; }
+
+now_ms() { python3 -c 'import time; print(int(time.time() * 1000))'; }
+
+# Print $1 as a single-quoted shell word (embedded quotes escaped) so it
+# survives one round of parsing by the remote shell that ssh invokes.
+shquote() { printf "'%s'" "$(printf '%s' "$1" | sed "s/'/'\\\\''/g")"; }
+
+# Resolve the (host, tmux-session) for the active iTerm2 tab. Reads the
+# title set by our canonical tmux config: "HOST · SESSION". Falls back
+# to env overrides, then to "apricot.lan" + most-recent remote claude session.
+resolve_target() {
+  if [ -n "${RVOICE_HOST:-}" ] && [ -n "${RVOICE_SESSION:-}" ]; then
+    printf '%s\t%s\n' "$RVOICE_HOST" "$RVOICE_SESSION"
+    return
+  fi
+  _title=$(osascript -e 'tell application "iTerm2" to tell current session of current window to return name' 2>/dev/null || true)
+  # Title format from session-tools/tmux.conf: "HOST · SESSION"
+  _host=$(printf '%s' "$_title" | awk -F' · ' 'NF>=2 {print $1}')
+  _sess=$(printf '%s' "$_title" | awk -F' · ' 'NF>=2 {print $2}')
+  if [ -n "$_host" ] && [ -n "$_sess" ]; then
+    printf '%s\t%s\n' "$_host" "$_sess"
+    return
+  fi
+  # Fallback: pick the most recently created claude-* session on apricot.
+  _host=${RVOICE_HOST:-apricot.lan}
+  _sess=${RVOICE_SESSION:-}
+  if [ -z "$_sess" ]; then
+    _sess=$(ssh -o BatchMode=yes -o ConnectTimeout=3 "$_host" \
+      'tmux ls -F "#{session_created} #{session_name}" 2>/dev/null \
+       | sort -n | awk "/claude-/{n=\$2} END{print n}"' 2>/dev/null || true)
+  fi
+  [ -z "$_sess" ] && { log "no target session resolvable"; return 1; }
+  printf '%s\t%s\n' "$_host" "$_sess"
+}
+
+notify() {
+  # Best-effort macOS notification + audible cue.
+  # $1 = message text; $2 = optional cue (go | ok | err).
+  # Escape backslashes and double quotes so transcript text cannot break
+  # out of the AppleScript string literal.
+  _msg=$(printf '%s' "$1" | sed 's/\\/\\\\/g; s/"/\\"/g')
+  osascript -e "display notification \"$_msg\" with title \"rvoice\"" 2>/dev/null || true
+  # System alert sounds ship as .aiff files.
+  [ "${2:-}" = "ok" ] && afplay /System/Library/Sounds/Pop.aiff 2>/dev/null &
+  [ "${2:-}" = "err" ] && afplay /System/Library/Sounds/Funk.aiff 2>/dev/null &
+  [ "${2:-}" = "go" ] && afplay /System/Library/Sounds/Tink.aiff 2>/dev/null &
+  :
+}
+
+# ---------------------------------------------------------------------------
+# Commands
+# ---------------------------------------------------------------------------
+
+cmd_start() {
+  # If an old ffmpeg is still alive (key release missed), kill it first.
+  if [ -f "$PID_FILE" ] && kill -0 "$(cat "$PID_FILE")" 2>/dev/null; then
+    kill "$(cat "$PID_FILE")" 2>/dev/null || true
+    rm -f "$PID_FILE"
+  fi
+  rm -f "$WAV_FILE"
+  now_ms > "$START_FILE"
+  # 16kHz mono PCM, capped at MAX_S. Device "0" is the default macOS input;
+  # change with AVFoundation list if you have multiple mics.
+  nohup ffmpeg -hide_banner -loglevel error -nostdin \
+    -f avfoundation -i ":0" \
+    -ac 1 -ar 16000 -t "$MAX_S" \
+    -y "$WAV_FILE" >/dev/null 2>>"$LOG_FILE" &
+  echo $! > "$PID_FILE"
+  notify "listening…" go
+  log "start pid=$(cat "$PID_FILE")"
+}
+
+cmd_stop() {
+  [ -f "$PID_FILE" ] || { log "stop: no recording in progress"; return 0; }
+  _pid=$(cat "$PID_FILE")
+  _start=$(cat "$START_FILE" 2>/dev/null || echo 0)
+  # ${_start:-0}: an empty start file must not break the arithmetic.
+  _dur_ms=$(( $(now_ms) - ${_start:-0} ))
+  # `q` on stdin is ffmpeg's clean-stop signal but with -nostdin we use
+  # SIGINT — ffmpeg flushes the wav header on SIGINT.
+  kill -INT "$_pid" 2>/dev/null || true
+  # Wait briefly (up to ~3s) for ffmpeg to finalize the file.
+  _i=0; while kill -0 "$_pid" 2>/dev/null && [ "$_i" -lt 30 ]; do sleep 0.1; _i=$((_i+1)); done
+  rm -f "$PID_FILE" "$START_FILE"
+  if [ "$_dur_ms" -lt "$MIN_MS" ]; then
+    log "stop: too short (${_dur_ms}ms < ${MIN_MS}ms), discarding"
+    rm -f "$WAV_FILE"
+    return 0
+  fi
+  [ -s "$WAV_FILE" ] || { log "stop: empty recording"; notify "empty recording" err; return 1; }
+
+  if [ -z "${GROQ_API_KEY:-}" ]; then
+    notify "GROQ_API_KEY not set" err
+    log "GROQ_API_KEY missing"
+    return 1
+  fi
+  log "transcribing ${_dur_ms}ms via $MODEL"
+  # `|| _txt=` keeps `set -e` from aborting before cleanup/notification when
+  # jq rejects a non-JSON response.
+  _txt=$(curl -sS --fail-with-body \
+    -H "Authorization: Bearer $GROQ_API_KEY" \
+    -F "file=@$WAV_FILE" \
+    -F "model=$MODEL" \
+    -F "response_format=json" \
+    https://api.groq.com/openai/v1/audio/transcriptions \
+    | jq -r '.text // empty') || _txt=
+  rm -f "$WAV_FILE"
+  if [ -z "$_txt" ]; then
+    notify "transcription empty" err
+    log "empty transcription"
+    return 1
+  fi
+  log "text: $_txt"
+
+  _target=$(resolve_target) || { notify "no target session" err; return 1; }
+  _host=$(printf '%s' "$_target" | cut -f1)
+  _sess=$(printf '%s' "$_target" | cut -f2)
+  log "inject → $_host/$_sess"
+
+  # Use `tmux send-keys -l` to send the text literally (no escape interp);
+  # `--` stops option parsing so a transcript starting with "-" is still
+  # treated as text. Then optional Enter if autosend.
+  if is_local_host "$_host"; then
+    tmux send-keys -t "$_sess" -l -- "$_txt" \
+      || { notify "inject failed" err; return 1; }
+    [ "$AUTOSEND" = "1" ] && tmux send-keys -t "$_sess" Enter
+  else
+    # Both values are re-parsed by the remote shell, so quote each of them.
+    _q_sess=$(shquote "$_sess")
+    _q_txt=$(shquote "$_txt")
+    ssh -o BatchMode=yes -o ConnectTimeout=5 "$_host" \
+      "tmux send-keys -t $_q_sess -l -- $_q_txt" 2>>"$LOG_FILE" \
+      || { notify "inject failed" err; return 1; }
+    [ "$AUTOSEND" = "1" ] && \
+      ssh -o BatchMode=yes "$_host" "tmux send-keys -t $_q_sess Enter" 2>>"$LOG_FILE"
+  fi
+  notify "✓ $_txt" ok
+}
+
+cmd_cancel() {
+  [ -f "$PID_FILE" ] || return 0
+  kill -INT "$(cat "$PID_FILE")" 2>/dev/null || true
+  rm -f "$PID_FILE" "$START_FILE" "$WAV_FILE"
+  log "cancel"
+}
+
+cmd_target() { resolve_target; }
+
+is_local_host() {
+  case $1 in
+    local|localhost|127.0.0.1|::1) return 0 ;;
+  esac
+  [ "$1" = "$(hostname)" ] && return 0
+  [ "$1" = "$(hostname -s 2>/dev/null)" ] && return 0
+  return 1
+}
+
+case ${1:-} in
+  start)  cmd_start ;;
+  stop)   cmd_stop ;;
+  cancel) cmd_cancel ;;
+  target) cmd_target ;;
+  log)    tail -50 "$LOG_FILE" 2>/dev/null ;;
+  *)
+    cat <<EOF >&2
+usage: rvoice {start|stop|cancel|target|log}
+
+This script is meant to be driven by a PTT key binding (Hammerspoon).
+See ~/Code/@scripts/session-tools/hammerspoon/rvoice.lua for the
+companion config.
+EOF
+    exit 2 ;;
+esac
diff --git a/hammerspoon/rvoice.lua b/hammerspoon/rvoice.lua
new file mode 100644
index 0000000..c93d5b0
--- /dev/null
+++ b/hammerspoon/rvoice.lua
@@ -0,0 +1,95 @@
+-- rvoice.lua — Right-Option push-to-talk for the rvoice helper.
+--
+-- Install:
+--   1. Hammerspoon → Preferences → enable "Launch Hammerspoon at login"
+--   2. Add this line to ~/.hammerspoon/init.lua:
+--        require("rvoice")
+--   3. Symlink this file so init.lua can find it:
+--        ln -sfn ~/Code/@scripts/session-tools/hammerspoon/rvoice.lua \
+--                ~/.hammerspoon/rvoice.lua
+--   4. Reload Hammerspoon config (menu bar → Reload Config)
+--   5. Grant Accessibility + Microphone permissions when prompted.
+--
+-- Behavior: hold Right-Option to talk. Release to transcribe + inject into
+-- the active iTerm2 tab's remote tmux session. Taps shorter than 200ms are
+-- ignored (configurable via RVOICE_MIN_MS env in rvoice config).
+
+local M = {}
+
+-- Resolve `rvoice` once at load. Hammerspoon's task PATH is barebones, so
+-- prefer an explicit symlink in ~/.local/bin or fall back to the repo path.
+local function resolveRvoice()
+  local home = os.getenv("HOME") or ""
+  local candidates = {
+    home .. "/.local/bin/rvoice",
+    home .. "/Code/@scripts/session-tools/bin/rvoice",
+  }
+  for _, p in ipairs(candidates) do
+    local f = io.open(p, "r")
+    if f then f:close(); return p end
+  end
+  return "rvoice"
+end
+
+-- Snapshot the user's interactive login-shell environment as a table so
+-- GROQ_API_KEY (and PATH for ffmpeg/jq) reach the task. Done once at load:
+-- spawning that shell on every key event would delay the start of recording.
+-- Returns nil when the environment cannot be read; the task then keeps
+-- Hammerspoon's own environment. Multi-line values are not reconstructed.
+local function loadUserEnv()
+  local out, ok = hs.execute("env", true)
+  if not ok or type(out) ~= "string" then return nil end
+  local env = {}
+  for line in out:gmatch("[^\n]+") do
+    local k, v = line:match("^([%a_][%w_]*)=(.*)$")
+    if k then env[k] = v end
+  end
+  if next(env) == nil then return nil end
+  return env
+end
+
+local RVOICE = resolveRvoice()
+local USER_ENV = loadUserEnv()
+local holding = false
+
+-- Physical Right-Option key: virtual key code and device-specific flag bit
+-- (NX_DEVICERALTKEYMASK). The generic `alt` flag cannot be used because it
+-- stays set while Left-Option is held.
+local RIGHT_OPTION_KEYCODE = 61
+local RIGHT_OPTION_RAW_MASK = 0x40
+
+-- Run rvoice in the background; report stderr in Hammerspoon's console when
+-- the command exits non-zero.
+local function run(cmd)
+  -- The path and subcommand are passed as positional arguments, so nothing is
+  -- re-parsed by the shell; /bin/sh is kept only so the bare "rvoice"
+  -- fallback is still looked up on PATH.
+  local t = hs.task.new("/bin/sh", function(exit, _, err)
+    if exit ~= 0 then
+      hs.printf("[rvoice] %s exited %d: %s", cmd, exit, err or "")
+    end
+  end, {"-c", 'exec "$0" "$@"', RVOICE, cmd})
+  if USER_ENV then t:setEnvironment(USER_ENV) end
+  t:start()
+end
+
+-- Right-Option keyDown/keyUp. Hammerspoon delivers modifier changes through
+-- eventtap.flagsChanged; we watch the Right-Option bit of the raw flags.
+M.tap = hs.eventtap.new({ hs.eventtap.event.types.flagsChanged }, function(e)
+  if e:getKeyCode() ~= RIGHT_OPTION_KEYCODE then return false end
+  -- Bit test via arithmetic so this does not depend on Lua 5.3+ operators.
+  local pressed = math.floor(e:rawFlags() / RIGHT_OPTION_RAW_MASK) % 2 == 1
+  if pressed and not holding then
+    holding = true
+    run("start")
+  elseif (not pressed) and holding then
+    holding = false
+    run("stop")
+  end
+  return false  -- don't swallow the modifier; other apps may use it
+end)
+
+M.tap:start()
+hs.alert.show("rvoice: Right ⌥ to talk")
+
+return M