diff --git a/.claude/scheduled_tasks.lock b/.claude/scheduled_tasks.lock new file mode 100644 index 0000000..b3c5020 --- /dev/null +++ b/.claude/scheduled_tasks.lock @@ -0,0 +1 @@ +{"sessionId":"c5a10f83-d897-4eb1-aa4f-a4264850dbf3","pid":15738,"procStart":"Mon Jun 1 06:29:14 2026","acquiredAt":1780437249785} \ No newline at end of file diff --git a/src/claire/orchestrator/bootstrap.py b/src/claire/orchestrator/bootstrap.py index ad2f068..f6bf735 100644 --- a/src/claire/orchestrator/bootstrap.py +++ b/src/claire/orchestrator/bootstrap.py @@ -601,6 +601,17 @@ def ensure_running( and r.cwd.rstrip("/") == norm_target } + # Kill any pane lingering at this slug before spawning. Reaching here means + # the fast path above found no VALID live session to preserve, so a pane at + # the slug is an orphan from an earlier cycle whose discovery timed out + # (the tmux session spawned, but its uuid never got persisted). Left alone + # they accumulate — and since turns are delivered by slug match, a duplicate + # orchestrator would receive every turn too, splitting delivery. Best-effort. + try: + rcl.kill(match=slug, yes=True) + except RclaudeError: + pass + # Spawn a fresh session. Pass `.mcp.json` via `--mcp-config`. Path must # be remote-absolute so the spawned `claude` reads it. try: @@ -612,23 +623,37 @@ def ensure_running( except RclaudeError: return None - # Claude doesn't flush its session JSONL to disk until the first user - # message is processed — discovery would otherwise spin until timeout - # on an empty filesystem. Send a no-op kick so the JSONL appears. - # Wait briefly for tmux + claude to reach the prompt before sending. - time.sleep(2.0) - try: - _send_kick(rcl=rcl, cwd=effective_cwd) - except RclaudeError: - # Kick failure isn't fatal — discovery may still succeed if the - # session writes its JSONL for other reasons. - pass - - new_uuid = discover_session( - cwd=effective_cwd, host=cfg.orchestrator.host, rclaude=rcl, - timeout_s=discover_timeout_s, - ignore_uuids=pre_uuids, - ) + # Claude doesn't flush its session JSONL to disk until it processes its + # first message — until then discovery has nothing to find. The kick both + # introduces Claude to its role and triggers that first flush. + # + # Timing is a race: a freshly spawned `claude` needs a few seconds to + # reach a live input prompt, and a kick sent before then is silently + # dropped — the session never processes it, never flushes, discovery + # fails, and the orchestrator never comes up (observed: panes stuck at + # R:0 with the kick text unconsumed). The delay scales with host load, so + # a single fixed sleep is unreliable. Instead, poll: send the kick, look + # for the new session, and re-send on each miss until it appears or the + # budget is spent. Re-sending is harmless — Claude just re-acknowledges + # once ready. capture-pane can't gate readiness here: the orchestrator may + # live on a remote host, where capture-pane is unsafe. + new_uuid: str | None = None + time.sleep(2.0) # let tmux attach + claude begin launching + deadline = time.time() + discover_timeout_s + while True: + try: + _send_kick(rcl=rcl, cwd=effective_cwd) + except RclaudeError: + pass # transient — the next iteration re-sends + # Clamp to ≥1s so a tiny `discover_timeout_s` still gets one real + # discovery attempt; cap at 4s so a miss re-kicks promptly. + new_uuid = discover_session( + cwd=effective_cwd, host=cfg.orchestrator.host, rclaude=rcl, + timeout_s=max(1.0, min(4.0, deadline - time.time())), + ignore_uuids=pre_uuids, + ) + if new_uuid is not None or time.time() >= deadline: + break if new_uuid is None: return None write_session_uuid(