fix: resolve 12 bugs across API, MCP, agent tools, web search, and context overflow (#674)

* fix: resolve 12 bugs across API, MCP, agent tools, web search, and context overflow

  API fixes:
  - Fix Gemini 400 error: delete 'store: false' field for Gemini endpoints (was globally injected, Gemini rejects unknown fields)
  - Fix session timeout 500 errors after ~25min: add 120s idle timeout on SSE stream readers in openaiShim and codexShim to detect dead connections and trigger withRetry reconnection
  - Fix context overflow 500 errors: add handler in errors.ts for 500 responses caused by oversized conversation context (too many tokens), surfacing user-friendly message with recovery actions instead of raw 'API Error: 500'

  Agent loop fix:
  - Fix premature task completion: detect continuation signals like 'so now I have to do it' in assistant text without tool calls and inject a meta nudge to force the agent to continue

  Web search improvements:
  - Increase result counts: Bing/Tavily/Exa/Firecrawl from 10→15, Mojeek/You/Jina from default→10 (explicit), max_uses 8→15

  MCP fixes:
  - Reduce default tool timeout from ~27.8 hours to 5 minutes (tools no longer hang indefinitely on unresponsive servers)
  - Add retry logic (3 attempts) for tools/list fetch failures (prevents all MCP tools from silently disappearing on timeout)
  - Add abort signal check in URL elicitation retry loop
  - Improve MCP error messages with server and tool name context

  Agent tool fixes:
  - Fix SendMessage race condition: double-check task status before auto-resuming stopped agents to prevent duplicate registration
  - Fix auto-compact circuit breaker gap: when auto-compact fails 3+ consecutive times, proactively block oversized context BEFORE the API call instead of letting it 500. Clear message with recovery instructions (/new, /compact, rewind).

  Tests: 850 total, 0 failures (25 new bugfix tests)

* fix: address all 4 review blockers + 6 additional issues from PR #674

  Blockers (from Vasanthdev2004 review):
  1. Continuation nudge infinite loop — no loop guard
     Added continuationNudgeCount to State, capped at MAX_CONTINUATION_NUDGES (3). Counter increments on each nudge, resets on tool execution (next_turn).
  2. Continuation signal regexes too broad — high false-positive rate
     Tightened all patterns to require explicit action verbs. Added completion marker check (done/finished/completed/summary). Broad patterns only fire on messages <80 chars.
  3. BUGFIXES.md in repo root — scope contamination
     Removed. PR description already contains this info.
  4. AgentTool dump state cleanup is comment-only, not a bug fix
     Wrapped clearInvokedSkillsForAgent and clearDumpState in individual try/catch blocks so one failure doesn't prevent the other.

  Additional issues:
  5+6. readWithTimeout ignores AbortSignal, timer leak on abort
     Added optional signal param to openaiStreamToAnthropic, codexStreamToAnthropic, collectCodexCompletedResponse, readSseEvents. Added abort listener that clears idle timer so AbortError surfaces cleanly instead of spurious idle timeout.
  7. MCP error format change breaks consumers
     Reverted human-readable message to original errorDetails format. Moved server/tool context to telemetryMessage param only.
  10. AgentTool test broken by comment change
     Updated test assertions to match new defensive cleanup text + try/catch.
  12. Mojeek test regex dangerously broad
     Tightened to match searchParams.set('t', '10') specifically.
  14. linkup.ts in providerCounts test — no result count field
     Removed from providers list (uses depth param, not result count).
  15. Error message overlap between errors.ts and query.ts
     Prefixed errorDetails with 'Context overflow (500):' to distinguish.

  Tests: 851 pass, 0 fail

---------

Co-authored-by: openclaude-bot <bot@openclaude.ai>
Co-authored-by: Fix Bot <fix@openclaude.dev>
2026-04-14 16:29:53 +05:30
parent 1741f32cb7
commit 25ce2ca7bf
18 changed files with 647 additions and 27 deletions
--- a/src/services/api/codexShim.ts
+++ b/src/services/api/codexShim.ts
@@ -580,15 +580,55 @@ export async function performCodexRequest(options: {
  return response
 }

-async function* readSseEvents(response: Response): AsyncGenerator<CodexSseEvent> {
+async function* readSseEvents(response: Response, signal?: AbortSignal): AsyncGenerator<CodexSseEvent> {
  const reader = response.body?.getReader()
  if (!reader) return

  const decoder = new TextDecoder()
  let buffer = ''
+  const STREAM_IDLE_TIMEOUT_MS = 120_000 // 2 minutes without data
+  let lastDataTime = Date.now()
+
+  /**
+   * Read one chunk from `reader`, rejecting if nothing arrives within
+   * STREAM_IDLE_TIMEOUT_MS. On abort the idle timer is only cleared; the
+   * promise then settles solely via reader.read() (NOTE(review): confirm it rejects).
+   */
+  async function readWithTimeout(): Promise<ReadableStreamReadResult<Uint8Array>> {
+    return new Promise((resolve, reject) => {
+      const timeoutId = setTimeout(() => { // idle watchdog, armed afresh on every call
+        const elapsed = Math.round((Date.now() - lastDataTime) / 1000) // whole seconds since the last chunk that carried data
+        reject(new Error( // NOTE(review): the reader is not cancelled here, so the pending read() stays outstanding
+          `Codex SSE stream idle for ${elapsed}s (limit: ${STREAM_IDLE_TIMEOUT_MS / 1000}s). Connection likely dropped.`,
+        ))
+      }, STREAM_IDLE_TIMEOUT_MS)

+      let abortCleanup: (() => void) | undefined // kept so the listener can be detached once the read settles
+      if (signal) {
+        abortCleanup = () => {
+          clearTimeout(timeoutId) // disarms the watchdog only; does not reject or cancel the read
+        }
+        signal.addEventListener('abort', abortCleanup, { once: true }) // NOTE(review): never fires if the signal is already aborted
+      }

+      reader.read().then(
+        result => {
+          clearTimeout(timeoutId)
+          if (signal && abortCleanup) signal.removeEventListener('abort', abortCleanup) // detach: one listener is added per read
+          if (result.value) lastDataTime = Date.now() // results without a value do not refresh the idle clock
+          resolve(result)
+        },
+        err => {
+          clearTimeout(timeoutId)
+          if (signal && abortCleanup) signal.removeEventListener('abort', abortCleanup)
+          reject(err) // read failures are passed through unchanged
+        },
+      )
+    })
+  }

  while (true) {
-    const { done, value } = await reader.read()
+    const { done, value } = await readWithTimeout()
    if (done) break

    buffer += decoder.decode(value, { stream: true })
@@ -649,10 +689,11 @@ function determineStopReason(

 export async function collectCodexCompletedResponse(
  response: Response,
+  signal?: AbortSignal,
 ): Promise<Record<string, any>> {
  let completedResponse: Record<string, any> | undefined

-  for await (const event of readSseEvents(response)) {
+  for await (const event of readSseEvents(response, signal)) {
    if (event.event === 'response.failed') {
      const msg = event.data?.response?.error?.message ??
        event.data?.error?.message ?? 'Codex response failed'
@@ -681,6 +722,7 @@ export async function collectCodexCompletedResponse(
 export async function* codexStreamToAnthropic(
  response: Response,
  model: string,
+  signal?: AbortSignal,
 ): AsyncGenerator<AnthropicStreamEvent> {
  const messageId = makeMessageId()
  const toolBlocksByItemId = new Map<
@@ -742,7 +784,7 @@ export async function* codexStreamToAnthropic(
    },
  }

-  for await (const event of readSseEvents(response)) {
+  for await (const event of readSseEvents(response, signal)) {
    const payload = event.data

    if (event.event === 'response.output_item.added') {
--- a/src/services/api/errors.ts
+++ b/src/services/api/errors.ts
@@ -924,6 +924,30 @@ export function getAssistantMessageFromError(
    })
  }

+  // 500 errors caused by context overflow — the API returns 500 instead of 400
+  // when the request body (including conversation context) exceeds limits.
+  // This happens when auto-compact fails or the token estimation undercounts.
+  // Detect by checking for context-related keywords in the message of any 5xx response (status >= 500).
+  if (
+    error instanceof APIError &&
+    error.status >= 500 &&
+    (error.message.toLowerCase().includes('too many tokens') ||
+      error.message.toLowerCase().includes('request too large') ||
+      error.message.toLowerCase().includes('context length') ||
+      error.message.toLowerCase().includes('maximum context') ||
+      error.message.toLowerCase().includes('input length') ||
+      error.message.toLowerCase().includes('payload too large'))
+  ) {
+    const rewindInstruction = getIsNonInteractiveSession()
+      ? ''
+      : ' Press esc twice to go up a few messages, or run /compact to reduce context.'
+    return createAssistantAPIErrorMessage({
+      content: `The conversation has grown too large for the API to process.${rewindInstruction} Alternatively, start a new session with /new.`,
+      error: 'invalid_request',
+      errorDetails: `Context overflow (500): ${error.message}`,
+    })
+  }
+
  // Connection errors (non-timeout) — use formatAPIError for detailed messages
  if (error instanceof APIConnectionError) {
    return createAssistantAPIErrorMessage({
--- a/src/services/api/openaiShim.ts
+++ b/src/services/api/openaiShim.ts
@@ -641,6 +641,7 @@ function repairPossiblyTruncatedObjectJson(raw: string): string | null {
 async function* openaiStreamToAnthropic(
  response: Response,
  model: string,
+  signal?: AbortSignal,
 ): AsyncGenerator<AnthropicStreamEvent> {
  const messageId = makeMessageId()
  let contentBlockIndex = 0
@@ -688,6 +689,51 @@ async function* openaiStreamToAnthropic(

  const decoder = new TextDecoder()
  let buffer = ''
+  const STREAM_IDLE_TIMEOUT_MS = 120_000 // 2 minutes without data = connection likely dead
+  let lastDataTime = Date.now()
+
+  /**
+   * Read one chunk from `reader` with an idle timeout. If no data arrives
+   * within STREAM_IDLE_TIMEOUT_MS, assume the connection is dead and reject so
+   * withRetry can reconnect. This prevents indefinite hangs on stale
+   * SSE connections from OpenAI/Gemini during long-running sessions.
+   * On abort the idle timer is only cleared; the promise then settles solely
+   * via reader.read() (NOTE(review): confirm the read rejects on abort).
+   */
+  async function readWithTimeout(): Promise<ReadableStreamReadResult<Uint8Array>> {
+    return new Promise((resolve, reject) => {
+      const timeoutId = setTimeout(() => { // idle watchdog, armed afresh on every call
+        const elapsed = Math.round((Date.now() - lastDataTime) / 1000) // whole seconds since the last chunk that carried data
+        reject(new Error( // NOTE(review): the reader is not cancelled here, so the pending read() stays outstanding
+          `OpenAI/Gemini SSE stream idle for ${elapsed}s (limit: ${STREAM_IDLE_TIMEOUT_MS / 1000}s). Connection likely dropped.`,
+        ))
+      }, STREAM_IDLE_TIMEOUT_MS)

+      // If the caller aborts, clear the timer so the AbortError surfaces
+      // cleanly instead of being masked by a spurious idle timeout.
+      let abortCleanup: (() => void) | undefined // kept so the listener can be detached once the read settles
+      if (signal) {
+        abortCleanup = () => {
+          clearTimeout(timeoutId) // disarms the watchdog only; does not reject or cancel the read
+        }
+        signal.addEventListener('abort', abortCleanup, { once: true }) // NOTE(review): never fires if the signal is already aborted
+      }

+      reader.read().then(
+        result => {
+          clearTimeout(timeoutId)
+          if (signal && abortCleanup) signal.removeEventListener('abort', abortCleanup) // detach: one listener is added per read
+          if (result.value) lastDataTime = Date.now() // results without a value do not refresh the idle clock
+          resolve(result)
+        },
+        err => {
+          clearTimeout(timeoutId)
+          if (signal && abortCleanup) signal.removeEventListener('abort', abortCleanup)
+          reject(err) // read failures are passed through unchanged
+        },
+      )
+    })
+  }

  const closeActiveContentBlock = async function* () {
    if (!hasEmittedContentStart) return
@@ -715,7 +761,7 @@ async function* openaiStreamToAnthropic(

  try {
    while (true) {
-      const { done, value } = await reader.read()
+      const { done, value } = await readWithTimeout()
      if (done) break

      buffer += decoder.decode(value, { stream: true })
@@ -1075,13 +1121,13 @@ class OpenAIShimMessages {
        const isResponsesStream = response.url?.includes('/responses')
        return new OpenAIShimStream(
          (request.transport === 'codex_responses' || isResponsesStream)
-            ? codexStreamToAnthropic(response, request.resolvedModel)
-            : openaiStreamToAnthropic(response, request.resolvedModel),
+            ? codexStreamToAnthropic(response, request.resolvedModel, options?.signal)
+            : openaiStreamToAnthropic(response, request.resolvedModel, options?.signal),
        )
      }

      if (request.transport === 'codex_responses') {
-        const data = await collectCodexCompletedResponse(response)
+        const data = await collectCodexCompletedResponse(response, options?.signal)
        return convertCodexResponseToAnthropicMessage(
          data,
          request.resolvedModel,
@@ -1271,8 +1317,9 @@ class OpenAIShimMessages {
      delete body.max_completion_tokens
    }

-    // mistral also doesn't recognize body.store
-    if (isMistral) {
+    // mistral and gemini don't recognize body.store — Gemini returns 400
+    // "Invalid JSON payload received. Unknown name 'store': Cannot find field."
+    if (isMistral || isGeminiMode()) {
      delete body.store
    }