From 02599e0b6f939f777a861d32ed99f893b20e214b Mon Sep 17 00:00:00 2001
From: Juan Camilo
Date: Mon, 20 Apr 2026 10:08:09 +0200
Subject: [PATCH] fix(api): consolidate 3P provider compatibility fixes

- Strip store field from request body for local providers (Ollama, vLLM)
  that reject unknown JSON fields with 400 errors
- Add Gemini 3.x model context windows and output token limits
  (gemini-3-flash-preview, gemini-3.1-pro-preview, google/ OpenRouter
  variants)
- Preserve reasoning_content on assistant tool-call message replays for
  providers that require it (Kimi k2.5, DeepSeek reasoner)
- Use conservative max_output_tokens fallback (4096/16384) for unknown
  3P models to prevent vLLM/Ollama 400 errors when the requested output
  exceeds max_model_len

Consolidates fixes from: #258, #268, #237, #643, #666, #677

Co-authored-by: auriti
Co-authored-by: Gustavo-Falci
Co-authored-by: lttlin
Co-authored-by: Durannd
---
 src/__tests__/bugfixes.test.ts          |   8 +-
 src/services/api/openaiShim.test.ts     | 120 ++++++++++++++++++++++++
 src/services/api/openaiShim.ts          |  18 +++-
 src/utils/context.ts                    |   8 +-
 src/utils/model/openaiContextWindows.ts |  36 ++++---
 5 files changed, 167 insertions(+), 23 deletions(-)

diff --git a/src/__tests__/bugfixes.test.ts b/src/__tests__/bugfixes.test.ts
index c028bdd9..60142d0a 100644
--- a/src/__tests__/bugfixes.test.ts
+++ b/src/__tests__/bugfixes.test.ts
@@ -21,11 +21,11 @@ describe('Gemini store field fix', () => {
   test('isGeminiMode is imported and used in openaiShim', async () => {
     const content = await file('services/api/openaiShim.ts').text()
 
-    // Verify the fix: store deletion should check for Gemini mode
+    // Verify the fix: store deletion should check for Gemini mode and local providers
     expect(content).toContain('isGeminiMode()')
-    expect(content).toContain("mistral and gemini don't recognize body.store")
-    // Ensure the delete body.store is guarded for both Mistral and Gemini
-    expect(content).toMatch(/isMistral\s*\|\|\s*isGeminiMode\(\)/)
+    expect(content).toContain("Strip store for providers that don't recognize it")
+    // Ensure the delete body.store is guarded for Mistral, Gemini, and local providers
+    expect(content).toMatch(/isMistral\s*\|\|\s*isGeminiMode\(\)\s*\|\|\s*isLocal/)
   })
 
   test('store: false is still set by default (OpenAI needs it)', async () => {
diff --git a/src/services/api/openaiShim.test.ts b/src/services/api/openaiShim.test.ts
index eaf8b590..41046a26 100644
--- a/src/services/api/openaiShim.test.ts
+++ b/src/services/api/openaiShim.test.ts
@@ -3019,3 +3019,123 @@ test('preserves valid tool_result and drops orphan tool_result', async () => {
   const orphanMessage = toolMessages.find(m => m.tool_call_id === 'orphan_call_2')
   expect(orphanMessage).toBeUndefined()
 })
+
+test('request body does not contain store field for local providers', async () => {
+  process.env.CLAUDE_CODE_USE_OPENAI = '1'
+  process.env.OPENAI_BASE_URL = 'http://localhost:11434/v1'
+  let requestBody: Record<string, unknown> | undefined
+
+  globalThis.fetch = (async (_input, init) => {
+    requestBody = JSON.parse(String(init?.body))
+    return new Response(
+      JSON.stringify({
+        id: 'chatcmpl-1',
+        object: 'chat.completion',
+        model: 'test-model',
+        choices: [{ index: 0, message: { role: 'assistant', content: 'ok' }, finish_reason: 'stop' }],
+        usage: { prompt_tokens: 10, completion_tokens: 2, total_tokens: 12 },
+      }),
+      { headers: { 'Content-Type': 'application/json' } },
+    )
+  }) as FetchType
+
+  const client = createOpenAIShimClient({ defaultHeaders: {} }) as unknown as OpenAIShimClient
+  await client.beta.messages.create({
+    model: 'some-model',
+    messages: [{ role: 'user', content: [{ type: 'text', text: 'hi' }] }],
+    max_tokens: 64,
+    stream: false,
+  })
+
+  expect(requestBody).toBeDefined()
+  expect('store' in requestBody!).toBe(false)
+})
+
+test('preserves reasoning_content on assistant messages with tool_calls during replay', async () => {
+  process.env.CLAUDE_CODE_USE_OPENAI = '1'
+  let requestBody: Record<string, unknown> | undefined
+
+  globalThis.fetch = (async (_input, init) => {
+    requestBody = JSON.parse(String(init?.body))
+    return new Response(
+      JSON.stringify({
+        id: 'chatcmpl-1',
+        object: 'chat.completion',
+        model: 'test-model',
+        choices: [{ index: 0, message: { role: 'assistant', content: 'done' }, finish_reason: 'stop' }],
+        usage: { prompt_tokens: 10, completion_tokens: 2, total_tokens: 12 },
+      }),
+      { headers: { 'Content-Type': 'application/json' } },
+    )
+  }) as FetchType
+
+  const client = createOpenAIShimClient({ defaultHeaders: {} }) as unknown as OpenAIShimClient
+  await client.beta.messages.create({
+    model: 'kimi-k2.5',
+    messages: [
+      { role: 'user', content: [{ type: 'text', text: 'read file' }] },
+      {
+        role: 'assistant',
+        content: [
+          { type: 'thinking', thinking: 'I should use the read tool' },
+          { type: 'tool_use', id: 'call_1', name: 'Read', input: { file_path: 'test.ts' } },
+        ],
+      },
+      {
+        role: 'user',
+        content: [
+          { type: 'tool_result', tool_use_id: 'call_1', content: 'file contents here' },
+        ],
+      },
+    ],
+    max_tokens: 64,
+    stream: false,
+  })
+
+  const messages = requestBody?.messages as Array<Record<string, unknown>>
+  const assistantMsg = messages.find(m => m.role === 'assistant' && m.tool_calls)
+  expect(assistantMsg).toBeDefined()
+  expect(assistantMsg!.reasoning_content).toBe('I should use the read tool')
+})
+
+test('does not add reasoning_content on assistant messages without tool_calls', async () => {
+  process.env.CLAUDE_CODE_USE_OPENAI = '1'
+  let requestBody: Record<string, unknown> | undefined
+
+  globalThis.fetch = (async (_input, init) => {
+    requestBody = JSON.parse(String(init?.body))
+    return new Response(
+      JSON.stringify({
+        id: 'chatcmpl-1',
+        object: 'chat.completion',
+        model: 'test-model',
+        choices: [{ index: 0, message: { role: 'assistant', content: 'ok' }, finish_reason: 'stop' }],
+        usage: { prompt_tokens: 10, completion_tokens: 2, total_tokens: 12 },
+      }),
+      { headers: { 'Content-Type': 'application/json' } },
+    )
+  }) as FetchType
+
+  const client = createOpenAIShimClient({ defaultHeaders: {} }) as unknown as OpenAIShimClient
+  await client.beta.messages.create({
+    model: 'deepseek-reasoner',
+    messages: [
+      { role: 'user', content: [{ type: 'text', text: 'explain' }] },
+      {
+        role: 'assistant',
+        content: [
+          { type: 'thinking', thinking: 'Let me think about this' },
+          { type: 'text', text: 'Here is the explanation' },
+        ],
+      },
+      { role: 'user', content: [{ type: 'text', text: 'thanks' }] },
+    ],
+    max_tokens: 64,
+    stream: false,
+  })
+
+  const messages = requestBody?.messages as Array<Record<string, unknown>>
+  const assistantMsg = messages.find(m => m.role === 'assistant' && !m.tool_calls)
+  expect(assistantMsg).toBeDefined()
+  expect(assistantMsg!.reasoning_content).toBeUndefined()
+})
\ No newline at end of file
diff --git a/src/services/api/openaiShim.ts b/src/services/api/openaiShim.ts
index 6c3c1ffa..cb827d3e 100644
--- a/src/services/api/openaiShim.ts
+++ b/src/services/api/openaiShim.ts
@@ -192,6 +192,7 @@ function sleepMs(ms: number): Promise<void> {
 interface OpenAIMessage {
   role: 'system' | 'user' | 'assistant' | 'tool'
   content?: string | Array<{ type: string; text?: string; image_url?: { url: string } }>
+  reasoning_content?: string
   tool_calls?: Array<{
     id: string
     type: 'function'
@@ -416,6 +417,16 @@ function convertMessages(
   }
 
   if (toolUses.length > 0) {
+    // Preserve thinking text as reasoning_content for providers that
+    // require it on replayed assistant tool-call messages (e.g. Kimi,
+    // DeepSeek). Without this, follow-up requests fail with 400:
+    // "reasoning_content is missing in assistant tool call message".
+    // Note: only the first thinking block per turn is captured (.find);
+    // Anthropic's API typically produces one thinking block per turn.
+    if (thinkingBlock) {
+      assistantMsg.reasoning_content = (thinkingBlock as { thinking?: string }).thinking ?? ''
+    }
+
     assistantMsg.tool_calls = toolUses.map(
       (tu: {
         id?: string
@@ -1345,9 +1356,10 @@ class OpenAIShimMessages {
       delete body.max_completion_tokens
     }
 
-    // mistral and gemini don't recognize body.store — Gemini returns 400
-    // "Invalid JSON payload received. Unknown name 'store': Cannot find field."
-    if (isMistral || isGeminiMode()) {
+    // Strip store for providers that don't recognize it. Only OpenAI's own
+    // API supports this field — Gemini returns 400, local servers (vLLM,
+    // Ollama) reject unknown fields, and other providers silently ignore it.
+    if (isMistral || isGeminiMode() || isLocal) {
       delete body.store
     }
 
diff --git a/src/utils/context.ts b/src/utils/context.ts
index 370ed5df..45c343e5 100644
--- a/src/utils/context.ts
+++ b/src/utils/context.ts
@@ -190,16 +190,20 @@ export function getModelMaxOutputTokens(model: string): {
   }
 
   // OpenAI-compatible provider — use known output limits to avoid 400 errors
-  if (
+  const isOpenAICompatProvider =
     isEnvTruthy(process.env.CLAUDE_CODE_USE_OPENAI) ||
     isEnvTruthy(process.env.CLAUDE_CODE_USE_GEMINI) ||
     isEnvTruthy(process.env.CLAUDE_CODE_USE_GITHUB) ||
     isEnvTruthy(process.env.CLAUDE_CODE_USE_MISTRAL)
-  ) {
+  if (isOpenAICompatProvider) {
     const openaiMax = getOpenAIMaxOutputTokens(model)
     if (openaiMax !== undefined) {
       return { default: openaiMax, upperLimit: openaiMax }
     }
+    // Unknown 3P model — use conservative default to avoid vLLM/Ollama 400
+    // errors when the default 32k exceeds the model's max_model_len.
+    // Users can override with CLAUDE_CODE_MAX_OUTPUT_TOKENS.
+    return { default: 4_096, upperLimit: 16_384 }
   }
 
   const m = getCanonicalName(model)
diff --git a/src/utils/model/openaiContextWindows.ts b/src/utils/model/openaiContextWindows.ts
index 7c9838fb..63f6961b 100644
--- a/src/utils/model/openaiContextWindows.ts
+++ b/src/utils/model/openaiContextWindows.ts
@@ -177,15 +177,19 @@ const OPENAI_CONTEXT_WINDOWS: Record<string, number> = {
   'MiniMax-M2': 204_800,
 
   // Google (via OpenRouter)
-  'google/gemini-2.0-flash':1_048_576,
-  'google/gemini-2.5-pro':  1_048_576,
+  'google/gemini-2.0-flash': 1_048_576,
+  'google/gemini-2.5-pro': 1_048_576,
+  'google/gemini-3-flash-preview': 1_048_576,
+  'google/gemini-3.1-pro-preview': 1_048_576,
 
   // Google (native via CLAUDE_CODE_USE_GEMINI)
-  'gemini-2.0-flash':              1_048_576,
-  'gemini-2.5-pro':                1_048_576,
-  'gemini-2.5-flash':              1_048_576,
-  'gemini-3.1-pro':                1_048_576,
-  'gemini-3.1-flash-lite-preview': 1_048_576,
+  'gemini-2.0-flash': 1_048_576,
+  'gemini-2.5-pro': 1_048_576,
+  'gemini-2.5-flash': 1_048_576,
+  'gemini-3-flash-preview': 1_048_576,
+  'gemini-3.1-pro': 1_048_576,
+  'gemini-3.1-pro-preview': 1_048_576,
+  'gemini-3.1-flash-lite-preview': 1_048_576,
 
   // Ollama local models
   // Llama 3.1+ models support 128k context natively (Meta official specs).
@@ -329,15 +333,19 @@ const OPENAI_MAX_OUTPUT_TOKENS: Record<string, number> = {
   'MiniMax-Vision-01-Fast': 16_384,
 
   // Google (via OpenRouter)
-  'google/gemini-2.0-flash':  8_192,
-  'google/gemini-2.5-pro':    65_536,
+  'google/gemini-2.0-flash': 8_192,
+  'google/gemini-2.5-pro': 65_536,
+  'google/gemini-3-flash-preview': 65_536,
+  'google/gemini-3.1-pro-preview': 65_536,
 
   // Google (native via CLAUDE_CODE_USE_GEMINI)
-  'gemini-2.0-flash':              8_192,
-  'gemini-2.5-pro':                65_536,
-  'gemini-2.5-flash':              65_536,
-  'gemini-3.1-pro':                65_536,
-  'gemini-3.1-flash-lite-preview': 65_536,
+  'gemini-2.0-flash': 8_192,
+  'gemini-2.5-pro': 65_536,
+  'gemini-2.5-flash': 65_536,
+  'gemini-3-flash-preview': 65_536,
+  'gemini-3.1-pro': 65_536,
+  'gemini-3.1-pro-preview': 65_536,
+  'gemini-3.1-flash-lite-preview': 65_536,
 
   // Ollama local models (conservative safe defaults)
   'llama3.3:70b': 4_096,
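-- 
Below the signature delimiter so patch tools ignore it: a minimal
standalone sketch of the new fallback behavior in getModelMaxOutputTokens.
The table values and the 4_096/16_384 fallback come from the diff above;
KNOWN_MAX_OUTPUT and maxOutputTokens are illustrative names, not
identifiers from the repo.

  // Sketch only, not the repo implementation.
  const KNOWN_MAX_OUTPUT: Record<string, number> = {
    'gemini-3-flash-preview': 65_536,
    'google/gemini-3.1-pro-preview': 65_536,
  }

  function maxOutputTokens(model: string): { default: number; upperLimit: number } {
    const known = KNOWN_MAX_OUTPUT[model]
    if (known !== undefined) {
      // Known model: pin both the default and the ceiling to its documented limit.
      return { default: known, upperLimit: known }
    }
    // Unknown 3P model: stay under typical vLLM/Ollama max_model_len values
    // instead of requesting the old 32k default, which triggered 400 errors.
    return { default: 4_096, upperLimit: 16_384 }
  }

  maxOutputTokens('gemini-3-flash-preview') // { default: 65536, upperLimit: 65536 }
  maxOutputTokens('some-local-model')       // { default: 4096, upperLimit: 16384 }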