From 4ca94b2454c93056280c8a7bad7260e4d05bc603 Mon Sep 17 00:00:00 2001
From: gnanam1990
Date: Wed, 1 Apr 2026 17:42:04 +0530
Subject: [PATCH] feat: add context window guard for OpenAI-compatible models

Without this fix, getContextWindowForModel() returns 200k for all OpenAI
models (the Claude default), causing two problems:

1. Auto-compact and context warnings trigger at the wrong thresholds
   (200k instead of 128k)
2. getModelMaxOutputTokens() returns 32k, causing 400 errors from APIs
   that cap output tokens lower (gpt-4o supports at most 16384)

Fix:
- Add openaiContextWindows.ts with known context window sizes and max
  output token limits for 30+ OpenAI-compatible models (OpenAI, DeepSeek,
  Groq, Mistral, Ollama, LM Studio)
- Hook into getContextWindowForModel() so correct input limits are used
- Hook into getModelMaxOutputTokens() so correct output limits are sent,
  preventing 400 "max_tokens is too large" errors

All existing warning, blocking, and auto-compact infrastructure works
automatically once the correct limits are returned.

Co-Authored-By: Claude Sonnet 4.6
---
 src/utils/context.ts                    |  27 +++++
 src/utils/model/openaiContextWindows.ts | 132 ++++++++++++++++++++++++
 2 files changed, 159 insertions(+)
 create mode 100644 src/utils/model/openaiContextWindows.ts

diff --git a/src/utils/context.ts b/src/utils/context.ts
index d9714de9..f13b2b0a 100644
--- a/src/utils/context.ts
+++ b/src/utils/context.ts
@@ -4,6 +4,7 @@ import { getGlobalConfig } from './config.js'
 import { isEnvTruthy } from './envUtils.js'
 import { getCanonicalName } from './model/model.js'
 import { getModelCapability } from './model/modelCapabilities.js'
+import { getOpenAIContextWindow, getOpenAIMaxOutputTokens } from './model/openaiContextWindows.js'
 
 // Model context window size (200k tokens for all models right now)
 export const MODEL_CONTEXT_WINDOW_DEFAULT = 200_000
@@ -71,6 +72,19 @@ export function getContextWindowForModel(
     return 1_000_000
   }
 
+  // OpenAI-compatible provider: use known context windows for the model
+  if (
+    process.env.CLAUDE_CODE_USE_OPENAI === '1' ||
+    process.env.CLAUDE_CODE_USE_OPENAI === 'true' ||
+    process.env.CLAUDE_CODE_USE_GEMINI === '1' ||
+    process.env.CLAUDE_CODE_USE_GEMINI === 'true'
+  ) {
+    const openaiWindow = getOpenAIContextWindow(model)
+    if (openaiWindow !== undefined) {
+      return openaiWindow
+    }
+  }
+
   const cap = getModelCapability(model)
   if (cap?.max_input_tokens && cap.max_input_tokens >= 100_000) {
     if (
@@ -162,6 +176,19 @@ export function getModelMaxOutputTokens(model: string): {
     }
   }
 
+  // OpenAI-compatible provider: use known output limits to avoid 400 errors
+  if (
+    process.env.CLAUDE_CODE_USE_OPENAI === '1' ||
+    process.env.CLAUDE_CODE_USE_OPENAI === 'true' ||
+    process.env.CLAUDE_CODE_USE_GEMINI === '1' ||
+    process.env.CLAUDE_CODE_USE_GEMINI === 'true'
+  ) {
+    const openaiMax = getOpenAIMaxOutputTokens(model)
+    if (openaiMax !== undefined) {
+      return { default: openaiMax, upperLimit: openaiMax }
+    }
+  }
+
   const m = getCanonicalName(model)
 
   if (m.includes('opus-4-6')) {

diff --git a/src/utils/model/openaiContextWindows.ts b/src/utils/model/openaiContextWindows.ts
new file mode 100644
index 00000000..fd6fb15a
--- /dev/null
+++ b/src/utils/model/openaiContextWindows.ts
@@ -0,0 +1,132 @@
+/**
+ * openaiContextWindows.ts
+ * Context window sizes for OpenAI-compatible models used via the shim.
+ * Fixes: auto-compact and warnings using the wrong 200k default for OpenAI models.
+ *
+ * When CLAUDE_CODE_USE_OPENAI=1, getContextWindowForModel() falls through to
+ * MODEL_CONTEXT_WINDOW_DEFAULT (200k). This causes the warning and blocking
+ * thresholds to be set at 200k even for models like gpt-4o (128k) or llama3 (8k),
+ * meaning users get no warning before hitting a hard API error.
+ *
+ * Values are in tokens, current as of April 2026; update as needed.
+ */
+
+const OPENAI_CONTEXT_WINDOWS: Record<string, number> = {
+  // OpenAI
+  'gpt-4o': 128_000,
+  'gpt-4o-mini': 128_000,
+  'gpt-4.1': 1_047_576,
+  'gpt-4.1-mini': 1_047_576,
+  'gpt-4.1-nano': 1_047_576,
+  'gpt-4-turbo': 128_000,
+  'gpt-4': 8_192,
+  'o3-mini': 200_000,
+  'o4-mini': 200_000,
+  'o3': 200_000,
+
+  // DeepSeek
+  'deepseek-chat': 64_000,
+  'deepseek-reasoner': 64_000,
+
+  // Groq (fast inference)
+  'llama-3.3-70b-versatile': 128_000,
+  'llama-3.1-8b-instant': 128_000,
+  'mixtral-8x7b-32768': 32_768,
+
+  // Mistral
+  'mistral-large-latest': 131_072,
+  'mistral-small-latest': 131_072,
+
+  // Google (via OpenRouter)
+  'google/gemini-2.0-flash': 1_048_576,
+  'google/gemini-2.5-pro': 1_048_576,
+
+  // Ollama local models
+  'llama3.3:70b': 8_192,
+  'llama3.1:8b': 8_192,
+  'llama3.2:3b': 8_192,
+  'qwen2.5-coder:32b': 32_768,
+  'qwen2.5-coder:7b': 32_768,
+  'deepseek-coder-v2:16b': 163_840,
+  'deepseek-r1:14b': 65_536,
+  'mistral:7b': 32_768,
+  'phi4:14b': 16_384,
+  'gemma2:27b': 8_192,
+  'codellama:13b': 16_384,
+}
+
+/**
+ * Max output (completion) tokens per model.
+ * This is separate from the context window (input limit).
+ * Fixes: 400 "max_tokens is too large" errors when the default 32k exceeds the model limit.
+ */
+const OPENAI_MAX_OUTPUT_TOKENS: Record<string, number> = {
+  // OpenAI
+  'gpt-4o': 16_384,
+  'gpt-4o-mini': 16_384,
+  'gpt-4.1': 32_768,
+  'gpt-4.1-mini': 32_768,
+  'gpt-4.1-nano': 32_768,
+  'gpt-4-turbo': 4_096,
+  'gpt-4': 4_096,
+  'o3-mini': 100_000,
+  'o4-mini': 100_000,
+  'o3': 100_000,
+
+  // DeepSeek
+  'deepseek-chat': 8_192,
+  'deepseek-reasoner': 32_768,
+
+  // Groq
+  'llama-3.3-70b-versatile': 32_768,
+  'llama-3.1-8b-instant': 8_192,
+  'mixtral-8x7b-32768': 32_768,
+
+  // Mistral
+  'mistral-large-latest': 32_768,
+  'mistral-small-latest': 32_768,
+
+  // Google (via OpenRouter)
+  'google/gemini-2.0-flash': 8_192,
+  'google/gemini-2.5-pro': 32_768,
+
+  // Ollama local models (conservative safe defaults)
+  'llama3.3:70b': 4_096,
+  'llama3.1:8b': 4_096,
+  'llama3.2:3b': 4_096,
+  'qwen2.5-coder:32b': 8_192,
+  'qwen2.5-coder:7b': 8_192,
+  'deepseek-coder-v2:16b': 8_192,
+  'deepseek-r1:14b': 8_192,
+  'mistral:7b': 4_096,
+  'phi4:14b': 4_096,
+  'gemma2:27b': 4_096,
+  'codellama:13b': 4_096,
+}
+
+function lookupByModel<T>(table: Record<string, T>, model: string): T | undefined {
+  if (table[model] !== undefined) return table[model]
+  // Longest matching prefix wins, so "gpt-4o-2024-11-20" resolves to "gpt-4o", not "gpt-4".
+  const matches = Object.keys(table).filter(key => model.startsWith(key))
+  const best = matches.sort((a, b) => b.length - a.length)[0]
+  return best === undefined ? undefined : table[best]
+}
+
+/**
+ * Look up the context window for an OpenAI-compatible model.
+ * Returns undefined if the model is not in the table.
+ *
+ * Falls back to prefix matching so dated variants like
+ * "gpt-4o-2024-11-20" resolve to the base "gpt-4o" entry.
+ */
+export function getOpenAIContextWindow(model: string): number | undefined {
+  return lookupByModel(OPENAI_CONTEXT_WINDOWS, model)
+}
+
+/**
+ * Look up the max output tokens for an OpenAI-compatible model.
+ * Returns undefined if the model is not in the table.
+ */
+export function getOpenAIMaxOutputTokens(model: string): number | undefined {
+  return lookupByModel(OPENAI_MAX_OUTPUT_TOKENS, model)
+}
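
Example (illustrative only, not part of the diff): given the tables above,
the exported helpers resolve exact names directly, resolve dated variants
via the longest-prefix fallback, and return undefined for unknown models so
callers fall through to the existing defaults. The model strings below are
assumed inputs for demonstration, not an exhaustive test:

    import {
      getOpenAIContextWindow,
      getOpenAIMaxOutputTokens,
    } from './src/utils/model/openaiContextWindows.js'

    // Exact match: the table entry is returned directly.
    getOpenAIContextWindow('gpt-4o')              // 128_000
    getOpenAIMaxOutputTokens('gpt-4o')            // 16_384

    // Dated variant: resolves to the base 'gpt-4o' entry by prefix,
    // not to the shorter 'gpt-4' entry.
    getOpenAIContextWindow('gpt-4o-2024-11-20')   // 128_000

    // Unknown model: undefined, so getContextWindowForModel() keeps its
    // normal fallback path (capability lookup, then the 200k default).
    getOpenAIContextWindow('my-custom-model')     // undefined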