From 4ca94b2454c93056280c8a7bad7260e4d05bc603 Mon Sep 17 00:00:00 2001
From: gnanam1990
Date: Wed, 1 Apr 2026 17:42:04 +0530
Subject: [PATCH] feat: add context window guard for OpenAI-compatible models

Without this fix, getContextWindowForModel() returns 200k for all OpenAI
models (the Claude default), causing two problems:

1. Auto-compact and context warnings trigger at the wrong thresholds
   (200k instead of 128k)
2. getModelMaxOutputTokens() returns 32k, causing 400 errors from APIs
   that cap output tokens lower (gpt-4o supports at most 16384)

Fix:
- Add openaiContextWindows.ts with known context window sizes and max
  output token limits for 30+ OpenAI-compatible models (OpenAI, DeepSeek,
  Groq, Mistral, Ollama, LM Studio)
- Hook into getContextWindowForModel() so correct input limits are used
- Hook into getModelMaxOutputTokens() so correct output limits are sent,
  preventing 400 "max_tokens is too large" errors

All existing warning, blocking, and auto-compact infrastructure works
automatically once the correct limits are returned.

Co-Authored-By: Claude Sonnet 4.6
---
 src/utils/context.ts                    |  27 +++++
 src/utils/model/openaiContextWindows.ts | 132 ++++++++++++++++++++++++
 2 files changed, 159 insertions(+)
 create mode 100644 src/utils/model/openaiContextWindows.ts

diff --git a/src/utils/context.ts b/src/utils/context.ts
index d9714de9..f13b2b0a 100644
--- a/src/utils/context.ts
+++ b/src/utils/context.ts
@@ -4,6 +4,7 @@ import { getGlobalConfig } from './config.js'
 import { isEnvTruthy } from './envUtils.js'
 import { getCanonicalName } from './model/model.js'
 import { getModelCapability } from './model/modelCapabilities.js'
+import { getOpenAIContextWindow, getOpenAIMaxOutputTokens } from './model/openaiContextWindows.js'
 
 // Model context window size (200k tokens for all models right now)
 export const MODEL_CONTEXT_WINDOW_DEFAULT = 200_000
@@ -71,6 +72,19 @@ export function getContextWindowForModel(
     return 1_000_000
   }
 
+  // OpenAI-compatible provider: use known context windows for the model
+  if (
+    process.env.CLAUDE_CODE_USE_OPENAI === '1' ||
+    process.env.CLAUDE_CODE_USE_OPENAI === 'true' ||
+    process.env.CLAUDE_CODE_USE_GEMINI === '1' ||
+    process.env.CLAUDE_CODE_USE_GEMINI === 'true'
+  ) {
+    const openaiWindow = getOpenAIContextWindow(model)
+    if (openaiWindow !== undefined) {
+      return openaiWindow
+    }
+  }
+
   const cap = getModelCapability(model)
   if (cap?.max_input_tokens && cap.max_input_tokens >= 100_000) {
     if (
@@ -162,6 +176,19 @@ export function getModelMaxOutputTokens(model: string): {
     }
   }
 
+  // OpenAI-compatible provider: use known output limits to avoid 400 errors
+  if (
+    process.env.CLAUDE_CODE_USE_OPENAI === '1' ||
+    process.env.CLAUDE_CODE_USE_OPENAI === 'true' ||
+    process.env.CLAUDE_CODE_USE_GEMINI === '1' ||
+    process.env.CLAUDE_CODE_USE_GEMINI === 'true'
+  ) {
+    const openaiMax = getOpenAIMaxOutputTokens(model)
+    if (openaiMax !== undefined) {
+      return { default: openaiMax, upperLimit: openaiMax }
+    }
+  }
+
   const m = getCanonicalName(model)
 
   if (m.includes('opus-4-6')) {

diff --git a/src/utils/model/openaiContextWindows.ts b/src/utils/model/openaiContextWindows.ts
new file mode 100644
index 00000000..fd6fb15a
--- /dev/null
+++ b/src/utils/model/openaiContextWindows.ts
@@ -0,0 +1,132 @@
+/**
+ * openaiContextWindows.ts
+ * Context window sizes for OpenAI-compatible models used via the shim.
+ * Fixes: auto-compact and warnings using the wrong 200k default for OpenAI models.
+ *
+ * When CLAUDE_CODE_USE_OPENAI=1, getContextWindowForModel() falls through to
+ * MODEL_CONTEXT_WINDOW_DEFAULT (200k). This causes the warning and blocking
+ * thresholds to be set at 200k even for models like gpt-4o (128k) or llama3 (8k),
+ * meaning users get no warning before hitting a hard API error.
+ *
+ * Values are in tokens, current as of April 2026; update as needed.
+ */
+
+const OPENAI_CONTEXT_WINDOWS: Record<string, number> = {
+  // OpenAI
+  'gpt-4o': 128_000,
+  'gpt-4o-mini': 128_000,
+  'gpt-4.1': 1_047_576,
+  'gpt-4.1-mini': 1_047_576,
+  'gpt-4.1-nano': 1_047_576,
+  'gpt-4-turbo': 128_000,
+  'gpt-4': 8_192,
+  'o3-mini': 200_000,
+  'o4-mini': 200_000,
+  'o3': 200_000,
+
+  // DeepSeek
+  'deepseek-chat': 64_000,
+  'deepseek-reasoner': 64_000,
+
+  // Groq (fast inference)
+  'llama-3.3-70b-versatile': 128_000,
+  'llama-3.1-8b-instant': 128_000,
+  'mixtral-8x7b-32768': 32_768,
+
+  // Mistral
+  'mistral-large-latest': 131_072,
+  'mistral-small-latest': 131_072,
+
+  // Google (via OpenRouter)
+  'google/gemini-2.0-flash': 1_048_576,
+  'google/gemini-2.5-pro': 1_048_576,
+
+  // Ollama local models
+  'llama3.3:70b': 8_192,
+  'llama3.1:8b': 8_192,
+  'llama3.2:3b': 8_192,
+  'qwen2.5-coder:32b': 32_768,
+  'qwen2.5-coder:7b': 32_768,
+  'deepseek-coder-v2:16b': 163_840,
+  'deepseek-r1:14b': 65_536,
+  'mistral:7b': 32_768,
+  'phi4:14b': 16_384,
+  'gemma2:27b': 8_192,
+  'codellama:13b': 16_384,
+}
+
+/**
+ * Max output (completion) tokens per model.
+ * This is separate from the context window (input limit).
+ * Fixes: 400 "max_tokens is too large" errors when the default 32k exceeds the model limit.
+ */
+const OPENAI_MAX_OUTPUT_TOKENS: Record<string, number> = {
+  // OpenAI
+  'gpt-4o': 16_384,
+  'gpt-4o-mini': 16_384,
+  'gpt-4.1': 32_768,
+  'gpt-4.1-mini': 32_768,
+  'gpt-4.1-nano': 32_768,
+  'gpt-4-turbo': 4_096,
+  'gpt-4': 4_096,
+  'o3-mini': 100_000,
+  'o4-mini': 100_000,
+  'o3': 100_000,
+
+  // DeepSeek
+  'deepseek-chat': 8_192,
+  'deepseek-reasoner': 32_768,
+
+  // Groq
+  'llama-3.3-70b-versatile': 32_768,
+  'llama-3.1-8b-instant': 8_192,
+  'mixtral-8x7b-32768': 32_768,
+
+  // Mistral
+  'mistral-large-latest': 32_768,
+  'mistral-small-latest': 32_768,
+
+  // Google (via OpenRouter)
+  'google/gemini-2.0-flash': 8_192,
+  'google/gemini-2.5-pro': 32_768,
+
+  // Ollama local models (conservative safe defaults)
+  'llama3.3:70b': 4_096,
+  'llama3.1:8b': 4_096,
+  'llama3.2:3b': 4_096,
+  'qwen2.5-coder:32b': 8_192,
+  'qwen2.5-coder:7b': 8_192,
+  'deepseek-coder-v2:16b': 8_192,
+  'deepseek-r1:14b': 8_192,
+  'mistral:7b': 4_096,
+  'phi4:14b': 4_096,
+  'gemma2:27b': 4_096,
+  'codellama:13b': 4_096,
+}
+
+function lookupByModel<T>(table: Record<string, T>, model: string): T | undefined {
+  if (table[model] !== undefined) return table[model]
+  // Longest matching prefix wins, so "gpt-4o-2024-11-20" resolves to "gpt-4o", not "gpt-4".
+  const matches = Object.keys(table).filter(key => model.startsWith(key))
+  const best = matches.sort((a, b) => b.length - a.length)[0]
+  return best === undefined ? undefined : table[best]
+}
+
+/**
+ * Look up the context window for an OpenAI-compatible model.
+ * Returns undefined if the model is not in the table.
+ *
+ * Falls back to prefix matching so dated variants like
+ * "gpt-4o-2024-11-20" resolve to the base "gpt-4o" entry.
+ */
+export function getOpenAIContextWindow(model: string): number | undefined {
+  return lookupByModel(OPENAI_CONTEXT_WINDOWS, model)
+}
+
+/**
+ * Look up the max output tokens for an OpenAI-compatible model.
+ * Returns undefined if the model is not in the table.
+ */
+export function getOpenAIMaxOutputTokens(model: string): number | undefined {
+  return lookupByModel(OPENAI_MAX_OUTPUT_TOKENS, model)
+}
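
Example (illustrative only, not part of the diff): given the tables above,
the exported helpers resolve exact names directly, resolve dated variants
via the longest-prefix fallback, and return undefined for unknown models so
callers fall through to the existing defaults. The model strings below are
assumed inputs for demonstration, not an exhaustive test:

    import {
      getOpenAIContextWindow,
      getOpenAIMaxOutputTokens,
    } from './src/utils/model/openaiContextWindows.js'

    // Exact match: the table entry is returned directly.
    getOpenAIContextWindow('gpt-4o')              // 128_000
    getOpenAIMaxOutputTokens('gpt-4o')            // 16_384

    // Dated variant: resolves to the base 'gpt-4o' entry by prefix,
    // not to the shorter 'gpt-4' entry.
    getOpenAIContextWindow('gpt-4o-2024-11-20')   // 128_000

    // Unknown model: undefined, so getContextWindowForModel() keeps its
    // normal fallback path (capability lookup, then the 200k default).
    getOpenAIContextWindow('my-custom-model')     // undefined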