feat: add context window guard for OpenAI-compatible models

Without this fix, getContextWindowForModel() returns 200k for all OpenAI
models (the Claude default), causing two problems:

1. Auto-compact/warnings trigger at the wrong thresholds (200k instead of 128k)
2. getModelMaxOutputTokens() returns 32k, causing 400 errors from APIs that
   cap output tokens lower (gpt-4o supports at most 16,384)

Fix:

- Add openaiContextWindows.ts with known context window sizes and max output
  token limits for 30+ OpenAI-compatible models (OpenAI, DeepSeek, Groq,
  Mistral, Ollama, LM Studio)
- Hook into getContextWindowForModel() so the correct input limits are used
- Hook into getModelMaxOutputTokens() so the correct output limits are sent,
  preventing 400 "max_tokens is too large" errors

All existing warning, blocking, and auto-compact infrastructure works
automatically once the correct limits are returned.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
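
For illustration, the observable change once CLAUDE_CODE_USE_OPENAI=1 is set. A sketch using values from this commit (the "before" numbers are the old Claude defaults described above):

    getContextWindowForModel('gpt-4o')
    // before: 200_000 (Claude default)   after: 128_000
    getModelMaxOutputTokens('gpt-4o')
    // before: a 32k default the API rejects with a 400
    // after:  { default: 16_384, upperLimit: 16_384 }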
@@ -4,6 +4,7 @@ import { getGlobalConfig } from './config.js'
 import { isEnvTruthy } from './envUtils.js'
 import { getCanonicalName } from './model/model.js'
 import { getModelCapability } from './model/modelCapabilities.js'
+import { getOpenAIContextWindow, getOpenAIMaxOutputTokens } from './model/openaiContextWindows.js'

 // Model context window size (200k tokens for all models right now)
 export const MODEL_CONTEXT_WINDOW_DEFAULT = 200_000
@@ -71,6 +72,19 @@ export function getContextWindowForModel(
     return 1_000_000
   }

+  // OpenAI-compatible provider — use known context windows for the model
+  if (
+    process.env.CLAUDE_CODE_USE_OPENAI === '1' ||
+    process.env.CLAUDE_CODE_USE_OPENAI === 'true' ||
+    process.env.CLAUDE_CODE_USE_GEMINI === '1' ||
+    process.env.CLAUDE_CODE_USE_GEMINI === 'true'
+  ) {
+    const openaiWindow = getOpenAIContextWindow(model)
+    if (openaiWindow !== undefined) {
+      return openaiWindow
+    }
+  }
+
   const cap = getModelCapability(model)
   if (cap?.max_input_tokens && cap.max_input_tokens >= 100_000) {
     if (
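
A quick way to exercise the new guard: a hypothetical vitest sketch (the test file, import path, and exact signature of getContextWindowForModel are assumptions, not part of this commit):

    import { describe, expect, it } from 'vitest'
    // Hypothetical import path for the module patched above
    import { getContextWindowForModel } from './tokens.js'

    describe('OpenAI context window guard', () => {
      it('uses the known window when the OpenAI shim is enabled', () => {
        process.env.CLAUDE_CODE_USE_OPENAI = '1'
        expect(getContextWindowForModel('gpt-4o')).toBe(128_000)
        // Dated variants resolve to the base entry via prefix matching
        expect(getContextWindowForModel('gpt-4o-2024-11-20')).toBe(128_000)
        delete process.env.CLAUDE_CODE_USE_OPENAI
      })
    })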
@@ -162,6 +176,19 @@ export function getModelMaxOutputTokens(model: string): {
     }
   }

+  // OpenAI-compatible provider — use known output limits to avoid 400 errors
+  if (
+    process.env.CLAUDE_CODE_USE_OPENAI === '1' ||
+    process.env.CLAUDE_CODE_USE_OPENAI === 'true' ||
+    process.env.CLAUDE_CODE_USE_GEMINI === '1' ||
+    process.env.CLAUDE_CODE_USE_GEMINI === 'true'
+  ) {
+    const openaiMax = getOpenAIMaxOutputTokens(model)
+    if (openaiMax !== undefined) {
+      return { default: openaiMax, upperLimit: openaiMax }
+    }
+  }
+
   const m = getCanonicalName(model)

   if (m.includes('opus-4-6')) {
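
With the shim enabled, the output cap sent to the API now matches the provider instead of the 32k default (illustrative values taken from the tables in the new file below):

    process.env.CLAUDE_CODE_USE_OPENAI = '1'
    getModelMaxOutputTokens('gpt-4o')          // { default: 16_384, upperLimit: 16_384 }
    getModelMaxOutputTokens('deepseek-chat')   // { default: 8_192, upperLimit: 8_192 }
    getModelMaxOutputTokens('claude-opus-4-6') // not in the tables, falls through to the existing logic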
src/utils/model/openaiContextWindows.ts (new file, 132 lines)
@@ -0,0 +1,132 @@
/**
 * openaiContextWindows.ts
 * Context window sizes for OpenAI-compatible models used via the shim.
 * Fixes: auto-compact and warnings using the wrong 200k default for OpenAI models.
 *
 * When CLAUDE_CODE_USE_OPENAI=1, getContextWindowForModel() falls through to
 * MODEL_CONTEXT_WINDOW_DEFAULT (200k). This causes the warning and blocking
 * thresholds to be set at 200k even for models like gpt-4o (128k) or llama3 (8k),
 * meaning users get no warning before hitting a hard API error.
 *
 * Limits in tokens as of April 2026 — update as needed.
 */

const OPENAI_CONTEXT_WINDOWS: Record<string, number> = {
  // OpenAI
  'gpt-4o': 128_000,
  'gpt-4o-mini': 128_000,
  'gpt-4.1': 1_047_576,
  'gpt-4.1-mini': 1_047_576,
  'gpt-4.1-nano': 1_047_576,
  'gpt-4-turbo': 128_000,
  'gpt-4': 8_192,
  'o3-mini': 200_000,
  'o4-mini': 200_000,
  'o3': 200_000,

  // DeepSeek
  'deepseek-chat': 64_000,
  'deepseek-reasoner': 64_000,

  // Groq (fast inference)
  'llama-3.3-70b-versatile': 128_000,
  'llama-3.1-8b-instant': 128_000,
  'mixtral-8x7b-32768': 32_768,

  // Mistral
  'mistral-large-latest': 131_072,
  'mistral-small-latest': 131_072,

  // Google (via OpenRouter)
  'google/gemini-2.0-flash': 1_048_576,
  'google/gemini-2.5-pro': 1_048_576,

  // Ollama local models
  'llama3.3:70b': 8_192,
  'llama3.1:8b': 8_192,
  'llama3.2:3b': 8_192,
  'qwen2.5-coder:32b': 32_768,
  'qwen2.5-coder:7b': 32_768,
  'deepseek-coder-v2:16b': 163_840,
  'deepseek-r1:14b': 65_536,
  'mistral:7b': 32_768,
  'phi4:14b': 16_384,
  'gemma2:27b': 8_192,
  'codellama:13b': 16_384,
}

/**
 * Max output (completion) tokens per model.
 * This is separate from the context window (input limit).
 * Fixes: 400 error "max_tokens is too large" when default 32k exceeds model limit.
 */
const OPENAI_MAX_OUTPUT_TOKENS: Record<string, number> = {
  // OpenAI
  'gpt-4o': 16_384,
  'gpt-4o-mini': 16_384,
  'gpt-4.1': 32_768,
  'gpt-4.1-mini': 32_768,
  'gpt-4.1-nano': 32_768,
  'gpt-4-turbo': 4_096,
  'gpt-4': 4_096,
  'o3-mini': 100_000,
  'o4-mini': 100_000,
  'o3': 100_000,

  // DeepSeek
  'deepseek-chat': 8_192,
  'deepseek-reasoner': 32_768,

  // Groq
  'llama-3.3-70b-versatile': 32_768,
  'llama-3.1-8b-instant': 8_192,
  'mixtral-8x7b-32768': 32_768,

  // Mistral
  'mistral-large-latest': 32_768,
  'mistral-small-latest': 32_768,

  // Google (via OpenRouter)
  'google/gemini-2.0-flash': 8_192,
  'google/gemini-2.5-pro': 32_768,

  // Ollama local models (conservative safe defaults)
  'llama3.3:70b': 4_096,
  'llama3.1:8b': 4_096,
  'llama3.2:3b': 4_096,
  'qwen2.5-coder:32b': 8_192,
  'qwen2.5-coder:7b': 8_192,
  'deepseek-coder-v2:16b': 8_192,
  'deepseek-r1:14b': 8_192,
  'mistral:7b': 4_096,
  'phi4:14b': 4_096,
  'gemma2:27b': 4_096,
  'codellama:13b': 4_096,
}

function lookupByModel<T>(table: Record<string, T>, model: string): T | undefined {
  // Exact match first.
  if (table[model] !== undefined) return table[model]
  // Otherwise take the first key that prefixes the model name, so dated
  // variants like "gpt-4o-2024-11-20" resolve to the base "gpt-4o" entry.
  // Insertion order matters: more specific keys (e.g. 'gpt-4o') must come
  // before their prefixes (e.g. 'gpt-4') in the tables above.
  for (const key of Object.keys(table)) {
    if (model.startsWith(key)) return table[key]
  }
  return undefined
}

/**
 * Look up the context window for an OpenAI-compatible model.
 * Returns undefined if the model is not in the table.
 *
 * Falls back to prefix matching so dated variants like
 * "gpt-4o-2024-11-20" resolve to the base "gpt-4o" entry.
 */
export function getOpenAIContextWindow(model: string): number | undefined {
  return lookupByModel(OPENAI_CONTEXT_WINDOWS, model)
}

/**
 * Look up the max output tokens for an OpenAI-compatible model.
 * Returns undefined if the model is not in the table.
 */
export function getOpenAIMaxOutputTokens(model: string): number | undefined {
  return lookupByModel(OPENAI_MAX_OUTPUT_TOKENS, model)
}
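
A usage sketch of the two lookups (model ids are examples; unknown ids return undefined so callers keep their existing fallbacks):

    import { getOpenAIContextWindow, getOpenAIMaxOutputTokens } from './openaiContextWindows.js'

    getOpenAIContextWindow('gpt-4o')            // 128_000 (exact match)
    getOpenAIContextWindow('gpt-4o-2024-11-20') // 128_000 (prefix match on 'gpt-4o')
    getOpenAIMaxOutputTokens('o3-mini')         // 100_000
    getOpenAIContextWindow('my-custom-model')   // undefined, caller uses its default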