Files
orcs-code/src/utils/context.ts
Vasanth T aeaa658f77 fix: prevent infinite auto-compact loop for unknown 3P models (#635) (#636)
- Raise context window fallback from 8k to 128k for unknown OpenAI-compat models.
  The 8k fallback caused effective context (8k minus output reservation) to go
  negative, making auto-compact fire on every single message.
- Add safety floor in getEffectiveContextWindowSize(): effective context is
  always at least reservedTokensForSummary + 13k buffer, ensuring the
  auto-compact threshold stays positive.
- Add missing MiniMax model entries (M2.5, M2.5-highspeed, M2.1, M2.1-highspeed)
  all at 204,800 context / 131,072 max output per MiniMax docs.
- Add tests for MiniMax variants, 128k fallback, and autoCompact floor.

Fixes #635

Co-authored-by: root <root@vm7508.lumadock.com>
2026-04-13 02:03:02 +08:00

262 lines
8.5 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// biome-ignore-all assist/source/organizeImports: internal-only import markers must not be reordered
import { CONTEXT_1M_BETA_HEADER } from '../constants/betas.js'
import { getGlobalConfig } from './config.js'
import { isEnvTruthy } from './envUtils.js'
import { getCanonicalName } from './model/model.js'
import { getModelCapability } from './model/modelCapabilities.js'
import { getOpenAIContextWindow, getOpenAIMaxOutputTokens } from './model/openaiContextWindows.js'
// Model context window size (200k tokens for all models right now)
export const MODEL_CONTEXT_WINDOW_DEFAULT = 200_000
// Fallback context window for unknown 3P models. Must be large enough that
// the effective context (this minus output token reservation) stays positive,
// otherwise auto-compact fires on every message (issue #635).
export const OPENAI_FALLBACK_CONTEXT_WINDOW = 128_000
// Maximum output tokens for compact operations
export const COMPACT_MAX_OUTPUT_TOKENS = 20_000
// Default max output tokens when no model-specific limit applies
const MAX_OUTPUT_TOKENS_DEFAULT = 32_000
// Matching upper bound for the default above (see getModelMaxOutputTokens)
const MAX_OUTPUT_TOKENS_UPPER_LIMIT = 64_000
// Capped default for slot-reservation optimization. BQ p99 output = 4,911
// tokens, so 32k/64k defaults over-reserve 8-16× slot capacity. With the cap
// enabled, <1% of requests hit the limit; those get one clean retry at 64k
// (see query.ts max_output_tokens_escalate). Cap is applied in
// claude.ts:getMaxOutputTokensForModel to avoid the growthbook→betas→context
// import cycle.
export const CAPPED_DEFAULT_MAX_TOKENS = 8_000
// Retry ceiling used when a capped request hits its limit (see query.ts)
export const ESCALATED_MAX_TOKENS = 64_000
/**
 * Check if 1M context is disabled via environment variable.
 * Used by C4E admins to disable 1M context for HIPAA compliance.
 */
export function is1mContextDisabled(): boolean {
  const disableFlag = process.env.CLAUDE_CODE_DISABLE_1M_CONTEXT
  return isEnvTruthy(disableFlag)
}
/**
 * Whether the model id carries an explicit [1m] context suffix
 * (case-insensitive). Always false when 1M context is disabled
 * via environment variable.
 */
export function has1mContext(model: string): boolean {
  return !is1mContextDisabled() && /\[1m\]/i.test(model)
}
// @[MODEL LAUNCH]: Update this pattern if the new model supports 1M context
export function modelSupports1M(model: string): boolean {
  if (is1mContextDisabled()) {
    return false
  }
  // Canonical-name fragments of the 1M-capable model families
  const capableFamilies = ['claude-sonnet-4', 'opus-4-6']
  const canonical = getCanonicalName(model)
  return capableFamilies.some(fragment => canonical.includes(fragment))
}
/**
 * Resolve the context window size (in tokens) for a model id.
 *
 * Resolution order: internal env override → explicit [1m] suffix →
 * OpenAI-compatible provider table (128k fallback for unknown models) →
 * capability metadata → 1M beta header / experiment → ant model table →
 * 200k default.
 */
export function getContextWindowForModel(
  model: string,
  betas?: string[],
): number {
  // Internal-only env override. Takes precedence over all other resolution,
  // including 1M detection, so users can cap the effective context window for
  // local decisions (auto-compact, etc.) while still using a 1M-capable
  // endpoint.
  const rawOverride = process.env.CLAUDE_CODE_MAX_CONTEXT_TOKENS
  if (process.env.USER_TYPE === 'ant' && rawOverride) {
    const override = parseInt(rawOverride, 10)
    if (!isNaN(override) && override > 0) {
      return override
    }
  }
  // [1m] suffix — explicit client-side opt-in, respected over all detection
  if (has1mContext(model)) {
    return 1_000_000
  }
  // OpenAI-compatible provider — use known context windows for the model.
  // Unknown models get a conservative 128k default. This was previously 8k,
  // but that caused auto-compact to fire on every turn because the effective
  // context (8k minus output reservation) became negative (issue #635).
  const openAICompatFlags = [
    process.env.CLAUDE_CODE_USE_OPENAI,
    process.env.CLAUDE_CODE_USE_GEMINI,
    process.env.CLAUDE_CODE_USE_GITHUB,
    process.env.CLAUDE_CODE_USE_MISTRAL,
  ]
  if (openAICompatFlags.some(flag => isEnvTruthy(flag))) {
    const knownWindow = getOpenAIContextWindow(model)
    if (knownWindow !== undefined) {
      return knownWindow
    }
    console.error(
      `[context] Warning: model "${model}" not in context window table — using conservative 128k default. ` +
      'Add it to src/utils/model/openaiContextWindows.ts for accurate compaction.',
    )
    return OPENAI_FALLBACK_CONTEXT_WINDOW
  }
  // Capability metadata — trusted only for windows of at least 100k; clamped
  // to the 200k default when 1M is administratively disabled.
  const capability = getModelCapability(model)
  if (capability?.max_input_tokens && capability.max_input_tokens >= 100_000) {
    if (
      capability.max_input_tokens > MODEL_CONTEXT_WINDOW_DEFAULT &&
      is1mContextDisabled()
    ) {
      return MODEL_CONTEXT_WINDOW_DEFAULT
    }
    return capability.max_input_tokens
  }
  // 1M via beta header opt-in, or via the sonnet experiment treatment
  if (betas?.includes(CONTEXT_1M_BETA_HEADER) && modelSupports1M(model)) {
    return 1_000_000
  }
  if (getSonnet1mExpTreatmentEnabled(model)) {
    return 1_000_000
  }
  // Internal ant model table, when available
  if (process.env.USER_TYPE === 'ant') {
    const antModel = resolveAntModel(model)
    if (antModel?.contextWindow) {
      return antModel.contextWindow
    }
  }
  return MODEL_CONTEXT_WINDOW_DEFAULT
}
/**
 * Experiment gate: Sonnet 4.6 models get 1M context when the
 * 'coral_reef_sonnet' client-data flag is set. Does not apply when 1M is
 * disabled, or when the caller already opted in via an explicit [1m] suffix.
 */
export function getSonnet1mExpTreatmentEnabled(model: string): boolean {
  // Administratively disabled, or already opted in via [1m]
  if (is1mContextDisabled() || has1mContext(model)) {
    return false
  }
  // Only applies to sonnet 4.6
  if (!getCanonicalName(model).includes('sonnet-4-6')) {
    return false
  }
  const flag = getGlobalConfig().clientDataCache?.['coral_reef_sonnet']
  return flag === 'true'
}
/**
 * Calculate context window usage percentage from token usage data.
 *
 * @param currentUsage - Token counts (fresh input plus cache creation/read),
 *   or null when no usage data is available yet.
 * @param contextWindowSize - Total context window size in tokens.
 * @returns Used and remaining percentages, each clamped to 0–100, or null
 *   values if there is no usage data or the window size is not positive.
 */
export function calculateContextPercentages(
  currentUsage: {
    input_tokens: number
    cache_creation_input_tokens: number
    cache_read_input_tokens: number
  } | null,
  contextWindowSize: number,
): { used: number | null; remaining: number | null } {
  // No usage yet, or a degenerate window size. A zero/negative window would
  // otherwise produce NaN/Infinity percentages (NaN propagates through
  // Math.round/min/max), so report "unknown" instead of garbage.
  if (!currentUsage || contextWindowSize <= 0) {
    return { used: null, remaining: null }
  }
  const totalInputTokens =
    currentUsage.input_tokens +
    currentUsage.cache_creation_input_tokens +
    currentUsage.cache_read_input_tokens
  const usedPercentage = Math.round(
    (totalInputTokens / contextWindowSize) * 100,
  )
  // Clamp so over-full contexts read as exactly 100% used / 0% remaining
  const clampedUsed = Math.min(100, Math.max(0, usedPercentage))
  return {
    used: clampedUsed,
    remaining: 100 - clampedUsed,
  }
}
/**
 * Returns the model's default and upper limit for max output tokens.
 *
 * Resolution order: internal ant model table → OpenAI-compatible provider
 * table → canonical-name rules → capability metadata adjustment.
 */
export function getModelMaxOutputTokens(model: string): {
  default: number
  upperLimit: number
} {
  // Internal ant model table, when available
  if (process.env.USER_TYPE === 'ant') {
    const antModel = resolveAntModel(model.toLowerCase())
    if (antModel) {
      return {
        default: antModel.defaultMaxTokens ?? MAX_OUTPUT_TOKENS_DEFAULT,
        upperLimit: antModel.upperMaxTokensLimit ?? MAX_OUTPUT_TOKENS_UPPER_LIMIT,
      }
    }
  }
  // OpenAI-compatible provider — use known output limits to avoid 400 errors
  const openAICompatFlags = [
    process.env.CLAUDE_CODE_USE_OPENAI,
    process.env.CLAUDE_CODE_USE_GEMINI,
    process.env.CLAUDE_CODE_USE_GITHUB,
    process.env.CLAUDE_CODE_USE_MISTRAL,
  ]
  if (openAICompatFlags.some(flag => isEnvTruthy(flag))) {
    const knownMax = getOpenAIMaxOutputTokens(model)
    if (knownMax !== undefined) {
      return { default: knownMax, upperLimit: knownMax }
    }
  }
  const canonical = getCanonicalName(model)
  // Ordered rules — first match wins, so more specific names (e.g. 'opus-4-6')
  // must precede the broader fragments they contain (e.g. 'opus-4').
  const limitRules: Array<{
    matches: (name: string) => boolean
    default: number
    upperLimit: number
  }> = [
    { matches: n => n.includes('opus-4-6'), default: 64_000, upperLimit: 128_000 },
    { matches: n => n.includes('sonnet-4-6'), default: 32_000, upperLimit: 128_000 },
    {
      matches: n =>
        n.includes('opus-4-5') || n.includes('sonnet-4') || n.includes('haiku-4'),
      default: 32_000,
      upperLimit: 64_000,
    },
    { matches: n => n.includes('opus-4-1') || n.includes('opus-4'), default: 32_000, upperLimit: 32_000 },
    { matches: n => n.includes('claude-3-opus'), default: 4_096, upperLimit: 4_096 },
    { matches: n => n.includes('claude-3-sonnet'), default: 8_192, upperLimit: 8_192 },
    { matches: n => n.includes('claude-3-haiku'), default: 4_096, upperLimit: 4_096 },
    { matches: n => n.includes('3-5-sonnet') || n.includes('3-5-haiku'), default: 8_192, upperLimit: 8_192 },
    { matches: n => n.includes('3-7-sonnet'), default: 32_000, upperLimit: 64_000 },
  ]
  const rule = limitRules.find(r => r.matches(canonical))
  let defaultTokens = rule?.default ?? MAX_OUTPUT_TOKENS_DEFAULT
  let upperLimit = rule?.upperLimit ?? MAX_OUTPUT_TOKENS_UPPER_LIMIT
  // Capability metadata can raise (or lower) the ceiling; the default never
  // exceeds the ceiling.
  const capability = getModelCapability(model)
  if (capability?.max_tokens && capability.max_tokens >= 4_096) {
    upperLimit = capability.max_tokens
    defaultTokens = Math.min(defaultTokens, upperLimit)
  }
  return { default: defaultTokens, upperLimit }
}
/**
 * Returns the max thinking budget tokens for a given model. The max
 * thinking tokens should be strictly less than the max output tokens.
 *
 * @deprecated Newer models use adaptive thinking rather than a strict
 * thinking token budget.
 */
export function getMaxThinkingTokensForModel(model: string): number {
  const { upperLimit } = getModelMaxOutputTokens(model)
  return upperLimit - 1
}