diff --git a/.env.example b/.env.example index 42555f2b..14192e8d 100644 --- a/.env.example +++ b/.env.example @@ -299,6 +299,20 @@ ANTHROPIC_API_KEY=sk-ant-your-key-here # Useful for users who want full transparency over what the model sees # OPENCLAUDE_DISABLE_TOOL_REMINDERS=1 +# Log structured per-request token usage (including cache metrics) to stderr. +# Useful for auditing cache hit rate / debugging cost spikes outside the REPL. +# Any truthy value enables it ("verbose", "1", "true"). +# +# Complements (does NOT replace) CLAUDE_CODE_ENABLE_TOKEN_USAGE_ATTACHMENT — +# they serve different audiences: +# - OPENCLAUDE_LOG_TOKEN_USAGE is user-facing: one JSON line per API +# request on stderr, intended for humans inspecting cost/caching. +# - CLAUDE_CODE_ENABLE_TOKEN_USAGE_ATTACHMENT is model-facing: injects +# a context-usage attachment INTO the prompt so the model can reason +# about its own remaining context. Does not touch stderr. +# Turn on whichever audience you're debugging; both can run together. +# OPENCLAUDE_LOG_TOKEN_USAGE=verbose + # Custom timeout for API requests in milliseconds (default: varies) # API_TIMEOUT_MS=60000 diff --git a/docs/advanced-setup.md b/docs/advanced-setup.md index 5d8939dc..ff2513f8 100644 --- a/docs/advanced-setup.md +++ b/docs/advanced-setup.md @@ -177,6 +177,7 @@ export OPENAI_MODEL=gpt-4o | `CODEX_AUTH_JSON_PATH` | Codex only | Path to a Codex CLI `auth.json` file | | `CODEX_HOME` | Codex only | Alternative Codex home directory | | `OPENCLAUDE_DISABLE_CO_AUTHORED_BY` | No | Suppress the default `Co-Authored-By` trailer in generated git commits | +| `OPENCLAUDE_LOG_TOKEN_USAGE` | No | When truthy (e.g. `verbose`), emits one JSON line on stderr per API request with input/output/cache tokens and the resolved provider. **User-facing debug output** — complements the REPL display controlled by `/config showCacheStats`. Distinct from `CLAUDE_CODE_ENABLE_TOKEN_USAGE_ATTACHMENT`, which is **model-facing** (injects context usage info into the prompt itself). Both can run together. | You can also use `ANTHROPIC_MODEL` to override the model name. `OPENAI_MODEL` takes priority. diff --git a/src/commands.ts b/src/commands.ts index e43e73b5..0a55d993 100644 --- a/src/commands.ts +++ b/src/commands.ts @@ -34,6 +34,7 @@ import installGitHubApp from './commands/install-github-app/index.js' import installSlackApp from './commands/install-slack-app/index.js' import breakCache from './commands/break-cache/index.js' import cacheProbe from './commands/cache-probe/index.js' +import cacheStats from './commands/cacheStats/index.js' import mcp from './commands/mcp/index.js' import mobile from './commands/mobile/index.js' import onboarding from './commands/onboarding/index.js' @@ -271,6 +272,7 @@ const COMMANDS = memoize((): Command[] => [ branch, btw, cacheProbe, + cacheStats, chrome, clear, color, diff --git a/src/commands/cacheStats/cacheStats.test.ts b/src/commands/cacheStats/cacheStats.test.ts new file mode 100644 index 00000000..a508344c --- /dev/null +++ b/src/commands/cacheStats/cacheStats.test.ts @@ -0,0 +1,157 @@ +/** + * Tests for `/cache-stats` command rendering. + * + * The command has non-trivial string formatting (timestamp slicing, model + * label padding, conditional N/A footnote, recent-rows cap) which can + * silently regress — these snapshot tests keep it honest. 
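+ *
+ * For orientation, the rendered output these tests pin down looks roughly
+ * like this (illustrative sketch with made-up numbers — exact spacing
+ * comes from the padEnd/padStart calls in cacheStats.ts):
+ *
+ *   Cache stats
+ *
+ *   Current turn:     [Cache: read=1.2k created=0 hit=60%]
+ *   Session total:    [Cache: read=5.4k created=250 hit=54%]
+ *
+ *   Recent requests (20 of 25, 5 older omitted):
+ *      #   time                model                        cache
+ *      #  6 2026-04-01 09:15:42 claude-sonnet-4              [Cache: 1.2k read • hit 60%]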
+ */
+import { beforeEach, describe, expect, test } from 'bun:test'
+import type { CacheMetrics } from '../../services/api/cacheMetrics.js'
+import {
+  _setHistoryCapForTesting,
+  recordRequest,
+  resetSessionCacheStats,
+} from '../../services/api/cacheStatsTracker.js'
+import { call } from './cacheStats.js'
+
+function supported(partial: Partial<CacheMetrics>): CacheMetrics {
+  return {
+    read: 0,
+    created: 0,
+    total: 0,
+    hitRate: null,
+    supported: true,
+    ...partial,
+  }
+}
+
+const UNSUPPORTED: CacheMetrics = {
+  read: 0,
+  created: 0,
+  total: 0,
+  hitRate: null,
+  supported: false,
+}
+
+// The command signature requires a LocalJSXCommandContext. Our command
+// doesn't actually read it — we pass an empty stand-in so the test can
+// invoke call() without dragging the whole REPL context in.
+const EMPTY_CTX = {} as Parameters<typeof call>[1]
+
+// /cache-stats always returns a text result. Narrow the union here so
+// the assertions don't need to redo the discriminant check every call.
+async function runCommand(): Promise<string> {
+  const result = await call('', EMPTY_CTX)
+  if (result.type !== 'text') {
+    throw new Error(
+      `cacheStats command must return type:'text', got ${result.type}`,
+    )
+  }
+  return result.value
+}
+
+beforeEach(() => {
+  resetSessionCacheStats()
+  _setHistoryCapForTesting(500)
+})
+
+describe('/cache-stats — empty session', () => {
+  test('shows friendly "no requests yet" message', async () => {
+    const value = await runCommand()
+    expect(value).toContain('No API requests yet this session')
+    expect(value).toContain('/cache-stats')
+  })
+})
+
+describe('/cache-stats — supported-only session', () => {
+  test('renders Cache stats header, turn and session summaries', async () => {
+    recordRequest(
+      supported({ read: 500, total: 1_000, hitRate: 0.5 }),
+      'claude-sonnet-4',
+    )
+    const value = await runCommand()
+    expect(value).toContain('Cache stats')
+    expect(value).toContain('Current turn:')
+    expect(value).toContain('Session total:')
+    // Compact metric line should appear in the recent-requests table.
+    expect(value).toContain('claude-sonnet-4')
+    expect(value).toContain('read')
+  })
+
+  test('omits the N/A footnote when every row is supported', async () => {
+    recordRequest(supported({ read: 200, total: 400, hitRate: 0.5 }), 'model-A')
+    const value = await runCommand()
+    expect(value).not.toContain('N/A rows')
+  })
+})
+
+describe('/cache-stats — mixed supported + unsupported', () => {
+  test('renders N/A footnote when any row is unsupported', async () => {
+    recordRequest(UNSUPPORTED, 'gpt-4-copilot')
+    recordRequest(
+      supported({ read: 100, total: 500, hitRate: 0.2 }),
+      'claude-sonnet-4',
+    )
+    const value = await runCommand()
+    expect(value).toContain(
+      'N/A rows: provider API does not expose cache usage',
+    )
+    expect(value).toContain('GitHub Copilot')
+    expect(value).toContain('Ollama')
+  })
+})
+
+describe('/cache-stats — recent-rows cap', () => {
+  test('caps the breakdown at 20 rows and reports omitted count', async () => {
+    for (let i = 0; i < 25; i++) {
+      recordRequest(
+        supported({ read: i, total: 100, hitRate: i / 100 }),
+        `model-${i}`,
+      )
+    }
+    const value = await runCommand()
+    // 20 shown, 5 omitted from the oldest end.
+    expect(value).toContain('(20 of 25, 5 older omitted)')
+    // Oldest rows (model-0..model-4) should not appear; newest must.
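+    // ('model-0 ' below carries a trailing space so the assertion targets
+    // the padded table cell and can't be tripped by a longer id that
+    // merely starts with "model-0".)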
+ expect(value).toContain('model-24') + expect(value).not.toContain('model-0 ') + }) + + test('does not mention "older omitted" when all rows fit', async () => { + for (let i = 0; i < 5; i++) { + recordRequest(supported({ read: i, total: 10 }), `m${i}`) + } + const value = await runCommand() + expect(value).not.toContain('older omitted') + expect(value).toContain('(5)') + }) +}) + +describe('/cache-stats — model label rendering', () => { + test('truncates long model labels to fit the column width', async () => { + // cacheStats.ts pads+slices the label to 28 chars for alignment. + const longLabel = 'some-extremely-long-model-identifier-that-wraps' + recordRequest(supported({ read: 10, total: 100, hitRate: 0.1 }), longLabel) + const value = await runCommand() + // Sliced to 28 chars. + expect(value).toContain(longLabel.slice(0, 28)) + // And the full string should NOT appear (would mean no truncation). + expect(value).not.toContain(longLabel) + }) +}) + +describe('/cache-stats — timestamp rendering', () => { + test('renders each row with full date and time (YYYY-MM-DD HH:MM:SS)', async () => { + recordRequest(supported({ read: 5, total: 10, hitRate: 0.5 }), 'claude-x') + const value = await runCommand() + // Match the full ISO-ish date + time the row uses. We assert the shape, + // not a specific timestamp — real clock is used, so a regex on the + // format is the right assertion. + expect(value).toMatch(/\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}/) + // Bare time-of-day alone (no date) should NOT appear in isolation — it + // must always be preceded by the date. Guards against regression if + // someone shortens the formatter again. + const timeOnlyInRow = /\n\s*#\s*\d+\s+\d{2}:\d{2}:\d{2}\s/.test(value) + expect(timeOnlyInRow).toBe(false) + }) +}) diff --git a/src/commands/cacheStats/cacheStats.ts b/src/commands/cacheStats/cacheStats.ts new file mode 100644 index 00000000..ebd32ab2 --- /dev/null +++ b/src/commands/cacheStats/cacheStats.ts @@ -0,0 +1,74 @@ +import { + getCacheStatsHistory, + getCurrentTurnCacheMetrics, + getSessionCacheMetrics, + type CacheStatsEntry, +} from '../../services/api/cacheStatsTracker.js' +import { + formatCacheMetricsCompact, + formatCacheMetricsFull, + type CacheMetrics, +} from '../../services/api/cacheMetrics.js' +import type { LocalCommandCall } from '../../types/command.js' + +// Cap the per-request breakdown to keep output readable. Users wanting +// the full history can rely on OPENCLAUDE_LOG_TOKEN_USAGE=verbose for +// structured per-request stderr output. +const MAX_RECENT_ROWS = 20 + +function formatRow(entry: CacheStatsEntry, idx: number): string { + // `YYYY-MM-DD HH:MM:SS` — long-running sessions can span midnight and a + // bare time-of-day makes the wrong row look "most recent" when two + // entries on different days share the same HH:MM:SS. 
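+  // e.g. "2026-04-01T09:15:42.123Z" → "2026-04-01 09:15:42".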
+ const iso = new Date(entry.timestamp).toISOString() + const ts = `${iso.slice(0, 10)} ${iso.slice(11, 19)}` + const line = formatCacheMetricsCompact(entry.metrics) + return ` #${String(idx + 1).padStart(3)} ${ts} ${entry.label.padEnd(28).slice(0, 28)} ${line}` +} + +function summarize(label: string, m: CacheMetrics): string { + return `${label.padEnd(18)}${formatCacheMetricsFull(m)}` +} + +export const call: LocalCommandCall = async () => { + const history = getCacheStatsHistory() + const session = getSessionCacheMetrics() + const turn = getCurrentTurnCacheMetrics() + + if (history.length === 0) { + return { + type: 'text', + value: + 'Cache stats\n No API requests yet this session.\n Start a turn and re-run /cache-stats to see results.', + } + } + + const recent = history.slice(-MAX_RECENT_ROWS) + const omitted = history.length - recent.length + + const lines: string[] = ['Cache stats', ''] + lines.push(summarize('Current turn:', turn)) + lines.push(summarize('Session total:', session)) + lines.push('') + lines.push(`Recent requests (${recent.length}${omitted > 0 ? ` of ${history.length}, ${omitted} older omitted` : ''}):`) + lines.push(` # time model cache`) + for (const [i, entry] of recent.entries()) { + lines.push(formatRow(entry, history.length - recent.length + i)) + } + + // Honesty footnote — providers without cache reporting (vanilla Copilot, + // Ollama) show [Cache: N/A] rather than a fake 0%. Tell the user so they + // don't read "N/A" as "broken". + const hasUnsupported = recent.some((e) => !e.metrics.supported) + if (hasUnsupported) { + lines.push('') + lines.push( + ' N/A rows: provider API does not expose cache usage (GitHub Copilot, Ollama).', + ) + lines.push( + ' The request still ran normally — only the metric is unavailable.', + ) + } + + return { type: 'text', value: lines.join('\n') } +} diff --git a/src/commands/cacheStats/index.ts b/src/commands/cacheStats/index.ts new file mode 100644 index 00000000..aaf1f273 --- /dev/null +++ b/src/commands/cacheStats/index.ts @@ -0,0 +1,24 @@ +/** + * /cache-stats — per-session cache diagnostics. + * + * Always-on diagnostic command (no toggle) that surfaces the metrics + * tracked in `cacheStatsTracker.ts`. Breaks cache usage down by request + * and also reports the session-wide aggregate — useful when the user + * suspects a cache bust (e.g. after /reload-plugins) and wants to see + * whether recent turns still hit the cache. + * + * Lazy-loaded (implementation in cacheStats.ts) to keep startup time + * minimal — same pattern used by /cost and /cache-probe. + */ +import type { Command } from '../../commands.js' + +const cacheStats = { + type: 'local', + name: 'cache-stats', + description: + 'Show per-turn and session cache hit/miss stats (works across all providers)', + supportsNonInteractive: true, + load: () => import('./cacheStats.js'), +} satisfies Command + +export default cacheStats diff --git a/src/components/Settings/Config.tsx b/src/components/Settings/Config.tsx index 31ed3ee4..98ffe5ad 100644 --- a/src/components/Settings/Config.tsx +++ b/src/components/Settings/Config.tsx @@ -299,6 +299,26 @@ export function Config({ enabled: toolHistoryCompressionEnabled }); } + }, { + id: 'showCacheStats', + label: 'Cache stats display', + value: globalConfig.showCacheStats, + options: ['off', 'compact', 'full'], + type: 'enum' as const, + onChange(mode: string) { + const showCacheStats = (mode === 'off' || mode === 'compact' || mode === 'full' ? 
mode : 'compact') as 'off' | 'compact' | 'full';
+      saveGlobalConfig(current_cs => ({
+        ...current_cs,
+        showCacheStats
+      }));
+      setGlobalConfig({
+        ...getGlobalConfig(),
+        showCacheStats
+      });
+      logEvent('tengu_show_cache_stats_setting_changed', {
+        mode: showCacheStats as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS
+      });
+    }
  }, {
    id: 'spinnerTipsEnabled',
    label: 'Show tips',
diff --git a/src/cost-tracker.cacheIntegration.test.ts b/src/cost-tracker.cacheIntegration.test.ts
new file mode 100644
index 00000000..528af3f9
--- /dev/null
+++ b/src/cost-tracker.cacheIntegration.test.ts
@@ -0,0 +1,128 @@
+/**
+ * Integration test for cost-tracker → cacheStatsTracker wiring.
+ *
+ * The unit tests in services/api/cacheMetrics.test.ts and
+ * services/api/cacheStatsTracker.test.ts verify that each piece works
+ * in isolation. This file verifies that they're ACTUALLY CONNECTED —
+ * that `addToTotalSessionCost` resolves the provider, extracts metrics,
+ * and records them on the tracker on every call. Without this test, a
+ * future refactor could silently unwire the call chain (wrong param
+ * order, renamed symbol, removed call) and every individual unit test
+ * would still pass while `/cache-stats` showed empty data.
+ *
+ * We use real state — `resetCostState` + `getCurrentTurnCacheMetrics` —
+ * rather than mocking the tracker module. Fewer moving parts, and the
+ * test fails for the right reason if anyone breaks the wrapping.
+ */
+import { beforeEach, describe, expect, test } from 'bun:test'
+import { addToTotalSessionCost, resetCostState } from './cost-tracker.js'
+import {
+  getCurrentTurnCacheMetrics,
+  getSessionCacheMetrics,
+} from './services/api/cacheStatsTracker.js'
+
+// BetaUsage-compatible shape — minimum fields addToTotalSessionCost
+// needs to run without throwing. Cache fields are the ones we care
+// about here; input/output go into model cost calc.
+function anthropicUsage(partial: {
+  input?: number
+  output?: number
+  cacheRead?: number
+  cacheCreation?: number
+}): Parameters<typeof addToTotalSessionCost>[1] {
+  return {
+    input_tokens: partial.input ?? 0,
+    output_tokens: partial.output ?? 0,
+    cache_read_input_tokens: partial.cacheRead ?? 0,
+    cache_creation_input_tokens: partial.cacheCreation ?? 0,
+    // BetaUsage has several other optional fields; they're not read by
+    // the cache-tracking path so we leave them undefined.
+  } as Parameters<typeof addToTotalSessionCost>[1]
+}
+
+beforeEach(() => {
+  // resetCostState is the wrapped version that ALSO clears the cache
+  // tracker — this line is itself part of what we're verifying. 
+ resetCostState() +}) + +describe('addToTotalSessionCost → cacheStatsTracker wiring', () => { + test('records normalized cache metrics on the tracker for each call', () => { + addToTotalSessionCost( + 0.01, + anthropicUsage({ + input: 200, + output: 50, + cacheRead: 800, + cacheCreation: 100, + }), + 'claude-sonnet-4', + ) + + const turn = getCurrentTurnCacheMetrics() + expect(turn.supported).toBe(true) + expect(turn.read).toBe(800) + expect(turn.created).toBe(100) + // total = fresh(200) + read(800) + created(100) = 1100 + expect(turn.total).toBe(1_100) + // hitRate = read / total = 800 / 1100 ≈ 0.727 + expect(turn.hitRate).toBeCloseTo(800 / 1_100, 4) + }) + + test('session aggregate accumulates across multiple API calls', () => { + addToTotalSessionCost( + 0.01, + anthropicUsage({ input: 100, cacheRead: 400 }), + 'claude-sonnet-4', + ) + addToTotalSessionCost( + 0.02, + anthropicUsage({ input: 200, cacheRead: 600 }), + 'claude-sonnet-4', + ) + + const session = getSessionCacheMetrics() + expect(session.read).toBe(1_000) + // total = (100+400) + (200+600) = 1300 + expect(session.total).toBe(1_300) + expect(session.hitRate).toBeCloseTo(1_000 / 1_300, 4) + }) + + test('cold turn (no cache read/created) still records as supported', () => { + addToTotalSessionCost( + 0.005, + anthropicUsage({ input: 500, output: 100 }), + 'claude-sonnet-4', + ) + + const turn = getCurrentTurnCacheMetrics() + expect(turn.supported).toBe(true) + expect(turn.read).toBe(0) + expect(turn.created).toBe(0) + expect(turn.total).toBe(500) + // hitRate computed against a non-zero total is 0, not null — empty + // cache on a cacheable provider is a legitimate "no-hit" signal. + expect(turn.hitRate).toBe(0) + }) +}) + +describe('resetCostState wrapper also clears cache tracker', () => { + test('resetCostState() zeros both cost counters and cache stats', () => { + // Populate both systems + addToTotalSessionCost( + 0.01, + anthropicUsage({ input: 100, cacheRead: 500 }), + 'claude-sonnet-4', + ) + expect(getSessionCacheMetrics().read).toBe(500) + + // resetCostState is the WRAPPED version — bootstrap's + // resetCostState cleared cost state historically but not cache + // stats. The wrapper in cost-tracker.ts adds the second call. + resetCostState() + + const session = getSessionCacheMetrics() + expect(session.read).toBe(0) + expect(session.supported).toBe(false) + }) +}) diff --git a/src/cost-tracker.ts b/src/cost-tracker.ts index 56920c5a..1e3f123a 100644 --- a/src/cost-tracker.ts +++ b/src/cost-tracker.ts @@ -1,5 +1,14 @@ import type { BetaUsage as Usage } from '@anthropic-ai/sdk/resources/beta/messages/messages.mjs' import chalk from 'chalk' +import { + extractCacheMetrics, + resolveCacheProvider, +} from './services/api/cacheMetrics.js' +import { + recordRequest as recordCacheRequest, + resetSessionCacheStats, +} from './services/api/cacheStatsTracker.js' +import { getAPIProvider, isGithubNativeAnthropicMode } from './utils/model/providers.js' import { addToTotalCostState, addToTotalLinesChanged, @@ -22,7 +31,7 @@ import { getTotalWebSearchRequests, getUsageForModel, hasUnknownModelCost, - resetCostState, + resetCostState as baseResetCostState, resetStateForTests, setCostStateForRestore, setHasUnknownModelCost, @@ -62,12 +71,22 @@ export { formatCost, hasUnknownModelCost, resetStateForTests, - resetCostState, setHasUnknownModelCost, getModelUsage, getUsageForModel, } +/** + * Wraps bootstrap's resetCostState() so /clear, /compact and session + * switches zero the cache-stats tracker alongside the cost counters. 
* Exported under the same name so existing callers pick up the cache
+ * reset without any call-site changes.
+ */
+export function resetCostState(): void {
+  baseResetCostState()
+  resetSessionCacheStats()
+}
+
 type StoredCostState = {
   totalCostUSD: number
   totalAPIDuration: number
@@ -251,6 +270,16 @@ function round(number: number, precision: number): number {
   return Math.round(number * precision) / precision
 }
 
+// Env-gated verbose token usage log. Treated as a boolean regardless of
+// value specifics — any truthy-ish string switches it on. `verbose` is the
+// documented keyword but we accept `1`/`true` for ergonomic parity with
+// other OPENCLAUDE_* flags.
+function shouldLogTokenUsageVerbose(): boolean {
+  const v = (process.env.OPENCLAUDE_LOG_TOKEN_USAGE ?? '').trim().toLowerCase()
+  if (!v) return false
+  return v !== '0' && v !== 'false' && v !== 'off'
+}
+
 function addToTotalModelUsage(
   cost: number,
   usage: Usage,
@@ -287,6 +316,43 @@ export function addToTotalSessionCost(
   const modelUsage = addToTotalModelUsage(cost, usage, model)
   addToTotalCostState(cost, modelUsage, model)
 
+  // Record normalized cache metrics for REPL display + /cache-stats.
+  // Resolved from the current process provider — at this point `usage` has
+  // already been Anthropic-shaped by the shim layer, so we feed the
+  // corresponding bucket (anthropic / copilot-claude / openai-like) to the
+  // extractor. For providers that genuinely don't report cache data
+  // (vanilla Copilot, Ollama), resolveCacheProvider steers us to
+  // supported:false so the UI shows "N/A" instead of lying with "0%".
+  const cacheProvider = resolveCacheProvider(getAPIProvider(), {
+    githubNativeAnthropic: isGithubNativeAnthropicMode(model),
+    openAiBaseUrl: process.env.OPENAI_BASE_URL ?? process.env.OPENAI_API_BASE,
+  })
+  const cacheMetrics = extractCacheMetrics(
+    usage as unknown as Record<string, unknown>,
+    cacheProvider,
+  )
+  recordCacheRequest(cacheMetrics, model)
+
+  // Opt-in structured per-request debug log on stderr. Power-user knob, not
+  // shown in the REPL — complements CLAUDE_CODE_ENABLE_TOKEN_USAGE_ATTACHMENT
+  // (which is model-facing). Any value except "0"/"false"/"off" enables it.
+  if (shouldLogTokenUsageVerbose()) {
+    process.stderr.write(
+      JSON.stringify({
+        tag: 'openclaude.tokenUsage',
+        model,
+        provider: cacheProvider,
+        input_tokens: usage.input_tokens,
+        output_tokens: usage.output_tokens,
+        cache_read_input_tokens: usage.cache_read_input_tokens ?? 0,
+        cache_creation_input_tokens: usage.cache_creation_input_tokens ?? 0,
+        cache_supported: cacheMetrics.supported,
+        cache_hit_rate: cacheMetrics.hitRate,
+        cost_usd: cost,
+      }) + '\n',
+    )
+  }
+
   const attrs = isFastModeEnabled() && usage.speed === 'fast' ? 
{ model, speed: 'fast' } diff --git a/src/screens/REPL.tsx b/src/screens/REPL.tsx index d3e3f0f9..72ddec53 100644 --- a/src/screens/REPL.tsx +++ b/src/screens/REPL.tsx @@ -133,6 +133,8 @@ import { hasConsoleBillingAccess } from '../utils/billing.js'; import { logEvent, type AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS } from 'src/services/analytics/index.js'; import { getFeatureValue_CACHED_MAY_BE_STALE } from 'src/services/analytics/growthbook.js'; import { textForResubmit, handleMessageFromStream, type StreamingToolUse, type StreamingThinking, isCompactBoundaryMessage, getMessagesAfterCompactBoundary, getContentText, createUserMessage, createAssistantMessage, createTurnDurationMessage, createAgentsKilledMessage, createApiMetricsMessage, createSystemMessage, createCommandInputMessage, formatCommandInputTags } from '../utils/messages.js'; +import { getCurrentTurnCacheMetrics, resetCurrentTurn } from '../services/api/cacheStatsTracker.js'; +import { formatCacheMetricsCompact, formatCacheMetricsFull } from '../services/api/cacheMetrics.js'; import { generateSessionTitle } from '../utils/sessionTitle.js'; import { BASH_INPUT_TAG, COMMAND_MESSAGE_TAG, COMMAND_NAME_TAG, LOCAL_COMMAND_STDOUT_TAG } from '../constants/xml.js'; import { escapeXml } from '../utils/xml.js'; @@ -2921,6 +2923,13 @@ export function REPL({ // isLoading is derived from queryGuard — tryStart() above already // transitioned dispatching→running, so no setter call needed here. resetTimingRefs(); + // Start-of-turn cache tracker reset. The end-of-turn path at the + // bottom of this function already resets, but mirror the call here + // so a turn that never reaches end-of-turn (crash, unhandled + // rejection, process exit) still starts clean on the next one. + // Idempotent with respect to the end-of-turn reset — double-reset + // is a no-op. + resetCurrentTurn(); setMessages(oldMessages => [...oldMessages, ...newMessages]); responseLengthRef.current = 0; if (feature('TOKEN_BUDGET')) { @@ -3019,6 +3028,38 @@ export function REPL({ setMessages(prev => [...prev, createTurnDurationMessage(turnDurationMs, budgetInfo, count(prev, isLoggableMessage))]); } } + // Cache stats line — controlled by `/config showCacheStats`. Shows + // per-query read/hit stats using the provider-normalized metrics + // from cacheStatsTracker. 'off' skips, 'compact' gives a one-liner, + // 'full' gives a breakdown. Display is skipped when the user + // aborted or proactive mode is active — but the counter reset + // below still runs in those cases. + if (!abortController.signal.aborted && !proactiveActive) { + // Defensive default: config layer already merges 'compact' from + // DEFAULT_GLOBAL_CONFIG (see config.ts:1494) for configs that + // predate this feature, so `mode` should always be defined. + // The `?? 'compact'` fallback covers pathological cases — a + // corrupt config read that returned an empty object, or a + // race between writer and reader — where the merge didn't + // land. Rendering the line is the safer failure mode than + // silently hiding it. + const mode = getGlobalConfig().showCacheStats ?? 'compact'; + if (mode !== 'off') { + const turnMetrics = getCurrentTurnCacheMetrics(); + // Skip rendering if the turn recorded no API activity at all — + // avoids a spurious "[Cache: cold]" on local-only commands. + if (turnMetrics.supported || turnMetrics.read > 0 || turnMetrics.total > 0) { + const line = mode === 'full' ? 
formatCacheMetricsFull(turnMetrics) : formatCacheMetricsCompact(turnMetrics); + setMessages(prev => [...prev, createSystemMessage(line, 'info')]); + } + } + } + // Reset turn counters UNCONDITIONALLY — users routinely interrupt + // (Ctrl+C) mid-turn, and if we kept the reset gated on + // !aborted, the in-flight turn's metrics would leak into the + // next turn's aggregate. Proactive turns also need the reset so + // their metrics don't pile onto the following user turn. + resetCurrentTurn(); // Clear the controller so CancelRequestHandler's canCancelRunningTask // reads false at the idle prompt. Without this, the stale non-aborted // controller makes ctrl+c fire onCancel() (aborting nothing) instead of diff --git a/src/services/api/cacheMetrics.test.ts b/src/services/api/cacheMetrics.test.ts new file mode 100644 index 00000000..893924c8 --- /dev/null +++ b/src/services/api/cacheMetrics.test.ts @@ -0,0 +1,782 @@ +import { expect, test, describe } from 'bun:test' +import { + extractCacheMetrics, + extractCacheReadFromRawUsage, + resolveCacheProvider, + formatCacheMetricsCompact, + formatCacheMetricsFull, + addCacheMetrics, +} from './cacheMetrics.js' + +describe('extractCacheMetrics — Anthropic (firstParty/bedrock/vertex/foundry)', () => { + test('reports read/created separately and computes hit rate over total input', () => { + const usage = { + input_tokens: 300, + output_tokens: 100, + cache_read_input_tokens: 800, + cache_creation_input_tokens: 200, + } + const m = extractCacheMetrics(usage, 'anthropic') + expect(m.supported).toBe(true) + expect(m.read).toBe(800) + expect(m.created).toBe(200) + // total = fresh(300) + created(200) + read(800) = 1300 + expect(m.total).toBe(1300) + expect(m.hitRate).toBeCloseTo(800 / 1300, 4) + }) + + test('returns cold metrics when no cache activity yet', () => { + const m = extractCacheMetrics({ input_tokens: 500 }, 'anthropic') + expect(m.supported).toBe(true) + expect(m.read).toBe(0) + expect(m.created).toBe(0) + expect(m.hitRate).toBe(0) + }) + + test('null hit rate when usage has no input at all', () => { + const m = extractCacheMetrics({}, 'anthropic') + expect(m.supported).toBe(true) + expect(m.hitRate).toBeNull() + }) +}) + +// NOTE: OpenAI/Codex/Kimi/DeepSeek/Gemini raw shapes are now tested through +// extractCacheReadFromRawUsage (below). extractCacheMetrics sees the +// post-shim Anthropic shape for every provider, so the tests here verify +// that the shape lookup works uniformly against the shimmed fields. + +describe('extractCacheMetrics — post-shim Anthropic shape (applies to all providers)', () => { + test('OpenAI post-shim (openai bucket) — reads Anthropic fields injected by convertChunkUsage', () => { + // This is what cost-tracker actually sees for OpenAI upstreams: the + // shim has already subtracted cached from prompt_tokens and moved it + // to cache_read_input_tokens. 
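+    // For contrast, the raw upstream payload would have looked roughly
+    // like this (hypothetical values, Chat Completions shape):
+    //   { prompt_tokens: 2000, completion_tokens: 300,
+    //     prompt_tokens_details: { cached_tokens: 1200 } }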
+ const shimmed = { + input_tokens: 800, // fresh = 2000 - 1200 + output_tokens: 300, + cache_creation_input_tokens: 0, + cache_read_input_tokens: 1_200, + } + const m = extractCacheMetrics(shimmed, 'openai') + expect(m.supported).toBe(true) + expect(m.read).toBe(1_200) + expect(m.created).toBe(0) + expect(m.total).toBe(2_000) // 800 fresh + 1200 read + expect(m.hitRate).toBe(0.6) + }) + + test('Codex post-shim — same Anthropic shape as OpenAI', () => { + const shimmed = { + input_tokens: 900, // 1500 - 600 + cache_creation_input_tokens: 0, + cache_read_input_tokens: 600, + } + const m = extractCacheMetrics(shimmed, 'codex') + expect(m.read).toBe(600) + expect(m.total).toBe(1_500) + expect(m.hitRate).toBe(0.4) + }) + + test('Kimi post-shim — shim moved top-level cached_tokens into Anthropic field', () => { + const shimmed = { + input_tokens: 600, // 1000 - 400 + cache_creation_input_tokens: 0, + cache_read_input_tokens: 400, + } + const m = extractCacheMetrics(shimmed, 'kimi') + expect(m.read).toBe(400) + expect(m.total).toBe(1_000) + expect(m.hitRate).toBe(0.4) + }) + + test('DeepSeek post-shim — hit moved to cache_read_input_tokens, miss to input_tokens', () => { + const shimmed = { + input_tokens: 300, // miss + cache_creation_input_tokens: 0, + cache_read_input_tokens: 700, // hit + } + const m = extractCacheMetrics(shimmed, 'deepseek') + expect(m.read).toBe(700) + expect(m.total).toBe(1_000) + expect(m.hitRate).toBe(0.7) + }) + + test('Gemini post-shim — cached_content_token_count moved to cache_read_input_tokens', () => { + const shimmed = { + input_tokens: 800, // 4000 - 3200 + cache_creation_input_tokens: 0, + cache_read_input_tokens: 3_200, + } + const m = extractCacheMetrics(shimmed, 'gemini') + expect(m.read).toBe(3_200) + expect(m.total).toBe(4_000) + expect(m.hitRate).toBe(0.8) + }) +}) + +describe('extractCacheReadFromRawUsage — single source of truth for shim layer', () => { + test('Anthropic-native passthrough: cache_read_input_tokens', () => { + expect( + extractCacheReadFromRawUsage({ cache_read_input_tokens: 1_500 }), + ).toBe(1_500) + }) + + test('OpenAI Chat Completions: prompt_tokens_details.cached_tokens', () => { + expect( + extractCacheReadFromRawUsage({ + prompt_tokens: 2_000, + prompt_tokens_details: { cached_tokens: 1_200 }, + }), + ).toBe(1_200) + }) + + test('Codex Responses API: input_tokens_details.cached_tokens', () => { + expect( + extractCacheReadFromRawUsage({ + input_tokens: 1_500, + input_tokens_details: { cached_tokens: 600 }, + }), + ).toBe(600) + }) + + test('Kimi / Moonshot: top-level cached_tokens', () => { + expect( + extractCacheReadFromRawUsage({ prompt_tokens: 1_000, cached_tokens: 400 }), + ).toBe(400) + }) + + test('DeepSeek: prompt_cache_hit_tokens', () => { + expect( + extractCacheReadFromRawUsage({ + prompt_cache_hit_tokens: 700, + prompt_cache_miss_tokens: 300, + }), + ).toBe(700) + }) + + test('Gemini: cached_content_token_count', () => { + expect( + extractCacheReadFromRawUsage({ + prompt_token_count: 4_000, + cached_content_token_count: 3_200, + }), + ).toBe(3_200) + }) + + test('no cache fields at all → 0 (Copilot/Ollama/unknown shape)', () => { + expect(extractCacheReadFromRawUsage({ prompt_tokens: 500 })).toBe(0) + }) + + test('Anthropic field wins over OpenAI field when both present', () => { + // Shouldn't happen in practice, but if usage was double-annotated we + // trust the Anthropic-native number (it's the more authoritative one). 
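+    // Mechanically this falls out of the fallback order inside
+    // extractCacheReadFromRawUsage: the Anthropic field is checked
+    // first, and the first non-zero match wins.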
+ expect( + extractCacheReadFromRawUsage({ + cache_read_input_tokens: 999, + prompt_tokens_details: { cached_tokens: 111 }, + }), + ).toBe(999) + }) + + test('null/undefined/non-object → 0', () => { + expect(extractCacheReadFromRawUsage(null)).toBe(0) + expect(extractCacheReadFromRawUsage(undefined)).toBe(0) + expect(extractCacheReadFromRawUsage('nope' as unknown as never)).toBe(0) + }) +}) + +describe('extractCacheMetrics — Copilot / Ollama (unsupported)', () => { + test('returns supported:false with all zeros and null hitRate for Copilot', () => { + const m = extractCacheMetrics({ prompt_tokens: 1000 }, 'copilot') + expect(m.supported).toBe(false) + expect(m.read).toBe(0) + expect(m.created).toBe(0) + expect(m.hitRate).toBeNull() + }) + + test('returns supported:false for Ollama', () => { + const m = extractCacheMetrics({ prompt_tokens: 42 }, 'ollama') + expect(m.supported).toBe(false) + expect(m.hitRate).toBeNull() + }) + + test('Copilot serving Claude (copilot-claude) is supported and uses Anthropic fields', () => { + const usage = { + input_tokens: 200, + cache_read_input_tokens: 800, + cache_creation_input_tokens: 100, + } + const m = extractCacheMetrics(usage, 'copilot-claude') + expect(m.supported).toBe(true) + expect(m.read).toBe(800) + expect(m.created).toBe(100) + expect(m.total).toBe(1_100) + }) +}) + +describe('extractCacheMetrics — bad/empty input', () => { + test('null usage returns unsupported', () => { + expect(extractCacheMetrics(null, 'anthropic').supported).toBe(false) + }) + + test('non-object usage returns unsupported', () => { + expect(extractCacheMetrics('oops' as unknown as never, 'openai').supported).toBe( + false, + ) + }) +}) + +describe('resolveCacheProvider', () => { + test('firstParty → anthropic', () => { + expect(resolveCacheProvider('firstParty')).toBe('anthropic') + }) + test('bedrock/vertex/foundry → anthropic', () => { + expect(resolveCacheProvider('bedrock')).toBe('anthropic') + expect(resolveCacheProvider('vertex')).toBe('anthropic') + expect(resolveCacheProvider('foundry')).toBe('anthropic') + }) + test('github without claude hint → copilot (unsupported)', () => { + expect(resolveCacheProvider('github')).toBe('copilot') + }) + test('github with claude hint → copilot-claude', () => { + expect( + resolveCacheProvider('github', { githubNativeAnthropic: true }), + ).toBe('copilot-claude') + }) + test('openai with localhost / loopback → self-hosted', () => { + // These used to return 'ollama'; the bucket is now 'self-hosted' + // because not every local OpenAI-compatible server is Ollama + // (could be vLLM, LM Studio, LocalAI, text-generation-webui). + // Both buckets collapse to supported=false downstream. + expect( + resolveCacheProvider('openai', { openAiBaseUrl: 'http://localhost:8080/v1' }), + ).toBe('self-hosted') + expect( + resolveCacheProvider('openai', { openAiBaseUrl: 'http://127.0.0.1:1234/v1' }), + ).toBe('self-hosted') + // Localhost:11434 hits the self-hosted branch first — 'ollama' only + // kicks in when the :11434 port appears on a public-looking URL + // (which would be unusual but still deserves honest classification). + expect( + resolveCacheProvider('openai', { openAiBaseUrl: 'http://localhost:11434/v1' }), + ).toBe('self-hosted') + expect( + resolveCacheProvider('openai', { openAiBaseUrl: 'http://[::1]:5000/v1' }), + ).toBe('self-hosted') + }) + + test('openai on RFC1918 private IP → self-hosted (pre-fix: misclassified as openai)', () => { + // These are the exact cases the reviewer flagged. 
Before this fix, + // a vLLM / LocalAI server on a LAN address fell through to the + // 'openai' branch and /cache-stats showed '[Cache: cold]' — which + // users read as "my cache is broken" when the provider simply + // didn't report cache fields. Now they land in 'self-hosted' and + // /cache-stats shows '[Cache: N/A]'. + expect( + resolveCacheProvider('openai', { openAiBaseUrl: 'http://192.168.1.50:8000/v1' }), + ).toBe('self-hosted') + expect( + resolveCacheProvider('openai', { openAiBaseUrl: 'http://10.0.0.7:8080/v1' }), + ).toBe('self-hosted') + expect( + resolveCacheProvider('openai', { openAiBaseUrl: 'http://172.20.0.3:5000/v1' }), + ).toBe('self-hosted') + }) + + test('openai on link-local / CGNAT → self-hosted', () => { + expect( + resolveCacheProvider('openai', { openAiBaseUrl: 'http://169.254.169.254/v1' }), + ).toBe('self-hosted') + expect( + resolveCacheProvider('openai', { openAiBaseUrl: 'http://100.64.1.5:8000/v1' }), + ).toBe('self-hosted') + }) + + test('openai on reserved TLD (.local / .internal / .lan / .home.arpa) → self-hosted', () => { + // Per RFC 6761 (.local/mDNS), RFC 8375 (.home.arpa), and widely + // used .internal / .lan conventions. These never resolve publicly. + expect( + resolveCacheProvider('openai', { openAiBaseUrl: 'http://llm.internal:5000/v1' }), + ).toBe('self-hosted') + expect( + resolveCacheProvider('openai', { openAiBaseUrl: 'http://llm.local:8080/v1' }), + ).toBe('self-hosted') + expect( + resolveCacheProvider('openai', { openAiBaseUrl: 'http://vllm.home.arpa/v1' }), + ).toBe('self-hosted') + expect( + resolveCacheProvider('openai', { openAiBaseUrl: 'http://box.lan:1234/v1' }), + ).toBe('self-hosted') + }) + + test('openai on IPv6 local / link-local → self-hosted', () => { + expect( + resolveCacheProvider('openai', { openAiBaseUrl: 'http://[fe80::1]:8000/v1' }), + ).toBe('self-hosted') + expect( + resolveCacheProvider('openai', { openAiBaseUrl: 'http://[fd12:3456::7]:8080/v1' }), + ).toBe('self-hosted') + expect( + resolveCacheProvider('openai', { openAiBaseUrl: 'http://[fc00::1]:8080/v1' }), + ).toBe('self-hosted') + }) + + test('IPv6 ULA prefix (fc/fd) does NOT over-match public hostnames', () => { + // Regression guard: an early version of isLocalOrPrivateUrl checked + // `h.startsWith('fc')` / `startsWith('fd')` without a colon guard, + // which misclassified legitimate public hosts whose names happen to + // begin with those letters. The fix requires a colon in the match + // so only real IPv6 literals hit the branch. + expect( + resolveCacheProvider('openai', { + openAiBaseUrl: 'https://fc-api.example.com/v1', + }), + ).toBe('openai') + expect( + resolveCacheProvider('openai', { + openAiBaseUrl: 'https://fd-hosted.example.com/v1', + }), + ).toBe('openai') + // Same goes for names that look like hex prefixes but aren't IPv6. + expect( + resolveCacheProvider('openai', { + openAiBaseUrl: 'https://fcbench.net/v1', + }), + ).toBe('openai') + }) + + test('openai with :11434 on a public host → ollama (default-port heuristic)', () => { + // Contrived but the heuristic should still fire — someone running + // Ollama behind a reverse proxy with port preserved. 
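+    // (The 'ollama' substring in the hostname is incidental — only the
+    // :11434 port drives this classification.)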
+ expect( + resolveCacheProvider('openai', { + openAiBaseUrl: 'https://ollama.example.com:11434/v1', + }), + ).toBe('ollama') + }) + + test('openai with moonshot URL → kimi', () => { + expect( + resolveCacheProvider('openai', { openAiBaseUrl: 'https://api.moonshot.ai/v1' }), + ).toBe('kimi') + }) + test('openai with deepseek URL → deepseek', () => { + expect( + resolveCacheProvider('openai', { openAiBaseUrl: 'https://api.deepseek.com/v1' }), + ).toBe('deepseek') + }) + test('private IP beats hosted-keyword matching (self-hosted takes priority)', () => { + // A pathological URL: a private-IP host whose path string contains + // "deepseek". Self-hosted detection must run FIRST so the URL + // classifies honestly — the path alone doesn't prove the upstream + // is the real DeepSeek API. + expect( + resolveCacheProvider('openai', { + openAiBaseUrl: 'http://10.0.0.5:8000/v1/deepseek-proxy', + }), + ).toBe('self-hosted') + }) + test('plain openai remains openai', () => { + expect( + resolveCacheProvider('openai', { openAiBaseUrl: 'https://api.openai.com/v1' }), + ).toBe('openai') + }) + test('unparseable base URL falls back to substring heuristic', () => { + // Bare host:port without a scheme is common in misconfigured env. + // We can't URL-parse it, but we still honor the "localhost" hint so + // a broken config doesn't silently masquerade as cache-capable. + expect( + resolveCacheProvider('openai', { openAiBaseUrl: 'localhost:8000' }), + ).toBe('self-hosted') + // An unparseable and opaque string falls through to plain 'openai' + // (best-effort — nothing we can infer from "foo-bar-baz"). + expect( + resolveCacheProvider('openai', { openAiBaseUrl: '???' }), + ).toBe('openai') + }) + test('empty base URL → plain openai', () => { + // No hint at all: assume the canonical api.openai.com. + expect(resolveCacheProvider('openai')).toBe('openai') + expect( + resolveCacheProvider('openai', { openAiBaseUrl: '' }), + ).toBe('openai') + }) + test('codex → codex', () => { + expect(resolveCacheProvider('codex')).toBe('codex') + }) + test('gemini → gemini', () => { + expect(resolveCacheProvider('gemini')).toBe('gemini') + }) +}) + +describe('resolveCacheProvider — .localhost TLD (RFC 6761)', () => { + test('subdomains of .localhost classify as self-hosted', () => { + // Chrome, Firefox, and systemd-resolved all natively resolve + // *.localhost to 127.0.0.1. Kubernetes Ingress and docker-compose + // setups commonly use app.localhost, api.localhost, etc. + expect( + resolveCacheProvider('openai', { + openAiBaseUrl: 'http://app.localhost:3000/v1', + }), + ).toBe('self-hosted') + expect( + resolveCacheProvider('openai', { + openAiBaseUrl: 'http://api.localhost/v1', + }), + ).toBe('self-hosted') + expect( + resolveCacheProvider('openai', { + openAiBaseUrl: 'http://llm.dev.localhost:8080/v1', + }), + ).toBe('self-hosted') + }) + + test('.localhost TLD does NOT match substring collisions', () => { + // Guard against regressions where `localhost` would match via + // substring rather than TLD semantics. `localhostify.com` and + // `mylocalhost.net` must stay on the public `openai` path. 
expect(
+      resolveCacheProvider('openai', {
+        openAiBaseUrl: 'https://localhostify.com/v1',
+      }),
+    ).toBe('openai')
+    expect(
+      resolveCacheProvider('openai', {
+        openAiBaseUrl: 'https://mylocalhost.net/v1',
+      }),
+    ).toBe('openai')
+  })
+})
+
+describe('extractCacheMetrics — hit rate clamp', () => {
+  test('hitRate is clamped to 1.0 on pathological input (read > total)', () => {
+    // Defensive guard: with valid non-negative inputs the math enforces
+    // read <= total, so hitRate cannot exceed 1. But an upstream shim
+    // bug (e.g. reading a negative `fresh` from a future provider) could
+    // break the invariant. `Math.min(1, read/total)` caps the display at
+    // 100% rather than letting a `read=800 total=500` case render as
+    // "hit 160%" or (worse) null, which would hide the anomaly.
+    const metrics = extractCacheMetrics(
+      {
+        cache_read_input_tokens: 800,
+        cache_creation_input_tokens: 0,
+        // asNumber keeps finite negatives, so fresh = -500 → total =
+        // 800 + 0 + (-500) = 300, read=800 → raw ratio 2.67, clamp to 1.
+        input_tokens: -500,
+      } as unknown as Record<string, unknown>,
+      'anthropic',
+    )
+    expect(metrics.supported).toBe(true)
+    expect(metrics.hitRate).toBe(1)
+  })
+
+  test('normal inputs still yield accurate fractional hit rates', () => {
+    // Regression: clamp must not perturb the happy path.
+    const metrics = extractCacheMetrics(
+      {
+        cache_read_input_tokens: 300,
+        cache_creation_input_tokens: 0,
+        input_tokens: 700,
+      },
+      'anthropic',
+    )
+    expect(metrics.hitRate).toBeCloseTo(0.3, 5)
+  })
+})
+
+describe('extractCacheMetrics — self-hosted bucket (data-driven)', () => {
+  test('vanilla self-hosted endpoint without cache fields → unsupported / N/A', () => {
+    // vLLM, LocalAI, text-generation-webui, etc. emit no cache fields
+    // at all. With read=created=0 we mark unsupported so the REPL shows
+    // honest '[Cache: N/A]' instead of a fabricated 0%.
+    const metrics = extractCacheMetrics(
+      { input_tokens: 1_000, output_tokens: 200 },
+      'self-hosted',
+    )
+    expect(metrics.supported).toBe(false)
+    expect(metrics.hitRate).toBeNull()
+    expect(metrics.read).toBe(0)
+    expect(metrics.created).toBe(0)
+  })
+
+  test('internal reverse proxy forwarding real cache data → supported', () => {
+    // Review-blocker regression guard: an enterprise setup with an
+    // internal proxy on a private URL (e.g. `http://llm.internal:5000/v1`)
+    // forwarding to OpenAI / Kimi / DeepSeek / Gemini WILL deliver real
+    // cache fields via the shim. Pre-fix we would discard them because
+    // the URL heuristic classified the endpoint as 'self-hosted'. Now
+    // the data itself decides: any non-zero cache activity flows through
+    // the same normalization as an OpenAI bucket.
+    const shimmed = {
+      input_tokens: 800, // fresh (post-shim, cached already subtracted)
+      cache_read_input_tokens: 1_200, // shim extracted from upstream
+      cache_creation_input_tokens: 0,
+    }
+    const metrics = extractCacheMetrics(shimmed, 'self-hosted')
+    expect(metrics.supported).toBe(true)
+    expect(metrics.read).toBe(1_200)
+    expect(metrics.total).toBe(2_000)
+    expect(metrics.hitRate).toBe(0.6)
+  })
+
+  test('proxy with cache_creation but zero cache_read → still supported', () => {
+    // Mirror of the above for the first-call / cold-cache scenario:
+    // Anthropic-compatible upstreams emit creation tokens on the first
+    // request that primes the cache. Self-hosted proxy must preserve
+    // that signal, not swallow it because read is still 0. 
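+    // Worked numbers: total = 500 fresh + 800 created = 1300, read = 0,
+    // so hitRate = 0/1300 = 0 — cold, but honestly supported.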
+ const shimmed = { + input_tokens: 500, + cache_read_input_tokens: 0, + cache_creation_input_tokens: 800, + } + const metrics = extractCacheMetrics(shimmed, 'self-hosted') + expect(metrics.supported).toBe(true) + expect(metrics.created).toBe(800) + expect(metrics.read).toBe(0) + }) +}) + +describe('formatCacheMetrics — defensive null/undefined guards', () => { + test('formatCacheMetricsCompact returns N/A for undefined input', () => { + // Signature says `CacheMetrics` but runtime bug on a failed API + // response could leave the caller with nothing. The formatter + // should degrade gracefully rather than throw on `.supported`. + expect(formatCacheMetricsCompact(undefined)).toBe('[Cache: N/A]') + expect(formatCacheMetricsCompact(null as unknown as undefined)).toBe( + '[Cache: N/A]', + ) + }) + + test('formatCacheMetricsFull returns N/A for undefined input', () => { + expect(formatCacheMetricsFull(undefined)).toBe('[Cache: N/A]') + expect(formatCacheMetricsFull(null as unknown as undefined)).toBe( + '[Cache: N/A]', + ) + }) +}) + +describe('formatCacheMetricsCompact — self-hosted display paths', () => { + test('vanilla self-hosted (no cache data) renders as N/A', () => { + const metrics = extractCacheMetrics( + { input_tokens: 500 }, + 'self-hosted', + ) + expect(formatCacheMetricsCompact(metrics)).toBe('[Cache: N/A]') + expect(formatCacheMetricsFull(metrics)).toBe('[Cache: N/A]') + }) + + test('self-hosted proxy with forwarded cache data renders real metrics', () => { + // Full display-path regression guard for the review-blocker fix: + // the user must see the real hit rate that the upstream emitted, + // not a silent N/A because the URL looked private. + const metrics = extractCacheMetrics( + { + input_tokens: 800, + cache_read_input_tokens: 1_200, + cache_creation_input_tokens: 0, + }, + 'self-hosted', + ) + expect(formatCacheMetricsCompact(metrics)).toBe('[Cache: 1.2k read • hit 60%]') + expect(formatCacheMetricsFull(metrics)).toBe( + '[Cache: read=1.2k created=0 hit=60%]', + ) + }) +}) + +describe('formatCacheMetricsCompact — snapshot-stable output', () => { + test('supported with reads shows "k" abbreviation and hit rate', () => { + const out = formatCacheMetricsCompact({ + read: 1_234, + created: 0, + total: 10_000, + hitRate: 0.1234, + supported: true, + }) + expect(out).toBe('[Cache: 1.2k read • hit 12%]') + }) + + test('supported with no cache activity renders "cold"', () => { + const out = formatCacheMetricsCompact({ + read: 0, + created: 0, + total: 500, + hitRate: 0, + supported: true, + }) + expect(out).toBe('[Cache: cold]') + }) + + test('unsupported renders "N/A"', () => { + const out = formatCacheMetricsCompact({ + read: 0, + created: 0, + total: 0, + hitRate: null, + supported: false, + }) + expect(out).toBe('[Cache: N/A]') + }) + + test('small numbers render without abbreviation', () => { + const out = formatCacheMetricsCompact({ + read: 42, + created: 0, + total: 100, + hitRate: 0.42, + supported: true, + }) + expect(out).toBe('[Cache: 42 read • hit 42%]') + }) +}) + +describe('formatCacheMetricsFull — snapshot-stable output', () => { + test('supported shows all fields', () => { + const out = formatCacheMetricsFull({ + read: 1_234, + created: 250, + total: 10_000, + hitRate: 0.1234, + supported: true, + }) + expect(out).toBe('[Cache: read=1.2k created=250 hit=12%]') + }) + + test('null hit rate renders n/a', () => { + const out = formatCacheMetricsFull({ + read: 0, + created: 0, + total: 0, + hitRate: null, + supported: true, + }) + expect(out).toBe('[Cache: read=0 
created=0 hit=n/a]') + }) + + test('unsupported renders "N/A"', () => { + const out = formatCacheMetricsFull({ + read: 0, + created: 0, + total: 0, + hitRate: null, + supported: false, + }) + expect(out).toBe('[Cache: N/A]') + }) +}) + +describe('hit-rate edge cases (plan-mandated coverage)', () => { + test('0 read / 0 created on supported provider → hitRate = 0 (not null) when total > 0', () => { + const m = extractCacheMetrics({ input_tokens: 500 }, 'anthropic') + expect(m.read).toBe(0) + expect(m.created).toBe(0) + expect(m.hitRate).toBe(0) + }) + + test('read only (no created) computes proportion correctly', () => { + const m = extractCacheMetrics( + { input_tokens: 0, cache_read_input_tokens: 800, cache_creation_input_tokens: 0 }, + 'anthropic', + ) + expect(m.read).toBe(800) + expect(m.created).toBe(0) + expect(m.total).toBe(800) + expect(m.hitRate).toBe(1) + }) + + test('created only (first turn — no reads yet) gives 0 hit rate', () => { + const m = extractCacheMetrics( + { + input_tokens: 200, + cache_read_input_tokens: 0, + cache_creation_input_tokens: 1_000, + }, + 'anthropic', + ) + expect(m.read).toBe(0) + expect(m.created).toBe(1_000) + expect(m.total).toBe(1_200) + expect(m.hitRate).toBe(0) + }) + + test('mixed read + created + fresh input — full denominator', () => { + const m = extractCacheMetrics( + { + input_tokens: 500, + cache_read_input_tokens: 3_000, + cache_creation_input_tokens: 1_500, + }, + 'anthropic', + ) + // Denominator = fresh(500) + created(1500) + read(3000) = 5_000 + // Hit = read/total = 3000 / 5000 = 0.6 + expect(m.total).toBe(5_000) + expect(m.hitRate).toBe(0.6) + }) + + test('N/A (unsupported provider) preserves null hit-rate even with populated usage', () => { + // Simulate a Copilot usage payload that might look like OpenAI shape — + // we must NOT try to read it and must report supported:false. + const m = extractCacheMetrics( + { prompt_tokens: 5_000, prompt_tokens_details: { cached_tokens: 2_000 } }, + 'copilot', + ) + expect(m.supported).toBe(false) + expect(m.read).toBe(0) + expect(m.hitRate).toBeNull() + }) +}) + +describe('addCacheMetrics — session aggregation', () => { + test('sums read/created/total and recomputes hit rate', () => { + const a = { + read: 100, + created: 50, + total: 300, + hitRate: 100 / 300, + supported: true, + } + const b = { + read: 200, + created: 0, + total: 400, + hitRate: 0.5, + supported: true, + } + const sum = addCacheMetrics(a, b) + expect(sum.read).toBe(300) + expect(sum.created).toBe(50) + expect(sum.total).toBe(700) + expect(sum.hitRate).toBeCloseTo(300 / 700, 5) + }) + + test('unsupported + supported = supported (so we never lose honest data)', () => { + const unsupported = { + read: 0, + created: 0, + total: 0, + hitRate: null, + supported: false, + } + const supported = { + read: 10, + created: 0, + total: 100, + hitRate: 0.1, + supported: true, + } + expect(addCacheMetrics(unsupported, supported)).toBe(supported) + expect(addCacheMetrics(supported, unsupported)).toBe(supported) + }) + + test('unsupported + unsupported = unsupported', () => { + const u = { + read: 0, + created: 0, + total: 0, + hitRate: null, + supported: false, + } + const sum = addCacheMetrics(u, u) + expect(sum.supported).toBe(false) + }) +}) diff --git a/src/services/api/cacheMetrics.ts b/src/services/api/cacheMetrics.ts new file mode 100644 index 00000000..d1f27794 --- /dev/null +++ b/src/services/api/cacheMetrics.ts @@ -0,0 +1,538 @@ +/** + * Cross-provider cache usage normalizer for Phase 1 observability. 
+ * + * Two layers of extraction, because the shim layer (openaiShim/codexShim) + * already converts raw provider usage to Anthropic-shape on the way in: + * + * 1. `extractCacheReadFromRawUsage` — consumes RAW provider usage, used + * from inside the shims where each provider's native field names are + * still visible. Single source of truth for "where is the cached- + * tokens count on provider X". + * 2. `extractCacheMetrics` — consumes POST-shim Anthropic-shape usage, + * which is what every downstream caller (cost-tracker, REPL display, + * /cache-stats) actually sees. Uses the `provider` argument only to + * decide whether the metric is `supported` (Copilot vanilla, Ollama + * get N/A rather than a fabricated 0%). + * + * Design rationale: + * - Pure functions, no globals: callers pass the provider explicitly so + * that tests, background agents and teammates get consistent results + * even when the process-level provider flag differs. + * - Honest N/A: Copilot (non-Claude) and Ollama do not expose cache data + * at all. Returning 0 would lie and corrupt aggregate hit-rate, so we + * return `supported: false` and let the display decide how to render. + * - `hitRate` is null whenever there is no input to compare against + * (0 read + 0 created). A 0% hit rate would suggest "cold" when in + * reality the turn had no cacheable content to begin with. + * - After normalization, `read + created ≤ total`, with any remainder + * being fresh (non-cacheable) input tokens. The shim enforces this + * invariant by subtracting cached from raw prompt_tokens so that + * post-shim `input_tokens` is always "fresh only" per Anthropic + * convention. + * + * Raw provider shapes (as of 2026-04): + * - Anthropic: usage.cache_read_input_tokens, + * usage.cache_creation_input_tokens, + * usage.input_tokens (fresh only) + * - OpenAI / Codex: usage.input_tokens_details?.cached_tokens + * usage.prompt_tokens_details?.cached_tokens, + * usage.prompt_tokens (includes cached) + * - Kimi / Moonshot: usage.cached_tokens (top level), usage.prompt_tokens + * - DeepSeek: usage.prompt_cache_hit_tokens, + * usage.prompt_cache_miss_tokens + * - Gemini: usage.cached_content_token_count, + * usage.prompt_token_count + * - Copilot (non-Claude) / Ollama: not reported → supported=false + */ +import type { APIProvider } from '../../utils/model/providers.js' + +/** Providers for which we know how to read cache fields. */ +export type CacheAwareProvider = + | 'anthropic' + | 'openai' + | 'codex' + | 'kimi' + | 'deepseek' + | 'gemini' + | 'ollama' + // Generic local / self-hosted OpenAI-compatible endpoints (vLLM, + // LM Studio, LocalAI, text-generation-webui, custom internal servers + // on RFC1918 addresses, reserved TLDs like .local / .internal, etc.). + // Distinct from `ollama` because Ollama might someday add cache + // reporting; keeping the buckets separate means that change stays + // local to one branch. + | 'self-hosted' + | 'copilot' + | 'copilot-claude' + +/** Unified cache metrics for one API response. */ +export type CacheMetrics = { + /** Tokens served from cache on this request. */ + read: number + /** + * Tokens written INTO the cache on this request. Only non-zero for + * providers with explicit caching (Anthropic family). + */ + created: number + /** + * Total input tokens the request is measured against, computed uniformly + * as `fresh + read + created` after the shim normalizes every provider + * to the Anthropic convention. Used as the denominator for hit-rate. 
+ */ + total: number + /** + * `read / total`, or null when the denominator is zero or the provider + * doesn't support cache reporting. + */ + hitRate: number | null + /** + * False for providers that do not expose cache data at all. Callers + * should render "N/A" instead of "0%" in that case. + */ + supported: boolean +} + +/** Empty reference returned for unsupported providers — copy elision. */ +const UNSUPPORTED: CacheMetrics = { + read: 0, + created: 0, + total: 0, + hitRate: null, + supported: false, +} + +/** Raw usage shape — intentionally permissive, each provider picks its fields. */ +export type RawUsage = Record | null | undefined + +function asNumber(value: unknown): number { + return typeof value === 'number' && Number.isFinite(value) ? value : 0 +} + +function pickPath(usage: RawUsage, path: string[]): unknown { + let cur: unknown = usage + for (const key of path) { + if (cur == null || typeof cur !== 'object') return undefined + cur = (cur as Record)[key] + } + return cur +} + +/** + * Returns true when the URL points at a private, loopback, link-local, + * CGNAT, or reserved-TLD host — anywhere a self-hosted OpenAI-compatible + * server is likely running (vLLM, LM Studio, LocalAI, Ollama on a + * non-default port, text-generation-webui, corporate internal proxies). + * + * WHY a dedicated helper (vs the old substring match): + * The previous check only looked for `localhost` / `127.0.0.1` / + * `:11434` / `:1234` as substrings. That misclassified real setups: + * a vLLM server at `http://192.168.1.50:8000/v1` or an internal + * endpoint at `http://llm.internal:5000/v1` fell through the `openai` + * branch, got marked as cache-capable, and `/cache-stats` reported + * `[Cache: cold]` — making users think their cache was broken when + * in reality the provider simply doesn't report cache fields. + * + * Intentionally narrower than WebSearchTool's `isPrivateHostname` + * (which defends against SSRF bypass vectors like IPv4-mapped IPv6 + * and octal-encoded IPs). We only need to classify a reporting bucket, + * not enforce a security boundary — a false negative here at worst + * shows `[Cache: cold]` instead of `[Cache: N/A]`. + * + * See cacheMetrics.test.ts for the cases this function is contracted to + * return true/false for. + */ +function isLocalOrPrivateUrl(url: string): boolean { + if (!url) return false + let hostname = '' + try { + hostname = new URL(url).hostname.toLowerCase() + } catch { + // Fall through to the substring fallback below. + } + // WHATWG URL accepts `localhost:8000` (treats `localhost:` as scheme, + // leaving hostname empty). Treat empty-hostname parses the same as a + // parse failure so we still catch the obvious cases with substring. + if (!hostname) { + const lower = url.toLowerCase() + return ( + lower.includes('localhost') || + lower.includes('127.0.0.1') || + lower.includes('::1') + ) + } + // Unwrap IPv6 literal brackets that URL.hostname leaves attached. + const h = hostname.startsWith('[') && hostname.endsWith(']') + ? hostname.slice(1, -1) + : hostname + // Reserved TLDs and `localhost` itself — all guaranteed never to + // resolve to public infrastructure. 
+  //   - RFC 6761 §6.3 — `.localhost` (Chrome/Firefox/systemd-resolved
+  //     resolve `*.localhost` to 127.0.0.1 natively)
+  //   - RFC 6762 — `.local` mDNS (Bonjour)
+  //   - RFC 8375 — `.home.arpa` (residential home networks)
+  //   - de facto — `.lan`, `.internal`, `.intranet` (widely used in
+  //     corporate DNS despite not being formally reserved)
+  if (
+    h === 'localhost' ||
+    h.endsWith('.localhost') ||
+    h.endsWith('.local') ||
+    h.endsWith('.lan') ||
+    h.endsWith('.internal') ||
+    h.endsWith('.intranet') ||
+    h.endsWith('.home.arpa')
+  ) {
+    return true
+  }
+  // IPv4 private and reserved ranges. URL.hostname normalizes short /
+  // hex / octal IPv4 representations to dotted-quad, so a simple regex
+  // works for the display-classification use case.
+  const ipv4 = h.match(/^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/)
+  if (ipv4) {
+    const a = Number(ipv4[1])
+    const b = Number(ipv4[2])
+    // 10.0.0.0/8 (RFC 1918)
+    if (a === 10) return true
+    // 172.16.0.0/12 (RFC 1918)
+    if (a === 172 && b >= 16 && b <= 31) return true
+    // 192.168.0.0/16 (RFC 1918)
+    if (a === 192 && b === 168) return true
+    // 127.0.0.0/8 loopback
+    if (a === 127) return true
+    // 169.254.0.0/16 link-local (AWS/GCP metadata, stateless autoconf)
+    if (a === 169 && b === 254) return true
+    // 100.64.0.0/10 CGNAT (Tailscale, carrier-grade NAT)
+    if (a === 100 && b >= 64 && b <= 127) return true
+  }
+  // IPv6 common local/private ranges — narrow by design.
+  if (h === '::1' || h === '::') return true
+  // fe80::/10 link-local and fc00::/7 unique-local (ULA). A colon is
+  // required in the match so `fc` / `fd` don't over-match real
+  // hostnames like `fc-api.example.com` or `fd-hosted.com`. The bracket
+  // unwrapping above removes the `[...]` that URL.hostname keeps, so an
+  // IPv6 literal such as `[fc00::1]` reaches this check as `fc00::1`,
+  // which still contains the colon.
+  if (
+    h.startsWith('fe80:') ||
+    /^fc[0-9a-f]{0,2}:/.test(h) ||
+    /^fd[0-9a-f]{0,2}:/.test(h)
+  ) {
+    return true
+  }
+  return false
+}
+
+/**
+ * Map the canonical APIProvider enum (+ environment hints) into a
+ * cache-capability bucket. We separate `copilot` (no cache) from
+ * `copilot-claude` (Anthropic shim via Copilot with explicit cache)
+ * because the two behave very differently even under the same provider
+ * flag — see `isGithubNativeAnthropicMode` in utils/model/providers.ts.
+ *
+ * Order of OpenAI-compatible checks matters:
+ *   1. Private / self-hosted URL — no cache fields regardless of vendor.
+ *   2. Vendor-specific hosted providers (Kimi, DeepSeek) — known cache
+ *      shapes that deserve their own normalization branch.
+ *   3. Plain OpenAI — default bucket.
+ * Doing hosted-vendor matching before self-hosted detection would let a
+ * private-IP endpoint with "deepseek" in the URL fall into the wrong
+ * branch; doing self-hosted last would let a `.internal` URL with
+ * "openai" in its path be misclassified. The current order handles both
+ * pathological cases correctly.
+ */
+export function resolveCacheProvider(
+  provider: APIProvider,
+  hints?: { githubNativeAnthropic?: boolean; openAiBaseUrl?: string },
+): CacheAwareProvider {
+  if (provider === 'github') {
+    return hints?.githubNativeAnthropic ? 'copilot-claude' : 'copilot'
+  }
+  if (
+    provider === 'firstParty' ||
+    provider === 'bedrock' ||
+    provider === 'vertex' ||
+    provider === 'foundry'
+  ) {
+    return 'anthropic'
+  }
+  if (provider === 'gemini') return 'gemini'
+  if (provider === 'codex') return 'codex'
+  if (provider === 'openai') {
+    const url = hints?.openAiBaseUrl ??
'' + // Self-hosted / private-network endpoint — detect first so a vLLM + // server on 192.168.x.x or a .internal DNS entry is honestly + // classified as no-cache, not misreported as plain OpenAI. + if (isLocalOrPrivateUrl(url)) return 'self-hosted' + const lower = url.toLowerCase() + // The :11434 port still signals Ollama specifically (default port). + // If someone runs Ollama on a private IP:11434 we picked it up above + // as 'self-hosted'; only a public-looking URL with :11434 lands here. + if (lower.includes(':11434')) return 'ollama' + if (lower.includes('moonshot') || lower.includes('kimi')) return 'kimi' + if (lower.includes('deepseek')) return 'deepseek' + return 'openai' + } + // nvidia-nim, minimax, mistral share the OpenAI Chat Completions convention + // for cache reporting (prompt_tokens_details.cached_tokens). Treat them as + // 'openai' for normalization purposes — if the provider doesn't emit the + // field we simply get zeros, and hitRate stays null via the 0-guard below. + return 'openai' +} + +/** + * Read the cached-tokens count from a RAW provider usage object, handling + * every shape we know about. Callers are the shim layer (openaiShim, + * codexShim) — the only place where the native provider fields still + * exist before conversion to Anthropic shape. + * + * Order of fallbacks is deliberate: the first non-zero match wins, so + * adding a provider that combines shapes is safe as long as we list the + * most authoritative field first. + */ +export function extractCacheReadFromRawUsage(usage: RawUsage): number { + if (!usage || typeof usage !== 'object') return 0 + const u = usage as Record + // 1. Anthropic-native shape — already normalized upstream. + const anthropicRead = asNumber(u.cache_read_input_tokens) + if (anthropicRead > 0) return anthropicRead + // 2. OpenAI / Codex — cached_tokens nested under input/prompt details. + // Responses API uses `input_tokens_details`, Chat Completions uses + // `prompt_tokens_details`; some models report both with the same value. + const openaiNested = + asNumber(pickPath(usage, ['input_tokens_details', 'cached_tokens'])) || + asNumber(pickPath(usage, ['prompt_tokens_details', 'cached_tokens'])) + if (openaiNested > 0) return openaiNested + // 3. Kimi / Moonshot — top-level cached_tokens (not nested). + const kimi = asNumber(u.cached_tokens) + if (kimi > 0) return kimi + // 4. DeepSeek — hit/miss split at top level. + const deepseek = asNumber(u.prompt_cache_hit_tokens) + if (deepseek > 0) return deepseek + // 5. Gemini — cached_content_token_count. + const gemini = asNumber(u.cached_content_token_count) + if (gemini > 0) return gemini + return 0 +} + +/** + * Shape produced by the shim layer — matches the Anthropic BetaUsage + * fields that every downstream caller (cost-tracker, REPL, /cache-stats) + * consumes. Keeping it in this module lets the shim and the integration + * tests share one definition and eliminates the drift class of bugs + * where a shim is updated but a test simulator isn't. + */ +export type NormalizedShimUsage = { + input_tokens: number + output_tokens: number + cache_creation_input_tokens: number + cache_read_input_tokens: number +} + +/** + * Convert raw provider usage (any known shape) into the Anthropic-shape + * `NormalizedShimUsage` used throughout the codebase. 
Single source of + * truth for the shim layer — `codexShim.makeUsage`, + * `openaiShim.convertChunkUsage`, and the non-streaming response in + * `OpenAIShimMessages` all call this helper, and the integration test + * calls it directly instead of re-implementing the conversion. + * + * Design contract: + * - `cache_read_input_tokens` comes from `extractCacheReadFromRawUsage` + * (provider-aware extraction). + * - `input_tokens` is rewritten to Anthropic convention: FRESH only, + * with `cache_read` subtracted from the raw prompt count if the + * provider included it there (OpenAI family does; Anthropic native + * already excludes it). + * - `cache_creation_input_tokens` is always 0 at the shim boundary — + * only Anthropic native emits a non-zero creation count, and it + * doesn't flow through these shims. + * - Output token count accepts both `output_tokens` (Codex/Responses) + * and `completion_tokens` (Chat Completions). + * + * Observed raw shapes per provider (pinned so future drift is caught): + * - OpenAI Chat Completions: + * `{ prompt_tokens, completion_tokens, + * prompt_tokens_details: { cached_tokens } }` + * where `cached_tokens` is a SUBSET of `prompt_tokens` — hence + * the subtraction below. + * - OpenAI Codex / Responses API: + * `{ input_tokens, output_tokens, + * input_tokens_details: { cached_tokens } }` + * same convention: cached is included in `input_tokens`. + * - Anthropic native: + * `{ input_tokens, output_tokens, + * cache_read_input_tokens, cache_creation_input_tokens }` + * cached is EXCLUDED from `input_tokens`. The subtraction here + * no-ops (cache_read is read off a dedicated field, then fresh = + * input_tokens - 0 = input_tokens) — safe passthrough. + * - Kimi/Moonshot: + * `{ prompt_tokens, completion_tokens, cached_tokens }` — top + * level, not nested. OpenAI-family subset convention. + * - DeepSeek: + * `{ prompt_tokens, completion_tokens, prompt_cache_hit_tokens, + * prompt_cache_miss_tokens }`. The `hit` field is the cached + * count, also a subset of `prompt_tokens`. + * + * If a future provider deviates (ships cached tokens ALREADY excluded + * from input_tokens, Anthropic-style), this function will under-count + * their fresh-input by `cache_read`. The regression test + * `cacheMetricsIntegration.test.ts > "Codex makeUsage no longer + * double-bills"` pins the current Codex shape so a deviation breaks + * visibly. If you're adding a new provider, verify the shape and — + * if needed — extend `extractCacheReadFromRawUsage` to pick a field + * that represents cached-tokens-already-excluded (and skip the + * subtraction by setting `rawInput` to `prompt_tokens + cache_read`). + */ +export function buildAnthropicUsageFromRawUsage( + raw: RawUsage, +): NormalizedShimUsage { + const cacheRead = extractCacheReadFromRawUsage(raw) + const u = (raw ?? {}) as Record + const rawInput = + asNumber(u.input_tokens) || asNumber(u.prompt_tokens) + const fresh = rawInput >= cacheRead ? rawInput - cacheRead : rawInput + const output = + asNumber(u.output_tokens) || asNumber(u.completion_tokens) + return { + input_tokens: fresh, + output_tokens: output, + cache_creation_input_tokens: 0, + cache_read_input_tokens: cacheRead, + } +} + +/** + * Extract a unified CacheMetrics from POST-SHIM (Anthropic-shape) usage. + * + * By the time this runs, openaiShim/codexShim have already converted + * raw provider fields into `cache_read_input_tokens` (via + * `extractCacheReadFromRawUsage`) and adjusted `input_tokens` to be + * "fresh only" per Anthropic convention. 
This function is therefore
+ * deliberately provider-independent for the numeric extraction — the
+ * `provider` argument is used only to surface `supported: false` for
+ * providers that expose no cache data at all.
+ */
+export function extractCacheMetrics(
+  usage: RawUsage,
+  provider: CacheAwareProvider,
+): CacheMetrics {
+  if (!usage || typeof usage !== 'object') return UNSUPPORTED
+  const u = usage as Record<string, unknown>
+  const read = asNumber(u.cache_read_input_tokens)
+  const created = asNumber(u.cache_creation_input_tokens)
+  const fresh = asNumber(u.input_tokens)
+  // Copilot vanilla (no Claude) and Ollama don't expose cache fields at
+  // all as a provider-identity matter. These are explicit provider
+  // selections (via CLAUDE_CODE_USE_GITHUB and the Ollama base-URL
+  // default port), so we can hard-wire `supported: false` and let the
+  // REPL print "N/A" instead of a fabricated 0%.
+  if (provider === 'copilot' || provider === 'ollama') {
+    return UNSUPPORTED
+  }
+  // `self-hosted` is different: the bucket is inferred from the base
+  // URL being on a private network (RFC1918, .local TLD, etc.), which
+  // is a heuristic, not an authoritative "this endpoint cannot cache"
+  // signal. An internal reverse proxy forwarding to OpenAI / Kimi /
+  // DeepSeek / Gemini will produce a private URL but ALSO emit real
+  // cache fields via the shim. Force-unsupported here would discard
+  // legitimate data. Let the data decide: if the shim extracted any
+  // cache activity (read OR created), trust it and fall through to
+  // normal extraction; otherwise render honest N/A for vanilla
+  // vLLM/LocalAI-style endpoints that really don't cache.
+  if (provider === 'self-hosted' && read === 0 && created === 0) {
+    return UNSUPPORTED
+  }
+  // total = fresh + read + created — shim already stripped `read` out of
+  // `fresh` so the three components don't double-count. This matches the
+  // Anthropic convention even when the upstream was OpenAI/Kimi/DeepSeek.
+  const total = read + created + fresh
+  return {
+    read,
+    created,
+    total,
+    // Clamp to [0, 1]. With non-negative inputs the math guarantees
+    // `read <= total` — but an upstream shim bug (e.g. a future provider
+    // where we accidentally read a negative `fresh`) could violate the
+    // invariant. Showing a pinned `1.0` on anomalous input is clearer
+    // than a nonsense ratio > 100% and safer than `null` (which would
+    // hide the issue completely).
+    hitRate: total > 0 ? Math.min(1, read / total) : null,
+    supported: true,
+  }
+}
+
+/**
+ * Format a CacheMetrics value into a human-facing one-liner used by
+ * `showCacheStats: 'compact'`. Stable format — snapshot-tested.
+ *
+ * Examples:
+ *   "[Cache: 1.2k read • hit 12%]"
+ *   "[Cache: N/A]"   (unsupported provider)
+ *   "[Cache: cold]"  (supported, no reads yet)
+ *
+ * The `undefined` branch at the top is defensive: TypeScript enforces
+ * `CacheMetrics` at call sites, but a failed API response could leave
+ * the caller with nothing to render. Treat absent metrics as "no data"
+ * rather than throwing on `metrics.supported`.
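+ *
+ * e.g. both `formatCacheMetricsCompact(undefined)` and
+ * `formatCacheMetricsCompact(null)` render "[Cache: N/A]" (the same
+ * output as an unsupported provider) instead of throwing.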
+ */ +export function formatCacheMetricsCompact( + metrics: CacheMetrics | undefined | null, +): string { + if (!metrics) return '[Cache: N/A]' + if (!metrics.supported) return '[Cache: N/A]' + if (metrics.read === 0 && metrics.created === 0) return '[Cache: cold]' + const parts: string[] = [`${formatCompactNumber(metrics.read)} read`] + if (metrics.hitRate !== null) { + parts.push(`hit ${Math.round(metrics.hitRate * 100)}%`) + } + return `[Cache: ${parts.join(' • ')}]` +} + +/** + * Format a CacheMetrics value into a multi-field breakdown used by + * `showCacheStats: 'full'`. Stable format — snapshot-tested. + * + * Example: + * "[Cache: read=1.2k created=340 hit=12%]" + * + * Same `undefined` tolerance as `formatCacheMetricsCompact` — a failed + * API response shouldn't throw on the display path. + */ +export function formatCacheMetricsFull( + metrics: CacheMetrics | undefined | null, +): string { + if (!metrics) return '[Cache: N/A]' + if (!metrics.supported) return '[Cache: N/A]' + const parts: string[] = [ + `read=${formatCompactNumber(metrics.read)}`, + `created=${formatCompactNumber(metrics.created)}`, + ] + if (metrics.hitRate !== null) { + parts.push(`hit=${Math.round(metrics.hitRate * 100)}%`) + } else { + parts.push('hit=n/a') + } + return `[Cache: ${parts.join(' ')}]` +} + +// Compact 1.2k-style formatter. Duplicated here (not imported from +// utils/format.ts) because this module should stay dependency-light and +// deterministic — utils/format pulls Intl locale state which varies. +function formatCompactNumber(n: number): string { + if (n < 1_000) return String(n) + if (n < 1_000_000) return `${(n / 1_000).toFixed(1).replace(/\.0$/, '')}k` + return `${(n / 1_000_000).toFixed(1).replace(/\.0$/, '')}m` +} + +/** Sum two CacheMetrics, preserving `supported` as true only if both are. */ +export function addCacheMetrics(a: CacheMetrics, b: CacheMetrics): CacheMetrics { + // Copy elision: if either side is the unsupported sentinel, return the + // other as-is so aggregates on a purely-unsupported session stay cheap. + if (!a.supported && !b.supported) return UNSUPPORTED + if (!a.supported) return b + if (!b.supported) return a + const read = a.read + b.read + const created = a.created + b.created + const total = a.total + b.total + return { + read, + created, + total, + hitRate: total > 0 ? read / total : null, + supported: true, + } +} diff --git a/src/services/api/cacheMetricsIntegration.test.ts b/src/services/api/cacheMetricsIntegration.test.ts new file mode 100644 index 00000000..6d3517da --- /dev/null +++ b/src/services/api/cacheMetricsIntegration.test.ts @@ -0,0 +1,339 @@ +/** + * Integration tests for the raw-usage → shim → cost-tracker pipeline. + * + * These tests simulate what happens on each provider end-to-end: + * 1. The provider returns a raw `usage` object in its native shape. + * 2. The shim (openaiShim.convertChunkUsage / codexShim.makeUsage) + * rewrites it to Anthropic shape via buildAnthropicUsageFromRawUsage. + * 3. cost-tracker feeds the shimmed usage to extractCacheMetrics. + * + * The unit tests in cacheMetrics.test.ts exercise each layer in isolation. + * This file exists so that a regression in ANY one of them (e.g. someone + * adding a new provider branch to the helper but forgetting to wire it + * into the shim) surfaces as an integration failure rather than silently + * showing "[Cache: cold]" in production. + * + * We call `buildAnthropicUsageFromRawUsage` directly instead of + * re-implementing the shim behavior locally. 
Both shims + * (`codexShim.makeUsage`, `openaiShim.convertChunkUsage`, and the + * non-streaming path in `OpenAIShimMessages`) delegate to this helper, + * so this test covers the exact same code that runs in production — + * no simulator drift possible. + */ +import { describe, expect, test } from 'bun:test' +import { + buildAnthropicUsageFromRawUsage, + extractCacheMetrics, + formatCacheMetricsCompact, + formatCacheMetricsFull, + resolveCacheProvider, + type CacheAwareProvider, +} from './cacheMetrics.js' + +type Scenario = { + name: string + provider: CacheAwareProvider + rawUsage: Record + expectedRead: number + expectedTotal: number + expectedHitRate: number + expectedFreshInput: number +} + +// End-to-end scenarios for every provider shape the OpenClaude shim layer +// might see. `expectedTotal` is what a user should see as "input this +// request", `expectedHitRate` is what `/cache-stats` should display. +const scenarios: Scenario[] = [ + { + name: 'Anthropic native (firstParty) — passthrough', + provider: 'anthropic', + rawUsage: { + input_tokens: 200, + cache_read_input_tokens: 800, + cache_creation_input_tokens: 100, + }, + expectedRead: 800, + // Anthropic native doesn't go through the shim in production, but + // buildAnthropicUsageFromRawUsage handles it correctly as passthrough: + // prompt_tokens fallback is 0, so fresh comes from input_tokens (200), + // cache_read is picked up from cache_read_input_tokens (800). + expectedTotal: 1_000, // 200 fresh + 800 read (created is not tracked at this layer) + expectedHitRate: 800 / 1_000, + expectedFreshInput: 200, + }, + { + name: 'OpenAI Chat Completions via openaiShim', + provider: 'openai', + rawUsage: { + prompt_tokens: 2_000, + completion_tokens: 300, + prompt_tokens_details: { cached_tokens: 1_200 }, + }, + expectedRead: 1_200, + expectedTotal: 2_000, // 800 fresh + 1200 read + expectedHitRate: 0.6, + expectedFreshInput: 800, + }, + { + name: 'Codex Responses API via codexShim', + provider: 'codex', + rawUsage: { + input_tokens: 1_500, + output_tokens: 50, + input_tokens_details: { cached_tokens: 600 }, + }, + expectedRead: 600, + expectedTotal: 1_500, + expectedHitRate: 0.4, + expectedFreshInput: 900, + }, + { + name: 'Kimi / Moonshot via openaiShim — top-level cached_tokens', + provider: 'kimi', + rawUsage: { + prompt_tokens: 1_000, + completion_tokens: 120, + cached_tokens: 400, + }, + expectedRead: 400, + expectedTotal: 1_000, + expectedHitRate: 0.4, + expectedFreshInput: 600, + }, + { + name: 'DeepSeek via openaiShim — prompt_cache_hit_tokens', + provider: 'deepseek', + rawUsage: { + prompt_tokens: 1_000, + completion_tokens: 40, + prompt_cache_hit_tokens: 700, + prompt_cache_miss_tokens: 300, + }, + expectedRead: 700, + expectedTotal: 1_000, + expectedHitRate: 0.7, + expectedFreshInput: 300, + }, + { + name: 'Gemini via openaiShim — cached_content_token_count', + provider: 'gemini', + rawUsage: { + prompt_tokens: 4_000, + completion_tokens: 200, + cached_content_token_count: 3_200, + }, + expectedRead: 3_200, + expectedTotal: 4_000, + expectedHitRate: 0.8, + expectedFreshInput: 800, + }, +] + +describe('raw usage → shim → extractCacheMetrics pipeline', () => { + for (const s of scenarios) { + test(s.name, () => { + // Call the same helper the shims call in production — no + // simulator, no possibility of drift. 
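+      // e.g. for the DeepSeek scenario: raw prompt_tokens 1_000 with
+      // prompt_cache_hit_tokens 700 must come out of the shim as
+      // input_tokens 300 / cache_read_input_tokens 700, and then as
+      // hitRate 0.7 from extractCacheMetrics below.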
+ const shimmed = buildAnthropicUsageFromRawUsage(s.rawUsage) + expect(shimmed.cache_read_input_tokens).toBe(s.expectedRead) + expect(shimmed.input_tokens).toBe(s.expectedFreshInput) + + const metrics = extractCacheMetrics( + shimmed as unknown as Record, + s.provider, + ) + expect(metrics.supported).toBe(true) + expect(metrics.read).toBe(s.expectedRead) + expect(metrics.total).toBe(s.expectedTotal) + expect(metrics.hitRate).toBeCloseTo(s.expectedHitRate, 4) + }) + } +}) + +describe('no-cache providers — pipeline honestly reports unsupported', () => { + test('GitHub Copilot (vanilla) — shim runs, but provider bucket maps to unsupported', () => { + const shimmed = buildAnthropicUsageFromRawUsage({ + prompt_tokens: 500, + completion_tokens: 40, + }) + // Shim normalized correctly (0 cache_read), but Copilot-vanilla must + // surface as unsupported so /cache-stats shows "N/A" instead of "0%". + expect(shimmed.cache_read_input_tokens).toBe(0) + const metrics = extractCacheMetrics( + shimmed as unknown as Record, + 'copilot', + ) + expect(metrics.supported).toBe(false) + expect(metrics.hitRate).toBeNull() + }) + + test('Ollama (local) — same treatment as Copilot-vanilla', () => { + const shimmed = buildAnthropicUsageFromRawUsage({ + prompt_tokens: 1_000, + completion_tokens: 200, + }) + const metrics = extractCacheMetrics( + shimmed as unknown as Record, + 'ollama', + ) + expect(metrics.supported).toBe(false) + }) +}) + +describe('display path end-to-end — private-IP, custom-port, self-hosted endpoints', () => { + // These tests exercise the FULL pipeline that runs when a user + // configures OpenClaude against a self-hosted OpenAI-compatible + // server (vLLM, LM Studio, LocalAI, text-generation-webui, etc.): + // + // OPENAI_BASE_URL → resolveCacheProvider → real provider usage → + // buildAnthropicUsageFromRawUsage → extractCacheMetrics → + // formatCacheMetricsCompact / Full (= what user sees in REPL and + // via /cache-stats) + // + // Pre-fix behavior: substring check missed these URLs, they fell + // into the 'openai' bucket, and the display showed '[Cache: cold]' — + // i.e. implied a cache miss when the provider simply doesn't report + // cache fields. Post-fix: '[Cache: N/A]' every time. + + const privateEndpoints: Array<{ name: string; baseUrl: string }> = [ + { name: 'vLLM on RFC1918 LAN IP', baseUrl: 'http://192.168.1.50:8000/v1' }, + { name: 'LocalAI on 10.x.x.x corporate network', baseUrl: 'http://10.0.0.7:8080/v1' }, + { name: 'self-hosted on 172.16.x.x', baseUrl: 'http://172.20.0.3:5000/v1' }, + { name: 'reverse-proxied on .internal DNS', baseUrl: 'http://llm.internal:5000/v1' }, + { name: 'mDNS .local hostname', baseUrl: 'http://box.local:8080/v1' }, + { name: 'RFC 8375 .home.arpa', baseUrl: 'http://vllm.home.arpa/v1' }, + { name: 'CGNAT / Tailscale 100.64.x.x', baseUrl: 'http://100.64.1.5:8000/v1' }, + { name: 'IPv6 loopback literal', baseUrl: 'http://[::1]:5000/v1' }, + { name: 'IPv6 link-local', baseUrl: 'http://[fe80::1]:8000/v1' }, + { name: 'IPv6 ULA fc00::/7', baseUrl: 'http://[fd12:3456::7]:8080/v1' }, + { name: 'link-local cloud-metadata IP', baseUrl: 'http://169.254.169.254/v1' }, + ] + + for (const { name, baseUrl } of privateEndpoints) { + test(`${name} (${baseUrl}) — renders [Cache: N/A], not [Cache: cold]`, () => { + // 1. URL resolves to self-hosted bucket. + const bucket = resolveCacheProvider('openai', { openAiBaseUrl: baseUrl }) + expect(bucket).toBe('self-hosted') + + // 2. 
Typical self-hosted server returns OpenAI-shape usage with no
+      // cache fields — the shim normalizes it cleanly.
+      const shimmed = buildAnthropicUsageFromRawUsage({
+        prompt_tokens: 1_200,
+        completion_tokens: 250,
+      })
+      expect(shimmed.cache_read_input_tokens).toBe(0)
+
+      // 3. The display path marks the bucket unsupported.
+      const metrics = extractCacheMetrics(
+        shimmed as unknown as Record<string, unknown>,
+        bucket,
+      )
+      expect(metrics.supported).toBe(false)
+      expect(metrics.hitRate).toBeNull()
+
+      // 4. User-visible output — both formats honor the unsupported flag.
+      expect(formatCacheMetricsCompact(metrics)).toBe('[Cache: N/A]')
+      expect(formatCacheMetricsFull(metrics)).toBe('[Cache: N/A]')
+    })
+  }
+
+  test('public-looking URL with non-standard port stays in openai bucket (no false positive)', () => {
+    // A real hosted API that happens to run on a custom port must NOT
+    // be misclassified as self-hosted. This guards the fix against
+    // over-matching.
+    const bucket = resolveCacheProvider('openai', {
+      openAiBaseUrl: 'https://api.openai.com:8443/v1',
+    })
+    expect(bucket).toBe('openai')
+  })
+
+  test('private IP + hosted-provider keyword in path → self-hosted wins', () => {
+    // A URL like 'http://10.0.0.5:8000/v1/deepseek-proxy' has "deepseek"
+    // in the path but the upstream is a LAN box, not the real DeepSeek.
+    // Priority ordering in resolveCacheProvider must put self-hosted
+    // detection first.
+    const bucket = resolveCacheProvider('openai', {
+      openAiBaseUrl: 'http://10.0.0.5:8000/v1/deepseek-proxy',
+    })
+    expect(bucket).toBe('self-hosted')
+  })
+
+  test('self-hosted proxy forwarding real upstream cache data is NOT discarded', () => {
+    // Review-blocker regression: an enterprise setup with an internal
+    // reverse proxy on a private URL forwarding to OpenAI / Kimi /
+    // DeepSeek / Gemini WILL deliver real cache fields via the shim.
+    // Pre-fix, the URL heuristic → self-hosted → unconditional
+    // `supported: false` discarded the data and rendered '[Cache: N/A]'
+    // even though valid cache metrics were on the payload. Post-fix,
+    // the data decides: non-zero cache activity trumps the URL bucket.
+    const bucket = resolveCacheProvider('openai', {
+      openAiBaseUrl: 'http://llm-proxy.corp.internal:5000/v1',
+    })
+    expect(bucket).toBe('self-hosted')
+
+    // Typical raw Kimi shape (the reverse proxy forwards this through
+    // unchanged). Shim normalizes to Anthropic shape.
+    const raw = { prompt_tokens: 2_000, cached_tokens: 800 }
+    const shimmed = buildAnthropicUsageFromRawUsage(raw)
+
+    // Display path with the fix: data is preserved end-to-end.
+    const metrics = extractCacheMetrics(
+      shimmed as unknown as Record<string, unknown>,
+      bucket,
+    )
+    expect(metrics.supported).toBe(true)
+    expect(metrics.read).toBe(800)
+    expect(metrics.hitRate).toBe(0.4)
+    expect(formatCacheMetricsCompact(metrics)).toBe(
+      '[Cache: 800 read • hit 40%]',
+    )
+  })
+})
+
+describe('regression guards — bug reproducers', () => {
+  test('Kimi cache hit survives the shim (pre-fix: silently dropped to 0)', () => {
+    // Before the Option-C refactor, the shim only read
+    // prompt_tokens_details.cached_tokens, so Kimi's top-level
+    // cached_tokens (300 below) was lost — the tracker saw read=0 and
+    // users saw "[Cache: cold]" even after real cache hits. This test
+    // fails loudly if the helper forgets the top-level branch.
+ const raw = { prompt_tokens: 800, cached_tokens: 300 } + const shimmed = buildAnthropicUsageFromRawUsage(raw) + const metrics = extractCacheMetrics( + shimmed as unknown as Record, + 'kimi', + ) + expect(metrics.read).toBe(300) + expect(metrics.hitRate).toBeGreaterThan(0) + }) + + test('DeepSeek cache hit survives the shim (pre-fix: silently dropped to 0)', () => { + const raw = { + prompt_tokens: 1_200, + prompt_cache_hit_tokens: 900, + prompt_cache_miss_tokens: 300, + } + const shimmed = buildAnthropicUsageFromRawUsage(raw) + const metrics = extractCacheMetrics( + shimmed as unknown as Record, + 'deepseek', + ) + expect(metrics.read).toBe(900) + expect(metrics.hitRate).toBe(0.75) + }) + + test('Codex makeUsage no longer double-bills (pre-fix: input_tokens kept cached)', () => { + // Pre-fix, codexShim.makeUsage set input_tokens to the raw value + // without subtracting cached_tokens, so modelCost.calculateUSDCost + // charged the same tokens under both input_tokens * rate AND + // cache_read_input_tokens * rate. This test enforces the Anthropic + // convention at the shim boundary. + const raw = { + input_tokens: 2_000, + input_tokens_details: { cached_tokens: 1_500 }, + } + const shimmed = buildAnthropicUsageFromRawUsage(raw) + expect(shimmed.input_tokens).toBe(500) // 2000 - 1500, not 2000 + expect(shimmed.cache_read_input_tokens).toBe(1_500) + }) +}) diff --git a/src/services/api/cacheStatsTracker.test.ts b/src/services/api/cacheStatsTracker.test.ts new file mode 100644 index 00000000..c1617b0e --- /dev/null +++ b/src/services/api/cacheStatsTracker.test.ts @@ -0,0 +1,210 @@ +import { beforeEach, expect, test, describe } from 'bun:test' +import { + _setHistoryCapForTesting, + getCacheStatsHistory, + getCurrentTurnCacheMetrics, + getSessionCacheMetrics, + recordRequest, + resetCurrentTurn, + resetSessionCacheStats, +} from './cacheStatsTracker.js' +import type { CacheMetrics } from './cacheMetrics.js' + +function makeMetrics(partial: Partial): CacheMetrics { + return { + read: 0, + created: 0, + total: 0, + hitRate: null, + supported: true, + ...partial, + } +} + +beforeEach(() => { + resetSessionCacheStats() + _setHistoryCapForTesting(500) +}) + +describe('cacheStatsTracker — aggregation', () => { + test('currentTurn and session both start empty and unsupported', () => { + expect(getCurrentTurnCacheMetrics().supported).toBe(false) + expect(getSessionCacheMetrics().supported).toBe(false) + expect(getCacheStatsHistory()).toEqual([]) + }) + + test('one recorded request flows into both turn and session', () => { + recordRequest( + makeMetrics({ read: 500, total: 1000, hitRate: 0.5 }), + 'claude-sonnet-4', + ) + expect(getCurrentTurnCacheMetrics().read).toBe(500) + expect(getCurrentTurnCacheMetrics().total).toBe(1000) + expect(getSessionCacheMetrics().read).toBe(500) + }) + + test('multiple requests sum across turn', () => { + recordRequest( + makeMetrics({ read: 100, total: 500, hitRate: 0.2 }), + 'm1', + ) + recordRequest( + makeMetrics({ read: 300, total: 500, hitRate: 0.6 }), + 'm1', + ) + const turn = getCurrentTurnCacheMetrics() + expect(turn.read).toBe(400) + expect(turn.total).toBe(1000) + expect(turn.hitRate).toBeCloseTo(0.4, 5) + }) + + test('resetCurrentTurn clears turn but preserves session', () => { + recordRequest(makeMetrics({ read: 200, total: 400 }), 'm1') + resetCurrentTurn() + expect(getCurrentTurnCacheMetrics().supported).toBe(false) + expect(getSessionCacheMetrics().read).toBe(200) + }) + + test('resetSessionCacheStats clears everything', () => { + 
recordRequest(makeMetrics({ read: 200, total: 400 }), 'm1') + resetSessionCacheStats() + expect(getCurrentTurnCacheMetrics().supported).toBe(false) + expect(getSessionCacheMetrics().supported).toBe(false) + expect(getCacheStatsHistory()).toEqual([]) + }) +}) + +describe('cacheStatsTracker — history', () => { + test('records each request with label and timestamp', () => { + const before = Date.now() + recordRequest(makeMetrics({ read: 1, total: 2 }), 'model-A') + recordRequest(makeMetrics({ read: 3, total: 4 }), 'model-B') + const history = getCacheStatsHistory() + expect(history.length).toBe(2) + expect(history[0]!.label).toBe('model-A') + expect(history[1]!.label).toBe('model-B') + expect(history[0]!.timestamp).toBeGreaterThanOrEqual(before) + }) + + test('evicts oldest entries when cap is exceeded', () => { + _setHistoryCapForTesting(3) + for (let i = 0; i < 5; i++) { + recordRequest(makeMetrics({ read: i, total: 10 }), `m${i}`) + } + const history = getCacheStatsHistory() + expect(history.length).toBe(3) + expect(history.map((h) => h.label)).toEqual(['m2', 'm3', 'm4']) + }) + + test('history copy is detached from internal state', () => { + recordRequest(makeMetrics({ read: 1, total: 2 }), 'x') + const snapshot = getCacheStatsHistory() + snapshot.pop() + expect(getCacheStatsHistory().length).toBe(1) + }) +}) + +describe('cacheStatsTracker — ring buffer semantics', () => { + test('ring wraps at cap without shifting (chronological order preserved)', () => { + _setHistoryCapForTesting(4) + // Push exactly 2×cap entries — forces one full wrap. + for (let i = 0; i < 8; i++) { + recordRequest(makeMetrics({ read: i, total: 10 }), `m${i}`) + } + const history = getCacheStatsHistory() + expect(history.length).toBe(4) + // After 8 pushes with cap=4, the survivors must be the newest 4 — + // m4, m5, m6, m7 — in chronological order. If the ring logic were + // wrong (e.g. off-by-one on writeIdx) this would come out rotated. + expect(history.map((h) => h.label)).toEqual(['m4', 'm5', 'm6', 'm7']) + }) + + test('read before ring wraps returns partial history in order', () => { + _setHistoryCapForTesting(10) + for (let i = 0; i < 3; i++) { + recordRequest(makeMetrics({ read: i, total: 10 }), `m${i}`) + } + const history = getCacheStatsHistory() + expect(history.map((h) => h.label)).toEqual(['m0', 'm1', 'm2']) + }) + + test('shrinking cap preserves the newest entries in order', () => { + _setHistoryCapForTesting(5) + for (let i = 0; i < 5; i++) { + recordRequest(makeMetrics({ read: i, total: 10 }), `m${i}`) + } + _setHistoryCapForTesting(3) + const history = getCacheStatsHistory() + expect(history.map((h) => h.label)).toEqual(['m2', 'm3', 'm4']) + // And pushing after shrink still respects the new cap. + recordRequest(makeMetrics({ read: 5, total: 10 }), 'm5') + expect(getCacheStatsHistory().map((h) => h.label)).toEqual(['m3', 'm4', 'm5']) + }) + + test('growing cap preserves existing entries and accepts more', () => { + _setHistoryCapForTesting(3) + for (let i = 0; i < 3; i++) { + recordRequest(makeMetrics({ read: i, total: 10 }), `m${i}`) + } + _setHistoryCapForTesting(6) + // After growing, the existing three should still be there in order, + // and we should be able to push three more before eviction starts. 
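+    // Sanity: the grow alone must preserve m0..m2 before any new pushes
+    // (per _setHistoryCapForTesting's preserve-in-order contract).
+    expect(getCacheStatsHistory().map((h) => h.label)).toEqual([
+      'm0',
+      'm1',
+      'm2',
+    ])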
+ for (let i = 3; i < 6; i++) { + recordRequest(makeMetrics({ read: i, total: 10 }), `m${i}`) + } + const history = getCacheStatsHistory() + expect(history.map((h) => h.label)).toEqual([ + 'm0', + 'm1', + 'm2', + 'm3', + 'm4', + 'm5', + ]) + }) + + test('_setHistoryCapForTesting throws on non-positive cap', () => { + // A zero cap would divide-by-zero on the ring write index and + // silently corrupt the buffer. Loud failure > NaN indices. + expect(() => _setHistoryCapForTesting(0)).toThrow(/cap must be >= 1/) + expect(() => _setHistoryCapForTesting(-3)).toThrow(/cap must be >= 1/) + }) + + test('resetSessionCacheStats empties the ring even when wrapped', () => { + _setHistoryCapForTesting(3) + for (let i = 0; i < 10; i++) { + recordRequest(makeMetrics({ read: i, total: 10 }), `m${i}`) + } + // Sanity: ring has wrapped many times. + expect(getCacheStatsHistory().length).toBe(3) + resetSessionCacheStats() + expect(getCacheStatsHistory()).toEqual([]) + // And a fresh push after reset starts from index 0 again. + recordRequest(makeMetrics({ read: 99, total: 100 }), 'post-reset') + const after = getCacheStatsHistory() + expect(after.length).toBe(1) + expect(after[0]!.label).toBe('post-reset') + }) +}) + +describe('cacheStatsTracker — unsupported mixing', () => { + test('mixing supported + unsupported keeps supported data visible', () => { + recordRequest( + { + read: 0, + created: 0, + total: 0, + hitRate: null, + supported: false, + }, + 'copilot', + ) + recordRequest( + makeMetrics({ read: 100, total: 500, hitRate: 0.2 }), + 'claude', + ) + const turn = getCurrentTurnCacheMetrics() + expect(turn.supported).toBe(true) + expect(turn.read).toBe(100) + }) +}) diff --git a/src/services/api/cacheStatsTracker.ts b/src/services/api/cacheStatsTracker.ts new file mode 100644 index 00000000..ba8aac5b --- /dev/null +++ b/src/services/api/cacheStatsTracker.ts @@ -0,0 +1,179 @@ +/** + * Per-query and per-session cache metrics tracker for Phase 1 observability. + * + * Sits downstream of `extractCacheMetrics` (normalizer) and upstream of the + * REPL display + `/cache-stats` command. The shim layers already report raw + * usage into Anthropic-shaped fields, so this tracker listens for each + * successful API response and folds the metrics into three buckets: + * + * - currentTurn : cleared by callers at the start of each user turn + * - session : accumulates from process start until `/clear` + * - history : per-request log for `/cache-stats` breakdown view + * + * Design rationale: + * - Module-local state (not AppState, not bootstrap/state.ts) because + * this is strictly observability — nothing in the conversation flow + * depends on it and we don't want to couple the shim to React state. + * - `recordRequest()` takes an ALREADY-normalized CacheMetrics so the + * shim layer can resolve provider once and we avoid re-running env + * detection on every response. + * - `history` is bounded (DEFAULT_HISTORY_MAX) so a long-lived session + * can't grow memory unboundedly. Oldest entries drop first. + * - `supported: false` requests still land in history (so the user can + * see "6 requests, all N/A" rather than "no data"), but they add to + * sums as zero — `addCacheMetrics` preserves the supported flag. + * + * History is stored as a **ring buffer** (fixed-size array + write index). + * Previous implementation used `array.splice(0, n)` on every overflow, + * which shifts the entire tail — O(n) per recordRequest for the default + * cap of 500 (negligible in practice, but wasteful). 
The ring makes + * `recordRequest` strictly O(1). `getCacheStatsHistory()` still pays O(n) + * to reconstruct chronological order, but that only runs when the user + * opens `/cache-stats` or the REPL renders — never in the hot path. + */ +import { addCacheMetrics, type CacheMetrics } from './cacheMetrics.js' + +/** One request's cache footprint — what the tracker remembers per turn. */ +export type CacheStatsEntry = { + /** Unix ms when the request completed. */ + timestamp: number + /** Opaque label (usually the model string) for `/cache-stats` rows. */ + label: string + /** Normalized metrics for this single request. */ + metrics: CacheMetrics +} + +// Bound the per-session history. 500 requests ≈ a full day of active use; +// any more than that is noise for a diagnostic command and starts costing +// real memory (~100 bytes per entry with the labels). +const DEFAULT_HISTORY_MAX = 500 + +const EMPTY_METRICS: CacheMetrics = { + read: 0, + created: 0, + total: 0, + hitRate: null, + supported: false, +} + +type TrackerState = { + currentTurn: CacheMetrics + session: CacheMetrics + // Ring buffer: fixed-size array, `historyWriteIdx` points at the next + // slot to overwrite. Once `historySize === historyMax`, each new push + // drops the oldest entry by simply overwriting it — no shifting. + history: (CacheStatsEntry | undefined)[] + historyWriteIdx: number + historySize: number + historyMax: number +} + +function createInitialState(max: number): TrackerState { + return { + currentTurn: EMPTY_METRICS, + session: EMPTY_METRICS, + history: new Array(max), + historyWriteIdx: 0, + historySize: 0, + historyMax: max, + } +} + +const state: TrackerState = createInitialState(DEFAULT_HISTORY_MAX) + +/** + * Record a single API response's normalized cache metrics. Idempotent per + * request (caller ensures this isn't double-counted) — safe to call from + * the shim right after `addToTotalSessionCost`. + * + * O(1) via ring-buffer write — previously used `splice(0, n)` on overflow + * which was O(n) per call for the default cap of 500. + */ +export function recordRequest( + metrics: CacheMetrics, + label: string, +): void { + state.currentTurn = addCacheMetrics(state.currentTurn, metrics) + state.session = addCacheMetrics(state.session, metrics) + const entry: CacheStatsEntry = { + timestamp: Date.now(), + label, + metrics, + } + // Overwrite at the write head. If the ring is full, this drops the + // oldest entry (which previously lived at this slot) implicitly. + state.history[state.historyWriteIdx] = entry + state.historyWriteIdx = (state.historyWriteIdx + 1) % state.historyMax + if (state.historySize < state.historyMax) { + state.historySize++ + } +} + +/** Clear turn-level counters at the start of a new user turn. */ +export function resetCurrentTurn(): void { + state.currentTurn = EMPTY_METRICS +} + +/** Clear all session state — used by `/clear`, `/compact`, tests. */ +export function resetSessionCacheStats(): void { + state.currentTurn = EMPTY_METRICS + state.session = EMPTY_METRICS + // Rebuild the ring so any hold-over references can be GC'd. Slightly + // more work than zeroing indices, but `/clear` is rare and this avoids + // silently pinning old CacheStatsEntry objects in memory. + state.history = new Array(state.historyMax) + state.historyWriteIdx = 0 + state.historySize = 0 +} + +/** Snapshot of the current turn's aggregate. */ +export function getCurrentTurnCacheMetrics(): CacheMetrics { + return state.currentTurn +} + +/** Snapshot of the session-wide aggregate. 
*/ +export function getSessionCacheMetrics(): CacheMetrics { + return state.session +} + +/** + * Recent per-request entries, oldest-first. Returns a copy so callers + * can freely sort/filter without perturbing the tracker. + * + * Walks the ring from the oldest slot to the newest. Two cases: + * - not yet full: oldest is at index 0, newest at `size-1` + * - full / wrapped: oldest is at `writeIdx`, newest at `writeIdx-1` + */ +export function getCacheStatsHistory(): CacheStatsEntry[] { + if (state.historySize < state.historyMax) { + // Fast path: ring hasn't wrapped yet, entries live at [0..size). + return state.history.slice(0, state.historySize) as CacheStatsEntry[] + } + // Wrapped: reconstruct oldest-first by concatenating the two halves. + const tail = state.history.slice(state.historyWriteIdx) as CacheStatsEntry[] + const head = state.history.slice(0, state.historyWriteIdx) as CacheStatsEntry[] + return tail.concat(head) +} + +/** + * Test/debug hook — do not use in production paths. Resizes the ring + * preserving the most recent `min(cap, size)` entries in chronological + * order, so tests can shrink the cap and verify eviction behavior. + */ +export function _setHistoryCapForTesting(cap: number): void { + // Cap must be positive — a zero-sized ring would divide by zero on + // `preserved.length % cap`. Throw loudly rather than silently land on + // `NaN` indices that would corrupt the ring on the next push. + if (cap < 1) { + throw new Error(`_setHistoryCapForTesting: cap must be >= 1 (got ${cap})`) + } + const current = getCacheStatsHistory() + const preserved = cap < current.length ? current.slice(-cap) : current + state.history = new Array(cap) + for (let i = 0; i < preserved.length; i++) { + state.history[i] = preserved[i] + } + state.historyWriteIdx = preserved.length % cap + state.historySize = preserved.length + state.historyMax = cap +} diff --git a/src/services/api/codexShim.ts b/src/services/api/codexShim.ts index ef8b7806..392ddc7e 100644 --- a/src/services/api/codexShim.ts +++ b/src/services/api/codexShim.ts @@ -1,4 +1,5 @@ import { APIError } from '@anthropic-ai/sdk' +import { buildAnthropicUsageFromRawUsage } from './cacheMetrics.js' import { compressToolHistory } from './compressToolHistory.js' import { fetchWithProxyRetry } from './fetchWithProxyRetry.js' import type { @@ -78,21 +79,12 @@ type CodexSseEvent = { data: Record } -function makeUsage(usage?: { - input_tokens?: number - output_tokens?: number - input_tokens_details?: { cached_tokens?: number } - prompt_tokens_details?: { cached_tokens?: number } -}): AnthropicUsage { - return { - input_tokens: usage?.input_tokens ?? 0, - output_tokens: usage?.output_tokens ?? 0, - cache_creation_input_tokens: 0, - cache_read_input_tokens: - usage?.input_tokens_details?.cached_tokens ?? - usage?.prompt_tokens_details?.cached_tokens ?? - 0, - } +function makeUsage(usage?: Record): AnthropicUsage { + // Single source of truth for raw → Anthropic shape. Lives in + // cacheMetrics.ts alongside the raw-shape extractor so any new + // provider quirk requires a one-file change and the integration test + // can call the exact same function instead of re-implementing it. 
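+  // e.g. a raw Codex usage of
+  //   { input_tokens: 2_000, input_tokens_details: { cached_tokens: 1_500 } }
+  // normalizes to
+  //   { input_tokens: 500, output_tokens: 0,
+  //     cache_creation_input_tokens: 0, cache_read_input_tokens: 1_500 }
+  // (the "no longer double-bills" regression test pins exactly this case).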
+ return buildAnthropicUsageFromRawUsage(usage) } function makeMessageId(): string { @@ -911,18 +903,14 @@ export async function* codexStreamToAnthropic( stop_reason: determineStopReason(finalResponse, sawToolUse), stop_sequence: null, }, - usage: { - // Subtract cached tokens: OpenAI includes them in input_tokens, - // but Anthropic convention treats input_tokens as non-cached only. - input_tokens: (finalResponse?.usage?.input_tokens ?? 0) - - (finalResponse?.usage?.input_tokens_details?.cached_tokens ?? - finalResponse?.usage?.prompt_tokens_details?.cached_tokens ?? 0), - output_tokens: finalResponse?.usage?.output_tokens ?? 0, - cache_read_input_tokens: - finalResponse?.usage?.input_tokens_details?.cached_tokens ?? - finalResponse?.usage?.prompt_tokens_details?.cached_tokens ?? - 0, - }, + // Delegate to the shared normalizer so the streaming message_delta + // path uses the same raw→Anthropic conversion as makeUsage() above + // and the non-streaming response converter below. Previously this + // block had its own inline subtraction that missed Kimi / DeepSeek + // / Gemini raw shapes that the shared helper handles. + usage: makeUsage( + finalResponse?.usage as Record | undefined, + ), } yield { type: 'message_stop' } } diff --git a/src/services/api/openaiShim.ts b/src/services/api/openaiShim.ts index 7c5633d2..1c0d1392 100644 --- a/src/services/api/openaiShim.ts +++ b/src/services/api/openaiShim.ts @@ -46,6 +46,7 @@ import { type AnthropicUsage, type ShimCreateParams, } from './codexShim.js' +import { buildAnthropicUsageFromRawUsage } from './cacheMetrics.js' import { compressToolHistory } from './compressToolHistory.js' import { fetchWithProxyRetry } from './fetchWithProxyRetry.js' import { @@ -845,16 +846,12 @@ function convertChunkUsage( usage: OpenAIStreamChunk['usage'] | undefined, ): Partial | undefined { if (!usage) return undefined - - const cached = usage.prompt_tokens_details?.cached_tokens ?? 0 - return { - // Subtract cached tokens: OpenAI includes them in prompt_tokens, - // but Anthropic convention treats input_tokens as non-cached only. - input_tokens: (usage.prompt_tokens ?? 0) - cached, - output_tokens: usage.completion_tokens ?? 0, - cache_creation_input_tokens: 0, - cache_read_input_tokens: cached, - } + // Delegates to the shared helper so this path, codexShim.makeUsage, + // the non-streaming response below, and the integration tests all + // produce byte-identical output for the same raw input. + return buildAnthropicUsageFromRawUsage( + usage as unknown as Record, + ) } const JSON_REPAIR_SUFFIXES = [ @@ -2154,12 +2151,9 @@ class OpenAIShimMessages { model: data.model ?? model, stop_reason: stopReason, stop_sequence: null, - usage: { - input_tokens: data.usage?.prompt_tokens ?? 0, - output_tokens: data.usage?.completion_tokens ?? 0, - cache_creation_input_tokens: 0, - cache_read_input_tokens: data.usage?.prompt_tokens_details?.cached_tokens ?? 
0,
-      },
+      usage: buildAnthropicUsageFromRawUsage(
+        data.usage as unknown as Record<string, unknown> | undefined,
+      ),
     }
   }
 }
diff --git a/src/tools/ConfigTool/supportedSettings.ts b/src/tools/ConfigTool/supportedSettings.ts
index 86d7d2d8..0ce91721 100644
--- a/src/tools/ConfigTool/supportedSettings.ts
+++ b/src/tools/ConfigTool/supportedSettings.ts
@@ -1,5 +1,8 @@
 import { feature } from 'bun:bundle'
-import { getRemoteControlAtStartup } from '../../utils/config.js'
+import {
+  getRemoteControlAtStartup,
+  SHOW_CACHE_STATS_MODES,
+} from '../../utils/config.js'
 import {
   EDITOR_MODES,
   NOTIFICATION_CHANNELS,
@@ -77,6 +80,13 @@ export const SUPPORTED_SETTINGS: Record = {
     description:
       'Show turn duration message after responses (e.g., "Cooked for 1m 6s")',
   },
+  showCacheStats: {
+    source: 'global',
+    type: 'string',
+    description:
+      'Show per-query cache hit/miss summary at end of turn (off | compact | full)',
+    options: SHOW_CACHE_STATS_MODES,
+  },
   terminalProgressBarEnabled: {
     source: 'global',
     type: 'boolean',
diff --git a/src/utils/config.showCacheStats.test.ts b/src/utils/config.showCacheStats.test.ts
new file mode 100644
index 00000000..c97115a9
--- /dev/null
+++ b/src/utils/config.showCacheStats.test.ts
@@ -0,0 +1,126 @@
+import { expect, test, describe } from 'bun:test'
+import { z } from 'zod'
+import {
+  DEFAULT_GLOBAL_CONFIG,
+  GLOBAL_CONFIG_KEYS,
+  isGlobalConfigKey,
+  SHOW_CACHE_STATS_MODES,
+  type GlobalConfig,
+} from './config.js'
+
+// Standalone Zod schema mirroring the runtime contract for showCacheStats.
+// The config file does not carry a Zod schema per field (GlobalConfig is a
+// plain TS type with defaults), so we exercise validation here so that any
+// future drift — e.g. adding a mode without updating the UI — is caught at
+// test time rather than silently rendered in /config.
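+// e.g. ShowCacheStatsSchema.parse('compact') returns 'compact', while
+// ShowCacheStatsSchema.parse('verbose') throws a ZodError; the cases
+// below pin both sides of that contract.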
+const ShowCacheStatsSchema = z.enum(SHOW_CACHE_STATS_MODES) + +describe('GlobalConfig — showCacheStats registration', () => { + test('default is "compact"', () => { + expect(DEFAULT_GLOBAL_CONFIG.showCacheStats).toBe('compact') + }) + + test('is listed in GLOBAL_CONFIG_KEYS (exposed via /config and ConfigTool)', () => { + expect(GLOBAL_CONFIG_KEYS).toContain('showCacheStats') + expect(isGlobalConfigKey('showCacheStats')).toBe(true) + }) + + test('SHOW_CACHE_STATS_MODES is the single source of truth', () => { + expect(SHOW_CACHE_STATS_MODES).toEqual(['off', 'compact', 'full']) + }) +}) + +describe('showCacheStats — Zod validation', () => { + test('accepts "off"', () => { + expect(ShowCacheStatsSchema.parse('off')).toBe('off') + }) + + test('accepts "compact"', () => { + expect(ShowCacheStatsSchema.parse('compact')).toBe('compact') + }) + + test('accepts "full"', () => { + expect(ShowCacheStatsSchema.parse('full')).toBe('full') + }) + + test('rejects arbitrary strings', () => { + expect(() => ShowCacheStatsSchema.parse('verbose')).toThrow() + expect(() => ShowCacheStatsSchema.parse('')).toThrow() + expect(() => ShowCacheStatsSchema.parse('ON')).toThrow() + }) + + test('rejects non-string values', () => { + expect(() => ShowCacheStatsSchema.parse(true)).toThrow() + expect(() => ShowCacheStatsSchema.parse(1)).toThrow() + expect(() => ShowCacheStatsSchema.parse(null)).toThrow() + expect(() => ShowCacheStatsSchema.parse(undefined)).toThrow() + }) +}) + +describe('showCacheStats — GlobalConfig type surface', () => { + test('assignable to each accepted mode without casting', () => { + const a: Pick = { showCacheStats: 'off' } + const b: Pick = { showCacheStats: 'compact' } + const c: Pick = { showCacheStats: 'full' } + expect([a.showCacheStats, b.showCacheStats, c.showCacheStats]).toEqual([ + 'off', + 'compact', + 'full', + ]) + }) +}) + +describe('showCacheStats — default applies to pre-existing configs', () => { + // Review feedback (P2 #7): "ensure the schema explicitly sets + // showCacheStats: 'compact' as the default value, not relying on the + // REPL gate's undefined handling." + // + // Config layer at src/utils/config.ts:1494 already does + // { ...createDefault(), ...parsedConfig } + // so a user who had a config file from before this PR gets the + // 'compact' default automatically on first load. These tests pin that + // behavior so a future refactor of the merge pattern surfaces the + // regression loudly. + + test('legacy config without showCacheStats field merges to default', () => { + // Simulate what getConfig() produces for an old config.json that + // predates this PR: spread default first, then spread the loaded + // (incomplete) object on top. + const legacyLoadedConfig = { + // Fields typical of a pre-PR config — anything real but no + // showCacheStats. The exact shape doesn't matter; we're testing + // the merge semantics. + theme: 'dark' as const, + } + const merged = { + ...DEFAULT_GLOBAL_CONFIG, + ...legacyLoadedConfig, + } + expect(merged.showCacheStats).toBe('compact') + }) + + test('user-set value overrides default via merge', () => { + // Counterpart: if the user has explicitly set a value, the merge + // must preserve it (defaults must NOT clobber user intent). + const userConfig = { showCacheStats: 'off' as const } + const merged = { + ...DEFAULT_GLOBAL_CONFIG, + ...userConfig, + } + expect(merged.showCacheStats).toBe('off') + }) + + test('REPL gate fallback kicks in only when mode is undefined', () => { + // Belt-and-suspenders from REPL.tsx:3031 — `?? 
'compact'` after the + // config read. Simulates the code path in case a pathological config + // read returns an empty object and skips the merge entirely. + const corruptConfigRead: Partial = {} + const mode = corruptConfigRead.showCacheStats ?? 'compact' + expect(mode).toBe('compact') + + // Explicit 'off' is preserved — fallback must not clobber user intent. + const explicitOff: Partial = { showCacheStats: 'off' } + const modeOff = explicitOff.showCacheStats ?? 'compact' + expect(modeOff).toBe('off') + }) +}) diff --git a/src/utils/config.ts b/src/utils/config.ts index 6ffc2a1b..380019f4 100644 --- a/src/utils/config.ts +++ b/src/utils/config.ts @@ -179,6 +179,9 @@ export type EditorMode = 'emacs' | (typeof EDITOR_MODES)[number] export type DiffTool = 'terminal' | 'auto' +export type ShowCacheStatsMode = 'off' | 'compact' | 'full' +export const SHOW_CACHE_STATS_MODES = ['off', 'compact', 'full'] as const satisfies readonly ShowCacheStatsMode[] + export type OutputStyle = string export type Providers = typeof PROVIDERS[number] @@ -246,6 +249,11 @@ export type GlobalConfig = { autoCompactEnabled: boolean // Controls whether auto-compact is enabled toolHistoryCompressionEnabled: boolean // Compress old tool_result content for small-context providers showTurnDuration: boolean // Controls whether to show turn duration message (e.g., "Cooked for 1m 6s") + // Controls whether to show per-query cache hit/miss stats at the end of each turn. + // 'off' — no display + // 'compact' — one-line summary (e.g. "[Cache: 1.2k read • hit 12%]") + // 'full' — breakdown (read / created / hit-rate) per query + showCacheStats: ShowCacheStatsMode /** * @deprecated Use settings.env instead. */ @@ -628,6 +636,7 @@ function createDefaultGlobalConfig(): GlobalConfig { autoCompactEnabled: true, toolHistoryCompressionEnabled: true, showTurnDuration: true, + showCacheStats: 'compact', hasSeenTasksHint: false, hasUsedStash: false, hasUsedBackgroundTask: false, @@ -677,6 +686,7 @@ export const GLOBAL_CONFIG_KEYS = [ 'autoCompactEnabled', 'toolHistoryCompressionEnabled', 'showTurnDuration', + 'showCacheStats', 'diffTool', 'env', 'tipsHistory',