From f4ac709fa6eda732bf45204fcab625ba6c5674b9 Mon Sep 17 00:00:00 2001
From: Zartris
Date: Fri, 10 Apr 2026 17:40:42 +0200
Subject: [PATCH] fix: report cache reads in streaming and correct cost calculation (#577)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix: report cache reads in streaming and correct cost calculation

Fix two bugs in how the OpenAI-to-Anthropic shim handles cached tokens:

1. codexShim: streaming message_delta missing cache_read_input_tokens

   The codexStreamToAnthropic() function builds the final message_delta
   usage object inline (not through makeUsage()), and only included
   input_tokens and output_tokens. cache_read_input_tokens was always 0,
   so /cost never showed cache reads for Responses API models (GPT-5+).

   Also fix makeUsage() to read input_tokens_details.cached_tokens and
   prompt_tokens_details.cached_tokens for the non-streaming path.

2. Both shims: cost double-counting from convention mismatch

   OpenAI includes cached tokens in input_tokens/prompt_tokens
   (i.e., input_tokens = uncached + cached). Anthropic treats
   input_tokens as uncached only. The cost formula was:

     cost = input_tokens * inputRate + cache_read * cacheRate

   This double-counts cached tokens. Fix by subtracting cached from
   input during the conversion:

     input_tokens = prompt_tokens - cached_tokens

   In practice this was inflating reported costs by ~2x for sessions
   with high cache hit rates (which is most sessions, since Copilot
   auto-caches server-side).

Fixes #515

* fix: omit zero cache read/write fields from /cost output

Only show "cache read" and "cache write" in /cost per-model usage when
the value is > 0. Providers like GitHub Copilot never report
cache_creation_input_tokens (the server manages its own cache), so
showing "0 cache write" on every line is misleading: it implies caching
is not working when it actually is.

Before:
  claude-haiku: 2.6k input, 151 output, 39.8k cache read, 0 cache write ($0.04)
After:
  claude-haiku: 2.6k input, 151 output, 39.8k cache read ($0.04)

---------

Co-authored-by: Zartris <14197299+Zartris@users.noreply.github.com>
---
 src/cost-tracker.ts            | 22 +++++++++++++---------
 src/services/api/codexShim.ts  | 17 +++++++++++++++--
 src/services/api/openaiShim.ts |  7 +++++--
 3 files changed, 33 insertions(+), 13 deletions(-)

diff --git a/src/cost-tracker.ts b/src/cost-tracker.ts
index b03184c6..56920c5a 100644
--- a/src/cost-tracker.ts
+++ b/src/cost-tracker.ts
@@ -181,7 +181,7 @@ function formatCost(cost: number, maxDecimalPlaces: number = 4): string {
 function formatModelUsage(): string {
   const modelUsageMap = getModelUsage()
   if (Object.keys(modelUsageMap).length === 0) {
-    return 'Usage: 0 input, 0 output, 0 cache read, 0 cache write'
+    return 'Usage: 0 input, 0 output'
   }
 
   // Accumulate usage by short name
@@ -211,15 +211,19 @@ function formatModelUsage(): string {
   let result = 'Usage by model:'
 
   for (const [shortName, usage] of Object.entries(usageByShortName)) {
-    const usageString =
+    let usageString =
       ` ${formatNumber(usage.inputTokens)} input, ` +
-      `${formatNumber(usage.outputTokens)} output, ` +
-      `${formatNumber(usage.cacheReadInputTokens)} cache read, ` +
-      `${formatNumber(usage.cacheCreationInputTokens)} cache write` +
-      (usage.webSearchRequests > 0
-        ? `, ${formatNumber(usage.webSearchRequests)} web search`
-        : '') +
-      ` (${formatCost(usage.costUSD)})`
+      `${formatNumber(usage.outputTokens)} output`
+    if (usage.cacheReadInputTokens > 0) {
+      usageString += `, ${formatNumber(usage.cacheReadInputTokens)} cache read`
+    }
+    if (usage.cacheCreationInputTokens > 0) {
+      usageString += `, ${formatNumber(usage.cacheCreationInputTokens)} cache write`
+    }
+    if (usage.webSearchRequests > 0) {
+      usageString += `, ${formatNumber(usage.webSearchRequests)} web search`
+    }
+    usageString += ` (${formatCost(usage.costUSD)})`
     result += `\n` + `${shortName}:`.padStart(21) + usageString
   }
   return result
diff --git a/src/services/api/codexShim.ts b/src/services/api/codexShim.ts
index 4b7260e7..4c823a3d 100644
--- a/src/services/api/codexShim.ts
+++ b/src/services/api/codexShim.ts
@@ -80,12 +80,17 @@ type CodexSseEvent = {
 function makeUsage(usage?: {
   input_tokens?: number
   output_tokens?: number
+  input_tokens_details?: { cached_tokens?: number }
+  prompt_tokens_details?: { cached_tokens?: number }
 }): AnthropicUsage {
   return {
     input_tokens: usage?.input_tokens ?? 0,
     output_tokens: usage?.output_tokens ?? 0,
     cache_creation_input_tokens: 0,
-    cache_read_input_tokens: 0,
+    cache_read_input_tokens:
+      usage?.input_tokens_details?.cached_tokens ??
+      usage?.prompt_tokens_details?.cached_tokens ??
+      0,
   }
 }
 
@@ -890,8 +895,16 @@ export async function* codexStreamToAnthropic(
       stop_sequence: null,
     },
     usage: {
-      input_tokens: finalResponse?.usage?.input_tokens ?? 0,
+      // Subtract cached tokens: OpenAI includes them in input_tokens,
+      // but Anthropic convention treats input_tokens as non-cached only.
+      input_tokens: (finalResponse?.usage?.input_tokens ?? 0) -
+        (finalResponse?.usage?.input_tokens_details?.cached_tokens ??
+          finalResponse?.usage?.prompt_tokens_details?.cached_tokens ?? 0),
       output_tokens: finalResponse?.usage?.output_tokens ?? 0,
+      cache_read_input_tokens:
+        finalResponse?.usage?.input_tokens_details?.cached_tokens ??
+        finalResponse?.usage?.prompt_tokens_details?.cached_tokens ??
+        0,
     },
   }
   yield { type: 'message_stop' }
diff --git a/src/services/api/openaiShim.ts b/src/services/api/openaiShim.ts
index 727e4ca9..978ecf57 100644
--- a/src/services/api/openaiShim.ts
+++ b/src/services/api/openaiShim.ts
@@ -564,11 +564,14 @@ function convertChunkUsage(
 ): Partial<AnthropicUsage> | undefined {
   if (!usage) return undefined
 
+  const cached = usage.prompt_tokens_details?.cached_tokens ?? 0
   return {
-    input_tokens: usage.prompt_tokens ?? 0,
+    // Subtract cached tokens: OpenAI includes them in prompt_tokens,
+    // but Anthropic convention treats input_tokens as non-cached only.
+    input_tokens: (usage.prompt_tokens ?? 0) - cached,
    output_tokens: usage.completion_tokens ?? 0,
     cache_creation_input_tokens: 0,
-    cache_read_input_tokens: usage.prompt_tokens_details?.cached_tokens ?? 0,
+    cache_read_input_tokens: cached,
   }
 }
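Note for reviewers (not part of the patch): the sketch below restates the token-accounting convention this change adopts as one self-contained helper. The names OpenAIChunkUsage, AnthropicStyleUsage, and toAnthropicUsage are illustrative only, not actual exports of codexShim.ts or openaiShim.ts; treat it as a minimal sketch of the convention, not the shim's real code.

// Minimal sketch of the OpenAI -> Anthropic usage conversion described above.
type OpenAIChunkUsage = {
  prompt_tokens?: number
  completion_tokens?: number
  prompt_tokens_details?: { cached_tokens?: number }
}

type AnthropicStyleUsage = {
  input_tokens: number // uncached input only (Anthropic convention)
  output_tokens: number
  cache_creation_input_tokens: number
  cache_read_input_tokens: number
}

function toAnthropicUsage(usage?: OpenAIChunkUsage): AnthropicStyleUsage {
  const cached = usage?.prompt_tokens_details?.cached_tokens ?? 0
  return {
    // OpenAI's prompt_tokens already includes cached tokens, so subtract them
    // here; otherwise cost = input * inputRate + cacheRead * cacheRate counts
    // the cached portion twice.
    input_tokens: (usage?.prompt_tokens ?? 0) - cached,
    output_tokens: usage?.completion_tokens ?? 0,
    cache_creation_input_tokens: 0, // server-managed caches report no writes
    cache_read_input_tokens: cached,
  }
}

// Example: 10,000 prompt tokens of which 8,000 were cached converts to
// { input_tokens: 2000, cache_read_input_tokens: 8000 }, so cost is
// 2000 * inputRate + 8000 * cacheRate rather than 10000 * inputRate + 8000 * cacheRate.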