fix: report cache reads in streaming and correct cost calculation (#577)
* fix: report cache reads in streaming and correct cost calculation
Fix two bugs in how the OpenAI-to-Anthropic shims handle cached tokens:
1. codexShim: streaming message_delta missing cache_read_input_tokens

The codexStreamToAnthropic() function built the final message_delta
usage object inline (not through makeUsage()) and included only
input_tokens and output_tokens, so cache_read_input_tokens was always
0 and /cost never showed cache reads for Responses API models (GPT-5+).

Also fix makeUsage() to read input_tokens_details.cached_tokens and
prompt_tokens_details.cached_tokens on the non-streaming path.
2. Both shims: cost double-counting from a convention mismatch

OpenAI includes cached tokens in input_tokens/prompt_tokens (i.e.,
input_tokens = uncached + cached), whereas Anthropic treats
input_tokens as uncached only. The cost formula was:

    cost = input_tokens * inputRate + cache_read * cacheRate

which double-counts cached tokens. Fix by subtracting the cached
count from input during the conversion:

    input_tokens = prompt_tokens - cached_tokens

In practice this inflated reported costs by roughly 2x for sessions
with high cache hit rates (which is most sessions, since Copilot
auto-caches server-side). A worked example follows.
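
To make that concrete, a worked example in TypeScript (the rates and
token counts below are hypothetical, not from a real session):

    // Hypothetical per-token rates, for illustration only.
    const inputRate = 3 / 1_000_000    // $ per input token
    const cacheRate = 0.3 / 1_000_000  // $ per cache-read token

    // OpenAI-style usage: prompt_tokens INCLUDES the cached portion.
    const promptTokens = 10_000
    const cachedTokens = 5_000

    // Old (buggy): cached tokens are billed at both rates.
    const oldCost = promptTokens * inputRate + cachedTokens * cacheRate
    // 10_000 * 0.000003 + 5_000 * 0.0000003 = $0.0315

    // Fixed: only the uncached portion is billed at the input rate.
    const newCost =
      (promptTokens - cachedTokens) * inputRate + cachedTokens * cacheRate
    // 5_000 * 0.000003 + 5_000 * 0.0000003 = $0.0165 (about 1.9x lower)
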
Fixes #515
* fix: omit zero cache read/write fields from /cost output
Only show "cache read" and "cache write" in /cost per-model usage when
the value is > 0. Providers like GitHub Copilot never report
cache_creation_input_tokens (the server manages its own cache), so
showing "0 cache write" on every line is misleading — it implies caching
is not working when it actually is.
Before:
    claude-haiku: 2.6k input, 151 output, 39.8k cache read, 0 cache write ($0.04)
After:
    claude-haiku: 2.6k input, 151 output, 39.8k cache read ($0.04)
---------
Co-authored-by: Zartris <14197299+Zartris@users.noreply.github.com>

[/cost per-model usage formatting]

@@ -181,7 +181,7 @@ function formatCost(cost: number, maxDecimalPlaces: number = 4): string {
 function formatModelUsage(): string {
   const modelUsageMap = getModelUsage()
   if (Object.keys(modelUsageMap).length === 0) {
-    return 'Usage: 0 input, 0 output, 0 cache read, 0 cache write'
+    return 'Usage: 0 input, 0 output'
   }
 
   // Accumulate usage by short name
@@ -211,15 +211,19 @@ function formatModelUsage(): string {
 
   let result = 'Usage by model:'
   for (const [shortName, usage] of Object.entries(usageByShortName)) {
-    const usageString =
+    let usageString =
       ` ${formatNumber(usage.inputTokens)} input, ` +
-      `${formatNumber(usage.outputTokens)} output, ` +
-      `${formatNumber(usage.cacheReadInputTokens)} cache read, ` +
-      `${formatNumber(usage.cacheCreationInputTokens)} cache write` +
-      (usage.webSearchRequests > 0
-        ? `, ${formatNumber(usage.webSearchRequests)} web search`
-        : '') +
-      ` (${formatCost(usage.costUSD)})`
+      `${formatNumber(usage.outputTokens)} output`
+    if (usage.cacheReadInputTokens > 0) {
+      usageString += `, ${formatNumber(usage.cacheReadInputTokens)} cache read`
+    }
+    if (usage.cacheCreationInputTokens > 0) {
+      usageString += `, ${formatNumber(usage.cacheCreationInputTokens)} cache write`
+    }
+    if (usage.webSearchRequests > 0) {
+      usageString += `, ${formatNumber(usage.webSearchRequests)} web search`
+    }
+    usageString += ` (${formatCost(usage.costUSD)})`
     result += `\n` + `${shortName}:`.padStart(21) + usageString
   }
   return result
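
For reference, a minimal standalone sketch of the new conditional
formatting in the hunk above (formatNumber, formatCost, and the Usage
shape are simplified stand-ins, not the real helpers):

    const formatNumber = (n: number) =>
      n >= 1000 ? `${(n / 1000).toFixed(1)}k` : `${n}`
    const formatCost = (c: number) => `$${c.toFixed(2)}`

    type Usage = {
      inputTokens: number
      outputTokens: number
      cacheReadInputTokens: number
      cacheCreationInputTokens: number
      webSearchRequests: number
      costUSD: number
    }

    // Zero-valued cache fields are omitted from the line entirely.
    function formatUsageLine(u: Usage): string {
      let s = `${formatNumber(u.inputTokens)} input, ${formatNumber(u.outputTokens)} output`
      if (u.cacheReadInputTokens > 0) s += `, ${formatNumber(u.cacheReadInputTokens)} cache read`
      if (u.cacheCreationInputTokens > 0) s += `, ${formatNumber(u.cacheCreationInputTokens)} cache write`
      if (u.webSearchRequests > 0) s += `, ${formatNumber(u.webSearchRequests)} web search`
      return s + ` (${formatCost(u.costUSD)})`
    }

    // Reproduces the "After" line from the commit message:
    // formatUsageLine({ inputTokens: 2600, outputTokens: 151,
    //   cacheReadInputTokens: 39800, cacheCreationInputTokens: 0,
    //   webSearchRequests: 0, costUSD: 0.04 })
    // -> "2.6k input, 151 output, 39.8k cache read ($0.04)"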

[codexShim: makeUsage and codexStreamToAnthropic]

@@ -80,12 +80,17 @@ type CodexSseEvent = {
 function makeUsage(usage?: {
   input_tokens?: number
   output_tokens?: number
+  input_tokens_details?: { cached_tokens?: number }
+  prompt_tokens_details?: { cached_tokens?: number }
 }): AnthropicUsage {
   return {
     input_tokens: usage?.input_tokens ?? 0,
     output_tokens: usage?.output_tokens ?? 0,
     cache_creation_input_tokens: 0,
-    cache_read_input_tokens: 0,
+    cache_read_input_tokens:
+      usage?.input_tokens_details?.cached_tokens ??
+      usage?.prompt_tokens_details?.cached_tokens ??
+      0,
   }
 }
 
@@ -890,8 +895,16 @@ export async function* codexStreamToAnthropic(
       stop_sequence: null,
     },
     usage: {
-      input_tokens: finalResponse?.usage?.input_tokens ?? 0,
+      // Subtract cached tokens: OpenAI includes them in input_tokens,
+      // but Anthropic convention treats input_tokens as non-cached only.
+      input_tokens: (finalResponse?.usage?.input_tokens ?? 0) -
+        (finalResponse?.usage?.input_tokens_details?.cached_tokens ??
+          finalResponse?.usage?.prompt_tokens_details?.cached_tokens ?? 0),
       output_tokens: finalResponse?.usage?.output_tokens ?? 0,
+      cache_read_input_tokens:
+        finalResponse?.usage?.input_tokens_details?.cached_tokens ??
+        finalResponse?.usage?.prompt_tokens_details?.cached_tokens ??
+        0,
     },
   }
   yield { type: 'message_stop' }
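
The ?? fallback chains above exist because the two OpenAI API shapes
nest cached-token counts differently: Responses API usage reports
input_tokens_details.cached_tokens, while Chat-Completions-shaped usage
reports prompt_tokens_details.cached_tokens. A small sketch of that
resolution (OpenAIUsage and cachedTokens are illustrative names, not
from the codebase):

    type OpenAIUsage = {
      input_tokens?: number
      output_tokens?: number
      input_tokens_details?: { cached_tokens?: number }  // Responses API
      prompt_tokens_details?: { cached_tokens?: number } // Chat Completions
    }

    // First matching field wins; absent fields fall through to 0.
    function cachedTokens(usage?: OpenAIUsage): number {
      return (
        usage?.input_tokens_details?.cached_tokens ??
        usage?.prompt_tokens_details?.cached_tokens ??
        0
      )
    }

    // Either shape resolves to the same cache-read count:
    // cachedTokens({ input_tokens_details: { cached_tokens: 9_000 } })  // 9000
    // cachedTokens({ prompt_tokens_details: { cached_tokens: 9_000 } }) // 9000
    // cachedTokens(undefined)                                           // 0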

[Chat Completions shim: convertChunkUsage]

@@ -564,11 +564,14 @@ function convertChunkUsage(
 ): Partial<AnthropicUsage> | undefined {
   if (!usage) return undefined
 
+  const cached = usage.prompt_tokens_details?.cached_tokens ?? 0
   return {
-    input_tokens: usage.prompt_tokens ?? 0,
+    // Subtract cached tokens: OpenAI includes them in prompt_tokens,
+    // but Anthropic convention treats input_tokens as non-cached only.
+    input_tokens: (usage.prompt_tokens ?? 0) - cached,
     output_tokens: usage.completion_tokens ?? 0,
     cache_creation_input_tokens: 0,
-    cache_read_input_tokens: usage.prompt_tokens_details?.cached_tokens ?? 0,
+    cache_read_input_tokens: cached,
   }
 }
 
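
And a quick sanity check of the new per-chunk arithmetic, using a
hypothetical Chat Completions usage payload in which 9k of 12k prompt
tokens were cache hits:

    const usage = {
      prompt_tokens: 12_000, // includes cached tokens (OpenAI convention)
      completion_tokens: 500,
      prompt_tokens_details: { cached_tokens: 9_000 },
    }

    const cached = usage.prompt_tokens_details?.cached_tokens ?? 0

    // Anthropic convention: input_tokens counts only uncached tokens.
    const converted = {
      input_tokens: (usage.prompt_tokens ?? 0) - cached, // 3_000
      output_tokens: usage.completion_tokens ?? 0,       // 500
      cache_creation_input_tokens: 0,                    // Copilot caches server-side
      cache_read_input_tokens: cached,                   // 9_000
    }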