feat(api): expose cache metrics in REPL + normalize across providers (#813)
* feat(api): expose cache metrics in REPL + /cache-stats command
* fix(api): normalize Kimi/DeepSeek/Gemini cache fields through shim layer
* test(api): cover /cache-stats rendering + fix CacheMetrics docstring drift
* fix(api): always reset cache turn counter + include date in /cache-stats rows
* refactor(api): unify shim usage builder + add cost-tracker wiring test
* fix(api): classify private-IP/self-hosted OpenAI endpoints as N/A instead of cold
* fix(api): require colon guard on IPv6 ULA prefix to avoid public-host over-match
* perf(api): ring buffer for cache history + hit rate clamp + .localhost TLD
* fix(api): null guards on formatters + document Codex Responses API shape
* fix(api): defensive start-of-turn reset + config gate fallback + env var docs
* fix(api): trust forwarded cache data on self-hosted URLs (data-driven)
* refactor(api): delegate streaming Responses usage to shared makeUsage helper
782
src/services/api/cacheMetrics.test.ts
Normal file
@@ -0,0 +1,782 @@
import { expect, test, describe } from 'bun:test'
import {
  extractCacheMetrics,
  extractCacheReadFromRawUsage,
  resolveCacheProvider,
  formatCacheMetricsCompact,
  formatCacheMetricsFull,
  addCacheMetrics,
} from './cacheMetrics.js'

describe('extractCacheMetrics — Anthropic (firstParty/bedrock/vertex/foundry)', () => {
  test('reports read/created separately and computes hit rate over total input', () => {
    const usage = {
      input_tokens: 300,
      output_tokens: 100,
      cache_read_input_tokens: 800,
      cache_creation_input_tokens: 200,
    }
    const m = extractCacheMetrics(usage, 'anthropic')
    expect(m.supported).toBe(true)
    expect(m.read).toBe(800)
    expect(m.created).toBe(200)
    // total = fresh(300) + created(200) + read(800) = 1300
    expect(m.total).toBe(1300)
    expect(m.hitRate).toBeCloseTo(800 / 1300, 4)
  })

  test('returns cold metrics when no cache activity yet', () => {
    const m = extractCacheMetrics({ input_tokens: 500 }, 'anthropic')
    expect(m.supported).toBe(true)
    expect(m.read).toBe(0)
    expect(m.created).toBe(0)
    expect(m.hitRate).toBe(0)
  })

  test('null hit rate when usage has no input at all', () => {
    const m = extractCacheMetrics({}, 'anthropic')
    expect(m.supported).toBe(true)
    expect(m.hitRate).toBeNull()
  })
})

// NOTE: OpenAI/Codex/Kimi/DeepSeek/Gemini raw shapes are now tested through
// extractCacheReadFromRawUsage (below). extractCacheMetrics sees the
// post-shim Anthropic shape for every provider, so the tests here verify
// that the shape lookup works uniformly against the shimmed fields.

describe('extractCacheMetrics — post-shim Anthropic shape (applies to all providers)', () => {
  test('OpenAI post-shim (openai bucket) — reads Anthropic fields injected by convertChunkUsage', () => {
    // This is what cost-tracker actually sees for OpenAI upstreams: the
    // shim has already subtracted cached from prompt_tokens and moved it
    // to cache_read_input_tokens.
    const shimmed = {
      input_tokens: 800, // fresh = 2000 - 1200
      output_tokens: 300,
      cache_creation_input_tokens: 0,
      cache_read_input_tokens: 1_200,
    }
    const m = extractCacheMetrics(shimmed, 'openai')
    expect(m.supported).toBe(true)
    expect(m.read).toBe(1_200)
    expect(m.created).toBe(0)
    expect(m.total).toBe(2_000) // 800 fresh + 1200 read
    expect(m.hitRate).toBe(0.6)
  })

  test('Codex post-shim — same Anthropic shape as OpenAI', () => {
    const shimmed = {
      input_tokens: 900, // 1500 - 600
      cache_creation_input_tokens: 0,
      cache_read_input_tokens: 600,
    }
    const m = extractCacheMetrics(shimmed, 'codex')
    expect(m.read).toBe(600)
    expect(m.total).toBe(1_500)
    expect(m.hitRate).toBe(0.4)
  })

  test('Kimi post-shim — shim moved top-level cached_tokens into Anthropic field', () => {
    const shimmed = {
      input_tokens: 600, // 1000 - 400
      cache_creation_input_tokens: 0,
      cache_read_input_tokens: 400,
    }
    const m = extractCacheMetrics(shimmed, 'kimi')
    expect(m.read).toBe(400)
    expect(m.total).toBe(1_000)
    expect(m.hitRate).toBe(0.4)
  })

  test('DeepSeek post-shim — hit moved to cache_read_input_tokens, miss to input_tokens', () => {
    const shimmed = {
      input_tokens: 300, // miss
      cache_creation_input_tokens: 0,
      cache_read_input_tokens: 700, // hit
    }
    const m = extractCacheMetrics(shimmed, 'deepseek')
    expect(m.read).toBe(700)
    expect(m.total).toBe(1_000)
    expect(m.hitRate).toBe(0.7)
  })

  test('Gemini post-shim — cached_content_token_count moved to cache_read_input_tokens', () => {
    const shimmed = {
      input_tokens: 800, // 4000 - 3200
      cache_creation_input_tokens: 0,
      cache_read_input_tokens: 3_200,
    }
    const m = extractCacheMetrics(shimmed, 'gemini')
    expect(m.read).toBe(3_200)
    expect(m.total).toBe(4_000)
    expect(m.hitRate).toBe(0.8)
  })
})

describe('extractCacheReadFromRawUsage — single source of truth for shim layer', () => {
  test('Anthropic-native passthrough: cache_read_input_tokens', () => {
    expect(
      extractCacheReadFromRawUsage({ cache_read_input_tokens: 1_500 }),
    ).toBe(1_500)
  })

  test('OpenAI Chat Completions: prompt_tokens_details.cached_tokens', () => {
    expect(
      extractCacheReadFromRawUsage({
        prompt_tokens: 2_000,
        prompt_tokens_details: { cached_tokens: 1_200 },
      }),
    ).toBe(1_200)
  })

  test('Codex Responses API: input_tokens_details.cached_tokens', () => {
    expect(
      extractCacheReadFromRawUsage({
        input_tokens: 1_500,
        input_tokens_details: { cached_tokens: 600 },
      }),
    ).toBe(600)
  })

  test('Kimi / Moonshot: top-level cached_tokens', () => {
    expect(
      extractCacheReadFromRawUsage({ prompt_tokens: 1_000, cached_tokens: 400 }),
    ).toBe(400)
  })

  test('DeepSeek: prompt_cache_hit_tokens', () => {
    expect(
      extractCacheReadFromRawUsage({
        prompt_cache_hit_tokens: 700,
        prompt_cache_miss_tokens: 300,
      }),
    ).toBe(700)
  })

  test('Gemini: cached_content_token_count', () => {
    expect(
      extractCacheReadFromRawUsage({
        prompt_token_count: 4_000,
        cached_content_token_count: 3_200,
      }),
    ).toBe(3_200)
  })

  test('no cache fields at all → 0 (Copilot/Ollama/unknown shape)', () => {
    expect(extractCacheReadFromRawUsage({ prompt_tokens: 500 })).toBe(0)
  })

  test('Anthropic field wins over OpenAI field when both present', () => {
    // Shouldn't happen in practice, but if usage was double-annotated we
    // trust the Anthropic-native number (it's the more authoritative one).
    expect(
      extractCacheReadFromRawUsage({
        cache_read_input_tokens: 999,
        prompt_tokens_details: { cached_tokens: 111 },
      }),
    ).toBe(999)
  })

  test('null/undefined/non-object → 0', () => {
    expect(extractCacheReadFromRawUsage(null)).toBe(0)
    expect(extractCacheReadFromRawUsage(undefined)).toBe(0)
    expect(extractCacheReadFromRawUsage('nope' as unknown as never)).toBe(0)
  })
})

describe('extractCacheMetrics — Copilot / Ollama (unsupported)', () => {
  test('returns supported:false with all zeros and null hitRate for Copilot', () => {
    const m = extractCacheMetrics({ prompt_tokens: 1000 }, 'copilot')
    expect(m.supported).toBe(false)
    expect(m.read).toBe(0)
    expect(m.created).toBe(0)
    expect(m.hitRate).toBeNull()
  })

  test('returns supported:false for Ollama', () => {
    const m = extractCacheMetrics({ prompt_tokens: 42 }, 'ollama')
    expect(m.supported).toBe(false)
    expect(m.hitRate).toBeNull()
  })

  test('Copilot serving Claude (copilot-claude) is supported and uses Anthropic fields', () => {
    const usage = {
      input_tokens: 200,
      cache_read_input_tokens: 800,
      cache_creation_input_tokens: 100,
    }
    const m = extractCacheMetrics(usage, 'copilot-claude')
    expect(m.supported).toBe(true)
    expect(m.read).toBe(800)
    expect(m.created).toBe(100)
    expect(m.total).toBe(1_100)
  })
})

describe('extractCacheMetrics — bad/empty input', () => {
  test('null usage returns unsupported', () => {
    expect(extractCacheMetrics(null, 'anthropic').supported).toBe(false)
  })

  test('non-object usage returns unsupported', () => {
    expect(extractCacheMetrics('oops' as unknown as never, 'openai').supported).toBe(
      false,
    )
  })
})

describe('resolveCacheProvider', () => {
  test('firstParty → anthropic', () => {
    expect(resolveCacheProvider('firstParty')).toBe('anthropic')
  })
  test('bedrock/vertex/foundry → anthropic', () => {
    expect(resolveCacheProvider('bedrock')).toBe('anthropic')
    expect(resolveCacheProvider('vertex')).toBe('anthropic')
    expect(resolveCacheProvider('foundry')).toBe('anthropic')
  })
  test('github without claude hint → copilot (unsupported)', () => {
    expect(resolveCacheProvider('github')).toBe('copilot')
  })
  test('github with claude hint → copilot-claude', () => {
    expect(
      resolveCacheProvider('github', { githubNativeAnthropic: true }),
    ).toBe('copilot-claude')
  })
  test('openai with localhost / loopback → self-hosted', () => {
    // These used to return 'ollama'; the bucket is now 'self-hosted'
    // because not every local OpenAI-compatible server is Ollama
    // (it could be vLLM, LM Studio, LocalAI, text-generation-webui).
    // Both buckets collapse to supported=false downstream.
    expect(
      resolveCacheProvider('openai', { openAiBaseUrl: 'http://localhost:8080/v1' }),
    ).toBe('self-hosted')
    expect(
      resolveCacheProvider('openai', { openAiBaseUrl: 'http://127.0.0.1:1234/v1' }),
    ).toBe('self-hosted')
    // localhost:11434 hits the self-hosted branch first — 'ollama' only
    // kicks in when the :11434 port appears on a public-looking URL
    // (which would be unusual but still deserves honest classification).
    expect(
      resolveCacheProvider('openai', { openAiBaseUrl: 'http://localhost:11434/v1' }),
    ).toBe('self-hosted')
    expect(
      resolveCacheProvider('openai', { openAiBaseUrl: 'http://[::1]:5000/v1' }),
    ).toBe('self-hosted')
  })

  test('openai on RFC1918 private IP → self-hosted (pre-fix: misclassified as openai)', () => {
    // These are the exact cases the reviewer flagged. Before this fix,
    // a vLLM / LocalAI server on a LAN address fell through to the
    // 'openai' branch and /cache-stats showed '[Cache: cold]' — which
    // users read as "my cache is broken" when the provider simply
    // didn't report cache fields. Now they land in 'self-hosted' and
    // /cache-stats shows '[Cache: N/A]'.
    expect(
      resolveCacheProvider('openai', { openAiBaseUrl: 'http://192.168.1.50:8000/v1' }),
    ).toBe('self-hosted')
    expect(
      resolveCacheProvider('openai', { openAiBaseUrl: 'http://10.0.0.7:8080/v1' }),
    ).toBe('self-hosted')
    expect(
      resolveCacheProvider('openai', { openAiBaseUrl: 'http://172.20.0.3:5000/v1' }),
    ).toBe('self-hosted')
  })

  test('openai on link-local / CGNAT → self-hosted', () => {
    expect(
      resolveCacheProvider('openai', { openAiBaseUrl: 'http://169.254.169.254/v1' }),
    ).toBe('self-hosted')
    expect(
      resolveCacheProvider('openai', { openAiBaseUrl: 'http://100.64.1.5:8000/v1' }),
    ).toBe('self-hosted')
  })

  test('openai on reserved TLD (.local / .internal / .lan / .home.arpa) → self-hosted', () => {
    // Per RFC 6761 (.local/mDNS), RFC 8375 (.home.arpa), and the widely
    // used .internal / .lan conventions. These never resolve publicly.
    expect(
      resolveCacheProvider('openai', { openAiBaseUrl: 'http://llm.internal:5000/v1' }),
    ).toBe('self-hosted')
    expect(
      resolveCacheProvider('openai', { openAiBaseUrl: 'http://llm.local:8080/v1' }),
    ).toBe('self-hosted')
    expect(
      resolveCacheProvider('openai', { openAiBaseUrl: 'http://vllm.home.arpa/v1' }),
    ).toBe('self-hosted')
    expect(
      resolveCacheProvider('openai', { openAiBaseUrl: 'http://box.lan:1234/v1' }),
    ).toBe('self-hosted')
  })

  test('openai on IPv6 local / link-local → self-hosted', () => {
    expect(
      resolveCacheProvider('openai', { openAiBaseUrl: 'http://[fe80::1]:8000/v1' }),
    ).toBe('self-hosted')
    expect(
      resolveCacheProvider('openai', { openAiBaseUrl: 'http://[fd12:3456::7]:8080/v1' }),
    ).toBe('self-hosted')
    expect(
      resolveCacheProvider('openai', { openAiBaseUrl: 'http://[fc00::1]:8080/v1' }),
    ).toBe('self-hosted')
  })

  test('IPv6 ULA prefix (fc/fd) does NOT over-match public hostnames', () => {
    // Regression guard: an early version of isLocalOrPrivateUrl checked
    // `h.startsWith('fc')` / `startsWith('fd')` without a colon guard,
    // which misclassified legitimate public hosts whose names happen to
    // begin with those letters. The fix requires a colon in the match
    // so only real IPv6 literals hit the branch.
    expect(
      resolveCacheProvider('openai', {
        openAiBaseUrl: 'https://fc-api.example.com/v1',
      }),
    ).toBe('openai')
    expect(
      resolveCacheProvider('openai', {
        openAiBaseUrl: 'https://fd-hosted.example.com/v1',
      }),
    ).toBe('openai')
    // Same goes for names that look like hex prefixes but aren't IPv6.
    expect(
      resolveCacheProvider('openai', {
        openAiBaseUrl: 'https://fcbench.net/v1',
      }),
    ).toBe('openai')
  })

  test('openai with :11434 on a public host → ollama (default-port heuristic)', () => {
    // Contrived, but the heuristic should still fire — someone running
    // Ollama behind a reverse proxy with the port preserved.
    expect(
      resolveCacheProvider('openai', {
        openAiBaseUrl: 'https://ollama.example.com:11434/v1',
      }),
    ).toBe('ollama')
  })

  test('openai with moonshot URL → kimi', () => {
    expect(
      resolveCacheProvider('openai', { openAiBaseUrl: 'https://api.moonshot.ai/v1' }),
    ).toBe('kimi')
  })
  test('openai with deepseek URL → deepseek', () => {
    expect(
      resolveCacheProvider('openai', { openAiBaseUrl: 'https://api.deepseek.com/v1' }),
    ).toBe('deepseek')
  })
  test('private IP beats hosted-keyword matching (self-hosted takes priority)', () => {
    // A pathological URL: a private-IP host whose path string contains
    // "deepseek". Self-hosted detection must run FIRST so the URL
    // classifies honestly — the path alone doesn't prove the upstream
    // is the real DeepSeek API.
    expect(
      resolveCacheProvider('openai', {
        openAiBaseUrl: 'http://10.0.0.5:8000/v1/deepseek-proxy',
      }),
    ).toBe('self-hosted')
  })
  test('plain openai remains openai', () => {
    expect(
      resolveCacheProvider('openai', { openAiBaseUrl: 'https://api.openai.com/v1' }),
    ).toBe('openai')
  })
  test('unparseable base URL falls back to substring heuristic', () => {
    // A bare host:port without a scheme is a common misconfiguration.
    // We can't URL-parse it, but we still honor the "localhost" hint so
    // a broken config doesn't silently masquerade as cache-capable.
    expect(
      resolveCacheProvider('openai', { openAiBaseUrl: 'localhost:8000' }),
    ).toBe('self-hosted')
    // An unparseable and opaque string falls through to plain 'openai'
    // (best-effort — nothing we can infer from "foo-bar-baz").
    expect(
      resolveCacheProvider('openai', { openAiBaseUrl: '???' }),
    ).toBe('openai')
  })
  test('empty base URL → plain openai', () => {
    // No hint at all: assume the canonical api.openai.com.
    expect(resolveCacheProvider('openai')).toBe('openai')
    expect(
      resolveCacheProvider('openai', { openAiBaseUrl: '' }),
    ).toBe('openai')
  })
  test('codex → codex', () => {
    expect(resolveCacheProvider('codex')).toBe('codex')
  })
  test('gemini → gemini', () => {
    expect(resolveCacheProvider('gemini')).toBe('gemini')
  })
})

describe('resolveCacheProvider — .localhost TLD (RFC 6761)', () => {
  test('subdomains of .localhost classify as self-hosted', () => {
    // Chrome, Firefox, and systemd-resolved all natively resolve
    // *.localhost to 127.0.0.1. Kubernetes Ingress and docker-compose
    // setups commonly use app.localhost, api.localhost, etc.
    expect(
      resolveCacheProvider('openai', {
        openAiBaseUrl: 'http://app.localhost:3000/v1',
      }),
    ).toBe('self-hosted')
    expect(
      resolveCacheProvider('openai', {
        openAiBaseUrl: 'http://api.localhost/v1',
      }),
    ).toBe('self-hosted')
    expect(
      resolveCacheProvider('openai', {
        openAiBaseUrl: 'http://llm.dev.localhost:8080/v1',
      }),
    ).toBe('self-hosted')
  })

  test('.localhost TLD does NOT match substring collisions', () => {
    // Guard against regressions where `localhost` would match via
    // substring rather than TLD semantics. `localhostify.com` and
    // `mylocalhost.net` must stay on the public `openai` path.
    expect(
      resolveCacheProvider('openai', {
        openAiBaseUrl: 'https://localhostify.com/v1',
      }),
    ).toBe('openai')
    expect(
      resolveCacheProvider('openai', {
        openAiBaseUrl: 'https://mylocalhost.net/v1',
      }),
    ).toBe('openai')
  })
})

describe('extractCacheMetrics — hit rate clamp', () => {
  test('hitRate is clamped to 1.0 on pathological input (read > total)', () => {
    // Defensive guard: with valid non-negative inputs the math enforces
    // read <= total, so hitRate cannot exceed 1. But an upstream shim
    // bug (e.g. reading a negative `fresh` from a future provider) could
    // break the invariant. `Math.min(1, read/total)` caps the display at
    // 100% rather than letting a `read=800 total=500` case render as
    // "hit 160%" or (worse) null, which would hide the anomaly.
    const metrics = extractCacheMetrics(
      {
        cache_read_input_tokens: 800,
        cache_creation_input_tokens: 0,
        // asNumber keeps finite negatives, so fresh = -500 → total =
        // 800 + 0 + (-500) = 300, read=800 → raw ratio 2.67, clamp to 1.
        input_tokens: -500,
      } as unknown as Record<string, unknown>,
      'anthropic',
    )
    expect(metrics.supported).toBe(true)
    expect(metrics.hitRate).toBe(1)
  })

  test('normal inputs still yield accurate fractional hit rates', () => {
    // Regression: the clamp must not perturb the happy path.
    const metrics = extractCacheMetrics(
      {
        cache_read_input_tokens: 300,
        cache_creation_input_tokens: 0,
        input_tokens: 700,
      },
      'anthropic',
    )
    expect(metrics.hitRate).toBeCloseTo(0.3, 5)
  })
})

describe('extractCacheMetrics — self-hosted bucket (data-driven)', () => {
  test('vanilla self-hosted endpoint without cache fields → unsupported / N/A', () => {
    // vLLM, LocalAI, text-generation-webui, etc. emit no cache fields
    // at all. With read=created=0 we mark unsupported so the REPL shows
    // an honest '[Cache: N/A]' instead of a fabricated 0%.
    const metrics = extractCacheMetrics(
      { input_tokens: 1_000, output_tokens: 200 },
      'self-hosted',
    )
    expect(metrics.supported).toBe(false)
    expect(metrics.hitRate).toBeNull()
    expect(metrics.read).toBe(0)
    expect(metrics.created).toBe(0)
  })

  test('internal reverse proxy forwarding real cache data → supported', () => {
    // Review-blocker regression guard: an enterprise setup with an
    // internal proxy on a private URL (e.g. `http://llm.internal:5000/v1`)
    // forwarding to OpenAI / Kimi / DeepSeek / Gemini WILL deliver real
    // cache fields via the shim. Pre-fix we would discard them because
    // the URL heuristic classified the endpoint as 'self-hosted'. Now
    // the data itself decides: any non-zero cache activity flows through
    // the same normalization as an OpenAI bucket.
    const shimmed = {
      input_tokens: 800, // fresh (post-shim, cached already subtracted)
      cache_read_input_tokens: 1_200, // shim extracted from upstream
      cache_creation_input_tokens: 0,
    }
    const metrics = extractCacheMetrics(shimmed, 'self-hosted')
    expect(metrics.supported).toBe(true)
    expect(metrics.read).toBe(1_200)
    expect(metrics.total).toBe(2_000)
    expect(metrics.hitRate).toBe(0.6)
  })

  test('proxy with cache_creation but zero cache_read → still supported', () => {
    // Mirror of the above for the first-call / cold-cache scenario:
    // Anthropic-compatible upstreams emit creation tokens on the first
    // request that primes the cache. A self-hosted proxy must preserve
    // that signal, not swallow it because read is still 0.
    const shimmed = {
      input_tokens: 500,
      cache_read_input_tokens: 0,
      cache_creation_input_tokens: 800,
    }
    const metrics = extractCacheMetrics(shimmed, 'self-hosted')
    expect(metrics.supported).toBe(true)
    expect(metrics.created).toBe(800)
    expect(metrics.read).toBe(0)
  })
})

describe('formatCacheMetrics — defensive null/undefined guards', () => {
  test('formatCacheMetricsCompact returns N/A for undefined input', () => {
    // The signature says `CacheMetrics`, but a runtime bug on a failed
    // API response could leave the caller with nothing. The formatter
    // should degrade gracefully rather than throw on `.supported`.
    expect(formatCacheMetricsCompact(undefined)).toBe('[Cache: N/A]')
    expect(formatCacheMetricsCompact(null as unknown as undefined)).toBe(
      '[Cache: N/A]',
    )
  })

  test('formatCacheMetricsFull returns N/A for undefined input', () => {
    expect(formatCacheMetricsFull(undefined)).toBe('[Cache: N/A]')
    expect(formatCacheMetricsFull(null as unknown as undefined)).toBe(
      '[Cache: N/A]',
    )
  })
})

describe('formatCacheMetricsCompact — self-hosted display paths', () => {
  test('vanilla self-hosted (no cache data) renders as N/A', () => {
    const metrics = extractCacheMetrics(
      { input_tokens: 500 },
      'self-hosted',
    )
    expect(formatCacheMetricsCompact(metrics)).toBe('[Cache: N/A]')
    expect(formatCacheMetricsFull(metrics)).toBe('[Cache: N/A]')
  })

  test('self-hosted proxy with forwarded cache data renders real metrics', () => {
    // Full display-path regression guard for the review-blocker fix:
    // the user must see the real hit rate that the upstream emitted,
    // not a silent N/A because the URL looked private.
    const metrics = extractCacheMetrics(
      {
        input_tokens: 800,
        cache_read_input_tokens: 1_200,
        cache_creation_input_tokens: 0,
      },
      'self-hosted',
    )
    expect(formatCacheMetricsCompact(metrics)).toBe('[Cache: 1.2k read • hit 60%]')
    expect(formatCacheMetricsFull(metrics)).toBe(
      '[Cache: read=1.2k created=0 hit=60%]',
    )
  })
})

describe('formatCacheMetricsCompact — snapshot-stable output', () => {
  test('supported with reads shows "k" abbreviation and hit rate', () => {
    const out = formatCacheMetricsCompact({
      read: 1_234,
      created: 0,
      total: 10_000,
      hitRate: 0.1234,
      supported: true,
    })
    expect(out).toBe('[Cache: 1.2k read • hit 12%]')
  })

  test('supported with no cache activity renders "cold"', () => {
    const out = formatCacheMetricsCompact({
      read: 0,
      created: 0,
      total: 500,
      hitRate: 0,
      supported: true,
    })
    expect(out).toBe('[Cache: cold]')
  })

  test('unsupported renders "N/A"', () => {
    const out = formatCacheMetricsCompact({
      read: 0,
      created: 0,
      total: 0,
      hitRate: null,
      supported: false,
    })
    expect(out).toBe('[Cache: N/A]')
  })

  test('small numbers render without abbreviation', () => {
    const out = formatCacheMetricsCompact({
      read: 42,
      created: 0,
      total: 100,
      hitRate: 0.42,
      supported: true,
    })
    expect(out).toBe('[Cache: 42 read • hit 42%]')
  })
})

describe('formatCacheMetricsFull — snapshot-stable output', () => {
  test('supported shows all fields', () => {
    const out = formatCacheMetricsFull({
      read: 1_234,
      created: 250,
      total: 10_000,
      hitRate: 0.1234,
      supported: true,
    })
    expect(out).toBe('[Cache: read=1.2k created=250 hit=12%]')
  })

  test('null hit rate renders n/a', () => {
    const out = formatCacheMetricsFull({
      read: 0,
      created: 0,
      total: 0,
      hitRate: null,
      supported: true,
    })
    expect(out).toBe('[Cache: read=0 created=0 hit=n/a]')
  })

  test('unsupported renders "N/A"', () => {
    const out = formatCacheMetricsFull({
      read: 0,
      created: 0,
      total: 0,
      hitRate: null,
      supported: false,
    })
    expect(out).toBe('[Cache: N/A]')
  })
})

describe('hit-rate edge cases (plan-mandated coverage)', () => {
  test('0 read / 0 created on supported provider → hitRate = 0 (not null) when total > 0', () => {
    const m = extractCacheMetrics({ input_tokens: 500 }, 'anthropic')
    expect(m.read).toBe(0)
    expect(m.created).toBe(0)
    expect(m.hitRate).toBe(0)
  })

  test('read only (no created) computes proportion correctly', () => {
    const m = extractCacheMetrics(
      { input_tokens: 0, cache_read_input_tokens: 800, cache_creation_input_tokens: 0 },
      'anthropic',
    )
    expect(m.read).toBe(800)
    expect(m.created).toBe(0)
    expect(m.total).toBe(800)
    expect(m.hitRate).toBe(1)
  })

  test('created only (first turn — no reads yet) gives 0 hit rate', () => {
    const m = extractCacheMetrics(
      {
        input_tokens: 200,
        cache_read_input_tokens: 0,
        cache_creation_input_tokens: 1_000,
      },
      'anthropic',
    )
    expect(m.read).toBe(0)
    expect(m.created).toBe(1_000)
    expect(m.total).toBe(1_200)
    expect(m.hitRate).toBe(0)
  })

  test('mixed read + created + fresh input — full denominator', () => {
    const m = extractCacheMetrics(
      {
        input_tokens: 500,
        cache_read_input_tokens: 3_000,
        cache_creation_input_tokens: 1_500,
      },
      'anthropic',
    )
    // Denominator = fresh(500) + created(1500) + read(3000) = 5_000
    // Hit = read/total = 3000 / 5000 = 0.6
    expect(m.total).toBe(5_000)
    expect(m.hitRate).toBe(0.6)
  })

  test('N/A (unsupported provider) preserves null hit-rate even with populated usage', () => {
    // Simulate a Copilot usage payload that might look like OpenAI shape —
    // we must NOT try to read it and must report supported:false.
    const m = extractCacheMetrics(
      { prompt_tokens: 5_000, prompt_tokens_details: { cached_tokens: 2_000 } },
      'copilot',
    )
    expect(m.supported).toBe(false)
    expect(m.read).toBe(0)
    expect(m.hitRate).toBeNull()
  })
})

describe('addCacheMetrics — session aggregation', () => {
  test('sums read/created/total and recomputes hit rate', () => {
    const a = {
      read: 100,
      created: 50,
      total: 300,
      hitRate: 100 / 300,
      supported: true,
    }
    const b = {
      read: 200,
      created: 0,
      total: 400,
      hitRate: 0.5,
      supported: true,
    }
    const sum = addCacheMetrics(a, b)
    expect(sum.read).toBe(300)
    expect(sum.created).toBe(50)
    expect(sum.total).toBe(700)
    expect(sum.hitRate).toBeCloseTo(300 / 700, 5)
  })

  test('unsupported + supported = supported (so we never lose honest data)', () => {
    const unsupported = {
      read: 0,
      created: 0,
      total: 0,
      hitRate: null,
      supported: false,
    }
    const supported = {
      read: 10,
      created: 0,
      total: 100,
      hitRate: 0.1,
      supported: true,
    }
    expect(addCacheMetrics(unsupported, supported)).toBe(supported)
    expect(addCacheMetrics(supported, unsupported)).toBe(supported)
  })

  test('unsupported + unsupported = unsupported', () => {
    const u = {
      read: 0,
      created: 0,
      total: 0,
      hitRate: null,
      supported: false,
    }
    const sum = addCacheMetrics(u, u)
    expect(sum.supported).toBe(false)
  })
})
538
src/services/api/cacheMetrics.ts
Normal file
@@ -0,0 +1,538 @@
/**
 * Cross-provider cache usage normalizer for Phase 1 observability.
 *
 * Two layers of extraction, because the shim layer (openaiShim/codexShim)
 * already converts raw provider usage to Anthropic shape on the way in:
 *
 * 1. `extractCacheReadFromRawUsage` — consumes RAW provider usage, used
 *    from inside the shims where each provider's native field names are
 *    still visible. Single source of truth for "where is the cached-
 *    tokens count on provider X".
 * 2. `extractCacheMetrics` — consumes POST-shim Anthropic-shape usage,
 *    which is what every downstream caller (cost-tracker, REPL display,
 *    /cache-stats) actually sees. Uses the `provider` argument only to
 *    decide whether the metric is `supported` (vanilla Copilot and
 *    Ollama get N/A rather than a fabricated 0%).
 *
 * Design rationale:
 * - Pure functions, no globals: callers pass the provider explicitly so
 *   that tests, background agents and teammates get consistent results
 *   even when the process-level provider flag differs.
 * - Honest N/A: Copilot (non-Claude) and Ollama do not expose cache data
 *   at all. Returning 0 would lie and corrupt the aggregate hit-rate, so
 *   we return `supported: false` and let the display decide how to render.
 * - `hitRate` is null whenever there is no input to compare against
 *   (0 read + 0 created). A 0% hit rate would suggest "cold" when in
 *   reality the turn had no cacheable content to begin with.
 * - After normalization, `read + created ≤ total`, with any remainder
 *   being fresh (non-cacheable) input tokens. The shim enforces this
 *   invariant by subtracting cached from raw prompt_tokens so that
 *   post-shim `input_tokens` is always "fresh only" per the Anthropic
 *   convention.
 *
 * Raw provider shapes (as of 2026-04):
 * - Anthropic:       usage.cache_read_input_tokens,
 *                    usage.cache_creation_input_tokens,
 *                    usage.input_tokens (fresh only)
 * - OpenAI / Codex:  usage.input_tokens_details?.cached_tokens,
 *                    usage.prompt_tokens_details?.cached_tokens,
 *                    usage.prompt_tokens (includes cached)
 * - Kimi / Moonshot: usage.cached_tokens (top level), usage.prompt_tokens
 * - DeepSeek:        usage.prompt_cache_hit_tokens,
 *                    usage.prompt_cache_miss_tokens
 * - Gemini:          usage.cached_content_token_count,
 *                    usage.prompt_token_count
 * - Copilot (non-Claude) / Ollama: not reported → supported=false
 */
import type { APIProvider } from '../../utils/model/providers.js'
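
// Worked example of the two layers (numbers taken from the DeepSeek unit
// tests below): a raw DeepSeek usage of
//   { prompt_tokens: 1_000, prompt_cache_hit_tokens: 700, prompt_cache_miss_tokens: 300 }
// leaves the shim as
//   { input_tokens: 300, cache_read_input_tokens: 700, cache_creation_input_tokens: 0 }
// and extractCacheMetrics then reports read=700, total=1_000, hitRate=0.7.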

/** Providers for which we know how to read cache fields. */
export type CacheAwareProvider =
  | 'anthropic'
  | 'openai'
  | 'codex'
  | 'kimi'
  | 'deepseek'
  | 'gemini'
  | 'ollama'
  // Generic local / self-hosted OpenAI-compatible endpoints (vLLM,
  // LM Studio, LocalAI, text-generation-webui, custom internal servers
  // on RFC1918 addresses, reserved TLDs like .local / .internal, etc.).
  // Distinct from `ollama` because Ollama might someday add cache
  // reporting; keeping the buckets separate means that change stays
  // local to one branch.
  | 'self-hosted'
  | 'copilot'
  | 'copilot-claude'

/** Unified cache metrics for one API response. */
export type CacheMetrics = {
  /** Tokens served from cache on this request. */
  read: number
  /**
   * Tokens written INTO the cache on this request. Only non-zero for
   * providers with explicit caching (the Anthropic family).
   */
  created: number
  /**
   * Total input tokens the request is measured against, computed uniformly
   * as `fresh + read + created` after the shim normalizes every provider
   * to the Anthropic convention. Used as the denominator for hit-rate.
   */
  total: number
  /**
   * `read / total`, or null when the denominator is zero or the provider
   * doesn't support cache reporting.
   */
  hitRate: number | null
  /**
   * False for providers that do not expose cache data at all. Callers
   * should render "N/A" instead of "0%" in that case.
   */
  supported: boolean
}

/** Shared sentinel returned for unsupported providers — avoids allocating a fresh object per call. */
const UNSUPPORTED: CacheMetrics = {
  read: 0,
  created: 0,
  total: 0,
  hitRate: null,
  supported: false,
}

/** Raw usage shape — intentionally permissive; each provider picks its fields. */
export type RawUsage = Record<string, unknown> | null | undefined

function asNumber(value: unknown): number {
  return typeof value === 'number' && Number.isFinite(value) ? value : 0
}

function pickPath(usage: RawUsage, path: string[]): unknown {
  let cur: unknown = usage
  for (const key of path) {
    if (cur == null || typeof cur !== 'object') return undefined
    cur = (cur as Record<string, unknown>)[key]
  }
  return cur
}

/**
 * Returns true when the URL points at a private, loopback, link-local,
 * CGNAT, or reserved-TLD host — anywhere a self-hosted OpenAI-compatible
 * server is likely running (vLLM, LM Studio, LocalAI, Ollama on a
 * non-default port, text-generation-webui, corporate internal proxies).
 *
 * WHY a dedicated helper (vs the old substring match):
 * The previous check only looked for `localhost` / `127.0.0.1` /
 * `:11434` / `:1234` as substrings. That misclassified real setups:
 * a vLLM server at `http://192.168.1.50:8000/v1` or an internal
 * endpoint at `http://llm.internal:5000/v1` fell through to the `openai`
 * branch, got marked as cache-capable, and `/cache-stats` reported
 * `[Cache: cold]` — making users think their cache was broken when
 * in reality the provider simply doesn't report cache fields.
 *
 * Intentionally narrower than WebSearchTool's `isPrivateHostname`
 * (which defends against SSRF bypass vectors like IPv4-mapped IPv6
 * and octal-encoded IPs). We only need to classify a reporting bucket,
 * not enforce a security boundary — a false negative here at worst
 * shows `[Cache: cold]` instead of `[Cache: N/A]`.
 *
 * See cacheMetrics.test.ts for the cases this function is contracted to
 * return true/false for.
 */
function isLocalOrPrivateUrl(url: string): boolean {
  if (!url) return false
  let hostname = ''
  try {
    hostname = new URL(url).hostname.toLowerCase()
  } catch {
    // Fall through to the substring fallback below.
  }
  // WHATWG URL accepts `localhost:8000` (it treats `localhost:` as the
  // scheme, leaving the hostname empty). Treat empty-hostname parses the
  // same as a parse failure so we still catch the obvious cases with a
  // substring check.
  if (!hostname) {
    const lower = url.toLowerCase()
    return (
      lower.includes('localhost') ||
      lower.includes('127.0.0.1') ||
      lower.includes('::1')
    )
  }
  // Unwrap the IPv6 literal brackets that URL.hostname leaves attached.
  const h = hostname.startsWith('[') && hostname.endsWith(']')
    ? hostname.slice(1, -1)
    : hostname
  // Reserved TLDs and `localhost` itself — all guaranteed never to
  // resolve to public infrastructure. Sources:
  //   - RFC 6761 §6.3 — `.localhost` (Chrome/Firefox/systemd-resolved
  //     resolve `*.localhost` to 127.0.0.1 natively)
  //   - RFC 6762 — `.local` mDNS (Bonjour)
  //   - RFC 8375 — `.home.arpa` (residential home networks)
  //   - de facto — `.lan`, `.internal`, `.intranet` (widely used in
  //     corporate DNS despite not being formally reserved)
  if (
    h === 'localhost' ||
    h.endsWith('.localhost') ||
    h.endsWith('.local') ||
    h.endsWith('.lan') ||
    h.endsWith('.internal') ||
    h.endsWith('.intranet') ||
    h.endsWith('.home.arpa')
  ) {
    return true
  }
  // IPv4 private and reserved ranges. URL.hostname normalizes short /
  // hex / octal IPv4 representations to dotted-quad, so a simple regex
  // works for the display-classification use case.
  const ipv4 = h.match(/^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/)
  if (ipv4) {
    const a = Number(ipv4[1])
    const b = Number(ipv4[2])
    // 10.0.0.0/8 (RFC 1918)
    if (a === 10) return true
    // 172.16.0.0/12 (RFC 1918)
    if (a === 172 && b >= 16 && b <= 31) return true
    // 192.168.0.0/16 (RFC 1918)
    if (a === 192 && b === 168) return true
    // 127.0.0.0/8 loopback
    if (a === 127) return true
    // 169.254.0.0/16 link-local (AWS/GCP metadata, stateless autoconf)
    if (a === 169 && b === 254) return true
    // 100.64.0.0/10 CGNAT (Tailscale, carrier-grade NAT)
    if (a === 100 && b >= 64 && b <= 127) return true
  }
  // IPv6 common local/private ranges — narrow by design.
  if (h === '::1' || h === '::') return true
  // fe80::/10 link-local and fc00::/7 unique-local (ULA). A colon is
  // required in the match so `fc` / `fd` don't over-match real
  // hostnames like `fc-api.example.com` or `fd-hosted.com`. The brackets
  // were already unwrapped above, so an IPv6 literal like `[fc00::1]`
  // reaches this point as `fc00::1` — still containing a colon.
  if (
    h.startsWith('fe80:') ||
    /^fc[0-9a-f]{0,2}:/.test(h) ||
    /^fd[0-9a-f]{0,2}:/.test(h)
  ) {
    return true
  }
  return false
}
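
// Spot checks, mirroring the resolveCacheProvider unit tests:
//   isLocalOrPrivateUrl('http://192.168.1.50:8000/v1')   → true   (RFC 1918)
//   isLocalOrPrivateUrl('http://app.localhost:3000/v1')  → true   (RFC 6761 TLD)
//   isLocalOrPrivateUrl('https://fc-api.example.com/v1') → false  (no colon after fc → not a ULA literal)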

/**
 * Map the canonical APIProvider enum (plus environment hints) into a
 * cache-capability bucket. We separate `copilot` (no cache) from
 * `copilot-claude` (Anthropic shim via Copilot with explicit cache)
 * because the two behave very differently even under the same provider
 * flag — see `isGithubNativeAnthropicMode` in utils/model/providers.ts.
 *
 * The order of the OpenAI-compatible checks matters:
 * 1. Private / self-hosted URL — no cache fields regardless of vendor.
 * 2. Vendor-specific hosted providers (Kimi, DeepSeek) — known cache
 *    shapes that deserve their own normalization branch.
 * 3. Plain OpenAI — default bucket.
 * Doing hosted-vendor matching before self-hosted detection would let a
 * private-IP endpoint with "deepseek" in the URL fall into the wrong
 * branch; doing self-hosted last would let a `.internal` URL with
 * "openai" in its path be misclassified. The current order is correct
 * for both pathological cases.
 */
export function resolveCacheProvider(
  provider: APIProvider,
  hints?: { githubNativeAnthropic?: boolean; openAiBaseUrl?: string },
): CacheAwareProvider {
  if (provider === 'github') {
    return hints?.githubNativeAnthropic ? 'copilot-claude' : 'copilot'
  }
  if (
    provider === 'firstParty' ||
    provider === 'bedrock' ||
    provider === 'vertex' ||
    provider === 'foundry'
  ) {
    return 'anthropic'
  }
  if (provider === 'gemini') return 'gemini'
  if (provider === 'codex') return 'codex'
  if (provider === 'openai') {
    const url = hints?.openAiBaseUrl ?? ''
    // Self-hosted / private-network endpoint — detect first so a vLLM
    // server on 192.168.x.x or a .internal DNS entry is honestly
    // classified as no-cache, not misreported as plain OpenAI.
    if (isLocalOrPrivateUrl(url)) return 'self-hosted'
    const lower = url.toLowerCase()
    // The :11434 port still signals Ollama specifically (its default port).
    // If someone runs Ollama on a private IP:11434 we picked it up above
    // as 'self-hosted'; only a public-looking URL with :11434 lands here.
    if (lower.includes(':11434')) return 'ollama'
    if (lower.includes('moonshot') || lower.includes('kimi')) return 'kimi'
    if (lower.includes('deepseek')) return 'deepseek'
    return 'openai'
  }
  // nvidia-nim, minimax, mistral share the OpenAI Chat Completions convention
  // for cache reporting (prompt_tokens_details.cached_tokens). Treat them as
  // 'openai' for normalization purposes — if the provider doesn't emit the
  // field we simply get zeros, and hitRate stays null via the zero-total
  // guard in extractCacheMetrics.
  return 'openai'
}

/**
 * Read the cached-tokens count from a RAW provider usage object, handling
 * every shape we know about. The callers are the shim layer (openaiShim,
 * codexShim) — the only place where the native provider fields still
 * exist before conversion to Anthropic shape.
 *
 * The order of fallbacks is deliberate: the first non-zero match wins, so
 * adding a provider that combines shapes is safe as long as we list the
 * most authoritative field first.
 */
export function extractCacheReadFromRawUsage(usage: RawUsage): number {
  if (!usage || typeof usage !== 'object') return 0
  const u = usage as Record<string, unknown>
  // 1. Anthropic-native shape — already normalized upstream.
  const anthropicRead = asNumber(u.cache_read_input_tokens)
  if (anthropicRead > 0) return anthropicRead
  // 2. OpenAI / Codex — cached_tokens nested under input/prompt details.
  //    The Responses API uses `input_tokens_details`, Chat Completions uses
  //    `prompt_tokens_details`; some models report both with the same value.
  const openaiNested =
    asNumber(pickPath(usage, ['input_tokens_details', 'cached_tokens'])) ||
    asNumber(pickPath(usage, ['prompt_tokens_details', 'cached_tokens']))
  if (openaiNested > 0) return openaiNested
  // 3. Kimi / Moonshot — top-level cached_tokens (not nested).
  const kimi = asNumber(u.cached_tokens)
  if (kimi > 0) return kimi
  // 4. DeepSeek — hit/miss split at top level.
  const deepseek = asNumber(u.prompt_cache_hit_tokens)
  if (deepseek > 0) return deepseek
  // 5. Gemini — cached_content_token_count.
  const gemini = asNumber(u.cached_content_token_count)
  if (gemini > 0) return gemini
  return 0
}

/**
 * Shape produced by the shim layer — matches the Anthropic BetaUsage
 * fields that every downstream caller (cost-tracker, REPL, /cache-stats)
 * consumes. Keeping it in this module lets the shim and the integration
 * tests share one definition and eliminates the class of drift bugs
 * where a shim is updated but a test simulator isn't.
 */
export type NormalizedShimUsage = {
  input_tokens: number
  output_tokens: number
  cache_creation_input_tokens: number
  cache_read_input_tokens: number
}

/**
 * Convert raw provider usage (any known shape) into the Anthropic-shape
 * `NormalizedShimUsage` used throughout the codebase. Single source of
 * truth for the shim layer — `codexShim.makeUsage`,
 * `openaiShim.convertChunkUsage`, and the non-streaming response in
 * `OpenAIShimMessages` all call this helper, and the integration test
 * calls it directly instead of re-implementing the conversion.
 *
 * Design contract:
 * - `cache_read_input_tokens` comes from `extractCacheReadFromRawUsage`
 *   (provider-aware extraction).
 * - `input_tokens` is rewritten to the Anthropic convention: FRESH only,
 *   with `cache_read` subtracted from the raw prompt count if the
 *   provider included it there (the OpenAI family does; Anthropic native
 *   already excludes it).
 * - `cache_creation_input_tokens` is always 0 at the shim boundary —
 *   only Anthropic native emits a non-zero creation count, and it
 *   doesn't flow through these shims.
 * - The output token count accepts both `output_tokens` (Codex/Responses)
 *   and `completion_tokens` (Chat Completions).
 *
 * Observed raw shapes per provider (pinned so future drift is caught):
 * - OpenAI Chat Completions:
 *     `{ prompt_tokens, completion_tokens,
 *        prompt_tokens_details: { cached_tokens } }`
 *   where `cached_tokens` is a SUBSET of `prompt_tokens` — hence
 *   the subtraction below.
 * - OpenAI Codex / Responses API:
 *     `{ input_tokens, output_tokens,
 *        input_tokens_details: { cached_tokens } }`
 *   same convention: cached is included in `input_tokens`.
 * - Anthropic native:
 *     `{ input_tokens, output_tokens,
 *        cache_read_input_tokens, cache_creation_input_tokens }`
 *   cached is EXCLUDED from `input_tokens`. The `rawInput >= cacheRead`
 *   guard below skips the subtraction whenever the cached count exceeds
 *   the raw input, which is what lets typical Anthropic-native payloads
 *   (small fresh input, large cache read) pass through unchanged.
 * - Kimi/Moonshot:
 *     `{ prompt_tokens, completion_tokens, cached_tokens }` — top
 *   level, not nested. OpenAI-family subset convention.
 * - DeepSeek:
 *     `{ prompt_tokens, completion_tokens, prompt_cache_hit_tokens,
 *        prompt_cache_miss_tokens }`. The `hit` field is the cached
 *   count, also a subset of `prompt_tokens`.
 *
 * If a future provider deviates (ships cached tokens ALREADY excluded
 * from input_tokens, Anthropic-style), this function will under-count
 * its fresh input by `cache_read`. The regression test
 * `cacheMetricsIntegration.test.ts > "Codex makeUsage no longer
 * double-bills"` pins the current Codex shape so a deviation breaks
 * visibly. If you're adding a new provider, verify the shape and —
 * if needed — extend `extractCacheReadFromRawUsage` to pick a field
 * that represents cached-tokens-already-excluded (and skip the
 * subtraction by setting `rawInput` to `prompt_tokens + cache_read`).
 */
export function buildAnthropicUsageFromRawUsage(
  raw: RawUsage,
): NormalizedShimUsage {
  const cacheRead = extractCacheReadFromRawUsage(raw)
  const u = (raw ?? {}) as Record<string, unknown>
  const rawInput =
    asNumber(u.input_tokens) || asNumber(u.prompt_tokens)
  const fresh = rawInput >= cacheRead ? rawInput - cacheRead : rawInput
  const output =
    asNumber(u.output_tokens) || asNumber(u.completion_tokens)
  return {
    input_tokens: fresh,
    output_tokens: output,
    cache_creation_input_tokens: 0,
    cache_read_input_tokens: cacheRead,
  }
}
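
// Worked example (the OpenAI scenario from cacheMetricsIntegration.test.ts):
//   buildAnthropicUsageFromRawUsage({
//     prompt_tokens: 2_000,
//     completion_tokens: 300,
//     prompt_tokens_details: { cached_tokens: 1_200 },
//   })
// returns { input_tokens: 800, output_tokens: 300,
//           cache_creation_input_tokens: 0, cache_read_input_tokens: 1_200 }.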

/**
 * Extract a unified CacheMetrics from POST-SHIM (Anthropic-shape) usage.
 *
 * By the time this runs, openaiShim/codexShim have already converted
 * raw provider fields into `cache_read_input_tokens` (via
 * `extractCacheReadFromRawUsage`) and adjusted `input_tokens` to be
 * "fresh only" per the Anthropic convention. This function is therefore
 * deliberately provider-independent for the numeric extraction — the
 * `provider` argument is used only to surface `supported: false` for
 * providers that expose no cache data at all.
 */
export function extractCacheMetrics(
  usage: RawUsage,
  provider: CacheAwareProvider,
): CacheMetrics {
  if (!usage || typeof usage !== 'object') return UNSUPPORTED
  const u = usage as Record<string, unknown>
  const read = asNumber(u.cache_read_input_tokens)
  const created = asNumber(u.cache_creation_input_tokens)
  const fresh = asNumber(u.input_tokens)
  // Vanilla Copilot (no Claude) and Ollama don't expose cache fields at
  // all, as a matter of provider identity. These are explicit provider
  // selections (via CLAUDE_CODE_USE_GITHUB and the Ollama base-URL
  // default port), so we can hard-wire `supported: false` and let the
  // REPL print "N/A" instead of a fabricated 0%.
  if (provider === 'copilot' || provider === 'ollama') {
    return UNSUPPORTED
  }
  // `self-hosted` is different: the bucket is inferred from the base
  // URL being on a private network (RFC1918, a .local TLD, etc.), which
  // is a heuristic, not an authoritative "this endpoint cannot cache"
  // signal. An internal reverse proxy forwarding to OpenAI / Kimi /
  // DeepSeek / Gemini will produce a private URL but ALSO emit real
  // cache fields via the shim. Forcing unsupported here would discard
  // legitimate data. Let the data decide: if the shim extracted any
  // cache activity (read OR created), trust it and fall through to
  // normal extraction; otherwise render an honest N/A for vanilla
  // vLLM/LocalAI-style endpoints that really don't cache.
  if (provider === 'self-hosted' && read === 0 && created === 0) {
    return UNSUPPORTED
  }
  // total = fresh + read + created — the shim already stripped `read` out
  // of `fresh`, so the three components don't double-count. This matches
  // the Anthropic convention even when the upstream was OpenAI/Kimi/DeepSeek.
  const total = read + created + fresh
  return {
    read,
    created,
    total,
    // Clamp to [0, 1]. With non-negative inputs the math guarantees
    // `read <= total` — but an upstream shim bug (e.g. a future provider
    // where we accidentally read a negative `fresh`) could violate the
    // invariant. Showing a pinned `1.0` on anomalous input is clearer
    // than a nonsense ratio above 100% and safer than `null` (which would
    // hide the issue completely).
    hitRate: total > 0 ? Math.min(1, read / total) : null,
    supported: true,
  }
}
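
// Worked example (post-shim usage, as in the unit tests): for an 'openai'
// bucket, { input_tokens: 800, cache_read_input_tokens: 1_200,
// cache_creation_input_tokens: 0 } yields
// { read: 1_200, created: 0, total: 2_000, hitRate: 0.6, supported: true }.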

/**
 * Format a CacheMetrics value into the human-facing one-liner used by
 * `showCacheStats: 'compact'`. Stable format — snapshot-tested.
 *
 * Examples:
 *   "[Cache: 1.2k read • hit 12%]"
 *   "[Cache: N/A]"   (unsupported provider)
 *   "[Cache: cold]"  (supported, no reads yet)
 *
 * The `undefined` branch at the top is defensive: TypeScript enforces
 * `CacheMetrics` at call sites, but a failed API response could leave
 * the caller with nothing to render. Treat absent metrics as "no data"
 * rather than throwing on `metrics.supported`.
 */
export function formatCacheMetricsCompact(
  metrics: CacheMetrics | undefined | null,
): string {
  if (!metrics) return '[Cache: N/A]'
  if (!metrics.supported) return '[Cache: N/A]'
  if (metrics.read === 0 && metrics.created === 0) return '[Cache: cold]'
  const parts: string[] = [`${formatCompactNumber(metrics.read)} read`]
  if (metrics.hitRate !== null) {
    parts.push(`hit ${Math.round(metrics.hitRate * 100)}%`)
  }
  return `[Cache: ${parts.join(' • ')}]`
}

/**
 * Format a CacheMetrics value into the multi-field breakdown used by
 * `showCacheStats: 'full'`. Stable format — snapshot-tested.
 *
 * Example:
 *   "[Cache: read=1.2k created=340 hit=12%]"
 *
 * Same `undefined` tolerance as `formatCacheMetricsCompact` — a failed
 * API response shouldn't throw on the display path.
 */
export function formatCacheMetricsFull(
  metrics: CacheMetrics | undefined | null,
): string {
  if (!metrics) return '[Cache: N/A]'
  if (!metrics.supported) return '[Cache: N/A]'
  const parts: string[] = [
    `read=${formatCompactNumber(metrics.read)}`,
    `created=${formatCompactNumber(metrics.created)}`,
  ]
  if (metrics.hitRate !== null) {
    parts.push(`hit=${Math.round(metrics.hitRate * 100)}%`)
  } else {
    parts.push('hit=n/a')
  }
  return `[Cache: ${parts.join(' ')}]`
}

// Compact 1.2k-style formatter. Duplicated here (not imported from
// utils/format.ts) because this module should stay dependency-light and
// deterministic — utils/format pulls in Intl locale state, which varies.
function formatCompactNumber(n: number): string {
  if (n < 1_000) return String(n)
  if (n < 1_000_000) return `${(n / 1_000).toFixed(1).replace(/\.0$/, '')}k`
  return `${(n / 1_000_000).toFixed(1).replace(/\.0$/, '')}m`
}

/**
 * Sum two CacheMetrics. The result is supported if either side is, so a
 * single supported response is never lost among unsupported neighbors in
 * a session aggregate.
 */
export function addCacheMetrics(a: CacheMetrics, b: CacheMetrics): CacheMetrics {
  // Fast paths: if either side carries no data, return the other (or the
  // shared sentinel) as-is so aggregates on a purely-unsupported session
  // stay cheap.
  if (!a.supported && !b.supported) return UNSUPPORTED
  if (!a.supported) return b
  if (!b.supported) return a
  const read = a.read + b.read
  const created = a.created + b.created
  const total = a.total + b.total
  return {
    read,
    created,
    total,
    hitRate: total > 0 ? read / total : null,
    supported: true,
  }
}
339
src/services/api/cacheMetricsIntegration.test.ts
Normal file
@@ -0,0 +1,339 @@
/**
 * Integration tests for the raw-usage → shim → cost-tracker pipeline.
 *
 * These tests simulate what happens on each provider end-to-end:
 *   1. The provider returns a raw `usage` object in its native shape.
 *   2. The shim (openaiShim.convertChunkUsage / codexShim.makeUsage)
 *      rewrites it to Anthropic shape via buildAnthropicUsageFromRawUsage.
 *   3. cost-tracker feeds the shimmed usage to extractCacheMetrics.
 *
 * The unit tests in cacheMetrics.test.ts exercise each layer in isolation.
 * This file exists so that a regression in ANY one of them (e.g. someone
 * adding a new provider branch to the helper but forgetting to wire it
 * into the shim) surfaces as an integration failure rather than silently
 * showing "[Cache: cold]" in production.
 *
 * We call `buildAnthropicUsageFromRawUsage` directly instead of
 * re-implementing the shim behavior locally. All three shim paths
 * (`codexShim.makeUsage`, `openaiShim.convertChunkUsage`, and the
 * non-streaming path in `OpenAIShimMessages`) delegate to this helper,
 * so this test covers the exact same code that runs in production —
 * no simulator drift possible.
 */
import { describe, expect, test } from 'bun:test'
import {
  buildAnthropicUsageFromRawUsage,
  extractCacheMetrics,
  formatCacheMetricsCompact,
  formatCacheMetricsFull,
  resolveCacheProvider,
  type CacheAwareProvider,
} from './cacheMetrics.js'

type Scenario = {
  name: string
  provider: CacheAwareProvider
  rawUsage: Record<string, unknown>
  expectedRead: number
  expectedTotal: number
  expectedHitRate: number
  expectedFreshInput: number
}

// End-to-end scenarios for every provider shape the OpenClaude shim layer
// might see. `expectedTotal` is what a user should see as "input this
// request"; `expectedHitRate` is what `/cache-stats` should display.
const scenarios: Scenario[] = [
  {
    name: 'Anthropic native (firstParty) — passthrough',
    provider: 'anthropic',
    rawUsage: {
      input_tokens: 200,
      cache_read_input_tokens: 800,
      cache_creation_input_tokens: 100,
    },
    expectedRead: 800,
    // Anthropic native doesn't go through the shim in production, but
    // buildAnthropicUsageFromRawUsage handles it correctly as passthrough:
    // prompt_tokens fallback is 0, so fresh comes from input_tokens (200),
    // cache_read is picked up from cache_read_input_tokens (800).
    expectedTotal: 1_000, // 200 fresh + 800 read (created is not tracked at this layer)
    expectedHitRate: 800 / 1_000,
    expectedFreshInput: 200,
  },
  {
    name: 'OpenAI Chat Completions via openaiShim',
    provider: 'openai',
    rawUsage: {
      prompt_tokens: 2_000,
      completion_tokens: 300,
      prompt_tokens_details: { cached_tokens: 1_200 },
    },
    expectedRead: 1_200,
    expectedTotal: 2_000, // 800 fresh + 1200 read
    expectedHitRate: 0.6,
    expectedFreshInput: 800,
  },
  {
    name: 'Codex Responses API via codexShim',
    provider: 'codex',
    rawUsage: {
      input_tokens: 1_500,
      output_tokens: 50,
      input_tokens_details: { cached_tokens: 600 },
    },
    expectedRead: 600,
    expectedTotal: 1_500,
    expectedHitRate: 0.4,
    expectedFreshInput: 900,
  },
  {
    name: 'Kimi / Moonshot via openaiShim — top-level cached_tokens',
    provider: 'kimi',
    rawUsage: {
      prompt_tokens: 1_000,
      completion_tokens: 120,
      cached_tokens: 400,
    },
    expectedRead: 400,
    expectedTotal: 1_000,
    expectedHitRate: 0.4,
    expectedFreshInput: 600,
  },
  {
    name: 'DeepSeek via openaiShim — prompt_cache_hit_tokens',
    provider: 'deepseek',
    rawUsage: {
      prompt_tokens: 1_000,
      completion_tokens: 40,
      prompt_cache_hit_tokens: 700,
      prompt_cache_miss_tokens: 300,
    },
    expectedRead: 700,
    expectedTotal: 1_000,
    expectedHitRate: 0.7,
    expectedFreshInput: 300,
  },
  {
    name: 'Gemini via openaiShim — cached_content_token_count',
    provider: 'gemini',
    rawUsage: {
      prompt_tokens: 4_000,
      completion_tokens: 200,
      cached_content_token_count: 3_200,
    },
    expectedRead: 3_200,
    expectedTotal: 4_000,
    expectedHitRate: 0.8,
    expectedFreshInput: 800,
  },
]
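
// For orientation: the field precedence the shared helper must implement to
// satisfy every scenario above. This is inferred from the expectations, not
// copied from the real implementation:
//
//   overlap = input_tokens_details.cached_tokens    // Codex
//          ?? prompt_tokens_details.cached_tokens   // OpenAI
//          ?? cached_tokens                         // Kimi
//          ?? prompt_cache_hit_tokens               // DeepSeek
//          ?? cached_content_token_count            // Gemini
//   read    = overlap ?? cache_read_input_tokens ?? 0  // Anthropic passthrough
//   fresh   = (prompt_tokens ?? input_tokens ?? 0) - (overlap ?? 0)
//
// The subtraction only applies to `overlap`-style fields, which providers
// count inside prompt/input tokens; Anthropic's cache_read_input_tokens is
// already excluded from input_tokens, so nothing is subtracted there.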

describe('raw usage → shim → extractCacheMetrics pipeline', () => {
  for (const s of scenarios) {
    test(s.name, () => {
      // Call the same helper the shims call in production — no
      // simulator, no possibility of drift.
      const shimmed = buildAnthropicUsageFromRawUsage(s.rawUsage)
      expect(shimmed.cache_read_input_tokens).toBe(s.expectedRead)
      expect(shimmed.input_tokens).toBe(s.expectedFreshInput)

      const metrics = extractCacheMetrics(
        shimmed as unknown as Record<string, unknown>,
        s.provider,
      )
      expect(metrics.supported).toBe(true)
      expect(metrics.read).toBe(s.expectedRead)
      expect(metrics.total).toBe(s.expectedTotal)
      expect(metrics.hitRate).toBeCloseTo(s.expectedHitRate, 4)
    })
  }
})

describe('no-cache providers — pipeline honestly reports unsupported', () => {
  test('GitHub Copilot (vanilla) — shim runs, but provider bucket maps to unsupported', () => {
    const shimmed = buildAnthropicUsageFromRawUsage({
      prompt_tokens: 500,
      completion_tokens: 40,
    })
    // Shim normalized correctly (0 cache_read), but Copilot-vanilla must
    // surface as unsupported so /cache-stats shows "N/A" instead of "0%".
    expect(shimmed.cache_read_input_tokens).toBe(0)
    const metrics = extractCacheMetrics(
      shimmed as unknown as Record<string, unknown>,
      'copilot',
    )
    expect(metrics.supported).toBe(false)
    expect(metrics.hitRate).toBeNull()
  })

  test('Ollama (local) — same treatment as Copilot-vanilla', () => {
    const shimmed = buildAnthropicUsageFromRawUsage({
      prompt_tokens: 1_000,
      completion_tokens: 200,
    })
    const metrics = extractCacheMetrics(
      shimmed as unknown as Record<string, unknown>,
      'ollama',
    )
    expect(metrics.supported).toBe(false)
  })
})

describe('display path end-to-end — private-IP, custom-port, self-hosted endpoints', () => {
  // These tests exercise the FULL pipeline that runs when a user
  // configures OpenClaude against a self-hosted OpenAI-compatible
  // server (vLLM, LM Studio, LocalAI, text-generation-webui, etc.):
  //
  //   OPENAI_BASE_URL → resolveCacheProvider → real provider usage →
  //   buildAnthropicUsageFromRawUsage → extractCacheMetrics →
  //   formatCacheMetricsCompact / Full (= what the user sees in the
  //   REPL and via /cache-stats)
  //
  // Pre-fix behavior: the substring check missed these URLs, they fell
  // into the 'openai' bucket, and the display showed '[Cache: cold]' —
  // i.e. implied a cache miss when the provider simply doesn't report
  // cache fields. Post-fix: '[Cache: N/A]' every time.

  const privateEndpoints: Array<{ name: string; baseUrl: string }> = [
    { name: 'vLLM on RFC1918 LAN IP', baseUrl: 'http://192.168.1.50:8000/v1' },
    { name: 'LocalAI on 10.x.x.x corporate network', baseUrl: 'http://10.0.0.7:8080/v1' },
    { name: 'self-hosted on 172.16.x.x', baseUrl: 'http://172.20.0.3:5000/v1' },
    { name: 'reverse-proxied on .internal DNS', baseUrl: 'http://llm.internal:5000/v1' },
    { name: 'mDNS .local hostname', baseUrl: 'http://box.local:8080/v1' },
    { name: 'RFC 8375 .home.arpa', baseUrl: 'http://vllm.home.arpa/v1' },
    { name: 'CGNAT / Tailscale 100.64.x.x', baseUrl: 'http://100.64.1.5:8000/v1' },
    { name: 'IPv6 loopback literal', baseUrl: 'http://[::1]:5000/v1' },
    { name: 'IPv6 link-local', baseUrl: 'http://[fe80::1]:8000/v1' },
    { name: 'IPv6 ULA fc00::/7', baseUrl: 'http://[fd12:3456::7]:8080/v1' },
    { name: 'link-local cloud-metadata IP', baseUrl: 'http://169.254.169.254/v1' },
  ]

  for (const { name, baseUrl } of privateEndpoints) {
    test(`${name} (${baseUrl}) — renders [Cache: N/A], not [Cache: cold]`, () => {
      // 1. URL resolves to the self-hosted bucket.
      const bucket = resolveCacheProvider('openai', { openAiBaseUrl: baseUrl })
      expect(bucket).toBe('self-hosted')

      // 2. A typical self-hosted server returns OpenAI-shape usage with no
      //    cache fields — the shim normalizes it cleanly.
      const shimmed = buildAnthropicUsageFromRawUsage({
        prompt_tokens: 1_200,
        completion_tokens: 250,
      })
      expect(shimmed.cache_read_input_tokens).toBe(0)

      // 3. The display path marks the bucket unsupported.
      const metrics = extractCacheMetrics(
        shimmed as unknown as Record<string, unknown>,
        bucket,
      )
      expect(metrics.supported).toBe(false)
      expect(metrics.hitRate).toBeNull()

      // 4. User-visible output — both formats honor the unsupported flag.
      expect(formatCacheMetricsCompact(metrics)).toBe('[Cache: N/A]')
      expect(formatCacheMetricsFull(metrics)).toBe('[Cache: N/A]')
    })
  }

  test('public-looking URL with non-standard port stays in openai bucket (no false positive)', () => {
    // A real hosted API that happens to run on a custom port must NOT
    // be misclassified as self-hosted. This guards the fix against
    // over-matching.
    const bucket = resolveCacheProvider('openai', {
      openAiBaseUrl: 'https://api.openai.com:8443/v1',
    })
    expect(bucket).toBe('openai')
  })

  test('private IP + hosted-provider keyword in path → self-hosted wins', () => {
    // A URL like 'http://10.0.0.5:8000/v1/deepseek-proxy' has "deepseek"
    // in the path, but the upstream is a LAN box, not the real DeepSeek.
    // Priority ordering in resolveCacheProvider must put self-hosted
    // detection first.
    const bucket = resolveCacheProvider('openai', {
      openAiBaseUrl: 'http://10.0.0.5:8000/v1/deepseek-proxy',
    })
    expect(bucket).toBe('self-hosted')
  })

  test('self-hosted proxy forwarding real upstream cache data is NOT discarded', () => {
    // Review-blocker regression: an enterprise setup with an internal
    // reverse proxy on a private URL forwarding to OpenAI / Kimi /
    // DeepSeek / Gemini WILL deliver real cache fields via the shim.
    // Pre-fix, the URL heuristic → self-hosted → unconditional
    // `supported: false` discarded the data and rendered '[Cache: N/A]'
    // even though valid cache metrics were on the payload. Post-fix,
    // the data decides: non-zero cache activity trumps the URL bucket.
    const bucket = resolveCacheProvider('openai', {
      openAiBaseUrl: 'http://llm-proxy.corp.internal:5000/v1',
    })
    expect(bucket).toBe('self-hosted')

    // Typical raw Kimi shape (the reverse proxy forwards this through
    // unchanged). The shim normalizes it to Anthropic shape.
    const raw = { prompt_tokens: 2_000, cached_tokens: 800 }
    const shimmed = buildAnthropicUsageFromRawUsage(raw)

    // Display path with the fix: data is preserved end-to-end.
    const metrics = extractCacheMetrics(
      shimmed as unknown as Record<string, unknown>,
      bucket,
    )
    expect(metrics.supported).toBe(true)
    expect(metrics.read).toBe(800)
    expect(metrics.hitRate).toBe(0.4)
    expect(formatCacheMetricsCompact(metrics)).toBe(
      '[Cache: 800 read • hit 40%]',
    )
  })
})

describe('regression guards — bug reproducers', () => {
  test('Kimi cache hit survives the shim (pre-fix: silently dropped to 0)', () => {
    // Before the Option-C refactor, the shim only read
    // prompt_tokens_details.cached_tokens, so Kimi's top-level
    // cached_tokens (300 below) was lost — the tracker saw read=0 and
    // users saw "[Cache: cold]" even after real cache hits. This test
    // fails loudly if the helper forgets the top-level branch.
    const raw = { prompt_tokens: 800, cached_tokens: 300 }
    const shimmed = buildAnthropicUsageFromRawUsage(raw)
    const metrics = extractCacheMetrics(
      shimmed as unknown as Record<string, unknown>,
      'kimi',
    )
    expect(metrics.read).toBe(300)
    expect(metrics.hitRate).toBeGreaterThan(0)
  })

  test('DeepSeek cache hit survives the shim (pre-fix: silently dropped to 0)', () => {
    const raw = {
      prompt_tokens: 1_200,
      prompt_cache_hit_tokens: 900,
      prompt_cache_miss_tokens: 300,
    }
    const shimmed = buildAnthropicUsageFromRawUsage(raw)
    const metrics = extractCacheMetrics(
      shimmed as unknown as Record<string, unknown>,
      'deepseek',
    )
    expect(metrics.read).toBe(900)
    expect(metrics.hitRate).toBe(0.75)
  })

  test('Codex makeUsage no longer double-bills (pre-fix: input_tokens kept cached)', () => {
    // Pre-fix, codexShim.makeUsage set input_tokens to the raw value
    // without subtracting cached_tokens, so modelCost.calculateUSDCost
    // charged the same tokens under both input_tokens * rate AND
    // cache_read_input_tokens * rate. This test enforces the Anthropic
    // convention at the shim boundary.
    const raw = {
      input_tokens: 2_000,
      input_tokens_details: { cached_tokens: 1_500 },
    }
    const shimmed = buildAnthropicUsageFromRawUsage(raw)
    expect(shimmed.input_tokens).toBe(500) // 2000 - 1500, not 2000
    expect(shimmed.cache_read_input_tokens).toBe(1_500)
  })
})
210
src/services/api/cacheStatsTracker.test.ts
Normal file
@@ -0,0 +1,210 @@
import { beforeEach, expect, test, describe } from 'bun:test'
import {
  _setHistoryCapForTesting,
  getCacheStatsHistory,
  getCurrentTurnCacheMetrics,
  getSessionCacheMetrics,
  recordRequest,
  resetCurrentTurn,
  resetSessionCacheStats,
} from './cacheStatsTracker.js'
import type { CacheMetrics } from './cacheMetrics.js'

function makeMetrics(partial: Partial<CacheMetrics>): CacheMetrics {
  return {
    read: 0,
    created: 0,
    total: 0,
    hitRate: null,
    supported: true,
    ...partial,
  }
}

beforeEach(() => {
  resetSessionCacheStats()
  _setHistoryCapForTesting(500)
})

describe('cacheStatsTracker — aggregation', () => {
  test('currentTurn and session both start empty and unsupported', () => {
    expect(getCurrentTurnCacheMetrics().supported).toBe(false)
    expect(getSessionCacheMetrics().supported).toBe(false)
    expect(getCacheStatsHistory()).toEqual([])
  })

  test('one recorded request flows into both turn and session', () => {
    recordRequest(
      makeMetrics({ read: 500, total: 1000, hitRate: 0.5 }),
      'claude-sonnet-4',
    )
    expect(getCurrentTurnCacheMetrics().read).toBe(500)
    expect(getCurrentTurnCacheMetrics().total).toBe(1000)
    expect(getSessionCacheMetrics().read).toBe(500)
  })

  test('multiple requests sum across turn', () => {
    recordRequest(
      makeMetrics({ read: 100, total: 500, hitRate: 0.2 }),
      'm1',
    )
    recordRequest(
      makeMetrics({ read: 300, total: 500, hitRate: 0.6 }),
      'm1',
    )
    const turn = getCurrentTurnCacheMetrics()
    expect(turn.read).toBe(400)
    expect(turn.total).toBe(1000)
    expect(turn.hitRate).toBeCloseTo(0.4, 5)
  })

  test('resetCurrentTurn clears turn but preserves session', () => {
    recordRequest(makeMetrics({ read: 200, total: 400 }), 'm1')
    resetCurrentTurn()
    expect(getCurrentTurnCacheMetrics().supported).toBe(false)
    expect(getSessionCacheMetrics().read).toBe(200)
  })

  test('resetSessionCacheStats clears everything', () => {
    recordRequest(makeMetrics({ read: 200, total: 400 }), 'm1')
    resetSessionCacheStats()
    expect(getCurrentTurnCacheMetrics().supported).toBe(false)
    expect(getSessionCacheMetrics().supported).toBe(false)
    expect(getCacheStatsHistory()).toEqual([])
  })
})

describe('cacheStatsTracker — history', () => {
  test('records each request with label and timestamp', () => {
    const before = Date.now()
    recordRequest(makeMetrics({ read: 1, total: 2 }), 'model-A')
    recordRequest(makeMetrics({ read: 3, total: 4 }), 'model-B')
    const history = getCacheStatsHistory()
    expect(history.length).toBe(2)
    expect(history[0]!.label).toBe('model-A')
    expect(history[1]!.label).toBe('model-B')
    expect(history[0]!.timestamp).toBeGreaterThanOrEqual(before)
  })

  test('evicts oldest entries when cap is exceeded', () => {
    _setHistoryCapForTesting(3)
    for (let i = 0; i < 5; i++) {
      recordRequest(makeMetrics({ read: i, total: 10 }), `m${i}`)
    }
    const history = getCacheStatsHistory()
    expect(history.length).toBe(3)
    expect(history.map((h) => h.label)).toEqual(['m2', 'm3', 'm4'])
  })

  test('history copy is detached from internal state', () => {
    recordRequest(makeMetrics({ read: 1, total: 2 }), 'x')
    const snapshot = getCacheStatsHistory()
    snapshot.pop()
    expect(getCacheStatsHistory().length).toBe(1)
  })
})

describe('cacheStatsTracker — ring buffer semantics', () => {
  test('ring wraps at cap without shifting (chronological order preserved)', () => {
    _setHistoryCapForTesting(4)
    // Push exactly 2×cap entries — forces one full wrap.
    for (let i = 0; i < 8; i++) {
      recordRequest(makeMetrics({ read: i, total: 10 }), `m${i}`)
    }
    const history = getCacheStatsHistory()
    expect(history.length).toBe(4)
    // After 8 pushes with cap=4, the survivors must be the newest 4 —
    // m4, m5, m6, m7 — in chronological order. If the ring logic were
    // wrong (e.g. off-by-one on writeIdx) this would come out rotated.
    expect(history.map((h) => h.label)).toEqual(['m4', 'm5', 'm6', 'm7'])
  })

  test('read before ring wraps returns partial history in order', () => {
    _setHistoryCapForTesting(10)
    for (let i = 0; i < 3; i++) {
      recordRequest(makeMetrics({ read: i, total: 10 }), `m${i}`)
    }
    const history = getCacheStatsHistory()
    expect(history.map((h) => h.label)).toEqual(['m0', 'm1', 'm2'])
  })

  test('shrinking cap preserves the newest entries in order', () => {
    _setHistoryCapForTesting(5)
    for (let i = 0; i < 5; i++) {
      recordRequest(makeMetrics({ read: i, total: 10 }), `m${i}`)
    }
    _setHistoryCapForTesting(3)
    const history = getCacheStatsHistory()
    expect(history.map((h) => h.label)).toEqual(['m2', 'm3', 'm4'])
    // And pushing after the shrink still respects the new cap.
    recordRequest(makeMetrics({ read: 5, total: 10 }), 'm5')
    expect(getCacheStatsHistory().map((h) => h.label)).toEqual(['m3', 'm4', 'm5'])
  })

  test('growing cap preserves existing entries and accepts more', () => {
    _setHistoryCapForTesting(3)
    for (let i = 0; i < 3; i++) {
      recordRequest(makeMetrics({ read: i, total: 10 }), `m${i}`)
    }
    _setHistoryCapForTesting(6)
    // After growing, the existing three should still be there in order,
    // and we should be able to push three more before eviction starts.
    for (let i = 3; i < 6; i++) {
      recordRequest(makeMetrics({ read: i, total: 10 }), `m${i}`)
    }
    const history = getCacheStatsHistory()
    expect(history.map((h) => h.label)).toEqual(['m0', 'm1', 'm2', 'm3', 'm4', 'm5'])
  })

  test('_setHistoryCapForTesting throws on non-positive cap', () => {
    // A zero cap would divide by zero on the ring write index and
    // silently corrupt the buffer. Loud failure > NaN indices.
    expect(() => _setHistoryCapForTesting(0)).toThrow(/cap must be >= 1/)
    expect(() => _setHistoryCapForTesting(-3)).toThrow(/cap must be >= 1/)
  })

  test('resetSessionCacheStats empties the ring even when wrapped', () => {
    _setHistoryCapForTesting(3)
    for (let i = 0; i < 10; i++) {
      recordRequest(makeMetrics({ read: i, total: 10 }), `m${i}`)
    }
    // Sanity: the ring has wrapped many times.
    expect(getCacheStatsHistory().length).toBe(3)
    resetSessionCacheStats()
    expect(getCacheStatsHistory()).toEqual([])
    // And a fresh push after reset starts from index 0 again.
    recordRequest(makeMetrics({ read: 99, total: 100 }), 'post-reset')
    const after = getCacheStatsHistory()
    expect(after.length).toBe(1)
    expect(after[0]!.label).toBe('post-reset')
  })
})

describe('cacheStatsTracker — unsupported mixing', () => {
  test('mixing supported + unsupported keeps supported data visible', () => {
    recordRequest(
      {
        read: 0,
        created: 0,
        total: 0,
        hitRate: null,
        supported: false,
      },
      'copilot',
    )
    recordRequest(
      makeMetrics({ read: 100, total: 500, hitRate: 0.2 }),
      'claude',
    )
    const turn = getCurrentTurnCacheMetrics()
    expect(turn.supported).toBe(true)
    expect(turn.read).toBe(100)
  })
})
179
src/services/api/cacheStatsTracker.ts
Normal file
@@ -0,0 +1,179 @@
/**
 * Per-query and per-session cache metrics tracker for Phase 1 observability.
 *
 * Sits downstream of `extractCacheMetrics` (the normalizer) and upstream of
 * the REPL display + `/cache-stats` command. The shim layers already
 * normalize raw usage into Anthropic-shaped fields, so this tracker listens
 * for each successful API response and folds the metrics into three buckets:
 *
 *   - currentTurn : cleared by callers at the start of each user turn
 *   - session     : accumulates from process start until `/clear`
 *   - history     : per-request log for the `/cache-stats` breakdown view
 *
 * Design rationale:
 *   - Module-local state (not AppState, not bootstrap/state.ts) because
 *     this is strictly observability — nothing in the conversation flow
 *     depends on it and we don't want to couple the shim to React state.
 *   - `recordRequest()` takes an ALREADY-normalized CacheMetrics so the
 *     shim layer can resolve the provider once and we avoid re-running
 *     env detection on every response.
 *   - `history` is bounded (DEFAULT_HISTORY_MAX) so a long-lived session
 *     can't grow memory unboundedly. Oldest entries drop first.
 *   - `supported: false` requests still land in history (so the user can
 *     see "6 requests, all N/A" rather than "no data"), but they add to
 *     sums as zero — `addCacheMetrics` preserves the supported flag.
 *
 * History is stored as a **ring buffer** (fixed-size array + write index).
 * The previous implementation used `array.splice(0, n)` on every overflow,
 * which shifts the entire tail — O(n) per recordRequest for the default
 * cap of 500 (negligible in practice, but wasteful). The ring makes
 * `recordRequest` strictly O(1). `getCacheStatsHistory()` still pays O(n)
 * to reconstruct chronological order, but that only runs when the user
 * opens `/cache-stats` or the REPL renders — never in the hot path.
 */
import { addCacheMetrics, type CacheMetrics } from './cacheMetrics.js'
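
// Typical call pattern, sketched for orientation (the call sites live in
// the REPL and shim layers, not in this file):
//   resetCurrentTurn()                  // REPL, at the start of a user turn
//   recordRequest(metrics, modelName)   // shim, exactly once per response
//   getCurrentTurnCacheMetrics()        // REPL status line, via the formatters
//   getCacheStatsHistory()              // /cache-stats breakdown rows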

/** One request's cache footprint — what the tracker remembers per request. */
export type CacheStatsEntry = {
  /** Unix ms when the request completed. */
  timestamp: number
  /** Opaque label (usually the model string) for `/cache-stats` rows. */
  label: string
  /** Normalized metrics for this single request. */
  metrics: CacheMetrics
}

// Bound the per-session history. 500 requests ≈ a full day of active use;
// any more than that is noise for a diagnostic command and starts costing
// real memory (~100 bytes per entry with the labels).
const DEFAULT_HISTORY_MAX = 500

const EMPTY_METRICS: CacheMetrics = {
  read: 0,
  created: 0,
  total: 0,
  hitRate: null,
  supported: false,
}

type TrackerState = {
  currentTurn: CacheMetrics
  session: CacheMetrics
  // Ring buffer: fixed-size array, `historyWriteIdx` points at the next
  // slot to overwrite. Once `historySize === historyMax`, each new push
  // drops the oldest entry by simply overwriting it — no shifting.
  history: (CacheStatsEntry | undefined)[]
  historyWriteIdx: number
  historySize: number
  historyMax: number
}

function createInitialState(max: number): TrackerState {
  return {
    currentTurn: EMPTY_METRICS,
    session: EMPTY_METRICS,
    history: new Array(max),
    historyWriteIdx: 0,
    historySize: 0,
    historyMax: max,
  }
}

const state: TrackerState = createInitialState(DEFAULT_HISTORY_MAX)

/**
 * Record a single API response's normalized cache metrics. Must be called
 * exactly once per request (the caller guarantees no double-counting) —
 * safe to call from the shim right after `addToTotalSessionCost`.
 *
 * O(1) via the ring-buffer write — the previous version used `splice(0, n)`
 * on overflow, which was O(n) per call for the default cap of 500.
 */
export function recordRequest(
  metrics: CacheMetrics,
  label: string,
): void {
  state.currentTurn = addCacheMetrics(state.currentTurn, metrics)
  state.session = addCacheMetrics(state.session, metrics)
  const entry: CacheStatsEntry = {
    timestamp: Date.now(),
    label,
    metrics,
  }
  // Overwrite at the write head. If the ring is full, this implicitly
  // drops the oldest entry (which previously lived at this slot).
  state.history[state.historyWriteIdx] = entry
  state.historyWriteIdx = (state.historyWriteIdx + 1) % state.historyMax
  if (state.historySize < state.historyMax) {
    state.historySize++
  }
}

/** Clear turn-level counters at the start of a new user turn. */
export function resetCurrentTurn(): void {
  state.currentTurn = EMPTY_METRICS
}

/** Clear all session state — used by `/clear`, `/compact`, tests. */
export function resetSessionCacheStats(): void {
  state.currentTurn = EMPTY_METRICS
  state.session = EMPTY_METRICS
  // Rebuild the ring so any held-over references can be GC'd. Slightly
  // more work than zeroing the indices, but `/clear` is rare and this
  // avoids silently pinning old CacheStatsEntry objects in memory.
  state.history = new Array(state.historyMax)
  state.historyWriteIdx = 0
  state.historySize = 0
}

/** Snapshot of the current turn's aggregate. */
export function getCurrentTurnCacheMetrics(): CacheMetrics {
  return state.currentTurn
}

/** Snapshot of the session-wide aggregate. */
export function getSessionCacheMetrics(): CacheMetrics {
  return state.session
}

/**
 * Recent per-request entries, oldest-first. Returns a copy so callers
 * can freely sort/filter without perturbing the tracker.
 *
 * Walks the ring from the oldest slot to the newest. Two cases:
 *   - not yet full: oldest is at index 0, newest at `size - 1`
 *   - full / wrapped: oldest is at `writeIdx`, newest at `writeIdx - 1`
 */
export function getCacheStatsHistory(): CacheStatsEntry[] {
  if (state.historySize < state.historyMax) {
    // Fast path: the ring hasn't wrapped yet; entries live at [0..size).
    return state.history.slice(0, state.historySize) as CacheStatsEntry[]
  }
  // Wrapped: reconstruct oldest-first by concatenating the two halves.
  const tail = state.history.slice(state.historyWriteIdx) as CacheStatsEntry[]
  const head = state.history.slice(0, state.historyWriteIdx) as CacheStatsEntry[]
  return tail.concat(head)
}
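
// Wrap walkthrough (cap = 4, after pushes m0..m5): the slots hold
// [m4, m5, m2, m3] with writeIdx = 2, so tail = slice(2) = [m2, m3],
// head = slice(0, 2) = [m4, m5], and the result is [m2, m3, m4, m5],
// oldest-first.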

/**
 * Test/debug hook — do not use in production paths. Resizes the ring,
 * preserving the most recent `min(cap, size)` entries in chronological
 * order, so tests can shrink the cap and verify eviction behavior.
 */
export function _setHistoryCapForTesting(cap: number): void {
  // Cap must be positive — a zero-sized ring would divide by zero on
  // `preserved.length % cap`. Throw loudly rather than silently land on
  // NaN indices that would corrupt the ring on the next push.
  if (cap < 1) {
    throw new Error(`_setHistoryCapForTesting: cap must be >= 1 (got ${cap})`)
  }
  const current = getCacheStatsHistory()
  const preserved = cap < current.length ? current.slice(-cap) : current
  state.history = new Array(cap)
  for (let i = 0; i < preserved.length; i++) {
    state.history[i] = preserved[i]
  }
  state.historyWriteIdx = preserved.length % cap
  state.historySize = preserved.length
  state.historyMax = cap
}
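
// Resize walkthrough (shrinking cap 5 -> 3 with m0..m4 recorded): `current`
// is [m0..m4], `preserved` keeps the newest three [m2, m3, m4], they land
// in slots 0..2 of the new ring, and writeIdx = 3 % 3 = 0 marks the ring as
// full-and-wrapped, so the next push overwrites m2 first.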
@@ -1,4 +1,5 @@
import { APIError } from '@anthropic-ai/sdk'
import { buildAnthropicUsageFromRawUsage } from './cacheMetrics.js'
import { compressToolHistory } from './compressToolHistory.js'
import { fetchWithProxyRetry } from './fetchWithProxyRetry.js'
import type {
@@ -78,21 +79,12 @@ type CodexSseEvent = {
  data: Record<string, any>
}

function makeUsage(usage?: {
  input_tokens?: number
  output_tokens?: number
  input_tokens_details?: { cached_tokens?: number }
  prompt_tokens_details?: { cached_tokens?: number }
}): AnthropicUsage {
  return {
    input_tokens: usage?.input_tokens ?? 0,
    output_tokens: usage?.output_tokens ?? 0,
    cache_creation_input_tokens: 0,
    cache_read_input_tokens:
      usage?.input_tokens_details?.cached_tokens ??
      usage?.prompt_tokens_details?.cached_tokens ??
      0,
  }
}
function makeUsage(usage?: Record<string, unknown>): AnthropicUsage {
  // Single source of truth for raw → Anthropic shape. Lives in
  // cacheMetrics.ts alongside the raw-shape extractor so any new
  // provider quirk requires a one-file change and the integration test
  // can call the exact same function instead of re-implementing it.
  return buildAnthropicUsageFromRawUsage(usage)
}

function makeMessageId(): string {
@@ -911,18 +903,14 @@ export async function* codexStreamToAnthropic(
      stop_reason: determineStopReason(finalResponse, sawToolUse),
      stop_sequence: null,
    },
    usage: {
      // Subtract cached tokens: OpenAI includes them in input_tokens,
      // but Anthropic convention treats input_tokens as non-cached only.
      input_tokens: (finalResponse?.usage?.input_tokens ?? 0) -
        (finalResponse?.usage?.input_tokens_details?.cached_tokens ??
          finalResponse?.usage?.prompt_tokens_details?.cached_tokens ?? 0),
      output_tokens: finalResponse?.usage?.output_tokens ?? 0,
      cache_read_input_tokens:
        finalResponse?.usage?.input_tokens_details?.cached_tokens ??
        finalResponse?.usage?.prompt_tokens_details?.cached_tokens ??
        0,
    },
    // Delegate to the shared normalizer so the streaming message_delta
    // path uses the same raw→Anthropic conversion as makeUsage() above
    // and the non-streaming response converter below. Previously this
    // block had its own inline subtraction that missed the Kimi /
    // DeepSeek / Gemini raw shapes that the shared helper handles.
    usage: makeUsage(
      finalResponse?.usage as Record<string, unknown> | undefined,
    ),
  }
  yield { type: 'message_stop' }
}

@@ -46,6 +46,7 @@ import {
  type AnthropicUsage,
  type ShimCreateParams,
} from './codexShim.js'
import { buildAnthropicUsageFromRawUsage } from './cacheMetrics.js'
import { compressToolHistory } from './compressToolHistory.js'
import { fetchWithProxyRetry } from './fetchWithProxyRetry.js'
import {
@@ -845,16 +846,12 @@ function convertChunkUsage(
  usage: OpenAIStreamChunk['usage'] | undefined,
): Partial<AnthropicUsage> | undefined {
  if (!usage) return undefined

  const cached = usage.prompt_tokens_details?.cached_tokens ?? 0
  return {
    // Subtract cached tokens: OpenAI includes them in prompt_tokens,
    // but Anthropic convention treats input_tokens as non-cached only.
    input_tokens: (usage.prompt_tokens ?? 0) - cached,
    output_tokens: usage.completion_tokens ?? 0,
    cache_creation_input_tokens: 0,
    cache_read_input_tokens: cached,
  }
  // Delegates to the shared helper so this path, codexShim.makeUsage,
  // the non-streaming response below, and the integration tests all
  // produce byte-identical output for the same raw input.
  return buildAnthropicUsageFromRawUsage(
    usage as unknown as Record<string, unknown>,
  )
}

const JSON_REPAIR_SUFFIXES = [
@@ -2154,12 +2151,9 @@ class OpenAIShimMessages {
      model: data.model ?? model,
      stop_reason: stopReason,
      stop_sequence: null,
      usage: {
        input_tokens: data.usage?.prompt_tokens ?? 0,
        output_tokens: data.usage?.completion_tokens ?? 0,
        cache_creation_input_tokens: 0,
        cache_read_input_tokens: data.usage?.prompt_tokens_details?.cached_tokens ?? 0,
      },
      usage: buildAnthropicUsageFromRawUsage(
        data.usage as unknown as Record<string, unknown> | undefined,
      ),
    }
  }
}