feat(api): expose cache metrics in REPL + normalize across providers (#813)

* feat(api): expose cache metrics in REPL + /cache-stats command
* fix(api): normalize Kimi/DeepSeek/Gemini cache fields through shim layer
* test(api): cover /cache-stats rendering + fix CacheMetrics docstring drift
* fix(api): always reset cache turn counter + include date in /cache-stats rows
* refactor(api): unify shim usage builder + add cost-tracker wiring test
* fix(api): classify private-IP/self-hosted OpenAI endpoints as N/A instead of cold
* fix(api): require colon guard on IPv6 ULA prefix to avoid public-host over-match
* perf(api): ring buffer for cache history + hit rate clamp + .localhost TLD
* fix(api): null guards on formatters + document Codex Responses API shape
* fix(api): defensive start-of-turn reset + config gate fallback + env var docs
* fix(api): trust forwarded cache data on self-hosted URLs (data-driven)
* refactor(api): delegate streaming Responses usage to shared makeUsage helper
2026-04-25 01:38:25 -03:00
parent 9070220292
commit 9e23c2bec4
20 changed files with 2749 additions and 46 deletions
--- a/src/services/api/cacheMetrics.test.ts
+++ b/src/services/api/cacheMetrics.test.ts
@@ -0,0 +1,782 @@
+import { expect, test, describe } from 'bun:test'
+import {
+  extractCacheMetrics,
+  extractCacheReadFromRawUsage,
+  resolveCacheProvider,
+  formatCacheMetricsCompact,
+  formatCacheMetricsFull,
+  addCacheMetrics,
+} from './cacheMetrics.js'
+
+describe('extractCacheMetrics — Anthropic (firstParty/bedrock/vertex/foundry)', () => {
+  test('reports read/created separately and computes hit rate over total input', () => {
+    const usage = {
+      input_tokens: 300,
+      output_tokens: 100,
+      cache_read_input_tokens: 800,
+      cache_creation_input_tokens: 200,
+    }
+    const m = extractCacheMetrics(usage, 'anthropic')
+    expect(m.supported).toBe(true)
+    expect(m.read).toBe(800)
+    expect(m.created).toBe(200)
+    // total = fresh(300) + created(200) + read(800) = 1300
+    expect(m.total).toBe(1300)
+    expect(m.hitRate).toBeCloseTo(800 / 1300, 4)
+  })
+
+  test('returns cold metrics when no cache activity yet', () => {
+    const m = extractCacheMetrics({ input_tokens: 500 }, 'anthropic')
+    expect(m.supported).toBe(true)
+    expect(m.read).toBe(0)
+    expect(m.created).toBe(0)
+    expect(m.hitRate).toBe(0)
+  })
+
+  test('null hit rate when usage has no input at all', () => {
+    const m = extractCacheMetrics({}, 'anthropic')
+    expect(m.supported).toBe(true)
+    expect(m.hitRate).toBeNull()
+  })
+})
+
+// NOTE: OpenAI/Codex/Kimi/DeepSeek/Gemini raw shapes are now tested through
+// extractCacheReadFromRawUsage (below). extractCacheMetrics sees the
+// post-shim Anthropic shape for every provider, so the tests here verify
+// that the shape lookup works uniformly against the shimmed fields.
+
+describe('extractCacheMetrics — post-shim Anthropic shape (applies to all providers)', () => {
+  test('OpenAI post-shim (openai bucket) — reads Anthropic fields injected by convertChunkUsage', () => {
+    // This is what cost-tracker actually sees for OpenAI upstreams: the
+    // shim has already subtracted cached from prompt_tokens and moved it
+    // to cache_read_input_tokens.
+    const shimmed = {
+      input_tokens: 800, // fresh = 2000 - 1200
+      output_tokens: 300,
+      cache_creation_input_tokens: 0,
+      cache_read_input_tokens: 1_200,
+    }
+    const m = extractCacheMetrics(shimmed, 'openai')
+    expect(m.supported).toBe(true)
+    expect(m.read).toBe(1_200)
+    expect(m.created).toBe(0)
+    expect(m.total).toBe(2_000) // 800 fresh + 1200 read
+    expect(m.hitRate).toBe(0.6)
+  })
+
+  test('Codex post-shim — same Anthropic shape as OpenAI', () => {
+    const shimmed = {
+      input_tokens: 900, // 1500 - 600
+      cache_creation_input_tokens: 0,
+      cache_read_input_tokens: 600,
+    }
+    const m = extractCacheMetrics(shimmed, 'codex')
+    expect(m.read).toBe(600)
+    expect(m.total).toBe(1_500)
+    expect(m.hitRate).toBe(0.4)
+  })
+
+  test('Kimi post-shim — shim moved top-level cached_tokens into Anthropic field', () => {
+    const shimmed = {
+      input_tokens: 600, // 1000 - 400
+      cache_creation_input_tokens: 0,
+      cache_read_input_tokens: 400,
+    }
+    const m = extractCacheMetrics(shimmed, 'kimi')
+    expect(m.read).toBe(400)
+    expect(m.total).toBe(1_000)
+    expect(m.hitRate).toBe(0.4)
+  })
+
+  test('DeepSeek post-shim — hit moved to cache_read_input_tokens, miss to input_tokens', () => {
+    const shimmed = {
+      input_tokens: 300, // miss
+      cache_creation_input_tokens: 0,
+      cache_read_input_tokens: 700, // hit
+    }
+    const m = extractCacheMetrics(shimmed, 'deepseek')
+    expect(m.read).toBe(700)
+    expect(m.total).toBe(1_000)
+    expect(m.hitRate).toBe(0.7)
+  })
+
+  test('Gemini post-shim — cached_content_token_count moved to cache_read_input_tokens', () => {
+    const shimmed = {
+      input_tokens: 800, // 4000 - 3200
+      cache_creation_input_tokens: 0,
+      cache_read_input_tokens: 3_200,
+    }
+    const m = extractCacheMetrics(shimmed, 'gemini')
+    expect(m.read).toBe(3_200)
+    expect(m.total).toBe(4_000)
+    expect(m.hitRate).toBe(0.8)
+  })
+})
+
+describe('extractCacheReadFromRawUsage — single source of truth for shim layer', () => {
+  test('Anthropic-native passthrough: cache_read_input_tokens', () => {
+    expect(
+      extractCacheReadFromRawUsage({ cache_read_input_tokens: 1_500 }),
+    ).toBe(1_500)
+  })
+
+  test('OpenAI Chat Completions: prompt_tokens_details.cached_tokens', () => {
+    expect(
+      extractCacheReadFromRawUsage({
+        prompt_tokens: 2_000,
+        prompt_tokens_details: { cached_tokens: 1_200 },
+      }),
+    ).toBe(1_200)
+  })
+
+  test('Codex Responses API: input_tokens_details.cached_tokens', () => {
+    expect(
+      extractCacheReadFromRawUsage({
+        input_tokens: 1_500,
+        input_tokens_details: { cached_tokens: 600 },
+      }),
+    ).toBe(600)
+  })
+
+  test('Kimi / Moonshot: top-level cached_tokens', () => {
+    expect(
+      extractCacheReadFromRawUsage({ prompt_tokens: 1_000, cached_tokens: 400 }),
+    ).toBe(400)
+  })
+
+  test('DeepSeek: prompt_cache_hit_tokens', () => {
+    expect(
+      extractCacheReadFromRawUsage({
+        prompt_cache_hit_tokens: 700,
+        prompt_cache_miss_tokens: 300,
+      }),
+    ).toBe(700)
+  })
+
+  test('Gemini: cached_content_token_count', () => {
+    expect(
+      extractCacheReadFromRawUsage({
+        prompt_token_count: 4_000,
+        cached_content_token_count: 3_200,
+      }),
+    ).toBe(3_200)
+  })
+
+  test('no cache fields at all → 0 (Copilot/Ollama/unknown shape)', () => {
+    expect(extractCacheReadFromRawUsage({ prompt_tokens: 500 })).toBe(0)
+  })
+
+  test('Anthropic field wins over OpenAI field when both present', () => {
+    // Shouldn't happen in practice, but if usage was double-annotated we
+    // trust the Anthropic-native number (it's the more authoritative one).
+    expect(
+      extractCacheReadFromRawUsage({
+        cache_read_input_tokens: 999,
+        prompt_tokens_details: { cached_tokens: 111 },
+      }),
+    ).toBe(999)
+  })
+
+  test('null/undefined/non-object → 0', () => {
+    expect(extractCacheReadFromRawUsage(null)).toBe(0)
+    expect(extractCacheReadFromRawUsage(undefined)).toBe(0)
+    expect(extractCacheReadFromRawUsage('nope' as unknown as never)).toBe(0)
+  })
+})
+
+describe('extractCacheMetrics — Copilot / Ollama (unsupported)', () => {
+  test('returns supported:false with all zeros and null hitRate for Copilot', () => {
+    const m = extractCacheMetrics({ prompt_tokens: 1000 }, 'copilot')
+    expect(m.supported).toBe(false)
+    expect(m.read).toBe(0)
+    expect(m.created).toBe(0)
+    expect(m.hitRate).toBeNull()
+  })
+
+  test('returns supported:false for Ollama', () => {
+    const m = extractCacheMetrics({ prompt_tokens: 42 }, 'ollama')
+    expect(m.supported).toBe(false)
+    expect(m.hitRate).toBeNull()
+  })
+
+  test('Copilot serving Claude (copilot-claude) is supported and uses Anthropic fields', () => {
+    const usage = {
+      input_tokens: 200,
+      cache_read_input_tokens: 800,
+      cache_creation_input_tokens: 100,
+    }
+    const m = extractCacheMetrics(usage, 'copilot-claude')
+    expect(m.supported).toBe(true)
+    expect(m.read).toBe(800)
+    expect(m.created).toBe(100)
+    expect(m.total).toBe(1_100)
+  })
+})
+
+describe('extractCacheMetrics — bad/empty input', () => {
+  test('null usage returns unsupported', () => {
+    expect(extractCacheMetrics(null, 'anthropic').supported).toBe(false)
+  })
+
+  test('non-object usage returns unsupported', () => {
+    expect(extractCacheMetrics('oops' as unknown as never, 'openai').supported).toBe(
+      false,
+    )
+  })
+})
+
+describe('resolveCacheProvider', () => {
+  test('firstParty → anthropic', () => {
+    expect(resolveCacheProvider('firstParty')).toBe('anthropic')
+  })
+  test('bedrock/vertex/foundry → anthropic', () => {
+    expect(resolveCacheProvider('bedrock')).toBe('anthropic')
+    expect(resolveCacheProvider('vertex')).toBe('anthropic')
+    expect(resolveCacheProvider('foundry')).toBe('anthropic')
+  })
+  test('github without claude hint → copilot (unsupported)', () => {
+    expect(resolveCacheProvider('github')).toBe('copilot')
+  })
+  test('github with claude hint → copilot-claude', () => {
+    expect(
+      resolveCacheProvider('github', { githubNativeAnthropic: true }),
+    ).toBe('copilot-claude')
+  })
+  test('openai with localhost / loopback → self-hosted', () => {
+    // These used to return 'ollama'; the bucket is now 'self-hosted'
+    // because not every local OpenAI-compatible server is Ollama
+    // (could be vLLM, LM Studio, LocalAI, text-generation-webui).
+    // Downstream, 'ollama' is always N/A; 'self-hosted' is N/A unless cache fields arrive.
+    expect(
+      resolveCacheProvider('openai', { openAiBaseUrl: 'http://localhost:8080/v1' }),
+    ).toBe('self-hosted')
+    expect(
+      resolveCacheProvider('openai', { openAiBaseUrl: 'http://127.0.0.1:1234/v1' }),
+    ).toBe('self-hosted')
+    // Localhost:11434 hits the self-hosted branch first — 'ollama' only
+    // kicks in when the :11434 port appears on a public-looking URL
+    // (which would be unusual but still deserves honest classification).
+    expect(
+      resolveCacheProvider('openai', { openAiBaseUrl: 'http://localhost:11434/v1' }),
+    ).toBe('self-hosted')
+    expect(
+      resolveCacheProvider('openai', { openAiBaseUrl: 'http://[::1]:5000/v1' }),
+    ).toBe('self-hosted')
+  })
+
+  test('openai on RFC1918 private IP → self-hosted (pre-fix: misclassified as openai)', () => {
+    // These are the exact cases the reviewer flagged. Before this fix,
+    // a vLLM / LocalAI server on a LAN address fell through to the
+    // 'openai' branch and /cache-stats showed '[Cache: cold]' — which
+    // users read as "my cache is broken" when the provider simply
+    // didn't report cache fields. Now they land in 'self-hosted' and
+    // /cache-stats shows '[Cache: N/A]'.
+    expect(
+      resolveCacheProvider('openai', { openAiBaseUrl: 'http://192.168.1.50:8000/v1' }),
+    ).toBe('self-hosted')
+    expect(
+      resolveCacheProvider('openai', { openAiBaseUrl: 'http://10.0.0.7:8080/v1' }),
+    ).toBe('self-hosted')
+    expect(
+      resolveCacheProvider('openai', { openAiBaseUrl: 'http://172.20.0.3:5000/v1' }),
+    ).toBe('self-hosted')
+  })
+
+  test('openai on link-local / CGNAT → self-hosted', () => {
+    expect(
+      resolveCacheProvider('openai', { openAiBaseUrl: 'http://169.254.169.254/v1' }),
+    ).toBe('self-hosted')
+    expect(
+      resolveCacheProvider('openai', { openAiBaseUrl: 'http://100.64.1.5:8000/v1' }),
+    ).toBe('self-hosted')
+  })
+
+  test('openai on reserved TLD (.local / .internal / .lan / .home.arpa) → self-hosted', () => {
+    // Per RFC 6762 (.local/mDNS), RFC 8375 (.home.arpa), and widely
+    // used .internal / .lan conventions. These never resolve publicly.
+    expect(
+      resolveCacheProvider('openai', { openAiBaseUrl: 'http://llm.internal:5000/v1' }),
+    ).toBe('self-hosted')
+    expect(
+      resolveCacheProvider('openai', { openAiBaseUrl: 'http://llm.local:8080/v1' }),
+    ).toBe('self-hosted')
+    expect(
+      resolveCacheProvider('openai', { openAiBaseUrl: 'http://vllm.home.arpa/v1' }),
+    ).toBe('self-hosted')
+    expect(
+      resolveCacheProvider('openai', { openAiBaseUrl: 'http://box.lan:1234/v1' }),
+    ).toBe('self-hosted')
+  })
+
+  test('openai on IPv6 local / link-local → self-hosted', () => {
+    expect(
+      resolveCacheProvider('openai', { openAiBaseUrl: 'http://[fe80::1]:8000/v1' }),
+    ).toBe('self-hosted')
+    expect(
+      resolveCacheProvider('openai', { openAiBaseUrl: 'http://[fd12:3456::7]:8080/v1' }),
+    ).toBe('self-hosted')
+    expect(
+      resolveCacheProvider('openai', { openAiBaseUrl: 'http://[fc00::1]:8080/v1' }),
+    ).toBe('self-hosted')
+  })
+
+  test('IPv6 ULA prefix (fc/fd) does NOT over-match public hostnames', () => {
+    // Regression guard: an early version of isLocalOrPrivateUrl checked
+    // `h.startsWith('fc')` / `startsWith('fd')` without a colon guard,
+    // which misclassified legitimate public hosts whose names happen to
+    // begin with those letters. The fix requires a colon in the match
+    // so only real IPv6 literals hit the branch.
+    expect(
+      resolveCacheProvider('openai', {
+        openAiBaseUrl: 'https://fc-api.example.com/v1',
+      }),
+    ).toBe('openai')
+    expect(
+      resolveCacheProvider('openai', {
+        openAiBaseUrl: 'https://fd-hosted.example.com/v1',
+      }),
+    ).toBe('openai')
+    // Same goes for names that look like hex prefixes but aren't IPv6.
+    expect(
+      resolveCacheProvider('openai', {
+        openAiBaseUrl: 'https://fcbench.net/v1',
+      }),
+    ).toBe('openai')
+  })
+
+  test('openai with :11434 on a public host → ollama (default-port heuristic)', () => {
+    // Contrived but the heuristic should still fire — someone running
+    // Ollama behind a reverse proxy with port preserved.
+    expect(
+      resolveCacheProvider('openai', {
+        openAiBaseUrl: 'https://ollama.example.com:11434/v1',
+      }),
+    ).toBe('ollama')
+  })
+
+  test('openai with moonshot URL → kimi', () => {
+    expect(
+      resolveCacheProvider('openai', { openAiBaseUrl: 'https://api.moonshot.ai/v1' }),
+    ).toBe('kimi')
+  })
+  test('openai with deepseek URL → deepseek', () => {
+    expect(
+      resolveCacheProvider('openai', { openAiBaseUrl: 'https://api.deepseek.com/v1' }),
+    ).toBe('deepseek')
+  })
+  test('private IP beats hosted-keyword matching (self-hosted takes priority)', () => {
+    // A pathological URL: a private-IP host whose path string contains
+    // "deepseek". Self-hosted detection must run FIRST so the URL
+    // classifies honestly — the path alone doesn't prove the upstream
+    // is the real DeepSeek API.
+    expect(
+      resolveCacheProvider('openai', {
+        openAiBaseUrl: 'http://10.0.0.5:8000/v1/deepseek-proxy',
+      }),
+    ).toBe('self-hosted')
+  })
+  test('plain openai remains openai', () => {
+    expect(
+      resolveCacheProvider('openai', { openAiBaseUrl: 'https://api.openai.com/v1' }),
+    ).toBe('openai')
+  })
+  test('unparseable base URL falls back to substring heuristic', () => {
+    // Bare host:port without a scheme is common in misconfigured env.
+    // URL parsing gives no hostname (`localhost:` reads as the scheme), but we still honor
+    // the "localhost" hint so a broken config doesn't silently masquerade as cache-capable.
+    expect(
+      resolveCacheProvider('openai', { openAiBaseUrl: 'localhost:8000' }),
+    ).toBe('self-hosted')
+    // An unparseable and opaque string falls through to plain 'openai'
+    // (best-effort — nothing we can infer from a string like "???").
+    expect(
+      resolveCacheProvider('openai', { openAiBaseUrl: '???' }),
+    ).toBe('openai')
+  })
+  test('empty base URL → plain openai', () => {
+    // No hint at all: assume the canonical api.openai.com.
+    expect(resolveCacheProvider('openai')).toBe('openai')
+    expect(
+      resolveCacheProvider('openai', { openAiBaseUrl: '' }),
+    ).toBe('openai')
+  })
+  test('codex → codex', () => {
+    expect(resolveCacheProvider('codex')).toBe('codex')
+  })
+  test('gemini → gemini', () => {
+    expect(resolveCacheProvider('gemini')).toBe('gemini')
+  })
+})
+
+describe('resolveCacheProvider — .localhost TLD (RFC 6761)', () => {
+  test('subdomains of .localhost classify as self-hosted', () => {
+    // Chrome, Firefox, and systemd-resolved all natively resolve
+    // *.localhost to 127.0.0.1. Kubernetes Ingress and docker-compose
+    // setups commonly use app.localhost, api.localhost, etc.
+    expect(
+      resolveCacheProvider('openai', {
+        openAiBaseUrl: 'http://app.localhost:3000/v1',
+      }),
+    ).toBe('self-hosted')
+    expect(
+      resolveCacheProvider('openai', {
+        openAiBaseUrl: 'http://api.localhost/v1',
+      }),
+    ).toBe('self-hosted')
+    expect(
+      resolveCacheProvider('openai', {
+        openAiBaseUrl: 'http://llm.dev.localhost:8080/v1',
+      }),
+    ).toBe('self-hosted')
+  })
+
+  test('.localhost TLD does NOT match substring collisions', () => {
+    // Guard against regressions where `localhost` would match via
+    // substring rather than TLD semantics. `localhostify.com` and
+    // `mylocalhost.net` must stay on the public `openai` path.
+    expect(
+      resolveCacheProvider('openai', {
+        openAiBaseUrl: 'https://localhostify.com/v1',
+      }),
+    ).toBe('openai')
+    expect(
+      resolveCacheProvider('openai', {
+        openAiBaseUrl: 'https://mylocalhost.net/v1',
+      }),
+    ).toBe('openai')
+  })
+})
+
+describe('extractCacheMetrics — hit rate clamp', () => {
+  test('hitRate is clamped to 1.0 on pathological input (read > total)', () => {
+    // Defensive guard: with valid non-negative inputs the math enforces
+    // read <= total, so hitRate cannot exceed 1. But an upstream shim
+    // bug (e.g. reading a negative `fresh` from a future provider) could
+    // break the invariant. `Math.min(1, read/total)` caps the display at
+    // 100% rather than letting a `read=800 total=300` case render as
+    // "hit 267%" or (worse) null, which would hide the anomaly.
+    const metrics = extractCacheMetrics(
+      {
+        cache_read_input_tokens: 800,
+        cache_creation_input_tokens: 0,
+        // asNumber keeps finite negatives, so fresh = -500 → total =
+        // 800 + 0 + (-500) = 300, read=800 → raw ratio 2.67, clamp to 1.
+        input_tokens: -500,
+      } as unknown as Record<string, unknown>,
+      'anthropic',
+    )
+    expect(metrics.supported).toBe(true)
+    expect(metrics.hitRate).toBe(1)
+  })
+
+  test('normal inputs still yield accurate fractional hit rates', () => {
+    // Regression: clamp must not perturb the happy path.
+    const metrics = extractCacheMetrics(
+      {
+        cache_read_input_tokens: 300,
+        cache_creation_input_tokens: 0,
+        input_tokens: 700,
+      },
+      'anthropic',
+    )
+    expect(metrics.hitRate).toBeCloseTo(0.3, 5)
+  })
+})
+
+describe('extractCacheMetrics — self-hosted bucket (data-driven)', () => {
+  test('vanilla self-hosted endpoint without cache fields → unsupported / N/A', () => {
+    // vLLM, LocalAI, text-generation-webui, etc. emit no cache fields
+    // at all. With read=created=0 we mark unsupported so the REPL shows
+    // honest '[Cache: N/A]' instead of a fabricated 0%.
+    const metrics = extractCacheMetrics(
+      { input_tokens: 1_000, output_tokens: 200 },
+      'self-hosted',
+    )
+    expect(metrics.supported).toBe(false)
+    expect(metrics.hitRate).toBeNull()
+    expect(metrics.read).toBe(0)
+    expect(metrics.created).toBe(0)
+  })
+
+  test('internal reverse proxy forwarding real cache data → supported', () => {
+    // Review-blocker regression guard: an enterprise setup with an
+    // internal proxy on a private URL (e.g. `http://llm.internal:5000/v1`)
+    // forwarding to OpenAI / Kimi / DeepSeek / Gemini WILL deliver real
+    // cache fields via the shim. Pre-fix we would discard them because
+    // the URL heuristic classified the endpoint as 'self-hosted'. Now
+    // the data itself decides: any non-zero cache activity flows through
+    // the same normalization as an OpenAI bucket.
+    const shimmed = {
+      input_tokens: 800, // fresh (post-shim, cached already subtracted)
+      cache_read_input_tokens: 1_200, // shim extracted from upstream
+      cache_creation_input_tokens: 0,
+    }
+    const metrics = extractCacheMetrics(shimmed, 'self-hosted')
+    expect(metrics.supported).toBe(true)
+    expect(metrics.read).toBe(1_200)
+    expect(metrics.total).toBe(2_000)
+    expect(metrics.hitRate).toBe(0.6)
+  })
+
+  test('proxy with cache_creation but zero cache_read → still supported', () => {
+    // Mirror of the above for the first-call / cold-cache scenario:
+    // Anthropic-compatible upstreams emit creation tokens on the first
+    // request that primes the cache. Self-hosted proxy must preserve
+    // that signal, not swallow it because read is still 0.
+    const shimmed = {
+      input_tokens: 500,
+      cache_read_input_tokens: 0,
+      cache_creation_input_tokens: 800,
+    }
+    const metrics = extractCacheMetrics(shimmed, 'self-hosted')
+    expect(metrics.supported).toBe(true)
+    expect(metrics.created).toBe(800)
+    expect(metrics.read).toBe(0)
+  })
+})
+
+describe('formatCacheMetrics — defensive null/undefined guards', () => {
+  test('formatCacheMetricsCompact returns N/A for undefined input', () => {
+    // Signature says `CacheMetrics` but runtime bug on a failed API
+    // response could leave the caller with nothing. The formatter
+    // should degrade gracefully rather than throw on `.supported`.
+    expect(formatCacheMetricsCompact(undefined)).toBe('[Cache: N/A]')
+    expect(formatCacheMetricsCompact(null as unknown as undefined)).toBe(
+      '[Cache: N/A]',
+    )
+  })
+
+  test('formatCacheMetricsFull returns N/A for undefined input', () => {
+    expect(formatCacheMetricsFull(undefined)).toBe('[Cache: N/A]')
+    expect(formatCacheMetricsFull(null as unknown as undefined)).toBe(
+      '[Cache: N/A]',
+    )
+  })
+})
+
+describe('formatCacheMetricsCompact — self-hosted display paths', () => {
+  test('vanilla self-hosted (no cache data) renders as N/A', () => {
+    const metrics = extractCacheMetrics(
+      { input_tokens: 500 },
+      'self-hosted',
+    )
+    expect(formatCacheMetricsCompact(metrics)).toBe('[Cache: N/A]')
+    expect(formatCacheMetricsFull(metrics)).toBe('[Cache: N/A]')
+  })
+
+  test('self-hosted proxy with forwarded cache data renders real metrics', () => {
+    // Full display-path regression guard for the review-blocker fix:
+    // the user must see the real hit rate that the upstream emitted,
+    // not a silent N/A because the URL looked private.
+    const metrics = extractCacheMetrics(
+      {
+        input_tokens: 800,
+        cache_read_input_tokens: 1_200,
+        cache_creation_input_tokens: 0,
+      },
+      'self-hosted',
+    )
+    expect(formatCacheMetricsCompact(metrics)).toBe('[Cache: 1.2k read • hit 60%]')
+    expect(formatCacheMetricsFull(metrics)).toBe(
+      '[Cache: read=1.2k created=0 hit=60%]',
+    )
+  })
+})
+
+describe('formatCacheMetricsCompact — snapshot-stable output', () => {
+  test('supported with reads shows "k" abbreviation and hit rate', () => {
+    const out = formatCacheMetricsCompact({
+      read: 1_234,
+      created: 0,
+      total: 10_000,
+      hitRate: 0.1234,
+      supported: true,
+    })
+    expect(out).toBe('[Cache: 1.2k read • hit 12%]')
+  })
+
+  test('supported with no cache activity renders "cold"', () => {
+    const out = formatCacheMetricsCompact({
+      read: 0,
+      created: 0,
+      total: 500,
+      hitRate: 0,
+      supported: true,
+    })
+    expect(out).toBe('[Cache: cold]')
+  })
+
+  test('unsupported renders "N/A"', () => {
+    const out = formatCacheMetricsCompact({
+      read: 0,
+      created: 0,
+      total: 0,
+      hitRate: null,
+      supported: false,
+    })
+    expect(out).toBe('[Cache: N/A]')
+  })
+
+  test('small numbers render without abbreviation', () => {
+    const out = formatCacheMetricsCompact({
+      read: 42,
+      created: 0,
+      total: 100,
+      hitRate: 0.42,
+      supported: true,
+    })
+    expect(out).toBe('[Cache: 42 read • hit 42%]')
+  })
+})
+
+describe('formatCacheMetricsFull — snapshot-stable output', () => {
+  test('supported shows all fields', () => {
+    const out = formatCacheMetricsFull({
+      read: 1_234,
+      created: 250,
+      total: 10_000,
+      hitRate: 0.1234,
+      supported: true,
+    })
+    expect(out).toBe('[Cache: read=1.2k created=250 hit=12%]')
+  })
+
+  test('null hit rate renders n/a', () => {
+    const out = formatCacheMetricsFull({
+      read: 0,
+      created: 0,
+      total: 0,
+      hitRate: null,
+      supported: true,
+    })
+    expect(out).toBe('[Cache: read=0 created=0 hit=n/a]')
+  })
+
+  test('unsupported renders "N/A"', () => {
+    const out = formatCacheMetricsFull({
+      read: 0,
+      created: 0,
+      total: 0,
+      hitRate: null,
+      supported: false,
+    })
+    expect(out).toBe('[Cache: N/A]')
+  })
+})
+
+describe('hit-rate edge cases (plan-mandated coverage)', () => {
+  test('0 read / 0 created on supported provider → hitRate = 0 (not null) when total > 0', () => {
+    const m = extractCacheMetrics({ input_tokens: 500 }, 'anthropic')
+    expect(m.read).toBe(0)
+    expect(m.created).toBe(0)
+    expect(m.hitRate).toBe(0)
+  })
+
+  test('read only (no created) computes proportion correctly', () => {
+    const m = extractCacheMetrics(
+      { input_tokens: 0, cache_read_input_tokens: 800, cache_creation_input_tokens: 0 },
+      'anthropic',
+    )
+    expect(m.read).toBe(800)
+    expect(m.created).toBe(0)
+    expect(m.total).toBe(800)
+    expect(m.hitRate).toBe(1)
+  })
+
+  test('created only (first turn — no reads yet) gives 0 hit rate', () => {
+    const m = extractCacheMetrics(
+      {
+        input_tokens: 200,
+        cache_read_input_tokens: 0,
+        cache_creation_input_tokens: 1_000,
+      },
+      'anthropic',
+    )
+    expect(m.read).toBe(0)
+    expect(m.created).toBe(1_000)
+    expect(m.total).toBe(1_200)
+    expect(m.hitRate).toBe(0)
+  })
+
+  test('mixed read + created + fresh input — full denominator', () => {
+    const m = extractCacheMetrics(
+      {
+        input_tokens: 500,
+        cache_read_input_tokens: 3_000,
+        cache_creation_input_tokens: 1_500,
+      },
+      'anthropic',
+    )
+    // Denominator = fresh(500) + created(1500) + read(3000) = 5_000
+    // Hit = read/total = 3000 / 5000 = 0.6
+    expect(m.total).toBe(5_000)
+    expect(m.hitRate).toBe(0.6)
+  })
+
+  test('N/A (unsupported provider) preserves null hit-rate even with populated usage', () => {
+    // Simulate a Copilot usage payload that might look like OpenAI shape —
+    // we must NOT try to read it and must report supported:false.
+    const m = extractCacheMetrics(
+      { prompt_tokens: 5_000, prompt_tokens_details: { cached_tokens: 2_000 } },
+      'copilot',
+    )
+    expect(m.supported).toBe(false)
+    expect(m.read).toBe(0)
+    expect(m.hitRate).toBeNull()
+  })
+})
+
+describe('addCacheMetrics — session aggregation', () => {
+  test('sums read/created/total and recomputes hit rate', () => {
+    const a = {
+      read: 100,
+      created: 50,
+      total: 300,
+      hitRate: 100 / 300,
+      supported: true,
+    }
+    const b = {
+      read: 200,
+      created: 0,
+      total: 400,
+      hitRate: 0.5,
+      supported: true,
+    }
+    const sum = addCacheMetrics(a, b)
+    expect(sum.read).toBe(300)
+    expect(sum.created).toBe(50)
+    expect(sum.total).toBe(700)
+    expect(sum.hitRate).toBeCloseTo(300 / 700, 5)
+  })
+
+  test('unsupported + supported = supported (so we never lose honest data)', () => {
+    const unsupported = {
+      read: 0,
+      created: 0,
+      total: 0,
+      hitRate: null,
+      supported: false,
+    }
+    const supported = {
+      read: 10,
+      created: 0,
+      total: 100,
+      hitRate: 0.1,
+      supported: true,
+    }
+    expect(addCacheMetrics(unsupported, supported)).toBe(supported)
+    expect(addCacheMetrics(supported, unsupported)).toBe(supported)
+  })
+
+  test('unsupported + unsupported = unsupported', () => {
+    const u = {
+      read: 0,
+      created: 0,
+      total: 0,
+      hitRate: null,
+      supported: false,
+    }
+    const sum = addCacheMetrics(u, u)
+    expect(sum.supported).toBe(false)
+  })
+})