feat(api): expose cache metrics in REPL + normalize across providers (#813)
* feat(api): expose cache metrics in REPL + /cache-stats command * fix(api): normalize Kimi/DeepSeek/Gemini cache fields through shim layer * test(api): cover /cache-stats rendering + fix CacheMetrics docstring drift * fix(api): always reset cache turn counter + include date in /cache-stats rows * refactor(api): unify shim usage builder + add cost-tracker wiring test * fix(api): classify private-IP/self-hosted OpenAI endpoints as N/A instead of cold * fix(api): require colon guard on IPv6 ULA prefix to avoid public-host over-match * perf(api): ring buffer for cache history + hit rate clamp + .localhost TLD * fix(api): null guards on formatters + document Codex Responses API shape * fix(api): defensive start-of-turn reset + config gate fallback + env var docs * fix(api): trust forwarded cache data on self-hosted URLs (data-driven) * refactor(api): delegate streaming Responses usage to shared makeUsage helper
This commit is contained in:
14
.env.example
14
.env.example
@@ -299,6 +299,20 @@ ANTHROPIC_API_KEY=sk-ant-your-key-here
|
||||
# Useful for users who want full transparency over what the model sees
|
||||
# OPENCLAUDE_DISABLE_TOOL_REMINDERS=1
|
||||
|
||||
# Log structured per-request token usage (including cache metrics) to stderr.
|
||||
# Useful for auditing cache hit rate / debugging cost spikes outside the REPL.
|
||||
# Any truthy value enables it ("verbose", "1", "true").
|
||||
#
|
||||
# Complements (does NOT replace) CLAUDE_CODE_ENABLE_TOKEN_USAGE_ATTACHMENT —
|
||||
# they serve different audiences:
|
||||
# - OPENCLAUDE_LOG_TOKEN_USAGE is user-facing: one JSON line per API
|
||||
# request on stderr, intended for humans inspecting cost/caching.
|
||||
# - CLAUDE_CODE_ENABLE_TOKEN_USAGE_ATTACHMENT is model-facing: injects
|
||||
# a context-usage attachment INTO the prompt so the model can reason
|
||||
# about its own remaining context. Does not touch stderr.
|
||||
# Turn on whichever audience you're debugging; both can run together.
|
||||
# OPENCLAUDE_LOG_TOKEN_USAGE=verbose
|
||||
|
||||
# Custom timeout for API requests in milliseconds (default: varies)
|
||||
# API_TIMEOUT_MS=60000
|
||||
|
||||
|
||||
@@ -177,6 +177,7 @@ export OPENAI_MODEL=gpt-4o
|
||||
| `CODEX_AUTH_JSON_PATH` | Codex only | Path to a Codex CLI `auth.json` file |
|
||||
| `CODEX_HOME` | Codex only | Alternative Codex home directory |
|
||||
| `OPENCLAUDE_DISABLE_CO_AUTHORED_BY` | No | Suppress the default `Co-Authored-By` trailer in generated git commits |
|
||||
| `OPENCLAUDE_LOG_TOKEN_USAGE` | No | When truthy (e.g. `verbose`), emits one JSON line on stderr per API request with input/output/cache tokens and the resolved provider. **User-facing debug output** — complements the REPL display controlled by `/config showCacheStats`. Distinct from `CLAUDE_CODE_ENABLE_TOKEN_USAGE_ATTACHMENT`, which is **model-facing** (injects context usage info into the prompt itself). Both can run together. |
|
||||
|
||||
You can also use `ANTHROPIC_MODEL` to override the model name. `OPENAI_MODEL` takes priority.
|
||||
|
||||
|
||||
@@ -34,6 +34,7 @@ import installGitHubApp from './commands/install-github-app/index.js'
|
||||
import installSlackApp from './commands/install-slack-app/index.js'
|
||||
import breakCache from './commands/break-cache/index.js'
|
||||
import cacheProbe from './commands/cache-probe/index.js'
|
||||
import cacheStats from './commands/cacheStats/index.js'
|
||||
import mcp from './commands/mcp/index.js'
|
||||
import mobile from './commands/mobile/index.js'
|
||||
import onboarding from './commands/onboarding/index.js'
|
||||
@@ -271,6 +272,7 @@ const COMMANDS = memoize((): Command[] => [
|
||||
branch,
|
||||
btw,
|
||||
cacheProbe,
|
||||
cacheStats,
|
||||
chrome,
|
||||
clear,
|
||||
color,
|
||||
|
||||
157
src/commands/cacheStats/cacheStats.test.ts
Normal file
157
src/commands/cacheStats/cacheStats.test.ts
Normal file
@@ -0,0 +1,157 @@
|
||||
/**
|
||||
* Tests for `/cache-stats` command rendering.
|
||||
*
|
||||
* The command has non-trivial string formatting (timestamp slicing, model
|
||||
* label padding, conditional N/A footnote, recent-rows cap) which can
|
||||
* silently regress — these snapshot tests keep it honest.
|
||||
*/
|
||||
import { beforeEach, describe, expect, test } from 'bun:test'
|
||||
import type { CacheMetrics } from '../../services/api/cacheMetrics.js'
|
||||
import {
|
||||
_setHistoryCapForTesting,
|
||||
recordRequest,
|
||||
resetSessionCacheStats,
|
||||
} from '../../services/api/cacheStatsTracker.js'
|
||||
import { call } from './cacheStats.js'
|
||||
|
||||
function supported(partial: Partial<CacheMetrics>): CacheMetrics {
|
||||
return {
|
||||
read: 0,
|
||||
created: 0,
|
||||
total: 0,
|
||||
hitRate: null,
|
||||
supported: true,
|
||||
...partial,
|
||||
}
|
||||
}
|
||||
|
||||
const UNSUPPORTED: CacheMetrics = {
|
||||
read: 0,
|
||||
created: 0,
|
||||
total: 0,
|
||||
hitRate: null,
|
||||
supported: false,
|
||||
}
|
||||
|
||||
// The command signature requires a LocalJSXCommandContext. Our command
|
||||
// doesn't actually read it — we pass an empty stand-in so the test can
|
||||
// invoke call() without dragging the whole REPL context in.
|
||||
const EMPTY_CTX = {} as Parameters<typeof call>[1]
|
||||
|
||||
// /cache-stats always returns a text result. Narrow the union here so
|
||||
// the assertions don't need to redo the discriminant check every call.
|
||||
async function runCommand(): Promise<string> {
|
||||
const result = await call('', EMPTY_CTX)
|
||||
if (result.type !== 'text') {
|
||||
throw new Error(
|
||||
`cacheStats command must return type:'text', got ${result.type}`,
|
||||
)
|
||||
}
|
||||
return result.value
|
||||
}
|
||||
|
||||
beforeEach(() => {
|
||||
resetSessionCacheStats()
|
||||
_setHistoryCapForTesting(500)
|
||||
})
|
||||
|
||||
describe('/cache-stats — empty session', () => {
|
||||
test('shows friendly "no requests yet" message', async () => {
|
||||
const value = await runCommand()
|
||||
expect(value).toContain('No API requests yet this session')
|
||||
expect(value).toContain('/cache-stats')
|
||||
})
|
||||
})
|
||||
|
||||
describe('/cache-stats — supported-only session', () => {
|
||||
test('renders Cache stats header, turn and session summaries', async () => {
|
||||
recordRequest(
|
||||
supported({ read: 500, total: 1_000, hitRate: 0.5 }),
|
||||
'claude-sonnet-4',
|
||||
)
|
||||
const value = await runCommand()
|
||||
expect(value).toContain('Cache stats')
|
||||
expect(value).toContain('Current turn:')
|
||||
expect(value).toContain('Session total:')
|
||||
// Compact metric line should appear in the recent-requests table.
|
||||
expect(value).toContain('claude-sonnet-4')
|
||||
expect(value).toContain('read')
|
||||
})
|
||||
|
||||
test('omits the N/A footnote when every row is supported', async () => {
|
||||
recordRequest(supported({ read: 200, total: 400, hitRate: 0.5 }), 'model-A')
|
||||
const value = await runCommand()
|
||||
expect(value).not.toContain('N/A rows')
|
||||
})
|
||||
})
|
||||
|
||||
describe('/cache-stats — mixed supported + unsupported', () => {
|
||||
test('renders N/A footnote when any row is unsupported', async () => {
|
||||
recordRequest(UNSUPPORTED, 'gpt-4-copilot')
|
||||
recordRequest(
|
||||
supported({ read: 100, total: 500, hitRate: 0.2 }),
|
||||
'claude-sonnet-4',
|
||||
)
|
||||
const value = await runCommand()
|
||||
expect(value).toContain(
|
||||
'N/A rows: provider API does not expose cache usage',
|
||||
)
|
||||
expect(value).toContain('GitHub Copilot')
|
||||
expect(value).toContain('Ollama')
|
||||
})
|
||||
})
|
||||
|
||||
describe('/cache-stats — recent-rows cap', () => {
|
||||
test('caps the breakdown at 20 rows and reports omitted count', async () => {
|
||||
for (let i = 0; i < 25; i++) {
|
||||
recordRequest(
|
||||
supported({ read: i, total: 100, hitRate: i / 100 }),
|
||||
`model-${i}`,
|
||||
)
|
||||
}
|
||||
const value = await runCommand()
|
||||
// 20 shown, 5 omitted from the oldest end.
|
||||
expect(value).toContain('(20 of 25, 5 older omitted)')
|
||||
// Oldest rows (model-0..model-4) should not appear; newest must.
|
||||
expect(value).toContain('model-24')
|
||||
expect(value).not.toContain('model-0 ')
|
||||
})
|
||||
|
||||
test('does not mention "older omitted" when all rows fit', async () => {
|
||||
for (let i = 0; i < 5; i++) {
|
||||
recordRequest(supported({ read: i, total: 10 }), `m${i}`)
|
||||
}
|
||||
const value = await runCommand()
|
||||
expect(value).not.toContain('older omitted')
|
||||
expect(value).toContain('(5)')
|
||||
})
|
||||
})
|
||||
|
||||
describe('/cache-stats — model label rendering', () => {
|
||||
test('truncates long model labels to fit the column width', async () => {
|
||||
// cacheStats.ts pads+slices the label to 28 chars for alignment.
|
||||
const longLabel = 'some-extremely-long-model-identifier-that-wraps'
|
||||
recordRequest(supported({ read: 10, total: 100, hitRate: 0.1 }), longLabel)
|
||||
const value = await runCommand()
|
||||
// Sliced to 28 chars.
|
||||
expect(value).toContain(longLabel.slice(0, 28))
|
||||
// And the full string should NOT appear (would mean no truncation).
|
||||
expect(value).not.toContain(longLabel)
|
||||
})
|
||||
})
|
||||
|
||||
describe('/cache-stats — timestamp rendering', () => {
|
||||
test('renders each row with full date and time (YYYY-MM-DD HH:MM:SS)', async () => {
|
||||
recordRequest(supported({ read: 5, total: 10, hitRate: 0.5 }), 'claude-x')
|
||||
const value = await runCommand()
|
||||
// Match the full ISO-ish date + time the row uses. We assert the shape,
|
||||
// not a specific timestamp — real clock is used, so a regex on the
|
||||
// format is the right assertion.
|
||||
expect(value).toMatch(/\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}/)
|
||||
// Bare time-of-day alone (no date) should NOT appear in isolation — it
|
||||
// must always be preceded by the date. Guards against regression if
|
||||
// someone shortens the formatter again.
|
||||
const timeOnlyInRow = /\n\s*#\s*\d+\s+\d{2}:\d{2}:\d{2}\s/.test(value)
|
||||
expect(timeOnlyInRow).toBe(false)
|
||||
})
|
||||
})
|
||||
74
src/commands/cacheStats/cacheStats.ts
Normal file
74
src/commands/cacheStats/cacheStats.ts
Normal file
@@ -0,0 +1,74 @@
|
||||
import {
|
||||
getCacheStatsHistory,
|
||||
getCurrentTurnCacheMetrics,
|
||||
getSessionCacheMetrics,
|
||||
type CacheStatsEntry,
|
||||
} from '../../services/api/cacheStatsTracker.js'
|
||||
import {
|
||||
formatCacheMetricsCompact,
|
||||
formatCacheMetricsFull,
|
||||
type CacheMetrics,
|
||||
} from '../../services/api/cacheMetrics.js'
|
||||
import type { LocalCommandCall } from '../../types/command.js'
|
||||
|
||||
// Cap the per-request breakdown to keep output readable. Users wanting
|
||||
// the full history can rely on OPENCLAUDE_LOG_TOKEN_USAGE=verbose for
|
||||
// structured per-request stderr output.
|
||||
const MAX_RECENT_ROWS = 20
|
||||
|
||||
function formatRow(entry: CacheStatsEntry, idx: number): string {
|
||||
// `YYYY-MM-DD HH:MM:SS` — long-running sessions can span midnight and a
|
||||
// bare time-of-day makes the wrong row look "most recent" when two
|
||||
// entries on different days share the same HH:MM:SS.
|
||||
const iso = new Date(entry.timestamp).toISOString()
|
||||
const ts = `${iso.slice(0, 10)} ${iso.slice(11, 19)}`
|
||||
const line = formatCacheMetricsCompact(entry.metrics)
|
||||
return ` #${String(idx + 1).padStart(3)} ${ts} ${entry.label.padEnd(28).slice(0, 28)} ${line}`
|
||||
}
|
||||
|
||||
function summarize(label: string, m: CacheMetrics): string {
|
||||
return `${label.padEnd(18)}${formatCacheMetricsFull(m)}`
|
||||
}
|
||||
|
||||
export const call: LocalCommandCall = async () => {
|
||||
const history = getCacheStatsHistory()
|
||||
const session = getSessionCacheMetrics()
|
||||
const turn = getCurrentTurnCacheMetrics()
|
||||
|
||||
if (history.length === 0) {
|
||||
return {
|
||||
type: 'text',
|
||||
value:
|
||||
'Cache stats\n No API requests yet this session.\n Start a turn and re-run /cache-stats to see results.',
|
||||
}
|
||||
}
|
||||
|
||||
const recent = history.slice(-MAX_RECENT_ROWS)
|
||||
const omitted = history.length - recent.length
|
||||
|
||||
const lines: string[] = ['Cache stats', '']
|
||||
lines.push(summarize('Current turn:', turn))
|
||||
lines.push(summarize('Session total:', session))
|
||||
lines.push('')
|
||||
lines.push(`Recent requests (${recent.length}${omitted > 0 ? ` of ${history.length}, ${omitted} older omitted` : ''}):`)
|
||||
lines.push(` # time model cache`)
|
||||
for (const [i, entry] of recent.entries()) {
|
||||
lines.push(formatRow(entry, history.length - recent.length + i))
|
||||
}
|
||||
|
||||
// Honesty footnote — providers without cache reporting (vanilla Copilot,
|
||||
// Ollama) show [Cache: N/A] rather than a fake 0%. Tell the user so they
|
||||
// don't read "N/A" as "broken".
|
||||
const hasUnsupported = recent.some((e) => !e.metrics.supported)
|
||||
if (hasUnsupported) {
|
||||
lines.push('')
|
||||
lines.push(
|
||||
' N/A rows: provider API does not expose cache usage (GitHub Copilot, Ollama).',
|
||||
)
|
||||
lines.push(
|
||||
' The request still ran normally — only the metric is unavailable.',
|
||||
)
|
||||
}
|
||||
|
||||
return { type: 'text', value: lines.join('\n') }
|
||||
}
|
||||
24
src/commands/cacheStats/index.ts
Normal file
24
src/commands/cacheStats/index.ts
Normal file
@@ -0,0 +1,24 @@
|
||||
/**
|
||||
* /cache-stats — per-session cache diagnostics.
|
||||
*
|
||||
* Always-on diagnostic command (no toggle) that surfaces the metrics
|
||||
* tracked in `cacheStatsTracker.ts`. Breaks cache usage down by request
|
||||
* and also reports the session-wide aggregate — useful when the user
|
||||
* suspects a cache bust (e.g. after /reload-plugins) and wants to see
|
||||
* whether recent turns still hit the cache.
|
||||
*
|
||||
* Lazy-loaded (implementation in cacheStats.ts) to keep startup time
|
||||
* minimal — same pattern used by /cost and /cache-probe.
|
||||
*/
|
||||
import type { Command } from '../../commands.js'
|
||||
|
||||
const cacheStats = {
|
||||
type: 'local',
|
||||
name: 'cache-stats',
|
||||
description:
|
||||
'Show per-turn and session cache hit/miss stats (works across all providers)',
|
||||
supportsNonInteractive: true,
|
||||
load: () => import('./cacheStats.js'),
|
||||
} satisfies Command
|
||||
|
||||
export default cacheStats
|
||||
@@ -299,6 +299,26 @@ export function Config({
|
||||
enabled: toolHistoryCompressionEnabled
|
||||
});
|
||||
}
|
||||
}, {
|
||||
id: 'showCacheStats',
|
||||
label: 'Cache stats display',
|
||||
value: globalConfig.showCacheStats,
|
||||
options: ['off', 'compact', 'full'],
|
||||
type: 'enum' as const,
|
||||
onChange(mode: string) {
|
||||
const showCacheStats = (mode === 'off' || mode === 'compact' || mode === 'full' ? mode : 'compact') as 'off' | 'compact' | 'full';
|
||||
saveGlobalConfig(current_cs => ({
|
||||
...current_cs,
|
||||
showCacheStats
|
||||
}));
|
||||
setGlobalConfig({
|
||||
...getGlobalConfig(),
|
||||
showCacheStats
|
||||
});
|
||||
logEvent('tengu_show_cache_stats_setting_changed', {
|
||||
mode: showCacheStats as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS
|
||||
});
|
||||
}
|
||||
}, {
|
||||
id: 'spinnerTipsEnabled',
|
||||
label: 'Show tips',
|
||||
|
||||
128
src/cost-tracker.cacheIntegration.test.ts
Normal file
128
src/cost-tracker.cacheIntegration.test.ts
Normal file
@@ -0,0 +1,128 @@
|
||||
/**
|
||||
* Integration test for cost-tracker → cacheStatsTracker wiring.
|
||||
*
|
||||
* The unit tests in services/api/cacheMetrics.test.ts and
|
||||
* services/api/cacheStatsTracker.test.ts verify that each piece works
|
||||
* in isolation. This file verifies that they're ACTUALLY CONNECTED —
|
||||
* that `addToTotalSessionCost` resolves the provider, extracts metrics,
|
||||
* and records them on the tracker on every call. Without this test, a
|
||||
* future refactor could silently unwire the call chain (wrong param
|
||||
* order, renamed symbol, removed call) and every individual unit test
|
||||
* would still pass while `/cache-stats` showed empty data.
|
||||
*
|
||||
* We use real state — `resetCostState` + `getCurrentTurnCacheMetrics` —
|
||||
* rather than mocking the tracker module. Fewer moving parts, and the
|
||||
* test fails for the right reason if anyone breaks the wrapping.
|
||||
*/
|
||||
import { beforeEach, describe, expect, test } from 'bun:test'
|
||||
import { addToTotalSessionCost, resetCostState } from './cost-tracker.js'
|
||||
import {
|
||||
getCurrentTurnCacheMetrics,
|
||||
getSessionCacheMetrics,
|
||||
} from './services/api/cacheStatsTracker.js'
|
||||
|
||||
// BetaUsage-compatible shape — minimum fields addToTotalSessionCost
|
||||
// needs to run without throwing. Cache fields are the ones we care
|
||||
// about here; input/output go into model cost calc.
|
||||
function anthropicUsage(partial: {
|
||||
input?: number
|
||||
output?: number
|
||||
cacheRead?: number
|
||||
cacheCreation?: number
|
||||
}): Parameters<typeof addToTotalSessionCost>[1] {
|
||||
return {
|
||||
input_tokens: partial.input ?? 0,
|
||||
output_tokens: partial.output ?? 0,
|
||||
cache_read_input_tokens: partial.cacheRead ?? 0,
|
||||
cache_creation_input_tokens: partial.cacheCreation ?? 0,
|
||||
// BetaUsage has several other optional fields; they're not read by
|
||||
// the cache-tracking path so we leave them undefined.
|
||||
} as Parameters<typeof addToTotalSessionCost>[1]
|
||||
}
|
||||
|
||||
beforeEach(() => {
|
||||
// resetCostState is the wrapped version that ALSO clears the cache
|
||||
// tracker — this line is itself part of what we're verifying.
|
||||
resetCostState()
|
||||
})
|
||||
|
||||
describe('addToTotalSessionCost → cacheStatsTracker wiring', () => {
|
||||
test('records normalized cache metrics on the tracker for each call', () => {
|
||||
addToTotalSessionCost(
|
||||
0.01,
|
||||
anthropicUsage({
|
||||
input: 200,
|
||||
output: 50,
|
||||
cacheRead: 800,
|
||||
cacheCreation: 100,
|
||||
}),
|
||||
'claude-sonnet-4',
|
||||
)
|
||||
|
||||
const turn = getCurrentTurnCacheMetrics()
|
||||
expect(turn.supported).toBe(true)
|
||||
expect(turn.read).toBe(800)
|
||||
expect(turn.created).toBe(100)
|
||||
// total = fresh(200) + read(800) + created(100) = 1100
|
||||
expect(turn.total).toBe(1_100)
|
||||
// hitRate = read / total = 800 / 1100 ≈ 0.727
|
||||
expect(turn.hitRate).toBeCloseTo(800 / 1_100, 4)
|
||||
})
|
||||
|
||||
test('session aggregate accumulates across multiple API calls', () => {
|
||||
addToTotalSessionCost(
|
||||
0.01,
|
||||
anthropicUsage({ input: 100, cacheRead: 400 }),
|
||||
'claude-sonnet-4',
|
||||
)
|
||||
addToTotalSessionCost(
|
||||
0.02,
|
||||
anthropicUsage({ input: 200, cacheRead: 600 }),
|
||||
'claude-sonnet-4',
|
||||
)
|
||||
|
||||
const session = getSessionCacheMetrics()
|
||||
expect(session.read).toBe(1_000)
|
||||
// total = (100+400) + (200+600) = 1300
|
||||
expect(session.total).toBe(1_300)
|
||||
expect(session.hitRate).toBeCloseTo(1_000 / 1_300, 4)
|
||||
})
|
||||
|
||||
test('cold turn (no cache read/created) still records as supported', () => {
|
||||
addToTotalSessionCost(
|
||||
0.005,
|
||||
anthropicUsage({ input: 500, output: 100 }),
|
||||
'claude-sonnet-4',
|
||||
)
|
||||
|
||||
const turn = getCurrentTurnCacheMetrics()
|
||||
expect(turn.supported).toBe(true)
|
||||
expect(turn.read).toBe(0)
|
||||
expect(turn.created).toBe(0)
|
||||
expect(turn.total).toBe(500)
|
||||
// hitRate computed against a non-zero total is 0, not null — empty
|
||||
// cache on a cacheable provider is a legitimate "no-hit" signal.
|
||||
expect(turn.hitRate).toBe(0)
|
||||
})
|
||||
})
|
||||
|
||||
describe('resetCostState wrapper also clears cache tracker', () => {
|
||||
test('resetCostState() zeros both cost counters and cache stats', () => {
|
||||
// Populate both systems
|
||||
addToTotalSessionCost(
|
||||
0.01,
|
||||
anthropicUsage({ input: 100, cacheRead: 500 }),
|
||||
'claude-sonnet-4',
|
||||
)
|
||||
expect(getSessionCacheMetrics().read).toBe(500)
|
||||
|
||||
// resetCostState is the WRAPPED version — bootstrap's
|
||||
// resetCostState cleared cost state historically but not cache
|
||||
// stats. The wrapper in cost-tracker.ts adds the second call.
|
||||
resetCostState()
|
||||
|
||||
const session = getSessionCacheMetrics()
|
||||
expect(session.read).toBe(0)
|
||||
expect(session.supported).toBe(false)
|
||||
})
|
||||
})
|
||||
@@ -1,5 +1,14 @@
|
||||
import type { BetaUsage as Usage } from '@anthropic-ai/sdk/resources/beta/messages/messages.mjs'
|
||||
import chalk from 'chalk'
|
||||
import {
|
||||
extractCacheMetrics,
|
||||
resolveCacheProvider,
|
||||
} from './services/api/cacheMetrics.js'
|
||||
import {
|
||||
recordRequest as recordCacheRequest,
|
||||
resetSessionCacheStats,
|
||||
} from './services/api/cacheStatsTracker.js'
|
||||
import { getAPIProvider, isGithubNativeAnthropicMode } from './utils/model/providers.js'
|
||||
import {
|
||||
addToTotalCostState,
|
||||
addToTotalLinesChanged,
|
||||
@@ -22,7 +31,7 @@ import {
|
||||
getTotalWebSearchRequests,
|
||||
getUsageForModel,
|
||||
hasUnknownModelCost,
|
||||
resetCostState,
|
||||
resetCostState as baseResetCostState,
|
||||
resetStateForTests,
|
||||
setCostStateForRestore,
|
||||
setHasUnknownModelCost,
|
||||
@@ -62,12 +71,22 @@ export {
|
||||
formatCost,
|
||||
hasUnknownModelCost,
|
||||
resetStateForTests,
|
||||
resetCostState,
|
||||
setHasUnknownModelCost,
|
||||
getModelUsage,
|
||||
getUsageForModel,
|
||||
}
|
||||
|
||||
/**
|
||||
* Wraps bootstrap's resetCostState() so /clear, /compact and session
|
||||
* switches zero the cache-stats tracker alongside the cost counters.
|
||||
* Exported under the same name so existing callers pick up the cache
|
||||
* reset without any call-site changes.
|
||||
*/
|
||||
export function resetCostState(): void {
|
||||
baseResetCostState()
|
||||
resetSessionCacheStats()
|
||||
}
|
||||
|
||||
type StoredCostState = {
|
||||
totalCostUSD: number
|
||||
totalAPIDuration: number
|
||||
@@ -251,6 +270,16 @@ function round(number: number, precision: number): number {
|
||||
return Math.round(number * precision) / precision
|
||||
}
|
||||
|
||||
// Env-gated verbose token usage log. Treated as a boolean regardless of
|
||||
// value specifics — any truthy-ish string switches it on. `verbose` is the
|
||||
// documented keyword but we accept `1`/`true` for ergonomic parity with
|
||||
// other OPENCLAUDE_* flags.
|
||||
function shouldLogTokenUsageVerbose(): boolean {
|
||||
const v = (process.env.OPENCLAUDE_LOG_TOKEN_USAGE ?? '').trim().toLowerCase()
|
||||
if (!v) return false
|
||||
return v !== '0' && v !== 'false' && v !== 'off'
|
||||
}
|
||||
|
||||
function addToTotalModelUsage(
|
||||
cost: number,
|
||||
usage: Usage,
|
||||
@@ -287,6 +316,43 @@ export function addToTotalSessionCost(
|
||||
const modelUsage = addToTotalModelUsage(cost, usage, model)
|
||||
addToTotalCostState(cost, modelUsage, model)
|
||||
|
||||
// Record normalized cache metrics for REPL display + /cache-stats.
|
||||
// Resolved from the current process provider — at this point `usage` has
|
||||
// already been Anthropic-shaped by the shim layer, so we feed the
|
||||
// corresponding bucket (anthropic / copilot-claude / openai-like) to the
|
||||
// extractor. For providers that genuinely don't report cache data
|
||||
// (vanilla Copilot, Ollama), resolveCacheProvider steers us to
|
||||
// supported:false so the UI shows "N/A" instead of lying with "0%".
|
||||
const cacheProvider = resolveCacheProvider(getAPIProvider(), {
|
||||
githubNativeAnthropic: isGithubNativeAnthropicMode(model),
|
||||
openAiBaseUrl: process.env.OPENAI_BASE_URL ?? process.env.OPENAI_API_BASE,
|
||||
})
|
||||
const cacheMetrics = extractCacheMetrics(
|
||||
usage as unknown as Record<string, unknown>,
|
||||
cacheProvider,
|
||||
)
|
||||
recordCacheRequest(cacheMetrics, model)
|
||||
|
||||
// Opt-in structured per-request debug log on stderr. Power-user knob, not
|
||||
// shown in the REPL — complements CLAUDE_CODE_ENABLE_TOKEN_USAGE_ATTACHMENT
|
||||
// (which is model-facing). Any truthy value except "0"/"false" enables it.
|
||||
if (shouldLogTokenUsageVerbose()) {
|
||||
process.stderr.write(
|
||||
JSON.stringify({
|
||||
tag: 'openclaude.tokenUsage',
|
||||
model,
|
||||
provider: cacheProvider,
|
||||
input_tokens: usage.input_tokens,
|
||||
output_tokens: usage.output_tokens,
|
||||
cache_read_input_tokens: usage.cache_read_input_tokens ?? 0,
|
||||
cache_creation_input_tokens: usage.cache_creation_input_tokens ?? 0,
|
||||
cache_supported: cacheMetrics.supported,
|
||||
cache_hit_rate: cacheMetrics.hitRate,
|
||||
cost_usd: cost,
|
||||
}) + '\n',
|
||||
)
|
||||
}
|
||||
|
||||
const attrs =
|
||||
isFastModeEnabled() && usage.speed === 'fast'
|
||||
? { model, speed: 'fast' }
|
||||
|
||||
@@ -133,6 +133,8 @@ import { hasConsoleBillingAccess } from '../utils/billing.js';
|
||||
import { logEvent, type AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS } from 'src/services/analytics/index.js';
|
||||
import { getFeatureValue_CACHED_MAY_BE_STALE } from 'src/services/analytics/growthbook.js';
|
||||
import { textForResubmit, handleMessageFromStream, type StreamingToolUse, type StreamingThinking, isCompactBoundaryMessage, getMessagesAfterCompactBoundary, getContentText, createUserMessage, createAssistantMessage, createTurnDurationMessage, createAgentsKilledMessage, createApiMetricsMessage, createSystemMessage, createCommandInputMessage, formatCommandInputTags } from '../utils/messages.js';
|
||||
import { getCurrentTurnCacheMetrics, resetCurrentTurn } from '../services/api/cacheStatsTracker.js';
|
||||
import { formatCacheMetricsCompact, formatCacheMetricsFull } from '../services/api/cacheMetrics.js';
|
||||
import { generateSessionTitle } from '../utils/sessionTitle.js';
|
||||
import { BASH_INPUT_TAG, COMMAND_MESSAGE_TAG, COMMAND_NAME_TAG, LOCAL_COMMAND_STDOUT_TAG } from '../constants/xml.js';
|
||||
import { escapeXml } from '../utils/xml.js';
|
||||
@@ -2921,6 +2923,13 @@ export function REPL({
|
||||
// isLoading is derived from queryGuard — tryStart() above already
|
||||
// transitioned dispatching→running, so no setter call needed here.
|
||||
resetTimingRefs();
|
||||
// Start-of-turn cache tracker reset. The end-of-turn path at the
|
||||
// bottom of this function already resets, but mirror the call here
|
||||
// so a turn that never reaches end-of-turn (crash, unhandled
|
||||
// rejection, process exit) still starts clean on the next one.
|
||||
// Idempotent with respect to the end-of-turn reset — double-reset
|
||||
// is a no-op.
|
||||
resetCurrentTurn();
|
||||
setMessages(oldMessages => [...oldMessages, ...newMessages]);
|
||||
responseLengthRef.current = 0;
|
||||
if (feature('TOKEN_BUDGET')) {
|
||||
@@ -3019,6 +3028,38 @@ export function REPL({
|
||||
setMessages(prev => [...prev, createTurnDurationMessage(turnDurationMs, budgetInfo, count(prev, isLoggableMessage))]);
|
||||
}
|
||||
}
|
||||
// Cache stats line — controlled by `/config showCacheStats`. Shows
|
||||
// per-query read/hit stats using the provider-normalized metrics
|
||||
// from cacheStatsTracker. 'off' skips, 'compact' gives a one-liner,
|
||||
// 'full' gives a breakdown. Display is skipped when the user
|
||||
// aborted or proactive mode is active — but the counter reset
|
||||
// below still runs in those cases.
|
||||
if (!abortController.signal.aborted && !proactiveActive) {
|
||||
// Defensive default: config layer already merges 'compact' from
|
||||
// DEFAULT_GLOBAL_CONFIG (see config.ts:1494) for configs that
|
||||
// predate this feature, so `mode` should always be defined.
|
||||
// The `?? 'compact'` fallback covers pathological cases — a
|
||||
// corrupt config read that returned an empty object, or a
|
||||
// race between writer and reader — where the merge didn't
|
||||
// land. Rendering the line is the safer failure mode than
|
||||
// silently hiding it.
|
||||
const mode = getGlobalConfig().showCacheStats ?? 'compact';
|
||||
if (mode !== 'off') {
|
||||
const turnMetrics = getCurrentTurnCacheMetrics();
|
||||
// Skip rendering if the turn recorded no API activity at all —
|
||||
// avoids a spurious "[Cache: cold]" on local-only commands.
|
||||
if (turnMetrics.supported || turnMetrics.read > 0 || turnMetrics.total > 0) {
|
||||
const line = mode === 'full' ? formatCacheMetricsFull(turnMetrics) : formatCacheMetricsCompact(turnMetrics);
|
||||
setMessages(prev => [...prev, createSystemMessage(line, 'info')]);
|
||||
}
|
||||
}
|
||||
}
|
||||
// Reset turn counters UNCONDITIONALLY — users routinely interrupt
|
||||
// (Ctrl+C) mid-turn, and if we kept the reset gated on
|
||||
// !aborted, the in-flight turn's metrics would leak into the
|
||||
// next turn's aggregate. Proactive turns also need the reset so
|
||||
// their metrics don't pile onto the following user turn.
|
||||
resetCurrentTurn();
|
||||
// Clear the controller so CancelRequestHandler's canCancelRunningTask
|
||||
// reads false at the idle prompt. Without this, the stale non-aborted
|
||||
// controller makes ctrl+c fire onCancel() (aborting nothing) instead of
|
||||
|
||||
782
src/services/api/cacheMetrics.test.ts
Normal file
782
src/services/api/cacheMetrics.test.ts
Normal file
@@ -0,0 +1,782 @@
|
||||
import { expect, test, describe } from 'bun:test'
|
||||
import {
|
||||
extractCacheMetrics,
|
||||
extractCacheReadFromRawUsage,
|
||||
resolveCacheProvider,
|
||||
formatCacheMetricsCompact,
|
||||
formatCacheMetricsFull,
|
||||
addCacheMetrics,
|
||||
} from './cacheMetrics.js'
|
||||
|
||||
describe('extractCacheMetrics — Anthropic (firstParty/bedrock/vertex/foundry)', () => {
|
||||
test('reports read/created separately and computes hit rate over total input', () => {
|
||||
const usage = {
|
||||
input_tokens: 300,
|
||||
output_tokens: 100,
|
||||
cache_read_input_tokens: 800,
|
||||
cache_creation_input_tokens: 200,
|
||||
}
|
||||
const m = extractCacheMetrics(usage, 'anthropic')
|
||||
expect(m.supported).toBe(true)
|
||||
expect(m.read).toBe(800)
|
||||
expect(m.created).toBe(200)
|
||||
// total = fresh(300) + created(200) + read(800) = 1300
|
||||
expect(m.total).toBe(1300)
|
||||
expect(m.hitRate).toBeCloseTo(800 / 1300, 4)
|
||||
})
|
||||
|
||||
test('returns cold metrics when no cache activity yet', () => {
|
||||
const m = extractCacheMetrics({ input_tokens: 500 }, 'anthropic')
|
||||
expect(m.supported).toBe(true)
|
||||
expect(m.read).toBe(0)
|
||||
expect(m.created).toBe(0)
|
||||
expect(m.hitRate).toBe(0)
|
||||
})
|
||||
|
||||
test('null hit rate when usage has no input at all', () => {
|
||||
const m = extractCacheMetrics({}, 'anthropic')
|
||||
expect(m.supported).toBe(true)
|
||||
expect(m.hitRate).toBeNull()
|
||||
})
|
||||
})
|
||||
|
||||
// NOTE: OpenAI/Codex/Kimi/DeepSeek/Gemini raw shapes are now tested through
|
||||
// extractCacheReadFromRawUsage (below). extractCacheMetrics sees the
|
||||
// post-shim Anthropic shape for every provider, so the tests here verify
|
||||
// that the shape lookup works uniformly against the shimmed fields.
|
||||
|
||||
describe('extractCacheMetrics — post-shim Anthropic shape (applies to all providers)', () => {
|
||||
test('OpenAI post-shim (openai bucket) — reads Anthropic fields injected by convertChunkUsage', () => {
|
||||
// This is what cost-tracker actually sees for OpenAI upstreams: the
|
||||
// shim has already subtracted cached from prompt_tokens and moved it
|
||||
// to cache_read_input_tokens.
|
||||
const shimmed = {
|
||||
input_tokens: 800, // fresh = 2000 - 1200
|
||||
output_tokens: 300,
|
||||
cache_creation_input_tokens: 0,
|
||||
cache_read_input_tokens: 1_200,
|
||||
}
|
||||
const m = extractCacheMetrics(shimmed, 'openai')
|
||||
expect(m.supported).toBe(true)
|
||||
expect(m.read).toBe(1_200)
|
||||
expect(m.created).toBe(0)
|
||||
expect(m.total).toBe(2_000) // 800 fresh + 1200 read
|
||||
expect(m.hitRate).toBe(0.6)
|
||||
})
|
||||
|
||||
test('Codex post-shim — same Anthropic shape as OpenAI', () => {
|
||||
const shimmed = {
|
||||
input_tokens: 900, // 1500 - 600
|
||||
cache_creation_input_tokens: 0,
|
||||
cache_read_input_tokens: 600,
|
||||
}
|
||||
const m = extractCacheMetrics(shimmed, 'codex')
|
||||
expect(m.read).toBe(600)
|
||||
expect(m.total).toBe(1_500)
|
||||
expect(m.hitRate).toBe(0.4)
|
||||
})
|
||||
|
||||
test('Kimi post-shim — shim moved top-level cached_tokens into Anthropic field', () => {
|
||||
const shimmed = {
|
||||
input_tokens: 600, // 1000 - 400
|
||||
cache_creation_input_tokens: 0,
|
||||
cache_read_input_tokens: 400,
|
||||
}
|
||||
const m = extractCacheMetrics(shimmed, 'kimi')
|
||||
expect(m.read).toBe(400)
|
||||
expect(m.total).toBe(1_000)
|
||||
expect(m.hitRate).toBe(0.4)
|
||||
})
|
||||
|
||||
test('DeepSeek post-shim — hit moved to cache_read_input_tokens, miss to input_tokens', () => {
|
||||
const shimmed = {
|
||||
input_tokens: 300, // miss
|
||||
cache_creation_input_tokens: 0,
|
||||
cache_read_input_tokens: 700, // hit
|
||||
}
|
||||
const m = extractCacheMetrics(shimmed, 'deepseek')
|
||||
expect(m.read).toBe(700)
|
||||
expect(m.total).toBe(1_000)
|
||||
expect(m.hitRate).toBe(0.7)
|
||||
})
|
||||
|
||||
test('Gemini post-shim — cached_content_token_count moved to cache_read_input_tokens', () => {
|
||||
const shimmed = {
|
||||
input_tokens: 800, // 4000 - 3200
|
||||
cache_creation_input_tokens: 0,
|
||||
cache_read_input_tokens: 3_200,
|
||||
}
|
||||
const m = extractCacheMetrics(shimmed, 'gemini')
|
||||
expect(m.read).toBe(3_200)
|
||||
expect(m.total).toBe(4_000)
|
||||
expect(m.hitRate).toBe(0.8)
|
||||
})
|
||||
})
|
||||
|
||||
describe('extractCacheReadFromRawUsage — single source of truth for shim layer', () => {
|
||||
test('Anthropic-native passthrough: cache_read_input_tokens', () => {
|
||||
expect(
|
||||
extractCacheReadFromRawUsage({ cache_read_input_tokens: 1_500 }),
|
||||
).toBe(1_500)
|
||||
})
|
||||
|
||||
test('OpenAI Chat Completions: prompt_tokens_details.cached_tokens', () => {
|
||||
expect(
|
||||
extractCacheReadFromRawUsage({
|
||||
prompt_tokens: 2_000,
|
||||
prompt_tokens_details: { cached_tokens: 1_200 },
|
||||
}),
|
||||
).toBe(1_200)
|
||||
})
|
||||
|
||||
test('Codex Responses API: input_tokens_details.cached_tokens', () => {
|
||||
expect(
|
||||
extractCacheReadFromRawUsage({
|
||||
input_tokens: 1_500,
|
||||
input_tokens_details: { cached_tokens: 600 },
|
||||
}),
|
||||
).toBe(600)
|
||||
})
|
||||
|
||||
test('Kimi / Moonshot: top-level cached_tokens', () => {
|
||||
expect(
|
||||
extractCacheReadFromRawUsage({ prompt_tokens: 1_000, cached_tokens: 400 }),
|
||||
).toBe(400)
|
||||
})
|
||||
|
||||
test('DeepSeek: prompt_cache_hit_tokens', () => {
|
||||
expect(
|
||||
extractCacheReadFromRawUsage({
|
||||
prompt_cache_hit_tokens: 700,
|
||||
prompt_cache_miss_tokens: 300,
|
||||
}),
|
||||
).toBe(700)
|
||||
})
|
||||
|
||||
test('Gemini: cached_content_token_count', () => {
|
||||
expect(
|
||||
extractCacheReadFromRawUsage({
|
||||
prompt_token_count: 4_000,
|
||||
cached_content_token_count: 3_200,
|
||||
}),
|
||||
).toBe(3_200)
|
||||
})
|
||||
|
||||
test('no cache fields at all → 0 (Copilot/Ollama/unknown shape)', () => {
|
||||
expect(extractCacheReadFromRawUsage({ prompt_tokens: 500 })).toBe(0)
|
||||
})
|
||||
|
||||
test('Anthropic field wins over OpenAI field when both present', () => {
|
||||
// Shouldn't happen in practice, but if usage was double-annotated we
|
||||
// trust the Anthropic-native number (it's the more authoritative one).
|
||||
expect(
|
||||
extractCacheReadFromRawUsage({
|
||||
cache_read_input_tokens: 999,
|
||||
prompt_tokens_details: { cached_tokens: 111 },
|
||||
}),
|
||||
).toBe(999)
|
||||
})
|
||||
|
||||
test('null/undefined/non-object → 0', () => {
|
||||
expect(extractCacheReadFromRawUsage(null)).toBe(0)
|
||||
expect(extractCacheReadFromRawUsage(undefined)).toBe(0)
|
||||
expect(extractCacheReadFromRawUsage('nope' as unknown as never)).toBe(0)
|
||||
})
|
||||
})
|
||||
|
||||
describe('extractCacheMetrics — Copilot / Ollama (unsupported)', () => {
|
||||
test('returns supported:false with all zeros and null hitRate for Copilot', () => {
|
||||
const m = extractCacheMetrics({ prompt_tokens: 1000 }, 'copilot')
|
||||
expect(m.supported).toBe(false)
|
||||
expect(m.read).toBe(0)
|
||||
expect(m.created).toBe(0)
|
||||
expect(m.hitRate).toBeNull()
|
||||
})
|
||||
|
||||
test('returns supported:false for Ollama', () => {
|
||||
const m = extractCacheMetrics({ prompt_tokens: 42 }, 'ollama')
|
||||
expect(m.supported).toBe(false)
|
||||
expect(m.hitRate).toBeNull()
|
||||
})
|
||||
|
||||
test('Copilot serving Claude (copilot-claude) is supported and uses Anthropic fields', () => {
|
||||
const usage = {
|
||||
input_tokens: 200,
|
||||
cache_read_input_tokens: 800,
|
||||
cache_creation_input_tokens: 100,
|
||||
}
|
||||
const m = extractCacheMetrics(usage, 'copilot-claude')
|
||||
expect(m.supported).toBe(true)
|
||||
expect(m.read).toBe(800)
|
||||
expect(m.created).toBe(100)
|
||||
expect(m.total).toBe(1_100)
|
||||
})
|
||||
})
|
||||
|
||||
describe('extractCacheMetrics — bad/empty input', () => {
|
||||
test('null usage returns unsupported', () => {
|
||||
expect(extractCacheMetrics(null, 'anthropic').supported).toBe(false)
|
||||
})
|
||||
|
||||
test('non-object usage returns unsupported', () => {
|
||||
expect(extractCacheMetrics('oops' as unknown as never, 'openai').supported).toBe(
|
||||
false,
|
||||
)
|
||||
})
|
||||
})
|
||||
|
||||
describe('resolveCacheProvider', () => {
|
||||
test('firstParty → anthropic', () => {
|
||||
expect(resolveCacheProvider('firstParty')).toBe('anthropic')
|
||||
})
|
||||
test('bedrock/vertex/foundry → anthropic', () => {
|
||||
expect(resolveCacheProvider('bedrock')).toBe('anthropic')
|
||||
expect(resolveCacheProvider('vertex')).toBe('anthropic')
|
||||
expect(resolveCacheProvider('foundry')).toBe('anthropic')
|
||||
})
|
||||
test('github without claude hint → copilot (unsupported)', () => {
|
||||
expect(resolveCacheProvider('github')).toBe('copilot')
|
||||
})
|
||||
test('github with claude hint → copilot-claude', () => {
|
||||
expect(
|
||||
resolveCacheProvider('github', { githubNativeAnthropic: true }),
|
||||
).toBe('copilot-claude')
|
||||
})
|
||||
test('openai with localhost / loopback → self-hosted', () => {
|
||||
// These used to return 'ollama'; the bucket is now 'self-hosted'
|
||||
// because not every local OpenAI-compatible server is Ollama
|
||||
// (could be vLLM, LM Studio, LocalAI, text-generation-webui).
|
||||
// Both buckets collapse to supported=false downstream.
|
||||
expect(
|
||||
resolveCacheProvider('openai', { openAiBaseUrl: 'http://localhost:8080/v1' }),
|
||||
).toBe('self-hosted')
|
||||
expect(
|
||||
resolveCacheProvider('openai', { openAiBaseUrl: 'http://127.0.0.1:1234/v1' }),
|
||||
).toBe('self-hosted')
|
||||
// Localhost:11434 hits the self-hosted branch first — 'ollama' only
|
||||
// kicks in when the :11434 port appears on a public-looking URL
|
||||
// (which would be unusual but still deserves honest classification).
|
||||
expect(
|
||||
resolveCacheProvider('openai', { openAiBaseUrl: 'http://localhost:11434/v1' }),
|
||||
).toBe('self-hosted')
|
||||
expect(
|
||||
resolveCacheProvider('openai', { openAiBaseUrl: 'http://[::1]:5000/v1' }),
|
||||
).toBe('self-hosted')
|
||||
})
|
||||
|
||||
test('openai on RFC1918 private IP → self-hosted (pre-fix: misclassified as openai)', () => {
|
||||
// These are the exact cases the reviewer flagged. Before this fix,
|
||||
// a vLLM / LocalAI server on a LAN address fell through to the
|
||||
// 'openai' branch and /cache-stats showed '[Cache: cold]' — which
|
||||
// users read as "my cache is broken" when the provider simply
|
||||
// didn't report cache fields. Now they land in 'self-hosted' and
|
||||
// /cache-stats shows '[Cache: N/A]'.
|
||||
expect(
|
||||
resolveCacheProvider('openai', { openAiBaseUrl: 'http://192.168.1.50:8000/v1' }),
|
||||
).toBe('self-hosted')
|
||||
expect(
|
||||
resolveCacheProvider('openai', { openAiBaseUrl: 'http://10.0.0.7:8080/v1' }),
|
||||
).toBe('self-hosted')
|
||||
expect(
|
||||
resolveCacheProvider('openai', { openAiBaseUrl: 'http://172.20.0.3:5000/v1' }),
|
||||
).toBe('self-hosted')
|
||||
})
|
||||
|
||||
test('openai on link-local / CGNAT → self-hosted', () => {
|
||||
expect(
|
||||
resolveCacheProvider('openai', { openAiBaseUrl: 'http://169.254.169.254/v1' }),
|
||||
).toBe('self-hosted')
|
||||
expect(
|
||||
resolveCacheProvider('openai', { openAiBaseUrl: 'http://100.64.1.5:8000/v1' }),
|
||||
).toBe('self-hosted')
|
||||
})
|
||||
|
||||
test('openai on reserved TLD (.local / .internal / .lan / .home.arpa) → self-hosted', () => {
|
||||
// Per RFC 6761 (.local/mDNS), RFC 8375 (.home.arpa), and widely
|
||||
// used .internal / .lan conventions. These never resolve publicly.
|
||||
expect(
|
||||
resolveCacheProvider('openai', { openAiBaseUrl: 'http://llm.internal:5000/v1' }),
|
||||
).toBe('self-hosted')
|
||||
expect(
|
||||
resolveCacheProvider('openai', { openAiBaseUrl: 'http://llm.local:8080/v1' }),
|
||||
).toBe('self-hosted')
|
||||
expect(
|
||||
resolveCacheProvider('openai', { openAiBaseUrl: 'http://vllm.home.arpa/v1' }),
|
||||
).toBe('self-hosted')
|
||||
expect(
|
||||
resolveCacheProvider('openai', { openAiBaseUrl: 'http://box.lan:1234/v1' }),
|
||||
).toBe('self-hosted')
|
||||
})
|
||||
|
||||
test('openai on IPv6 local / link-local → self-hosted', () => {
|
||||
expect(
|
||||
resolveCacheProvider('openai', { openAiBaseUrl: 'http://[fe80::1]:8000/v1' }),
|
||||
).toBe('self-hosted')
|
||||
expect(
|
||||
resolveCacheProvider('openai', { openAiBaseUrl: 'http://[fd12:3456::7]:8080/v1' }),
|
||||
).toBe('self-hosted')
|
||||
expect(
|
||||
resolveCacheProvider('openai', { openAiBaseUrl: 'http://[fc00::1]:8080/v1' }),
|
||||
).toBe('self-hosted')
|
||||
})
|
||||
|
||||
test('IPv6 ULA prefix (fc/fd) does NOT over-match public hostnames', () => {
|
||||
// Regression guard: an early version of isLocalOrPrivateUrl checked
|
||||
// `h.startsWith('fc')` / `startsWith('fd')` without a colon guard,
|
||||
// which misclassified legitimate public hosts whose names happen to
|
||||
// begin with those letters. The fix requires a colon in the match
|
||||
// so only real IPv6 literals hit the branch.
|
||||
expect(
|
||||
resolveCacheProvider('openai', {
|
||||
openAiBaseUrl: 'https://fc-api.example.com/v1',
|
||||
}),
|
||||
).toBe('openai')
|
||||
expect(
|
||||
resolveCacheProvider('openai', {
|
||||
openAiBaseUrl: 'https://fd-hosted.example.com/v1',
|
||||
}),
|
||||
).toBe('openai')
|
||||
// Same goes for names that look like hex prefixes but aren't IPv6.
|
||||
expect(
|
||||
resolveCacheProvider('openai', {
|
||||
openAiBaseUrl: 'https://fcbench.net/v1',
|
||||
}),
|
||||
).toBe('openai')
|
||||
})
|
||||
|
||||
test('openai with :11434 on a public host → ollama (default-port heuristic)', () => {
|
||||
// Contrived but the heuristic should still fire — someone running
|
||||
// Ollama behind a reverse proxy with port preserved.
|
||||
expect(
|
||||
resolveCacheProvider('openai', {
|
||||
openAiBaseUrl: 'https://ollama.example.com:11434/v1',
|
||||
}),
|
||||
).toBe('ollama')
|
||||
})
|
||||
|
||||
test('openai with moonshot URL → kimi', () => {
|
||||
expect(
|
||||
resolveCacheProvider('openai', { openAiBaseUrl: 'https://api.moonshot.ai/v1' }),
|
||||
).toBe('kimi')
|
||||
})
|
||||
test('openai with deepseek URL → deepseek', () => {
|
||||
expect(
|
||||
resolveCacheProvider('openai', { openAiBaseUrl: 'https://api.deepseek.com/v1' }),
|
||||
).toBe('deepseek')
|
||||
})
|
||||
test('private IP beats hosted-keyword matching (self-hosted takes priority)', () => {
|
||||
// A pathological URL: a private-IP host whose path string contains
|
||||
// "deepseek". Self-hosted detection must run FIRST so the URL
|
||||
// classifies honestly — the path alone doesn't prove the upstream
|
||||
// is the real DeepSeek API.
|
||||
expect(
|
||||
resolveCacheProvider('openai', {
|
||||
openAiBaseUrl: 'http://10.0.0.5:8000/v1/deepseek-proxy',
|
||||
}),
|
||||
).toBe('self-hosted')
|
||||
})
|
||||
test('plain openai remains openai', () => {
|
||||
expect(
|
||||
resolveCacheProvider('openai', { openAiBaseUrl: 'https://api.openai.com/v1' }),
|
||||
).toBe('openai')
|
||||
})
|
||||
test('unparseable base URL falls back to substring heuristic', () => {
|
||||
// Bare host:port without a scheme is common in misconfigured env.
|
||||
// We can't URL-parse it, but we still honor the "localhost" hint so
|
||||
// a broken config doesn't silently masquerade as cache-capable.
|
||||
expect(
|
||||
resolveCacheProvider('openai', { openAiBaseUrl: 'localhost:8000' }),
|
||||
).toBe('self-hosted')
|
||||
// An unparseable and opaque string falls through to plain 'openai'
|
||||
// (best-effort — nothing we can infer from "foo-bar-baz").
|
||||
expect(
|
||||
resolveCacheProvider('openai', { openAiBaseUrl: '???' }),
|
||||
).toBe('openai')
|
||||
})
|
||||
test('empty base URL → plain openai', () => {
|
||||
// No hint at all: assume the canonical api.openai.com.
|
||||
expect(resolveCacheProvider('openai')).toBe('openai')
|
||||
expect(
|
||||
resolveCacheProvider('openai', { openAiBaseUrl: '' }),
|
||||
).toBe('openai')
|
||||
})
|
||||
test('codex → codex', () => {
|
||||
expect(resolveCacheProvider('codex')).toBe('codex')
|
||||
})
|
||||
test('gemini → gemini', () => {
|
||||
expect(resolveCacheProvider('gemini')).toBe('gemini')
|
||||
})
|
||||
})
|
||||
|
||||
describe('resolveCacheProvider — .localhost TLD (RFC 6761)', () => {
|
||||
test('subdomains of .localhost classify as self-hosted', () => {
|
||||
// Chrome, Firefox, and systemd-resolved all natively resolve
|
||||
// *.localhost to 127.0.0.1. Kubernetes Ingress and docker-compose
|
||||
// setups commonly use app.localhost, api.localhost, etc.
|
||||
expect(
|
||||
resolveCacheProvider('openai', {
|
||||
openAiBaseUrl: 'http://app.localhost:3000/v1',
|
||||
}),
|
||||
).toBe('self-hosted')
|
||||
expect(
|
||||
resolveCacheProvider('openai', {
|
||||
openAiBaseUrl: 'http://api.localhost/v1',
|
||||
}),
|
||||
).toBe('self-hosted')
|
||||
expect(
|
||||
resolveCacheProvider('openai', {
|
||||
openAiBaseUrl: 'http://llm.dev.localhost:8080/v1',
|
||||
}),
|
||||
).toBe('self-hosted')
|
||||
})
|
||||
|
||||
test('.localhost TLD does NOT match substring collisions', () => {
|
||||
// Guard against regressions where `localhost` would match via
|
||||
// substring rather than TLD semantics. `localhostify.com` and
|
||||
// `mylocalhost.net` must stay on the public `openai` path.
|
||||
expect(
|
||||
resolveCacheProvider('openai', {
|
||||
openAiBaseUrl: 'https://localhostify.com/v1',
|
||||
}),
|
||||
).toBe('openai')
|
||||
expect(
|
||||
resolveCacheProvider('openai', {
|
||||
openAiBaseUrl: 'https://mylocalhost.net/v1',
|
||||
}),
|
||||
).toBe('openai')
|
||||
})
|
||||
})
|
||||
|
||||
describe('extractCacheMetrics — hit rate clamp', () => {
|
||||
test('hitRate is clamped to 1.0 on pathological input (read > total)', () => {
|
||||
// Defensive guard: with valid non-negative inputs the math enforces
|
||||
// read <= total, so hitRate cannot exceed 1. But an upstream shim
|
||||
// bug (e.g. reading a negative `fresh` from a future provider) could
|
||||
// break the invariant. `Math.min(1, read/total)` caps the display at
|
||||
// 100% rather than letting a `read=800 total=500` case render as
|
||||
// "hit 160%" or (worse) null, which would hide the anomaly.
|
||||
const metrics = extractCacheMetrics(
|
||||
{
|
||||
cache_read_input_tokens: 800,
|
||||
cache_creation_input_tokens: 0,
|
||||
// asNumber keeps finite negatives, so fresh = -500 → total =
|
||||
// 800 + 0 + (-500) = 300, read=800 → raw ratio 2.67, clamp to 1.
|
||||
input_tokens: -500,
|
||||
} as unknown as Record<string, unknown>,
|
||||
'anthropic',
|
||||
)
|
||||
expect(metrics.supported).toBe(true)
|
||||
expect(metrics.hitRate).toBe(1)
|
||||
})
|
||||
|
||||
test('normal inputs still yield accurate fractional hit rates', () => {
|
||||
// Regression: clamp must not perturb the happy path.
|
||||
const metrics = extractCacheMetrics(
|
||||
{
|
||||
cache_read_input_tokens: 300,
|
||||
cache_creation_input_tokens: 0,
|
||||
input_tokens: 700,
|
||||
},
|
||||
'anthropic',
|
||||
)
|
||||
expect(metrics.hitRate).toBeCloseTo(0.3, 5)
|
||||
})
|
||||
})
|
||||
|
||||
describe('extractCacheMetrics — self-hosted bucket (data-driven)', () => {
|
||||
test('vanilla self-hosted endpoint without cache fields → unsupported / N/A', () => {
|
||||
// vLLM, LocalAI, text-generation-webui, etc. emit no cache fields
|
||||
// at all. With read=created=0 we mark unsupported so the REPL shows
|
||||
// honest '[Cache: N/A]' instead of a fabricated 0%.
|
||||
const metrics = extractCacheMetrics(
|
||||
{ input_tokens: 1_000, output_tokens: 200 },
|
||||
'self-hosted',
|
||||
)
|
||||
expect(metrics.supported).toBe(false)
|
||||
expect(metrics.hitRate).toBeNull()
|
||||
expect(metrics.read).toBe(0)
|
||||
expect(metrics.created).toBe(0)
|
||||
})
|
||||
|
||||
test('internal reverse proxy forwarding real cache data → supported', () => {
|
||||
// Review-blocker regression guard: an enterprise setup with an
|
||||
// internal proxy on a private URL (e.g. `http://llm.internal:5000/v1`)
|
||||
// forwarding to OpenAI / Kimi / DeepSeek / Gemini WILL deliver real
|
||||
// cache fields via the shim. Pre-fix we would discard them because
|
||||
// the URL heuristic classified the endpoint as 'self-hosted'. Now
|
||||
// the data itself decides: any non-zero cache activity flows through
|
||||
// the same normalization as an OpenAI bucket.
|
||||
const shimmed = {
|
||||
input_tokens: 800, // fresh (post-shim, cached already subtracted)
|
||||
cache_read_input_tokens: 1_200, // shim extracted from upstream
|
||||
cache_creation_input_tokens: 0,
|
||||
}
|
||||
const metrics = extractCacheMetrics(shimmed, 'self-hosted')
|
||||
expect(metrics.supported).toBe(true)
|
||||
expect(metrics.read).toBe(1_200)
|
||||
expect(metrics.total).toBe(2_000)
|
||||
expect(metrics.hitRate).toBe(0.6)
|
||||
})
|
||||
|
||||
test('proxy with cache_creation but zero cache_read → still supported', () => {
|
||||
// Mirror of the above for the first-call / cold-cache scenario:
|
||||
// Anthropic-compatible upstreams emit creation tokens on the first
|
||||
// request that primes the cache. Self-hosted proxy must preserve
|
||||
// that signal, not swallow it because read is still 0.
|
||||
const shimmed = {
|
||||
input_tokens: 500,
|
||||
cache_read_input_tokens: 0,
|
||||
cache_creation_input_tokens: 800,
|
||||
}
|
||||
const metrics = extractCacheMetrics(shimmed, 'self-hosted')
|
||||
expect(metrics.supported).toBe(true)
|
||||
expect(metrics.created).toBe(800)
|
||||
expect(metrics.read).toBe(0)
|
||||
})
|
||||
})
|
||||
|
||||
describe('formatCacheMetrics — defensive null/undefined guards', () => {
|
||||
test('formatCacheMetricsCompact returns N/A for undefined input', () => {
|
||||
// Signature says `CacheMetrics` but runtime bug on a failed API
|
||||
// response could leave the caller with nothing. The formatter
|
||||
// should degrade gracefully rather than throw on `.supported`.
|
||||
expect(formatCacheMetricsCompact(undefined)).toBe('[Cache: N/A]')
|
||||
expect(formatCacheMetricsCompact(null as unknown as undefined)).toBe(
|
||||
'[Cache: N/A]',
|
||||
)
|
||||
})
|
||||
|
||||
test('formatCacheMetricsFull returns N/A for undefined input', () => {
|
||||
expect(formatCacheMetricsFull(undefined)).toBe('[Cache: N/A]')
|
||||
expect(formatCacheMetricsFull(null as unknown as undefined)).toBe(
|
||||
'[Cache: N/A]',
|
||||
)
|
||||
})
|
||||
})
|
||||
|
||||
describe('formatCacheMetricsCompact — self-hosted display paths', () => {
|
||||
test('vanilla self-hosted (no cache data) renders as N/A', () => {
|
||||
const metrics = extractCacheMetrics(
|
||||
{ input_tokens: 500 },
|
||||
'self-hosted',
|
||||
)
|
||||
expect(formatCacheMetricsCompact(metrics)).toBe('[Cache: N/A]')
|
||||
expect(formatCacheMetricsFull(metrics)).toBe('[Cache: N/A]')
|
||||
})
|
||||
|
||||
test('self-hosted proxy with forwarded cache data renders real metrics', () => {
|
||||
// Full display-path regression guard for the review-blocker fix:
|
||||
// the user must see the real hit rate that the upstream emitted,
|
||||
// not a silent N/A because the URL looked private.
|
||||
const metrics = extractCacheMetrics(
|
||||
{
|
||||
input_tokens: 800,
|
||||
cache_read_input_tokens: 1_200,
|
||||
cache_creation_input_tokens: 0,
|
||||
},
|
||||
'self-hosted',
|
||||
)
|
||||
expect(formatCacheMetricsCompact(metrics)).toBe('[Cache: 1.2k read • hit 60%]')
|
||||
expect(formatCacheMetricsFull(metrics)).toBe(
|
||||
'[Cache: read=1.2k created=0 hit=60%]',
|
||||
)
|
||||
})
|
||||
})
|
||||
|
||||
describe('formatCacheMetricsCompact — snapshot-stable output', () => {
|
||||
test('supported with reads shows "k" abbreviation and hit rate', () => {
|
||||
const out = formatCacheMetricsCompact({
|
||||
read: 1_234,
|
||||
created: 0,
|
||||
total: 10_000,
|
||||
hitRate: 0.1234,
|
||||
supported: true,
|
||||
})
|
||||
expect(out).toBe('[Cache: 1.2k read • hit 12%]')
|
||||
})
|
||||
|
||||
test('supported with no cache activity renders "cold"', () => {
|
||||
const out = formatCacheMetricsCompact({
|
||||
read: 0,
|
||||
created: 0,
|
||||
total: 500,
|
||||
hitRate: 0,
|
||||
supported: true,
|
||||
})
|
||||
expect(out).toBe('[Cache: cold]')
|
||||
})
|
||||
|
||||
test('unsupported renders "N/A"', () => {
|
||||
const out = formatCacheMetricsCompact({
|
||||
read: 0,
|
||||
created: 0,
|
||||
total: 0,
|
||||
hitRate: null,
|
||||
supported: false,
|
||||
})
|
||||
expect(out).toBe('[Cache: N/A]')
|
||||
})
|
||||
|
||||
test('small numbers render without abbreviation', () => {
|
||||
const out = formatCacheMetricsCompact({
|
||||
read: 42,
|
||||
created: 0,
|
||||
total: 100,
|
||||
hitRate: 0.42,
|
||||
supported: true,
|
||||
})
|
||||
expect(out).toBe('[Cache: 42 read • hit 42%]')
|
||||
})
|
||||
})
|
||||
|
||||
describe('formatCacheMetricsFull — snapshot-stable output', () => {
|
||||
test('supported shows all fields', () => {
|
||||
const out = formatCacheMetricsFull({
|
||||
read: 1_234,
|
||||
created: 250,
|
||||
total: 10_000,
|
||||
hitRate: 0.1234,
|
||||
supported: true,
|
||||
})
|
||||
expect(out).toBe('[Cache: read=1.2k created=250 hit=12%]')
|
||||
})
|
||||
|
||||
test('null hit rate renders n/a', () => {
|
||||
const out = formatCacheMetricsFull({
|
||||
read: 0,
|
||||
created: 0,
|
||||
total: 0,
|
||||
hitRate: null,
|
||||
supported: true,
|
||||
})
|
||||
expect(out).toBe('[Cache: read=0 created=0 hit=n/a]')
|
||||
})
|
||||
|
||||
test('unsupported renders "N/A"', () => {
|
||||
const out = formatCacheMetricsFull({
|
||||
read: 0,
|
||||
created: 0,
|
||||
total: 0,
|
||||
hitRate: null,
|
||||
supported: false,
|
||||
})
|
||||
expect(out).toBe('[Cache: N/A]')
|
||||
})
|
||||
})
|
||||
|
||||
describe('hit-rate edge cases (plan-mandated coverage)', () => {
|
||||
test('0 read / 0 created on supported provider → hitRate = 0 (not null) when total > 0', () => {
|
||||
const m = extractCacheMetrics({ input_tokens: 500 }, 'anthropic')
|
||||
expect(m.read).toBe(0)
|
||||
expect(m.created).toBe(0)
|
||||
expect(m.hitRate).toBe(0)
|
||||
})
|
||||
|
||||
test('read only (no created) computes proportion correctly', () => {
|
||||
const m = extractCacheMetrics(
|
||||
{ input_tokens: 0, cache_read_input_tokens: 800, cache_creation_input_tokens: 0 },
|
||||
'anthropic',
|
||||
)
|
||||
expect(m.read).toBe(800)
|
||||
expect(m.created).toBe(0)
|
||||
expect(m.total).toBe(800)
|
||||
expect(m.hitRate).toBe(1)
|
||||
})
|
||||
|
||||
test('created only (first turn — no reads yet) gives 0 hit rate', () => {
|
||||
const m = extractCacheMetrics(
|
||||
{
|
||||
input_tokens: 200,
|
||||
cache_read_input_tokens: 0,
|
||||
cache_creation_input_tokens: 1_000,
|
||||
},
|
||||
'anthropic',
|
||||
)
|
||||
expect(m.read).toBe(0)
|
||||
expect(m.created).toBe(1_000)
|
||||
expect(m.total).toBe(1_200)
|
||||
expect(m.hitRate).toBe(0)
|
||||
})
|
||||
|
||||
test('mixed read + created + fresh input — full denominator', () => {
|
||||
const m = extractCacheMetrics(
|
||||
{
|
||||
input_tokens: 500,
|
||||
cache_read_input_tokens: 3_000,
|
||||
cache_creation_input_tokens: 1_500,
|
||||
},
|
||||
'anthropic',
|
||||
)
|
||||
// Denominator = fresh(500) + created(1500) + read(3000) = 5_000
|
||||
// Hit = read/total = 3000 / 5000 = 0.6
|
||||
expect(m.total).toBe(5_000)
|
||||
expect(m.hitRate).toBe(0.6)
|
||||
})
|
||||
|
||||
test('N/A (unsupported provider) preserves null hit-rate even with populated usage', () => {
|
||||
// Simulate a Copilot usage payload that might look like OpenAI shape —
|
||||
// we must NOT try to read it and must report supported:false.
|
||||
const m = extractCacheMetrics(
|
||||
{ prompt_tokens: 5_000, prompt_tokens_details: { cached_tokens: 2_000 } },
|
||||
'copilot',
|
||||
)
|
||||
expect(m.supported).toBe(false)
|
||||
expect(m.read).toBe(0)
|
||||
expect(m.hitRate).toBeNull()
|
||||
})
|
||||
})
|
||||
|
||||
describe('addCacheMetrics — session aggregation', () => {
|
||||
test('sums read/created/total and recomputes hit rate', () => {
|
||||
const a = {
|
||||
read: 100,
|
||||
created: 50,
|
||||
total: 300,
|
||||
hitRate: 100 / 300,
|
||||
supported: true,
|
||||
}
|
||||
const b = {
|
||||
read: 200,
|
||||
created: 0,
|
||||
total: 400,
|
||||
hitRate: 0.5,
|
||||
supported: true,
|
||||
}
|
||||
const sum = addCacheMetrics(a, b)
|
||||
expect(sum.read).toBe(300)
|
||||
expect(sum.created).toBe(50)
|
||||
expect(sum.total).toBe(700)
|
||||
expect(sum.hitRate).toBeCloseTo(300 / 700, 5)
|
||||
})
|
||||
|
||||
test('unsupported + supported = supported (so we never lose honest data)', () => {
|
||||
const unsupported = {
|
||||
read: 0,
|
||||
created: 0,
|
||||
total: 0,
|
||||
hitRate: null,
|
||||
supported: false,
|
||||
}
|
||||
const supported = {
|
||||
read: 10,
|
||||
created: 0,
|
||||
total: 100,
|
||||
hitRate: 0.1,
|
||||
supported: true,
|
||||
}
|
||||
expect(addCacheMetrics(unsupported, supported)).toBe(supported)
|
||||
expect(addCacheMetrics(supported, unsupported)).toBe(supported)
|
||||
})
|
||||
|
||||
test('unsupported + unsupported = unsupported', () => {
|
||||
const u = {
|
||||
read: 0,
|
||||
created: 0,
|
||||
total: 0,
|
||||
hitRate: null,
|
||||
supported: false,
|
||||
}
|
||||
const sum = addCacheMetrics(u, u)
|
||||
expect(sum.supported).toBe(false)
|
||||
})
|
||||
})
|
||||
538
src/services/api/cacheMetrics.ts
Normal file
538
src/services/api/cacheMetrics.ts
Normal file
@@ -0,0 +1,538 @@
|
||||
/**
|
||||
* Cross-provider cache usage normalizer for Phase 1 observability.
|
||||
*
|
||||
* Two layers of extraction, because the shim layer (openaiShim/codexShim)
|
||||
* already converts raw provider usage to Anthropic-shape on the way in:
|
||||
*
|
||||
* 1. `extractCacheReadFromRawUsage` — consumes RAW provider usage, used
|
||||
* from inside the shims where each provider's native field names are
|
||||
* still visible. Single source of truth for "where is the cached-
|
||||
* tokens count on provider X".
|
||||
* 2. `extractCacheMetrics` — consumes POST-shim Anthropic-shape usage,
|
||||
* which is what every downstream caller (cost-tracker, REPL display,
|
||||
* /cache-stats) actually sees. Uses the `provider` argument only to
|
||||
* decide whether the metric is `supported` (Copilot vanilla, Ollama
|
||||
* get N/A rather than a fabricated 0%).
|
||||
*
|
||||
* Design rationale:
|
||||
* - Pure functions, no globals: callers pass the provider explicitly so
|
||||
* that tests, background agents and teammates get consistent results
|
||||
* even when the process-level provider flag differs.
|
||||
* - Honest N/A: Copilot (non-Claude) and Ollama do not expose cache data
|
||||
* at all. Returning 0 would lie and corrupt aggregate hit-rate, so we
|
||||
* return `supported: false` and let the display decide how to render.
|
||||
* - `hitRate` is null whenever there is no input to compare against
|
||||
* (0 read + 0 created). A 0% hit rate would suggest "cold" when in
|
||||
* reality the turn had no cacheable content to begin with.
|
||||
* - After normalization, `read + created ≤ total`, with any remainder
|
||||
* being fresh (non-cacheable) input tokens. The shim enforces this
|
||||
* invariant by subtracting cached from raw prompt_tokens so that
|
||||
* post-shim `input_tokens` is always "fresh only" per Anthropic
|
||||
* convention.
|
||||
*
|
||||
* Raw provider shapes (as of 2026-04):
|
||||
* - Anthropic: usage.cache_read_input_tokens,
|
||||
* usage.cache_creation_input_tokens,
|
||||
* usage.input_tokens (fresh only)
|
||||
* - OpenAI / Codex: usage.input_tokens_details?.cached_tokens
|
||||
* usage.prompt_tokens_details?.cached_tokens,
|
||||
* usage.prompt_tokens (includes cached)
|
||||
* - Kimi / Moonshot: usage.cached_tokens (top level), usage.prompt_tokens
|
||||
* - DeepSeek: usage.prompt_cache_hit_tokens,
|
||||
* usage.prompt_cache_miss_tokens
|
||||
* - Gemini: usage.cached_content_token_count,
|
||||
* usage.prompt_token_count
|
||||
* - Copilot (non-Claude) / Ollama: not reported → supported=false
|
||||
*/
|
||||
import type { APIProvider } from '../../utils/model/providers.js'
|
||||
|
||||
/** Providers for which we know how to read cache fields. */
export type CacheAwareProvider =
  | 'anthropic'
  | 'openai'
  | 'codex'
  | 'kimi'
  | 'deepseek'
  | 'gemini'
  | 'ollama'
  // Generic local / self-hosted OpenAI-compatible endpoints (vLLM,
  // LM Studio, LocalAI, text-generation-webui, custom internal servers
  // on RFC1918 addresses, reserved TLDs like .local / .internal, etc.).
  // Distinct from `ollama` because Ollama might someday add cache
  // reporting; keeping the buckets separate means that change stays
  // local to one branch.
  | 'self-hosted'
  | 'copilot'
  // Copilot serving a Claude model natively: usage arrives in the
  // Anthropic shape, so cache fields ARE readable despite the Copilot
  // transport (supported=true, unlike plain 'copilot').
  | 'copilot-claude'
|
||||
|
||||
/** Unified cache metrics for one API response. */
export type CacheMetrics = {
  /** Tokens served from cache on this request. */
  read: number
  /**
   * Tokens written INTO the cache on this request. Only non-zero for
   * providers with explicit caching (Anthropic family).
   */
  created: number
  /**
   * Total input tokens the request is measured against, computed uniformly
   * as `fresh + read + created` after the shim normalizes every provider
   * to the Anthropic convention. Used as the denominator for hit-rate.
   */
  total: number
  /**
   * `read / total`, or null when the denominator is zero or the provider
   * doesn't support cache reporting. Clamped to at most 1 so pathological
   * upstream usage (e.g. a negative fresh-token count) cannot render a
   * hit rate above 100%.
   */
  hitRate: number | null
  /**
   * False for providers that do not expose cache data at all. Callers
   * should render "N/A" instead of "0%" in that case.
   */
  supported: boolean
}
|
||||
|
||||
/**
 * Shared sentinel returned for unsupported providers. A single module-level
 * instance avoids per-call allocation; because the same reference is handed
 * to every caller, it must never be mutated.
 */
const UNSUPPORTED: CacheMetrics = {
  read: 0,
  created: 0,
  total: 0,
  hitRate: null,
  supported: false,
}
|
||||
|
||||
/**
 * Raw usage shape — intentionally permissive, since each provider reads a
 * different subset of fields. `null`/`undefined` are part of the type so a
 * missing usage object can be passed straight through without pre-checks.
 */
export type RawUsage = Record<string, unknown> | null | undefined
|
||||
|
||||
function asNumber(value: unknown): number {
|
||||
return typeof value === 'number' && Number.isFinite(value) ? value : 0
|
||||
}
|
||||
|
||||
function pickPath(usage: RawUsage, path: string[]): unknown {
|
||||
let cur: unknown = usage
|
||||
for (const key of path) {
|
||||
if (cur == null || typeof cur !== 'object') return undefined
|
||||
cur = (cur as Record<string, unknown>)[key]
|
||||
}
|
||||
return cur
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true when the URL points at a private, loopback, link-local,
|
||||
* CGNAT, or reserved-TLD host — anywhere a self-hosted OpenAI-compatible
|
||||
* server is likely running (vLLM, LM Studio, LocalAI, Ollama on a
|
||||
* non-default port, text-generation-webui, corporate internal proxies).
|
||||
*
|
||||
* WHY a dedicated helper (vs the old substring match):
|
||||
* The previous check only looked for `localhost` / `127.0.0.1` /
|
||||
* `:11434` / `:1234` as substrings. That misclassified real setups:
|
||||
* a vLLM server at `http://192.168.1.50:8000/v1` or an internal
|
||||
* endpoint at `http://llm.internal:5000/v1` fell through the `openai`
|
||||
* branch, got marked as cache-capable, and `/cache-stats` reported
|
||||
* `[Cache: cold]` — making users think their cache was broken when
|
||||
* in reality the provider simply doesn't report cache fields.
|
||||
*
|
||||
* Intentionally narrower than WebSearchTool's `isPrivateHostname`
|
||||
* (which defends against SSRF bypass vectors like IPv4-mapped IPv6
|
||||
* and octal-encoded IPs). We only need to classify a reporting bucket,
|
||||
* not enforce a security boundary — a false negative here at worst
|
||||
* shows `[Cache: cold]` instead of `[Cache: N/A]`.
|
||||
*
|
||||
* See cacheMetrics.test.ts for the cases this function is contracted to
|
||||
* return true/false for.
|
||||
*/
|
||||
function isLocalOrPrivateUrl(url: string): boolean {
|
||||
if (!url) return false
|
||||
let hostname = ''
|
||||
try {
|
||||
hostname = new URL(url).hostname.toLowerCase()
|
||||
} catch {
|
||||
// Fall through to the substring fallback below.
|
||||
}
|
||||
// WHATWG URL accepts `localhost:8000` (treats `localhost:` as scheme,
|
||||
// leaving hostname empty). Treat empty-hostname parses the same as a
|
||||
// parse failure so we still catch the obvious cases with substring.
|
||||
if (!hostname) {
|
||||
const lower = url.toLowerCase()
|
||||
return (
|
||||
lower.includes('localhost') ||
|
||||
lower.includes('127.0.0.1') ||
|
||||
lower.includes('::1')
|
||||
)
|
||||
}
|
||||
// Unwrap IPv6 literal brackets that URL.hostname leaves attached.
|
||||
const h = hostname.startsWith('[') && hostname.endsWith(']')
|
||||
? hostname.slice(1, -1)
|
||||
: hostname
|
||||
// Reserved TLDs and `localhost` itself — all guaranteed never to
|
||||
// resolve to public infrastructure. Sources:
|
||||
// - RFC 6761 §6.3 — `.localhost` (Chrome/Firefox/systemd-resolved
|
||||
// resolve `*.localhost` to 127.0.0.1 natively)
|
||||
// - RFC 6762 — `.local` mDNS (Bonjour)
|
||||
// - RFC 8375 — `.home.arpa` (residential home networks)
|
||||
// - de facto — `.lan`, `.internal`, `.intranet` (widely used
|
||||
// in corporate DNS despite not being formally
|
||||
// reserved)
|
||||
if (
|
||||
h === 'localhost' ||
|
||||
h.endsWith('.localhost') ||
|
||||
h.endsWith('.local') ||
|
||||
h.endsWith('.lan') ||
|
||||
h.endsWith('.internal') ||
|
||||
h.endsWith('.intranet') ||
|
||||
h.endsWith('.home.arpa')
|
||||
) {
|
||||
return true
|
||||
}
|
||||
// IPv4 private and reserved ranges. URL.hostname normalizes short /
|
||||
// hex / octal IPv4 representations to dotted-quad, so a simple regex
|
||||
// works for the display-classification use case.
|
||||
const ipv4 = h.match(/^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/)
|
||||
if (ipv4) {
|
||||
const a = Number(ipv4[1])
|
||||
const b = Number(ipv4[2])
|
||||
// 10.0.0.0/8 (RFC 1918)
|
||||
if (a === 10) return true
|
||||
// 172.16.0.0/12 (RFC 1918)
|
||||
if (a === 172 && b >= 16 && b <= 31) return true
|
||||
// 192.168.0.0/16 (RFC 1918)
|
||||
if (a === 192 && b === 168) return true
|
||||
// 127.0.0.0/8 loopback
|
||||
if (a === 127) return true
|
||||
// 169.254.0.0/16 link-local (AWS/GCP metadata, stateless autoconf)
|
||||
if (a === 169 && b === 254) return true
|
||||
// 100.64.0.0/10 CGNAT (Tailscale, carrier-grade NAT)
|
||||
if (a === 100 && b >= 64 && b <= 127) return true
|
||||
}
|
||||
// IPv6 common local/private ranges — narrow by design.
|
||||
if (h === '::1' || h === '::') return true
|
||||
// fe80::/10 link-local and fc00::/7 unique-local (ULA). A colon is
|
||||
// required in the match so `fc` / `fd` don't over-match real
|
||||
// hostnames like `fc-api.example.com` or `fd-hosted.com`. URL.hostname
|
||||
// strips brackets, so an IPv6 literal like `fc00::1` shows up here as
|
||||
// `fc00::1` — still contains the colon.
|
||||
if (
|
||||
h.startsWith('fe80:') ||
|
||||
/^fc[0-9a-f]{0,2}:/.test(h) ||
|
||||
/^fd[0-9a-f]{0,2}:/.test(h)
|
||||
) {
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
/**
|
||||
* Map the canonical APIProvider enum (+ environment hints) into a
|
||||
* cache-capability bucket. We separate `copilot` (no cache) from
|
||||
* `copilot-claude` (Anthropic shim via Copilot with explicit cache)
|
||||
* because the two behave very differently even under the same provider
|
||||
* flag — see `isGithubNativeAnthropicMode` in utils/model/providers.ts.
|
||||
*
|
||||
* Order of OpenAI-compatible checks matters:
|
||||
* 1. Private / self-hosted URL — no cache fields regardless of vendor.
|
||||
* 2. Vendor-specific hosted providers (Kimi, DeepSeek) — known cache
|
||||
* shapes that deserve their own normalization branch.
|
||||
* 3. Plain OpenAI — default bucket.
|
||||
* Doing hosted-vendor matching before self-hosted detection would let a
|
||||
* private-IP endpoint with "deepseek" in the URL fall into the wrong
|
||||
* branch; doing self-hosted last would let a `.internal` URL with
|
||||
* "openai" in its path be misclassified. The current order is correct
|
||||
* for both pathological cases.
|
||||
*/
|
||||
export function resolveCacheProvider(
|
||||
provider: APIProvider,
|
||||
hints?: { githubNativeAnthropic?: boolean; openAiBaseUrl?: string },
|
||||
): CacheAwareProvider {
|
||||
if (provider === 'github') {
|
||||
return hints?.githubNativeAnthropic ? 'copilot-claude' : 'copilot'
|
||||
}
|
||||
if (provider === 'firstParty' || provider === 'bedrock' || provider === 'vertex' || provider === 'foundry') {
|
||||
return 'anthropic'
|
||||
}
|
||||
if (provider === 'gemini') return 'gemini'
|
||||
if (provider === 'codex') return 'codex'
|
||||
if (provider === 'openai') {
|
||||
const url = hints?.openAiBaseUrl ?? ''
|
||||
// Self-hosted / private-network endpoint — detect first so a vLLM
|
||||
// server on 192.168.x.x or a .internal DNS entry is honestly
|
||||
// classified as no-cache, not misreported as plain OpenAI.
|
||||
if (isLocalOrPrivateUrl(url)) return 'self-hosted'
|
||||
const lower = url.toLowerCase()
|
||||
// The :11434 port still signals Ollama specifically (default port).
|
||||
// If someone runs Ollama on a private IP:11434 we picked it up above
|
||||
// as 'self-hosted'; only a public-looking URL with :11434 lands here.
|
||||
if (lower.includes(':11434')) return 'ollama'
|
||||
if (lower.includes('moonshot') || lower.includes('kimi')) return 'kimi'
|
||||
if (lower.includes('deepseek')) return 'deepseek'
|
||||
return 'openai'
|
||||
}
|
||||
// nvidia-nim, minimax, mistral share the OpenAI Chat Completions convention
|
||||
// for cache reporting (prompt_tokens_details.cached_tokens). Treat them as
|
||||
// 'openai' for normalization purposes — if the provider doesn't emit the
|
||||
// field we simply get zeros, and hitRate stays null via the 0-guard below.
|
||||
return 'openai'
|
||||
}
|
||||
|
||||
/**
|
||||
* Read the cached-tokens count from a RAW provider usage object, handling
|
||||
* every shape we know about. Callers are the shim layer (openaiShim,
|
||||
* codexShim) — the only place where the native provider fields still
|
||||
* exist before conversion to Anthropic shape.
|
||||
*
|
||||
* Order of fallbacks is deliberate: the first non-zero match wins, so
|
||||
* adding a provider that combines shapes is safe as long as we list the
|
||||
* most authoritative field first.
|
||||
*/
|
||||
export function extractCacheReadFromRawUsage(usage: RawUsage): number {
|
||||
if (!usage || typeof usage !== 'object') return 0
|
||||
const u = usage as Record<string, unknown>
|
||||
// 1. Anthropic-native shape — already normalized upstream.
|
||||
const anthropicRead = asNumber(u.cache_read_input_tokens)
|
||||
if (anthropicRead > 0) return anthropicRead
|
||||
// 2. OpenAI / Codex — cached_tokens nested under input/prompt details.
|
||||
// Responses API uses `input_tokens_details`, Chat Completions uses
|
||||
// `prompt_tokens_details`; some models report both with the same value.
|
||||
const openaiNested =
|
||||
asNumber(pickPath(usage, ['input_tokens_details', 'cached_tokens'])) ||
|
||||
asNumber(pickPath(usage, ['prompt_tokens_details', 'cached_tokens']))
|
||||
if (openaiNested > 0) return openaiNested
|
||||
// 3. Kimi / Moonshot — top-level cached_tokens (not nested).
|
||||
const kimi = asNumber(u.cached_tokens)
|
||||
if (kimi > 0) return kimi
|
||||
// 4. DeepSeek — hit/miss split at top level.
|
||||
const deepseek = asNumber(u.prompt_cache_hit_tokens)
|
||||
if (deepseek > 0) return deepseek
|
||||
// 5. Gemini — cached_content_token_count.
|
||||
const gemini = asNumber(u.cached_content_token_count)
|
||||
if (gemini > 0) return gemini
|
||||
return 0
|
||||
}
|
||||
|
||||
/**
 * Shape produced by the shim layer — matches the Anthropic BetaUsage
 * fields that every downstream caller (cost-tracker, REPL, /cache-stats)
 * consumes. Keeping it in this module lets the shim and the integration
 * tests share one definition and eliminates the drift class of bugs
 * where a shim is updated but a test simulator isn't.
 */
export type NormalizedShimUsage = {
  /** Fresh (non-cached) input tokens, Anthropic convention. */
  input_tokens: number
  /** Completion/output tokens. */
  output_tokens: number
  /** Always 0 at the shim boundary — only Anthropic native reports creation. */
  cache_creation_input_tokens: number
  /** Cached tokens extracted via `extractCacheReadFromRawUsage`. */
  cache_read_input_tokens: number
}
|
||||
|
||||
/**
|
||||
* Convert raw provider usage (any known shape) into the Anthropic-shape
|
||||
* `NormalizedShimUsage` used throughout the codebase. Single source of
|
||||
* truth for the shim layer — `codexShim.makeUsage`,
|
||||
* `openaiShim.convertChunkUsage`, and the non-streaming response in
|
||||
* `OpenAIShimMessages` all call this helper, and the integration test
|
||||
* calls it directly instead of re-implementing the conversion.
|
||||
*
|
||||
* Design contract:
|
||||
* - `cache_read_input_tokens` comes from `extractCacheReadFromRawUsage`
|
||||
* (provider-aware extraction).
|
||||
* - `input_tokens` is rewritten to Anthropic convention: FRESH only,
|
||||
* with `cache_read` subtracted from the raw prompt count if the
|
||||
* provider included it there (OpenAI family does; Anthropic native
|
||||
* already excludes it).
|
||||
* - `cache_creation_input_tokens` is always 0 at the shim boundary —
|
||||
* only Anthropic native emits a non-zero creation count, and it
|
||||
* doesn't flow through these shims.
|
||||
* - Output token count accepts both `output_tokens` (Codex/Responses)
|
||||
* and `completion_tokens` (Chat Completions).
|
||||
*
|
||||
* Observed raw shapes per provider (pinned so future drift is caught):
|
||||
* - OpenAI Chat Completions:
|
||||
* `{ prompt_tokens, completion_tokens,
|
||||
* prompt_tokens_details: { cached_tokens } }`
|
||||
* where `cached_tokens` is a SUBSET of `prompt_tokens` — hence
|
||||
* the subtraction below.
|
||||
* - OpenAI Codex / Responses API:
|
||||
* `{ input_tokens, output_tokens,
|
||||
* input_tokens_details: { cached_tokens } }`
|
||||
* same convention: cached is included in `input_tokens`.
|
||||
* - Anthropic native:
|
||||
* `{ input_tokens, output_tokens,
|
||||
* cache_read_input_tokens, cache_creation_input_tokens }`
|
||||
* cached is EXCLUDED from `input_tokens`. The subtraction here
|
||||
* no-ops (cache_read is read off a dedicated field, then fresh =
|
||||
* input_tokens - 0 = input_tokens) — safe passthrough.
|
||||
* - Kimi/Moonshot:
|
||||
* `{ prompt_tokens, completion_tokens, cached_tokens }` — top
|
||||
* level, not nested. OpenAI-family subset convention.
|
||||
* - DeepSeek:
|
||||
* `{ prompt_tokens, completion_tokens, prompt_cache_hit_tokens,
|
||||
* prompt_cache_miss_tokens }`. The `hit` field is the cached
|
||||
* count, also a subset of `prompt_tokens`.
|
||||
*
|
||||
* If a future provider deviates (ships cached tokens ALREADY excluded
|
||||
* from input_tokens, Anthropic-style), this function will under-count
|
||||
* their fresh-input by `cache_read`. The regression test
|
||||
* `cacheMetricsIntegration.test.ts > "Codex makeUsage no longer
|
||||
* double-bills"` pins the current Codex shape so a deviation breaks
|
||||
* visibly. If you're adding a new provider, verify the shape and —
|
||||
* if needed — extend `extractCacheReadFromRawUsage` to pick a field
|
||||
* that represents cached-tokens-already-excluded (and skip the
|
||||
* subtraction by setting `rawInput` to `prompt_tokens + cache_read`).
|
||||
*/
|
||||
export function buildAnthropicUsageFromRawUsage(
|
||||
raw: RawUsage,
|
||||
): NormalizedShimUsage {
|
||||
const cacheRead = extractCacheReadFromRawUsage(raw)
|
||||
const u = (raw ?? {}) as Record<string, unknown>
|
||||
const rawInput =
|
||||
asNumber(u.input_tokens) || asNumber(u.prompt_tokens)
|
||||
const fresh = rawInput >= cacheRead ? rawInput - cacheRead : rawInput
|
||||
const output =
|
||||
asNumber(u.output_tokens) || asNumber(u.completion_tokens)
|
||||
return {
|
||||
input_tokens: fresh,
|
||||
output_tokens: output,
|
||||
cache_creation_input_tokens: 0,
|
||||
cache_read_input_tokens: cacheRead,
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract a unified CacheMetrics from POST-SHIM (Anthropic-shape) usage.
|
||||
*
|
||||
* By the time this runs, openaiShim/codexShim have already converted
|
||||
* raw provider fields into `cache_read_input_tokens` (via
|
||||
* `extractCacheReadFromRawUsage`) and adjusted `input_tokens` to be
|
||||
* "fresh only" per Anthropic convention. This function is therefore
|
||||
* deliberately provider-independent for the numeric extraction — the
|
||||
* `provider` argument is used only to surface `supported: false` for
|
||||
* providers that expose no cache data at all.
|
||||
*/
|
||||
export function extractCacheMetrics(
|
||||
usage: RawUsage,
|
||||
provider: CacheAwareProvider,
|
||||
): CacheMetrics {
|
||||
if (!usage || typeof usage !== 'object') return UNSUPPORTED
|
||||
const u = usage as Record<string, unknown>
|
||||
const read = asNumber(u.cache_read_input_tokens)
|
||||
const created = asNumber(u.cache_creation_input_tokens)
|
||||
const fresh = asNumber(u.input_tokens)
|
||||
// Copilot vanilla (no Claude) and Ollama don't expose cache fields at
|
||||
// all as a provider-identity matter. These are explicit provider
|
||||
// selections (via CLAUDE_CODE_USE_GITHUB and the Ollama base-URL
|
||||
// default port), so we can hard-wire `supported: false` and let the
|
||||
// REPL print "N/A" instead of a fabricated 0%.
|
||||
if (provider === 'copilot' || provider === 'ollama') {
|
||||
return UNSUPPORTED
|
||||
}
|
||||
// `self-hosted` is different: the bucket is inferred from the base
|
||||
// URL being on a private network (RFC1918, .local TLD, etc.), which
|
||||
// is a heuristic, not an authoritative "this endpoint cannot cache"
|
||||
// signal. An internal reverse proxy forwarding to OpenAI / Kimi /
|
||||
// DeepSeek / Gemini will produce a private URL but ALSO emit real
|
||||
// cache fields via the shim. Force-unsupported here would discard
|
||||
// legitimate data. Let the data decide: if the shim extracted any
|
||||
// cache activity (read OR created), trust it and fall through to
|
||||
// normal extraction; otherwise render honest N/A for vanilla
|
||||
// vLLM/LocalAI-style endpoints that really don't cache.
|
||||
if (provider === 'self-hosted' && read === 0 && created === 0) {
|
||||
return UNSUPPORTED
|
||||
}
|
||||
// total = fresh + read + created — shim already stripped `read` out of
|
||||
// `fresh` so the three components don't double-count. This matches the
|
||||
// Anthropic convention even when the upstream was OpenAI/Kimi/DeepSeek.
|
||||
const total = read + created + fresh
|
||||
return {
|
||||
read,
|
||||
created,
|
||||
total,
|
||||
// Clamp to [0, 1]. With non-negative inputs the math guarantees
|
||||
// `read <= total` — but an upstream shim bug (e.g. future provider
|
||||
// where we accidentally read a negative `fresh`) could violate the
|
||||
// invariant. Showing a pinned `1.0` on anomalous input is clearer
|
||||
// than a nonsense ratio > 100% and safer than `null` (which would
|
||||
// hide the issue completely).
|
||||
hitRate: total > 0 ? Math.min(1, read / total) : null,
|
||||
supported: true,
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Format a CacheMetrics value into a human-facing one-liner used by
|
||||
* `showCacheStats: 'compact'`. Stable format — snapshot-tested.
|
||||
*
|
||||
* Examples:
|
||||
* "[Cache: 1.2k read • hit 12%]"
|
||||
* "[Cache: N/A]" (unsupported provider)
|
||||
* "[Cache: cold]" (supported, no reads yet)
|
||||
*
|
||||
* The `undefined` branch at the top is defensive: TypeScript enforces
|
||||
* `CacheMetrics` at call sites, but a failed API response could leave
|
||||
* the caller with nothing to render. Treat absent metrics as "no data"
|
||||
* rather than throwing on `metrics.supported`.
|
||||
*/
|
||||
export function formatCacheMetricsCompact(
|
||||
metrics: CacheMetrics | undefined | null,
|
||||
): string {
|
||||
if (!metrics) return '[Cache: N/A]'
|
||||
if (!metrics.supported) return '[Cache: N/A]'
|
||||
if (metrics.read === 0 && metrics.created === 0) return '[Cache: cold]'
|
||||
const parts: string[] = [`${formatCompactNumber(metrics.read)} read`]
|
||||
if (metrics.hitRate !== null) {
|
||||
parts.push(`hit ${Math.round(metrics.hitRate * 100)}%`)
|
||||
}
|
||||
return `[Cache: ${parts.join(' • ')}]`
|
||||
}
|
||||
|
||||
/**
|
||||
* Format a CacheMetrics value into a multi-field breakdown used by
|
||||
* `showCacheStats: 'full'`. Stable format — snapshot-tested.
|
||||
*
|
||||
* Example:
|
||||
* "[Cache: read=1.2k created=340 hit=12%]"
|
||||
*
|
||||
* Same `undefined` tolerance as `formatCacheMetricsCompact` — a failed
|
||||
* API response shouldn't throw on the display path.
|
||||
*/
|
||||
export function formatCacheMetricsFull(
|
||||
metrics: CacheMetrics | undefined | null,
|
||||
): string {
|
||||
if (!metrics) return '[Cache: N/A]'
|
||||
if (!metrics.supported) return '[Cache: N/A]'
|
||||
const parts: string[] = [
|
||||
`read=${formatCompactNumber(metrics.read)}`,
|
||||
`created=${formatCompactNumber(metrics.created)}`,
|
||||
]
|
||||
if (metrics.hitRate !== null) {
|
||||
parts.push(`hit=${Math.round(metrics.hitRate * 100)}%`)
|
||||
} else {
|
||||
parts.push('hit=n/a')
|
||||
}
|
||||
return `[Cache: ${parts.join(' ')}]`
|
||||
}
|
||||
|
||||
// Compact 1.2k-style formatter. Duplicated here (not imported from
|
||||
// utils/format.ts) because this module should stay dependency-light and
|
||||
// deterministic — utils/format pulls Intl locale state which varies.
|
||||
function formatCompactNumber(n: number): string {
|
||||
if (n < 1_000) return String(n)
|
||||
if (n < 1_000_000) return `${(n / 1_000).toFixed(1).replace(/\.0$/, '')}k`
|
||||
return `${(n / 1_000_000).toFixed(1).replace(/\.0$/, '')}m`
|
||||
}
|
||||
|
||||
/** Sum two CacheMetrics, preserving `supported` as true only if both are. */
|
||||
export function addCacheMetrics(a: CacheMetrics, b: CacheMetrics): CacheMetrics {
|
||||
// Copy elision: if either side is the unsupported sentinel, return the
|
||||
// other as-is so aggregates on a purely-unsupported session stay cheap.
|
||||
if (!a.supported && !b.supported) return UNSUPPORTED
|
||||
if (!a.supported) return b
|
||||
if (!b.supported) return a
|
||||
const read = a.read + b.read
|
||||
const created = a.created + b.created
|
||||
const total = a.total + b.total
|
||||
return {
|
||||
read,
|
||||
created,
|
||||
total,
|
||||
hitRate: total > 0 ? read / total : null,
|
||||
supported: true,
|
||||
}
|
||||
}
|
||||
339
src/services/api/cacheMetricsIntegration.test.ts
Normal file
339
src/services/api/cacheMetricsIntegration.test.ts
Normal file
@@ -0,0 +1,339 @@
|
||||
/**
|
||||
* Integration tests for the raw-usage → shim → cost-tracker pipeline.
|
||||
*
|
||||
* These tests simulate what happens on each provider end-to-end:
|
||||
* 1. The provider returns a raw `usage` object in its native shape.
|
||||
* 2. The shim (openaiShim.convertChunkUsage / codexShim.makeUsage)
|
||||
* rewrites it to Anthropic shape via buildAnthropicUsageFromRawUsage.
|
||||
* 3. cost-tracker feeds the shimmed usage to extractCacheMetrics.
|
||||
*
|
||||
* The unit tests in cacheMetrics.test.ts exercise each layer in isolation.
|
||||
* This file exists so that a regression in ANY one of them (e.g. someone
|
||||
* adding a new provider branch to the helper but forgetting to wire it
|
||||
* into the shim) surfaces as an integration failure rather than silently
|
||||
* showing "[Cache: cold]" in production.
|
||||
*
|
||||
* We call `buildAnthropicUsageFromRawUsage` directly instead of
|
||||
* re-implementing the shim behavior locally. Both shims
|
||||
* (`codexShim.makeUsage`, `openaiShim.convertChunkUsage`, and the
|
||||
* non-streaming path in `OpenAIShimMessages`) delegate to this helper,
|
||||
* so this test covers the exact same code that runs in production —
|
||||
* no simulator drift possible.
|
||||
*/
|
||||
import { describe, expect, test } from 'bun:test'
|
||||
import {
|
||||
buildAnthropicUsageFromRawUsage,
|
||||
extractCacheMetrics,
|
||||
formatCacheMetricsCompact,
|
||||
formatCacheMetricsFull,
|
||||
resolveCacheProvider,
|
||||
type CacheAwareProvider,
|
||||
} from './cacheMetrics.js'
|
||||
|
||||
/** One end-to-end provider scenario for the pipeline tests below. */
type Scenario = {
  // Human-readable test name.
  name: string
  // Cache-capability bucket the provider resolves to.
  provider: CacheAwareProvider
  // Usage object exactly as the provider emits it on the wire.
  rawUsage: Record<string, unknown>
  // Cached tokens the shim must extract from rawUsage.
  expectedRead: number
  // fresh + read — the denominator the user sees as "input this request".
  expectedTotal: number
  // expectedRead / expectedTotal, as shown by /cache-stats.
  expectedHitRate: number
  // Fresh input after the shim subtracts cached from the prompt count.
  expectedFreshInput: number
}
|
||||
|
||||
// End-to-end scenarios for every provider shape the OpenClaude shim layer
// might see. `expectedTotal` is what a user should see as "input this
// request", `expectedHitRate` is what `/cache-stats` should display.
const scenarios: Scenario[] = [
  {
    name: 'Anthropic native (firstParty) — passthrough',
    provider: 'anthropic',
    rawUsage: {
      input_tokens: 200,
      cache_read_input_tokens: 800,
      cache_creation_input_tokens: 100,
    },
    expectedRead: 800,
    // Anthropic native doesn't go through the shim in production, but
    // buildAnthropicUsageFromRawUsage handles it correctly as passthrough:
    // prompt_tokens fallback is 0, so fresh comes from input_tokens (200),
    // cache_read is picked up from cache_read_input_tokens (800).
    expectedTotal: 1_000, // 200 fresh + 800 read (created is not tracked at this layer)
    expectedHitRate: 800 / 1_000,
    expectedFreshInput: 200,
  },
  {
    name: 'OpenAI Chat Completions via openaiShim',
    provider: 'openai',
    rawUsage: {
      prompt_tokens: 2_000,
      completion_tokens: 300,
      prompt_tokens_details: { cached_tokens: 1_200 },
    },
    expectedRead: 1_200,
    expectedTotal: 2_000, // 800 fresh + 1200 read
    expectedHitRate: 0.6,
    expectedFreshInput: 800,
  },
  {
    name: 'Codex Responses API via codexShim',
    provider: 'codex',
    rawUsage: {
      input_tokens: 1_500,
      output_tokens: 50,
      input_tokens_details: { cached_tokens: 600 },
    },
    expectedRead: 600,
    expectedTotal: 1_500,
    expectedHitRate: 0.4,
    expectedFreshInput: 900,
  },
  {
    name: 'Kimi / Moonshot via openaiShim — top-level cached_tokens',
    provider: 'kimi',
    rawUsage: {
      prompt_tokens: 1_000,
      completion_tokens: 120,
      cached_tokens: 400,
    },
    expectedRead: 400,
    expectedTotal: 1_000,
    expectedHitRate: 0.4,
    expectedFreshInput: 600,
  },
  {
    name: 'DeepSeek via openaiShim — prompt_cache_hit_tokens',
    provider: 'deepseek',
    rawUsage: {
      prompt_tokens: 1_000,
      completion_tokens: 40,
      prompt_cache_hit_tokens: 700,
      prompt_cache_miss_tokens: 300,
    },
    expectedRead: 700,
    expectedTotal: 1_000,
    expectedHitRate: 0.7,
    expectedFreshInput: 300,
  },
  {
    name: 'Gemini via openaiShim — cached_content_token_count',
    provider: 'gemini',
    rawUsage: {
      prompt_tokens: 4_000,
      completion_tokens: 200,
      cached_content_token_count: 3_200,
    },
    expectedRead: 3_200,
    expectedTotal: 4_000,
    expectedHitRate: 0.8,
    expectedFreshInput: 800,
  },
]
|
||||
|
||||
describe('raw usage → shim → extractCacheMetrics pipeline', () => {
|
||||
for (const s of scenarios) {
|
||||
test(s.name, () => {
|
||||
// Call the same helper the shims call in production — no
|
||||
// simulator, no possibility of drift.
|
||||
const shimmed = buildAnthropicUsageFromRawUsage(s.rawUsage)
|
||||
expect(shimmed.cache_read_input_tokens).toBe(s.expectedRead)
|
||||
expect(shimmed.input_tokens).toBe(s.expectedFreshInput)
|
||||
|
||||
const metrics = extractCacheMetrics(
|
||||
shimmed as unknown as Record<string, unknown>,
|
||||
s.provider,
|
||||
)
|
||||
expect(metrics.supported).toBe(true)
|
||||
expect(metrics.read).toBe(s.expectedRead)
|
||||
expect(metrics.total).toBe(s.expectedTotal)
|
||||
expect(metrics.hitRate).toBeCloseTo(s.expectedHitRate, 4)
|
||||
})
|
||||
}
|
||||
})
|
||||
|
||||
describe('no-cache providers — pipeline honestly reports unsupported', () => {
|
||||
test('GitHub Copilot (vanilla) — shim runs, but provider bucket maps to unsupported', () => {
|
||||
const shimmed = buildAnthropicUsageFromRawUsage({
|
||||
prompt_tokens: 500,
|
||||
completion_tokens: 40,
|
||||
})
|
||||
// Shim normalized correctly (0 cache_read), but Copilot-vanilla must
|
||||
// surface as unsupported so /cache-stats shows "N/A" instead of "0%".
|
||||
expect(shimmed.cache_read_input_tokens).toBe(0)
|
||||
const metrics = extractCacheMetrics(
|
||||
shimmed as unknown as Record<string, unknown>,
|
||||
'copilot',
|
||||
)
|
||||
expect(metrics.supported).toBe(false)
|
||||
expect(metrics.hitRate).toBeNull()
|
||||
})
|
||||
|
||||
test('Ollama (local) — same treatment as Copilot-vanilla', () => {
|
||||
const shimmed = buildAnthropicUsageFromRawUsage({
|
||||
prompt_tokens: 1_000,
|
||||
completion_tokens: 200,
|
||||
})
|
||||
const metrics = extractCacheMetrics(
|
||||
shimmed as unknown as Record<string, unknown>,
|
||||
'ollama',
|
||||
)
|
||||
expect(metrics.supported).toBe(false)
|
||||
})
|
||||
})
|
||||
|
||||
describe('display path end-to-end — private-IP, custom-port, self-hosted endpoints', () => {
|
||||
// These tests exercise the FULL pipeline that runs when a user
|
||||
// configures OpenClaude against a self-hosted OpenAI-compatible
|
||||
// server (vLLM, LM Studio, LocalAI, text-generation-webui, etc.):
|
||||
//
|
||||
// OPENAI_BASE_URL → resolveCacheProvider → real provider usage →
|
||||
// buildAnthropicUsageFromRawUsage → extractCacheMetrics →
|
||||
// formatCacheMetricsCompact / Full (= what user sees in REPL and
|
||||
// via /cache-stats)
|
||||
//
|
||||
// Pre-fix behavior: substring check missed these URLs, they fell
|
||||
// into the 'openai' bucket, and the display showed '[Cache: cold]' —
|
||||
// i.e. implied a cache miss when the provider simply doesn't report
|
||||
// cache fields. Post-fix: '[Cache: N/A]' every time.
|
||||
|
||||
const privateEndpoints: Array<{ name: string; baseUrl: string }> = [
|
||||
{ name: 'vLLM on RFC1918 LAN IP', baseUrl: 'http://192.168.1.50:8000/v1' },
|
||||
{ name: 'LocalAI on 10.x.x.x corporate network', baseUrl: 'http://10.0.0.7:8080/v1' },
|
||||
{ name: 'self-hosted on 172.16.x.x', baseUrl: 'http://172.20.0.3:5000/v1' },
|
||||
{ name: 'reverse-proxied on .internal DNS', baseUrl: 'http://llm.internal:5000/v1' },
|
||||
{ name: 'mDNS .local hostname', baseUrl: 'http://box.local:8080/v1' },
|
||||
{ name: 'RFC 8375 .home.arpa', baseUrl: 'http://vllm.home.arpa/v1' },
|
||||
{ name: 'CGNAT / Tailscale 100.64.x.x', baseUrl: 'http://100.64.1.5:8000/v1' },
|
||||
{ name: 'IPv6 loopback literal', baseUrl: 'http://[::1]:5000/v1' },
|
||||
{ name: 'IPv6 link-local', baseUrl: 'http://[fe80::1]:8000/v1' },
|
||||
{ name: 'IPv6 ULA fc00::/7', baseUrl: 'http://[fd12:3456::7]:8080/v1' },
|
||||
{ name: 'link-local cloud-metadata IP', baseUrl: 'http://169.254.169.254/v1' },
|
||||
]
|
||||
|
||||
for (const { name, baseUrl } of privateEndpoints) {
|
||||
test(`${name} (${baseUrl}) — renders [Cache: N/A], not [Cache: cold]`, () => {
|
||||
// 1. URL resolves to self-hosted bucket.
|
||||
const bucket = resolveCacheProvider('openai', { openAiBaseUrl: baseUrl })
|
||||
expect(bucket).toBe('self-hosted')
|
||||
|
||||
// 2. Typical self-hosted server returns OpenAI-shape usage with no
|
||||
// cache fields — the shim normalizes it cleanly.
|
||||
const shimmed = buildAnthropicUsageFromRawUsage({
|
||||
prompt_tokens: 1_200,
|
||||
completion_tokens: 250,
|
||||
})
|
||||
expect(shimmed.cache_read_input_tokens).toBe(0)
|
||||
|
||||
// 3. The display path marks the bucket unsupported.
|
||||
const metrics = extractCacheMetrics(
|
||||
shimmed as unknown as Record<string, unknown>,
|
||||
bucket,
|
||||
)
|
||||
expect(metrics.supported).toBe(false)
|
||||
expect(metrics.hitRate).toBeNull()
|
||||
|
||||
// 4. User-visible output — both formats honor the unsupported flag.
|
||||
expect(formatCacheMetricsCompact(metrics)).toBe('[Cache: N/A]')
|
||||
expect(formatCacheMetricsFull(metrics)).toBe('[Cache: N/A]')
|
||||
})
|
||||
}
|
||||
|
||||
test('public-looking URL with non-standard port stays in openai bucket (no false positive)', () => {
|
||||
// A real hosted API that happens to run on a custom port must NOT
|
||||
// be misclassified as self-hosted. This guards the fix against
|
||||
// over-matching.
|
||||
const bucket = resolveCacheProvider('openai', {
|
||||
openAiBaseUrl: 'https://api.openai.com:8443/v1',
|
||||
})
|
||||
expect(bucket).toBe('openai')
|
||||
})
|
||||
|
||||
test('private IP + hosted-provider keyword in path → self-hosted wins', () => {
|
||||
// A URL like 'http://10.0.0.5:8000/v1/deepseek-proxy' has "deepseek"
|
||||
// in the path but the upstream is a LAN box, not the real DeepSeek.
|
||||
// Priority ordering in resolveCacheProvider must put self-hosted
|
||||
// detection first.
|
||||
const bucket = resolveCacheProvider('openai', {
|
||||
openAiBaseUrl: 'http://10.0.0.5:8000/v1/deepseek-proxy',
|
||||
})
|
||||
expect(bucket).toBe('self-hosted')
|
||||
})
|
||||
|
||||
test('self-hosted proxy forwarding real upstream cache data is NOT discarded', () => {
|
||||
// Review-blocker regression: an enterprise setup with an internal
|
||||
// reverse proxy on a private URL forwarding to OpenAI / Kimi /
|
||||
// DeepSeek / Gemini WILL deliver real cache fields via the shim.
|
||||
// Pre-fix, the URL heuristic → self-hosted → unconditional
|
||||
// `supported: false` discarded the data and rendered '[Cache: N/A]'
|
||||
// even though valid cache metrics were on the payload. Post-fix,
|
||||
// the data decides: non-zero cache activity trumps the URL bucket.
|
||||
const bucket = resolveCacheProvider('openai', {
|
||||
openAiBaseUrl: 'http://llm-proxy.corp.internal:5000/v1',
|
||||
})
|
||||
expect(bucket).toBe('self-hosted')
|
||||
|
||||
// Typical raw Kimi shape (the reverse proxy forwards this through
|
||||
// unchanged). Shim normalizes to Anthropic shape.
|
||||
const raw = { prompt_tokens: 2_000, cached_tokens: 800 }
|
||||
const shimmed = buildAnthropicUsageFromRawUsage(raw)
|
||||
|
||||
// Display path with the fix: data is preserved end-to-end.
|
||||
const metrics = extractCacheMetrics(
|
||||
shimmed as unknown as Record<string, unknown>,
|
||||
bucket,
|
||||
)
|
||||
expect(metrics.supported).toBe(true)
|
||||
expect(metrics.read).toBe(800)
|
||||
expect(metrics.hitRate).toBe(0.4)
|
||||
expect(formatCacheMetricsCompact(metrics)).toBe(
|
||||
'[Cache: 800 read • hit 40%]',
|
||||
)
|
||||
})
|
||||
})
|
||||
|
||||
describe('regression guards — bug reproducers', () => {
|
||||
test('Kimi cache hit survives the shim (pre-fix: silently dropped to 0)', () => {
|
||||
// Before the Option-C refactor, the shim only read
|
||||
// prompt_tokens_details.cached_tokens, so Kimi's top-level
|
||||
// cached_tokens (400 below) was lost — the tracker saw read=0 and
|
||||
// users saw "[Cache: cold]" even after real cache hits. This test
|
||||
// fails loudly if the helper forgets the top-level branch.
|
||||
const raw = { prompt_tokens: 800, cached_tokens: 300 }
|
||||
const shimmed = buildAnthropicUsageFromRawUsage(raw)
|
||||
const metrics = extractCacheMetrics(
|
||||
shimmed as unknown as Record<string, unknown>,
|
||||
'kimi',
|
||||
)
|
||||
expect(metrics.read).toBe(300)
|
||||
expect(metrics.hitRate).toBeGreaterThan(0)
|
||||
})
|
||||
|
||||
test('DeepSeek cache hit survives the shim (pre-fix: silently dropped to 0)', () => {
|
||||
const raw = {
|
||||
prompt_tokens: 1_200,
|
||||
prompt_cache_hit_tokens: 900,
|
||||
prompt_cache_miss_tokens: 300,
|
||||
}
|
||||
const shimmed = buildAnthropicUsageFromRawUsage(raw)
|
||||
const metrics = extractCacheMetrics(
|
||||
shimmed as unknown as Record<string, unknown>,
|
||||
'deepseek',
|
||||
)
|
||||
expect(metrics.read).toBe(900)
|
||||
expect(metrics.hitRate).toBe(0.75)
|
||||
})
|
||||
|
||||
test('Codex makeUsage no longer double-bills (pre-fix: input_tokens kept cached)', () => {
|
||||
// Pre-fix, codexShim.makeUsage set input_tokens to the raw value
|
||||
// without subtracting cached_tokens, so modelCost.calculateUSDCost
|
||||
// charged the same tokens under both input_tokens * rate AND
|
||||
// cache_read_input_tokens * rate. This test enforces the Anthropic
|
||||
// convention at the shim boundary.
|
||||
const raw = {
|
||||
input_tokens: 2_000,
|
||||
input_tokens_details: { cached_tokens: 1_500 },
|
||||
}
|
||||
const shimmed = buildAnthropicUsageFromRawUsage(raw)
|
||||
expect(shimmed.input_tokens).toBe(500) // 2000 - 1500, not 2000
|
||||
expect(shimmed.cache_read_input_tokens).toBe(1_500)
|
||||
})
|
||||
})
|
||||
210
src/services/api/cacheStatsTracker.test.ts
Normal file
210
src/services/api/cacheStatsTracker.test.ts
Normal file
@@ -0,0 +1,210 @@
|
||||
import { beforeEach, expect, test, describe } from 'bun:test'
|
||||
import {
|
||||
_setHistoryCapForTesting,
|
||||
getCacheStatsHistory,
|
||||
getCurrentTurnCacheMetrics,
|
||||
getSessionCacheMetrics,
|
||||
recordRequest,
|
||||
resetCurrentTurn,
|
||||
resetSessionCacheStats,
|
||||
} from './cacheStatsTracker.js'
|
||||
import type { CacheMetrics } from './cacheMetrics.js'
|
||||
|
||||
function makeMetrics(partial: Partial<CacheMetrics>): CacheMetrics {
|
||||
return {
|
||||
read: 0,
|
||||
created: 0,
|
||||
total: 0,
|
||||
hitRate: null,
|
||||
supported: true,
|
||||
...partial,
|
||||
}
|
||||
}
|
||||
|
||||
beforeEach(() => {
|
||||
resetSessionCacheStats()
|
||||
_setHistoryCapForTesting(500)
|
||||
})
|
||||
|
||||
describe('cacheStatsTracker — aggregation', () => {
|
||||
test('currentTurn and session both start empty and unsupported', () => {
|
||||
expect(getCurrentTurnCacheMetrics().supported).toBe(false)
|
||||
expect(getSessionCacheMetrics().supported).toBe(false)
|
||||
expect(getCacheStatsHistory()).toEqual([])
|
||||
})
|
||||
|
||||
test('one recorded request flows into both turn and session', () => {
|
||||
recordRequest(
|
||||
makeMetrics({ read: 500, total: 1000, hitRate: 0.5 }),
|
||||
'claude-sonnet-4',
|
||||
)
|
||||
expect(getCurrentTurnCacheMetrics().read).toBe(500)
|
||||
expect(getCurrentTurnCacheMetrics().total).toBe(1000)
|
||||
expect(getSessionCacheMetrics().read).toBe(500)
|
||||
})
|
||||
|
||||
test('multiple requests sum across turn', () => {
|
||||
recordRequest(
|
||||
makeMetrics({ read: 100, total: 500, hitRate: 0.2 }),
|
||||
'm1',
|
||||
)
|
||||
recordRequest(
|
||||
makeMetrics({ read: 300, total: 500, hitRate: 0.6 }),
|
||||
'm1',
|
||||
)
|
||||
const turn = getCurrentTurnCacheMetrics()
|
||||
expect(turn.read).toBe(400)
|
||||
expect(turn.total).toBe(1000)
|
||||
expect(turn.hitRate).toBeCloseTo(0.4, 5)
|
||||
})
|
||||
|
||||
test('resetCurrentTurn clears turn but preserves session', () => {
|
||||
recordRequest(makeMetrics({ read: 200, total: 400 }), 'm1')
|
||||
resetCurrentTurn()
|
||||
expect(getCurrentTurnCacheMetrics().supported).toBe(false)
|
||||
expect(getSessionCacheMetrics().read).toBe(200)
|
||||
})
|
||||
|
||||
test('resetSessionCacheStats clears everything', () => {
|
||||
recordRequest(makeMetrics({ read: 200, total: 400 }), 'm1')
|
||||
resetSessionCacheStats()
|
||||
expect(getCurrentTurnCacheMetrics().supported).toBe(false)
|
||||
expect(getSessionCacheMetrics().supported).toBe(false)
|
||||
expect(getCacheStatsHistory()).toEqual([])
|
||||
})
|
||||
})
|
||||
|
||||
describe('cacheStatsTracker — history', () => {
|
||||
test('records each request with label and timestamp', () => {
|
||||
const before = Date.now()
|
||||
recordRequest(makeMetrics({ read: 1, total: 2 }), 'model-A')
|
||||
recordRequest(makeMetrics({ read: 3, total: 4 }), 'model-B')
|
||||
const history = getCacheStatsHistory()
|
||||
expect(history.length).toBe(2)
|
||||
expect(history[0]!.label).toBe('model-A')
|
||||
expect(history[1]!.label).toBe('model-B')
|
||||
expect(history[0]!.timestamp).toBeGreaterThanOrEqual(before)
|
||||
})
|
||||
|
||||
test('evicts oldest entries when cap is exceeded', () => {
|
||||
_setHistoryCapForTesting(3)
|
||||
for (let i = 0; i < 5; i++) {
|
||||
recordRequest(makeMetrics({ read: i, total: 10 }), `m${i}`)
|
||||
}
|
||||
const history = getCacheStatsHistory()
|
||||
expect(history.length).toBe(3)
|
||||
expect(history.map((h) => h.label)).toEqual(['m2', 'm3', 'm4'])
|
||||
})
|
||||
|
||||
test('history copy is detached from internal state', () => {
|
||||
recordRequest(makeMetrics({ read: 1, total: 2 }), 'x')
|
||||
const snapshot = getCacheStatsHistory()
|
||||
snapshot.pop()
|
||||
expect(getCacheStatsHistory().length).toBe(1)
|
||||
})
|
||||
})
|
||||
|
||||
describe('cacheStatsTracker — ring buffer semantics', () => {
|
||||
test('ring wraps at cap without shifting (chronological order preserved)', () => {
|
||||
_setHistoryCapForTesting(4)
|
||||
// Push exactly 2×cap entries — forces one full wrap.
|
||||
for (let i = 0; i < 8; i++) {
|
||||
recordRequest(makeMetrics({ read: i, total: 10 }), `m${i}`)
|
||||
}
|
||||
const history = getCacheStatsHistory()
|
||||
expect(history.length).toBe(4)
|
||||
// After 8 pushes with cap=4, the survivors must be the newest 4 —
|
||||
// m4, m5, m6, m7 — in chronological order. If the ring logic were
|
||||
// wrong (e.g. off-by-one on writeIdx) this would come out rotated.
|
||||
expect(history.map((h) => h.label)).toEqual(['m4', 'm5', 'm6', 'm7'])
|
||||
})
|
||||
|
||||
test('read before ring wraps returns partial history in order', () => {
|
||||
_setHistoryCapForTesting(10)
|
||||
for (let i = 0; i < 3; i++) {
|
||||
recordRequest(makeMetrics({ read: i, total: 10 }), `m${i}`)
|
||||
}
|
||||
const history = getCacheStatsHistory()
|
||||
expect(history.map((h) => h.label)).toEqual(['m0', 'm1', 'm2'])
|
||||
})
|
||||
|
||||
test('shrinking cap preserves the newest entries in order', () => {
|
||||
_setHistoryCapForTesting(5)
|
||||
for (let i = 0; i < 5; i++) {
|
||||
recordRequest(makeMetrics({ read: i, total: 10 }), `m${i}`)
|
||||
}
|
||||
_setHistoryCapForTesting(3)
|
||||
const history = getCacheStatsHistory()
|
||||
expect(history.map((h) => h.label)).toEqual(['m2', 'm3', 'm4'])
|
||||
// And pushing after shrink still respects the new cap.
|
||||
recordRequest(makeMetrics({ read: 5, total: 10 }), 'm5')
|
||||
expect(getCacheStatsHistory().map((h) => h.label)).toEqual(['m3', 'm4', 'm5'])
|
||||
})
|
||||
|
||||
test('growing cap preserves existing entries and accepts more', () => {
|
||||
_setHistoryCapForTesting(3)
|
||||
for (let i = 0; i < 3; i++) {
|
||||
recordRequest(makeMetrics({ read: i, total: 10 }), `m${i}`)
|
||||
}
|
||||
_setHistoryCapForTesting(6)
|
||||
// After growing, the existing three should still be there in order,
|
||||
// and we should be able to push three more before eviction starts.
|
||||
for (let i = 3; i < 6; i++) {
|
||||
recordRequest(makeMetrics({ read: i, total: 10 }), `m${i}`)
|
||||
}
|
||||
const history = getCacheStatsHistory()
|
||||
expect(history.map((h) => h.label)).toEqual([
|
||||
'm0',
|
||||
'm1',
|
||||
'm2',
|
||||
'm3',
|
||||
'm4',
|
||||
'm5',
|
||||
])
|
||||
})
|
||||
|
||||
test('_setHistoryCapForTesting throws on non-positive cap', () => {
|
||||
// A zero cap would divide-by-zero on the ring write index and
|
||||
// silently corrupt the buffer. Loud failure > NaN indices.
|
||||
expect(() => _setHistoryCapForTesting(0)).toThrow(/cap must be >= 1/)
|
||||
expect(() => _setHistoryCapForTesting(-3)).toThrow(/cap must be >= 1/)
|
||||
})
|
||||
|
||||
test('resetSessionCacheStats empties the ring even when wrapped', () => {
|
||||
_setHistoryCapForTesting(3)
|
||||
for (let i = 0; i < 10; i++) {
|
||||
recordRequest(makeMetrics({ read: i, total: 10 }), `m${i}`)
|
||||
}
|
||||
// Sanity: ring has wrapped many times.
|
||||
expect(getCacheStatsHistory().length).toBe(3)
|
||||
resetSessionCacheStats()
|
||||
expect(getCacheStatsHistory()).toEqual([])
|
||||
// And a fresh push after reset starts from index 0 again.
|
||||
recordRequest(makeMetrics({ read: 99, total: 100 }), 'post-reset')
|
||||
const after = getCacheStatsHistory()
|
||||
expect(after.length).toBe(1)
|
||||
expect(after[0]!.label).toBe('post-reset')
|
||||
})
|
||||
})
|
||||
|
||||
describe('cacheStatsTracker — unsupported mixing', () => {
|
||||
test('mixing supported + unsupported keeps supported data visible', () => {
|
||||
recordRequest(
|
||||
{
|
||||
read: 0,
|
||||
created: 0,
|
||||
total: 0,
|
||||
hitRate: null,
|
||||
supported: false,
|
||||
},
|
||||
'copilot',
|
||||
)
|
||||
recordRequest(
|
||||
makeMetrics({ read: 100, total: 500, hitRate: 0.2 }),
|
||||
'claude',
|
||||
)
|
||||
const turn = getCurrentTurnCacheMetrics()
|
||||
expect(turn.supported).toBe(true)
|
||||
expect(turn.read).toBe(100)
|
||||
})
|
||||
})
|
||||
179
src/services/api/cacheStatsTracker.ts
Normal file
179
src/services/api/cacheStatsTracker.ts
Normal file
@@ -0,0 +1,179 @@
|
||||
/**
|
||||
* Per-query and per-session cache metrics tracker for Phase 1 observability.
|
||||
*
|
||||
* Sits downstream of `extractCacheMetrics` (normalizer) and upstream of the
|
||||
* REPL display + `/cache-stats` command. The shim layers already report raw
|
||||
* usage into Anthropic-shaped fields, so this tracker listens for each
|
||||
* successful API response and folds the metrics into three buckets:
|
||||
*
|
||||
* - currentTurn : cleared by callers at the start of each user turn
|
||||
* - session : accumulates from process start until `/clear`
|
||||
* - history : per-request log for `/cache-stats` breakdown view
|
||||
*
|
||||
* Design rationale:
|
||||
* - Module-local state (not AppState, not bootstrap/state.ts) because
|
||||
* this is strictly observability — nothing in the conversation flow
|
||||
* depends on it and we don't want to couple the shim to React state.
|
||||
* - `recordRequest()` takes an ALREADY-normalized CacheMetrics so the
|
||||
* shim layer can resolve provider once and we avoid re-running env
|
||||
* detection on every response.
|
||||
* - `history` is bounded (DEFAULT_HISTORY_MAX) so a long-lived session
|
||||
* can't grow memory unboundedly. Oldest entries drop first.
|
||||
* - `supported: false` requests still land in history (so the user can
|
||||
* see "6 requests, all N/A" rather than "no data"), but they add to
|
||||
* sums as zero — `addCacheMetrics` preserves the supported flag.
|
||||
*
|
||||
* History is stored as a **ring buffer** (fixed-size array + write index).
|
||||
* Previous implementation used `array.splice(0, n)` on every overflow,
|
||||
* which shifts the entire tail — O(n) per recordRequest for the default
|
||||
* cap of 500 (negligible in practice, but wasteful). The ring makes
|
||||
* `recordRequest` strictly O(1). `getCacheStatsHistory()` still pays O(n)
|
||||
* to reconstruct chronological order, but that only runs when the user
|
||||
* opens `/cache-stats` or the REPL renders — never in the hot path.
|
||||
*/
|
||||
import { addCacheMetrics, type CacheMetrics } from './cacheMetrics.js'
|
||||
|
||||
/** One request's cache footprint — what the tracker remembers per turn. */
|
||||
export type CacheStatsEntry = {
|
||||
/** Unix ms when the request completed. */
|
||||
timestamp: number
|
||||
/** Opaque label (usually the model string) for `/cache-stats` rows. */
|
||||
label: string
|
||||
/** Normalized metrics for this single request. */
|
||||
metrics: CacheMetrics
|
||||
}
|
||||
|
||||
// Bound the per-session history. 500 requests ≈ a full day of active use;
|
||||
// any more than that is noise for a diagnostic command and starts costing
|
||||
// real memory (~100 bytes per entry with the labels).
|
||||
const DEFAULT_HISTORY_MAX = 500
|
||||
|
||||
const EMPTY_METRICS: CacheMetrics = {
|
||||
read: 0,
|
||||
created: 0,
|
||||
total: 0,
|
||||
hitRate: null,
|
||||
supported: false,
|
||||
}
|
||||
|
||||
type TrackerState = {
|
||||
currentTurn: CacheMetrics
|
||||
session: CacheMetrics
|
||||
// Ring buffer: fixed-size array, `historyWriteIdx` points at the next
|
||||
// slot to overwrite. Once `historySize === historyMax`, each new push
|
||||
// drops the oldest entry by simply overwriting it — no shifting.
|
||||
history: (CacheStatsEntry | undefined)[]
|
||||
historyWriteIdx: number
|
||||
historySize: number
|
||||
historyMax: number
|
||||
}
|
||||
|
||||
function createInitialState(max: number): TrackerState {
|
||||
return {
|
||||
currentTurn: EMPTY_METRICS,
|
||||
session: EMPTY_METRICS,
|
||||
history: new Array(max),
|
||||
historyWriteIdx: 0,
|
||||
historySize: 0,
|
||||
historyMax: max,
|
||||
}
|
||||
}
|
||||
|
||||
const state: TrackerState = createInitialState(DEFAULT_HISTORY_MAX)
|
||||
|
||||
/**
|
||||
* Record a single API response's normalized cache metrics. Idempotent per
|
||||
* request (caller ensures this isn't double-counted) — safe to call from
|
||||
* the shim right after `addToTotalSessionCost`.
|
||||
*
|
||||
* O(1) via ring-buffer write — previously used `splice(0, n)` on overflow
|
||||
* which was O(n) per call for the default cap of 500.
|
||||
*/
|
||||
export function recordRequest(
|
||||
metrics: CacheMetrics,
|
||||
label: string,
|
||||
): void {
|
||||
state.currentTurn = addCacheMetrics(state.currentTurn, metrics)
|
||||
state.session = addCacheMetrics(state.session, metrics)
|
||||
const entry: CacheStatsEntry = {
|
||||
timestamp: Date.now(),
|
||||
label,
|
||||
metrics,
|
||||
}
|
||||
// Overwrite at the write head. If the ring is full, this drops the
|
||||
// oldest entry (which previously lived at this slot) implicitly.
|
||||
state.history[state.historyWriteIdx] = entry
|
||||
state.historyWriteIdx = (state.historyWriteIdx + 1) % state.historyMax
|
||||
if (state.historySize < state.historyMax) {
|
||||
state.historySize++
|
||||
}
|
||||
}
|
||||
|
||||
/** Clear turn-level counters at the start of a new user turn. */
|
||||
export function resetCurrentTurn(): void {
|
||||
state.currentTurn = EMPTY_METRICS
|
||||
}
|
||||
|
||||
/** Clear all session state — used by `/clear`, `/compact`, tests. */
|
||||
export function resetSessionCacheStats(): void {
|
||||
state.currentTurn = EMPTY_METRICS
|
||||
state.session = EMPTY_METRICS
|
||||
// Rebuild the ring so any hold-over references can be GC'd. Slightly
|
||||
// more work than zeroing indices, but `/clear` is rare and this avoids
|
||||
// silently pinning old CacheStatsEntry objects in memory.
|
||||
state.history = new Array(state.historyMax)
|
||||
state.historyWriteIdx = 0
|
||||
state.historySize = 0
|
||||
}
|
||||
|
||||
/** Snapshot of the current turn's aggregate. */
|
||||
export function getCurrentTurnCacheMetrics(): CacheMetrics {
|
||||
return state.currentTurn
|
||||
}
|
||||
|
||||
/** Snapshot of the session-wide aggregate. */
|
||||
export function getSessionCacheMetrics(): CacheMetrics {
|
||||
return state.session
|
||||
}
|
||||
|
||||
/**
|
||||
* Recent per-request entries, oldest-first. Returns a copy so callers
|
||||
* can freely sort/filter without perturbing the tracker.
|
||||
*
|
||||
* Walks the ring from the oldest slot to the newest. Two cases:
|
||||
* - not yet full: oldest is at index 0, newest at `size-1`
|
||||
* - full / wrapped: oldest is at `writeIdx`, newest at `writeIdx-1`
|
||||
*/
|
||||
export function getCacheStatsHistory(): CacheStatsEntry[] {
|
||||
if (state.historySize < state.historyMax) {
|
||||
// Fast path: ring hasn't wrapped yet, entries live at [0..size).
|
||||
return state.history.slice(0, state.historySize) as CacheStatsEntry[]
|
||||
}
|
||||
// Wrapped: reconstruct oldest-first by concatenating the two halves.
|
||||
const tail = state.history.slice(state.historyWriteIdx) as CacheStatsEntry[]
|
||||
const head = state.history.slice(0, state.historyWriteIdx) as CacheStatsEntry[]
|
||||
return tail.concat(head)
|
||||
}
|
||||
|
||||
/**
|
||||
* Test/debug hook — do not use in production paths. Resizes the ring
|
||||
* preserving the most recent `min(cap, size)` entries in chronological
|
||||
* order, so tests can shrink the cap and verify eviction behavior.
|
||||
*/
|
||||
export function _setHistoryCapForTesting(cap: number): void {
|
||||
// Cap must be positive — a zero-sized ring would divide by zero on
|
||||
// `preserved.length % cap`. Throw loudly rather than silently land on
|
||||
// `NaN` indices that would corrupt the ring on the next push.
|
||||
if (cap < 1) {
|
||||
throw new Error(`_setHistoryCapForTesting: cap must be >= 1 (got ${cap})`)
|
||||
}
|
||||
const current = getCacheStatsHistory()
|
||||
const preserved = cap < current.length ? current.slice(-cap) : current
|
||||
state.history = new Array(cap)
|
||||
for (let i = 0; i < preserved.length; i++) {
|
||||
state.history[i] = preserved[i]
|
||||
}
|
||||
state.historyWriteIdx = preserved.length % cap
|
||||
state.historySize = preserved.length
|
||||
state.historyMax = cap
|
||||
}
|
||||
@@ -1,4 +1,5 @@
|
||||
import { APIError } from '@anthropic-ai/sdk'
|
||||
import { buildAnthropicUsageFromRawUsage } from './cacheMetrics.js'
|
||||
import { compressToolHistory } from './compressToolHistory.js'
|
||||
import { fetchWithProxyRetry } from './fetchWithProxyRetry.js'
|
||||
import type {
|
||||
@@ -78,21 +79,12 @@ type CodexSseEvent = {
|
||||
data: Record<string, any>
|
||||
}
|
||||
|
||||
function makeUsage(usage?: {
|
||||
input_tokens?: number
|
||||
output_tokens?: number
|
||||
input_tokens_details?: { cached_tokens?: number }
|
||||
prompt_tokens_details?: { cached_tokens?: number }
|
||||
}): AnthropicUsage {
|
||||
return {
|
||||
input_tokens: usage?.input_tokens ?? 0,
|
||||
output_tokens: usage?.output_tokens ?? 0,
|
||||
cache_creation_input_tokens: 0,
|
||||
cache_read_input_tokens:
|
||||
usage?.input_tokens_details?.cached_tokens ??
|
||||
usage?.prompt_tokens_details?.cached_tokens ??
|
||||
0,
|
||||
}
|
||||
function makeUsage(usage?: Record<string, unknown>): AnthropicUsage {
|
||||
// Single source of truth for raw → Anthropic shape. Lives in
|
||||
// cacheMetrics.ts alongside the raw-shape extractor so any new
|
||||
// provider quirk requires a one-file change and the integration test
|
||||
// can call the exact same function instead of re-implementing it.
|
||||
return buildAnthropicUsageFromRawUsage(usage)
|
||||
}
|
||||
|
||||
function makeMessageId(): string {
|
||||
@@ -911,18 +903,14 @@ export async function* codexStreamToAnthropic(
|
||||
stop_reason: determineStopReason(finalResponse, sawToolUse),
|
||||
stop_sequence: null,
|
||||
},
|
||||
usage: {
|
||||
// Subtract cached tokens: OpenAI includes them in input_tokens,
|
||||
// but Anthropic convention treats input_tokens as non-cached only.
|
||||
input_tokens: (finalResponse?.usage?.input_tokens ?? 0) -
|
||||
(finalResponse?.usage?.input_tokens_details?.cached_tokens ??
|
||||
finalResponse?.usage?.prompt_tokens_details?.cached_tokens ?? 0),
|
||||
output_tokens: finalResponse?.usage?.output_tokens ?? 0,
|
||||
cache_read_input_tokens:
|
||||
finalResponse?.usage?.input_tokens_details?.cached_tokens ??
|
||||
finalResponse?.usage?.prompt_tokens_details?.cached_tokens ??
|
||||
0,
|
||||
},
|
||||
// Delegate to the shared normalizer so the streaming message_delta
|
||||
// path uses the same raw→Anthropic conversion as makeUsage() above
|
||||
// and the non-streaming response converter below. Previously this
|
||||
// block had its own inline subtraction that missed Kimi / DeepSeek
|
||||
// / Gemini raw shapes that the shared helper handles.
|
||||
usage: makeUsage(
|
||||
finalResponse?.usage as Record<string, unknown> | undefined,
|
||||
),
|
||||
}
|
||||
yield { type: 'message_stop' }
|
||||
}
|
||||
|
||||
@@ -46,6 +46,7 @@ import {
|
||||
type AnthropicUsage,
|
||||
type ShimCreateParams,
|
||||
} from './codexShim.js'
|
||||
import { buildAnthropicUsageFromRawUsage } from './cacheMetrics.js'
|
||||
import { compressToolHistory } from './compressToolHistory.js'
|
||||
import { fetchWithProxyRetry } from './fetchWithProxyRetry.js'
|
||||
import {
|
||||
@@ -845,16 +846,12 @@ function convertChunkUsage(
|
||||
usage: OpenAIStreamChunk['usage'] | undefined,
|
||||
): Partial<AnthropicUsage> | undefined {
|
||||
if (!usage) return undefined
|
||||
|
||||
const cached = usage.prompt_tokens_details?.cached_tokens ?? 0
|
||||
return {
|
||||
// Subtract cached tokens: OpenAI includes them in prompt_tokens,
|
||||
// but Anthropic convention treats input_tokens as non-cached only.
|
||||
input_tokens: (usage.prompt_tokens ?? 0) - cached,
|
||||
output_tokens: usage.completion_tokens ?? 0,
|
||||
cache_creation_input_tokens: 0,
|
||||
cache_read_input_tokens: cached,
|
||||
}
|
||||
// Delegates to the shared helper so this path, codexShim.makeUsage,
|
||||
// the non-streaming response below, and the integration tests all
|
||||
// produce byte-identical output for the same raw input.
|
||||
return buildAnthropicUsageFromRawUsage(
|
||||
usage as unknown as Record<string, unknown>,
|
||||
)
|
||||
}
|
||||
|
||||
const JSON_REPAIR_SUFFIXES = [
|
||||
@@ -2154,12 +2151,9 @@ class OpenAIShimMessages {
|
||||
model: data.model ?? model,
|
||||
stop_reason: stopReason,
|
||||
stop_sequence: null,
|
||||
usage: {
|
||||
input_tokens: data.usage?.prompt_tokens ?? 0,
|
||||
output_tokens: data.usage?.completion_tokens ?? 0,
|
||||
cache_creation_input_tokens: 0,
|
||||
cache_read_input_tokens: data.usage?.prompt_tokens_details?.cached_tokens ?? 0,
|
||||
},
|
||||
usage: buildAnthropicUsageFromRawUsage(
|
||||
data.usage as unknown as Record<string, unknown> | undefined,
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,8 @@
|
||||
import { feature } from 'bun:bundle'
|
||||
import { getRemoteControlAtStartup } from '../../utils/config.js'
|
||||
import {
|
||||
getRemoteControlAtStartup,
|
||||
SHOW_CACHE_STATS_MODES,
|
||||
} from '../../utils/config.js'
|
||||
import {
|
||||
EDITOR_MODES,
|
||||
NOTIFICATION_CHANNELS,
|
||||
@@ -77,6 +80,13 @@ export const SUPPORTED_SETTINGS: Record<string, SettingConfig> = {
|
||||
description:
|
||||
'Show turn duration message after responses (e.g., "Cooked for 1m 6s")',
|
||||
},
|
||||
showCacheStats: {
|
||||
source: 'global',
|
||||
type: 'string',
|
||||
description:
|
||||
'Show per-query cache hit/miss summary at end of turn (off | compact | full)',
|
||||
options: SHOW_CACHE_STATS_MODES,
|
||||
},
|
||||
terminalProgressBarEnabled: {
|
||||
source: 'global',
|
||||
type: 'boolean',
|
||||
|
||||
126
src/utils/config.showCacheStats.test.ts
Normal file
126
src/utils/config.showCacheStats.test.ts
Normal file
@@ -0,0 +1,126 @@
|
||||
import { expect, test, describe } from 'bun:test'
|
||||
import { z } from 'zod'
|
||||
import {
|
||||
DEFAULT_GLOBAL_CONFIG,
|
||||
GLOBAL_CONFIG_KEYS,
|
||||
isGlobalConfigKey,
|
||||
SHOW_CACHE_STATS_MODES,
|
||||
type GlobalConfig,
|
||||
} from './config.js'
|
||||
|
||||
// Standalone Zod schema mirroring the runtime contract for showCacheStats.
|
||||
// The config file does not carry a Zod schema per field (GlobalConfig is a
|
||||
// plain TS type with defaults), so we exercise validation here so that any
|
||||
// future drift — e.g. adding a mode without updating the UI — is caught at
|
||||
// test time rather than silently rendered in /config.
|
||||
const ShowCacheStatsSchema = z.enum(SHOW_CACHE_STATS_MODES)
|
||||
|
||||
describe('GlobalConfig — showCacheStats registration', () => {
|
||||
test('default is "compact"', () => {
|
||||
expect(DEFAULT_GLOBAL_CONFIG.showCacheStats).toBe('compact')
|
||||
})
|
||||
|
||||
test('is listed in GLOBAL_CONFIG_KEYS (exposed via /config and ConfigTool)', () => {
|
||||
expect(GLOBAL_CONFIG_KEYS).toContain('showCacheStats')
|
||||
expect(isGlobalConfigKey('showCacheStats')).toBe(true)
|
||||
})
|
||||
|
||||
test('SHOW_CACHE_STATS_MODES is the single source of truth', () => {
|
||||
expect(SHOW_CACHE_STATS_MODES).toEqual(['off', 'compact', 'full'])
|
||||
})
|
||||
})
|
||||
|
||||
describe('showCacheStats — Zod validation', () => {
|
||||
test('accepts "off"', () => {
|
||||
expect(ShowCacheStatsSchema.parse('off')).toBe('off')
|
||||
})
|
||||
|
||||
test('accepts "compact"', () => {
|
||||
expect(ShowCacheStatsSchema.parse('compact')).toBe('compact')
|
||||
})
|
||||
|
||||
test('accepts "full"', () => {
|
||||
expect(ShowCacheStatsSchema.parse('full')).toBe('full')
|
||||
})
|
||||
|
||||
test('rejects arbitrary strings', () => {
|
||||
expect(() => ShowCacheStatsSchema.parse('verbose')).toThrow()
|
||||
expect(() => ShowCacheStatsSchema.parse('')).toThrow()
|
||||
expect(() => ShowCacheStatsSchema.parse('ON')).toThrow()
|
||||
})
|
||||
|
||||
test('rejects non-string values', () => {
|
||||
expect(() => ShowCacheStatsSchema.parse(true)).toThrow()
|
||||
expect(() => ShowCacheStatsSchema.parse(1)).toThrow()
|
||||
expect(() => ShowCacheStatsSchema.parse(null)).toThrow()
|
||||
expect(() => ShowCacheStatsSchema.parse(undefined)).toThrow()
|
||||
})
|
||||
})
|
||||
|
||||
describe('showCacheStats — GlobalConfig type surface', () => {
|
||||
test('assignable to each accepted mode without casting', () => {
|
||||
const a: Pick<GlobalConfig, 'showCacheStats'> = { showCacheStats: 'off' }
|
||||
const b: Pick<GlobalConfig, 'showCacheStats'> = { showCacheStats: 'compact' }
|
||||
const c: Pick<GlobalConfig, 'showCacheStats'> = { showCacheStats: 'full' }
|
||||
expect([a.showCacheStats, b.showCacheStats, c.showCacheStats]).toEqual([
|
||||
'off',
|
||||
'compact',
|
||||
'full',
|
||||
])
|
||||
})
|
||||
})
|
||||
|
||||
describe('showCacheStats — default applies to pre-existing configs', () => {
|
||||
// Review feedback (P2 #7): "ensure the schema explicitly sets
|
||||
// showCacheStats: 'compact' as the default value, not relying on the
|
||||
// REPL gate's undefined handling."
|
||||
//
|
||||
// Config layer at src/utils/config.ts:1494 already does
|
||||
// { ...createDefault(), ...parsedConfig }
|
||||
// so a user who had a config file from before this PR gets the
|
||||
// 'compact' default automatically on first load. These tests pin that
|
||||
// behavior so a future refactor of the merge pattern surfaces the
|
||||
// regression loudly.
|
||||
|
||||
test('legacy config without showCacheStats field merges to default', () => {
|
||||
// Simulate what getConfig() produces for an old config.json that
|
||||
// predates this PR: spread default first, then spread the loaded
|
||||
// (incomplete) object on top.
|
||||
const legacyLoadedConfig = {
|
||||
// Fields typical of a pre-PR config — anything real but no
|
||||
// showCacheStats. The exact shape doesn't matter; we're testing
|
||||
// the merge semantics.
|
||||
theme: 'dark' as const,
|
||||
}
|
||||
const merged = {
|
||||
...DEFAULT_GLOBAL_CONFIG,
|
||||
...legacyLoadedConfig,
|
||||
}
|
||||
expect(merged.showCacheStats).toBe('compact')
|
||||
})
|
||||
|
||||
test('user-set value overrides default via merge', () => {
|
||||
// Counterpart: if the user has explicitly set a value, the merge
|
||||
// must preserve it (defaults must NOT clobber user intent).
|
||||
const userConfig = { showCacheStats: 'off' as const }
|
||||
const merged = {
|
||||
...DEFAULT_GLOBAL_CONFIG,
|
||||
...userConfig,
|
||||
}
|
||||
expect(merged.showCacheStats).toBe('off')
|
||||
})
|
||||
|
||||
test('REPL gate fallback kicks in only when mode is undefined', () => {
|
||||
// Belt-and-suspenders from REPL.tsx:3031 — `?? 'compact'` after the
|
||||
// config read. Simulates the code path in case a pathological config
|
||||
// read returns an empty object and skips the merge entirely.
|
||||
const corruptConfigRead: Partial<GlobalConfig> = {}
|
||||
const mode = corruptConfigRead.showCacheStats ?? 'compact'
|
||||
expect(mode).toBe('compact')
|
||||
|
||||
// Explicit 'off' is preserved — fallback must not clobber user intent.
|
||||
const explicitOff: Partial<GlobalConfig> = { showCacheStats: 'off' }
|
||||
const modeOff = explicitOff.showCacheStats ?? 'compact'
|
||||
expect(modeOff).toBe('off')
|
||||
})
|
||||
})
|
||||
@@ -179,6 +179,9 @@ export type EditorMode = 'emacs' | (typeof EDITOR_MODES)[number]
|
||||
|
||||
export type DiffTool = 'terminal' | 'auto'
|
||||
|
||||
export type ShowCacheStatsMode = 'off' | 'compact' | 'full'
|
||||
export const SHOW_CACHE_STATS_MODES = ['off', 'compact', 'full'] as const satisfies readonly ShowCacheStatsMode[]
|
||||
|
||||
export type OutputStyle = string
|
||||
|
||||
export type Providers = typeof PROVIDERS[number]
|
||||
@@ -246,6 +249,11 @@ export type GlobalConfig = {
|
||||
autoCompactEnabled: boolean // Controls whether auto-compact is enabled
|
||||
toolHistoryCompressionEnabled: boolean // Compress old tool_result content for small-context providers
|
||||
showTurnDuration: boolean // Controls whether to show turn duration message (e.g., "Cooked for 1m 6s")
|
||||
// Controls whether to show per-query cache hit/miss stats at the end of each turn.
|
||||
// 'off' — no display
|
||||
// 'compact' — one-line summary (e.g. "[Cache: 1.2k read • hit 12%]")
|
||||
// 'full' — breakdown (read / created / hit-rate) per query
|
||||
showCacheStats: ShowCacheStatsMode
|
||||
/**
|
||||
* @deprecated Use settings.env instead.
|
||||
*/
|
||||
@@ -628,6 +636,7 @@ function createDefaultGlobalConfig(): GlobalConfig {
|
||||
autoCompactEnabled: true,
|
||||
toolHistoryCompressionEnabled: true,
|
||||
showTurnDuration: true,
|
||||
showCacheStats: 'compact',
|
||||
hasSeenTasksHint: false,
|
||||
hasUsedStash: false,
|
||||
hasUsedBackgroundTask: false,
|
||||
@@ -677,6 +686,7 @@ export const GLOBAL_CONFIG_KEYS = [
|
||||
'autoCompactEnabled',
|
||||
'toolHistoryCompressionEnabled',
|
||||
'showTurnDuration',
|
||||
'showCacheStats',
|
||||
'diffTool',
|
||||
'env',
|
||||
'tipsHistory',
|
||||
|
||||
Reference in New Issue
Block a user