feat: add /cache-probe diagnostic command (#580)

Add a /cache-probe slash command for debugging prompt caching behaviour on OpenAI-compatible providers (GitHub Copilot, OpenAI direct). The command sends two identical API requests in sequence and compares the raw server response usage stats, showing: - Input/output token counts - Cache read tokens (from prompt_tokens_details or input_tokens_details) - Latency for each request - Cache hit rate percentage Usage: /cache-probe # test default model /cache-probe claude-sonnet-4 # test specific model /cache-probe gpt-5.4 --no-key # test without prompt_cache_key The --no-key flag omits prompt_cache_key/prompt_cache_retention/store to test whether the server does content-based auto-caching (it does on GitHub Copilot). This is a debugging/diagnostic tool, not intended for regular use. It was instrumental in discovering that: 1. Copilot auto-caches server-side based on content hash 2. prompt_cache_key is ignored by the proxy 3. The streaming path was not reporting cached tokens Only enabled when the provider is OpenAI or GitHub (not for firstParty Anthropic which has different caching semantics). Related: #515 Co-authored-by: Zartris <14197299+Zartris@users.noreply.github.com>
2026-04-10 15:34:38 +02:00
parent 598651f423
commit 9ccaa7a675
3 changed files with 432 additions and 0 deletions
--- a/src/commands/cache-probe/cache-probe.ts
+++ b/src/commands/cache-probe/cache-probe.ts
@@ -0,0 +1,413 @@
+import { getSessionId } from '../../bootstrap/state.js'
+import { resolveProviderRequest } from '../../services/api/providerConfig.js'
+import type { LocalCommandCall } from '../../types/command.js'
+import { logForDebugging } from '../../utils/debug.js'
+import { isEnvTruthy } from '../../utils/envUtils.js'
+import { hydrateGithubModelsTokenFromSecureStorage } from '../../utils/githubModelsCredentials.js'
+import { getMainLoopModel } from '../../utils/model/model.js'
+
+const COPILOT_HEADERS: Record<string, string> = {
+  'User-Agent': 'GitHubCopilotChat/0.26.7',
+  'Editor-Version': 'vscode/1.99.3',
+  'Editor-Plugin-Version': 'copilot-chat/0.26.7',
+  'Copilot-Integration-Id': 'vscode-chat',
+}
+
+// Large system prompt (~6000 chars, ~1500 tokens) to cross the 1024-token cache threshold
+const SYSTEM_PROMPT = [
+  'You are a coding assistant. Answer concisely.',
+  'CONTEXT: User is working on a TypeScript project with Bun runtime.',
+  ...Array.from(
+    { length: 80 },
+    (_, i) =>
+      `Rule ${i + 1}: Follow best practices for TypeScript including strict typing, error handling, testing, and clean code. Prefer explicit types over any. Use const assertions. Await all async operations.`,
+  ),
+].join('\n\n')
+
+const USER_MESSAGE = 'Say "hello" and nothing else.'
+const DELAY_MS = 3000
+
+/**
+ * Extract model family from a versioned model string.
+ * e.g. "gpt-5.4-0626" → "gpt-5.4", "codex-mini-latest" → "codex-mini"
+ */
+function getModelFamily(model: string | undefined): string {
+  if (!model) return 'unknown'
+  return model
+    .replace(/-\d{4,}$/, '')
+    .replace(/-latest$/, '')
+    .replace(/-preview$/, '')
+}
+
+function getField(obj: unknown, path: string): unknown {
+  return path
+    .split('.')
+    .reduce((o: any, k: string) => (o != null ? o[k] : undefined), obj)
+}
+
+interface ProbeResult {
+  label: string
+  status: number
+  elapsed: number
+  headers: Record<string, string>
+  usage: Record<string, unknown> | null
+  responseId: string | null
+  error: string | null
+}
+
+async function sendProbe(
+  url: string,
+  headers: Record<string, string>,
+  body: Record<string, unknown>,
+  label: string,
+): Promise<ProbeResult> {
+  const start = Date.now()
+  let response: Response
+  try {
+    response = await fetch(url, {
+      method: 'POST',
+      headers,
+      body: JSON.stringify(body),
+    })
+  } catch (err: any) {
+    return {
+      label,
+      status: 0,
+      elapsed: Date.now() - start,
+      headers: {},
+      usage: null,
+      responseId: null,
+      error: err.message,
+    }
+  }
+  const elapsed = Date.now() - start
+
+  const respHeaders: Record<string, string> = {}
+  response.headers.forEach((value, key) => {
+    respHeaders[key] = value
+  })
+
+  if (!response.ok) {
+    const errorBody = await response.text().catch(() => '')
+    return {
+      label,
+      status: response.status,
+      elapsed,
+      headers: respHeaders,
+      usage: null,
+      responseId: null,
+      error: errorBody,
+    }
+  }
+
+  // Parse SSE stream for usage data
+  const text = await response.text()
+  let usage: Record<string, unknown> | null = null
+  let responseId: string | null = null
+
+  const isResponses = url.endsWith('/responses')
+  for (const chunk of text.split('\n\n')) {
+    const lines = chunk
+      .split('\n')
+      .map((l) => l.trim())
+      .filter(Boolean)
+
+    if (isResponses) {
+      const eventLine = lines.find((l) => l.startsWith('event: '))
+      const dataLines = lines.filter((l) => l.startsWith('data: '))
+      if (!eventLine || !dataLines.length) continue
+      const event = eventLine.slice(7).trim()
+      if (
+        event === 'response.completed' ||
+        event === 'response.incomplete'
+      ) {
+        try {
+          const data = JSON.parse(
+            dataLines.map((l) => l.slice(6)).join('\n'),
+          )
+          usage = (data?.response?.usage as Record<string, unknown>) ?? null
+          responseId = (data?.response?.id as string) ?? null
+        } catch {}
+      }
+    } else {
+      for (const line of lines) {
+        if (!line.startsWith('data: ')) continue
+        const raw = line.slice(6).trim()
+        if (raw === '[DONE]') continue
+        try {
+          const data = JSON.parse(raw) as Record<string, unknown>
+          if (data.usage) {
+            usage = data.usage as Record<string, unknown>
+            responseId = (data.id as string) ?? null
+          }
+        } catch {}
+      }
+    }
+  }
+
+  return { label, status: response.status, elapsed, headers: respHeaders, usage, responseId, error: null }
+}
+
+function formatResult(r: ProbeResult): string {
+  const lines: string[] = [`--- ${r.label} ---`]
+  if (r.error) {
+    lines.push(`  ERROR (HTTP ${r.status}): ${r.error.slice(0, 200)}`)
+    return lines.join('\n')
+  }
+  lines.push(`  HTTP ${r.status} — ${r.elapsed}ms`)
+  if (r.responseId) lines.push(`  response.id: ${r.responseId}`)
+
+  if (r.usage) {
+    lines.push('  Usage:')
+    lines.push(`    ${JSON.stringify(r.usage, null, 2).replace(/\n/g, '\n    ')}`)
+  } else {
+    lines.push('  Usage: null')
+  }
+
+  // Interesting headers
+  for (const h of [
+    'openai-processing-ms',
+    'x-ratelimit-remaining',
+    'x-ratelimit-limit',
+    'x-ms-region',
+    'x-github-request-id',
+    'x-request-id',
+  ]) {
+    if (r.headers[h]) lines.push(`  ${h}: ${r.headers[h]}`)
+  }
+  return lines.join('\n')
+}
+
+export const call: LocalCommandCall = async (args) => {
+  const parts = (args ?? '').trim().split(/\s+/).filter(Boolean)
+  const noKey = parts.includes('--no-key')
+  const modelOverride = parts.find((p) => !p.startsWith('--')) || undefined
+  const modelStr = modelOverride ?? getMainLoopModel()
+  const request = resolveProviderRequest({ model: modelStr })
+  const isGithub = isEnvTruthy(process.env.CLAUDE_CODE_USE_GITHUB)
+
+  // Resolve API key the same way the OpenAI shim does
+  let apiKey = process.env.OPENAI_API_KEY ?? ''
+  if (!apiKey && isGithub) {
+    hydrateGithubModelsTokenFromSecureStorage()
+    apiKey =
+      process.env.OPENAI_API_KEY ??
+      process.env.GITHUB_TOKEN ??
+      process.env.GH_TOKEN ??
+      ''
+  }
+
+  if (!apiKey) {
+    return {
+      type: 'text',
+      value:
+        'No API key found. Make sure you are in an active OpenAI-compatible or GitHub Copilot session.\n' +
+        'For GitHub Copilot: run /onboard-github first.\n' +
+        'For OpenAI-compatible: set OPENAI_API_KEY.',
+    }
+  }
+
+  const useResponses = request.transport === 'codex_responses'
+  const endpoint = useResponses ? '/responses' : '/chat/completions'
+  const url = `${request.baseUrl}${endpoint}`
+  const family = getModelFamily(request.resolvedModel)
+  const cacheKey = `${getSessionId()}:${family}`
+
+  const headers: Record<string, string> = {
+    'Content-Type': 'application/json',
+    Authorization: `Bearer ${apiKey}`,
+    originator: 'openclaude',
+  }
+  if (isGithub) {
+    Object.assign(headers, COPILOT_HEADERS)
+  }
+
+  let body: Record<string, unknown>
+  if (useResponses) {
+    body = {
+      model: request.resolvedModel,
+      instructions: SYSTEM_PROMPT,
+      input: [
+        {
+          type: 'message',
+          role: 'user',
+          content: [{ type: 'input_text', text: USER_MESSAGE }],
+        },
+      ],
+      stream: true,
+      ...(noKey ? {} : {
+        store: false,
+        prompt_cache_key: cacheKey,
+        prompt_cache_retention: '24h',
+      }),
+    }
+  } else {
+    body = {
+      model: request.resolvedModel,
+      messages: [
+        { role: 'system', content: SYSTEM_PROMPT },
+        { role: 'user', content: USER_MESSAGE },
+      ],
+      stream: true,
+      stream_options: { include_usage: true },
+      max_tokens: 20,
+      ...(noKey ? {} : {
+        store: false,
+        prompt_cache_key: cacheKey,
+      }),
+    }
+  }
+
+  // Log configuration
+  const config = [
+    `[cache-probe] Starting cache probe${noKey ? ' (--no-key: cache params OMITTED)' : ''}`,
+    `  model: ${request.resolvedModel} (family: ${family})`,
+    `  transport: ${request.transport}`,
+    `  endpoint: ${url}`,
+    `  prompt_cache_key: ${noKey ? 'NOT SENT' : cacheKey}`,
+    `  store: ${noKey ? 'NOT SENT' : 'false'}`,
+    `  system prompt: ~${Math.round(SYSTEM_PROMPT.length / 4)} tokens`,
+    `  delay between calls: ${DELAY_MS}ms`,
+  ].join('\n')
+  logForDebugging(config)
+
+  // Call 1 — Cold
+  const r1 = await sendProbe(url, headers, body, 'CALL 1 — Cold (no cache)')
+  logForDebugging(`[cache-probe]\n${formatResult(r1)}`)
+
+  if (r1.error) {
+    return {
+      type: 'text',
+      value: `Cache probe failed on first call: HTTP ${r1.status}\n${r1.error.slice(0, 300)}\n\nFull details in debug log.`,
+    }
+  }
+
+  // Wait
+  await new Promise((r) => setTimeout(r, DELAY_MS))
+
+  // Call 2 — Warm
+  const r2 = await sendProbe(url, headers, body, 'CALL 2 — Warm (cache expected)')
+  logForDebugging(`[cache-probe]\n${formatResult(r2)}`)
+
+  // --- Comparison ---
+  const fields = [
+    'input_tokens',
+    'output_tokens',
+    'total_tokens',
+    'prompt_tokens',
+    'completion_tokens',
+    'input_tokens_details.cached_tokens',
+    'prompt_tokens_details.cached_tokens',
+    'output_tokens_details.reasoning_tokens',
+  ]
+
+  const comparison: string[] = ['[cache-probe] COMPARISON']
+  comparison.push(
+    `  ${'Field'.padEnd(42)} ${'Call 1'.padStart(8)}  ${'Call 2'.padStart(8)}  ${'Delta'.padStart(8)}`,
+  )
+  comparison.push(`  ${'-'.repeat(72)}`)
+
+  for (const f of fields) {
+    const v1 = getField(r1.usage, f)
+    const v2 = getField(r2.usage, f)
+    if (v1 === undefined && v2 === undefined) continue
+    const d =
+      typeof v1 === 'number' && typeof v2 === 'number' ? v2 - v1 : ''
+    comparison.push(
+      `  ${f.padEnd(42)} ${String(v1 ?? '-').padStart(8)}  ${String(v2 ?? '-').padStart(8)}  ${String(d).padStart(8)}`,
+    )
+  }
+
+  comparison.push('')
+  comparison.push(
+    `  Latency: ${r1.elapsed}ms → ${r2.elapsed}ms (${r2.elapsed - r1.elapsed > 0 ? '+' : ''}${r2.elapsed - r1.elapsed}ms)`,
+  )
+
+  // Header comparison
+  for (const h of ['openai-processing-ms', 'x-ms-region', 'x-ratelimit-remaining']) {
+    const v1 = r1.headers[h]
+    const v2 = r2.headers[h]
+    if (v1 || v2) {
+      comparison.push(`  ${h}: ${v1 ?? '-'} → ${v2 ?? '-'}`)
+    }
+  }
+
+  // Verdict
+  const cached2 =
+    (getField(r2.usage, 'input_tokens_details.cached_tokens') as number) ??
+    (getField(r2.usage, 'prompt_tokens_details.cached_tokens') as number) ??
+    0
+  const input1 =
+    ((r1.usage?.input_tokens ?? r1.usage?.prompt_tokens) as number) ?? 0
+  const input2 =
+    ((r2.usage?.input_tokens ?? r2.usage?.prompt_tokens) as number) ?? 0
+
+  let verdict: string
+  if (cached2 > 0) {
+    const rate = input2 > 0 ? Math.round((cached2 / input2) * 100) : '?'
+    verdict = `CACHE HIT: ${cached2} cached tokens (${rate}% of input)`
+  } else if (input1 === 0 && input2 === 0) {
+    verdict = 'INCONCLUSIVE: Server returns 0 input_tokens — cannot measure'
+  } else if (r2.elapsed < r1.elapsed * 0.6 && input1 > 100) {
+    verdict = `POSSIBLE SILENT CACHING: Call 2 was ${Math.round((1 - r2.elapsed / r1.elapsed) * 100)}% faster but no cached_tokens reported`
+  } else {
+    verdict = 'NO CACHE DETECTED'
+  }
+
+  comparison.push(`\n  Verdict: ${verdict}`)
+
+  // --- Simulate what main's shim code does with this usage ---
+  // codexShim.ts makeUsage() — used for Responses API (GPT-5+/Codex)
+  function mainMakeUsage(u: any) {
+    return {
+      input_tokens: u?.input_tokens ?? 0,
+      output_tokens: u?.output_tokens ?? 0,
+      cache_creation_input_tokens: 0,
+      cache_read_input_tokens: 0,  // ← main hardcodes this to 0
+    }
+  }
+  // openaiShim.ts convertChunkUsage() — used for Chat Completions
+  function mainConvertChunkUsage(u: any) {
+    return {
+      input_tokens: u?.prompt_tokens ?? 0,
+      output_tokens: u?.completion_tokens ?? 0,
+      cache_creation_input_tokens: 0,
+      cache_read_input_tokens: u?.prompt_tokens_details?.cached_tokens ?? 0,
+    }
+  }
+
+  const shimFn = useResponses ? mainMakeUsage : mainConvertChunkUsage
+  const shim1 = shimFn(r1.usage)
+  const shim2 = shimFn(r2.usage)
+
+  comparison.push('')
+  comparison.push(`  --- What main's shim reports (${useResponses ? 'codexShim.makeUsage' : 'openaiShim.convertChunkUsage'}) ---`)
+  comparison.push(`  Call 1: cache_read_input_tokens=${shim1.cache_read_input_tokens}`)
+  comparison.push(`  Call 2: cache_read_input_tokens=${shim2.cache_read_input_tokens}`)
+  if (useResponses && cached2 > 0) {
+    comparison.push(`  BUG: Server returned ${cached2} cached tokens but main's makeUsage() drops it → reports 0`)
+  } else if (!useResponses && shim2.cache_read_input_tokens > 0) {
+    comparison.push(`  OK: Chat Completions path on main correctly reads cached_tokens`)
+  }
+
+  logForDebugging(comparison.join('\n'))
+
+  // User-facing summary
+  const mode = noKey ? ' (NO cache key sent)' : ''
+  const shimLabel = useResponses ? 'codexShim.makeUsage()' : 'openaiShim.convertChunkUsage()'
+  const summary = [
+    `Cache Probe — ${request.resolvedModel} via ${useResponses ? 'Responses API' : 'Chat Completions'}${mode}`,
+    '',
+    `Call 1: ${r1.elapsed}ms, input=${input1}, cached=${(getField(r1.usage, 'input_tokens_details.cached_tokens') as number) ?? (getField(r1.usage, 'prompt_tokens_details.cached_tokens') as number) ?? 0}`,
+    `Call 2: ${r2.elapsed}ms, input=${input2}, cached=${cached2}`,
+    '',
+    verdict,
+    '',
+    `What main's ${shimLabel} reports:`,
+    `  Call 2 cache_read_input_tokens = ${shim2.cache_read_input_tokens}${useResponses && cached2 > 0 ? '  ← BUG: server sent ' + cached2 + ' but main drops it' : ''}`,
+    '',
+    'Full details written to debug log.',
+  ].join('\n')
+
+  return { type: 'text', value: summary }
+}