feat: add /cache-probe diagnostic command (#580)

Add a /cache-probe slash command for debugging prompt caching behaviour
on OpenAI-compatible providers (GitHub Copilot, OpenAI direct).

The command sends two identical API requests in sequence and compares the
raw server response usage stats, showing:
- Input/output token counts
- Cache read tokens (from prompt_tokens_details or input_tokens_details)
- Latency for each request
- Cache hit rate percentage

Usage:
  /cache-probe                    # test default model
  /cache-probe claude-sonnet-4    # test specific model
  /cache-probe gpt-5.4 --no-key  # test without prompt_cache_key

The --no-key flag omits prompt_cache_key/prompt_cache_retention/store to
test whether the server does content-based auto-caching (it does on
GitHub Copilot).

This is a debugging/diagnostic tool, not intended for regular use. It was
instrumental in discovering that:
1. Copilot auto-caches server-side based on content hash
2. prompt_cache_key is ignored by the proxy
3. The streaming path was not reporting cached tokens

Only enabled when the provider is OpenAI or GitHub (not for firstParty
Anthropic which has different caching semantics).

Related: #515

Co-authored-by: Zartris <14197299+Zartris@users.noreply.github.com>
This commit is contained in:
Zartris
2026-04-10 15:34:38 +02:00
committed by GitHub
parent 598651f423
commit 9ccaa7a675
3 changed files with 432 additions and 0 deletions

View File

@@ -0,0 +1,413 @@
import { getSessionId } from '../../bootstrap/state.js'
import { resolveProviderRequest } from '../../services/api/providerConfig.js'
import type { LocalCommandCall } from '../../types/command.js'
import { logForDebugging } from '../../utils/debug.js'
import { isEnvTruthy } from '../../utils/envUtils.js'
import { hydrateGithubModelsTokenFromSecureStorage } from '../../utils/githubModelsCredentials.js'
import { getMainLoopModel } from '../../utils/model/model.js'
const COPILOT_HEADERS: Record<string, string> = {
'User-Agent': 'GitHubCopilotChat/0.26.7',
'Editor-Version': 'vscode/1.99.3',
'Editor-Plugin-Version': 'copilot-chat/0.26.7',
'Copilot-Integration-Id': 'vscode-chat',
}
// Large system prompt (~6000 chars, ~1500 tokens) to cross the 1024-token cache threshold
const SYSTEM_PROMPT = [
'You are a coding assistant. Answer concisely.',
'CONTEXT: User is working on a TypeScript project with Bun runtime.',
...Array.from(
{ length: 80 },
(_, i) =>
`Rule ${i + 1}: Follow best practices for TypeScript including strict typing, error handling, testing, and clean code. Prefer explicit types over any. Use const assertions. Await all async operations.`,
),
].join('\n\n')
const USER_MESSAGE = 'Say "hello" and nothing else.'
const DELAY_MS = 3000
/**
* Extract model family from a versioned model string.
* e.g. "gpt-5.4-0626" → "gpt-5.4", "codex-mini-latest" → "codex-mini"
*/
function getModelFamily(model: string | undefined): string {
if (!model) return 'unknown'
return model
.replace(/-\d{4,}$/, '')
.replace(/-latest$/, '')
.replace(/-preview$/, '')
}
function getField(obj: unknown, path: string): unknown {
return path
.split('.')
.reduce((o: any, k: string) => (o != null ? o[k] : undefined), obj)
}
interface ProbeResult {
label: string
status: number
elapsed: number
headers: Record<string, string>
usage: Record<string, unknown> | null
responseId: string | null
error: string | null
}
async function sendProbe(
url: string,
headers: Record<string, string>,
body: Record<string, unknown>,
label: string,
): Promise<ProbeResult> {
const start = Date.now()
let response: Response
try {
response = await fetch(url, {
method: 'POST',
headers,
body: JSON.stringify(body),
})
} catch (err: any) {
return {
label,
status: 0,
elapsed: Date.now() - start,
headers: {},
usage: null,
responseId: null,
error: err.message,
}
}
const elapsed = Date.now() - start
const respHeaders: Record<string, string> = {}
response.headers.forEach((value, key) => {
respHeaders[key] = value
})
if (!response.ok) {
const errorBody = await response.text().catch(() => '')
return {
label,
status: response.status,
elapsed,
headers: respHeaders,
usage: null,
responseId: null,
error: errorBody,
}
}
// Parse SSE stream for usage data
const text = await response.text()
let usage: Record<string, unknown> | null = null
let responseId: string | null = null
const isResponses = url.endsWith('/responses')
for (const chunk of text.split('\n\n')) {
const lines = chunk
.split('\n')
.map((l) => l.trim())
.filter(Boolean)
if (isResponses) {
const eventLine = lines.find((l) => l.startsWith('event: '))
const dataLines = lines.filter((l) => l.startsWith('data: '))
if (!eventLine || !dataLines.length) continue
const event = eventLine.slice(7).trim()
if (
event === 'response.completed' ||
event === 'response.incomplete'
) {
try {
const data = JSON.parse(
dataLines.map((l) => l.slice(6)).join('\n'),
)
usage = (data?.response?.usage as Record<string, unknown>) ?? null
responseId = (data?.response?.id as string) ?? null
} catch {}
}
} else {
for (const line of lines) {
if (!line.startsWith('data: ')) continue
const raw = line.slice(6).trim()
if (raw === '[DONE]') continue
try {
const data = JSON.parse(raw) as Record<string, unknown>
if (data.usage) {
usage = data.usage as Record<string, unknown>
responseId = (data.id as string) ?? null
}
} catch {}
}
}
}
return { label, status: response.status, elapsed, headers: respHeaders, usage, responseId, error: null }
}
function formatResult(r: ProbeResult): string {
const lines: string[] = [`--- ${r.label} ---`]
if (r.error) {
lines.push(` ERROR (HTTP ${r.status}): ${r.error.slice(0, 200)}`)
return lines.join('\n')
}
lines.push(` HTTP ${r.status}${r.elapsed}ms`)
if (r.responseId) lines.push(` response.id: ${r.responseId}`)
if (r.usage) {
lines.push(' Usage:')
lines.push(` ${JSON.stringify(r.usage, null, 2).replace(/\n/g, '\n ')}`)
} else {
lines.push(' Usage: null')
}
// Interesting headers
for (const h of [
'openai-processing-ms',
'x-ratelimit-remaining',
'x-ratelimit-limit',
'x-ms-region',
'x-github-request-id',
'x-request-id',
]) {
if (r.headers[h]) lines.push(` ${h}: ${r.headers[h]}`)
}
return lines.join('\n')
}
export const call: LocalCommandCall = async (args) => {
const parts = (args ?? '').trim().split(/\s+/).filter(Boolean)
const noKey = parts.includes('--no-key')
const modelOverride = parts.find((p) => !p.startsWith('--')) || undefined
const modelStr = modelOverride ?? getMainLoopModel()
const request = resolveProviderRequest({ model: modelStr })
const isGithub = isEnvTruthy(process.env.CLAUDE_CODE_USE_GITHUB)
// Resolve API key the same way the OpenAI shim does
let apiKey = process.env.OPENAI_API_KEY ?? ''
if (!apiKey && isGithub) {
hydrateGithubModelsTokenFromSecureStorage()
apiKey =
process.env.OPENAI_API_KEY ??
process.env.GITHUB_TOKEN ??
process.env.GH_TOKEN ??
''
}
if (!apiKey) {
return {
type: 'text',
value:
'No API key found. Make sure you are in an active OpenAI-compatible or GitHub Copilot session.\n' +
'For GitHub Copilot: run /onboard-github first.\n' +
'For OpenAI-compatible: set OPENAI_API_KEY.',
}
}
const useResponses = request.transport === 'codex_responses'
const endpoint = useResponses ? '/responses' : '/chat/completions'
const url = `${request.baseUrl}${endpoint}`
const family = getModelFamily(request.resolvedModel)
const cacheKey = `${getSessionId()}:${family}`
const headers: Record<string, string> = {
'Content-Type': 'application/json',
Authorization: `Bearer ${apiKey}`,
originator: 'openclaude',
}
if (isGithub) {
Object.assign(headers, COPILOT_HEADERS)
}
let body: Record<string, unknown>
if (useResponses) {
body = {
model: request.resolvedModel,
instructions: SYSTEM_PROMPT,
input: [
{
type: 'message',
role: 'user',
content: [{ type: 'input_text', text: USER_MESSAGE }],
},
],
stream: true,
...(noKey ? {} : {
store: false,
prompt_cache_key: cacheKey,
prompt_cache_retention: '24h',
}),
}
} else {
body = {
model: request.resolvedModel,
messages: [
{ role: 'system', content: SYSTEM_PROMPT },
{ role: 'user', content: USER_MESSAGE },
],
stream: true,
stream_options: { include_usage: true },
max_tokens: 20,
...(noKey ? {} : {
store: false,
prompt_cache_key: cacheKey,
}),
}
}
// Log configuration
const config = [
`[cache-probe] Starting cache probe${noKey ? ' (--no-key: cache params OMITTED)' : ''}`,
` model: ${request.resolvedModel} (family: ${family})`,
` transport: ${request.transport}`,
` endpoint: ${url}`,
` prompt_cache_key: ${noKey ? 'NOT SENT' : cacheKey}`,
` store: ${noKey ? 'NOT SENT' : 'false'}`,
` system prompt: ~${Math.round(SYSTEM_PROMPT.length / 4)} tokens`,
` delay between calls: ${DELAY_MS}ms`,
].join('\n')
logForDebugging(config)
// Call 1 — Cold
const r1 = await sendProbe(url, headers, body, 'CALL 1 — Cold (no cache)')
logForDebugging(`[cache-probe]\n${formatResult(r1)}`)
if (r1.error) {
return {
type: 'text',
value: `Cache probe failed on first call: HTTP ${r1.status}\n${r1.error.slice(0, 300)}\n\nFull details in debug log.`,
}
}
// Wait
await new Promise((r) => setTimeout(r, DELAY_MS))
// Call 2 — Warm
const r2 = await sendProbe(url, headers, body, 'CALL 2 — Warm (cache expected)')
logForDebugging(`[cache-probe]\n${formatResult(r2)}`)
// --- Comparison ---
const fields = [
'input_tokens',
'output_tokens',
'total_tokens',
'prompt_tokens',
'completion_tokens',
'input_tokens_details.cached_tokens',
'prompt_tokens_details.cached_tokens',
'output_tokens_details.reasoning_tokens',
]
const comparison: string[] = ['[cache-probe] COMPARISON']
comparison.push(
` ${'Field'.padEnd(42)} ${'Call 1'.padStart(8)} ${'Call 2'.padStart(8)} ${'Delta'.padStart(8)}`,
)
comparison.push(` ${'-'.repeat(72)}`)
for (const f of fields) {
const v1 = getField(r1.usage, f)
const v2 = getField(r2.usage, f)
if (v1 === undefined && v2 === undefined) continue
const d =
typeof v1 === 'number' && typeof v2 === 'number' ? v2 - v1 : ''
comparison.push(
` ${f.padEnd(42)} ${String(v1 ?? '-').padStart(8)} ${String(v2 ?? '-').padStart(8)} ${String(d).padStart(8)}`,
)
}
comparison.push('')
comparison.push(
` Latency: ${r1.elapsed}ms → ${r2.elapsed}ms (${r2.elapsed - r1.elapsed > 0 ? '+' : ''}${r2.elapsed - r1.elapsed}ms)`,
)
// Header comparison
for (const h of ['openai-processing-ms', 'x-ms-region', 'x-ratelimit-remaining']) {
const v1 = r1.headers[h]
const v2 = r2.headers[h]
if (v1 || v2) {
comparison.push(` ${h}: ${v1 ?? '-'}${v2 ?? '-'}`)
}
}
// Verdict
const cached2 =
(getField(r2.usage, 'input_tokens_details.cached_tokens') as number) ??
(getField(r2.usage, 'prompt_tokens_details.cached_tokens') as number) ??
0
const input1 =
((r1.usage?.input_tokens ?? r1.usage?.prompt_tokens) as number) ?? 0
const input2 =
((r2.usage?.input_tokens ?? r2.usage?.prompt_tokens) as number) ?? 0
let verdict: string
if (cached2 > 0) {
const rate = input2 > 0 ? Math.round((cached2 / input2) * 100) : '?'
verdict = `CACHE HIT: ${cached2} cached tokens (${rate}% of input)`
} else if (input1 === 0 && input2 === 0) {
verdict = 'INCONCLUSIVE: Server returns 0 input_tokens — cannot measure'
} else if (r2.elapsed < r1.elapsed * 0.6 && input1 > 100) {
verdict = `POSSIBLE SILENT CACHING: Call 2 was ${Math.round((1 - r2.elapsed / r1.elapsed) * 100)}% faster but no cached_tokens reported`
} else {
verdict = 'NO CACHE DETECTED'
}
comparison.push(`\n Verdict: ${verdict}`)
// --- Simulate what main's shim code does with this usage ---
// codexShim.ts makeUsage() — used for Responses API (GPT-5+/Codex)
function mainMakeUsage(u: any) {
return {
input_tokens: u?.input_tokens ?? 0,
output_tokens: u?.output_tokens ?? 0,
cache_creation_input_tokens: 0,
cache_read_input_tokens: 0, // ← main hardcodes this to 0
}
}
// openaiShim.ts convertChunkUsage() — used for Chat Completions
function mainConvertChunkUsage(u: any) {
return {
input_tokens: u?.prompt_tokens ?? 0,
output_tokens: u?.completion_tokens ?? 0,
cache_creation_input_tokens: 0,
cache_read_input_tokens: u?.prompt_tokens_details?.cached_tokens ?? 0,
}
}
const shimFn = useResponses ? mainMakeUsage : mainConvertChunkUsage
const shim1 = shimFn(r1.usage)
const shim2 = shimFn(r2.usage)
comparison.push('')
comparison.push(` --- What main's shim reports (${useResponses ? 'codexShim.makeUsage' : 'openaiShim.convertChunkUsage'}) ---`)
comparison.push(` Call 1: cache_read_input_tokens=${shim1.cache_read_input_tokens}`)
comparison.push(` Call 2: cache_read_input_tokens=${shim2.cache_read_input_tokens}`)
if (useResponses && cached2 > 0) {
comparison.push(` BUG: Server returned ${cached2} cached tokens but main's makeUsage() drops it → reports 0`)
} else if (!useResponses && shim2.cache_read_input_tokens > 0) {
comparison.push(` OK: Chat Completions path on main correctly reads cached_tokens`)
}
logForDebugging(comparison.join('\n'))
// User-facing summary
const mode = noKey ? ' (NO cache key sent)' : ''
const shimLabel = useResponses ? 'codexShim.makeUsage()' : 'openaiShim.convertChunkUsage()'
const summary = [
`Cache Probe — ${request.resolvedModel} via ${useResponses ? 'Responses API' : 'Chat Completions'}${mode}`,
'',
`Call 1: ${r1.elapsed}ms, input=${input1}, cached=${(getField(r1.usage, 'input_tokens_details.cached_tokens') as number) ?? (getField(r1.usage, 'prompt_tokens_details.cached_tokens') as number) ?? 0}`,
`Call 2: ${r2.elapsed}ms, input=${input2}, cached=${cached2}`,
'',
verdict,
'',
`What main's ${shimLabel} reports:`,
` Call 2 cache_read_input_tokens = ${shim2.cache_read_input_tokens}${useResponses && cached2 > 0 ? ' ← BUG: server sent ' + cached2 + ' but main drops it' : ''}`,
'',
'Full details written to debug log.',
].join('\n')
return { type: 'text', value: summary }
}