diff --git a/src/commands.ts b/src/commands.ts
index d1d876e7..6fb8c600 100644
--- a/src/commands.ts
+++ b/src/commands.ts
@@ -32,6 +32,7 @@ import logout from './commands/logout/index.js'
 import installGitHubApp from './commands/install-github-app/index.js'
 import installSlackApp from './commands/install-slack-app/index.js'
 import breakCache from './commands/break-cache/index.js'
+import cacheProbe from './commands/cache-probe/index.js'
 import mcp from './commands/mcp/index.js'
 import mobile from './commands/mobile/index.js'
 import onboarding from './commands/onboarding/index.js'
@@ -268,6 +269,7 @@ const COMMANDS = memoize((): Command[] => [
   autoFix,
   branch,
   btw,
+  cacheProbe,
   chrome,
   clear,
   color,
diff --git a/src/commands/cache-probe/cache-probe.ts b/src/commands/cache-probe/cache-probe.ts
new file mode 100644
index 00000000..e3aa9100
--- /dev/null
+++ b/src/commands/cache-probe/cache-probe.ts
@@ -0,0 +1,413 @@
+import { getSessionId } from '../../bootstrap/state.js'
+import { resolveProviderRequest } from '../../services/api/providerConfig.js'
+import type { LocalCommandCall } from '../../types/command.js'
+import { logForDebugging } from '../../utils/debug.js'
+import { isEnvTruthy } from '../../utils/envUtils.js'
+import { hydrateGithubModelsTokenFromSecureStorage } from '../../utils/githubModelsCredentials.js'
+import { getMainLoopModel } from '../../utils/model/model.js'
+
+const COPILOT_HEADERS: Record<string, string> = {
+  'User-Agent': 'GitHubCopilotChat/0.26.7',
+  'Editor-Version': 'vscode/1.99.3',
+  'Editor-Plugin-Version': 'copilot-chat/0.26.7',
+  'Copilot-Integration-Id': 'vscode-chat',
+}
+
+// Large system prompt (~6000 chars, ~1500 tokens) to cross the 1024-token cache threshold
+const SYSTEM_PROMPT = [
+  'You are a coding assistant. Answer concisely.',
+  'CONTEXT: User is working on a TypeScript project with Bun runtime.',
+  ...Array.from(
+    { length: 80 },
+    (_, i) =>
+      `Rule ${i + 1}: Follow best practices for TypeScript including strict typing, error handling, testing, and clean code. Prefer explicit types over any. Use const assertions. Await all async operations.`,
+  ),
+].join('\n\n')
+
+const USER_MESSAGE = 'Say "hello" and nothing else.'
+const DELAY_MS = 3000
+
+/**
+ * Extract model family from a versioned model string.
+ * e.g. "gpt-5.4-0626" → "gpt-5.4", "codex-mini-latest" → "codex-mini"
+ */
+function getModelFamily(model: string | undefined): string {
+  if (!model) return 'unknown'
+  return model
+    .replace(/-\d{4,}$/, '')
+    .replace(/-latest$/, '')
+    .replace(/-preview$/, '')
+}
+
+function getField(obj: unknown, path: string): unknown {
+  return path
+    .split('.')
+    .reduce((o: any, k: string) => (o != null ? o[k] : undefined), obj)
+}
+
+interface ProbeResult {
+  label: string
+  status: number
+  elapsed: number
+  headers: Record<string, string>
+  usage: Record<string, unknown> | null
+  responseId: string | null
+  error: string | null
+}
+
+async function sendProbe(
+  url: string,
+  headers: Record<string, string>,
+  body: Record<string, unknown>,
+  label: string,
+): Promise<ProbeResult> {
+  const start = Date.now()
+  let response: Response
+  try {
+    response = await fetch(url, {
+      method: 'POST',
+      headers,
+      body: JSON.stringify(body),
+    })
+  } catch (err: any) {
+    return {
+      label,
+      status: 0,
+      elapsed: Date.now() - start,
+      headers: {},
+      usage: null,
+      responseId: null,
+      error: err.message,
+    }
+  }
+  const elapsed = Date.now() - start
+
+  const respHeaders: Record<string, string> = {}
+  response.headers.forEach((value, key) => {
+    respHeaders[key] = value
+  })
+
+  if (!response.ok) {
+    const errorBody = await response.text().catch(() => '')
+    return {
+      label,
+      status: response.status,
+      elapsed,
+      headers: respHeaders,
+      usage: null,
+      responseId: null,
+      error: errorBody,
+    }
+  }
+
+  // Parse SSE stream for usage data
+  const text = await response.text()
+  let usage: Record<string, unknown> | null = null
+  let responseId: string | null = null
+
+  const isResponses = url.endsWith('/responses')
+  for (const chunk of text.split('\n\n')) {
+    const lines = chunk
+      .split('\n')
+      .map((l) => l.trim())
+      .filter(Boolean)
+
+    if (isResponses) {
+      const eventLine = lines.find((l) => l.startsWith('event: '))
+      const dataLines = lines.filter((l) => l.startsWith('data: '))
+      if (!eventLine || !dataLines.length) continue
+      const event = eventLine.slice(7).trim()
+      if (
+        event === 'response.completed' ||
+        event === 'response.incomplete'
+      ) {
+        try {
+          const data = JSON.parse(
+            dataLines.map((l) => l.slice(6)).join('\n'),
+          )
+          usage = (data?.response?.usage as Record<string, unknown>) ?? null
+          responseId = (data?.response?.id as string) ?? null
+        } catch {}
+      }
+    } else {
+      for (const line of lines) {
+        if (!line.startsWith('data: ')) continue
+        const raw = line.slice(6).trim()
+        if (raw === '[DONE]') continue
+        try {
+          const data = JSON.parse(raw) as Record<string, unknown>
+          if (data.usage) {
+            usage = data.usage as Record<string, unknown>
+            responseId = (data.id as string) ?? null
+          }
+        } catch {}
+      }
+    }
+  }
+
+  return { label, status: response.status, elapsed, headers: respHeaders, usage, responseId, error: null }
+}
+
+function formatResult(r: ProbeResult): string {
+  const lines: string[] = [`--- ${r.label} ---`]
+  if (r.error) {
+    lines.push(`  ERROR (HTTP ${r.status}): ${r.error.slice(0, 200)}`)
+    return lines.join('\n')
+  }
+  lines.push(`  HTTP ${r.status} — ${r.elapsed}ms`)
+  if (r.responseId) lines.push(`  response.id: ${r.responseId}`)
+
+  if (r.usage) {
+    lines.push('  Usage:')
+    lines.push(`    ${JSON.stringify(r.usage, null, 2).replace(/\n/g, '\n    ')}`)
+  } else {
+    lines.push('  Usage: null')
+  }
+
+  // Interesting headers
+  for (const h of [
+    'openai-processing-ms',
+    'x-ratelimit-remaining',
+    'x-ratelimit-limit',
+    'x-ms-region',
+    'x-github-request-id',
+    'x-request-id',
+  ]) {
+    if (r.headers[h]) lines.push(`  ${h}: ${r.headers[h]}`)
+  }
+  return lines.join('\n')
+}
+
+export const call: LocalCommandCall = async (args) => {
+  const parts = (args ?? '').trim().split(/\s+/).filter(Boolean)
+  const noKey = parts.includes('--no-key')
+  const modelOverride = parts.find((p) => !p.startsWith('--')) || undefined
+  const modelStr = modelOverride ?? getMainLoopModel()
+  const request = resolveProviderRequest({ model: modelStr })
+  const isGithub = isEnvTruthy(process.env.CLAUDE_CODE_USE_GITHUB)
+
+  // Resolve API key the same way the OpenAI shim does
+  let apiKey = process.env.OPENAI_API_KEY ?? ''
+  if (!apiKey && isGithub) {
+    hydrateGithubModelsTokenFromSecureStorage()
+    apiKey =
+      process.env.OPENAI_API_KEY ??
+      process.env.GITHUB_TOKEN ??
+      process.env.GH_TOKEN ??
+      ''
+  }
+
+  if (!apiKey) {
+    return {
+      type: 'text',
+      value:
+        'No API key found. Make sure you are in an active OpenAI-compatible or GitHub Copilot session.\n' +
+        'For GitHub Copilot: run /onboard-github first.\n' +
+        'For OpenAI-compatible: set OPENAI_API_KEY.',
+    }
+  }
+
+  const useResponses = request.transport === 'codex_responses'
+  const endpoint = useResponses ? '/responses' : '/chat/completions'
+  const url = `${request.baseUrl}${endpoint}`
+  const family = getModelFamily(request.resolvedModel)
+  const cacheKey = `${getSessionId()}:${family}`
+
+  const headers: Record<string, string> = {
+    'Content-Type': 'application/json',
+    Authorization: `Bearer ${apiKey}`,
+    originator: 'openclaude',
+  }
+  if (isGithub) {
+    Object.assign(headers, COPILOT_HEADERS)
+  }
+
+  let body: Record<string, unknown>
+  if (useResponses) {
+    body = {
+      model: request.resolvedModel,
+      instructions: SYSTEM_PROMPT,
+      input: [
+        {
+          type: 'message',
+          role: 'user',
+          content: [{ type: 'input_text', text: USER_MESSAGE }],
+        },
+      ],
+      stream: true,
+      ...(noKey ? {} : {
+        store: false,
+        prompt_cache_key: cacheKey,
+        prompt_cache_retention: '24h',
+      }),
+    }
+  } else {
+    body = {
+      model: request.resolvedModel,
+      messages: [
+        { role: 'system', content: SYSTEM_PROMPT },
+        { role: 'user', content: USER_MESSAGE },
+      ],
+      stream: true,
+      stream_options: { include_usage: true },
+      max_tokens: 20,
+      ...(noKey ? {} : {
+        store: false,
+        prompt_cache_key: cacheKey,
+      }),
+    }
+  }
+
+  // Log configuration
+  const config = [
+    `[cache-probe] Starting cache probe${noKey ? ' (--no-key: cache params OMITTED)' : ''}`,
+    `  model: ${request.resolvedModel} (family: ${family})`,
+    `  transport: ${request.transport}`,
+    `  endpoint: ${url}`,
+    `  prompt_cache_key: ${noKey ? 'NOT SENT' : cacheKey}`,
+    `  store: ${noKey ? 'NOT SENT' : 'false'}`,
+    `  system prompt: ~${Math.round(SYSTEM_PROMPT.length / 4)} tokens`,
+    `  delay between calls: ${DELAY_MS}ms`,
+  ].join('\n')
+  logForDebugging(config)
+
+  // Call 1 — Cold
+  const r1 = await sendProbe(url, headers, body, 'CALL 1 — Cold (no cache)')
+  logForDebugging(`[cache-probe]\n${formatResult(r1)}`)
+
+  if (r1.error) {
+    return {
+      type: 'text',
+      value: `Cache probe failed on first call: HTTP ${r1.status}\n${r1.error.slice(0, 300)}\n\nFull details in debug log.`,
+    }
+  }
+
+  // Wait
+  await new Promise((r) => setTimeout(r, DELAY_MS))
+
+  // Call 2 — Warm
+  const r2 = await sendProbe(url, headers, body, 'CALL 2 — Warm (cache expected)')
+  logForDebugging(`[cache-probe]\n${formatResult(r2)}`)
+
+  // --- Comparison ---
+  const fields = [
+    'input_tokens',
+    'output_tokens',
+    'total_tokens',
+    'prompt_tokens',
+    'completion_tokens',
+    'input_tokens_details.cached_tokens',
+    'prompt_tokens_details.cached_tokens',
+    'output_tokens_details.reasoning_tokens',
+  ]
+
+  const comparison: string[] = ['[cache-probe] COMPARISON']
+  comparison.push(
+    `  ${'Field'.padEnd(42)} ${'Call 1'.padStart(8)} ${'Call 2'.padStart(8)} ${'Delta'.padStart(8)}`,
+  )
+  comparison.push(`  ${'-'.repeat(72)}`)
+
+  for (const f of fields) {
+    const v1 = getField(r1.usage, f)
+    const v2 = getField(r2.usage, f)
+    if (v1 === undefined && v2 === undefined) continue
+    const d =
+      typeof v1 === 'number' && typeof v2 === 'number' ? v2 - v1 : ''
+    comparison.push(
+      `  ${f.padEnd(42)} ${String(v1 ?? '-').padStart(8)} ${String(v2 ?? '-').padStart(8)} ${String(d).padStart(8)}`,
+    )
+  }
+
+  comparison.push('')
+  comparison.push(
+    `  Latency: ${r1.elapsed}ms → ${r2.elapsed}ms (${r2.elapsed - r1.elapsed > 0 ? '+' : ''}${r2.elapsed - r1.elapsed}ms)`,
+  )
+
+  // Header comparison
+  for (const h of ['openai-processing-ms', 'x-ms-region', 'x-ratelimit-remaining']) {
+    const v1 = r1.headers[h]
+    const v2 = r2.headers[h]
+    if (v1 || v2) {
+      comparison.push(`  ${h}: ${v1 ?? '-'} → ${v2 ?? '-'}`)
+    }
+  }
+
+  // Verdict
+  const cached2 =
+    (getField(r2.usage, 'input_tokens_details.cached_tokens') as number) ??
+    (getField(r2.usage, 'prompt_tokens_details.cached_tokens') as number) ??
+    0
+  const input1 =
+    ((r1.usage?.input_tokens ?? r1.usage?.prompt_tokens) as number) ?? 0
+  const input2 =
+    ((r2.usage?.input_tokens ?? r2.usage?.prompt_tokens) as number) ?? 0
+
+  let verdict: string
+  if (cached2 > 0) {
+    const rate = input2 > 0 ? Math.round((cached2 / input2) * 100) : '?'
+    verdict = `CACHE HIT: ${cached2} cached tokens (${rate}% of input)`
+  } else if (input1 === 0 && input2 === 0) {
+    verdict = 'INCONCLUSIVE: Server returns 0 input_tokens — cannot measure'
+  } else if (r2.elapsed < r1.elapsed * 0.6 && input1 > 100) {
+    verdict = `POSSIBLE SILENT CACHING: Call 2 was ${Math.round((1 - r2.elapsed / r1.elapsed) * 100)}% faster but no cached_tokens reported`
+  } else {
+    verdict = 'NO CACHE DETECTED'
+  }
+
+  comparison.push(`\n  Verdict: ${verdict}`)
+
+  // --- Simulate what main's shim code does with this usage ---
+  // codexShim.ts makeUsage() — used for Responses API (GPT-5+/Codex)
+  function mainMakeUsage(u: any) {
+    return {
+      input_tokens: u?.input_tokens ?? 0,
+      output_tokens: u?.output_tokens ?? 0,
+      cache_creation_input_tokens: 0,
+      cache_read_input_tokens: 0, // ← main hardcodes this to 0
+    }
+  }
+  // openaiShim.ts convertChunkUsage() — used for Chat Completions
+  function mainConvertChunkUsage(u: any) {
+    return {
+      input_tokens: u?.prompt_tokens ?? 0,
+      output_tokens: u?.completion_tokens ?? 0,
+      cache_creation_input_tokens: 0,
+      cache_read_input_tokens: u?.prompt_tokens_details?.cached_tokens ?? 0,
+    }
+  }
+
+  const shimFn = useResponses ? mainMakeUsage : mainConvertChunkUsage
+  const shim1 = shimFn(r1.usage)
+  const shim2 = shimFn(r2.usage)
+
+  comparison.push('')
+  comparison.push(`  --- What main's shim reports (${useResponses ? 'codexShim.makeUsage' : 'openaiShim.convertChunkUsage'}) ---`)
+  comparison.push(`  Call 1: cache_read_input_tokens=${shim1.cache_read_input_tokens}`)
+  comparison.push(`  Call 2: cache_read_input_tokens=${shim2.cache_read_input_tokens}`)
+  if (useResponses && cached2 > 0) {
+    comparison.push(`  BUG: Server returned ${cached2} cached tokens but main's makeUsage() drops it → reports 0`)
+  } else if (!useResponses && shim2.cache_read_input_tokens > 0) {
+    comparison.push(`  OK: Chat Completions path on main correctly reads cached_tokens`)
+  }
+
+  logForDebugging(comparison.join('\n'))
+
+  // User-facing summary
+  const mode = noKey ? ' (NO cache key sent)' : ''
+  const shimLabel = useResponses ? 'codexShim.makeUsage()' : 'openaiShim.convertChunkUsage()'
+  const summary = [
+    `Cache Probe — ${request.resolvedModel} via ${useResponses ? 'Responses API' : 'Chat Completions'}${mode}`,
+    '',
+    `Call 1: ${r1.elapsed}ms, input=${input1}, cached=${(getField(r1.usage, 'input_tokens_details.cached_tokens') as number) ?? (getField(r1.usage, 'prompt_tokens_details.cached_tokens') as number) ?? 0}`,
+    `Call 2: ${r2.elapsed}ms, input=${input2}, cached=${cached2}`,
+    '',
+    verdict,
+    '',
+    `What main's ${shimLabel} reports:`,
+    `  Call 2 cache_read_input_tokens = ${shim2.cache_read_input_tokens}${useResponses && cached2 > 0 ? ' ← BUG: server sent ' + cached2 + ' but main drops it' : ''}`,
+    '',
+    'Full details written to debug log.',
+  ].join('\n')
+
+  return { type: 'text', value: summary }
+}
diff --git a/src/commands/cache-probe/index.ts b/src/commands/cache-probe/index.ts
new file mode 100644
index 00000000..060719f4
--- /dev/null
+++ b/src/commands/cache-probe/index.ts
@@ -0,0 +1,17 @@
+import type { Command } from '../../commands.js'
+import { isEnvTruthy } from '../../utils/envUtils.js'
+
+const cacheProbe: Command = {
+  type: 'local',
+  name: 'cache-probe',
+  description:
+    'Send identical requests to test prompt caching (results in debug log)',
+  argumentHint: '[model] [--no-key]',
+  isEnabled: () =>
+    isEnvTruthy(process.env.CLAUDE_CODE_USE_OPENAI) ||
+    isEnvTruthy(process.env.CLAUDE_CODE_USE_GITHUB),
+  supportsNonInteractive: false,
+  load: () => import('./cache-probe.js'),
+}
+
+export default cacheProbe