Files
orcs-code/src/services/api/compressToolHistory.test.ts
viudes a6a3de5ac1 feat(api): compress old tool_result content for small-context providers (#801)
* feat(api): compress old tool_result content for small-context providers

Adds a shim-layer pass that tiers tool_result content by age on
providers
  with small effective context windows (Copilot gpt-4o 128k, Mistral,
  Ollama). Recent turns remain full; mid-tier results are truncated to
2k
  chars; older results are replaced with a stub that preserves tool name
  and arguments so the model can re-invoke if needed.

  Tier sizes auto-tune via getEffectiveContextWindowSize, same
calculation
  used by auto-compact. Reuses COMPACTABLE_TOOLS and
  TOOL_RESULT_CLEARED_MESSAGE to complement (not duplicate)
microCompact.
  Configurable via /config toolHistoryCompressionEnabled.

  Addresses active-session context accumulation on Copilot where
  microCompact's time-based trigger never fires, which surfaces as
  "tools appearing in a loop" and prompt_too_long errors after ~15
turns.

* fix: config tool history
2026-04-21 17:36:26 +08:00

573 lines
18 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import { afterEach, beforeEach, expect, mock, test } from 'bun:test'
import { compressToolHistory, getTiers } from './compressToolHistory.js'
// Mock the two dependencies so tests are deterministic and don't read disk config.
const mockState = {
enabled: true,
effectiveWindow: 100_000,
}
mock.module('../../utils/config.js', () => ({
getGlobalConfig: () => ({
toolHistoryCompressionEnabled: mockState.enabled,
}),
}))
mock.module('../compact/autoCompact.js', () => ({
getEffectiveContextWindowSize: () => mockState.effectiveWindow,
}))
beforeEach(() => {
mockState.enabled = true
mockState.effectiveWindow = 100_000
})
afterEach(() => {
mockState.enabled = true
mockState.effectiveWindow = 100_000
})
type Block = Record<string, unknown>
type Msg = { role: string; content: Block[] | string }
function bigText(n: number): string {
return 'x'.repeat(n)
}
function buildToolExchange(id: number, resultLength: number): Msg[] {
return [
{
role: 'assistant',
content: [
{
type: 'tool_use',
id: `toolu_${id}`,
name: 'Read',
input: { file_path: `/path/to/file${id}.ts` },
},
],
},
{
role: 'user',
content: [
{
type: 'tool_result',
tool_use_id: `toolu_${id}`,
content: bigText(resultLength),
},
],
},
]
}
function buildConversation(numToolExchanges: number, resultLength = 5_000): Msg[] {
const out: Msg[] = [{ role: 'user', content: 'Initial request' }]
for (let i = 0; i < numToolExchanges; i++) {
out.push(...buildToolExchange(i, resultLength))
}
return out
}
function getResultMessages(messages: Msg[]): Msg[] {
return messages.filter(
m => Array.isArray(m.content) && m.content.some((b: any) => b.type === 'tool_result'),
)
}
function getResultBlock(msg: Msg): Block {
return (msg.content as Block[]).find((b: any) => b.type === 'tool_result') as Block
}
function getResultText(msg: Msg): string {
const block = getResultBlock(msg)
const c = block.content
if (typeof c === 'string') return c
if (Array.isArray(c)) {
return c
.filter((b: any) => b.type === 'text')
.map((b: any) => b.text)
.join('\n')
}
return ''
}
// ---------- getTiers ----------
test('getTiers: < 16k window → recent=2, mid=3', () => {
expect(getTiers(8_000)).toEqual({ recent: 2, mid: 3 })
})
test('getTiers: 16k32k → recent=3, mid=5', () => {
expect(getTiers(20_000)).toEqual({ recent: 3, mid: 5 })
})
test('getTiers: 32k64k → recent=4, mid=8', () => {
expect(getTiers(48_000)).toEqual({ recent: 4, mid: 8 })
})
test('getTiers: 64k128k (Copilot gpt-4o) → recent=5, mid=10', () => {
expect(getTiers(100_000)).toEqual({ recent: 5, mid: 10 })
})
test('getTiers: 128k256k (Copilot Claude) → recent=8, mid=15', () => {
expect(getTiers(200_000)).toEqual({ recent: 8, mid: 15 })
})
test('getTiers: 256k500k → recent=12, mid=25', () => {
expect(getTiers(400_000)).toEqual({ recent: 12, mid: 25 })
})
test('getTiers: ≥ 500k (gpt-4.1 1M) → recent=25, mid=50', () => {
expect(getTiers(1_000_000)).toEqual({ recent: 25, mid: 50 })
})
// ---------- master switch ----------
test('pass-through when toolHistoryCompressionEnabled is false', () => {
mockState.enabled = false
const messages = buildConversation(20)
const result = compressToolHistory(messages, 'gpt-4o')
expect(result).toBe(messages) // same reference (no transformation)
})
test('pass-through when total tool_results <= recent tier', () => {
// 100k effective → recent=5; only 4 exchanges → no compression
const messages = buildConversation(4)
const result = compressToolHistory(messages, 'gpt-4o')
expect(result).toBe(messages)
})
// ---------- per-tier behavior ----------
test('recent tier: tool_result content untouched', () => {
// 100k effective → recent=5, mid=10. With 6 exchanges, only the oldest is touched.
const messages = buildConversation(6, 5_000)
const result = compressToolHistory(messages, 'gpt-4o')
const resultMsgs = getResultMessages(result)
// Last 5 should be untouched (full 5000 chars)
for (let i = resultMsgs.length - 5; i < resultMsgs.length; i++) {
expect(getResultText(resultMsgs[i]).length).toBe(5_000)
}
})
test('mid tier: long content truncated to MID_MAX_CHARS with marker', () => {
// 100k → recent=5, mid=10. 10 exchanges: 5 recent + 5 mid (none old).
const messages = buildConversation(10, 5_000)
const result = compressToolHistory(messages, 'gpt-4o')
const resultMsgs = getResultMessages(result)
// First 5 are mid tier — should be truncated to ~2000 chars + marker
for (let i = 0; i < 5; i++) {
const text = getResultText(resultMsgs[i])
expect(text).toContain('[…truncated')
expect(text).toContain('chars from tool history]')
// Should be roughly 2000 chars + marker (under 2200)
expect(text.length).toBeLessThan(2_200)
expect(text.length).toBeGreaterThan(2_000)
}
})
test('mid tier: short content (< MID_MAX_CHARS) untouched', () => {
const messages = buildConversation(10, 500) // 500 < MID_MAX_CHARS
const result = compressToolHistory(messages, 'gpt-4o')
const resultMsgs = getResultMessages(result)
for (let i = 0; i < 5; i++) {
expect(getResultText(resultMsgs[i])).toBe(bigText(500))
}
})
test('old tier: content replaced with stub [name args={...} → N chars omitted]', () => {
// 100k → recent=5, mid=10, old=rest. 20 exchanges → 5 old + 10 mid + 5 recent.
const messages = buildConversation(20, 5_000)
const result = compressToolHistory(messages, 'gpt-4o')
const resultMsgs = getResultMessages(result)
// First 5 are old tier — should be stubs
for (let i = 0; i < 5; i++) {
const text = getResultText(resultMsgs[i])
expect(text).toMatch(/^\[Read args=\{.*\} → 5000 chars omitted\]$/)
}
})
test('old tier: stub args truncated to 200 chars', () => {
const longArg = bigText(500)
const messages: Msg[] = [
{ role: 'user', content: 'start' },
{
role: 'assistant',
content: [
{
type: 'tool_use',
id: 'toolu_x',
name: 'Bash',
input: { command: longArg },
},
],
},
{
role: 'user',
content: [
{ type: 'tool_result', tool_use_id: 'toolu_x', content: 'output' },
],
},
// Pad with enough recent exchanges to push the above into old tier
...buildConversation(20, 100).slice(1),
]
const result = compressToolHistory(messages, 'gpt-4o')
const resultMsgs = getResultMessages(result)
const text = getResultText(resultMsgs[0])
// Stub format: [Bash args=<json≤200chars> → N chars omitted]
// The args portion (between args= and →) must be ≤ 200 chars.
const argsMatch = text.match(/args=(.*?) →/)
expect(argsMatch).not.toBeNull()
expect(argsMatch![1].length).toBeLessThanOrEqual(200)
})
test('old tier: orphan tool_result (no matching tool_use) falls back to "tool"', () => {
const messages: Msg[] = [
{ role: 'user', content: 'start' },
// Orphan: tool_result without matching tool_use in history
{
role: 'user',
content: [
{ type: 'tool_result', tool_use_id: 'orphan_id', content: 'data' },
],
},
...buildConversation(20, 100).slice(1),
]
const result = compressToolHistory(messages, 'gpt-4o')
const resultMsgs = getResultMessages(result)
const text = getResultText(resultMsgs[0])
expect(text).toMatch(/^\[tool args=\{\} → 4 chars omitted\]$/)
})
// ---------- structural preservation ----------
test('tool_use blocks always preserved', () => {
const messages = buildConversation(20, 5_000)
const result = compressToolHistory(messages, 'gpt-4o')
const useCount = (msgs: Msg[]) =>
msgs.reduce((sum, m) => {
if (!Array.isArray(m.content)) return sum
return sum + m.content.filter((b: any) => b.type === 'tool_use').length
}, 0)
expect(useCount(result as Msg[])).toBe(useCount(messages))
})
test('text blocks always preserved', () => {
const messages: Msg[] = [
{ role: 'user', content: 'first' },
{
role: 'assistant',
content: [
{ type: 'text', text: 'reasoning before tool' },
{ type: 'tool_use', id: 'toolu_1', name: 'Read', input: {} },
],
},
{
role: 'user',
content: [{ type: 'tool_result', tool_use_id: 'toolu_1', content: bigText(5000) }],
},
...buildConversation(20, 5_000).slice(1),
]
const result = compressToolHistory(messages, 'gpt-4o')
const assistantMsg = (result as Msg[])[1]
const textBlock = (assistantMsg.content as Block[]).find((b: any) => b.type === 'text')
expect(textBlock).toEqual({ type: 'text', text: 'reasoning before tool' })
})
test('thinking blocks always preserved', () => {
const messages: Msg[] = [
{ role: 'user', content: 'first' },
{
role: 'assistant',
content: [
{ type: 'thinking', thinking: 'internal reasoning', signature: 'sig' },
{ type: 'tool_use', id: 'toolu_1', name: 'Read', input: {} },
],
},
{
role: 'user',
content: [{ type: 'tool_result', tool_use_id: 'toolu_1', content: bigText(5000) }],
},
...buildConversation(20, 5_000).slice(1),
]
const result = compressToolHistory(messages, 'gpt-4o')
const assistantMsg = (result as Msg[])[1]
const thinking = (assistantMsg.content as Block[]).find((b: any) => b.type === 'thinking')
expect(thinking).toEqual({
type: 'thinking',
thinking: 'internal reasoning',
signature: 'sig',
})
})
test('non-array content (string) handled gracefully', () => {
const messages: Msg[] = [
{ role: 'user', content: 'plain string content' },
...buildConversation(20, 100).slice(1),
]
const result = compressToolHistory(messages, 'gpt-4o')
expect((result as Msg[])[0].content).toBe('plain string content')
})
test('empty content array handled gracefully', () => {
const messages: Msg[] = [
{ role: 'user', content: [] },
...buildConversation(20, 100).slice(1),
]
expect(() => compressToolHistory(messages, 'gpt-4o')).not.toThrow()
})
// ---------- message shape compatibility ----------
test('wrapped shape ({ message: { role, content } }) handled', () => {
type WrappedMsg = { message: { role: string; content: Block[] | string } }
const wrap = (m: Msg): WrappedMsg => ({ message: { role: m.role, content: m.content } })
const messages = buildConversation(20, 5_000).map(wrap)
const result = compressToolHistory(messages as any, 'gpt-4o')
// First wrapped tool-result message should have stub content (old tier)
const firstResultMsg = (result as WrappedMsg[]).find(
m =>
Array.isArray(m.message.content) &&
m.message.content.some((b: any) => b.type === 'tool_result'),
)
const block = (firstResultMsg!.message.content as Block[]).find(
(b: any) => b.type === 'tool_result',
) as Block
const text = ((block.content as Block[])[0] as any).text
expect(text).toMatch(/^\[Read args=.*→ 5000 chars omitted\]$/)
})
test('flat shape ({ role, content }) handled', () => {
const messages = buildConversation(20, 5_000)
const result = compressToolHistory(messages, 'gpt-4o')
const resultMsgs = getResultMessages(result)
expect(getResultText(resultMsgs[0])).toMatch(/^\[Read args=.*→ 5000 chars omitted\]$/)
})
// ---------- tier boundary correctness ----------
test('tier boundaries: 6 exchanges → 1 mid + 5 recent (recent=5)', () => {
const messages = buildConversation(6, 5_000)
const result = compressToolHistory(messages, 'gpt-4o')
const resultMsgs = getResultMessages(result)
// Oldest: mid (truncated)
expect(getResultText(resultMsgs[0])).toContain('[…truncated')
// Last 5: untouched
for (let i = 1; i < 6; i++) {
expect(getResultText(resultMsgs[i]).length).toBe(5_000)
}
})
test('tier boundaries: 16 exchanges → 1 old + 10 mid + 5 recent', () => {
const messages = buildConversation(16, 5_000)
const result = compressToolHistory(messages, 'gpt-4o')
const resultMsgs = getResultMessages(result)
// Oldest 1: stub (old tier)
expect(getResultText(resultMsgs[0])).toMatch(/^\[Read .*chars omitted\]$/)
// Next 10: mid (truncated)
for (let i = 1; i < 11; i++) {
expect(getResultText(resultMsgs[i])).toContain('[…truncated')
}
// Last 5: untouched
for (let i = 11; i < 16; i++) {
expect(getResultText(resultMsgs[i]).length).toBe(5_000)
}
})
test('large window (1M) with 30 exchanges: all untouched (recent=25 ≥ 30 - 5)', () => {
// ≥500k → recent=25, mid=50. 30 exchanges → 5 mid + 25 recent. None old.
mockState.effectiveWindow = 1_000_000
const messages = buildConversation(30, 5_000)
const result = compressToolHistory(messages, 'gpt-4.1')
const resultMsgs = getResultMessages(result)
// Last 25: untouched
for (let i = 5; i < 30; i++) {
expect(getResultText(resultMsgs[i]).length).toBe(5_000)
}
})
// ---------- attribute preservation ----------
test('is_error flag preserved in mid tier', () => {
const messages: Msg[] = [
{ role: 'user', content: 'start' },
{
role: 'assistant',
content: [{ type: 'tool_use', id: 'toolu_err', name: 'Bash', input: {} }],
},
{
role: 'user',
content: [
{
type: 'tool_result',
tool_use_id: 'toolu_err',
is_error: true,
content: bigText(5_000),
},
],
},
// Pad with enough recent exchanges to push the above into MID tier
...buildConversation(10, 100).slice(1),
]
const result = compressToolHistory(messages, 'gpt-4o')
const resultMsgs = getResultMessages(result)
const block = getResultBlock(resultMsgs[0]) as { is_error?: boolean; content: unknown }
expect(block.is_error).toBe(true)
expect(getResultText(resultMsgs[0])).toContain('[…truncated')
})
test('is_error flag preserved in old tier (stub)', () => {
const messages: Msg[] = [
{ role: 'user', content: 'start' },
{
role: 'assistant',
content: [{ type: 'tool_use', id: 'toolu_err', name: 'Bash', input: {} }],
},
{
role: 'user',
content: [
{
type: 'tool_result',
tool_use_id: 'toolu_err',
is_error: true,
content: bigText(5_000),
},
],
},
...buildConversation(20, 100).slice(1),
]
const result = compressToolHistory(messages, 'gpt-4o')
const resultMsgs = getResultMessages(result)
const block = getResultBlock(resultMsgs[0]) as { is_error?: boolean; content: unknown }
expect(block.is_error).toBe(true)
expect(getResultText(resultMsgs[0])).toMatch(/^\[Bash .*chars omitted\]$/)
})
// ---------- COMPACTABLE_TOOLS filter ----------
test('non-compactable tool (e.g. Task/Agent) is NEVER compressed', () => {
// Build conversation where the OLDEST exchange uses a non-compactable tool name
const messages: Msg[] = [
{ role: 'user', content: 'start' },
{
role: 'assistant',
content: [
{ type: 'tool_use', id: 'task_1', name: 'Task', input: { goal: 'plan' } },
],
},
{
role: 'user',
content: [
{ type: 'tool_result', tool_use_id: 'task_1', content: bigText(5_000) },
],
},
// Pad with 20 compactable exchanges to push Task into old tier
...buildConversation(20, 100).slice(1),
]
const result = compressToolHistory(messages, 'gpt-4o')
const resultMsgs = getResultMessages(result)
// First tool_result is for Task (non-compactable) → must remain full
expect(getResultText(resultMsgs[0]).length).toBe(5_000)
expect(getResultText(resultMsgs[0])).not.toContain('chars omitted')
expect(getResultText(resultMsgs[0])).not.toContain('[…truncated')
})
test('mcp__ prefixed tools ARE compactable (matches microCompact behavior)', () => {
const messages: Msg[] = [
{ role: 'user', content: 'start' },
{
role: 'assistant',
content: [
{ type: 'tool_use', id: 'mcp_1', name: 'mcp__github__get_issue', input: {} },
],
},
{
role: 'user',
content: [
{ type: 'tool_result', tool_use_id: 'mcp_1', content: bigText(5_000) },
],
},
...buildConversation(20, 100).slice(1),
]
const result = compressToolHistory(messages, 'gpt-4o')
const resultMsgs = getResultMessages(result)
// MCP tool result is compressed (gets stub since it's in old tier)
expect(getResultText(resultMsgs[0])).toMatch(/^\[mcp__github__get_issue .*chars omitted\]$/)
})
// ---------- skip already-cleared blocks ----------
test('blocks already cleared by microCompact are NOT re-compressed', () => {
const messages: Msg[] = [
{ role: 'user', content: 'start' },
{
role: 'assistant',
content: [{ type: 'tool_use', id: 'cleared_1', name: 'Read', input: {} }],
},
{
role: 'user',
content: [
{
type: 'tool_result',
tool_use_id: 'cleared_1',
content: '[Old tool result content cleared]', // microCompact's marker
},
],
},
...buildConversation(20, 100).slice(1),
]
const result = compressToolHistory(messages, 'gpt-4o')
const resultMsgs = getResultMessages(result)
// Already-cleared marker survives untouched (no double processing)
expect(getResultText(resultMsgs[0])).toBe('[Old tool result content cleared]')
})
test('extra block attributes (e.g. cache_control) preserved across rewrites', () => {
const cacheControl = { type: 'ephemeral' }
const messages: Msg[] = [
{ role: 'user', content: 'start' },
{
role: 'assistant',
content: [{ type: 'tool_use', id: 'toolu_cc', name: 'Read', input: {} }],
},
{
role: 'user',
content: [
{
type: 'tool_result',
tool_use_id: 'toolu_cc',
cache_control: cacheControl,
content: bigText(5_000),
},
],
},
...buildConversation(20, 100).slice(1),
]
const result = compressToolHistory(messages, 'gpt-4o')
const resultMsgs = getResultMessages(result)
const block = getResultBlock(resultMsgs[0]) as { cache_control?: unknown }
// The custom attribute survived the stub rewrite via ...block spread
expect(block.cache_control).toEqual(cacheControl)
})