fix: harden resume after compaction failures (#195)

* fix: harden resume after compaction failures

* test: cover resume compaction safeguards

* fix: address resume safeguard review findings
This commit is contained in:
sooth
2026-04-03 10:31:06 -04:00
committed by GitHub
parent 6987a54a71
commit b0d796e5c3
8 changed files with 499 additions and 27 deletions

View File

@@ -46,6 +46,7 @@ import type { AttributionState } from './utils/commitAttribution.js'
import { getGlobalConfig } from './utils/config.js'
import { getCwd } from './utils/cwd.js'
import { isBareMode, isEnvTruthy } from './utils/envUtils.js'
import { logForDebugging } from './utils/debug.js'
import { getFastModeState } from './utils/fastMode.js'
import {
type FileHistoryState,
@@ -695,9 +696,11 @@ export class QueryEngine {
// progress are now recorded inline (their switch cases below), but
// this flush still matters for the preservedSegment tail walk.
// If the SDK subprocess restarts before then (claude-desktop kills
// between turns), tailUuid points to a never-written message
// applyPreservedSegmentRelinks fails its tail→head walk → returns
// without pruning → resume loads full pre-compact history.
// between turns), tailUuid can point to a never-written message. In
// that case strip preservedSegment before transcript persistence so
// resume falls back to ordinary boundary pruning instead of relying on
// broken relink metadata.
let transcriptMessage = message
if (
persistSession &&
message.type === 'system' &&
@@ -710,10 +713,21 @@ export class QueryEngine {
)
if (tailIdx !== -1) {
await recordTranscript(this.mutableMessages.slice(0, tailIdx + 1))
} else {
transcriptMessage = {
...message,
compactMetadata: {
...message.compactMetadata,
preservedSegment: undefined,
},
}
logForDebugging(
`[QueryEngine] stripped preservedSegment before transcript write; missing tail ${tailUuid}`,
)
}
}
}
messages.push(message)
messages.push(transcriptMessage)
if (persistSession) {
// Fire-and-forget for assistant messages. claude.ts yields one
// assistant message per content block, then mutates the last

View File

@@ -3137,7 +3137,7 @@ async function run(): Promise<CommanderCommand> {
});
}
logError(error);
process.exit(1);
return await exitWithError(root, errorMessage(error), () => gracefulShutdown(1));
}
} else if (feature('DIRECT_CONNECT') && _pendingConnect?.url) {
// `claude connect <url>` — full interactive TUI connected to a remote server
@@ -3644,7 +3644,7 @@ async function run(): Promise<CommanderCommand> {
success: false
});
logError(error);
await exitWithError(root, `Unable to load transcript from file: ${options.resume}`, () => gracefulShutdown(1));
await exitWithError(root, errorMessage(error), () => gracefulShutdown(1));
}
}
}
@@ -3686,7 +3686,7 @@ async function run(): Promise<CommanderCommand> {
success: false
});
logError(error);
await exitWithError(root, `Failed to resume session ${sessionId}`);
await exitWithError(root, errorMessage(error));
}
}

View File

@@ -25,6 +25,7 @@ import { renameRecordingForSession } from '../utils/asciicast.js';
import { updateSessionName } from '../utils/concurrentSessions.js';
import { loadConversationForResume } from '../utils/conversationRecovery.js';
import { checkCrossProjectResume } from '../utils/crossProjectResume.js';
import { errorMessage } from '../utils/errors.js';
import type { FileHistorySnapshot } from '../utils/fileHistory.js';
import { logError } from '../utils/log.js';
import { createSystemMessage } from '../utils/messages.js';
@@ -101,6 +102,7 @@ export function ResumeConversation({
agentColor?: AgentColorName;
mainThreadAgentDefinition?: AgentDefinition;
} | null>(null);
const [resumeError, setResumeError] = React.useState<string | null>(null);
const [crossProjectCommand, setCrossProjectCommand] = React.useState<string | null>(null);
const sessionLogResultRef = React.useRef<SessionLogResult | null>(null);
// Mirror of logs.length so loadMoreLogs can compute value indices outside
@@ -176,6 +178,7 @@ export function ResumeConversation({
process.exit(1);
}
async function onSelect(log_0: LogOption) {
setResumeError(null);
setResuming(true);
const resumeStart = performance.now();
const crossProjectCheck = checkCrossProjectResume(log_0, showAllProjects, worktreePaths);
@@ -287,7 +290,8 @@ export function ResumeConversation({
success: false
});
logError(e as Error);
throw e;
setResumeError(errorMessage(e));
setResuming(false);
}
}
if (crossProjectCommand) {
@@ -308,10 +312,18 @@ export function ResumeConversation({
<Text> Resuming conversation</Text>
</Box>;
}
const resumeErrorBanner = resumeError ? <Box flexDirection="column" marginBottom={1}>
<Text color="red">Failed to resume conversation.</Text>
<Text>{resumeError}</Text>
<Text dimColor={true}>Choose a different conversation to continue.</Text>
</Box> : null;
if (filteredLogs.length === 0) {
return <NoConversationsMessage />;
}
return <LogSelector logs={filteredLogs} maxHeight={rows} onCancel={onCancel} onSelect={onSelect} onLogsChanged={isResumeWithRenameEnabled ? () => loadLogs(showAllProjects) : undefined} onLoadMore={loadMoreLogs} initialSearchQuery={initialSearchQuery} showAllProjects={showAllProjects} onToggleAllProjects={handleToggleAllProjects} onAgenticSearch={agenticSessionSearch} />;
return <Box flexDirection="column">
{resumeErrorBanner}
<LogSelector logs={filteredLogs} maxHeight={rows} onCancel={onCancel} onSelect={onSelect} onLogsChanged={isResumeWithRenameEnabled ? () => loadLogs(showAllProjects) : undefined} onLoadMore={loadMoreLogs} initialSearchQuery={initialSearchQuery} showAllProjects={showAllProjects} onToggleAllProjects={handleToggleAllProjects} onAgenticSearch={agenticSessionSearch} />
</Box>;
}
function NoConversationsMessage() {
const $ = _c(2);

View File

@@ -0,0 +1,71 @@
/**
* Hook-side-effect regression lives in a separate file with no static import of
* conversationRecovery so Bun's mock.module can replace sessionStart before
* that module is first loaded.
*/
import { afterEach, expect, mock, test } from 'bun:test'
import { mkdtemp, rm, writeFile } from 'node:fs/promises'
import { tmpdir } from 'node:os'
import { join } from 'node:path'
// Temp dirs created by writeJsonl; each is removed in afterEach.
const tempDirs: string[] = []
// Snapshot of CLAUDE_CODE_SIMPLE so afterEach can restore the original value.
const originalSimple = process.env.CLAUDE_CODE_SIMPLE
// Fixed session id and timestamp shared by every fixture entry in this file.
const sessionId = '00000000-0000-4000-8000-000000001999'
const ts = '2026-04-02T00:00:00.000Z'
// Deterministic v4-shaped UUID whose final field encodes `n`, zero-padded to 12 digits.
function id(n: number): string {
  const tail = n.toString().padStart(12, '0')
  return `00000000-0000-4000-8000-${tail}`
}
// Fixture: a minimal external user transcript entry bound to the shared
// sessionId/timestamp constants above. parentUuid is always null here.
function user(uuid: string, content: string) {
  const payload = { role: 'user', content }
  return {
    type: 'user',
    uuid,
    parentUuid: null,
    timestamp: ts,
    cwd: '/tmp',
    userType: 'external',
    sessionId,
    version: 'test',
    isSidechain: false,
    isMeta: false,
    message: payload,
  }
}
// Writes a single JSONL entry into a fresh temp dir and returns the file path.
// The dir is pushed onto tempDirs so afterEach can clean it up.
async function writeJsonl(entry: unknown): Promise<string> {
  const dir = await mkdtemp(join(tmpdir(), 'openclaude-conversation-recovery-hooks-'))
  tempDirs.push(dir)
  const target = join(dir, 'resume.jsonl')
  await writeFile(target, JSON.stringify(entry) + '\n')
  return target
}
// Restore mocks, the CLAUDE_CODE_SIMPLE env var, and temp dirs after each test.
// Bug fix: assigning `undefined` to a process.env key coerces it to the string
// "undefined" (Node stringifies env values), which would leak a truthy env var
// into later tests when the variable was originally unset. Delete the key in
// that case instead of assigning.
afterEach(async () => {
  mock.restore()
  if (originalSimple === undefined) {
    delete process.env.CLAUDE_CODE_SIMPLE
  } else {
    process.env.CLAUDE_CODE_SIMPLE = originalSimple
  }
  await Promise.all(tempDirs.splice(0).map(dir => rm(dir, { recursive: true, force: true })))
})
// The oversized-transcript guard must fire BEFORE resume SessionStart hooks,
// which are side-effectful. sessionStart.js is replaced via mock.module and
// conversationRecovery is imported dynamically afterwards, so the mock is in
// place the first time that module graph loads (see file header comment).
test('loadConversationForResume rejects oversized transcripts before resume hooks run', async () => {
  delete process.env.CLAUDE_CODE_SIMPLE
  // Just over the 8 MiB MAX_RESUME_MESSAGE_BYTES cap.
  const hugeContent = 'x'.repeat(8 * 1024 * 1024 + 32 * 1024)
  const path = await writeJsonl(user(id(3), hugeContent))
  const hookSpy = mock(() => Promise.resolve([{ type: 'hook' }]))
  mock.module('./sessionStart.js', () => ({
    processSessionStartHooks: hookSpy,
  }))
  const { loadConversationForResume, ResumeTranscriptTooLargeError } = await import(
    './conversationRecovery.ts'
  )
  await expect(loadConversationForResume('fixture', path)).rejects.toBeInstanceOf(
    ResumeTranscriptTooLargeError,
  )
  // The size guard rejected before any hook could run.
  expect(hookSpy).not.toHaveBeenCalled()
})

View File

@@ -0,0 +1,79 @@
import { afterEach, expect, test } from 'bun:test'
import { mkdtemp, rm, writeFile } from 'node:fs/promises'
import { tmpdir } from 'node:os'
import { join } from 'node:path'
import {
loadConversationForResume,
ResumeTranscriptTooLargeError,
} from './conversationRecovery.ts'
// Temp dirs created by writeJsonl; each is removed in afterEach.
const tempDirs: string[] = []
// Snapshot of CLAUDE_CODE_SIMPLE so afterEach can restore the original value.
const originalSimple = process.env.CLAUDE_CODE_SIMPLE
// Fixed session id and timestamp shared by every fixture entry in this file.
const sessionId = '00000000-0000-4000-8000-000000001999'
const ts = '2026-04-02T00:00:00.000Z'
// Stable fixture UUID generator: `n` becomes the last 12 digits of a v4-shaped id.
function id(n: number): string {
  return `00000000-0000-4000-8000-${`${n}`.padStart(12, '0')}`
}
// Fixture: a minimal external user transcript entry bound to the shared
// sessionId/timestamp constants above. parentUuid is always null here.
function user(uuid: string, content: string) {
  const payload = { role: 'user', content }
  return {
    type: 'user',
    uuid,
    parentUuid: null,
    timestamp: ts,
    cwd: '/tmp',
    userType: 'external',
    sessionId,
    version: 'test',
    isSidechain: false,
    isMeta: false,
    message: payload,
  }
}
// Writes a single JSONL entry into a fresh temp dir and returns the file path.
// The dir is pushed onto tempDirs so afterEach can clean it up.
async function writeJsonl(entry: unknown): Promise<string> {
  const dir = await mkdtemp(join(tmpdir(), 'openclaude-conversation-recovery-'))
  tempDirs.push(dir)
  const target = join(dir, 'resume.jsonl')
  await writeFile(target, JSON.stringify(entry) + '\n')
  return target
}
// Restore the CLAUDE_CODE_SIMPLE env var and clean up temp dirs after each test.
// Bug fix: assigning `undefined` to a process.env key coerces it to the string
// "undefined" (Node stringifies env values), which would leak a truthy env var
// into later tests when the variable was originally unset. Delete the key in
// that case instead of assigning.
afterEach(async () => {
  if (originalSimple === undefined) {
    delete process.env.CLAUDE_CODE_SIMPLE
  } else {
    process.env.CLAUDE_CODE_SIMPLE = originalSimple
  }
  await Promise.all(tempDirs.splice(0).map(dir => rm(dir, { recursive: true, force: true })))
})
// Baseline: a tiny single-entry transcript resumes cleanly and reports the
// fixture session id with at least one reconstructed message.
test('loadConversationForResume accepts a small transcript from jsonl path', async () => {
  process.env.CLAUDE_CODE_SIMPLE = '1'
  const transcriptPath = await writeJsonl(user(id(1), 'hello'))
  const resumed = await loadConversationForResume('fixture', transcriptPath)
  expect(resumed).not.toBeNull()
  expect(resumed?.sessionId).toBe(sessionId)
  expect(resumed?.messages.length).toBeGreaterThan(0)
})
// A payload just over the 8 MiB resume cap must throw
// ResumeTranscriptTooLargeError with its diagnostic message. The error is
// captured manually (rather than via .rejects) so the message can be asserted
// on the same instance.
test('loadConversationForResume rejects oversized reconstructed transcripts', async () => {
  process.env.CLAUDE_CODE_SIMPLE = '1'
  const oversized = 'x'.repeat(8 * 1024 * 1024 + 32 * 1024)
  const transcriptPath = await writeJsonl(user(id(2), oversized))
  let thrown: unknown
  try {
    await loadConversationForResume('fixture', transcriptPath)
  } catch (error) {
    thrown = error
  }
  expect(thrown).toBeInstanceOf(ResumeTranscriptTooLargeError)
  expect((thrown as Error).message).toContain(
    'Reconstructed transcript is too large to resume safely',
  )
})

View File

@@ -47,6 +47,7 @@ import {
loadTranscriptFile,
removeExtraFields,
} from './sessionStorage.js'
import { jsonStringify } from './slowOperations.js'
import type { ContentReplacementRecord } from './toolResultStorage.js'
// Dead code elimination: ant-only tool names are conditionally required so
@@ -71,6 +72,37 @@ const SEND_USER_FILE_TOOL_NAME: string | null = feature('KAIROS')
: null
/* eslint-enable @typescript-eslint/no-require-imports */
// Hard cap for reconstructed resume payloads before REPL boot. 8 MiB keeps
// resume bounded well below the multi-GB failure mode we saw while leaving
// enough room for normal compacted sessions plus resume hook context.
const MAX_RESUME_MESSAGE_BYTES = 8 * 1024 * 1024
export class ResumeTranscriptTooLargeError extends Error {
constructor(
readonly bytes: number,
readonly maxBytes: number,
readonly messageCount: number,
) {
super(
`Reconstructed transcript is too large to resume safely (${(
bytes / (1024 * 1024)
).toFixed(1)} MiB > ${(maxBytes / (1024 * 1024)).toFixed(1)} MiB, ${messageCount} messages).`,
)
this.name = 'ResumeTranscriptTooLargeError'
}
}
function assertResumeMessageSize(messages: Message[]): void {
const bytes = Buffer.byteLength(jsonStringify(messages), 'utf8')
if (bytes > MAX_RESUME_MESSAGE_BYTES) {
throw new ResumeTranscriptTooLargeError(
bytes,
MAX_RESUME_MESSAGE_BYTES,
messages.length,
)
}
}
/**
* Transforms legacy attachment types to current types for backward compatibility
*/
@@ -561,11 +593,16 @@ export async function loadConversationForResume(
const deserialized = deserializeMessagesWithInterruptDetection(messages!)
messages = deserialized.messages
// Reject oversized resumes before running side-effectful resume hooks.
assertResumeMessageSize(messages)
// Process session start hooks for resume
const hookMessages = await processSessionStartHooks('resume', { sessionId })
// Append hook messages to the conversation
// Append hook messages to the conversation and guard again in case hook
// output itself pushes the session over the safe resume limit.
messages.push(...hookMessages)
assertResumeMessageSize(messages)
return {
messages,

View File

@@ -0,0 +1,196 @@
import { afterEach, expect, test } from 'bun:test'
import { mkdtemp, rm, writeFile } from 'node:fs/promises'
import { tmpdir } from 'node:os'
import { join } from 'node:path'
import { buildConversationChain, loadTranscriptFile } from './sessionStorage.ts'
// Temp dirs created by writeJsonl; each is removed in afterEach.
const tempDirs: string[] = []
// Fixed session id and timestamp shared by every fixture entry in this file.
const sessionId = '00000000-0000-4000-8000-000000000999'
const ts = '2026-04-02T00:00:00.000Z'
// Deterministic v4-shaped UUID for fixtures; `n` fills the last 12 digits.
function id(n: number): string {
  const padded = String(n).padStart(12, '0')
  return '00000000-0000-4000-8000-' + padded
}
// Shared envelope fields present on every fixture transcript entry.
function base(uuid: string, parentUuid: string | null) {
  const envelope = {
    uuid,
    parentUuid,
    timestamp: ts,
    cwd: '/tmp',
    userType: 'external',
    sessionId,
    version: 'test',
    isSidechain: false,
  }
  return envelope
}
// Fixture: an external user entry layered on top of the shared envelope.
function user(uuid: string, parentUuid: string | null, content: string) {
  const payload = { role: 'user', content }
  return {
    ...base(uuid, parentUuid),
    type: 'user',
    isMeta: false,
    message: payload,
  }
}
// Fixture: an assistant entry with a single text block and token-level usage
// stubbed to minimal non-zero input/output counts.
function assistant(uuid: string, parentUuid: string | null, text: string) {
  const usage = {
    input_tokens: 1,
    output_tokens: 1,
    cache_creation_input_tokens: 0,
    cache_read_input_tokens: 0,
  }
  return {
    ...base(uuid, parentUuid),
    type: 'assistant',
    message: {
      id: uuid,
      type: 'message',
      role: 'assistant',
      content: [{ type: 'text', text }],
      model: 'test-model',
      stop_reason: 'end_turn',
      usage,
    },
  }
}
// Fixture: a system compact_boundary entry carrying preservedSegment relink
// metadata (head/anchor/tail uuids) in its compactMetadata.
function compactBoundary(
  uuid: string,
  parentUuid: string | null,
  preservedSegment: {
    headUuid: string
    anchorUuid: string
    tailUuid: string
  },
) {
  const compactMetadata = {
    trigger: 'manual',
    preTokens: 123,
    preservedSegment,
  }
  return {
    ...base(uuid, parentUuid),
    type: 'system',
    subtype: 'compact_boundary',
    level: 'info',
    isMeta: false,
    content: 'Conversation compacted',
    compactMetadata,
  }
}
// Writes the entries as one-JSON-object-per-line into a fresh temp dir and
// returns the file path. The dir is pushed onto tempDirs for afterEach cleanup.
async function writeJsonl(entries: unknown[]): Promise<string> {
  const dir = await mkdtemp(join(tmpdir(), 'openclaude-session-storage-'))
  tempDirs.push(dir)
  const target = join(dir, 'session.jsonl')
  const body = entries.map(entry => JSON.stringify(entry)).join('\n')
  await writeFile(target, body + '\n')
  return target
}
// Remove every temp dir created during the test, draining tempDirs in place.
afterEach(async () => {
  const dirs = tempDirs.splice(0)
  await Promise.all(dirs.map(dir => rm(dir, { recursive: true, force: true })))
})
// Regression: the boundary's preservedSegment names a tail uuid (id(30)) that
// never reached the transcript. The loader must fail closed — pruning the
// pre-compact chain AND the orphaned preserved head — instead of resurrecting
// the full pre-compact history on resume.
test('loadTranscriptFile fails closed when preserved-segment tail is missing', async () => {
  const oldUser = user(id(1), null, 'old user')
  const oldAssistant = assistant(id(2), id(1), 'old assistant')
  const preservedHead = assistant(id(3), id(2), 'preserved head')
  // tailUuid id(30) is intentionally absent from the entries written below.
  const boundary = compactBoundary(id(4), id(2), {
    headUuid: id(3),
    anchorUuid: id(5),
    tailUuid: id(30),
  })
  const summary = user(id(5), id(4), 'summary')
  const filePath = await writeJsonl([
    oldUser,
    oldAssistant,
    preservedHead,
    boundary,
    summary,
  ])
  const { messages } = await loadTranscriptFile(filePath)
  // Pre-compact entries and the now-unreachable preserved head are pruned…
  expect(messages.has(id(1))).toBe(false)
  expect(messages.has(id(2))).toBe(false)
  expect(messages.has(id(3))).toBe(false)
  // …while the boundary and the post-boundary summary survive.
  expect(messages.has(id(4))).toBe(true)
  expect(messages.has(id(5))).toBe(true)
  const chain = buildConversationChain(messages, messages.get(id(5))!)
  expect(chain.map(message => message.uuid)).toEqual([id(4), id(5)])
})
// Happy path: the tail→head walk succeeds, so the preserved segment survives
// pruning and its head is relinked under the post-compact summary (the
// anchor), yielding boundary → summary → preserved head → preserved tail.
test('loadTranscriptFile preserves and relinks a valid preserved segment', async () => {
  const oldUser = user(id(11), null, 'old user')
  const oldAssistant = assistant(id(12), id(11), 'old assistant')
  const preservedHead = assistant(id(13), id(12), 'preserved head')
  const preservedTail = assistant(id(14), id(13), 'preserved tail')
  const boundary = compactBoundary(id(15), id(12), {
    headUuid: id(13),
    anchorUuid: id(16),
    tailUuid: id(14),
  })
  const summary = user(id(16), id(15), 'summary')
  const filePath = await writeJsonl([
    oldUser,
    oldAssistant,
    preservedHead,
    preservedTail,
    boundary,
    summary,
  ])
  const { messages } = await loadTranscriptFile(filePath)
  // Pre-compact entries are pruned; the preserved segment is kept.
  expect(messages.has(id(11))).toBe(false)
  expect(messages.has(id(12))).toBe(false)
  expect(messages.has(id(13))).toBe(true)
  expect(messages.has(id(14))).toBe(true)
  // Head is reparented onto the anchor; tail still hangs off the head.
  expect(messages.get(id(13))?.parentUuid).toBe(id(16))
  expect(messages.get(id(14))?.parentUuid).toBe(id(13))
  const chain = buildConversationChain(messages, messages.get(id(14))!)
  expect(chain.map(message => message.uuid)).toEqual([
    id(15),
    id(16),
    id(13),
    id(14),
  ])
})
// Regression: the boundary names an anchor uuid (id(26)) that was never
// written — the post-boundary summary never made it to disk. Even though the
// tail→head walk itself would succeed, the loader must fail closed and prune
// the preserved segment too, leaving only the boundary.
test('loadTranscriptFile fails closed when preserved-segment anchor is missing', async () => {
  // Models the case where the compact boundary was written but the post-boundary
  // summary/anchor message never made it to disk.
  const oldUser = user(id(21), null, 'old user')
  const oldAssistant = assistant(id(22), id(21), 'old assistant')
  const preservedHead = assistant(id(23), id(22), 'preserved head')
  const preservedTail = assistant(id(24), id(23), 'preserved tail')
  const boundary = compactBoundary(id(25), id(22), {
    headUuid: id(23),
    anchorUuid: id(26),
    tailUuid: id(24),
  })
  const filePath = await writeJsonl([
    oldUser,
    oldAssistant,
    preservedHead,
    preservedTail,
    boundary,
  ])
  const { messages } = await loadTranscriptFile(filePath)
  // Everything before the boundary — including the walkable preserved
  // segment — is pruned because the anchor is missing.
  expect(messages.has(id(21))).toBe(false)
  expect(messages.has(id(22))).toBe(false)
  expect(messages.has(id(23))).toBe(false)
  expect(messages.has(id(24))).toBe(false)
  expect(messages.has(id(25))).toBe(true)
  const chain = buildConversationChain(messages, messages.get(id(25))!)
  expect(chain.map(message => message.uuid)).toEqual([id(25)])
})

View File

@@ -1838,7 +1838,10 @@ export function removeExtraFields(
*/
function applyPreservedSegmentRelinks(
messages: Map<UUID, TranscriptMessage>,
): void {
): {
relinkFailed: boolean
} {
let relinkFailed = false
type Seg = NonNullable<
SystemCompactBoundaryMessage['compactMetadata']['preservedSegment']
>
@@ -1863,46 +1866,100 @@ function applyPreservedSegmentRelinks(
i++
}
// No seg anywhere → no-op. findUnresolvedToolUse etc. read the full map.
if (!lastSeg) return
if (!lastSeg) return { relinkFailed }
// Seg stale (no-seg boundary came after): skip relink, still prune at
// absolute — otherwise the stale preserved chain becomes a phantom leaf.
const segIsLive = lastSegBoundaryIdx === absoluteLastBoundaryIdx
// Validate tail→head BEFORE mutating so malformed metadata is a true
// no-op (walk stops at headUuid, doesn't need the relink to run first).
// Validate tail→head BEFORE mutating so malformed metadata never keeps
// the full pre-compact history alive on resume. If the walk breaks, mark
// the relink as failed and fall through to absolute-boundary pruning.
const preservedUuids = new Set<UUID>()
if (segIsLive) {
const walkSeen = new Set<UUID>()
const tailInTranscript = messages.has(lastSeg.tailUuid)
const headInTranscript = messages.has(lastSeg.headUuid)
const anchorInTranscript = messages.has(lastSeg.anchorUuid)
let cur = messages.get(lastSeg.tailUuid)
let reachedHead = false
while (cur && !walkSeen.has(cur.uuid)) {
let failureKind:
| 'missing_tail'
| 'missing_parent'
| 'null_parent_before_head'
| 'cycle_before_head'
| 'missing_anchor' = 'missing_tail'
let lastSeenUuid: UUID | undefined
let lastSeenType: TranscriptMessage['type'] | undefined
let breakParentUuid: UUID | null | undefined
while (cur) {
if (walkSeen.has(cur.uuid)) {
failureKind = 'cycle_before_head'
break
}
walkSeen.add(cur.uuid)
preservedUuids.add(cur.uuid)
lastSeenUuid = cur.uuid
lastSeenType = cur.type
if (cur.uuid === lastSeg.headUuid) {
reachedHead = true
break
}
cur = cur.parentUuid ? messages.get(cur.parentUuid) : undefined
breakParentUuid = cur.parentUuid
if (!breakParentUuid) {
failureKind = 'null_parent_before_head'
break
}
const next = messages.get(breakParentUuid)
if (!next) {
failureKind = 'missing_parent'
break
}
cur = next
}
if (!reachedHead) {
if (!reachedHead || !anchorInTranscript) {
if (!anchorInTranscript && reachedHead) {
failureKind = 'missing_anchor'
}
// tail→head walk broke — a UUID in the preserved segment isn't in the
// transcript. Returning here skips the prune below, so resume loads
// the full pre-compact history. Known cause: mid-turn-yielded
// attachment pushed to mutableMessages but never recordTranscript'd
// (SDK subprocess restarted before next turn's qe:420 flush).
// transcript. Fail closed: keep only the post-boundary chain instead of
// loading the full pre-compact history on resume.
relinkFailed = true
preservedUuids.clear()
logEvent('tengu_relink_walk_broken', {
tailInTranscript: messages.has(lastSeg.tailUuid),
headInTranscript: messages.has(lastSeg.headUuid),
anchorInTranscript: messages.has(lastSeg.anchorUuid),
failureKind:
failureKind as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
tailInTranscript,
headInTranscript,
anchorInTranscript,
walkSteps: walkSeen.size,
transcriptSize: messages.size,
tailIndex: entryIndex.get(lastSeg.tailUuid),
headIndex: entryIndex.get(lastSeg.headUuid),
anchorIndex: entryIndex.get(lastSeg.anchorUuid),
lastSeenType,
breakParentInTranscript: Boolean(
breakParentUuid && messages.has(breakParentUuid),
),
breakParentIsNull: breakParentUuid === null,
})
logForDiagnosticsNoPII('warn', 'relink_walk_broken', {
failureKind,
tailInTranscript,
headInTranscript,
anchorInTranscript,
walkSteps: walkSeen.size,
transcriptSize: messages.size,
})
return
logForDebugging(
`[sessionStorage] preserved-segment relink failed: kind=${failureKind} tail=${lastSeg.tailUuid} head=${lastSeg.headUuid} anchor=${lastSeg.anchorUuid} lastSeen=${lastSeenUuid ?? 'none'} breakParent=${breakParentUuid ?? 'null'}`,
)
}
}
if (segIsLive) {
if (segIsLive && !relinkFailed) {
const head = messages.get(lastSeg.headUuid)
if (head) {
messages.set(lastSeg.headUuid, {
@@ -1953,6 +2010,7 @@ function applyPreservedSegmentRelinks(
}
}
for (const uuid of toDelete) messages.delete(uuid)
return { relinkFailed }
}
/**
@@ -3701,7 +3759,12 @@ export async function loadTranscriptFile(
// File doesn't exist or can't be read
}
applyPreservedSegmentRelinks(messages)
const { relinkFailed } = applyPreservedSegmentRelinks(messages)
if (relinkFailed) {
logForDiagnosticsNoPII('warn', 'resume_relink_fail_closed', {
transcriptSize: messages.size,
})
}
applySnipRemovals(messages)
// Compute leaf UUIDs once at load time