fix: harden resume after compaction failures (#195)

* fix: harden resume after compaction failures

* test: cover resume compaction safeguards

* fix: address resume safeguard review findings
This commit is contained in:
sooth
2026-04-03 10:31:06 -04:00
committed by GitHub
parent 6987a54a71
commit b0d796e5c3
8 changed files with 499 additions and 27 deletions

View File

@@ -46,6 +46,7 @@ import type { AttributionState } from './utils/commitAttribution.js'
import { getGlobalConfig } from './utils/config.js'
import { getCwd } from './utils/cwd.js'
import { isBareMode, isEnvTruthy } from './utils/envUtils.js'
import { logForDebugging } from './utils/debug.js'
import { getFastModeState } from './utils/fastMode.js'
import {
type FileHistoryState,
@@ -695,9 +696,11 @@ export class QueryEngine {
// progress are now recorded inline (their switch cases below), but
// this flush still matters for the preservedSegment tail walk.
// If the SDK subprocess restarts before then (claude-desktop kills
// between turns), tailUuid points to a never-written message
// applyPreservedSegmentRelinks fails its tail→head walk → returns
// without pruning → resume loads full pre-compact history.
// between turns), tailUuid can point to a never-written message. In
// that case strip preservedSegment before transcript persistence so
// resume falls back to ordinary boundary pruning instead of relying on
// broken relink metadata.
let transcriptMessage = message
if (
persistSession &&
message.type === 'system' &&
@@ -710,10 +713,21 @@ export class QueryEngine {
)
if (tailIdx !== -1) {
await recordTranscript(this.mutableMessages.slice(0, tailIdx + 1))
} else {
transcriptMessage = {
...message,
compactMetadata: {
...message.compactMetadata,
preservedSegment: undefined,
},
}
logForDebugging(
`[QueryEngine] stripped preservedSegment before transcript write; missing tail ${tailUuid}`,
)
}
}
}
messages.push(message)
messages.push(transcriptMessage)
if (persistSession) {
// Fire-and-forget for assistant messages. claude.ts yields one
// assistant message per content block, then mutates the last

View File

@@ -3137,7 +3137,7 @@ async function run(): Promise<CommanderCommand> {
});
}
logError(error);
process.exit(1);
return await exitWithError(root, errorMessage(error), () => gracefulShutdown(1));
}
} else if (feature('DIRECT_CONNECT') && _pendingConnect?.url) {
// `claude connect <url>` — full interactive TUI connected to a remote server
@@ -3644,7 +3644,7 @@ async function run(): Promise<CommanderCommand> {
success: false
});
logError(error);
await exitWithError(root, `Unable to load transcript from file: ${options.resume}`, () => gracefulShutdown(1));
await exitWithError(root, errorMessage(error), () => gracefulShutdown(1));
}
}
}
@@ -3686,7 +3686,7 @@ async function run(): Promise<CommanderCommand> {
success: false
});
logError(error);
await exitWithError(root, `Failed to resume session ${sessionId}`);
await exitWithError(root, errorMessage(error));
}
}

View File

@@ -25,6 +25,7 @@ import { renameRecordingForSession } from '../utils/asciicast.js';
import { updateSessionName } from '../utils/concurrentSessions.js';
import { loadConversationForResume } from '../utils/conversationRecovery.js';
import { checkCrossProjectResume } from '../utils/crossProjectResume.js';
import { errorMessage } from '../utils/errors.js';
import type { FileHistorySnapshot } from '../utils/fileHistory.js';
import { logError } from '../utils/log.js';
import { createSystemMessage } from '../utils/messages.js';
@@ -101,6 +102,7 @@ export function ResumeConversation({
agentColor?: AgentColorName;
mainThreadAgentDefinition?: AgentDefinition;
} | null>(null);
const [resumeError, setResumeError] = React.useState<string | null>(null);
const [crossProjectCommand, setCrossProjectCommand] = React.useState<string | null>(null);
const sessionLogResultRef = React.useRef<SessionLogResult | null>(null);
// Mirror of logs.length so loadMoreLogs can compute value indices outside
@@ -176,6 +178,7 @@ export function ResumeConversation({
process.exit(1);
}
async function onSelect(log_0: LogOption) {
setResumeError(null);
setResuming(true);
const resumeStart = performance.now();
const crossProjectCheck = checkCrossProjectResume(log_0, showAllProjects, worktreePaths);
@@ -287,7 +290,8 @@ export function ResumeConversation({
success: false
});
logError(e as Error);
throw e;
setResumeError(errorMessage(e));
setResuming(false);
}
}
if (crossProjectCommand) {
@@ -308,10 +312,18 @@ export function ResumeConversation({
<Text> Resuming conversation</Text>
</Box>;
}
const resumeErrorBanner = resumeError ? <Box flexDirection="column" marginBottom={1}>
<Text color="red">Failed to resume conversation.</Text>
<Text>{resumeError}</Text>
<Text dimColor={true}>Choose a different conversation to continue.</Text>
</Box> : null;
if (filteredLogs.length === 0) {
return <NoConversationsMessage />;
}
return <LogSelector logs={filteredLogs} maxHeight={rows} onCancel={onCancel} onSelect={onSelect} onLogsChanged={isResumeWithRenameEnabled ? () => loadLogs(showAllProjects) : undefined} onLoadMore={loadMoreLogs} initialSearchQuery={initialSearchQuery} showAllProjects={showAllProjects} onToggleAllProjects={handleToggleAllProjects} onAgenticSearch={agenticSessionSearch} />;
return <Box flexDirection="column">
{resumeErrorBanner}
<LogSelector logs={filteredLogs} maxHeight={rows} onCancel={onCancel} onSelect={onSelect} onLogsChanged={isResumeWithRenameEnabled ? () => loadLogs(showAllProjects) : undefined} onLoadMore={loadMoreLogs} initialSearchQuery={initialSearchQuery} showAllProjects={showAllProjects} onToggleAllProjects={handleToggleAllProjects} onAgenticSearch={agenticSessionSearch} />
</Box>;
}
function NoConversationsMessage() {
const $ = _c(2);

View File

@@ -0,0 +1,71 @@
/**
* Hook-side-effect regression lives in a separate file with no static import of
* conversationRecovery so Bun's mock.module can replace sessionStart before
* that module is first loaded.
*/
import { afterEach, expect, mock, test } from 'bun:test'
import { mkdtemp, rm, writeFile } from 'node:fs/promises'
import { tmpdir } from 'node:os'
import { join } from 'node:path'
// Temp dirs created by writeJsonl; each is removed in afterEach.
const tempDirs: string[] = []
// Snapshot of CLAUDE_CODE_SIMPLE so afterEach can restore the original value.
const originalSimple = process.env.CLAUDE_CODE_SIMPLE
// Fixed session id and timestamp shared by every fixture entry in this file.
const sessionId = '00000000-0000-4000-8000-000000001999'
const ts = '2026-04-02T00:00:00.000Z'
// Deterministic v4-shaped UUID whose final field encodes `n`, zero-padded to 12 digits.
function id(n: number): string {
  const tail = n.toString().padStart(12, '0')
  return `00000000-0000-4000-8000-${tail}`
}
// Fixture: a minimal external user transcript entry bound to the shared
// sessionId/timestamp constants above. parentUuid is always null here.
function user(uuid: string, content: string) {
  const payload = { role: 'user', content }
  return {
    type: 'user',
    uuid,
    parentUuid: null,
    timestamp: ts,
    cwd: '/tmp',
    userType: 'external',
    sessionId,
    version: 'test',
    isSidechain: false,
    isMeta: false,
    message: payload,
  }
}
// Writes a single JSONL entry into a fresh temp dir and returns the file path.
// The dir is pushed onto tempDirs so afterEach can clean it up.
async function writeJsonl(entry: unknown): Promise<string> {
  const dir = await mkdtemp(join(tmpdir(), 'openclaude-conversation-recovery-hooks-'))
  tempDirs.push(dir)
  const target = join(dir, 'resume.jsonl')
  await writeFile(target, JSON.stringify(entry) + '\n')
  return target
}
// Restore mocks, the CLAUDE_CODE_SIMPLE env var, and temp dirs after each test.
// Bug fix: assigning `undefined` to a process.env key coerces it to the string
// "undefined" (Node stringifies env values), which would leak a truthy env var
// into later tests when the variable was originally unset. Delete the key in
// that case instead of assigning.
afterEach(async () => {
  mock.restore()
  if (originalSimple === undefined) {
    delete process.env.CLAUDE_CODE_SIMPLE
  } else {
    process.env.CLAUDE_CODE_SIMPLE = originalSimple
  }
  await Promise.all(tempDirs.splice(0).map(dir => rm(dir, { recursive: true, force: true })))
})
// The oversized-transcript guard must fire BEFORE resume SessionStart hooks,
// which are side-effectful. sessionStart.js is replaced via mock.module and
// conversationRecovery is imported dynamically afterwards, so the mock is in
// place the first time that module graph loads (see file header comment).
test('loadConversationForResume rejects oversized transcripts before resume hooks run', async () => {
  delete process.env.CLAUDE_CODE_SIMPLE
  // Just over the 8 MiB MAX_RESUME_MESSAGE_BYTES cap.
  const hugeContent = 'x'.repeat(8 * 1024 * 1024 + 32 * 1024)
  const path = await writeJsonl(user(id(3), hugeContent))
  const hookSpy = mock(() => Promise.resolve([{ type: 'hook' }]))
  mock.module('./sessionStart.js', () => ({
    processSessionStartHooks: hookSpy,
  }))
  const { loadConversationForResume, ResumeTranscriptTooLargeError } = await import(
    './conversationRecovery.ts'
  )
  await expect(loadConversationForResume('fixture', path)).rejects.toBeInstanceOf(
    ResumeTranscriptTooLargeError,
  )
  // The size guard rejected before any hook could run.
  expect(hookSpy).not.toHaveBeenCalled()
})

View File

@@ -0,0 +1,79 @@
import { afterEach, expect, test } from 'bun:test'
import { mkdtemp, rm, writeFile } from 'node:fs/promises'
import { tmpdir } from 'node:os'
import { join } from 'node:path'
import {
loadConversationForResume,
ResumeTranscriptTooLargeError,
} from './conversationRecovery.ts'
// Temp dirs created by writeJsonl; each is removed in afterEach.
const tempDirs: string[] = []
// Snapshot of CLAUDE_CODE_SIMPLE so afterEach can restore the original value.
const originalSimple = process.env.CLAUDE_CODE_SIMPLE
// Fixed session id and timestamp shared by every fixture entry in this file.
const sessionId = '00000000-0000-4000-8000-000000001999'
const ts = '2026-04-02T00:00:00.000Z'
// Stable fixture UUID generator: `n` becomes the last 12 digits of a v4-shaped id.
function id(n: number): string {
  return `00000000-0000-4000-8000-${`${n}`.padStart(12, '0')}`
}
// Fixture: a minimal external user transcript entry bound to the shared
// sessionId/timestamp constants above. parentUuid is always null here.
function user(uuid: string, content: string) {
  const payload = { role: 'user', content }
  return {
    type: 'user',
    uuid,
    parentUuid: null,
    timestamp: ts,
    cwd: '/tmp',
    userType: 'external',
    sessionId,
    version: 'test',
    isSidechain: false,
    isMeta: false,
    message: payload,
  }
}
// Writes a single JSONL entry into a fresh temp dir and returns the file path.
// The dir is pushed onto tempDirs so afterEach can clean it up.
async function writeJsonl(entry: unknown): Promise<string> {
  const dir = await mkdtemp(join(tmpdir(), 'openclaude-conversation-recovery-'))
  tempDirs.push(dir)
  const target = join(dir, 'resume.jsonl')
  await writeFile(target, JSON.stringify(entry) + '\n')
  return target
}
// Restore the CLAUDE_CODE_SIMPLE env var and clean up temp dirs after each test.
// Bug fix: assigning `undefined` to a process.env key coerces it to the string
// "undefined" (Node stringifies env values), which would leak a truthy env var
// into later tests when the variable was originally unset. Delete the key in
// that case instead of assigning.
afterEach(async () => {
  if (originalSimple === undefined) {
    delete process.env.CLAUDE_CODE_SIMPLE
  } else {
    process.env.CLAUDE_CODE_SIMPLE = originalSimple
  }
  await Promise.all(tempDirs.splice(0).map(dir => rm(dir, { recursive: true, force: true })))
})
// Baseline: a tiny single-entry transcript resumes cleanly and reports the
// fixture session id with at least one reconstructed message.
test('loadConversationForResume accepts a small transcript from jsonl path', async () => {
  process.env.CLAUDE_CODE_SIMPLE = '1'
  const transcriptPath = await writeJsonl(user(id(1), 'hello'))
  const resumed = await loadConversationForResume('fixture', transcriptPath)
  expect(resumed).not.toBeNull()
  expect(resumed?.sessionId).toBe(sessionId)
  expect(resumed?.messages.length).toBeGreaterThan(0)
})
// A payload just over the 8 MiB resume cap must throw
// ResumeTranscriptTooLargeError with its diagnostic message. The error is
// captured manually (rather than via .rejects) so the message can be asserted
// on the same instance.
test('loadConversationForResume rejects oversized reconstructed transcripts', async () => {
  process.env.CLAUDE_CODE_SIMPLE = '1'
  const oversized = 'x'.repeat(8 * 1024 * 1024 + 32 * 1024)
  const transcriptPath = await writeJsonl(user(id(2), oversized))
  let thrown: unknown
  try {
    await loadConversationForResume('fixture', transcriptPath)
  } catch (error) {
    thrown = error
  }
  expect(thrown).toBeInstanceOf(ResumeTranscriptTooLargeError)
  expect((thrown as Error).message).toContain(
    'Reconstructed transcript is too large to resume safely',
  )
})

View File

@@ -47,6 +47,7 @@ import {
loadTranscriptFile,
removeExtraFields,
} from './sessionStorage.js'
import { jsonStringify } from './slowOperations.js'
import type { ContentReplacementRecord } from './toolResultStorage.js'
// Dead code elimination: ant-only tool names are conditionally required so
@@ -71,6 +72,37 @@ const SEND_USER_FILE_TOOL_NAME: string | null = feature('KAIROS')
: null
/* eslint-enable @typescript-eslint/no-require-imports */
// Hard cap for reconstructed resume payloads before REPL boot. 8 MiB keeps
// resume bounded well below the multi-GB failure mode we saw while leaving
// enough room for normal compacted sessions plus resume hook context.
const MAX_RESUME_MESSAGE_BYTES = 8 * 1024 * 1024
export class ResumeTranscriptTooLargeError extends Error {
constructor(
readonly bytes: number,
readonly maxBytes: number,
readonly messageCount: number,
) {
super(
`Reconstructed transcript is too large to resume safely (${(
bytes / (1024 * 1024)
).toFixed(1)} MiB > ${(maxBytes / (1024 * 1024)).toFixed(1)} MiB, ${messageCount} messages).`,
)
this.name = 'ResumeTranscriptTooLargeError'
}
}
function assertResumeMessageSize(messages: Message[]): void {
const bytes = Buffer.byteLength(jsonStringify(messages), 'utf8')
if (bytes > MAX_RESUME_MESSAGE_BYTES) {
throw new ResumeTranscriptTooLargeError(
bytes,
MAX_RESUME_MESSAGE_BYTES,
messages.length,
)
}
}
/**
* Transforms legacy attachment types to current types for backward compatibility
*/
@@ -561,11 +593,16 @@ export async function loadConversationForResume(
const deserialized = deserializeMessagesWithInterruptDetection(messages!)
messages = deserialized.messages
// Reject oversized resumes before running side-effectful resume hooks.
assertResumeMessageSize(messages)
// Process session start hooks for resume
const hookMessages = await processSessionStartHooks('resume', { sessionId })
// Append hook messages to the conversation
// Append hook messages to the conversation and guard again in case hook
// output itself pushes the session over the safe resume limit.
messages.push(...hookMessages)
assertResumeMessageSize(messages)
return {
messages,

View File

@@ -0,0 +1,196 @@
import { afterEach, expect, test } from 'bun:test'
import { mkdtemp, rm, writeFile } from 'node:fs/promises'
import { tmpdir } from 'node:os'
import { join } from 'node:path'
import { buildConversationChain, loadTranscriptFile } from './sessionStorage.ts'
// Temp dirs created by writeJsonl; each is removed in afterEach.
const tempDirs: string[] = []
// Fixed session id and timestamp shared by every fixture entry in this file.
const sessionId = '00000000-0000-4000-8000-000000000999'
const ts = '2026-04-02T00:00:00.000Z'
// Deterministic v4-shaped UUID for fixtures; `n` fills the last 12 digits.
function id(n: number): string {
  const padded = String(n).padStart(12, '0')
  return '00000000-0000-4000-8000-' + padded
}
// Shared envelope fields present on every fixture transcript entry.
function base(uuid: string, parentUuid: string | null) {
  const envelope = {
    uuid,
    parentUuid,
    timestamp: ts,
    cwd: '/tmp',
    userType: 'external',
    sessionId,
    version: 'test',
    isSidechain: false,
  }
  return envelope
}
// Fixture: an external user entry layered on top of the shared envelope.
function user(uuid: string, parentUuid: string | null, content: string) {
  const payload = { role: 'user', content }
  return {
    ...base(uuid, parentUuid),
    type: 'user',
    isMeta: false,
    message: payload,
  }
}
// Fixture: an assistant entry with a single text block and token-level usage
// stubbed to minimal non-zero input/output counts.
function assistant(uuid: string, parentUuid: string | null, text: string) {
  const usage = {
    input_tokens: 1,
    output_tokens: 1,
    cache_creation_input_tokens: 0,
    cache_read_input_tokens: 0,
  }
  return {
    ...base(uuid, parentUuid),
    type: 'assistant',
    message: {
      id: uuid,
      type: 'message',
      role: 'assistant',
      content: [{ type: 'text', text }],
      model: 'test-model',
      stop_reason: 'end_turn',
      usage,
    },
  }
}
// Fixture: a system compact_boundary entry carrying preservedSegment relink
// metadata (head/anchor/tail uuids) in its compactMetadata.
function compactBoundary(
  uuid: string,
  parentUuid: string | null,
  preservedSegment: {
    headUuid: string
    anchorUuid: string
    tailUuid: string
  },
) {
  const compactMetadata = {
    trigger: 'manual',
    preTokens: 123,
    preservedSegment,
  }
  return {
    ...base(uuid, parentUuid),
    type: 'system',
    subtype: 'compact_boundary',
    level: 'info',
    isMeta: false,
    content: 'Conversation compacted',
    compactMetadata,
  }
}
// Writes the entries as one-JSON-object-per-line into a fresh temp dir and
// returns the file path. The dir is pushed onto tempDirs for afterEach cleanup.
async function writeJsonl(entries: unknown[]): Promise<string> {
  const dir = await mkdtemp(join(tmpdir(), 'openclaude-session-storage-'))
  tempDirs.push(dir)
  const target = join(dir, 'session.jsonl')
  const body = entries.map(entry => JSON.stringify(entry)).join('\n')
  await writeFile(target, body + '\n')
  return target
}
// Remove every temp dir created during the test, draining tempDirs in place.
afterEach(async () => {
  const dirs = tempDirs.splice(0)
  await Promise.all(dirs.map(dir => rm(dir, { recursive: true, force: true })))
})
// Regression: the boundary's preservedSegment names a tail uuid (id(30)) that
// never reached the transcript. The loader must fail closed — pruning the
// pre-compact chain AND the orphaned preserved head — instead of resurrecting
// the full pre-compact history on resume.
test('loadTranscriptFile fails closed when preserved-segment tail is missing', async () => {
  const oldUser = user(id(1), null, 'old user')
  const oldAssistant = assistant(id(2), id(1), 'old assistant')
  const preservedHead = assistant(id(3), id(2), 'preserved head')
  // tailUuid id(30) is intentionally absent from the entries written below.
  const boundary = compactBoundary(id(4), id(2), {
    headUuid: id(3),
    anchorUuid: id(5),
    tailUuid: id(30),
  })
  const summary = user(id(5), id(4), 'summary')
  const filePath = await writeJsonl([
    oldUser,
    oldAssistant,
    preservedHead,
    boundary,
    summary,
  ])
  const { messages } = await loadTranscriptFile(filePath)
  // Pre-compact entries and the now-unreachable preserved head are pruned…
  expect(messages.has(id(1))).toBe(false)
  expect(messages.has(id(2))).toBe(false)
  expect(messages.has(id(3))).toBe(false)
  // …while the boundary and the post-boundary summary survive.
  expect(messages.has(id(4))).toBe(true)
  expect(messages.has(id(5))).toBe(true)
  const chain = buildConversationChain(messages, messages.get(id(5))!)
  expect(chain.map(message => message.uuid)).toEqual([id(4), id(5)])
})
// Happy path: the tail→head walk succeeds, so the preserved segment survives
// pruning and its head is relinked under the post-compact summary (the
// anchor), yielding boundary → summary → preserved head → preserved tail.
test('loadTranscriptFile preserves and relinks a valid preserved segment', async () => {
  const oldUser = user(id(11), null, 'old user')
  const oldAssistant = assistant(id(12), id(11), 'old assistant')
  const preservedHead = assistant(id(13), id(12), 'preserved head')
  const preservedTail = assistant(id(14), id(13), 'preserved tail')
  const boundary = compactBoundary(id(15), id(12), {
    headUuid: id(13),
    anchorUuid: id(16),
    tailUuid: id(14),
  })
  const summary = user(id(16), id(15), 'summary')
  const filePath = await writeJsonl([
    oldUser,
    oldAssistant,
    preservedHead,
    preservedTail,
    boundary,
    summary,
  ])
  const { messages } = await loadTranscriptFile(filePath)
  // Pre-compact entries are pruned; the preserved segment is kept.
  expect(messages.has(id(11))).toBe(false)
  expect(messages.has(id(12))).toBe(false)
  expect(messages.has(id(13))).toBe(true)
  expect(messages.has(id(14))).toBe(true)
  // Head is reparented onto the anchor; tail still hangs off the head.
  expect(messages.get(id(13))?.parentUuid).toBe(id(16))
  expect(messages.get(id(14))?.parentUuid).toBe(id(13))
  const chain = buildConversationChain(messages, messages.get(id(14))!)
  expect(chain.map(message => message.uuid)).toEqual([
    id(15),
    id(16),
    id(13),
    id(14),
  ])
})
// Regression: the boundary names an anchor uuid (id(26)) that was never
// written — the post-boundary summary never made it to disk. Even though the
// tail→head walk itself would succeed, the loader must fail closed and prune
// the preserved segment too, leaving only the boundary.
test('loadTranscriptFile fails closed when preserved-segment anchor is missing', async () => {
  // Models the case where the compact boundary was written but the post-boundary
  // summary/anchor message never made it to disk.
  const oldUser = user(id(21), null, 'old user')
  const oldAssistant = assistant(id(22), id(21), 'old assistant')
  const preservedHead = assistant(id(23), id(22), 'preserved head')
  const preservedTail = assistant(id(24), id(23), 'preserved tail')
  const boundary = compactBoundary(id(25), id(22), {
    headUuid: id(23),
    anchorUuid: id(26),
    tailUuid: id(24),
  })
  const filePath = await writeJsonl([
    oldUser,
    oldAssistant,
    preservedHead,
    preservedTail,
    boundary,
  ])
  const { messages } = await loadTranscriptFile(filePath)
  // Everything before the boundary — including the walkable preserved
  // segment — is pruned because the anchor is missing.
  expect(messages.has(id(21))).toBe(false)
  expect(messages.has(id(22))).toBe(false)
  expect(messages.has(id(23))).toBe(false)
  expect(messages.has(id(24))).toBe(false)
  expect(messages.has(id(25))).toBe(true)
  const chain = buildConversationChain(messages, messages.get(id(25))!)
  expect(chain.map(message => message.uuid)).toEqual([id(25)])
})

View File

@@ -1838,7 +1838,10 @@ export function removeExtraFields(
*/
function applyPreservedSegmentRelinks(
messages: Map<UUID, TranscriptMessage>,
): void {
): {
relinkFailed: boolean
} {
let relinkFailed = false
type Seg = NonNullable<
SystemCompactBoundaryMessage['compactMetadata']['preservedSegment']
>
@@ -1863,46 +1866,100 @@ function applyPreservedSegmentRelinks(
i++
}
// No seg anywhere → no-op. findUnresolvedToolUse etc. read the full map.
if (!lastSeg) return
if (!lastSeg) return { relinkFailed }
// Seg stale (no-seg boundary came after): skip relink, still prune at
// absolute — otherwise the stale preserved chain becomes a phantom leaf.
const segIsLive = lastSegBoundaryIdx === absoluteLastBoundaryIdx
// Validate tail→head BEFORE mutating so malformed metadata is a true
// no-op (walk stops at headUuid, doesn't need the relink to run first).
// Validate tail→head BEFORE mutating so malformed metadata never keeps
// the full pre-compact history alive on resume. If the walk breaks, mark
// the relink as failed and fall through to absolute-boundary pruning.
const preservedUuids = new Set<UUID>()
if (segIsLive) {
const walkSeen = new Set<UUID>()
const tailInTranscript = messages.has(lastSeg.tailUuid)
const headInTranscript = messages.has(lastSeg.headUuid)
const anchorInTranscript = messages.has(lastSeg.anchorUuid)
let cur = messages.get(lastSeg.tailUuid)
let reachedHead = false
while (cur && !walkSeen.has(cur.uuid)) {
let failureKind:
| 'missing_tail'
| 'missing_parent'
| 'null_parent_before_head'
| 'cycle_before_head'
| 'missing_anchor' = 'missing_tail'
let lastSeenUuid: UUID | undefined
let lastSeenType: TranscriptMessage['type'] | undefined
let breakParentUuid: UUID | null | undefined
while (cur) {
if (walkSeen.has(cur.uuid)) {
failureKind = 'cycle_before_head'
break
}
walkSeen.add(cur.uuid)
preservedUuids.add(cur.uuid)
lastSeenUuid = cur.uuid
lastSeenType = cur.type
if (cur.uuid === lastSeg.headUuid) {
reachedHead = true
break
}
cur = cur.parentUuid ? messages.get(cur.parentUuid) : undefined
breakParentUuid = cur.parentUuid
if (!breakParentUuid) {
failureKind = 'null_parent_before_head'
break
}
const next = messages.get(breakParentUuid)
if (!next) {
failureKind = 'missing_parent'
break
}
cur = next
}
if (!reachedHead) {
if (!reachedHead || !anchorInTranscript) {
if (!anchorInTranscript && reachedHead) {
failureKind = 'missing_anchor'
}
// tail→head walk broke — a UUID in the preserved segment isn't in the
// transcript. Returning here skips the prune below, so resume loads
// the full pre-compact history. Known cause: mid-turn-yielded
// attachment pushed to mutableMessages but never recordTranscript'd
// (SDK subprocess restarted before next turn's qe:420 flush).
// transcript. Fail closed: keep only the post-boundary chain instead of
// loading the full pre-compact history on resume.
relinkFailed = true
preservedUuids.clear()
logEvent('tengu_relink_walk_broken', {
tailInTranscript: messages.has(lastSeg.tailUuid),
headInTranscript: messages.has(lastSeg.headUuid),
anchorInTranscript: messages.has(lastSeg.anchorUuid),
failureKind:
failureKind as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
tailInTranscript,
headInTranscript,
anchorInTranscript,
walkSteps: walkSeen.size,
transcriptSize: messages.size,
tailIndex: entryIndex.get(lastSeg.tailUuid),
headIndex: entryIndex.get(lastSeg.headUuid),
anchorIndex: entryIndex.get(lastSeg.anchorUuid),
lastSeenType,
breakParentInTranscript: Boolean(
breakParentUuid && messages.has(breakParentUuid),
),
breakParentIsNull: breakParentUuid === null,
})
logForDiagnosticsNoPII('warn', 'relink_walk_broken', {
failureKind,
tailInTranscript,
headInTranscript,
anchorInTranscript,
walkSteps: walkSeen.size,
transcriptSize: messages.size,
})
return
logForDebugging(
`[sessionStorage] preserved-segment relink failed: kind=${failureKind} tail=${lastSeg.tailUuid} head=${lastSeg.headUuid} anchor=${lastSeg.anchorUuid} lastSeen=${lastSeenUuid ?? 'none'} breakParent=${breakParentUuid ?? 'null'}`,
)
}
}
if (segIsLive) {
if (segIsLive && !relinkFailed) {
const head = messages.get(lastSeg.headUuid)
if (head) {
messages.set(lastSeg.headUuid, {
@@ -1953,6 +2010,7 @@ function applyPreservedSegmentRelinks(
}
}
for (const uuid of toDelete) messages.delete(uuid)
return { relinkFailed }
}
/**
@@ -3701,7 +3759,12 @@ export async function loadTranscriptFile(
// File doesn't exist or can't be read
}
applyPreservedSegmentRelinks(messages)
const { relinkFailed } = applyPreservedSegmentRelinks(messages)
if (relinkFailed) {
logForDiagnosticsNoPII('warn', 'resume_relink_fail_closed', {
transcriptSize: messages.size,
})
}
applySnipRemovals(messages)
// Compute leaf UUIDs once at load time