From 336ddcc50d59d79ebff50993f2673652aecb0d7d Mon Sep 17 00:00:00 2001 From: Kevin Codex Date: Mon, 20 Apr 2026 15:18:58 +0800 Subject: [PATCH] fix(api): replace phrase-based reasoning sanitizer with tag-based filter (#779) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reasoning models (MiniMax M2.7, GLM-4.5/5, DeepSeek, Kimi K2) inline chain-of-thought inside `<think>...</think>` tags in the content field rather than using the reasoning_content channel. The prior phrase-matching sanitizer (looksLikeLeakedReasoningPrefix) only caught English-prose preambles like "I should"/"the user asked", missed tag-based leaks entirely, and risked false-stripping legitimate assistant output. Replace with a structural tag-based approach (same pattern as hermes-agent): - createThinkTagFilter() — streaming state machine that buffers partial tags across SSE delta boundaries (e.g. an open tag arriving as `<thi` in one delta and `nk>` in the next), so tags split mid-chunk still parse correctly. - stripThinkTags() — whole-text cleanup for non-streaming responses and as a safety net. Handles closed pairs, unterminated opens at block boundaries, and orphan tags. - Recognizes think, thinking, reasoning, thought, REASONING_SCRATCHPAD case-insensitively, including tags with attributes. - False-negative bias: flush() discards buffered partial tags at stream end rather than leaking them. Existing phrase-based shim tests updated to exercise the actual tag leak. Added regression tests confirming legitimate prose starting with "I should..." is preserved (the old sanitizer's main false-positive). 
Co-authored-by: Claude Opus 4.7 (1M context) --- src/services/api/codexShim.test.ts | 89 ++++++++- src/services/api/codexShim.ts | 66 ++----- src/services/api/openaiShim.test.ts | 95 ++++++++- src/services/api/openaiShim.ts | 62 ++---- .../api/reasoningLeakSanitizer.test.ts | 46 ----- src/services/api/reasoningLeakSanitizer.ts | 54 ------ src/services/api/thinkTagSanitizer.test.ts | 183 ++++++++++++++++++ src/services/api/thinkTagSanitizer.ts | 162 ++++++++++++++++ 8 files changed, 544 insertions(+), 213 deletions(-) delete mode 100644 src/services/api/reasoningLeakSanitizer.test.ts delete mode 100644 src/services/api/reasoningLeakSanitizer.ts create mode 100644 src/services/api/thinkTagSanitizer.test.ts create mode 100644 src/services/api/thinkTagSanitizer.ts diff --git a/src/services/api/codexShim.test.ts b/src/services/api/codexShim.test.ts index d2e39aae..40fcf79a 100644 --- a/src/services/api/codexShim.test.ts +++ b/src/services/api/codexShim.test.ts @@ -547,7 +547,7 @@ describe('Codex request translation', () => { ]) }) - test('strips leaked reasoning preamble from completed Codex text responses', () => { + test('strips tag block from completed Codex text responses', () => { const message = convertCodexResponseToAnthropicMessage( { id: 'resp_1', @@ -560,7 +560,7 @@ describe('Codex request translation', () => { { type: 'output_text', text: - 'The user just said "hey" - a simple greeting. I should respond briefly and friendly.\n\nHey! How can I help you today?', + 'user wants a greeting, respond brieflyHey! 
How can I help you today?', }, ], }, @@ -578,6 +578,37 @@ describe('Codex request translation', () => { ]) }) + test('strips unterminated tag at block boundary in Codex completed response', () => { + const message = convertCodexResponseToAnthropicMessage( + { + id: 'resp_1', + model: 'gpt-5.4', + output: [ + { + type: 'message', + role: 'assistant', + content: [ + { + type: 'output_text', + text: + 'Here is the answer.\nwait, let me reconsider the user request', + }, + ], + }, + ], + usage: { input_tokens: 12, output_tokens: 4 }, + }, + 'gpt-5.4', + ) + + expect(message.content).toEqual([ + { + type: 'text', + text: 'Here is the answer.', + }, + ]) + }) + test('translates Codex SSE text stream into Anthropic events', async () => { const responseText = [ 'event: response.output_item.added', @@ -609,7 +640,7 @@ describe('Codex request translation', () => { ]) }) - test('strips leaked reasoning preamble from Codex SSE text stream', async () => { + test('strips tag block from Codex SSE text stream', async () => { const responseText = [ 'event: response.output_item.added', 'data: {"type":"response.output_item.added","item":{"id":"msg_1","type":"message","status":"in_progress","content":[],"role":"assistant"},"output_index":0,"sequence_number":0}', @@ -618,13 +649,13 @@ describe('Codex request translation', () => { 'data: {"type":"response.content_part.added","content_index":0,"item_id":"msg_1","output_index":0,"part":{"type":"output_text","text":""},"sequence_number":1}', '', 'event: response.output_text.delta', - 'data: {"type":"response.output_text.delta","content_index":0,"delta":"The user just said \\"hey\\" - a simple greeting. I should respond briefly and friendly.\\n\\nHey! How can I help you today?","item_id":"msg_1","output_index":0,"sequence_number":2}', + 'data: {"type":"response.output_text.delta","content_index":0,"delta":"user wants a greeting, respond brieflyHey! 
How can I help you today?","item_id":"msg_1","output_index":0,"sequence_number":2}', '', 'event: response.output_item.done', - 'data: {"type":"response.output_item.done","item":{"id":"msg_1","type":"message","status":"completed","content":[{"type":"output_text","text":"The user just said \\"hey\\" - a simple greeting. I should respond briefly and friendly.\\n\\nHey! How can I help you today?"}],"role":"assistant"},"output_index":0,"sequence_number":3}', + 'data: {"type":"response.output_item.done","item":{"id":"msg_1","type":"message","status":"completed","content":[{"type":"output_text","text":"user wants a greeting, respond brieflyHey! How can I help you today?"}],"role":"assistant"},"output_index":0,"sequence_number":3}', '', 'event: response.completed', - 'data: {"type":"response.completed","response":{"id":"resp_1","status":"completed","model":"gpt-5.4","output":[{"type":"message","role":"assistant","content":[{"type":"output_text","text":"The user just said \\"hey\\" - a simple greeting. I should respond briefly and friendly.\\n\\nHey! How can I help you today?"}]}],"usage":{"input_tokens":2,"output_tokens":1}},"sequence_number":4}', + 'data: {"type":"response.completed","response":{"id":"resp_1","status":"completed","model":"gpt-5.4","output":[{"type":"message","role":"assistant","content":[{"type":"output_text","text":"user wants a greeting, respond brieflyHey! How can I help you today?"}]}],"usage":{"input_tokens":2,"output_tokens":1}},"sequence_number":4}', '', ].join('\n') @@ -646,6 +677,50 @@ describe('Codex request translation', () => { } } - expect(textDeltas).toEqual(['Hey! How can I help you today?']) + expect(textDeltas.join('')).toBe('Hey! How can I help you today?') + }) + + test('preserves prose without tags (no phrase-based false positive)', async () => { + // Regression test: older phrase-based sanitizer would incorrectly strip text + // starting with "I should" or "The user". The tag-based approach leaves it alone. 
+ const responseText = [ + 'event: response.output_item.added', + 'data: {"type":"response.output_item.added","item":{"id":"msg_1","type":"message","status":"in_progress","content":[],"role":"assistant"},"output_index":0,"sequence_number":0}', + '', + 'event: response.content_part.added', + 'data: {"type":"response.content_part.added","content_index":0,"item_id":"msg_1","output_index":0,"part":{"type":"output_text","text":""},"sequence_number":1}', + '', + 'event: response.output_text.delta', + 'data: {"type":"response.output_text.delta","content_index":0,"delta":"I should note that the user role requires a briefly concise friendly response format.","item_id":"msg_1","output_index":0,"sequence_number":2}', + '', + 'event: response.output_item.done', + 'data: {"type":"response.output_item.done","item":{"id":"msg_1","type":"message","status":"completed","content":[{"type":"output_text","text":"I should note that the user role requires a briefly concise friendly response format."}],"role":"assistant"},"output_index":0,"sequence_number":3}', + '', + 'event: response.completed', + 'data: {"type":"response.completed","response":{"id":"resp_1","status":"completed","model":"gpt-5.4","output":[{"type":"message","role":"assistant","content":[{"type":"output_text","text":"I should note that the user role requires a briefly concise friendly response format."}]}],"usage":{"input_tokens":2,"output_tokens":1}},"sequence_number":4}', + '', + ].join('\n') + + const stream = new ReadableStream({ + start(controller) { + controller.enqueue(new TextEncoder().encode(responseText)) + controller.close() + }, + }) + + const textDeltas: string[] = [] + for await (const event of codexStreamToAnthropic( + new Response(stream), + 'gpt-5.4', + )) { + const delta = (event as { delta?: { type?: string; text?: string } }).delta + if (delta?.type === 'text_delta' && typeof delta.text === 'string') { + textDeltas.push(delta.text) + } + } + + expect(textDeltas.join('')).toBe( + 'I should note that 
the user role requires a briefly concise friendly response format.', + ) }) }) diff --git a/src/services/api/codexShim.ts b/src/services/api/codexShim.ts index 211bdd82..7ed29df5 100644 --- a/src/services/api/codexShim.ts +++ b/src/services/api/codexShim.ts @@ -6,10 +6,9 @@ import type { } from './providerConfig.js' import { sanitizeSchemaForOpenAICompat } from './openaiSchemaSanitizer.js' import { - looksLikeLeakedReasoningPrefix, - shouldBufferPotentialReasoningPrefix, - stripLeakedReasoningPreamble, -} from './reasoningLeakSanitizer.js' + createThinkTagFilter, + stripThinkTags, +} from './thinkTagSanitizer.js' export interface AnthropicUsage { input_tokens: number @@ -734,25 +733,22 @@ export async function* codexStreamToAnthropic( { index: number; toolUseId: string } >() let activeTextBlockIndex: number | null = null - let activeTextBuffer = '' - let textBufferMode: 'none' | 'pending' | 'strip' = 'none' + const thinkFilter = createThinkTagFilter() let nextContentBlockIndex = 0 let sawToolUse = false let finalResponse: Record | undefined const closeActiveTextBlock = async function* () { if (activeTextBlockIndex === null) return - if (textBufferMode !== 'none') { - const sanitized = stripLeakedReasoningPreamble(activeTextBuffer) - if (sanitized) { - yield { - type: 'content_block_delta', - index: activeTextBlockIndex, - delta: { - type: 'text_delta', - text: sanitized, - }, - } + const tail = thinkFilter.flush() + if (tail) { + yield { + type: 'content_block_delta', + index: activeTextBlockIndex, + delta: { + type: 'text_delta', + text: tail, + }, } } yield { @@ -760,8 +756,6 @@ export async function* codexStreamToAnthropic( index: activeTextBlockIndex, } activeTextBlockIndex = null - activeTextBuffer = '' - textBufferMode = 'none' } const startTextBlockIfNeeded = async function* () { @@ -837,43 +831,17 @@ export async function* codexStreamToAnthropic( if (event.event === 'response.output_text.delta') { yield* startTextBlockIfNeeded() - activeTextBuffer += 
payload.delta ?? '' if (activeTextBlockIndex !== null) { - if ( - textBufferMode === 'strip' || - looksLikeLeakedReasoningPrefix(activeTextBuffer) - ) { - textBufferMode = 'strip' - continue - } - - if (textBufferMode === 'pending') { - if (shouldBufferPotentialReasoningPrefix(activeTextBuffer)) { - continue - } + const visible = thinkFilter.feed(payload.delta ?? '') + if (visible) { yield { type: 'content_block_delta', index: activeTextBlockIndex, delta: { type: 'text_delta', - text: activeTextBuffer, + text: visible, }, } - textBufferMode = 'none' - continue - } - - if (shouldBufferPotentialReasoningPrefix(activeTextBuffer)) { - textBufferMode = 'pending' - continue - } - yield { - type: 'content_block_delta', - index: activeTextBlockIndex, - delta: { - type: 'text_delta', - text: payload.delta ?? '', - }, } } continue @@ -969,7 +937,7 @@ export function convertCodexResponseToAnthropicMessage( if (part?.type === 'output_text') { content.push({ type: 'text', - text: stripLeakedReasoningPreamble(part.text ?? ''), + text: stripThinkTags(part.text ?? ''), }) } } diff --git a/src/services/api/openaiShim.test.ts b/src/services/api/openaiShim.test.ts index 23a7dd30..eaf8b590 100644 --- a/src/services/api/openaiShim.test.ts +++ b/src/services/api/openaiShim.test.ts @@ -2513,7 +2513,7 @@ test('non-streaming: real content takes precedence over reasoning_content', asyn ]) }) -test('non-streaming: strips leaked reasoning preamble from assistant content', async () => { +test('non-streaming: strips tag block from assistant content', async () => { globalThis.fetch = (async () => { return new Response( JSON.stringify({ @@ -2524,7 +2524,7 @@ test('non-streaming: strips leaked reasoning preamble from assistant content', a message: { role: 'assistant', content: - 'The user just said "hey" - a simple greeting. I should respond briefly and friendly.\n\nHey! How can I help you today?', + 'user wants a greeting, respond brieflyHey! 
How can I help you today?', }, finish_reason: 'stop', }, @@ -2645,7 +2645,7 @@ test('streaming: thinking block closed before tool call', async () => { expect(thinkingStart?.content_block?.type).toBe('thinking') }) -test('streaming: strips leaked reasoning preamble from assistant content deltas', async () => { +test('streaming: strips tag block from assistant content deltas', async () => { globalThis.fetch = (async () => { const chunks = makeStreamChunks([ { @@ -2658,7 +2658,7 @@ test('streaming: strips leaked reasoning preamble from assistant content deltas' delta: { role: 'assistant', content: - 'The user just said "hey" - a simple greeting. I should respond briefly and friendly.\n\nHey! How can I help you today?', + 'user wants a greeting, respond brieflyHey! How can I help you today?', }, finish_reason: null, }, @@ -2700,10 +2700,10 @@ test('streaming: strips leaked reasoning preamble from assistant content deltas' } } - expect(textDeltas).toEqual(['Hey! How can I help you today?']) + expect(textDeltas.join('')).toBe('Hey! How can I help you today?') }) -test('streaming: strips leaked reasoning preamble when split across multiple content chunks', async () => { +test('streaming: strips tag split across multiple content chunks', async () => { globalThis.fetch = (async () => { const chunks = makeStreamChunks([ { @@ -2715,7 +2715,7 @@ test('streaming: strips leaked reasoning preamble when split across multiple con index: 0, delta: { role: 'assistant', - content: 'The user said "hey" - this is a simple greeting. ', + content: 'user wants a greeting,', }, finish_reason: null, }, @@ -2729,8 +2729,21 @@ test('streaming: strips leaked reasoning preamble when split across multiple con { index: 0, delta: { - content: - 'I should respond in a friendly, concise way.\n\nHey! How can I help you today?', + content: ' respond brieflyHey! 
How can I help you today?', }, finish_reason: null, }, @@ -2773,7 +2786,69 @@ test('streaming: strips leaked reasoning preamble when split across multiple con } } - expect(textDeltas).toEqual(['Hey! How can I help you today?']) + expect(textDeltas.join('')).toBe('Hey! How can I help you today?') +}) + +test('streaming: preserves prose without tags (no phrase-based false positive)', async () => { + // Regression: older phrase-based sanitizer would strip "I should..." prose. + // The tag-based approach leaves legitimate assistant output alone. + globalThis.fetch = (async () => { + const chunks = makeStreamChunks([ + { + id: 'chatcmpl-1', + object: 'chat.completion.chunk', + model: 'gpt-5-mini', + choices: [ + { + index: 0, + delta: { + role: 'assistant', + content: + 'I should note that the user role requires a briefly concise friendly response format.', + }, + finish_reason: null, + }, + ], + }, + { + id: 'chatcmpl-1', + object: 'chat.completion.chunk', + model: 'gpt-5-mini', + choices: [ + { + index: 0, + delta: {}, + finish_reason: 'stop', + }, + ], + }, + ]) + + return makeSseResponse(chunks) + }) as FetchType + + const client = createOpenAIShimClient({}) as OpenAIShimClient + const result = await client.beta.messages + .create({ + model: 'gpt-5-mini', + system: 'test system', + messages: [{ role: 'user', content: 'hey' }], + max_tokens: 64, + stream: true, + }) + .withResponse() + + const textDeltas: string[] = [] + for await (const event of result.data) { + const delta = (event as { delta?: { type?: string; text?: string } }).delta + if (delta?.type === 'text_delta' && typeof delta.text === 'string') { + textDeltas.push(delta.text) + } + } + + expect(textDeltas.join('')).toBe( + 'I should note that the user role requires a briefly concise friendly response format.', + ) }) test('classifies localhost transport failures with actionable category marker', async () => { diff --git a/src/services/api/openaiShim.ts b/src/services/api/openaiShim.ts index 
5fde76e6..6c3c1ffa 100644 --- a/src/services/api/openaiShim.ts +++ b/src/services/api/openaiShim.ts @@ -32,10 +32,9 @@ import { resolveGeminiCredential } from '../../utils/geminiAuth.js' import { hydrateGeminiAccessTokenFromSecureStorage } from '../../utils/geminiCredentials.js' import { hydrateGithubModelsTokenFromSecureStorage } from '../../utils/githubModelsCredentials.js' import { - looksLikeLeakedReasoningPrefix, - shouldBufferPotentialReasoningPrefix, - stripLeakedReasoningPreamble, -} from './reasoningLeakSanitizer.js' + createThinkTagFilter, + stripThinkTags, +} from './thinkTagSanitizer.js' import { codexStreamToAnthropic, collectCodexCompletedResponse, @@ -718,8 +717,7 @@ async function* openaiStreamToAnthropic( let hasEmittedContentStart = false let hasEmittedThinkingStart = false let hasClosedThinking = false - let activeTextBuffer = '' - let textBufferMode: 'none' | 'pending' | 'strip' = 'none' + const thinkFilter = createThinkTagFilter() let lastStopReason: 'tool_use' | 'max_tokens' | 'end_turn' | null = null let hasEmittedFinalUsage = false let hasProcessedFinishReason = false @@ -798,14 +796,12 @@ async function* openaiStreamToAnthropic( const closeActiveContentBlock = async function* () { if (!hasEmittedContentStart) return - if (textBufferMode !== 'none') { - const sanitized = stripLeakedReasoningPreamble(activeTextBuffer) - if (sanitized) { - yield { - type: 'content_block_delta', - index: contentBlockIndex, - delta: { type: 'text_delta', text: sanitized }, - } + const tail = thinkFilter.flush() + if (tail) { + yield { + type: 'content_block_delta', + index: contentBlockIndex, + delta: { type: 'text_delta', text: tail }, } } @@ -815,8 +811,6 @@ async function* openaiStreamToAnthropic( } contentBlockIndex++ hasEmittedContentStart = false - activeTextBuffer = '' - textBufferMode = 'none' } try { @@ -873,7 +867,6 @@ async function* openaiStreamToAnthropic( contentBlockIndex++ hasClosedThinking = true } - activeTextBuffer += delta.content if 
(!hasEmittedContentStart) { yield { type: 'content_block_start', @@ -883,38 +876,13 @@ async function* openaiStreamToAnthropic( hasEmittedContentStart = true } - if ( - textBufferMode === 'strip' || - looksLikeLeakedReasoningPrefix(activeTextBuffer) - ) { - textBufferMode = 'strip' - continue - } - - if (textBufferMode === 'pending') { - if (shouldBufferPotentialReasoningPrefix(activeTextBuffer)) { - continue - } + const visible = thinkFilter.feed(delta.content) + if (visible) { yield { type: 'content_block_delta', index: contentBlockIndex, - delta: { - type: 'text_delta', - text: activeTextBuffer, - }, + delta: { type: 'text_delta', text: visible }, } - textBufferMode = 'none' - continue - } - - if (shouldBufferPotentialReasoningPrefix(activeTextBuffer)) { - textBufferMode = 'pending' - continue - } - yield { - type: 'content_block_delta', - index: contentBlockIndex, - delta: { type: 'text_delta', text: delta.content }, } } @@ -1742,7 +1710,7 @@ class OpenAIShimMessages { if (typeof rawContent === 'string' && rawContent) { content.push({ type: 'text', - text: stripLeakedReasoningPreamble(rawContent), + text: stripThinkTags(rawContent), }) } else if (Array.isArray(rawContent) && rawContent.length > 0) { const parts: string[] = [] @@ -1760,7 +1728,7 @@ class OpenAIShimMessages { if (joined) { content.push({ type: 'text', - text: stripLeakedReasoningPreamble(joined), + text: stripThinkTags(joined), }) } } diff --git a/src/services/api/reasoningLeakSanitizer.test.ts b/src/services/api/reasoningLeakSanitizer.test.ts deleted file mode 100644 index e89e5a2e..00000000 --- a/src/services/api/reasoningLeakSanitizer.test.ts +++ /dev/null @@ -1,46 +0,0 @@ -import { describe, expect, test } from 'bun:test' - -import { - looksLikeLeakedReasoningPrefix, - shouldBufferPotentialReasoningPrefix, - stripLeakedReasoningPreamble, -} from './reasoningLeakSanitizer.ts' - -describe('reasoning leak sanitizer', () => { - test('strips explicit internal reasoning preambles', () => { - const 
text = - 'The user just said "hey" - a simple greeting. I should respond briefly and friendly.\n\nHey! How can I help you today?' - - expect(looksLikeLeakedReasoningPrefix(text)).toBe(true) - expect(stripLeakedReasoningPreamble(text)).toBe( - 'Hey! How can I help you today?', - ) - }) - - test('does not strip normal user-facing advice that mentions "the user should"', () => { - const text = - 'The user should reset their password immediately.\n\nHere are the steps...' - - expect(looksLikeLeakedReasoningPrefix(text)).toBe(false) - expect(shouldBufferPotentialReasoningPrefix(text)).toBe(false) - expect(stripLeakedReasoningPreamble(text)).toBe(text) - }) - - test('does not strip legitimate first-person advice about responding to an incident', () => { - const text = - 'I need to respond to this security incident immediately. The system is compromised.\n\nHere are the remediation steps...' - - expect(looksLikeLeakedReasoningPrefix(text)).toBe(false) - expect(shouldBufferPotentialReasoningPrefix(text)).toBe(false) - expect(stripLeakedReasoningPreamble(text)).toBe(text) - }) - - test('does not strip legitimate first-person advice about answering a support ticket', () => { - const text = - 'I need to answer the support ticket before end of day. The customer is waiting.\n\nHere is the response I drafted...' 
- - expect(looksLikeLeakedReasoningPrefix(text)).toBe(false) - expect(shouldBufferPotentialReasoningPrefix(text)).toBe(false) - expect(stripLeakedReasoningPreamble(text)).toBe(text) - }) -}) diff --git a/src/services/api/reasoningLeakSanitizer.ts b/src/services/api/reasoningLeakSanitizer.ts deleted file mode 100644 index 00d02cd0..00000000 --- a/src/services/api/reasoningLeakSanitizer.ts +++ /dev/null @@ -1,54 +0,0 @@ -const EXPLICIT_REASONING_START_RE = - /^\s*(i should\b|i need to\b|let me think\b|the task\b|the request\b)/i - -const EXPLICIT_REASONING_META_RE = - /\b(user|request|question|prompt|message|task|greeting|small talk|briefly|friendly|concise)\b/i - -const USER_META_START_RE = - /^\s*the user\s+(just\s+)?(said|asked|is asking|wants|wanted|mentioned|seems|appears)\b/i - -const USER_REASONING_RE = - /^\s*the user\s+(just\s+)?(said|asked|is asking|wants|wanted|mentioned|seems|appears)\b[\s\S]*\b(i should|i need to|let me think|respond|reply|answer|greeting|small talk|briefly|friendly|concise)\b/i - -export function shouldBufferPotentialReasoningPrefix(text: string): boolean { - const normalized = text.trim() - if (!normalized) return false - - if (looksLikeLeakedReasoningPrefix(normalized)) { - return true - } - - const hasParagraphBoundary = /\n\s*\n/.test(normalized) - if (hasParagraphBoundary) { - return false - } - - return ( - EXPLICIT_REASONING_START_RE.test(normalized) || - USER_META_START_RE.test(normalized) - ) -} - -export function looksLikeLeakedReasoningPrefix(text: string): boolean { - const normalized = text.trim() - if (!normalized) return false - return ( - (EXPLICIT_REASONING_START_RE.test(normalized) && - EXPLICIT_REASONING_META_RE.test(normalized)) || - USER_REASONING_RE.test(normalized) - ) -} - -export function stripLeakedReasoningPreamble(text: string): string { - const normalized = text.replace(/\r\n/g, '\n') - const parts = normalized.split(/\n\s*\n/) - if (parts.length < 2) return text - - const first = parts[0]?.trim() ?? 
'' - if (!looksLikeLeakedReasoningPrefix(first)) { - return text - } - - const remainder = parts.slice(1).join('\n\n').trim() - return remainder || text -} diff --git a/src/services/api/thinkTagSanitizer.test.ts b/src/services/api/thinkTagSanitizer.test.ts new file mode 100644 index 00000000..749a2771 --- /dev/null +++ b/src/services/api/thinkTagSanitizer.test.ts @@ -0,0 +1,183 @@ +import { describe, expect, test } from 'bun:test' + +import { + createThinkTagFilter, + stripThinkTags, +} from './thinkTagSanitizer.ts' + +describe('stripThinkTags — whole-text cleanup', () => { + test('strips closed think pair', () => { + expect(stripThinkTags('reasoningHello')).toBe('Hello') + }) + + test('strips closed thinking pair', () => { + expect(stripThinkTags('xOut')).toBe('Out') + }) + + test('strips closed reasoning pair', () => { + expect(stripThinkTags('xOut')).toBe('Out') + }) + + test('strips REASONING_SCRATCHPAD pair', () => { + expect(stripThinkTags('planAnswer')) + .toBe('Answer') + }) + + test('is case-insensitive', () => { + expect(stripThinkTags('xout')).toBe('out') + expect(stripThinkTags('xout')).toBe('out') + }) + + test('handles attributes on open tag', () => { + expect(stripThinkTags('reasonok')).toBe('ok') + }) + + test('strips unterminated open tag at block boundary', () => { + expect(stripThinkTags('reasoning that never closes')).toBe('') + }) + + test('strips unterminated open tag after newline', () => { + // Block-boundary match consumes the leading newline, same as hermes. 
+ expect(stripThinkTags('Answer: 42\nsecond-guess myself')) + .toBe('Answer: 42') + }) + + test('strips orphan close tag', () => { + expect(stripThinkTags('trailing done')).toBe('trailing done') + }) + + test('strips multiple blocks', () => { + expect(stripThinkTags('aBcD')).toBe('BD') + }) + + test('handles reasoning mid-response after content', () => { + expect(stripThinkTags('Answer: 42\ndouble-check\nDone')) + .toBe('Answer: 42\n\nDone') + }) + + test('handles nested-looking tags (lazy match + orphan cleanup)', () => { + expect(stripThinkTags('xy')).toBe('y') + }) + + test('preserves legitimate non-think tags', () => { + expect(stripThinkTags('use
and ')).toBe('use
and ') + }) + + test('preserves text without any tags', () => { + expect(stripThinkTags('Hello, world. I should respond briefly.')).toBe( + 'Hello, world. I should respond briefly.', + ) + }) + + test('handles empty input', () => { + expect(stripThinkTags('')).toBe('') + }) +}) + +describe('createThinkTagFilter — streaming state machine', () => { + test('passes through plain text', () => { + const f = createThinkTagFilter() + expect(f.feed('Hello, ')).toBe('Hello, ') + expect(f.feed('world!')).toBe('world!') + expect(f.flush()).toBe('') + }) + + test('strips a complete think block in one chunk', () => { + const f = createThinkTagFilter() + expect(f.feed('prereasonpost')).toBe('prepost') + expect(f.flush()).toBe('') + }) + + test('handles open tag split across deltas', () => { + const f = createThinkTagFilter() + expect(f.feed('beforereasonafter')).toBe('after') + expect(f.flush()).toBe('') + }) + + test('handles close tag split across deltas', () => { + const f = createThinkTagFilter() + expect(f.feed('reasonkeep')).toBe('keep') + expect(f.flush()).toBe('') + }) + + test('handles tag split on bare < boundary', () => { + const f = createThinkTagFilter() + expect(f.feed('leading <')).toBe('leading ') + expect(f.feed('think>innertail')).toBe('tail') + expect(f.flush()).toBe('') + }) + + test('preserves partial non-tag < at boundary when next char rules it out', () => { + const f = createThinkTagFilter() + // "rest')).toBe('iv>rest') + expect(f.flush()).toBe('') + }) + + test('case-insensitive streaming', () => { + const f = createThinkTagFilter() + expect(f.feed('xout')).toBe('out') + expect(f.flush()).toBe('') + }) + + test('unterminated open tag — flush drops remainder', () => { + const f = createThinkTagFilter() + expect(f.feed('reasoning with no close ')).toBe('') + expect(f.feed('and more reasoning')).toBe('') + expect(f.flush()).toBe('') + expect(f.isInsideBlock()).toBe(false) + }) + + test('multiple blocks in single feed', () => { + const f = 
createThinkTagFilter() + expect(f.feed('aBcD')).toBe('BD') + expect(f.flush()).toBe('') + }) + + test('flush after clean stream emits nothing extra', () => { + const f = createThinkTagFilter() + expect(f.feed('complete message')).toBe('complete message') + expect(f.flush()).toBe('') + }) + + test('flush of bare < at end emits it (not a tag prefix)', () => { + const f = createThinkTagFilter() + // bare '<' held back; flush emits it since it has no tag-name chars + expect(f.feed('x <')).toBe('x ') + expect(f.flush()).toBe('<') + }) + + test('flush of partial tag-name prefix at end drops it', () => { + const f = createThinkTagFilter() + expect(f.feed('x { + const f = createThinkTagFilter() + expect(f.feed('reasonok')).toBe('ok') + expect(f.flush()).toBe('') + }) + + test('mid-delta transition: content, reasoning, content', () => { + const f = createThinkTagFilter() + expect(f.feed('Answer: 42\n')).toBe('Answer: 42\n') + expect(f.feed('double-check')).toBe('') + expect(f.feed('\nDone')).toBe('\nDone') + expect(f.flush()).toBe('') + }) + + test('orphan close tag mid-stream is stripped on flush via safety-net behavior', () => { + // Filter alone treats orphan close as "we're not inside", so it emits as-is. + // Safety net (stripThinkTags on final text) removes orphans. + const f = createThinkTagFilter() + const chunk1 = f.feed('trailing ') + const chunk2 = f.feed('done') + const final = chunk1 + chunk2 + f.flush() + // Orphan close appears in stream output; safety net cleans it + expect(stripThinkTags(final)).toBe('trailing done') + }) +}) diff --git a/src/services/api/thinkTagSanitizer.ts b/src/services/api/thinkTagSanitizer.ts new file mode 100644 index 00000000..8957b216 --- /dev/null +++ b/src/services/api/thinkTagSanitizer.ts @@ -0,0 +1,162 @@ +/** + * Think-tag sanitizer for reasoning content leaks. 
+ * + * Some OpenAI-compatible reasoning models (MiniMax M2.7, GLM-4.5/5, DeepSeek, Kimi K2, + * self-hosted vLLM builds) emit chain-of-thought inline inside the `content` field using + * XML-like tags instead of the separate `reasoning_content` channel. Example: + * + * the user wants foo, let me check barHere is the answer: ... + * + * This module strips those blocks structurally (tag-based), independent of English + * phrasings. Three layers: + * + * 1. `createThinkTagFilter()` — streaming state machine. Feeds deltas, emits only + * the visible (non-reasoning) portion, and buffers partial tags across chunk + * boundaries so `` still parses correctly. + * + * 2. `stripThinkTags()` — whole-text cleanup. Removes closed pairs, unterminated + * opens at block boundaries, and orphan open/close tags. Used for non-streaming + * responses and as a safety net after stream close. + * + * 3. Flush discards buffered partial tags at stream end (false-negative bias — + * prefer losing a partial reasoning fragment over leaking it). + */ + +const TAG_NAMES = [ + 'think', + 'thinking', + 'reasoning', + 'thought', + 'reasoning_scratchpad', +] as const + +const TAG_ALT = TAG_NAMES.join('|') + +const OPEN_TAG_RE = new RegExp(`<\\s*(?:${TAG_ALT})\\b[^>]*>`, 'i') +const CLOSE_TAG_RE = new RegExp(`<\\s*/\\s*(?:${TAG_ALT})\\s*>`, 'i') + +const CLOSED_PAIR_RE_G = new RegExp( + `<\\s*(${TAG_ALT})\\b[^>]*>[\\s\\S]*?<\\s*/\\s*\\1\\s*>`, + 'gi', +) +const UNTERMINATED_OPEN_RE = new RegExp( + `(?:^|\\n)[ \\t]*<\\s*(?:${TAG_ALT})\\b[^>]*>[\\s\\S]*$`, + 'i', +) +const ORPHAN_TAG_RE_G = new RegExp( + `<\\s*/?\\s*(?:${TAG_ALT})\\b[^>]*>\\s*`, + 'gi', +) + +const MAX_PARTIAL_TAG = 64 + +/** + * Remove reasoning/thinking blocks from a complete text body. + * + * Handles: + * - Closed pairs: ... 
(lazy match, anywhere in text) + * - Unterminated open tags at a block boundary: strips from the tag to end of string + * - Orphan open or close tags (no matching partner) + * + * False-negative bias: prefers leaving a few tag characters in rare edge cases over + * stripping legitimate content. + */ +export function stripThinkTags(text: string): string { + if (!text) return text + let out = text + out = out.replace(CLOSED_PAIR_RE_G, '') + out = out.replace(UNTERMINATED_OPEN_RE, '') + out = out.replace(ORPHAN_TAG_RE_G, '') + return out +} + +export interface ThinkTagFilter { + feed(chunk: string): string + flush(): string + isInsideBlock(): boolean +} + +/** + * Streaming state machine. Feed deltas, emits visible (non-reasoning) text. + * Handles tags split across chunk boundaries by holding back a short tail buffer + * whenever the current buffer ends with what looks like a partial tag. + */ +export function createThinkTagFilter(): ThinkTagFilter { + let inside = false + let buffer = '' + + function findPartialTagStart(s: string): number { + const lastLt = s.lastIndexOf('<') + if (lastLt === -1) return -1 + if (s.indexOf('>', lastLt) !== -1) return -1 + const tail = s.slice(lastLt) + if (tail.length > MAX_PARTIAL_TAG) return -1 + + const m = /^<\s*\/?\s*([a-zA-Z_]\w*)?\s*$/.exec(tail) + if (!m) return -1 + const partialName = (m[1] ?? 
'').toLowerCase() + if (!partialName) return lastLt + if (TAG_NAMES.some(name => name.startsWith(partialName))) return lastLt + return -1 + } + + function feed(chunk: string): string { + if (!chunk) return '' + buffer += chunk + let out = '' + + while (buffer.length > 0) { + if (!inside) { + const open = OPEN_TAG_RE.exec(buffer) + if (open) { + out += buffer.slice(0, open.index) + buffer = buffer.slice(open.index + open[0].length) + inside = true + continue + } + + const partialStart = findPartialTagStart(buffer) + if (partialStart === -1) { + out += buffer + buffer = '' + } else { + out += buffer.slice(0, partialStart) + buffer = buffer.slice(partialStart) + } + return out + } + + const close = CLOSE_TAG_RE.exec(buffer) + if (close) { + buffer = buffer.slice(close.index + close[0].length) + inside = false + continue + } + + const partialStart = findPartialTagStart(buffer) + if (partialStart === -1) { + buffer = '' + } else { + buffer = buffer.slice(partialStart) + } + return out + } + + return out + } + + function flush(): string { + const held = buffer + const wasInside = inside + buffer = '' + inside = false + + if (wasInside) return '' + if (!held) return '' + + if (/^<\s*\/?\s*[a-zA-Z_]/.test(held)) return '' + return held + } + + return { feed, flush, isInsideBlock: () => inside } +}