From 37d4c217398ff6cc91ee22e4d4d457510e1baed0 Mon Sep 17 00:00:00 2001 From: skfallin Date: Thu, 2 Apr 2026 17:57:42 +0200 Subject: [PATCH] fix: make schema sanitization provider-specific --- src/services/api/openaiSchemaSanitizer.ts | 217 +------------------ src/services/api/openaiShim.ts | 3 +- src/utils/api.test.ts | 19 +- src/utils/api.ts | 2 - src/utils/schemaSanitizer.ts | 250 ++++++++++++++++++++-- 5 files changed, 244 insertions(+), 247 deletions(-) diff --git a/src/services/api/openaiSchemaSanitizer.ts b/src/services/api/openaiSchemaSanitizer.ts index 7fc00eff..ac8724be 100644 --- a/src/services/api/openaiSchemaSanitizer.ts +++ b/src/services/api/openaiSchemaSanitizer.ts @@ -1,216 +1 @@ -function isSchemaRecord(value: unknown): value is Record { - return value !== null && typeof value === 'object' && !Array.isArray(value) -} - -function deepEqualJsonValue(a: unknown, b: unknown): boolean { - if (Object.is(a, b)) return true - if (typeof a !== typeof b) return false - - if (Array.isArray(a) && Array.isArray(b)) { - return ( - a.length === b.length && - a.every((value, index) => deepEqualJsonValue(value, b[index])) - ) - } - - if (isSchemaRecord(a) && isSchemaRecord(b)) { - const aKeys = Object.keys(a) - const bKeys = Object.keys(b) - return ( - aKeys.length === bKeys.length && - aKeys.every(key => key in b && deepEqualJsonValue(a[key], b[key])) - ) - } - - return false -} - -function matchesJsonSchemaType(type: string, value: unknown): boolean { - switch (type) { - case 'string': - return typeof value === 'string' - case 'number': - return typeof value === 'number' && Number.isFinite(value) - case 'integer': - return typeof value === 'number' && Number.isInteger(value) - case 'boolean': - return typeof value === 'boolean' - case 'object': - return value !== null && typeof value === 'object' && !Array.isArray(value) - case 'array': - return Array.isArray(value) - case 'null': - return value === null - default: - return true - } -} - -function getJsonSchemaTypes(record: Record): string[] { - const raw = record.type - if (typeof raw === 'string') { - return [raw] - } - if (Array.isArray(raw)) { - return raw.filter((value): value is string => typeof value === 'string') - } - return [] -} - -function schemaAllowsValue(schema: Record, value: unknown): boolean { - if (Array.isArray(schema.anyOf)) { - return schema.anyOf.some(item => - schemaAllowsValue(sanitizeSchemaForOpenAICompat(item), value), - ) - } - - if (Array.isArray(schema.oneOf)) { - return ( - schema.oneOf.filter(item => - schemaAllowsValue(sanitizeSchemaForOpenAICompat(item), value), - ).length === 1 - ) - } - - if (Array.isArray(schema.allOf)) { - return schema.allOf.every(item => - schemaAllowsValue(sanitizeSchemaForOpenAICompat(item), value), - ) - } - - if ('const' in schema && !deepEqualJsonValue(schema.const, value)) { - return false - } - - if (Array.isArray(schema.enum)) { - if (!schema.enum.some(item => deepEqualJsonValue(item, value))) { - return false - } - } - - const types = getJsonSchemaTypes(schema) - if (types.length > 0 && !types.some(type => matchesJsonSchemaType(type, value))) { - return false - } - - return true -} - -function sanitizeTypeField(record: Record): void { - const allowed = new Set([ - 'string', - 'number', - 'integer', - 'boolean', - 'object', - 'array', - 'null', - ]) - - const raw = record.type - if (typeof raw === 'string') { - if (!allowed.has(raw)) delete record.type - return - } - - if (!Array.isArray(raw)) return - - const filtered = raw.filter( - (value, index): value is string => - typeof value === 'string' && - allowed.has(value) && - raw.indexOf(value) === index, - ) - - if (filtered.length === 0) { - delete record.type - } else if (filtered.length === 1) { - record.type = filtered[0] - } else { - record.type = filtered - } -} - -/** - * Sanitize loose/invalid JSON Schema into a form OpenAI-compatible providers - * are more likely to accept. This is intentionally defensive for external MCP - * servers that may advertise imperfect schemas. - */ -export function sanitizeSchemaForOpenAICompat( - schema: unknown, -): Record { - if (!isSchemaRecord(schema)) { - return {} - } - - const record = { ...schema } - - delete record.$schema - delete record.propertyNames - - sanitizeTypeField(record) - - if (isSchemaRecord(record.properties)) { - const sanitizedProps: Record = {} - for (const [key, value] of Object.entries(record.properties)) { - sanitizedProps[key] = sanitizeSchemaForOpenAICompat(value) - } - record.properties = sanitizedProps - } - - if ('items' in record) { - if (Array.isArray(record.items)) { - record.items = record.items.map(item => - sanitizeSchemaForOpenAICompat(item), - ) - } else { - record.items = sanitizeSchemaForOpenAICompat(record.items) - } - } - - for (const key of ['anyOf', 'oneOf', 'allOf'] as const) { - if (Array.isArray(record[key])) { - record[key] = record[key].map(item => - sanitizeSchemaForOpenAICompat(item), - ) - } - } - - if (Array.isArray(record.required) && isSchemaRecord(record.properties)) { - record.required = record.required.filter( - (value): value is string => - typeof value === 'string' && value in record.properties, - ) - } - - const schemaWithoutEnum = { ...record } - delete schemaWithoutEnum.enum - - if (Array.isArray(record.enum)) { - const filteredEnum = record.enum.filter(value => - schemaAllowsValue(schemaWithoutEnum, value), - ) - if (filteredEnum.length > 0) { - record.enum = filteredEnum - } else { - delete record.enum - } - } - - const schemaWithoutConst = { ...record } - delete schemaWithoutConst.const - if ('const' in record && !schemaAllowsValue(schemaWithoutConst, record.const)) { - delete record.const - } - - const schemaWithoutDefault = { ...record } - delete schemaWithoutDefault.default - if ( - 'default' in record && - !schemaAllowsValue(schemaWithoutDefault, record.default) - ) { - delete record.default - } - - return record -} +export { sanitizeSchemaForOpenAICompat } from '../../utils/schemaSanitizer.js' diff --git a/src/services/api/openaiShim.ts b/src/services/api/openaiShim.ts index 31609cc8..fb724513 100644 --- a/src/services/api/openaiShim.ts +++ b/src/services/api/openaiShim.ts @@ -38,9 +38,8 @@ import { resolveCodexApiCredentials, resolveProviderRequest, } from './providerConfig.js' -import { stripIncompatibleSchemaKeywords } from '../../utils/schemaSanitizer.js' +import { sanitizeSchemaForOpenAICompat } from '../../utils/schemaSanitizer.js' import { redactSecretValueForDisplay } from '../../utils/providerProfile.js' -import { sanitizeSchemaForOpenAICompat } from './openaiSchemaSanitizer.js' const GITHUB_MODELS_DEFAULT_BASE = 'https://models.github.ai/inference' const GITHUB_API_VERSION = '2022-11-28' diff --git a/src/utils/api.test.ts b/src/utils/api.test.ts index d2d0e380..8c51142f 100644 --- a/src/utils/api.test.ts +++ b/src/utils/api.test.ts @@ -3,7 +3,7 @@ import { z } from 'zod/v4' import { getEmptyToolPermissionContext, type Tool, type Tools } from '../Tool.js' import { toolToAPISchema } from './api.js' -test('toolToAPISchema strips incompatible schema keywords from input_schema', async () => { +test('toolToAPISchema preserves provider-specific schema keywords in input_schema', async () => { const schema = await toolToAPISchema( { name: 'WebFetch', @@ -18,6 +18,9 @@ test('toolToAPISchema strips incompatible schema keywords from input_schema', as }, metadata: { type: 'object', + propertyNames: { + pattern: '^[a-z]+$', + }, properties: { callback: { type: 'string', @@ -42,26 +45,22 @@ test('toolToAPISchema strips incompatible schema keywords from input_schema', as properties: { url: { type: 'string', + format: 'uri', description: 'Public HTTP or HTTPS URL', }, metadata: { type: 'object', + propertyNames: { + pattern: '^[a-z]+$', + }, properties: { callback: { type: 'string', + format: 'uri-reference', }, }, }, }, }, }) - - const inputSchema = (schema as { input_schema: Record }).input_schema - const properties = inputSchema.properties as Record> - expect(properties.url?.format).toBeUndefined() - expect( - ( - properties.metadata?.properties as Record> - )?.callback?.format, - ).toBeUndefined() }) diff --git a/src/utils/api.ts b/src/utils/api.ts index a18866e6..9b66fd79 100644 --- a/src/utils/api.ts +++ b/src/utils/api.ts @@ -60,7 +60,6 @@ import { import { getPlatform } from './platform.js' import { countFilesRoundedRg } from './ripgrep.js' import { jsonStringify } from './slowOperations.js' -import { stripIncompatibleSchemaKeywords } from './schemaSanitizer.js' import type { SystemPrompt } from './systemPromptType.js' import { getToolSchemaCache } from './toolSchemaCache.js' import { windowsPathToPosixPath } from './windowsPaths.js' @@ -166,7 +165,6 @@ export async function toolToAPISchema( if (!isAgentSwarmsEnabled()) { input_schema = filterSwarmFieldsFromSchema(tool.name, input_schema) } - input_schema = stripIncompatibleSchemaKeywords(input_schema) base = { name: tool.name, diff --git a/src/utils/schemaSanitizer.ts b/src/utils/schemaSanitizer.ts index 6cc066d8..2993018e 100644 --- a/src/utils/schemaSanitizer.ts +++ b/src/utils/schemaSanitizer.ts @@ -1,30 +1,246 @@ -/** - * Anthropic-compatible tool schemas reject several JSON Schema keywords that - * Zod commonly emits, especially string `format` validators like `uri`. - * Strip those fields recursively before sending tool schemas to providers. - */ -export function stripIncompatibleSchemaKeywords( - schema: T, -): T { +const OPENAI_INCOMPATIBLE_SCHEMA_KEYWORDS = new Set([ + '$comment', + '$schema', + 'default', + 'else', + 'examples', + 'format', + 'if', + 'maxLength', + 'maximum', + 'minLength', + 'minimum', + 'multipleOf', + 'pattern', + 'patternProperties', + 'propertyNames', + 'then', + 'unevaluatedProperties', +]) + +function isSchemaRecord(value: unknown): value is Record { + return value !== null && typeof value === 'object' && !Array.isArray(value) +} + +function stripSchemaKeywords(schema: unknown, keywords: Set): unknown { if (Array.isArray(schema)) { - return schema.map(item => stripIncompatibleSchemaKeywords(item)) as T + return schema.map(item => stripSchemaKeywords(item, keywords)) } - if (!schema || typeof schema !== 'object') { + if (!isSchemaRecord(schema)) { return schema } const result: Record = {} - for (const [key, value] of Object.entries(schema as Record)) { - if (key === '$schema' || key === 'format' || key === 'propertyNames') { + for (const [key, value] of Object.entries(schema)) { + if (keywords.has(key)) { continue } - result[key] = - value && typeof value === 'object' - ? stripIncompatibleSchemaKeywords(value) - : value + result[key] = stripSchemaKeywords(value, keywords) } - return result as T + return result +} + +function deepEqualJsonValue(a: unknown, b: unknown): boolean { + if (Object.is(a, b)) return true + if (typeof a !== typeof b) return false + + if (Array.isArray(a) && Array.isArray(b)) { + return ( + a.length === b.length && + a.every((value, index) => deepEqualJsonValue(value, b[index])) + ) + } + + if (isSchemaRecord(a) && isSchemaRecord(b)) { + const aKeys = Object.keys(a) + const bKeys = Object.keys(b) + return ( + aKeys.length === bKeys.length && + aKeys.every(key => key in b && deepEqualJsonValue(a[key], b[key])) + ) + } + + return false +} + +function matchesJsonSchemaType(type: string, value: unknown): boolean { + switch (type) { + case 'string': + return typeof value === 'string' + case 'number': + return typeof value === 'number' && Number.isFinite(value) + case 'integer': + return typeof value === 'number' && Number.isInteger(value) + case 'boolean': + return typeof value === 'boolean' + case 'object': + return value !== null && typeof value === 'object' && !Array.isArray(value) + case 'array': + return Array.isArray(value) + case 'null': + return value === null + default: + return true + } +} + +function getJsonSchemaTypes(record: Record): string[] { + const raw = record.type + if (typeof raw === 'string') { + return [raw] + } + if (Array.isArray(raw)) { + return raw.filter((value): value is string => typeof value === 'string') + } + return [] +} + +function schemaAllowsValue(schema: Record, value: unknown): boolean { + if (Array.isArray(schema.anyOf)) { + return schema.anyOf.some(item => + schemaAllowsValue(sanitizeSchemaForOpenAICompat(item), value), + ) + } + + if (Array.isArray(schema.oneOf)) { + return ( + schema.oneOf.filter(item => + schemaAllowsValue(sanitizeSchemaForOpenAICompat(item), value), + ).length === 1 + ) + } + + if (Array.isArray(schema.allOf)) { + return schema.allOf.every(item => + schemaAllowsValue(sanitizeSchemaForOpenAICompat(item), value), + ) + } + + if ('const' in schema && !deepEqualJsonValue(schema.const, value)) { + return false + } + + if (Array.isArray(schema.enum)) { + if (!schema.enum.some(item => deepEqualJsonValue(item, value))) { + return false + } + } + + const types = getJsonSchemaTypes(schema) + if (types.length > 0 && !types.some(type => matchesJsonSchemaType(type, value))) { + return false + } + + return true +} + +function sanitizeTypeField(record: Record): void { + const allowed = new Set([ + 'string', + 'number', + 'integer', + 'boolean', + 'object', + 'array', + 'null', + ]) + + const raw = record.type + if (typeof raw === 'string') { + if (!allowed.has(raw)) delete record.type + return + } + + if (!Array.isArray(raw)) return + + const filtered = raw.filter( + (value, index): value is string => + typeof value === 'string' && + allowed.has(value) && + raw.indexOf(value) === index, + ) + + if (filtered.length === 0) { + delete record.type + } else if (filtered.length === 1) { + record.type = filtered[0] + } else { + record.type = filtered + } +} + +/** + * Sanitize JSON Schema into a shape OpenAI-compatible providers and Codex + * strict-mode tooling are more likely to accept. This strips provider-rejected + * keywords while keeping enum/const cleanup defensive for imperfect MCP schemas. + */ +export function sanitizeSchemaForOpenAICompat( + schema: unknown, +): Record { + const stripped = stripSchemaKeywords(schema, OPENAI_INCOMPATIBLE_SCHEMA_KEYWORDS) + if (!isSchemaRecord(stripped)) { + return {} + } + + const record = { ...stripped } + + sanitizeTypeField(record) + + if (isSchemaRecord(record.properties)) { + const sanitizedProps: Record = {} + for (const [key, value] of Object.entries(record.properties)) { + sanitizedProps[key] = sanitizeSchemaForOpenAICompat(value) + } + record.properties = sanitizedProps + } + + if ('items' in record) { + if (Array.isArray(record.items)) { + record.items = record.items.map(item => + sanitizeSchemaForOpenAICompat(item), + ) + } else { + record.items = sanitizeSchemaForOpenAICompat(record.items) + } + } + + for (const key of ['anyOf', 'oneOf', 'allOf'] as const) { + if (Array.isArray(record[key])) { + record[key] = record[key].map(item => + sanitizeSchemaForOpenAICompat(item), + ) + } + } + + if (Array.isArray(record.required) && isSchemaRecord(record.properties)) { + record.required = record.required.filter( + (value): value is string => + typeof value === 'string' && value in record.properties, + ) + } + + const schemaWithoutEnum = { ...record } + delete schemaWithoutEnum.enum + + if (Array.isArray(record.enum)) { + const filteredEnum = record.enum.filter(value => + schemaAllowsValue(schemaWithoutEnum, value), + ) + if (filteredEnum.length > 0) { + record.enum = filteredEnum + } else { + delete record.enum + } + } + + const schemaWithoutConst = { ...record } + delete schemaWithoutConst.const + if ('const' in record && !schemaAllowsValue(schemaWithoutConst, record.const)) { + delete record.const + } + + return record }