/**
 * Tokenizer characteristics for one model family.
 * Encodings differ between models, so each family carries its own ratio.
 */
export interface ModelTokenizerConfig {
  /** Lowercase substring that identifies the family inside a model name. */
  modelFamily: string
  /** Bytes-per-token ratio assumed for the family. */
  bytesPerToken: number
  /** NOTE(review): not read by any code in this section - confirm intended meaning. */
  supportsJson: boolean
  /** NOTE(review): not read by any code in this section - confirm intended meaning. */
  supportsCode: boolean
}

/**
 * Known model families. Lookup is first-match in array order, so an entry
 * placed earlier wins when a model name contains more than one family name.
 */
export const MODEL_TOKENIZER_CONFIGS: ModelTokenizerConfig[] = [
  {
    modelFamily: 'claude',
    bytesPerToken: 3.5,
    supportsJson: true,
    supportsCode: true,
  },
  {
    modelFamily: 'gpt-4',
    bytesPerToken: 4,
    supportsJson: true,
    supportsCode: true,
  },
  {
    modelFamily: 'gpt-3.5',
    bytesPerToken: 4,
    supportsJson: true,
    supportsCode: true,
  },
  {
    modelFamily: 'gemini',
    bytesPerToken: 3.5,
    supportsJson: true,
    supportsCode: true,
  },
  {
    modelFamily: 'llama',
    bytesPerToken: 3.8,
    supportsJson: true,
    supportsCode: true,
  },
  {
    modelFamily: 'deepseek',
    bytesPerToken: 3.5,
    supportsJson: true,
    supportsCode: true,
  },
  {
    modelFamily: 'minimax',
    bytesPerToken: 3.2,
    supportsJson: true,
    supportsCode: true,
  },
]

/**
 * Resolve the tokenizer config for a model name.
 *
 * The name is lowercased and matched by substring against each known family.
 *
 * @param model Model identifier, e.g. `claude-sonnet-4-5-20250514`.
 * @returns The first matching entry of {@link MODEL_TOKENIZER_CONFIGS} (the
 *   same object, not a copy), or a freshly created `unknown` config with a
 *   ratio of 4 when no family matches.
 */
export function getTokenizerConfig(model: string): ModelTokenizerConfig {
  const normalized = model.toLowerCase()
  const matched = MODEL_TOKENIZER_CONFIGS.find(({ modelFamily }) =>
    normalized.includes(modelFamily),
  )
  if (matched !== undefined) {
    return matched
  }

  return {
    modelFamily: 'unknown',
    bytesPerToken: 4,
    supportsJson: true,
    supportsCode: true,
  }
}

/**
 * Bytes-per-token ratio for a model name.
 *
 * @param model Model identifier, resolved through {@link getTokenizerConfig}.
 * @returns The ratio of the matched family, or 4 for an unrecognised model.
 */
export function getBytesPerTokenForModel(model: string): number {
  const { bytesPerToken } = getTokenizerConfig(model)
  return bytesPerToken
}
/**
 * Content type classification for compression ratio.
 */
export type ContentType =
  | 'json'
  | 'code'
  | 'prose'
  | 'technical'
  | 'list'
  | 'table'
  | 'mixed'

/**
 * Compression ratio by content type.
 * Measured empirically - denser content = lower ratio.
 *
 * `typical` is the ratio used for the point estimate; `min` and `max` bound it.
 */
export const COMPRESSION_RATIOS: Record<
  ContentType,
  { min: number; max: number; typical: number }
> = {
  json: { min: 1.5, max: 2.5, typical: 2 },
  code: { min: 3, max: 4.5, typical: 3.5 },
  prose: { min: 3.5, max: 4.5, typical: 4 },
  technical: { min: 2.5, max: 3.5, typical: 3 },
  list: { min: 2, max: 3, typical: 2.5 },
  table: { min: 1.8, max: 2.8, typical: 2.2 },
  mixed: { min: 3, max: 4, typical: 3.5 },
}

/**
 * Detect content type from content.
 *
 * Checks run in a fixed order and the first hit wins:
 * JSON, table, list, code, technical, then prose as the fallback.
 * `'mixed'` is never produced by detection.
 *
 * @param content Raw text to classify.
 * @returns The detected {@link ContentType}.
 */
export function detectContentType(content: string): ContentType {
  const trimmed = content.trim()

  // Nothing to classify; also keeps the code-density ratio below from being 0/0.
  if (trimmed.length === 0) return 'prose'

  // JSON: only object/array shaped text is attempted, and it must actually parse.
  const looksLikeObject = trimmed.startsWith('{') && trimmed.endsWith('}')
  const looksLikeArray = trimmed.startsWith('[') && trimmed.endsWith(']')
  if (looksLikeObject || looksLikeArray) {
    try {
      JSON.parse(trimmed)
      return 'json'
    } catch {
      // Not valid JSON - fall through to the remaining heuristics.
    }
  }

  // Table: more than two lines, and ONE delimiter (tab or comma) present on
  // every line. A tab header followed by comma rows is not a consistent table.
  const lines = trimmed.split('\n')
  if (lines.length > 2) {
    const hasConsistentDelimiter = ['\t', ','].some(delimiter =>
      lines.every(line => line.includes(delimiter)),
    )
    if (hasConsistentDelimiter) return 'table'
  }

  // List: a bullet (-, *, •) or an ordinal ("1." / "1)") followed by whitespace.
  // Requiring the separator keeps "2023 was ..." or "**bold**" out of this bucket.
  if (/^(?:\d+[.)]|[-*•])\s/.test(trimmed)) {
    return 'list'
  }

  // Code: more than 5% of the characters are brackets, semicolons or '='.
  const codeCharCount = (content.match(/[{}()\[\];=]/g) ?? []).length
  if (codeCharCount / content.length > 0.05) return 'code'

  // Technical: a number followed by a unit. Alphabetic units must end at a word
  // boundary so that "10 students" or "3 emails" do not match `s` / `em`.
  if (/\d+\s*(?:%|(?:px|em|rem|ms|s|kb|mb|gb)\b)/i.test(content)) {
    return 'technical'
  }

  // Prose (default - natural language)
  return 'prose'
}

/**
 * Get compression ratio for content.
 *
 * @param content Text whose ratio is wanted; its length drives the short-content shift.
 * @param type Optional explicit content type; detected from `content` when omitted.
 * @returns The typical ratio plus a `[min, max]` range that always contains it.
 */
export function getCompressionRatio(
  content: string,
  type?: ContentType,
): { ratio: number; min: number; max: number } {
  const resolvedType = type ?? detectContentType(content)
  const { min, max, typical } = COMPRESSION_RATIOS[resolvedType]

  // Content under 100 characters has its range shifted up by 0.5.
  const shortContentShift = content.length < 100 ? 0.5 : 0

  return {
    ratio: typical,
    // Cap the lower bound at `typical`: without the cap, 'table' would report
    // min 2.3 above its ratio of 2.2 and the token bounds would invert.
    min: Math.min(min + shortContentShift, typical),
    max: max + shortContentShift,
  }
}

/**
 * Estimate tokens with confidence bounds.
 *
 * The bounds are crossed on purpose: the token `min` is computed from the
 * largest ratio and the token `max` from the smallest one.
 * NOTE(review): this assumes `roughTokenCountEstimation` yields fewer tokens
 * for a larger ratio - confirm against its definition.
 *
 * @param content Text to estimate.
 * @param type Optional explicit content type; detected when omitted.
 * @returns Point estimate plus lower and upper token bounds.
 */
export function estimateWithBounds(
  content: string,
  type?: ContentType,
): { estimate: number; min: number; max: number } {
  const {
    ratio,
    min: minRatio,
    max: maxRatio,
  } = getCompressionRatio(content, type)

  const estimate = roughTokenCountEstimation(content, ratio)
  const min = roughTokenCountEstimation(content, maxRatio)
  const max = roughTokenCountEstimation(content, minRatio)

  return { estimate, min, max }
}
import { describe, expect, it } from 'bun:test'
import {
  getTokenizerConfig,
  getBytesPerTokenForModel,
  detectContentType,
  getCompressionRatio,
  estimateWithBounds,
} from './tokenEstimation.js'

// Model-family lookup: substring match on the lowercased model name.
describe('Model Tokenizers', () => {
  describe('getTokenizerConfig', () => {
    it('returns config for claude models', () => {
      const config = getTokenizerConfig('claude-sonnet-4-5-20250514')
      expect(config.modelFamily).toBe('claude')
      expect(config.bytesPerToken).toBe(3.5)
    })

    it('returns config for gpt models', () => {
      const config = getTokenizerConfig('gpt-4')
      expect(config.modelFamily).toBe('gpt-4')
      expect(config.bytesPerToken).toBe(4)
    })

    it('returns default for unknown models', () => {
      const config = getTokenizerConfig('unknown-model')
      expect(config.modelFamily).toBe('unknown')
      expect(config.bytesPerToken).toBe(4)
    })
  })

  describe('getBytesPerTokenForModel', () => {
    it('returns bytes per token for model', () => {
      expect(getBytesPerTokenForModel('claude-opus-3-5-20250214')).toBe(3.5)
      expect(getBytesPerTokenForModel('gpt-4o')).toBe(4)
      expect(getBytesPerTokenForModel('deepseek-chat')).toBe(3.5)
      expect(getBytesPerTokenForModel('minimax-M2.7')).toBe(3.2)
    })
  })
})

// Heuristic classification; one case per content type the detector can return.
describe('Content Type Detection', () => {
  describe('detectContentType', () => {
    it('detects JSON', () => {
      expect(detectContentType('{"key": "value"}')).toBe('json')
      expect(detectContentType('[1, 2, 3]')).toBe('json')
    })

    it('detects code', () => {
      expect(detectContentType('function test() { return 1 + 2; }')).toBe('code')
      expect(detectContentType('const x = () => {}')).toBe('code')
    })

    it('detects prose', () => {
      expect(detectContentType('This is a natural language response.')).toBe('prose')
      expect(detectContentType('Hello world how are you?')).toBe('prose')
    })

    it('detects code-like technical', () => {
      // Contains units too, but the code check runs before the technical check
      // and the two semicolons already exceed the code-character threshold.
      expect(detectContentType('margin: 10px; padding: 5px;')).toBe('code')
    })

    it('detects technical', () => {
      // A number with a unit and no code punctuation.
      expect(detectContentType('The animation takes 300ms to run')).toBe('technical')
    })

    it('detects list', () => {
      expect(detectContentType('- item 1\n- item 2')).toBe('list')
      expect(detectContentType('1. first\n2. second')).toBe('list')
    })

    it('detects table', () => {
      // More than two lines, each containing the same delimiter.
      expect(detectContentType('name\tage\nann\t30\nbob\t41')).toBe('table')
    })

    it('detects prose by default', () => {
      // Two lines without tabs or commas: neither a table nor a list.
      expect(detectContentType('a b c\n1 2 3')).toBe('prose')
    })
  })
})

describe('Compression Ratio', () => {
  describe('getCompressionRatio', () => {
    it('returns appropriate ratios', () => {
      expect(getCompressionRatio('{"a":1}').ratio).toBe(2)
      expect(getCompressionRatio('code here {} []').ratio).toBe(3.5)
      expect(getCompressionRatio('Hello world').ratio).toBe(4)
    })

    it('honours an explicit content type', () => {
      // The override wins over detection ('Hello world' alone is prose).
      expect(getCompressionRatio('Hello world', 'json').ratio).toBe(2)
    })
  })

  describe('estimateWithBounds', () => {
    it('returns estimate with bounds', () => {
      const result = estimateWithBounds('Hello world')

      expect(result.min).toBeLessThanOrEqual(result.estimate)
      expect(result.max).toBeGreaterThanOrEqual(result.estimate)
      expect(result.min).toBeLessThan(result.max)
    })

    it('handles JSON with tighter bounds', () => {
      const result = estimateWithBounds('{"key": "value"}')

      // 16-character payload: the upper token bound should stay below 10.
      expect(result.max).toBeLessThan(10)
    })
  })
})