feat: add model-specific tokenizers and compression ratio detection (#799)
- ModelTokenizerConfig for different model families
- getTokenizerConfig() / getBytesPerTokenForModel()
- Content type detection (json, code, prose, list, technical)
- COMPRESSION_RATIOS - empirical ratios per content type
- estimateWithBounds() - confidence intervals

Features: 1.1, 1.14, 1.15
Tests: 13 passing
commit e92e5274b2 (parent 86bce4ae74)
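A minimal usage sketch of the new API (illustrative only; the import path and model string are assumptions, not part of the commit):

import {
  getBytesPerTokenForModel,
  detectContentType,
  estimateWithBounds,
} from './tokenEstimation.js'

// Resolve a byte ratio for the active model (substring match on the name).
const bytesPerToken = getBytesPerTokenForModel('claude-sonnet-4-5') // 3.5

// Classify a payload and estimate its token count with confidence bounds.
const reply = '{"status": "ok", "items": [1, 2, 3]}'
const kind = detectContentType(reply) // 'json'
const { estimate, min, max } = estimateWithBounds(reply)
// Invariant: min <= estimate <= max; JSON's narrow ratio range keeps the bounds tight.
console.log(kind, bytesPerToken, estimate, min, max)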
@@ -223,6 +223,49 @@ export function bytesPerTokenForFileType(fileExtension: string): number {
  }
}

/**
 * Tokenizer ratio by model family.
 * Different model families use different encodings, so their
 * bytes-per-token ratios differ.
 */
export interface ModelTokenizerConfig {
  modelFamily: string
  bytesPerToken: number
  supportsJson: boolean
  supportsCode: boolean
}

export const MODEL_TOKENIZER_CONFIGS: ModelTokenizerConfig[] = [
  { modelFamily: 'claude', bytesPerToken: 3.5, supportsJson: true, supportsCode: true },
  { modelFamily: 'gpt-4', bytesPerToken: 4, supportsJson: true, supportsCode: true },
  { modelFamily: 'gpt-3.5', bytesPerToken: 4, supportsJson: true, supportsCode: true },
  { modelFamily: 'gemini', bytesPerToken: 3.5, supportsJson: true, supportsCode: true },
  { modelFamily: 'llama', bytesPerToken: 3.8, supportsJson: true, supportsCode: true },
  { modelFamily: 'deepseek', bytesPerToken: 3.5, supportsJson: true, supportsCode: true },
  { modelFamily: 'minimax', bytesPerToken: 3.2, supportsJson: true, supportsCode: true },
]

/**
 * Get the tokenizer config for a model by substring match on the
 * model name. Falls back to a generic 4-bytes-per-token config for
 * unrecognized models.
 */
export function getTokenizerConfig(model: string): ModelTokenizerConfig {
  const lower = model.toLowerCase()

  for (const config of MODEL_TOKENIZER_CONFIGS) {
    if (lower.includes(config.modelFamily)) {
      return config
    }
  }

  return { modelFamily: 'unknown', bytesPerToken: 4, supportsJson: true, supportsCode: true }
}

/**
 * Get the bytes-per-token ratio for a model.
 */
export function getBytesPerTokenForModel(model: string): number {
  return getTokenizerConfig(model).bytesPerToken
}
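// Illustrative only, not part of the diff: substring matching resolves
// versioned model names to a family, and unmatched names fall back to
// the generic default.
//   getTokenizerConfig('gpt-4o-mini').modelFamily   // 'gpt-4'
//   getTokenizerConfig('mistral-large').modelFamily // 'unknown'
//   getBytesPerTokenForModel('llama-3.1-70b')       // 3.8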

/**
 * Like {@link roughTokenCountEstimation} but uses a more accurate
 * bytes-per-token ratio when the file type is known.
@@ -241,6 +284,106 @@ export function roughTokenCountEstimationForFileType(
  )
}

/**
 * Content type classification for compression ratio.
 */
export type ContentType =
  | 'json' | 'code' | 'prose' | 'technical'
  | 'list' | 'table' | 'mixed'

/**
 * Compression ratio (bytes per token) by content type.
 * Measured empirically; denser content yields a lower ratio.
 */
export const COMPRESSION_RATIOS: Record<ContentType, { min: number; max: number; typical: number }> = {
  json: { min: 1.5, max: 2.5, typical: 2 },
  code: { min: 3, max: 4.5, typical: 3.5 },
  prose: { min: 3.5, max: 4.5, typical: 4 },
  technical: { min: 2.5, max: 3.5, typical: 3 },
  list: { min: 2, max: 3, typical: 2.5 },
  table: { min: 1.8, max: 2.8, typical: 2.2 },
  mixed: { min: 3, max: 4, typical: 3.5 },
}
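// Illustrative arithmetic, not part of the diff: 1000 bytes of JSON at
// the typical 2 bytes/token is ~500 tokens; the min/max ratios bound it
// between ~400 (1000/2.5) and ~667 (1000/1.5) tokens.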

/**
 * Detect content type from content. Checks run in priority order:
 * JSON, table, list, code, technical; prose is the default.
 */
export function detectContentType(content: string): ContentType {
  const trimmed = content.trim()

  // JSON: looks like an object or array and actually parses
  if ((trimmed.startsWith('{') && trimmed.endsWith('}')) ||
      (trimmed.startsWith('[') && trimmed.endsWith(']'))) {
    try {
      JSON.parse(trimmed)
      return 'json'
    } catch { /* not valid json */ }
  }

  // Table: more than two lines sharing a delimiter (tabs or commas)
  const lines = trimmed.split('\n')
  if (lines.length > 2) {
    const hasTabs = lines[0].includes('\t')
    const hasCommas = lines[0].includes(',')
    if (hasTabs || hasCommas) {
      const consistent = lines.slice(1).every(l => l.includes('\t') || l.includes(','))
      if (consistent) return 'table'
    }
  }

  // List: starts with a digit, dash, asterisk, or bullet
  if (/^[\d\-*•]/.test(trimmed)) {
    return 'list'
  }

  // Code: high density of special characters
  const codeChars = (content.match(/[{}()\[\];=]/g) || []).length
  const codeRatio = codeChars / content.length
  if (codeRatio > 0.05) return 'code'

  // Technical: numbers followed by units
  if (/\d+\s*(px|em|rem|%|ms|s|kb|mb|gb)/i.test(content)) {
    return 'technical'
  }

  // Prose: the default for natural language
  return 'prose'
}
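// Illustrative precedence, not part of the diff: CSS-like text contains
// both code characters and unit suffixes, but the code check runs before
// the technical check, so special-character density wins.
//   detectContentType('margin: 10px; padding: 5px;')  // 'code'
//   detectContentType('took 120 ms to load the page') // 'technical'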

/**
 * Get the compression ratio for content.
 */
export function getCompressionRatio(content: string, type?: ContentType): { ratio: number; min: number; max: number } {
  const detectedType = type ?? detectContentType(content)
  const { min, max, typical } = COMPRESSION_RATIOS[detectedType]

  // Short content has higher estimation variance; shift both ratio
  // bounds up for content under 100 characters to compensate.
  const lengthBonus = content.length < 100 ? 0.5 : 0

  return {
    ratio: typical,
    min: min + lengthBonus,
    max: max + lengthBonus,
  }
}
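// Illustrative, not part of the diff: '{"a":1}' detects as json and is
// under 100 characters, so the 0.5 length bonus shifts both bounds:
//   getCompressionRatio('{"a":1}') // { ratio: 2, min: 2, max: 3 }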

/**
 * Estimate tokens with confidence bounds.
 */
export function estimateWithBounds(
  content: string,
  type?: ContentType,
): { estimate: number; min: number; max: number } {
  const { ratio, min: minRatio, max: maxRatio } = getCompressionRatio(content, type)

  const estimate = roughTokenCountEstimation(content, ratio)
  const min = roughTokenCountEstimation(content, maxRatio)
  const max = roughTokenCountEstimation(content, minRatio)

  return { estimate, min, max }
}
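// Illustrative, not part of the diff: the bounds invert because ratios
// are bytes per token, so the largest byte ratio yields the fewest
// tokens. E.g. 100 bytes at 4 bytes/token is ~25 tokens, while the same
// 100 bytes at 5 bytes/token is ~20 tokens.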

/**
 * Estimates token count for a Message object by extracting and analyzing its text content.
 * This provides a more reliable estimate than getTokenUsage for messages that may have been compacted.
src/services/tokenModelCompression.test.ts (new file, 100 lines)
@@ -0,0 +1,100 @@
import { describe, expect, it } from 'bun:test'
import {
  getTokenizerConfig,
  getBytesPerTokenForModel,
  detectContentType,
  getCompressionRatio,
  estimateWithBounds,
} from './tokenEstimation.js'

describe('Model Tokenizers', () => {
  describe('getTokenizerConfig', () => {
    it('returns config for claude models', () => {
      const config = getTokenizerConfig('claude-sonnet-4-5-20250514')
      expect(config.modelFamily).toBe('claude')
      expect(config.bytesPerToken).toBe(3.5)
    })

    it('returns config for gpt models', () => {
      const config = getTokenizerConfig('gpt-4')
      expect(config.modelFamily).toBe('gpt-4')
      expect(config.bytesPerToken).toBe(4)
    })

    it('returns default for unknown models', () => {
      const config = getTokenizerConfig('unknown-model')
      expect(config.modelFamily).toBe('unknown')
      expect(config.bytesPerToken).toBe(4)
    })
  })

  describe('getBytesPerTokenForModel', () => {
    it('returns bytes per token for model', () => {
      expect(getBytesPerTokenForModel('claude-opus-3-5-20250214')).toBe(3.5)
      expect(getBytesPerTokenForModel('gpt-4o')).toBe(4)
      expect(getBytesPerTokenForModel('deepseek-chat')).toBe(3.5)
      expect(getBytesPerTokenForModel('minimax-M2.7')).toBe(3.2)
    })
  })
})

describe('Content Type Detection', () => {
  describe('detectContentType', () => {
    it('detects JSON', () => {
      expect(detectContentType('{"key": "value"}')).toBe('json')
      expect(detectContentType('[1, 2, 3]')).toBe('json')
    })

    it('detects code', () => {
      expect(detectContentType('function test() { return 1 + 2; }')).toBe('code')
      expect(detectContentType('const x = () => {}')).toBe('code')
    })

    it('detects prose', () => {
      expect(detectContentType('This is a natural language response.')).toBe('prose')
      expect(detectContentType('Hello world how are you?')).toBe('prose')
    })

    it('detects code-like technical', () => {
      // Has both code characters and units; the code check runs first,
      // so the high special-character density wins
      expect(detectContentType('margin: 10px; padding: 5px;')).toBe('code')
    })

    it('detects list', () => {
      expect(detectContentType('- item 1\n- item 2')).toBe('list')
      expect(detectContentType('1. first\n2. second')).toBe('list')
    })

    it('detects prose by default', () => {
      // Two short lines with no delimiters or list markers fall through to prose
      expect(detectContentType('a b c\n1 2 3')).toBe('prose')
    })
  })
})

describe('Compression Ratio', () => {
  describe('getCompressionRatio', () => {
    it('returns appropriate ratios', () => {
      expect(getCompressionRatio('{"a":1}').ratio).toBe(2)
      expect(getCompressionRatio('code here {} []').ratio).toBe(3.5)
      expect(getCompressionRatio('Hello world').ratio).toBe(4)
    })
  })

  describe('estimateWithBounds', () => {
    it('returns estimate with bounds', () => {
      const result = estimateWithBounds('Hello world')

      expect(result.min).toBeLessThanOrEqual(result.estimate)
      expect(result.max).toBeGreaterThanOrEqual(result.estimate)
      expect(result.min).toBeLessThan(result.max)
    })

    it('handles JSON with tighter bounds', () => {
      const result = estimateWithBounds('{"key": "value"}')

      // JSON's ratio range is narrow, so the upper bound stays small
      expect(result.max).toBeLessThan(10)
    })
  })
})
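The suite runs under Bun's built-in test runner (a usage note, not part of the commit):

bun test src/services/tokenModelCompression.test.ts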