feat: add model-specific tokenizers and compression ratio detection (#799)

- ModelTokenizerConfig for different model families - getTokenizerConfig() / getBytesPerTokenForModel() - Content type detection (json, code, prose, list, technical) - COMPRESSION_RATIOS - empirical ratios per content type - estimateWithBounds() - confidence intervals Features: 1.1, 1.14, 1.15 Tests: 13 passing
2026-04-22 06:24:12 +01:00
parent 86bce4ae74
commit e92e5274b2
2 changed files with 243 additions and 0 deletions
--- a/src/services/tokenEstimation.ts
+++ b/src/services/tokenEstimation.ts
@@ -223,6 +223,49 @@ export function bytesPerTokenForFileType(fileExtension: string): number {
  }
 }

+/**
+ * Tokenizer ratio by model family.
+ * Different models have different encodings.
+ */
+export interface ModelTokenizerConfig {
+  modelFamily: string
+  bytesPerToken: number
+  supportsJson: boolean
+  supportsCode: boolean
+}
+
+export const MODEL_TOKENIZER_CONFIGS: ModelTokenizerConfig[] = [
+  { modelFamily: 'claude', bytesPerToken: 3.5, supportsJson: true, supportsCode: true },
+  { modelFamily: 'gpt-4', bytesPerToken: 4, supportsJson: true, supportsCode: true },
+  { modelFamily: 'gpt-3.5', bytesPerToken: 4, supportsJson: true, supportsCode: true },
+  { modelFamily: 'gemini', bytesPerToken: 3.5, supportsJson: true, supportsCode: true },
+  { modelFamily: 'llama', bytesPerToken: 3.8, supportsJson: true, supportsCode: true },
+  { modelFamily: 'deepseek', bytesPerToken: 3.5, supportsJson: true, supportsCode: true },
+  { modelFamily: 'minimax', bytesPerToken: 3.2, supportsJson: true, supportsCode: true },
+]
+
+/**
+ * Get tokenizer config for a model.
+ */
+export function getTokenizerConfig(model: string): ModelTokenizerConfig {
+  const lower = model.toLowerCase()
+  
+  for (const config of MODEL_TOKENIZER_CONFIGS) {
+    if (lower.includes(config.modelFamily)) {
+      return config
+    }
+  }
+  
+  return { modelFamily: 'unknown', bytesPerToken: 4, supportsJson: true, supportsCode: true }
+}
+
+/**
+ * Get bytes-per-token ratio for a model.
+ */
+export function getBytesPerTokenForModel(model: string): number {
+  return getTokenizerConfig(model).bytesPerToken
+}
+
 /**
 * Like {@link roughTokenCountEstimation} but uses a more accurate
 * bytes-per-token ratio when the file type is known.
@@ -241,6 +284,106 @@ export function roughTokenCountEstimationForFileType(
  )
 }

+/**
+ * Content type classification for compression ratio.
+ */
+export type ContentType = 
+  | 'json' | 'code' | 'prose' | 'technical' 
+  | 'list' | 'table' | 'mixed'
+
+/**
+ * Compression ratio by content type.
+ * Measured empirically - denser content = lower ratio.
+ */
+export const COMPRESSION_RATIOS: Record<ContentType, { min: number; max: number; typical: number }> = {
+  json: { min: 1.5, max: 2.5, typical: 2 },
+  code: { min: 3, max: 4.5, typical: 3.5 },
+  prose: { min: 3.5, max: 4.5, typical: 4 },
+  technical: { min: 2.5, max: 3.5, typical: 3 },
+  list: { min: 2, max: 3, typical: 2.5 },
+  table: { min: 1.8, max: 2.8, typical: 2.2 },
+  mixed: { min: 3, max: 4, typical: 3.5 },
+}
+
+/**
+ * Detect content type from content.
+ */
+export function detectContentType(content: string): ContentType {
+  const trimmed = content.trim()
+  
+  // JSON
+  if ((trimmed.startsWith('{') && trimmed.endsWith('}')) || 
+      (trimmed.startsWith('[') && trimmed.endsWith(']'))) {
+    try {
+      JSON.parse(trimmed)
+      return 'json'
+    } catch { /* not valid json */ }
+  }
+  
+  // Table (tabs or consistent delimiters)
+  const lines = trimmed.split('\n')
+  if (lines.length > 2) {
+    const hasTabs = lines[0].includes('\t')
+    const hasCommas = lines[0].includes(',')
+    if (hasTabs || hasCommas) {
+      const consistent = lines.slice(1).every(l => l.includes('\t') || l.includes(','))
+      if (consistent) return 'table'
+    }
+  }
+  
+  // List
+  if (/^[\d\-\*\•]/.test(trimmed) || /^[\d\-\*\•]/.test(lines[0])) {
+    return 'list'
+  }
+  
+  // Code (high density of special chars)
+  const codeChars = (content.match(/[{}()\[\];=]/g) || []).length
+  const codeRatio = codeChars / content.length
+  if (codeRatio > 0.05) return 'code'
+  
+  // Technical (has numbers and units)
+  if (/\d+\s*(px|em|rem|%|ms|s|kb|mb|gb)/i.test(content)) {
+    return 'technical'
+  }
+  
+  // Prose (default - natural language)
+  return 'prose'
+}
+
+/**
+ * Get compression ratio for content.
+ */
+export function getCompressionRatio(content: string, type?: ContentType): { ratio: number; min: number; max: number } {
+  const detectedType = type ?? detectContentType(content)
+  const { min, max, typical } = COMPRESSION_RATIOS[detectedType]
+  
+  // Adjust based on actual content length
+  // Shorter content = higher variance
+  const lengthBonus = content.length < 100 ? 0.5 : 0
+  
+  return {
+    ratio: typical,
+    min: min + lengthBonus,
+    max: max + lengthBonus,
+  }
+}
+
+/**
+ * Estimate tokens with confidence bounds.
+ */
+export function estimateWithBounds(
+  content: string,
+  type?: ContentType,
+): { estimate: number; min: number; max: number } {
+  const { ratio, min: minRatio, max: maxRatio } = getCompressionRatio(content, type)
+  
+  const estimate = roughTokenCountEstimation(content, ratio)
+  const min = roughTokenCountEstimation(content, maxRatio)
+  const max = roughTokenCountEstimation(content, minRatio)
+  
+  return { estimate, min, max }
+}
+
 /**
 * Estimates token count for a Message object by extracting and analyzing its text content.
 * This provides a more reliable estimate than getTokenUsage for messages that may have been compacted.