// orcs-code/src/utils/providerRecommendation.ts
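//
// Heuristics for recommending a local Ollama model (or an OpenAI default)
// for a given goal: lowest latency, balanced, or coding.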
export type RecommendationGoal = 'latency' | 'balanced' | 'coding'

export type OllamaModelDescriptor = {
  name: string
  sizeBytes?: number | null
  family?: string | null
  families?: string[]
  parameterSize?: string | null
  quantizationLevel?: string | null
}

export type RankedOllamaModel = OllamaModelDescriptor & {
  score: number
  reasons: string[]
  summary: string
}

export type BenchmarkedOllamaModel = RankedOllamaModel & {
  benchmarkMs: number | null
}
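
// Keyword hints matched against a model's metadata. These lists are
// heuristics, not an exhaustive registry of Ollama model names.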
const CODING_HINTS = [
  'coder',
  'codellama',
  'codegemma',
  'codestral',
  'devstral',
  'starcoder',
  'deepseek-coder',
  'qwen2.5-coder',
  'qwen-coder',
]

const GENERAL_HINTS = [
  'llama',
  'qwen',
  'mistral',
  'gemma',
  'phi',
  'deepseek',
]

const INSTRUCT_HINTS = ['instruct', 'chat', 'assistant']

const NON_CHAT_HINTS = ['embed', 'embedding', 'rerank', 'bge', 'whisper']
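
// Flattens every searchable metadata field into one lowercase string so the
// hint lists above can be matched with simple substring checks.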
function modelHaystack(model: OllamaModelDescriptor): string {
  return [
    model.name,
    model.family ?? '',
    ...(model.families ?? []),
    model.parameterSize ?? '',
    model.quantizationLevel ?? '',
  ]
    .join(' ')
    .toLowerCase()
}

function includesAny(text: string, needles: string[]): boolean {
  return needles.some(needle => text.includes(needle))
}
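
/**
 * A model is considered chat-viable unless its metadata suggests it is an
 * embedding, reranking, or speech model (see NON_CHAT_HINTS).
 */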
export function isViableOllamaChatModel(model: OllamaModelDescriptor): boolean {
  return !includesAny(modelHaystack(model), NON_CHAT_HINTS)
}
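
// Returns the first chat-viable model in the given order; callers that want
// the best pick should pass an already-sorted list (see rankOllamaModels).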
export function selectRecommendedOllamaModel<
  T extends OllamaModelDescriptor,
>(models: T[]): T | null {
  return models.find(isViableOllamaChatModel) ?? null
}
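
/**
 * Best-effort parameter-count estimate in billions. Parses an explicit size
 * such as '8b' from parameterSize or the model name (e.g. 'llama3:8b' -> 8),
 * and otherwise falls back to the on-disk size in gigabytes, which is only a
 * rough proxy since quantized weights pack several parameters per byte.
 */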
function inferParameterBillions(model: OllamaModelDescriptor): number | null {
  const text = `${model.parameterSize ?? ''} ${model.name}`.toLowerCase()
  const match = text.match(/(\d+(?:\.\d+)?)\s*b\b/)
  if (match?.[1]) {
    return Number(match[1])
  }
  if (typeof model.sizeBytes === 'number' && model.sizeBytes > 0) {
    return Number((model.sizeBytes / 1_000_000_000).toFixed(1))
  }
  return null
}
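
// Quantization level with the model name as a fallback, since Ollama tags
// often encode the quantization (e.g. 'q4_K_M') directly in the name.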
function quantizationBucket(model: OllamaModelDescriptor): string {
  return (model.quantizationLevel ?? model.name).toLowerCase()
}
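
/**
 * Scores a model's size tier for the given goal and appends a human-readable
 * justification to `reasons`. Latency favors small models, coding favors the
 * 7B-34B range, and balanced (the fall-through case) peaks around 7B-14B.
 */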
function scoreSizeTier(
  paramsB: number | null,
  goal: RecommendationGoal,
  reasons: string[],
): number {
  if (paramsB === null) {
    reasons.push('unknown size')
    return 0
  }
  if (goal === 'latency') {
    if (paramsB <= 4) {
      reasons.push('tiny model for low latency')
      return 32
    }
    if (paramsB <= 8) {
      reasons.push('small model for fast responses')
      return 26
    }
    if (paramsB <= 14) {
      reasons.push('mid-sized model with acceptable latency')
      return 16
    }
    if (paramsB <= 24) {
      reasons.push('larger model may be slower')
      return 8
    }
    reasons.push('large model likely slower locally')
    return paramsB <= 40 ? 0 : -8
  }
  if (goal === 'coding') {
    if (paramsB >= 7 && paramsB <= 14) {
      reasons.push('strong coding size tier')
      return 24
    }
    if (paramsB > 14 && paramsB <= 34) {
      reasons.push('large coding-capable size tier')
      return 28
    }
    if (paramsB > 34) {
      reasons.push('very large model with higher quality potential')
      return 18
    }
    reasons.push('compact model may trade off coding depth')
    return 12
  }
  if (paramsB >= 7 && paramsB <= 14) {
    reasons.push('great balanced size tier')
    return 26
  }
  if (paramsB >= 3 && paramsB < 7) {
    reasons.push('compact balanced size tier')
    return 18
  }
  if (paramsB > 14 && paramsB <= 24) {
    reasons.push('high quality balanced size tier')
    return 20
  }
  if (paramsB > 24) {
    reasons.push('large model for quality-first usage')
    return 10
  }
  reasons.push('very small model for general usage')
  return 8
}
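
// Small bonus for recognized quantization levels: Q4 is cheapest to run,
// Q8 preserves the most quality.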
function scoreQuantization(
  model: OllamaModelDescriptor,
  goal: RecommendationGoal,
  reasons: string[],
): number {
  const quant = quantizationBucket(model)
  if (quant.includes('q4')) {
    reasons.push('efficient Q4 quantization')
    return goal === 'latency' ? 8 : 4
  }
  if (quant.includes('q5')) {
    reasons.push('balanced Q5 quantization')
    return goal === 'latency' ? 6 : 5
  }
  if (quant.includes('q8')) {
    reasons.push('higher quality Q8 quantization')
    return goal === 'latency' ? 2 : 5
  }
  return 0
}
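
/**
 * Sort comparator: higher score first, then break ties by size according to
 * the goal (smallest wins for latency, largest for coding, closest to 14B
 * for balanced).
 */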
function compareRankedModels(
  a: RankedOllamaModel | BenchmarkedOllamaModel,
  b: RankedOllamaModel | BenchmarkedOllamaModel,
  goal: RecommendationGoal,
): number {
  if (b.score !== a.score) {
    return b.score - a.score
  }
  const aSize = inferParameterBillions(a)
  const bSize = inferParameterBillions(b)
  // Treat two unknown sizes as equal explicitly, rather than relying on the
  // Infinity - Infinity = NaN result being coerced to 0 by the sort.
  if (aSize === null && bSize === null) {
    return 0
  }
  const aParams = aSize ?? Number.POSITIVE_INFINITY
  const bParams = bSize ?? Number.POSITIVE_INFINITY
  if (goal === 'latency') {
    return aParams - bParams
  }
  if (goal === 'coding') {
    return bParams - aParams
  }
  const target = 14
  return Math.abs(aParams - target) - Math.abs(bParams - target)
}
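
// Coerces free-form user input to a known goal, defaulting to 'balanced'.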
export function normalizeRecommendationGoal(
  goal: string | null | undefined,
): RecommendationGoal {
  const normalized = goal?.trim().toLowerCase()
  if (
    normalized === 'latency' ||
    normalized === 'balanced' ||
    normalized === 'coding'
  ) {
    return normalized
  }
  return 'balanced'
}
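
// Per-goal default model when the selected provider is OpenAI.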
export function getGoalDefaultOpenAIModel(goal: RecommendationGoal): string {
  switch (goal) {
    case 'latency':
      return 'gpt-4o-mini'
    case 'coding':
      return 'gpt-4o'
    case 'balanced':
    default:
      return 'gpt-4o'
  }
}
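
/**
 * Scores every model against the goal using the family, instruct, size, and
 * quantization heuristics above, then sorts best-first. Scores are relative
 * weights for ranking, not calibrated quality estimates.
 *
 * @example
 * // Illustrative only; real descriptor values depend on what Ollama reports.
 * const ranked = rankOllamaModels(
 *   [
 *     { name: 'llama3:8b-instruct-q4_K_M' },
 *     { name: 'nomic-embed-text' },
 *   ],
 *   'latency',
 * )
 * // ranked[0] is the llama3 model; the embedding model is heavily penalized.
 */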
export function rankOllamaModels(
  models: OllamaModelDescriptor[],
  goal: RecommendationGoal,
): RankedOllamaModel[] {
  return models
    .map(model => {
      const haystack = modelHaystack(model)
      const reasons: string[] = []
      let score = 0
      if (includesAny(haystack, NON_CHAT_HINTS)) {
        score -= 40
        reasons.push('not a chat-first model')
      }
      if (includesAny(haystack, CODING_HINTS)) {
        score += goal === 'coding' ? 24 : goal === 'balanced' ? 10 : 4
        reasons.push('coding-oriented model family')
      }
      if (includesAny(haystack, GENERAL_HINTS)) {
        score += goal === 'latency' ? 4 : goal === 'coding' ? 6 : 8
        reasons.push('strong general-purpose model family')
      }
      if (includesAny(haystack, INSTRUCT_HINTS)) {
        score += goal === 'latency' ? 2 : 6
        reasons.push('chat/instruct tuned')
      }
      if (haystack.includes('vision') || haystack.includes('vl')) {
        score -= 2
        reasons.push('vision model adds extra overhead')
      }
      score += scoreSizeTier(inferParameterBillions(model), goal, reasons)
      score += scoreQuantization(model, goal, reasons)
      const summary = reasons.slice(0, 3).join(', ')
      return {
        ...model,
        score,
        reasons,
        summary,
      }
    })
    .sort((a, b) => compareRankedModels(a, b, goal))
}
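
// Convenience wrapper: rank for the goal, then pick the best chat-viable model.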
export function recommendOllamaModel(
  models: OllamaModelDescriptor[],
  goal: RecommendationGoal,
): RankedOllamaModel | null {
  return selectRecommendedOllamaModel(rankOllamaModels(models, goal))
}
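
/**
 * Folds measured response latencies into the heuristic scores. The penalty is
 * `latencyMs / divisor`, so the divisor controls how much latency matters:
 * every 120ms costs a point for the latency goal, versus every 500ms for
 * coding, where quality outweighs speed. Benchmarked models are returned
 * ahead of unbenchmarked ones regardless of score.
 */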
export function applyBenchmarkLatency(
  models: RankedOllamaModel[],
  benchmarkMs: Record<string, number | null>,
  goal: RecommendationGoal,
): BenchmarkedOllamaModel[] {
  const divisor =
    goal === 'latency' ? 120 : goal === 'coding' ? 500 : 240
  const scoredModels = models.map(model => {
    const latency = benchmarkMs[model.name] ?? null
    const benchmarkPenalty = latency === null ? 0 : latency / divisor
    const reasons =
      latency === null
        ? model.reasons
        : [`benchmarked at ${Math.round(latency)}ms`, ...model.reasons]
    return {
      ...model,
      benchmarkMs: latency,
      reasons,
      summary: reasons.slice(0, 3).join(', '),
      score: Number((model.score - benchmarkPenalty).toFixed(2)),
    }
  })
  const benchmarkedModels = scoredModels.filter(model => model.benchmarkMs !== null)
  if (benchmarkedModels.length === 0) {
    return scoredModels.sort((a, b) => compareRankedModels(a, b, goal))
  }
  const unbenchmarkedModels = scoredModels.filter(model => model.benchmarkMs === null)
  benchmarkedModels.sort((a, b) => compareRankedModels(a, b, goal))
  return [...benchmarkedModels, ...unbenchmarkedModels]
}
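
/*
 * Usage sketch (hypothetical values; `userInput` and `installedModels` stand
 * in for data gathered elsewhere, e.g. from settings and the Ollama API):
 *
 *   const goal = normalizeRecommendationGoal(userInput)          // e.g. 'coding'
 *   const ranked = rankOllamaModels(installedModels, goal)
 *   const withLatency = applyBenchmarkLatency(
 *     ranked,
 *     { 'qwen2.5-coder:14b': 850 },                              // measured ms
 *     goal,
 *   )
 *   const pick = selectRecommendedOllamaModel(withLatency)
 *   const fallback = getGoalDefaultOpenAIModel(goal)             // if pick is null
 */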