Files
orcs-code/src/context/repoMap/index.ts
gnanam1990 81896618a1 feat: add Codebase Intelligence — repo map with PageRank-ranked structural summaries
Add a new module that builds a structural map of the repository by parsing
source files with tree-sitter, building a cross-file reference graph
weighted by IDF, ranking files with PageRank, and rendering a
token-budgeted summary of the most important files and their signatures.

Stage 1 — Core module (src/context/repoMap/):
  Symbol extraction via web-tree-sitter WASM, IDF-weighted reference graph
  via graphology, PageRank ranking, token-budgeted rendering via js-tiktoken
  cl100k_base, disk cache with mtime invalidation. Supports TypeScript,
  JavaScript, and Python. 10 tests.

Stage 2 — RepoMap tool (src/tools/RepoMapTool/):
  buildTool wrapper registered in src/tools.ts. Read-only, concurrency-safe.
  Supports focus_files, focus_symbols, and max_tokens parameters. 9 tests.

Stage 3 — Integration:
  Auto-injection into session context behind REPO_MAP feature flag (off by
  default). /repomap slash command with --tokens, --focus, --stats, and
  --invalidate flags. User-facing docs in docs/repo-map.md. 13 tests.

With the flag off, the system context is byte-identical to previous behavior.

Dependencies: web-tree-sitter, tree-sitter-wasms, graphology,
graphology-pagerank, graphology-operators, js-tiktoken

Tests: 32 new, 621 total passing, 0 failures.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-28 11:57:16 +05:30

145 lines
4.2 KiB
TypeScript

import {
computeMapHash,
getCachedTags,
getCacheStats as getCacheStatsImpl,
invalidateCache as invalidateCacheImpl,
loadCache,
saveCache,
setCachedTags,
} from './cache.js'
import { getRepoFiles } from './gitFiles.js'
import { buildGraph } from './graph.js'
import { rankFiles } from './pagerank.js'
import { initParser } from './parser.js'
import { renderMap } from './renderer.js'
import { extractTags } from './symbolExtractor.js'
import type { FileTags, RepoMapOptions, RepoMapResult, CacheStats } from './types.js'
/** Token budget passed to the renderer when `options.maxTokens` is not provided. */
const DEFAULT_MAX_TOKENS = 2048
/** Number of uncached files whose tags are extracted concurrently per batch. */
const EXTRACTION_BATCH_SIZE = 50

/** Metadata stored (as JSON) in the `name` field of a cached rendered-map entry. */
interface RenderedMapMeta {
  fileCount: number
  tokenCount: number
}

/**
 * Decode the metadata of a cached rendered map.
 *
 * The cache lives on disk, so its content is treated as untrusted: anything
 * that is not a JSON object yields `null`, and counters that are missing or
 * not numbers fall back to `0`.
 *
 * @param raw - The `name` field of the cached pseudo-tag.
 * @returns The decoded metadata, or `null` when `raw` is not a JSON object.
 */
function parseRenderedMeta(raw: unknown): RenderedMapMeta | null {
  if (typeof raw !== 'string') {
    return null
  }
  let parsed: unknown
  try {
    parsed = JSON.parse(raw)
  } catch {
    return null
  }
  if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
    return null
  }
  const { fileCount, tokenCount } = parsed as Record<string, unknown>
  return {
    fileCount: typeof fileCount === 'number' ? fileCount : 0,
    tokenCount: typeof tokenCount === 'number' ? tokenCount : 0,
  }
}

/**
 * Gather tags for every file, reusing per-file cache entries where available.
 *
 * Files with cached tags come first (in input order), followed by freshly
 * extracted files. Newly extracted tags are written into `cache` in memory;
 * persisting the cache is left to the caller.
 *
 * @param files - Paths to process.
 * @param root - Repository root handed to the cache and extractor helpers.
 * @param cache - Loaded cache object; mutated via `setCachedTags`.
 * @returns Tags for every file that was cached or extracted successfully.
 */
async function collectFileTags(
  files: readonly string[],
  root: string,
  cache: ReturnType<typeof loadCache>,
): Promise<FileTags[]> {
  const collected: FileTags[] = []
  const uncached: string[] = []

  for (const file of files) {
    const cachedTags = getCachedTags(cache, file, root)
    if (cachedTags) {
      collected.push({ path: file, tags: cachedTags })
    } else {
      uncached.push(file)
    }
  }

  // Extract in fixed-size batches so only a bounded number of extractions
  // are in flight at once.
  for (let start = 0; start < uncached.length; start += EXTRACTION_BATCH_SIZE) {
    const batch = uncached.slice(start, start + EXTRACTION_BATCH_SIZE)
    const results = await Promise.all(
      // Best effort: a file whose extraction fails is left out of the map
      // instead of failing the whole build.
      batch.map(file => extractTags(file, root).catch(() => null)),
    )
    for (const fileTags of results) {
      if (!fileTags) {
        continue
      }
      collected.push(fileTags)
      setCachedTags(cache, fileTags.path, root, fileTags.tags)
    }
  }

  return collected
}

/**
 * Build a structural summary of a code repository.
 *
 * Walks the repo, extracts symbols via tree-sitter, builds an IDF-weighted
 * reference graph, ranks files with PageRank, and renders a token-budgeted
 * structural summary. A previously rendered map for the same
 * `computeMapHash(files, maxTokens, focusFiles)` is returned straight from
 * the disk cache when its stored entry is well-formed.
 *
 * @param options - Build options. `root` defaults to `process.cwd()`,
 *   `maxTokens` to `DEFAULT_MAX_TOKENS`, `focusFiles` to `[]`, and `files`
 *   to the result of `getRepoFiles(root)`.
 * @returns The rendered map together with `cacheHit`, `buildTimeMs`,
 *   `fileCount`, `totalFileCount` (number of input files) and `tokenCount`.
 */
export async function buildRepoMap(options: RepoMapOptions = {}): Promise<RepoMapResult> {
  const startTime = Date.now()
  const root = options.root ?? process.cwd()
  const maxTokens = options.maxTokens ?? DEFAULT_MAX_TOKENS
  const focusFiles = options.focusFiles ?? []

  // NOTE(review): the parser is initialized even when the rendered map below
  // turns out to be a cache hit. If nothing before tag extraction needs it,
  // deferring this call would make cache hits cheaper — confirm before moving.
  await initParser()

  const files = options.files ?? (await getRepoFiles(root))
  const totalFileCount = files.length

  // Rendered maps are stored in the same cache as per-file tags, under a
  // reserved key, as a single pseudo-tag: the map text in `signature` and
  // JSON metadata in `name`.
  // NOTE(review): a hit depends only on `mapHash`; confirm that
  // `computeMapHash` reflects file content/mtime changes, otherwise edits to
  // existing files could return a stale map.
  const mapHash = computeMapHash(files, maxTokens, focusFiles)
  const cache = loadCache(root)
  const renderedCacheKey = `__rendered__${mapHash}`
  const renderedEntry = cache.entries[renderedCacheKey]
  const cachedTag = renderedEntry?.tags.length === 1 ? renderedEntry.tags[0] : undefined

  if (cachedTag) {
    const meta = parseRenderedMeta(cachedTag.name)
    // Serve the cached map only when the stored entry is well-formed;
    // otherwise fall through to a full build, which overwrites it.
    if (meta && typeof cachedTag.signature === 'string') {
      return {
        map: cachedTag.signature,
        cacheHit: true,
        buildTimeMs: Date.now() - startTime,
        fileCount: meta.fileCount,
        totalFileCount,
        tokenCount: meta.tokenCount,
      }
    }
  }

  const allFileTags = await collectFileTags(files, root, cache)

  // Build the reference graph and rank files, biased by the focus files.
  const graph = buildGraph(allFileTags)
  const ranked = rankFiles(graph, focusFiles)

  // Path -> tags lookup consumed by the renderer.
  const fileTagsMap = new Map<string, FileTags>()
  for (const fileTags of allFileTags) {
    fileTagsMap.set(fileTags.path, fileTags)
  }

  const { map, tokenCount, fileCount } = renderMap(ranked, fileTagsMap, maxTokens)

  // Store the rendered result in the pseudo-tag layout read back above.
  cache.entries[renderedCacheKey] = {
    tags: [
      {
        kind: 'def',
        name: JSON.stringify({ fileCount, tokenCount }),
        line: 0,
        signature: map,
      },
    ],
    mtimeMs: Date.now(),
    size: 0,
  }
  saveCache(root, cache)

  return {
    map,
    cacheHit: false,
    buildTimeMs: Date.now() - startTime,
    fileCount,
    totalFileCount,
    tokenCount,
  }
}
/**
 * Drop the on-disk repo-map cache.
 *
 * @param root - Repository root whose cache is invalidated; falls back to
 *   the current working directory when omitted.
 */
export function invalidateCache(root?: string): void {
  const targetRoot = root ?? process.cwd()
  invalidateCacheImpl(targetRoot)
}
/**
 * Report statistics about the repo-map cache.
 *
 * @param root - Repository root whose cache is inspected; falls back to the
 *   current working directory when omitted.
 * @returns The statistics produced by the cache module for that root.
 */
export function getCacheStats(root?: string): CacheStats {
  const targetRoot = root ?? process.cwd()
  return getCacheStatsImpl(targetRoot)
}
// Re-export types for convenience
export type { RepoMapOptions, RepoMapResult, CacheStats } from './types.js'