feat: add Codebase Intelligence — repo map with PageRank-ranked structural summaries

Add a new module that builds a structural map of the repository by parsing
source files with tree-sitter, building a cross-file reference graph
weighted by IDF, ranking files with PageRank, and rendering a
token-budgeted summary of the most important files and their signatures.

Stage 1 — Core module (src/context/repoMap/):
  Symbol extraction via web-tree-sitter WASM, IDF-weighted reference graph
  via graphology, PageRank ranking, token-budgeted rendering via js-tiktoken
  cl100k_base, disk cache with mtime invalidation. Supports TypeScript,
  JavaScript, and Python. 10 tests.

Stage 2 — RepoMap tool (src/tools/RepoMapTool/):
  buildTool wrapper registered in src/tools.ts. Read-only, concurrency-safe.
  Supports focus_files, focus_symbols, and max_tokens parameters. 9 tests.

Stage 3 — Integration:
  Auto-injection into session context behind REPO_MAP feature flag (off by
  default). /repomap slash command with --tokens, --focus, --stats, and
  --invalidate flags. User-facing docs in docs/repo-map.md. 13 tests.

With the flag off, the system context is byte-identical to previous behavior.

Dependencies: web-tree-sitter, tree-sitter-wasms, graphology,
graphology-pagerank, graphology-operators, js-tiktoken

Tests: 32 new, 621 total passing, 0 failures.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
gnanam1990
2026-04-09 17:26:34 +05:30
parent 6ea3eb6483
commit 81896618a1
35 changed files with 2384 additions and 0 deletions

View File

@@ -0,0 +1,109 @@
import { execFile } from 'child_process'
import { readdirSync, statSync } from 'fs'
import { join, relative } from 'path'
import type { SupportedLanguage } from './types.js'
/**
 * Lookup table from file extension (leading dot included, lower-case) to the
 * language assigned to files with that extension.
 */
const SUPPORTED_EXTENSIONS: Record<string, SupportedLanguage> = {
  // TypeScript sources, with and without JSX
  '.ts': 'typescript',
  '.tsx': 'typescript',

  // JavaScript sources: plain, JSX, ES-module and CommonJS flavours
  '.js': 'javascript',
  '.jsx': 'javascript',
  '.mjs': 'javascript',
  '.cjs': 'javascript',

  // Python sources
  '.py': 'python',
}
/**
 * Directory names that the manual directory walk never descends into.
 * Membership is tested against a single directory name, not a full path.
 */
const EXCLUDED_DIRS = new Set<string>([
  // Version-control metadata
  '.git',
  '.hg',
  '.svn',
  '.worktrees',

  // Installed third-party code
  'node_modules',
  'vendor',

  // Build and framework output
  'dist',
  'build',
  'out',
  '.next',
  '.nuxt',

  // Tooling artefacts
  'coverage',
  '__pycache__',
])
/**
 * File names (exact match on the base name) that the manual directory walk
 * leaves out of its results. All entries are package-manager lockfiles.
 */
const EXCLUDED_FILES = new Set<string>([
  // Bun
  'bun.lock',
  'bun.lockb',
  // npm
  'package-lock.json',
  // Yarn
  'yarn.lock',
  // pnpm
  'pnpm-lock.yaml',
])
/**
 * Map a file path to the language assigned to its extension.
 *
 * The extension is the text from the last `.` in the path to the end of the
 * string, and it is matched case-sensitively against `SUPPORTED_EXTENSIONS`.
 *
 * @param filePath - Path or bare file name to classify.
 * @returns The matching language, or `null` when the path contains no `.` or
 *   its extension is not a key of `SUPPORTED_EXTENSIONS`.
 */
export function getLanguageForFile(filePath: string): SupportedLanguage | null {
  const dotIndex = filePath.lastIndexOf('.')
  // Without this guard, `substring(-1)` is clamped to `substring(0)` and the
  // whole path becomes the lookup key, so extensionless names such as
  // `constructor` or `toString` resolve to inherited Object.prototype members
  // instead of `null`.
  if (dotIndex === -1) {
    return null
  }

  const ext = filePath.substring(dotIndex)
  // Only the table's own keys count; inherited properties are never a match.
  if (!Object.prototype.hasOwnProperty.call(SUPPORTED_EXTENSIONS, ext)) {
    return null
  }
  return SUPPORTED_EXTENSIONS[ext] ?? null
}
/**
 * Report whether a path belongs to a supported language.
 *
 * @param filePath - Path or bare file name to check.
 * @returns `true` exactly when `getLanguageForFile` yields a language for it.
 */
export function isSupportedFile(filePath: string): boolean {
  const language = getLanguageForFile(filePath)
  return language !== null
}
/**
 * List files by running `git ls-files` with `root` as the working directory.
 *
 * The listing covers files in the index (`--cached`) plus untracked files that
 * are not ignored (`--others --exclude-standard`).
 *
 * `-z` makes git emit NUL-terminated, unquoted paths. In the default
 * newline-separated format git wraps paths containing non-ASCII or special
 * bytes in double quotes with escapes, which would no longer end in a
 * recognisable extension. Paths are returned verbatim (no trimming), so names
 * with leading or trailing whitespace survive intact.
 *
 * @param root - Directory in which git is executed.
 * @returns Distinct paths as printed by git (relative to `root`).
 * @throws Rejects with the `execFile` error when git cannot be run, exits
 *   non-zero, or produces more than 10 MiB of output.
 */
function gitLsFiles(root: string): Promise<string[]> {
  return new Promise((resolve, reject) => {
    execFile(
      'git',
      ['ls-files', '-z', '--cached', '--others', '--exclude-standard'],
      { cwd: root, maxBuffer: 10 * 1024 * 1024 },
      (error, stdout) => {
        if (error) {
          reject(error)
          return
        }
        // Every record ends with NUL, so the final split element is empty.
        const files = stdout.split('\0').filter(f => f.length > 0)
        // Unmerged paths are listed once per conflict stage; keep one copy.
        resolve(Array.from(new Set(files)))
      },
    )
  })
}
/**
 * Walk the directory tree manually as a fallback when git is unavailable.
 *
 * Directories whose name is in `EXCLUDED_DIRS` or starts with `.` are not
 * entered, and files whose name is in `EXCLUDED_FILES` are left out. Entries
 * that are neither regular files nor directories (e.g. symlinks) are ignored.
 * Directories that cannot be read contribute nothing instead of failing the
 * walk.
 *
 * @param root - Directory that returned paths are made relative to.
 * @param currentDir - Directory to start walking from; defaults to `root`.
 * @returns Paths relative to `root`, depth-first in `readdirSync` order.
 *   They are built with `path.relative`, so they use the platform separator.
 */
function walkDirectory(root: string, currentDir: string = root): string[] {
  const results: string[] = []
  collectWalkedFiles(root, currentDir, results)
  return results
}

/**
 * Depth-first worker for `walkDirectory`: appends every accepted file under
 * `dir` to `out`.
 *
 * Results go into one shared array rather than being spread into `push` at
 * each level; spreading a very large subtree as call arguments can throw a
 * RangeError and re-copies the results once per directory depth.
 */
function collectWalkedFiles(root: string, dir: string, out: string[]): void {
  let entries: ReturnType<typeof readdirSync>
  try {
    entries = readdirSync(dir, { withFileTypes: true })
  } catch {
    // Unreadable or vanished directory: best-effort walk, skip it.
    return
  }

  for (const entry of entries) {
    const name = entry.name
    if (entry.isDirectory()) {
      if (!EXCLUDED_DIRS.has(name) && !name.startsWith('.')) {
        collectWalkedFiles(root, join(dir, name), out)
      }
    } else if (entry.isFile()) {
      if (!EXCLUDED_FILES.has(name)) {
        out.push(relative(root, join(dir, name)))
      }
    }
  }
}
/**
 * Collect every supported source file in the repository.
 *
 * The file list comes from `gitLsFiles`; if that rejects for any reason the
 * list is produced by `walkDirectory` instead. Either way, only paths accepted
 * by `isSupportedFile` are returned.
 *
 * @param root - Repository root directory.
 * @returns Paths of supported source files, relative to `root`.
 */
export async function getRepoFiles(root: string): Promise<string[]> {
  const candidates = await gitLsFiles(root).catch(() => walkDirectory(root))
  return candidates.filter(isSupportedFile)
}