feat: add Codebase Intelligence — repo map with PageRank-ranked structural summaries

Add a new module that builds a structural map of the repository by parsing source files with tree-sitter, building a cross-file reference graph weighted by IDF, ranking files with PageRank, and rendering a token-budgeted summary of the most important files and their signatures. Stage 1 — Core module (src/context/repoMap/): Symbol extraction via web-tree-sitter WASM, IDF-weighted reference graph via graphology, PageRank ranking, token-budgeted rendering via js-tiktoken cl100k_base, disk cache with mtime invalidation. Supports TypeScript, JavaScript, and Python. 10 tests. Stage 2 — RepoMap tool (src/tools/RepoMapTool/): buildTool wrapper registered in src/tools.ts. Read-only, concurrency-safe. Supports focus_files, focus_symbols, and max_tokens parameters. 9 tests. Stage 3 — Integration: Auto-injection into session context behind the REPO_MAP feature flag (off by default). /repomap slash command with --tokens, --focus, --stats, and --invalidate flags. User-facing docs in docs/repo-map.md. 13 tests. With the flag off, the system context is byte-identical to previous behavior. Dependencies: web-tree-sitter, tree-sitter-wasms, graphology, graphology-pagerank, graphology-operators, js-tiktoken. Tests: 32 new, 621 total passing, 0 failures. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 17:26:34 +05:30
parent 6ea3eb6483
commit 81896618a1
35 changed files with 2384 additions and 0 deletions
--- a/src/context/repoMap/symbolExtractor.ts
+++ b/src/context/repoMap/symbolExtractor.ts
@@ -0,0 +1,108 @@
+import { readFileSync } from 'fs'
+import { join } from 'path'
+import { getLanguageForFile } from './gitFiles.js'
+import { createParser, loadLanguage, loadQuery } from './parser.js'
+import type { FileTags, Tag } from './types.js'
+
+/**
+ * Extract definition and reference tags from a single source file.
+ *
+ * Reads `filePath` under `root`, runs the language's tag query over the parse
+ * tree, and keeps one tag per unique kind+name+line with that line's text.
+ *
+ * @param filePath - File path relative to `root`; returned unchanged as `path`.
+ * @param root - Directory that `filePath` is joined onto before reading.
+ * @returns The file's tags, or null if the language is unsupported, the file
+ *   can't be read, the parser/query/language can't be loaded, or parsing fails.
+ */
+export async function extractTags(
+  filePath: string,
+  root: string,
+): Promise<FileTags | null> {
+  const language = getLanguageForFile(filePath)
+  if (!language) return null
+
+  let source: string
+  try {
+    source = readFileSync(join(root, filePath), 'utf-8')
+  } catch {
+    return null
+  }
+
+  const parser = await createParser(language)
+  if (!parser) return null
+
+  // web-tree-sitter trees and queries hold WASM memory that is only released
+  // by delete(), so both are tracked here and freed in `finally` on every path.
+  type ParsedTree = { rootNode: unknown; delete(): void }
+  type TagQuery = {
+    matches(rootNode: unknown): Array<{
+      captures: Array<{
+        name: string
+        node: { text: string; startPosition: { row: number } }
+      }>
+    }>
+    delete(): void
+  }
+  let tree: ParsedTree | null = null
+  let query: TagQuery | null = null
+
+  try {
+    const querySource = loadQuery(language)
+    if (!querySource) return null
+
+    const lang = await loadLanguage(language)
+    if (!lang) return null
+
+    // parse() may yield null instead of a tree; treat that as unparseable.
+    tree = parser.parse(source) as ParsedTree | null
+    if (!tree) return null
+
+    // Use the non-deprecated Query constructor
+    const { Query } = await import('web-tree-sitter')
+    query = new Query(lang, querySource) as TagQuery
+
+    const lines = source.split('\n')
+    const tags: Tag[] = []
+    const seen = new Set<string>() // dedup by kind+name+line
+
+    for (const match of query.matches(tree.rootNode)) {
+      let name: string | null = null
+      let kind: 'def' | 'ref' | null = null
+      let subKind: string | undefined
+      let lineRow = 0
+
+      // Name captures are `name.definition.X` or `name.reference.X`; when a
+      // match carries several, the last one wins.
+      for (const { name: captureName, node } of match.captures) {
+        if (captureName.startsWith('name.definition.')) {
+          kind = 'def'
+          subKind = captureName.slice('name.definition.'.length)
+        } else if (captureName.startsWith('name.reference.')) {
+          kind = 'ref'
+          subKind = captureName.slice('name.reference.'.length)
+        } else {
+          continue
+        }
+        name = node.text
+        lineRow = node.startPosition.row
+      }
+
+      if (!name || !kind) continue
+      const key = `${kind}:${name}:${lineRow}`
+      if (seen.has(key)) continue
+      seen.add(key)
+      const line = lineRow + 1 // convert 0-based to 1-based
+      const signature = lines[lineRow]?.trimEnd() ?? ''
+      tags.push({ kind, name, line, signature, subKind })
+    }
+
+    return { path: filePath, tags }
+  } catch {
+    return null
+  } finally {
+    query?.delete()
+    tree?.delete()
+    parser.delete()
+  }
+}