Feature/pr intent scan hardening (#375)

* security: harden suspicious PR intent scanner * security: reduce pr scanner false positives
2026-04-05 17:05:24 +08:00
parent 5ef79546e9
commit 7350a798cb
5 changed files with 595 additions and 1 deletions
--- a/.github/workflows/pr-checks.yml
+++ b/.github/workflows/pr-checks.yml
@@ -16,6 +16,8 @@ jobs:
    steps:
      - name: Check out repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          fetch-depth: 0

      - name: Set up Node.js
        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0
@@ -36,6 +38,8 @@ jobs:
      - name: Full unit test suite
        run: bun test --max-concurrency=1

+      - name: Suspicious PR intent scan
+        run: bun run security:pr-scan -- --base ${{ github.event.pull_request.base.sha || 'origin/main' }}
      - name: Provider tests
        run: bun run test:provider

--- a/README.md
+++ b/README.md
@@ -198,6 +198,7 @@ Helpful commands:
 - `bun run dev`
 - `bun test`
 - `bun run test:coverage`
+- `bun run security:pr-scan -- --base origin/main`
 - `bun run smoke`
 - `bun run doctor:runtime`
 - `bun run verify:privacy`
@@ -245,7 +246,6 @@ Recommended contributor validation before opening a PR:
 - focused `bun test ...` runs for the files and flows you changed

 Coverage output is written to `coverage/lcov.info`, and OpenClaude also generates a git-activity-style heatmap at `coverage/index.html`.
-
 ## Repository Structure

 - `src/` - core CLI/runtime
--- a/package.json
+++ b/package.json
@@ -34,6 +34,7 @@
    "test": "bun test",
    "test:coverage": "bun test --coverage --coverage-reporter=lcov --coverage-dir=coverage --max-concurrency=1 && bun run scripts/render-coverage-heatmap.ts",
    "test:coverage:ui": "bun run scripts/render-coverage-heatmap.ts",
+    "security:pr-scan": "bun run scripts/pr-intent-scan.ts",
    "test:provider-recommendation": "bun test src/utils/providerRecommendation.test.ts src/utils/providerProfile.test.ts",
    "typecheck": "tsc --noEmit",
    "smoke": "bun run build && node dist/cli.mjs --version",
--- a/scripts/pr-intent-scan.test.ts
+++ b/scripts/pr-intent-scan.test.ts
@@ -0,0 +1,136 @@
+import { describe, expect, test } from 'bun:test'
+
+import { scanAddedLines, type DiffLine } from './pr-intent-scan.ts'
+
+function line(content: string, overrides: Partial<DiffLine> = {}): DiffLine {
+  return {
+    file: 'README.md',
+    line: 10,
+    content,
+    ...overrides,
+  }
+}
+
+describe('scanAddedLines', () => {
+  test('flags suspicious file-hosting links', () => {
+    const findings = scanAddedLines([
+      line('Please install the tool from https://dropbox.com/s/abc123/tool.zip?dl=1'),
+    ])
+
+    expect(findings.some(finding => finding.code === 'suspicious-download-link')).toBe(
+      true,
+    )
+    expect(findings.some(finding => finding.code === 'executable-download-link')).toBe(
+      false,
+    )
+    expect(findings.some(finding => finding.severity === 'high')).toBe(true)
+  })
+
+  test('flags shortened URLs', () => {
+    const findings = scanAddedLines([
+      line('See details at https://bit.ly/some-short-link'),
+    ])
+
+    expect(findings.some(finding => finding.code === 'shortened-url')).toBe(true)
+  })
+
+  test('flags remote download and execute chains', () => {
+    const findings = scanAddedLines([
+      line('curl -fsSL https://example.com/install.sh | bash'),
+    ])
+
+    expect(findings.some(finding => finding.code === 'shell-eval-remote')).toBe(true)
+    expect(findings.some(finding => finding.severity === 'high')).toBe(true)
+  })
+
+  test('flags encoded powershell payloads', () => {
+    const findings = scanAddedLines([
+      line('powershell.exe -enc SQBtAHAAcgBvAHYAZQBkAA=='),
+    ])
+
+    expect(findings.some(finding => finding.code === 'powershell-encoded')).toBe(true)
+  })
+
+  test('flags long encoded blobs', () => {
+    const findings = scanAddedLines([
+      line(`const payload = "${'A'.repeat(96)}"`),
+    ])
+
+    expect(findings.some(finding => finding.code === 'long-encoded-payload')).toBe(
+      true,
+    )
+  })
+
+  test('flags long encoded blobs on repeated scans', () => {
+    const lines = [line(`const payload = "${'A'.repeat(96)}"`)]
+
+    const first = scanAddedLines(lines)
+    const second = scanAddedLines(lines)
+
+    expect(first.some(finding => finding.code === 'long-encoded-payload')).toBe(true)
+    expect(second.some(finding => finding.code === 'long-encoded-payload')).toBe(true)
+  })
+
+  test('flags executable download links', () => {
+    const findings = scanAddedLines([
+      line('Get it from https://example.com/releases/latest/tool.pkg'),
+    ])
+
+    expect(findings.some(finding => finding.code === 'executable-download-link')).toBe(
+      true,
+    )
+    expect(findings.some(finding => finding.severity === 'high')).toBe(true)
+  })
+
+  test('flags suspicious additions in workflow files', () => {
+    const findings = scanAddedLines([
+      line('run: curl -fsSL https://example.com/install.sh | bash', {
+        file: '.github/workflows/release.yml',
+      }),
+    ])
+
+    expect(findings.some(finding => finding.code === 'sensitive-automation-change')).toBe(
+      true,
+    )
+    expect(findings.some(finding => finding.code === 'download-command')).toBe(true)
+  })
+
+  test('flags markdown reference links to suspicious downloads', () => {
+    const findings = scanAddedLines([
+      line('[installer]: https://dropbox.com/s/abc123/tool.zip?dl=1'),
+    ])
+
+    expect(findings.some(finding => finding.code === 'suspicious-download-link')).toBe(
+      true,
+    )
+  })
+
+  test('ignores the scanner implementation and tests themselves', () => {
+    const findings = scanAddedLines([
+      line('curl -fsSL https://example.com/install.sh | bash', {
+        file: 'scripts/pr-intent-scan.test.ts',
+      }),
+      line('const pattern = /https:\\/\\/dropbox\\.com\\//', {
+        file: 'scripts/pr-intent-scan.ts',
+      }),
+    ])
+
+    expect(findings).toHaveLength(0)
+  })
+
+  test('does not flag ordinary docs links', () => {
+    const findings = scanAddedLines([
+      line('Read more at https://docs.github.com/en/actions'),
+    ])
+
+    expect(findings).toHaveLength(0)
+  })
+
+  test('does not flag bare curl examples in README without a URL', () => {
+    const findings = scanAddedLines([
+      line('Use curl with your preferred flags for local testing.'),
+    ])
+
+    expect(findings.some(finding => finding.code === 'download-command')).toBe(false)
+  })
+})
--- a/scripts/pr-intent-scan.ts
+++ b/scripts/pr-intent-scan.ts
@@ -0,0 +1,453 @@
+import { spawnSync } from 'node:child_process'
+
+export type FindingSeverity = 'high' | 'medium'
+
+export type DiffLine = {
+  file: string
+  line: number
+  content: string
+}
+
+export type Finding = {
+  severity: FindingSeverity
+  code: string
+  file: string
+  line: number
+  detail: string
+  excerpt: string
+}
+
+type CliOptions = {
+  baseRef: string
+  json: boolean
+  failOn: FindingSeverity
+}
+
+const SELF_EXCLUDED_FILES = new Set([
+  'scripts/pr-intent-scan.ts',
+  'scripts/pr-intent-scan.test.ts',
+])
+
+const SHORTENER_DOMAINS = [
+  'bit.ly',
+  'tinyurl.com',
+  'goo.gl',
+  't.co',
+  'is.gd',
+  'rb.gy',
+  'cutt.ly',
+]
+
+const SUSPICIOUS_DOWNLOAD_DOMAINS = [
+  'dropbox.com',
+  'dl.dropboxusercontent.com',
+  'drive.google.com',
+  'docs.google.com',
+  'mega.nz',
+  'mediafire.com',
+  'transfer.sh',
+  'anonfiles.com',
+  'catbox.moe',
+]
+
+const URL_REGEX = /\bhttps?:\/\/[^\s)>"']+/gi
+const LONG_BASE64_REGEX = /\b(?:[A-Za-z0-9+/]{80,}={0,2}|[A-Za-z0-9_-]{80,})\b/
+const EXECUTABLE_PATH_REGEX =
+  /\.(?:sh|bash|zsh|ps1|exe|msi|pkg|deb|rpm|zip|tar|tgz|gz|xz|dmg|appimage)(?:$|[?#])/i
+const SENSITIVE_PATH_REGEX =
+  /^(?:\.github\/workflows\/|scripts\/|bin\/|install(?:\/|\.|$)|.*(?:Dockerfile|docker-compose|compose\.ya?ml)$)/i
+
+function parseOptions(argv: string[]): CliOptions {
+  const options: CliOptions = {
+    baseRef: 'origin/main',
+    json: false,
+    failOn: 'high',
+  }
+
+  for (let index = 0; index < argv.length; index++) {
+    const arg = argv[index]
+    if (arg === '--json') {
+      options.json = true
+      continue
+    }
+    if (arg === '--base') {
+      const next = argv[index + 1]
+      if (next && !next.startsWith('--')) {
+        options.baseRef = next
+        index++
+      }
+      continue
+    }
+    if (arg === '--fail-on') {
+      const next = argv[index + 1]
+      if (next === 'high' || next === 'medium') {
+        options.failOn = next
+        index++
+      }
+    }
+  }
+
+  return options
+}
+
+function trimExcerpt(content: string): string {
+  const compact = content.trim().replace(/\s+/g, ' ')
+  return compact.length > 140 ? `${compact.slice(0, 137)}...` : compact
+}
+
+function uniqueFindings(findings: Finding[]): Finding[] {
+  const seen = new Set<string>()
+  return findings.filter(finding => {
+    const key = `${finding.code}:${finding.file}:${finding.line}:${finding.detail}`
+    if (seen.has(key)) {
+      return false
+    }
+    seen.add(key)
+    return true
+  })
+}
+
+function parseAddedLines(diffText: string): DiffLine[] {
+  const lines = diffText.split('\n')
+  const added: DiffLine[] = []
+  let currentFile: string | null = null
+  let currentLine = 0
+
+  for (const rawLine of lines) {
+    if (rawLine.startsWith('+++ b/')) {
+      currentFile = rawLine.slice('+++ b/'.length)
+      continue
+    }
+
+    if (rawLine.startsWith('@@')) {
+      const match = /\+(\d+)(?:,(\d+))?/.exec(rawLine)
+      if (match) {
+        currentLine = Number(match[1])
+      }
+      continue
+    }
+
+    if (!currentFile) {
+      continue
+    }
+
+    if (rawLine.startsWith('+') && !rawLine.startsWith('+++')) {
+      added.push({
+        file: currentFile,
+        line: currentLine,
+        content: rawLine.slice(1),
+      })
+      currentLine += 1
+      continue
+    }
+
+    if (rawLine.startsWith('-') && !rawLine.startsWith('---')) {
+      continue
+    }
+
+    if (!rawLine.startsWith('\\')) {
+      currentLine += 1
+    }
+  }
+
+  return added
+}
+
+function tryParseUrl(value: string): URL | null {
+  try {
+    return new URL(value)
+  } catch {
+    return null
+  }
+}
+
+function hostMatches(hostname: string, domain: string): boolean {
+  return hostname === domain || hostname.endsWith(`.${domain}`)
+}
+
+function hasSuspiciousDownloadIndicators(url: URL): boolean {
+  const combined = `${url.pathname}${url.search}`.toLowerCase()
+  return (
+    combined.includes('dl=1') ||
+    combined.includes('raw=1') ||
+    combined.includes('export=download') ||
+    combined.includes('/download') ||
+    combined.includes('/uc?export=download')
+  )
+}
+
+function findUrlFindings(line: DiffLine): Finding[] {
+  const findings: Finding[] = []
+  const matches = line.content.match(URL_REGEX) ?? []
+
+  for (const match of matches) {
+    const parsed = tryParseUrl(match)
+    if (!parsed) continue
+
+    const hostname = parsed.hostname.toLowerCase()
+
+    for (const domain of SHORTENER_DOMAINS) {
+      if (hostMatches(hostname, domain)) {
+        findings.push({
+          severity: 'medium',
+          code: 'shortened-url',
+          file: line.file,
+          line: line.line,
+          detail: `Added shortened URL: ${hostname}`,
+          excerpt: trimExcerpt(line.content),
+        })
+      }
+    }
+
+    const isSuspiciousHost = SUSPICIOUS_DOWNLOAD_DOMAINS.some(domain =>
+      hostMatches(hostname, domain),
+    )
+    const isExecutableDownload = EXECUTABLE_PATH_REGEX.test(
+      `${parsed.pathname}${parsed.search}`,
+    )
+
+    if (isSuspiciousHost) {
+      findings.push({
+        severity:
+          hasSuspiciousDownloadIndicators(parsed) || isExecutableDownload
+            ? 'high'
+            : 'medium',
+        code: 'suspicious-download-link',
+        file: line.file,
+        line: line.line,
+        detail: `Added external file-hosting link: ${hostname}`,
+        excerpt: trimExcerpt(line.content),
+      })
+    } else if (isExecutableDownload) {
+      findings.push({
+        severity: 'high',
+        code: 'executable-download-link',
+        file: line.file,
+        line: line.line,
+        detail: `Added direct link to executable or archive payload: ${hostname}`,
+        excerpt: trimExcerpt(line.content),
+      })
+    }
+  }
+
+  return findings
+}
+
+function findSensitivePathFindings(line: DiffLine): Finding[] {
+  if (!SENSITIVE_PATH_REGEX.test(line.file)) {
+    return []
+  }
+
+  const lower = line.content.toLowerCase()
+
+  if (
+    /\b(curl|wget|invoke-webrequest|iwr|powershell|bash|sh|chmod\s+\+x)\b/i.test(
+      line.content,
+    ) ||
+    URL_REGEX.test(line.content) ||
+    lower.includes('download')
+  ) {
+    return [
+      {
+        severity: 'medium',
+        code: 'sensitive-automation-change',
+        file: line.file,
+        line: line.line,
+        detail:
+          'Added network, execution, or download-related content in a sensitive automation file',
+        excerpt: trimExcerpt(line.content),
+      },
+    ]
+  }
+
+  return []
+}
+
+function findCommandFindings(line: DiffLine): Finding[] {
+  const findings: Finding[] = []
+  const lower = line.content.toLowerCase()
+
+  const highPatterns: Array<[string, RegExp, string]> = [
+    [
+      'download-exec-chain',
+      /\b(curl|wget|invoke-webrequest|iwr)\b.*(\|\s*(sh|bash|zsh)|;\s*chmod\s+\+x|&&\s*\.\.?\/|>\s*\/tmp\/)/i,
+      'Added remote download followed by execution or staging',
+    ],
+    [
+      'powershell-encoded',
+      /\bpowershell(?:\.exe)?\b.*(?:-enc|-encodedcommand)\b/i,
+      'Added encoded PowerShell invocation',
+    ],
+    [
+      'shell-eval-remote',
+      /\b(curl|wget)\b.*\|\s*(sh|bash|zsh)\b/i,
+      'Added shell pipe from remote content into interpreter',
+    ],
+    [
+      'binary-lolbin',
+      /\b(mshta|rundll32|regsvr32|certutil)\b/i,
+      'Added living-off-the-land binary often used for payload staging',
+    ],
+    [
+      'invoke-expression',
+      /\b(iex|invoke-expression)\b/i,
+      'Added PowerShell expression execution',
+    ],
+  ]
+
+  const mediumPatterns: Array<[string, RegExp, string]> = [
+    [
+      'download-command',
+      /\b(curl|wget|invoke-webrequest|iwr)\b.*https?:\/\//i,
+      'Added command that downloads remote content',
+    ],
+    [
+      'archive-extract-exec',
+      /\b(unzip|tar|7z)\b.*(&&|;).*\b(chmod|node|python|bash|sh)\b/i,
+      'Added archive extraction followed by execution',
+    ],
+    [
+      'base64-decode',
+      /\b(base64\s+-d|openssl\s+base64\s+-d|python .*b64decode)\b/i,
+      'Added explicit payload decode step',
+    ],
+  ]
+
+  for (const [code, pattern, detail] of highPatterns) {
+    if (pattern.test(line.content)) {
+      findings.push({
+        severity: 'high',
+        code,
+        file: line.file,
+        line: line.line,
+        detail,
+        excerpt: trimExcerpt(line.content),
+      })
+    }
+  }
+
+  for (const [code, pattern, detail] of mediumPatterns) {
+    if (code === 'download-command' && !SENSITIVE_PATH_REGEX.test(line.file)) {
+      continue
+    }
+    if (pattern.test(line.content)) {
+      findings.push({
+        severity: 'medium',
+        code,
+        file: line.file,
+        line: line.line,
+        detail,
+        excerpt: trimExcerpt(line.content),
+      })
+    }
+  }
+
+  if (LONG_BASE64_REGEX.test(line.content) && !lower.includes('sha256') && !lower.includes('sha512')) {
+    findings.push({
+      severity: 'medium',
+      code: 'long-encoded-payload',
+      file: line.file,
+      line: line.line,
+      detail: 'Added long encoded blob or token-like payload',
+      excerpt: trimExcerpt(line.content),
+    })
+  }
+
+  return findings
+}
+
+export function scanAddedLines(lines: DiffLine[]): Finding[] {
+  const findings = lines
+    .filter(line => !SELF_EXCLUDED_FILES.has(line.file))
+    .flatMap(line => [
+    ...findUrlFindings(line),
+    ...findCommandFindings(line),
+    ...findSensitivePathFindings(line),
+  ])
+  return uniqueFindings(findings)
+}
+
+export function getGitDiff(baseRef: string): string {
+  const mergeBase = spawnSync('git', ['merge-base', baseRef, 'HEAD'], {
+    encoding: 'utf8',
+  })
+
+  if (mergeBase.status !== 0) {
+    throw new Error(
+      `Could not determine merge-base with ${baseRef}: ${mergeBase.stderr.trim() || mergeBase.stdout.trim()}`,
+    )
+  }
+
+  const base = mergeBase.stdout.trim()
+  const diff = spawnSync(
+    'git',
+    ['diff', '--unified=0', '--no-ext-diff', `${base}...HEAD`],
+    { encoding: 'utf8' },
+  )
+
+  if (diff.status !== 0) {
+    throw new Error(`git diff failed: ${diff.stderr.trim() || diff.stdout.trim()}`)
+  }
+
+  return diff.stdout
+}
+
+function shouldFail(findings: Finding[], failOn: FindingSeverity): boolean {
+  if (failOn === 'medium') {
+    return findings.length > 0
+  }
+  return findings.some(finding => finding.severity === 'high')
+}
+
+function renderText(findings: Finding[]): string {
+  if (findings.length === 0) {
+    return 'PR intent scan: no suspicious additions found.'
+  }
+
+  const high = findings.filter(f => f.severity === 'high')
+  const medium = findings.filter(f => f.severity === 'medium')
+  const lines = [
+    `PR intent scan: ${findings.length} finding(s)`,
+    `- high: ${high.length}`,
+    `- medium: ${medium.length}`,
+    '',
+  ]
+
+  for (const finding of findings) {
+    lines.push(
+      `[${finding.severity.toUpperCase()}] ${finding.file}:${finding.line} ${finding.detail}`,
+    )
+    lines.push(`  ${finding.excerpt}`)
+  }
+
+  return lines.join('\n')
+}
+
+export function run(options: CliOptions): number {
+  const diff = getGitDiff(options.baseRef)
+  const addedLines = parseAddedLines(diff)
+  const findings = scanAddedLines(addedLines)
+
+  if (options.json) {
+    process.stdout.write(
+      `${JSON.stringify(
+        {
+          baseRef: options.baseRef,
+          addedLines: addedLines.length,
+          findings,
+        },
+        null,
+        2,
+      )}\n`,
+    )
+  } else {
+    process.stdout.write(`${renderText(findings)}\n`)
+  }
+
+  return shouldFail(findings, options.failOn) ? 1 : 0
+}
+
+if (import.meta.main) {
+  const options = parseOptions(process.argv.slice(2))
+  process.exitCode = run(options)
+}