import { spawnSync } from 'node:child_process'

/** Severity attached to a finding; `high` findings fail the scan by default. */
export type FindingSeverity = 'high' | 'medium'

/** A single line added on the new side of a unified diff. */
export type DiffLine = {
  /** Path of the file on the new side of the diff (without the `b/` prefix). */
  file: string
  /** 1-based line number in the new version of the file. */
  line: number
  /** Line text without the leading `+` diff marker. */
  content: string
}

/** One suspicious addition reported by the scan. */
export type Finding = {
  severity: FindingSeverity
  /** Stable machine-readable identifier of the rule that fired. */
  code: string
  file: string
  line: number
  /** Human-readable description of why the line was flagged. */
  detail: string
  /** Whitespace-compacted, length-limited copy of the offending line. */
  excerpt: string
}

type CliOptions = {
  baseRef: string
  json: boolean
  failOn: FindingSeverity
}

type CommandPattern = readonly [string, RegExp, string]

// Files that scanAddedLines never reports on (presumably because the scanner
// and its tests necessarily contain the patterns being detected).
const SELF_EXCLUDED_FILES = new Set([
  'scripts/pr-intent-scan.ts',
  'scripts/pr-intent-scan.test.ts',
])

const SHORTENER_DOMAINS = [
  'bit.ly',
  'tinyurl.com',
  'goo.gl',
  't.co',
  'is.gd',
  'rb.gy',
  'cutt.ly',
]

const SUSPICIOUS_DOWNLOAD_DOMAINS = [
  'dropbox.com',
  'dl.dropboxusercontent.com',
  'drive.google.com',
  'docs.google.com',
  'mega.nz',
  'mediafire.com',
  'transfer.sh',
  'anonfiles.com',
  'catbox.moe',
]

// Path/query fragments that indicate a forced file download.
const DOWNLOAD_INDICATORS = [
  'dl=1',
  'raw=1',
  'export=download',
  '/download',
  '/uc?export=download',
]

// Global: only use with String.prototype.match (which resets lastIndex).
const URL_REGEX = /\bhttps?:\/\/[^\s)>"']+/gi
// Non-global twin for presence checks. Calling .test() on a /g regex is
// stateful through lastIndex and can skip matches on the next input.
const URL_PRESENCE_REGEX = new RegExp(URL_REGEX.source, 'i')
// Prose punctuation / Markdown backticks that URL_REGEX drags along at the end.
const URL_TRAILING_PUNCTUATION_REGEX = /[.,;:!`]+$/
const LONG_BASE64_REGEX =
  /\b(?:[A-Za-z0-9+/]{80,}={0,2}|[A-Za-z0-9_-]{80,})\b/
const EXECUTABLE_PATH_REGEX =
  /\.(?:sh|bash|zsh|ps1|exe|msi|pkg|deb|rpm|zip|tar|tgz|gz|xz|dmg|appimage)(?:$|[?#])/i
const SENSITIVE_PATH_REGEX =
  /^(?:\.github\/workflows\/|scripts\/|bin\/|install(?:\/|\.|$)|.*(?:Dockerfile|docker-compose|compose\.ya?ml)$)/i
const SENSITIVE_COMMAND_REGEX =
  /\b(curl|wget|invoke-webrequest|iwr|powershell|bash|sh|chmod\s+\+x)\b/i

// `@@ -oldStart[,oldCount] +newStart[,newCount] @@`; an omitted count means 1.
const HUNK_HEADER_REGEX = /^@@ -\d+(?:,(\d+))? \+(\d+)(?:,(\d+))? @@/
// Tokens inside a git C-quoted path: an escape (octal or single char) or a literal run.
const GIT_QUOTED_PATH_TOKEN_REGEX = /\\([0-7]{1,3}|[\s\S])|[^\\]+/g
const GIT_PATH_ESCAPES = new Map<string, number>([
  ['a', 0x07],
  ['b', 0x08],
  ['t', 0x09],
  ['n', 0x0a],
  ['v', 0x0b],
  ['f', 0x0c],
  ['r', 0x0d],
])

// Upper bound for captured git output; the spawnSync default (1 MiB) is too
// small for large pull requests.
const GIT_MAX_BUFFER_BYTES = 256 * 1024 * 1024

// Rule tables are hoisted so the regexes are compiled once, not per scanned line.
const HIGH_SEVERITY_COMMAND_PATTERNS: readonly CommandPattern[] = [
  [
    'download-exec-chain',
    /\b(curl|wget|invoke-webrequest|iwr)\b.*(\|\s*(sh|bash|zsh)|;\s*chmod\s+\+x|&&\s*\.\.?\/|>\s*\/tmp\/)/i,
    'Added remote download followed by execution or staging',
  ],
  [
    'powershell-encoded',
    /\bpowershell(?:\.exe)?\b.*(?:-enc|-encodedcommand)\b/i,
    'Added encoded PowerShell invocation',
  ],
  [
    'shell-eval-remote',
    /\b(curl|wget)\b.*\|\s*(sh|bash|zsh)\b/i,
    'Added shell pipe from remote content into interpreter',
  ],
  [
    'binary-lolbin',
    /\b(mshta|rundll32|regsvr32|certutil)\b/i,
    'Added living-off-the-land binary often used for payload staging',
  ],
  [
    'invoke-expression',
    /\b(iex|invoke-expression)\b/i,
    'Added PowerShell expression execution',
  ],
]

const MEDIUM_SEVERITY_COMMAND_PATTERNS: readonly CommandPattern[] = [
  [
    'download-command',
    /\b(curl|wget|invoke-webrequest|iwr)\b.*https?:\/\//i,
    'Added command that downloads remote content',
  ],
  [
    'archive-extract-exec',
    /\b(unzip|tar|7z)\b.*(&&|;).*\b(chmod|node|python|bash|sh)\b/i,
    'Added archive extraction followed by execution',
  ],
  [
    'base64-decode',
    /\b(base64\s+-d|openssl\s+base64\s+-d|python .*b64decode)\b/i,
    'Added explicit payload decode step',
  ],
]

/** Writes a non-fatal CLI diagnostic to stderr so stdout (text/JSON report) stays clean. */
function warnCli(message: string): void {
  process.stderr.write(`pr-intent-scan: ${message}\n`)
}

/**
 * Parses CLI arguments (`--json`, `--base <ref>`, `--fail-on <high|medium>`).
 *
 * Unknown arguments are ignored. A missing `--base` value or an unsupported
 * `--fail-on` value keeps the default and emits a warning on stderr, so a typo
 * cannot silently change how strict the gate is.
 */
function parseOptions(argv: string[]): CliOptions {
  const options: CliOptions = {
    baseRef: 'origin/main',
    json: false,
    failOn: 'high',
  }

  for (let index = 0; index < argv.length; index++) {
    const arg = argv[index]

    if (arg === '--json') {
      options.json = true
      continue
    }

    if (arg === '--base') {
      const next = argv[index + 1]
      if (next && !next.startsWith('--')) {
        options.baseRef = next
        index++
      } else {
        warnCli(`--base expects a ref; keeping "${options.baseRef}"`)
      }
      continue
    }

    if (arg === '--fail-on') {
      const next = argv[index + 1]
      if (next === 'high' || next === 'medium') {
        options.failOn = next
        index++
      } else {
        warnCli(
          `--fail-on expects "high" or "medium"; keeping "${options.failOn}"`,
        )
      }
    }
  }

  return options
}

/** Collapses whitespace and limits the text to 140 characters (137 + `...`). */
function trimExcerpt(content: string): string {
  const compact = content.trim().replace(/\s+/g, ' ')
  return compact.length > 140 ? `${compact.slice(0, 137)}...` : compact
}

/** Removes findings that repeat the same code, file, line and detail; first occurrence wins. */
function uniqueFindings(findings: Finding[]): Finding[] {
  const seen = new Set<string>()
  return findings.filter(finding => {
    const key = `${finding.code}:${finding.file}:${finding.line}:${finding.detail}`
    if (seen.has(key)) {
      return false
    }
    seen.add(key)
    return true
  })
}

/** Decodes a git C-style quoted path (`"caf\303\251.md"`), including the surrounding quotes. */
function unquoteGitPath(quoted: string): string {
  const encoder = new TextEncoder()
  const bytes: number[] = []

  for (const token of quoted.slice(1, -1).matchAll(GIT_QUOTED_PATH_TOKEN_REGEX)) {
    const escape = token[1]
    if (escape === undefined) {
      // Literal run: keep as UTF-8 bytes.
      bytes.push(...encoder.encode(token[0]))
    } else if (/^[0-7]/.test(escape)) {
      // Octal escapes denote raw bytes of the UTF-8 encoded name.
      bytes.push(Number.parseInt(escape, 8) & 0xff)
    } else {
      const mapped = GIT_PATH_ESCAPES.get(escape)
      if (mapped === undefined) {
        // `\"` and `\\` (and anything unknown) stand for the character itself.
        bytes.push(...encoder.encode(escape))
      } else {
        bytes.push(mapped)
      }
    }
  }

  return new TextDecoder().decode(Uint8Array.from(bytes))
}

/**
 * Extracts the new-side path from a `+++ ` file header line.
 *
 * @returns The path without the `b/` prefix, or `null` for `/dev/null`
 *   (deleted file) or an empty path.
 */
function parseNewFileHeader(headerLine: string): string | null {
  let path = headerLine.slice('+++ '.length)
  // git appends a TAB after names that contain spaces.
  if (path.endsWith('\t')) {
    path = path.slice(0, -1)
  }
  if (path === '/dev/null') {
    return null
  }
  if (path.length >= 2 && path.startsWith('"') && path.endsWith('"')) {
    path = unquoteGitPath(path)
  }
  if (path.startsWith('b/')) {
    path = path.slice('b/'.length)
  }
  return path.length > 0 ? path : null
}

/**
 * Collects every added line from unified diff text.
 *
 * Hunk bodies are consumed according to the line counts in their `@@` header,
 * so the content of an added line can never be mistaken for a diff header:
 * an added line whose text starts with `++` shows up as `+++…` in the diff and
 * must still be scanned and attributed to the current file. Outside of
 * counted hunks (e.g. a header without parseable counts) a lenient
 * prefix-based fallback is used.
 */
function parseAddedLines(diffText: string): DiffLine[] {
  const added: DiffLine[] = []
  let currentFile: string | null = null
  let currentLine = 0
  // Lines still owed by the current hunk on the old / new side.
  let oldRemaining = 0
  let newRemaining = 0

  for (const rawLine of diffText.split('\n')) {
    if (oldRemaining > 0 || newRemaining > 0) {
      const marker = rawLine.charAt(0)

      if (marker === '+') {
        if (currentFile !== null) {
          added.push({
            file: currentFile,
            line: currentLine,
            content: rawLine.slice(1),
          })
        }
        currentLine += 1
        newRemaining = Math.max(0, newRemaining - 1)
        continue
      }
      if (marker === '-') {
        oldRemaining = Math.max(0, oldRemaining - 1)
        continue
      }
      if (marker === '\\') {
        // "\ No newline at end of file" consumes no line on either side.
        continue
      }
      if (marker === ' ' || marker === '') {
        // Context line (some tools strip the single space of an empty one).
        currentLine += 1
        oldRemaining = Math.max(0, oldRemaining - 1)
        newRemaining = Math.max(0, newRemaining - 1)
        continue
      }
      // Anything else means the hunk ended early; treat the line as a header.
      oldRemaining = 0
      newRemaining = 0
    }

    if (rawLine.startsWith('@@')) {
      const counted = HUNK_HEADER_REGEX.exec(rawLine)
      if (counted) {
        oldRemaining = counted[1] === undefined ? 1 : Number(counted[1])
        currentLine = Number(counted[2])
        newRemaining = counted[3] === undefined ? 1 : Number(counted[3])
      } else {
        const loose = /\+(\d+)/.exec(rawLine)
        if (loose) {
          currentLine = Number(loose[1])
        }
      }
      continue
    }

    if (rawLine.startsWith('diff --git ')) {
      // New file section: forget the previous file until its own header
      // arrives, so nothing can be attributed to the wrong path.
      currentFile = null
      continue
    }

    if (rawLine.startsWith('+++ ')) {
      currentFile = parseNewFileHeader(rawLine)
      continue
    }

    if (rawLine.startsWith('--- ')) {
      continue
    }

    if (currentFile === null) {
      continue
    }

    // Lenient fallback for hunks without usable counts.
    if (rawLine.startsWith('+')) {
      added.push({
        file: currentFile,
        line: currentLine,
        content: rawLine.slice(1),
      })
      currentLine += 1
      continue
    }
    if (rawLine.startsWith('-')) {
      continue
    }
    if (!rawLine.startsWith('\\')) {
      currentLine += 1
    }
  }

  return added
}

/** Parses an absolute URL, returning `null` instead of throwing on invalid input. */
function tryParseUrl(value: string): URL | null {
  try {
    return new URL(value)
  } catch {
    return null
  }
}

/** True when `hostname` equals `domain` or is a subdomain of it. */
function hostMatches(hostname: string, domain: string): boolean {
  return hostname === domain || hostname.endsWith(`.${domain}`)
}

/** True when the URL path or query contains a forced-download marker. */
function hasSuspiciousDownloadIndicators(url: URL): boolean {
  const combined = `${url.pathname}${url.search}`.toLowerCase()
  return DOWNLOAD_INDICATORS.some(indicator => combined.includes(indicator))
}

/** Builds a finding for `line`, deriving the excerpt from its content. */
function makeFinding(
  line: DiffLine,
  severity: FindingSeverity,
  code: string,
  detail: string,
): Finding {
  return {
    severity,
    code,
    file: line.file,
    line: line.line,
    detail,
    excerpt: trimExcerpt(line.content),
  }
}

/**
 * Flags URLs in an added line that point at link shorteners, file-hosting
 * services, or executable/archive payloads.
 */
function findUrlFindings(line: DiffLine): Finding[] {
  const findings: Finding[] = []
  const matches = line.content.match(URL_REGEX) ?? []

  for (const match of matches) {
    // Drop sentence punctuation / Markdown backticks glued to the URL so that
    // "…/tool.exe." is still recognised as an executable link.
    const parsed = tryParseUrl(match.replace(URL_TRAILING_PUNCTUATION_REGEX, ''))
    if (!parsed) continue

    // A fully-qualified "host." names the same host; normalise it so the
    // trailing dot cannot be used to dodge the domain lists.
    const hostname = parsed.hostname.toLowerCase().replace(/\.$/, '')

    for (const domain of SHORTENER_DOMAINS) {
      if (hostMatches(hostname, domain)) {
        findings.push(
          makeFinding(
            line,
            'medium',
            'shortened-url',
            `Added shortened URL: ${hostname}`,
          ),
        )
      }
    }

    const isSuspiciousHost = SUSPICIOUS_DOWNLOAD_DOMAINS.some(domain =>
      hostMatches(hostname, domain),
    )
    const isExecutableDownload = EXECUTABLE_PATH_REGEX.test(
      `${parsed.pathname}${parsed.search}`,
    )

    if (isSuspiciousHost) {
      findings.push(
        makeFinding(
          line,
          hasSuspiciousDownloadIndicators(parsed) || isExecutableDownload
            ? 'high'
            : 'medium',
          'suspicious-download-link',
          `Added external file-hosting link: ${hostname}`,
        ),
      )
    } else if (isExecutableDownload) {
      findings.push(
        makeFinding(
          line,
          'high',
          'executable-download-link',
          `Added direct link to executable or archive payload: ${hostname}`,
        ),
      )
    }
  }

  return findings
}

/**
 * Flags network, execution, or download-related content added to files whose
 * path matches SENSITIVE_PATH_REGEX (workflows, scripts, installers, Docker files).
 */
function findSensitivePathFindings(line: DiffLine): Finding[] {
  if (!SENSITIVE_PATH_REGEX.test(line.file)) {
    return []
  }

  const isRelevant =
    SENSITIVE_COMMAND_REGEX.test(line.content) ||
    URL_PRESENCE_REGEX.test(line.content) ||
    line.content.toLowerCase().includes('download')

  if (!isRelevant) {
    return []
  }

  return [
    makeFinding(
      line,
      'medium',
      'sensitive-automation-change',
      'Added network, execution, or download-related content in a sensitive automation file',
    ),
  ]
}

/**
 * Flags command-like content: download-and-execute chains, encoded
 * PowerShell, LOLBins, decode steps, and long encoded blobs.
 */
function findCommandFindings(line: DiffLine): Finding[] {
  const findings: Finding[] = []
  const lower = line.content.toLowerCase()
  const isSensitivePath = SENSITIVE_PATH_REGEX.test(line.file)

  for (const [code, pattern, detail] of HIGH_SEVERITY_COMMAND_PATTERNS) {
    if (pattern.test(line.content)) {
      findings.push(makeFinding(line, 'high', code, detail))
    }
  }

  for (const [code, pattern, detail] of MEDIUM_SEVERITY_COMMAND_PATTERNS) {
    // Plain download commands are only reported inside sensitive automation files.
    if (code === 'download-command' && !isSensitivePath) {
      continue
    }
    if (pattern.test(line.content)) {
      findings.push(makeFinding(line, 'medium', code, detail))
    }
  }

  // Lines mentioning sha256/sha512 are skipped; they usually carry checksums.
  if (
    LONG_BASE64_REGEX.test(line.content) &&
    !lower.includes('sha256') &&
    !lower.includes('sha512')
  ) {
    findings.push(
      makeFinding(
        line,
        'medium',
        'long-encoded-payload',
        'Added long encoded blob or token-like payload',
      ),
    )
  }

  return findings
}

/**
 * Runs every detector over the added lines and returns de-duplicated findings.
 *
 * @param lines Added lines to inspect; lines from SELF_EXCLUDED_FILES are ignored.
 * @returns Findings in input order (URL, command, then sensitive-path rules per line).
 */
export function scanAddedLines(lines: DiffLine[]): Finding[] {
  const findings = lines
    .filter(line => !SELF_EXCLUDED_FILES.has(line.file))
    .flatMap(line => [
      ...findUrlFindings(line),
      ...findCommandFindings(line),
      ...findSensitivePathFindings(line),
    ])

  return uniqueFindings(findings)
}

/**
 * Runs `git` synchronously and returns its stdout.
 *
 * @throws Error prefixed with `failurePrefix` when git cannot be started,
 *   exceeds the output buffer, is killed, or exits with a non-zero status.
 */
function runGit(args: string[], failurePrefix: string): string {
  const result = spawnSync('git', args, {
    encoding: 'utf8',
    maxBuffer: GIT_MAX_BUFFER_BYTES,
  })

  // `error` is set when the process could not be spawned or overflowed the
  // buffer; in the spawn-failure case no output streams are available.
  if (result.error) {
    throw new Error(`${failurePrefix}: ${result.error.message}`)
  }

  if (result.status !== 0) {
    const reason =
      result.stderr.trim() ||
      result.stdout.trim() ||
      (result.signal
        ? `terminated by ${result.signal}`
        : `exit status ${result.status}`)
    throw new Error(`${failurePrefix}: ${reason}`)
  }

  return result.stdout
}

/**
 * Returns the zero-context unified diff between the merge-base of `baseRef`
 * and `HEAD`, and `HEAD` itself.
 *
 * Colour and path prefixes are pinned on the command line so that user git
 * configuration cannot change the output shape the parser relies on.
 *
 * @throws Error when the merge-base cannot be determined or `git diff` fails.
 */
export function getGitDiff(baseRef: string): string {
  const base = runGit(
    ['merge-base', baseRef, 'HEAD'],
    `Could not determine merge-base with ${baseRef}`,
  ).trim()

  return runGit(
    [
      'diff',
      '--unified=0',
      '--no-ext-diff',
      '--no-color',
      '--src-prefix=a/',
      '--dst-prefix=b/',
      `${base}...HEAD`,
    ],
    'git diff failed',
  )
}

/** Decides the exit status: any finding fails on `medium`, only `high` findings fail on `high`. */
function shouldFail(findings: Finding[], failOn: FindingSeverity): boolean {
  if (failOn === 'medium') {
    return findings.length > 0
  }
  return findings.some(finding => finding.severity === 'high')
}

/** Renders the human-readable report: a summary header followed by one entry per finding. */
function renderText(findings: Finding[]): string {
  if (findings.length === 0) {
    return 'PR intent scan: no suspicious additions found.'
  }

  const high = findings.filter(f => f.severity === 'high')
  const medium = findings.filter(f => f.severity === 'medium')
  const lines = [
    `PR intent scan: ${findings.length} finding(s)`,
    `- high: ${high.length}`,
    `- medium: ${medium.length}`,
    '',
  ]

  for (const finding of findings) {
    lines.push(
      `[${finding.severity.toUpperCase()}] ${finding.file}:${finding.line} ${finding.detail}`,
    )
    lines.push(` ${finding.excerpt}`)
  }

  return lines.join('\n')
}

/**
 * Scans the additions between `options.baseRef` and `HEAD` and prints a report
 * to stdout (JSON when `options.json` is set).
 *
 * @returns `1` when the findings meet the `failOn` threshold, otherwise `0`.
 * @throws Error when the underlying git commands fail.
 */
export function run(options: CliOptions): number {
  const diff = getGitDiff(options.baseRef)
  const addedLines = parseAddedLines(diff)
  const findings = scanAddedLines(addedLines)

  if (options.json) {
    process.stdout.write(
      `${JSON.stringify(
        {
          baseRef: options.baseRef,
          addedLines: addedLines.length,
          findings,
        },
        null,
        2,
      )}\n`,
    )
  } else {
    process.stdout.write(`${renderText(findings)}\n`)
  }

  return shouldFail(findings, options.failOn) ? 1 : 0
}

if (import.meta.main) {
  const options = parseOptions(process.argv.slice(2))
  process.exitCode = run(options)
}