The previous `isPrivateHostname` used a list of regexes against `URL.hostname`. Several literal-address forms slipped past it: - IPv4-mapped IPv6 `[::ffff:127.0.0.1]` (WHATWG URL normalizes to `[::ffff:7f00:1]`, which no regex matched) — lets callers reach loopback and other private v4 via an IPv6 literal. - ULA `fc00::/7` (e.g. `[fc00::1]`) — not covered. - Link-local `fe80::/10` (e.g. `[fe80::1]`) — not covered. - IPv4 `169.254.0.0/16` (cloud metadata, including 169.254.169.254), `100.64.0.0/10` (CGNAT), and the full `0.0.0.0/8` — not covered. - The IPv6 regex `/^\[::1?\]$/` also required brackets, but `URL.hostname` returns bracketed form anyway, so this part happened to work. WHATWG `new URL(...)` already normalizes short-form / numeric / hex / octal IPv4 to dotted-quad before we see it, so those cases were in fact handled — the remaining gaps were IPv6 and a few missing v4 ranges. Replace the regex list with: - a dotted-quad IPv4 parser + int range check covering 0/8, 10/8, 100.64/10, 127/8, 169.254/16, 172.16/12, 192.168/16; - a small IPv6 parser (handles `::` compression and embedded v4 suffix) + a byte-range check covering `::`, `::1`, IPv4-mapped (recursing into the v4 classifier), IPv4-compatible, `fc00::/7`, `fe80::/10`, and `fec0::/10`. Export `isPrivateHostname` and add unit tests covering every bypass listed above plus public-address negatives. Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
597 lines
20 KiB
TypeScript
597 lines
20 KiB
TypeScript
/**
|
|
* Custom API provider adapter.
|
|
*
|
|
* Supports:
|
|
* - Any HTTP endpoint via WEB_SEARCH_API
|
|
* - Built-in presets via WEB_PROVIDER (searxng, google, brave, serpapi)
|
|
* - GET or POST (WEB_METHOD)
|
|
* - Query in path via WEB_URL_TEMPLATE with {query}
|
|
* - Custom POST body via WEB_BODY_TEMPLATE with {query}
|
|
* - Extra static params via WEB_PARAMS (JSON)
|
|
* - Flexible response parsing (auto-detects common shapes)
|
|
* - One automatic retry on failure
|
|
*
|
|
* ## Security Guardrails (Option B)
|
|
*
|
|
* This adapter creates a generic outbound HTTP client. The following
|
|
* guardrails are enforced to reduce SSRF and data-exfiltration risk:
|
|
*
|
|
* 1. HTTPS-only by default (opt-out: WEB_CUSTOM_ALLOW_HTTP=true)
|
|
* 2. Private / loopback / link-local IPs are blocked by default
|
|
* (opt-out: WEB_CUSTOM_ALLOW_PRIVATE=true)
|
|
* 3. Built-in allowlist of header names — arbitrary headers require
|
|
* WEB_CUSTOM_ALLOW_ARBITRARY_HEADERS=true
|
|
* 4. Max body size guard (300 KB for POST)
|
|
* 5. Request timeout (default 120s, configurable via WEB_CUSTOM_TIMEOUT_SEC)
|
|
* 6. Audit log on first custom search (one-time warning)
|
|
*/
|
|
|
|
import type { SearchInput, SearchProvider } from './types.js'
|
|
import {
|
|
applyDomainFilters,
|
|
normalizeHit,
|
|
safeHostname,
|
|
type ProviderOutput,
|
|
type SearchHit,
|
|
} from './types.js'
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Built-in provider presets
|
|
// ---------------------------------------------------------------------------
|
|
|
|
interface ProviderPreset {
|
|
urlTemplate: string
|
|
queryParam: string
|
|
method?: string
|
|
authHeader?: string
|
|
authScheme?: string
|
|
jsonPath?: string
|
|
responseAdapter?: (data: any) => SearchHit[]
|
|
}
|
|
|
|
const BUILT_IN_PROVIDERS: Record<string, ProviderPreset> = {
|
|
searxng: {
|
|
// NOTE: default uses https://localhost — users must override WEB_SEARCH_API
|
|
// for their actual instance. The http:// default was intentionally removed
|
|
// to comply with the HTTPS-only guardrail.
|
|
urlTemplate: 'https://localhost:8080/search',
|
|
queryParam: 'q',
|
|
jsonPath: 'results',
|
|
responseAdapter(data: any) {
|
|
return (data.results ?? []).map((r: any) => ({
|
|
title: r.title ?? r.url,
|
|
url: r.url,
|
|
description: r.content,
|
|
source: r.engine ?? r.source,
|
|
}))
|
|
},
|
|
},
|
|
google: {
|
|
urlTemplate: 'https://www.googleapis.com/customsearch/v1',
|
|
queryParam: 'q',
|
|
authHeader: 'Authorization',
|
|
authScheme: 'Bearer',
|
|
responseAdapter(data: any) {
|
|
return (data.items ?? []).map((r: any) => ({
|
|
title: r.title ?? '',
|
|
url: r.link ?? '',
|
|
description: r.snippet,
|
|
source: r.displayLink,
|
|
}))
|
|
},
|
|
},
|
|
brave: {
|
|
urlTemplate: 'https://api.search.brave.com/res/v1/web/search',
|
|
queryParam: 'q',
|
|
authHeader: 'X-Subscription-Token',
|
|
responseAdapter(data: any) {
|
|
return (data.web?.results ?? []).map((r: any) => ({
|
|
title: r.title ?? '',
|
|
url: r.url ?? '',
|
|
description: r.description,
|
|
source: safeHostname(r.url),
|
|
}))
|
|
},
|
|
},
|
|
serpapi: {
|
|
urlTemplate: 'https://serpapi.com/search.json',
|
|
queryParam: 'q',
|
|
authHeader: 'Authorization',
|
|
authScheme: 'Bearer',
|
|
responseAdapter(data: any) {
|
|
return (data.organic_results ?? []).map((r: any) => ({
|
|
title: r.title ?? '',
|
|
url: r.link ?? '',
|
|
description: r.snippet,
|
|
source: r.displayed_link,
|
|
}))
|
|
},
|
|
},
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Security guardrails
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/** Maximum POST body size in bytes (300 KB default, configurable via WEB_CUSTOM_MAX_BODY_KB). */
|
|
const DEFAULT_MAX_BODY_KB = 300
|
|
|
|
/** Default request timeout in seconds. */
|
|
const DEFAULT_TIMEOUT_SECONDS = 120
|
|
|
|
/** Header names that are always allowed (case-insensitive). */
|
|
const SAFE_HEADER_NAMES = new Set([
|
|
'accept',
|
|
'accept-encoding',
|
|
'accept-language',
|
|
'authorization',
|
|
'cache-control',
|
|
'content-type',
|
|
'if-modified-since',
|
|
'if-none-match',
|
|
'ocp-apim-subscription-key',
|
|
'user-agent',
|
|
'x-api-key',
|
|
'x-subscription-token',
|
|
'x-tenant-id',
|
|
])
|
|
|
|
/**
|
|
* Private / reserved address check for SSRF mitigation.
|
|
*
|
|
* Operates on the hostname produced by WHATWG `new URL(...)`, which already
|
|
* normalizes short-form, numeric, hex, and octal IPv4 to dotted-quad
|
|
* (e.g. `127.1`, `2130706433`, `0x7f000001`, `0177.0.0.1` → `127.0.0.1`),
|
|
* and which preserves IPv6 in bracketed compressed form
|
|
* (e.g. `[::ffff:127.0.0.1]` → `[::ffff:7f00:1]`).
|
|
*
|
|
* DNS resolution to private IPs is NOT blocked here — resolving before
|
|
* fetch is not exposed by Node's fetch. This guard blocks literal-address
|
|
* bypasses, which is what the original regex was trying (and failing) to do.
|
|
*/
|
|
|
|
function ipv4DottedToInt(ip: string): number | null {
|
|
const parts = ip.split('.')
|
|
if (parts.length !== 4) return null
|
|
let n = 0
|
|
for (const p of parts) {
|
|
if (!/^\d+$/.test(p)) return null
|
|
const x = Number(p)
|
|
if (!Number.isInteger(x) || x < 0 || x > 255) return null
|
|
n = n * 256 + x
|
|
}
|
|
return n >>> 0
|
|
}
|
|
|
|
function isPrivateIPv4Int(n: number): boolean {
|
|
const a = (n >>> 24) & 0xff
|
|
const b = (n >>> 16) & 0xff
|
|
// 0.0.0.0/8 "this network"
|
|
if (a === 0) return true
|
|
// 10.0.0.0/8
|
|
if (a === 10) return true
|
|
// 100.64.0.0/10 CGNAT
|
|
if (a === 100 && (b & 0xc0) === 0x40) return true
|
|
// 127.0.0.0/8 loopback
|
|
if (a === 127) return true
|
|
// 169.254.0.0/16 link-local
|
|
if (a === 169 && b === 254) return true
|
|
// 172.16.0.0/12
|
|
if (a === 172 && (b & 0xf0) === 0x10) return true
|
|
// 192.168.0.0/16
|
|
if (a === 192 && b === 168) return true
|
|
return false
|
|
}
|
|
|
|
/**
|
|
* Parse an IPv6 address (without brackets, zone id optional) to 16 bytes.
|
|
* Returns null on malformed input. Handles `::` compression and embedded
|
|
* IPv4 suffix (e.g. `::ffff:127.0.0.1`).
|
|
*/
|
|
function parseIPv6(input: string): Uint8Array | null {
|
|
let s = input.split('%')[0] ?? ''
|
|
if (s === '') return null
|
|
|
|
// Split off trailing embedded IPv4 if present
|
|
let trailingV4: [number, number, number, number] | null = null
|
|
const v4m = s.match(/^(.*:)(\d+\.\d+\.\d+\.\d+)$/)
|
|
if (v4m) {
|
|
const n = ipv4DottedToInt(v4m[2]!)
|
|
if (n === null) return null
|
|
trailingV4 = [(n >>> 24) & 0xff, (n >>> 16) & 0xff, (n >>> 8) & 0xff, n & 0xff]
|
|
s = v4m[1]!.replace(/:$/, '')
|
|
if (s === '') s = '::' // e.g. input was "::1.2.3.4"
|
|
}
|
|
|
|
const halves = s.split('::')
|
|
if (halves.length > 2) return null
|
|
const left = halves[0] ? halves[0]!.split(':') : []
|
|
const right = halves.length === 2 && halves[1] ? halves[1]!.split(':') : []
|
|
|
|
const groupsNeeded = 8 - (trailingV4 ? 2 : 0)
|
|
if (halves.length === 1 && left.length !== groupsNeeded) return null
|
|
if (halves.length === 2 && left.length + right.length > groupsNeeded) return null
|
|
|
|
const fill = halves.length === 2 ? groupsNeeded - left.length - right.length : 0
|
|
const groups = [...left, ...Array(fill).fill('0'), ...right]
|
|
|
|
const bytes = new Uint8Array(16)
|
|
for (let i = 0; i < groups.length; i++) {
|
|
const g = groups[i]!
|
|
if (!/^[0-9a-f]{1,4}$/i.test(g)) return null
|
|
const v = parseInt(g, 16)
|
|
bytes[i * 2] = (v >>> 8) & 0xff
|
|
bytes[i * 2 + 1] = v & 0xff
|
|
}
|
|
if (trailingV4) {
|
|
const off = groups.length * 2
|
|
bytes[off] = trailingV4[0]
|
|
bytes[off + 1] = trailingV4[1]
|
|
bytes[off + 2] = trailingV4[2]
|
|
bytes[off + 3] = trailingV4[3]
|
|
}
|
|
return bytes
|
|
}
|
|
|
|
function isPrivateIPv6(bytes: Uint8Array): boolean {
|
|
// ::1 loopback
|
|
let allZeroExceptLast = true
|
|
for (let i = 0; i < 15; i++) if (bytes[i] !== 0) { allZeroExceptLast = false; break }
|
|
if (allZeroExceptLast && bytes[15] === 1) return true
|
|
// :: unspecified
|
|
if (bytes.every(v => v === 0)) return true
|
|
// IPv4-mapped ::ffff:a.b.c.d
|
|
let isV4Mapped = true
|
|
for (let i = 0; i < 10; i++) if (bytes[i] !== 0) { isV4Mapped = false; break }
|
|
if (isV4Mapped && bytes[10] === 0xff && bytes[11] === 0xff) {
|
|
const n = ((bytes[12]! << 24) | (bytes[13]! << 16) | (bytes[14]! << 8) | bytes[15]!) >>> 0
|
|
return isPrivateIPv4Int(n)
|
|
}
|
|
// IPv4-compatible (deprecated) ::a.b.c.d — treat as private if embedded v4 is
|
|
let isV4Compat = true
|
|
for (let i = 0; i < 12; i++) if (bytes[i] !== 0) { isV4Compat = false; break }
|
|
if (isV4Compat) {
|
|
const n = ((bytes[12]! << 24) | (bytes[13]! << 16) | (bytes[14]! << 8) | bytes[15]!) >>> 0
|
|
if (n !== 0 && n !== 1) return isPrivateIPv4Int(n)
|
|
}
|
|
// ULA fc00::/7
|
|
if ((bytes[0]! & 0xfe) === 0xfc) return true
|
|
// Link-local fe80::/10
|
|
if (bytes[0] === 0xfe && (bytes[1]! & 0xc0) === 0x80) return true
|
|
// Site-local (deprecated) fec0::/10
|
|
if (bytes[0] === 0xfe && (bytes[1]! & 0xc0) === 0xc0) return true
|
|
return false
|
|
}
|
|
|
|
export function isPrivateHostname(hostname: string): boolean {
|
|
if (/^localhost$/i.test(hostname)) return true
|
|
// URL.hostname wraps IPv6 literals in brackets; strip for parsing.
|
|
const unwrapped = hostname.startsWith('[') && hostname.endsWith(']')
|
|
? hostname.slice(1, -1)
|
|
: hostname
|
|
// IPv4 dotted-quad (WHATWG URL normalizes short/numeric/hex/octal to this).
|
|
const v4 = ipv4DottedToInt(unwrapped)
|
|
if (v4 !== null) return isPrivateIPv4Int(v4)
|
|
// IPv6
|
|
if (unwrapped.includes(':')) {
|
|
const bytes = parseIPv6(unwrapped)
|
|
if (bytes) return isPrivateIPv6(bytes)
|
|
}
|
|
return false
|
|
}
|
|
|
|
/**
|
|
* Validate the target URL against security guardrails.
|
|
* Throws on violation.
|
|
*/
|
|
function validateUrl(urlString: string): void {
|
|
let parsed: URL
|
|
try {
|
|
parsed = new URL(urlString)
|
|
} catch {
|
|
throw new Error(`Custom search URL is not a valid URL: ${urlString.slice(0, 100)}`)
|
|
}
|
|
|
|
// 2. HTTPS-only (unless explicitly opted out)
|
|
const allowHttp = process.env.WEB_CUSTOM_ALLOW_HTTP === 'true'
|
|
if (!allowHttp && parsed.protocol !== 'https:') {
|
|
throw new Error(
|
|
`Custom search URL must use https:// (got ${parsed.protocol}). ` +
|
|
`Set WEB_CUSTOM_ALLOW_HTTP=true to override (not recommended).`,
|
|
)
|
|
}
|
|
|
|
// 3. Private network check (unless explicitly opted out)
|
|
const allowPrivate = process.env.WEB_CUSTOM_ALLOW_PRIVATE === 'true'
|
|
if (!allowPrivate && isPrivateHostname(parsed.hostname)) {
|
|
throw new Error(
|
|
`Custom search URL targets a private/reserved address (${parsed.hostname}). ` +
|
|
`This is blocked by default to prevent SSRF. ` +
|
|
`Set WEB_CUSTOM_ALLOW_PRIVATE=true to override (e.g. for local SearXNG).`,
|
|
)
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Validate that user-supplied headers are in the safe allowlist,
|
|
* unless WEB_CUSTOM_ALLOW_ARBITRARY_HEADERS=true.
|
|
*/
|
|
function validateHeaderName(name: string): boolean {
|
|
const allowArbitrary = process.env.WEB_CUSTOM_ALLOW_ARBITRARY_HEADERS === 'true'
|
|
if (allowArbitrary) return true
|
|
return SAFE_HEADER_NAMES.has(name.toLowerCase())
|
|
}
|
|
|
|
/**
|
|
* Log a one-time audit warning that custom outbound search is active.
|
|
* Prevents silent data exfiltration.
|
|
*/
|
|
let auditLogged = false
|
|
function auditLogCustomSearch(url: string): void {
|
|
if (auditLogged) return
|
|
auditLogged = true
|
|
console.warn(
|
|
`[web-search] ⚠️ Custom search provider is active. ` +
|
|
`Outbound requests go to: ${safeHostname(url) ?? url}. ` +
|
|
`Ensure this endpoint is trusted. ` +
|
|
`See: https://github.com/Gitlawb/openclaude/pull/512#security`,
|
|
)
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Auth — preset overrides for built-in providers
|
|
// ---------------------------------------------------------------------------
|
|
|
|
export function buildAuthHeadersForPreset(preset?: ProviderPreset): Record<string, string> {
|
|
const apiKey = process.env.WEB_KEY
|
|
if (!apiKey) return {}
|
|
|
|
// WEB_AUTH_HEADER="" is an explicit opt-out of auth headers entirely
|
|
const explicitHeader = process.env.WEB_AUTH_HEADER
|
|
if (explicitHeader === '') return {}
|
|
|
|
const headerName = explicitHeader ?? preset?.authHeader ?? 'Authorization'
|
|
const scheme = process.env.WEB_AUTH_SCHEME !== undefined
|
|
? process.env.WEB_AUTH_SCHEME
|
|
: (preset?.authScheme ?? 'Bearer')
|
|
return { [headerName]: `${scheme} ${apiKey}`.trim() }
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Request construction
|
|
// ---------------------------------------------------------------------------
|
|
|
|
function resolveConfig(): {
|
|
urlTemplate: string
|
|
queryParam: string
|
|
method: string
|
|
jsonPath?: string
|
|
responseAdapter?: (data: any) => SearchHit[]
|
|
preset?: ProviderPreset
|
|
} {
|
|
const providerName = process.env.WEB_PROVIDER
|
|
const preset = providerName ? BUILT_IN_PROVIDERS[providerName] : undefined
|
|
|
|
return {
|
|
urlTemplate: process.env.WEB_URL_TEMPLATE
|
|
?? process.env.WEB_SEARCH_API
|
|
?? preset?.urlTemplate
|
|
?? '',
|
|
queryParam: process.env.WEB_QUERY_PARAM ?? preset?.queryParam ?? 'q',
|
|
method: process.env.WEB_METHOD ?? preset?.method ?? 'GET',
|
|
jsonPath: process.env.WEB_JSON_PATH ?? preset?.jsonPath,
|
|
responseAdapter: preset?.responseAdapter,
|
|
preset,
|
|
}
|
|
}
|
|
|
|
function parseExtraParams(): Record<string, string> {
|
|
const raw = process.env.WEB_PARAMS
|
|
if (!raw) return {}
|
|
try {
|
|
const obj = JSON.parse(raw)
|
|
if (obj && typeof obj === 'object' && !Array.isArray(obj)) return obj
|
|
} catch { /* ignore */ }
|
|
return {}
|
|
}
|
|
|
|
function buildRequest(query: string) {
|
|
const config = resolveConfig()
|
|
const method = config.method.toUpperCase()
|
|
|
|
// --- URL ---
|
|
const rawTemplate = config.urlTemplate
|
|
const templateWithQuery = rawTemplate.replace(/\{query\}/g, encodeURIComponent(query))
|
|
const url = new URL(templateWithQuery)
|
|
|
|
// Merge extra static params
|
|
for (const [k, v] of Object.entries(parseExtraParams())) {
|
|
url.searchParams.set(k, v)
|
|
}
|
|
|
|
// If {query} wasn't in template, add as param
|
|
if (!rawTemplate.includes('{query}')) {
|
|
url.searchParams.set(config.queryParam, query)
|
|
}
|
|
|
|
const urlString = url.toString()
|
|
|
|
// --- Security validation ---
|
|
validateUrl(urlString)
|
|
auditLogCustomSearch(urlString)
|
|
|
|
// --- Headers ---
|
|
const headers: Record<string, string> = {
|
|
...buildAuthHeadersForPreset(config.preset),
|
|
}
|
|
|
|
// Merge WEB_HEADERS with allowlist enforcement
|
|
const rawExtra = process.env.WEB_HEADERS
|
|
if (rawExtra) {
|
|
for (const pair of rawExtra.split(';')) {
|
|
const i = pair.indexOf(':')
|
|
if (i > 0) {
|
|
const k = pair.slice(0, i).trim()
|
|
const v = pair.slice(i + 1).trim()
|
|
if (k) {
|
|
if (!validateHeaderName(k)) {
|
|
throw new Error(
|
|
`Header "${k}" is not in the safe allowlist. ` +
|
|
`Allowed: ${[...SAFE_HEADER_NAMES].join(', ')}. ` +
|
|
`Set WEB_CUSTOM_ALLOW_ARBITRARY_HEADERS=true to override.`,
|
|
)
|
|
}
|
|
headers[k] = v
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
const init: RequestInit = { method, headers }
|
|
|
|
if (method === 'POST') {
|
|
headers['Content-Type'] = 'application/json'
|
|
const bodyTemplate = process.env.WEB_BODY_TEMPLATE
|
|
if (bodyTemplate) {
|
|
const body = bodyTemplate.replace(/\{query\}/g, query)
|
|
const maxBodyBytes = (Number(process.env.WEB_CUSTOM_MAX_BODY_KB) || DEFAULT_MAX_BODY_KB) * 1024
|
|
if (Buffer.byteLength(body) > maxBodyBytes) {
|
|
throw new Error(
|
|
`POST body exceeds ${maxBodyBytes} bytes. ` +
|
|
`Increase WEB_CUSTOM_MAX_BODY_KB if needed.`,
|
|
)
|
|
}
|
|
init.body = body
|
|
} else {
|
|
init.body = JSON.stringify({ [config.queryParam]: query })
|
|
}
|
|
}
|
|
|
|
return { url: urlString, init, config }
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Response parsing — flexible, handles many shapes
|
|
// ---------------------------------------------------------------------------
|
|
|
|
function walkJsonPath(obj: any, path: string): any {
|
|
let current = obj
|
|
for (const seg of path.split('.')) {
|
|
if (current == null || typeof current !== 'object') return undefined
|
|
current = current[seg]
|
|
}
|
|
return current
|
|
}
|
|
|
|
function extractFromNode(node: any): SearchHit[] {
|
|
if (!node) return []
|
|
if (Array.isArray(node)) return node.map(normalizeHit).filter(Boolean) as SearchHit[]
|
|
if (typeof node === 'object') {
|
|
const all: SearchHit[] = []
|
|
for (const sub of Object.values(node)) all.push(...extractFromNode(sub))
|
|
return all
|
|
}
|
|
// node is a primitive (string/number) — not a valid hit structure
|
|
return []
|
|
}
|
|
|
|
export function extractHits(raw: any, jsonPath?: string): SearchHit[] {
|
|
if (jsonPath) return extractFromNode(walkJsonPath(raw, jsonPath))
|
|
if (Array.isArray(raw)) return raw.map(normalizeHit).filter(Boolean) as SearchHit[]
|
|
if (!raw || typeof raw !== 'object') return []
|
|
|
|
const arrayKeys = ['results', 'items', 'data', 'web', 'organic_results', 'hits', 'entries']
|
|
for (const key of arrayKeys) {
|
|
const val = raw[key]
|
|
if (Array.isArray(val)) return val.map(normalizeHit).filter(Boolean) as SearchHit[]
|
|
if (val && typeof val === 'object' && !Array.isArray(val)) {
|
|
const all: SearchHit[] = []
|
|
for (const sub of Object.values(val)) {
|
|
if (Array.isArray(sub)) all.push(...(sub.map(normalizeHit).filter(Boolean) as SearchHit[]))
|
|
}
|
|
if (all.length > 0) return all
|
|
}
|
|
}
|
|
|
|
return []
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Fetch with one retry + timeout
|
|
// ---------------------------------------------------------------------------
|
|
|
|
async function fetchWithRetry(url: string, init: RequestInit, signal?: AbortSignal): Promise<any> {
|
|
const timeoutSec = Number(process.env.WEB_CUSTOM_TIMEOUT_SEC) || DEFAULT_TIMEOUT_SECONDS
|
|
const timeoutMs = timeoutSec * 1000
|
|
let lastErr: Error | undefined
|
|
let lastStatus: number | undefined
|
|
|
|
for (let attempt = 0; attempt < 2; attempt++) {
|
|
// Compose timeout with caller signal via AbortSignal.any so each attempt
|
|
// has a fresh timeout and we don't leak an abort listener on `signal`
|
|
// (the previous implementation added one per attempt and never removed
|
|
// it, and the listener kept a reference to a stale AbortController).
|
|
const timeoutSignal = AbortSignal.timeout(timeoutMs)
|
|
const combined = signal
|
|
? AbortSignal.any([signal, timeoutSignal])
|
|
: timeoutSignal
|
|
|
|
lastStatus = undefined
|
|
try {
|
|
const res = await fetch(url, { ...init, signal: combined })
|
|
|
|
if (!res.ok) {
|
|
lastStatus = res.status
|
|
throw new Error(`Custom search API returned ${res.status}: ${res.statusText}`)
|
|
}
|
|
return await res.json()
|
|
} catch (err) {
|
|
lastErr = err instanceof Error ? err : new Error(String(err))
|
|
|
|
// Caller-initiated abort wins — propagate without retry or rewrite.
|
|
if (signal?.aborted) throw lastErr
|
|
|
|
// Timeout (TimeoutError on Bun/Node, or AbortError with timeoutSignal aborted).
|
|
if (timeoutSignal.aborted) {
|
|
throw new Error(`Custom search timed out after ${timeoutSec}s`)
|
|
}
|
|
|
|
// Retry once on 5xx or network errors; do not retry 4xx.
|
|
if (attempt === 0 && (lastStatus === undefined || lastStatus >= 500)) {
|
|
await new Promise(r => setTimeout(r, 500))
|
|
continue
|
|
}
|
|
throw lastErr
|
|
}
|
|
}
|
|
throw lastErr!
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Provider export
|
|
// ---------------------------------------------------------------------------
|
|
|
|
export const customProvider: SearchProvider = {
|
|
name: 'custom',
|
|
|
|
isConfigured() {
|
|
return Boolean(process.env.WEB_SEARCH_API || process.env.WEB_PROVIDER || process.env.WEB_URL_TEMPLATE)
|
|
},
|
|
|
|
async search(input: SearchInput, signal?: AbortSignal): Promise<ProviderOutput> {
|
|
const start = performance.now()
|
|
const { url, init, config } = buildRequest(input.query)
|
|
const raw = await fetchWithRetry(url, init, signal)
|
|
|
|
const hits = config.responseAdapter
|
|
? config.responseAdapter(raw)
|
|
: extractHits(raw, config.jsonPath)
|
|
|
|
return {
|
|
hits: applyDomainFilters(hits, input),
|
|
providerName: 'custom',
|
|
durationSeconds: (performance.now() - start) / 1000,
|
|
}
|
|
},
|
|
}
|