feat: add Firecrawl backend for WebSearch and WebFetch tools
WebSearch is currently disabled for all non-Anthropic providers (OpenAI shim, DeepSeek, Ollama, etc.) because those providers have no native search backend. This adds Firecrawl as a fallback that activates when FIRECRAWL_API_KEY is set, unlocking web search for every model openclaude supports. WebFetch uses basic HTTP + Turndown for HTML-to-markdown conversion, which fails silently on JS-rendered SPAs and bot-protected pages. Firecrawl scrape replaces the fetch layer when FIRECRAWL_API_KEY is set, returning clean markdown that handles dynamic content correctly. Changes: - WebSearchTool: add runFirecrawlSearch() using @mendable/firecrawl-js, respects allowed_domains (post-filter) and blocked_domains (-site: operators), includes result snippets alongside links. shouldUseFirecrawl() ensures firstParty/Vertex/Foundry/Codex providers keep their native backends. - WebFetchTool: add scrapeWithFirecrawl(), drops into the existing applyPromptToMarkdown() pipeline so prompt processing is unchanged. - Remove "Web search is only available in the US" restriction from prompt when Firecrawl is active (it works globally).
This commit is contained in:
7
bun.lock
7
bun.lock
@@ -13,6 +13,7 @@
|
||||
"@anthropic-ai/vertex-sdk": "0.14.4",
|
||||
"@commander-js/extra-typings": "12.1.0",
|
||||
"@growthbook/growthbook": "1.6.5",
|
||||
"@mendable/firecrawl-js": "^4.18.1",
|
||||
"@modelcontextprotocol/sdk": "1.29.0",
|
||||
"@opentelemetry/api": "1.9.1",
|
||||
"@opentelemetry/api-logs": "0.214.0",
|
||||
@@ -185,6 +186,8 @@
|
||||
|
||||
"@js-sdsl/ordered-map": ["@js-sdsl/ordered-map@4.4.2", "", {}, "sha512-iUKgm52T8HOE/makSxjqoWhe95ZJA1/G1sYsGev2JDKUSS14KAgg1LHb+Ba+IPow0xflbnSkOsZcO08C7w1gYw=="],
|
||||
|
||||
"@mendable/firecrawl-js": ["@mendable/firecrawl-js@4.18.1", "", { "dependencies": { "axios": "1.14.0", "firecrawl": "4.16.0", "typescript-event-target": "^1.1.1", "zod": "^3.23.8", "zod-to-json-schema": "^3.23.0" } }, "sha512-NfmJv+xcHoZthj8I3NP/8KAgO8EWcvOcTvCAvszxqs7/6sCs1CRss6Tum6RycZNSwJkr5RzQossN89IlixRfng=="],
|
||||
|
||||
"@mixmark-io/domino": ["@mixmark-io/domino@2.2.0", "", {}, "sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw=="],
|
||||
|
||||
"@modelcontextprotocol/sdk": ["@modelcontextprotocol/sdk@1.29.0", "", { "dependencies": { "@hono/node-server": "^1.19.9", "ajv": "^8.17.1", "ajv-formats": "^3.0.1", "content-type": "^1.0.5", "cors": "^2.8.5", "cross-spawn": "^7.0.5", "eventsource": "^3.0.2", "eventsource-parser": "^3.0.0", "express": "^5.2.1", "express-rate-limit": "^8.2.1", "hono": "^4.11.4", "jose": "^6.1.3", "json-schema-typed": "^8.0.2", "pkce-challenge": "^5.0.0", "raw-body": "^3.0.0", "zod": "^3.25 || ^4.0", "zod-to-json-schema": "^3.25.1" }, "peerDependencies": { "@cfworker/json-schema": "^4.1.1" }, "optionalPeers": ["@cfworker/json-schema"] }, "sha512-zo37mZA9hJWpULgkRpowewez1y6ML5GsXJPY8FI0tBBCd77HEvza4jDqRKOXgHNn867PVGCyTdzqpz0izu5ZjQ=="],
|
||||
@@ -495,6 +498,8 @@
|
||||
|
||||
"find-up": ["find-up@4.1.0", "", { "dependencies": { "locate-path": "^5.0.0", "path-exists": "^4.0.0" } }, "sha512-PpOwAdQ/YlXQ2vj8a3h8IipDuYRi3wceVQQGYWxNINccq40Anw7BlsEXCMbt1Zt+OLA6Fq9suIpIWD0OsnISlw=="],
|
||||
|
||||
"firecrawl": ["firecrawl@4.16.0", "", { "dependencies": { "axios": "^1.13.5", "typescript-event-target": "^1.1.1", "zod": "^3.23.8", "zod-to-json-schema": "^3.23.0" } }, "sha512-7SJ/FWhZBtW2gTCE/BsvU+gbfIpfTq+D9IH82l9MacauLVptaY6EdYAhrK3YSMC9yr5NxvxRcpZKcXG/nqjiiQ=="],
|
||||
|
||||
"follow-redirects": ["follow-redirects@1.15.11", "", {}, "sha512-deG2P0JfjrTxl50XGCDyfI97ZGVCxIpfKYmfyrQ54n5FO/0gfIES8C/Psl6kWVDolizcaaxZJnTS0QSMxvnsBQ=="],
|
||||
|
||||
"form-data": ["form-data@4.0.5", "", { "dependencies": { "asynckit": "^0.4.0", "combined-stream": "^1.0.8", "es-set-tostringtag": "^2.1.0", "hasown": "^2.0.2", "mime-types": "^2.1.12" } }, "sha512-8RipRLol37bNs2bhoV67fiTEvdTrbMUYcFTiy3+wuuOnUog2QBHCZWXDRijWQfAkhBj2Uf5UnVaiWwA5vdd82w=="],
|
||||
@@ -767,6 +772,8 @@
|
||||
|
||||
"typescript": ["typescript@5.9.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw=="],
|
||||
|
||||
"typescript-event-target": ["typescript-event-target@1.1.2", "", {}, "sha512-TvkrTUpv7gCPlcnSoEwUVUBwsdheKm+HF5u2tPAKubkIGMfovdSizCTaZRY/NhR8+Ijy8iZZUapbVQAsNrkFrw=="],
|
||||
|
||||
"undici": ["undici@7.24.6", "", {}, "sha512-Xi4agocCbRzt0yYMZGMA6ApD7gvtUFaxm4ZmeacWI4cZxaF6C+8I8QfofC20NAePiB/IcvZmzkJ7XPa471AEtA=="],
|
||||
|
||||
"undici-types": ["undici-types@7.18.2", "", {}, "sha512-AsuCzffGHJybSaRrmr5eHr81mwJU3kjw6M+uprWvCXiNeN9SOGwQ3Jn8jb8m3Z6izVgknn1R0FTCEAP2QrLY/w=="],
|
||||
|
||||
@@ -112,7 +112,8 @@
|
||||
"ws": "8.20.0",
|
||||
"xss": "1.0.15",
|
||||
"yaml": "2.8.3",
|
||||
"zod": "3.25.76"
|
||||
"zod": "3.25.76",
|
||||
"@mendable/firecrawl-js": "^4.18.1"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/bun": "1.3.11",
|
||||
|
||||
@@ -21,6 +21,18 @@ import {
|
||||
MAX_MARKDOWN_LENGTH,
|
||||
} from './utils.js'
|
||||
|
||||
function isFirecrawlEnabled(): boolean {
|
||||
return Boolean(process.env.FIRECRAWL_API_KEY)
|
||||
}
|
||||
|
||||
async function scrapeWithFirecrawl(url: string): Promise<{ markdown: string; bytes: number }> {
|
||||
const { FirecrawlClient } = await import('@mendable/firecrawl-js')
|
||||
const app = new FirecrawlClient({ apiKey: process.env.FIRECRAWL_API_KEY! })
|
||||
const result = await app.scrape(url, { formats: ['markdown'] })
|
||||
const markdown = (result as { markdown?: string }).markdown ?? ''
|
||||
return { markdown, bytes: Buffer.byteLength(markdown) }
|
||||
}
|
||||
|
||||
const inputSchema = lazySchema(() =>
|
||||
z.strictObject({
|
||||
url: z.string().url().describe('The URL to fetch content from'),
|
||||
@@ -211,6 +223,27 @@ ${DESCRIPTION}`
|
||||
) {
|
||||
const start = Date.now()
|
||||
|
||||
if (isFirecrawlEnabled()) {
|
||||
const { markdown, bytes } = await scrapeWithFirecrawl(url)
|
||||
const result = await applyPromptToMarkdown(
|
||||
prompt,
|
||||
markdown,
|
||||
abortController.signal,
|
||||
isNonInteractiveSession,
|
||||
false,
|
||||
)
|
||||
return {
|
||||
data: {
|
||||
bytes,
|
||||
code: 200,
|
||||
codeText: 'OK',
|
||||
result,
|
||||
durationMs: Date.now() - start,
|
||||
url,
|
||||
} satisfies Output,
|
||||
}
|
||||
}
|
||||
|
||||
const response = await getURLMarkdownContent(url, abortController)
|
||||
|
||||
// Check if we got a redirect to a different host
|
||||
|
||||
@@ -88,6 +88,67 @@ function makeToolSchema(input: Input): BetaWebSearchTool20250305 {
|
||||
}
|
||||
}
|
||||
|
||||
function isFirecrawlEnabled(): boolean {
|
||||
return Boolean(process.env.FIRECRAWL_API_KEY)
|
||||
}
|
||||
|
||||
function shouldUseFirecrawl(): boolean {
|
||||
if (!isFirecrawlEnabled()) return false
|
||||
// Don't override native search on providers that already have it
|
||||
if (isCodexResponsesWebSearchEnabled()) return false
|
||||
const provider = getAPIProvider()
|
||||
if (provider === 'firstParty' || provider === 'vertex' || provider === 'foundry') return false
|
||||
return true
|
||||
}
|
||||
|
||||
async function runFirecrawlSearch(input: Input): Promise<Output> {
|
||||
const startTime = performance.now()
|
||||
const { FirecrawlClient } = await import('@mendable/firecrawl-js')
|
||||
const app = new FirecrawlClient({ apiKey: process.env.FIRECRAWL_API_KEY! })
|
||||
|
||||
let query = input.query
|
||||
if (input.blocked_domains?.length) {
|
||||
const exclusions = input.blocked_domains.map(d => `-site:${d}`).join(' ')
|
||||
query = `${query} ${exclusions}`
|
||||
}
|
||||
|
||||
const data = await app.search(query, { limit: 10 })
|
||||
|
||||
let hits = (data.web ?? []).map((r: { url: string; title?: string }) => ({
|
||||
title: r.title ?? r.url,
|
||||
url: r.url,
|
||||
}))
|
||||
|
||||
if (input.allowed_domains?.length) {
|
||||
hits = hits.filter(h =>
|
||||
input.allowed_domains!.some(d => {
|
||||
try {
|
||||
return new URL(h.url).hostname.endsWith(d)
|
||||
} catch {
|
||||
return false
|
||||
}
|
||||
}),
|
||||
)
|
||||
}
|
||||
|
||||
const snippets = (data.web ?? [])
|
||||
.filter((r: { description?: string }) => r.description)
|
||||
.map((r: { url: string; title?: string; description?: string }) =>
|
||||
`**${r.title ?? r.url}** — ${r.description} (${r.url})`,
|
||||
)
|
||||
.join('\n')
|
||||
|
||||
const results: Output['results'] = []
|
||||
if (snippets) results.push(snippets)
|
||||
results.push({ tool_use_id: 'firecrawl-search', content: hits })
|
||||
|
||||
return {
|
||||
query: input.query,
|
||||
results,
|
||||
durationSeconds: (performance.now() - startTime) / 1000,
|
||||
}
|
||||
}
|
||||
|
||||
function isCodexResponsesWebSearchEnabled(): boolean {
|
||||
if (getAPIProvider() !== 'openai') {
|
||||
return false
|
||||
@@ -378,6 +439,10 @@ export const WebSearchTool = buildTool({
|
||||
return summary ? `Searching for ${summary}` : 'Searching the web'
|
||||
},
|
||||
isEnabled() {
|
||||
if (shouldUseFirecrawl()) {
|
||||
return true
|
||||
}
|
||||
|
||||
const provider = getAPIProvider()
|
||||
const model = getMainLoopModel()
|
||||
|
||||
@@ -437,7 +502,7 @@ export const WebSearchTool = buildTool({
|
||||
}
|
||||
},
|
||||
async prompt() {
|
||||
if (isCodexResponsesWebSearchEnabled()) {
|
||||
if (shouldUseFirecrawl() || isCodexResponsesWebSearchEnabled()) {
|
||||
return getWebSearchPrompt().replace(
|
||||
/\n\s*-\s*Web search is only available in the US/,
|
||||
'',
|
||||
@@ -474,6 +539,10 @@ export const WebSearchTool = buildTool({
|
||||
return { result: true }
|
||||
},
|
||||
async call(input, context, _canUseTool, _parentMessage, onProgress) {
|
||||
if (shouldUseFirecrawl()) {
|
||||
return { data: await runFirecrawlSearch(input) }
|
||||
}
|
||||
|
||||
if (isCodexResponsesWebSearchEnabled()) {
|
||||
return {
|
||||
data: await runCodexWebSearch(input, context.abortController.signal),
|
||||
|
||||
Reference in New Issue
Block a user