diff --git a/README.md b/README.md index 0a8e0d2a..cdfd9142 100644 --- a/README.md +++ b/README.md @@ -123,6 +123,27 @@ Best if you want local inference on Apple Silicon with Atomic Chat. See [Advance --- +## Web Search and Fetch + +By default, `WebSearch` is only available on providers with a native search backend (the Anthropic API or the Codex responses endpoint), so users on GPT-4o, DeepSeek, Gemini, Ollama, and other OpenAI-compatible providers get no web search at all. + +`WebFetch` works but uses basic HTTP plus HTML-to-markdown conversion. That fails on JavaScript-rendered pages (React, Next.js, Vue SPAs) and sites that block plain HTTP requests. + +Set a [Firecrawl](https://firecrawl.dev) API key to fix both: + +```bash +export FIRECRAWL_API_KEY=your-key-here +``` + +With this set: + +- `WebSearch` is enabled for providers without native search and routes through Firecrawl's search API; providers that already have native search keep using it +- `WebFetch` uses Firecrawl's scrape endpoint instead of raw HTTP, handling JS-rendered pages correctly + +Free tier at [firecrawl.dev](https://firecrawl.dev) includes 500 credits. The key is optional — if not set, both tools fall back to their original behavior. 
+ +--- + ## How It Works The shim (`src/services/api/openaiShim.ts`) sits between Claude Code and the LLM API: diff --git a/bun.lock b/bun.lock index ce4a898c..ba5037ae 100644 --- a/bun.lock +++ b/bun.lock @@ -13,6 +13,7 @@ "@anthropic-ai/vertex-sdk": "0.14.4", "@commander-js/extra-typings": "12.1.0", "@growthbook/growthbook": "1.6.5", + "@mendable/firecrawl-js": "^4.18.1", "@modelcontextprotocol/sdk": "1.29.0", "@opentelemetry/api": "1.9.1", "@opentelemetry/api-logs": "0.214.0", @@ -185,6 +186,8 @@ "@js-sdsl/ordered-map": ["@js-sdsl/ordered-map@4.4.2", "", {}, "sha512-iUKgm52T8HOE/makSxjqoWhe95ZJA1/G1sYsGev2JDKUSS14KAgg1LHb+Ba+IPow0xflbnSkOsZcO08C7w1gYw=="], + "@mendable/firecrawl-js": ["@mendable/firecrawl-js@4.18.1", "", { "dependencies": { "axios": "1.14.0", "firecrawl": "4.16.0", "typescript-event-target": "^1.1.1", "zod": "^3.23.8", "zod-to-json-schema": "^3.23.0" } }, "sha512-NfmJv+xcHoZthj8I3NP/8KAgO8EWcvOcTvCAvszxqs7/6sCs1CRss6Tum6RycZNSwJkr5RzQossN89IlixRfng=="], + "@mixmark-io/domino": ["@mixmark-io/domino@2.2.0", "", {}, "sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw=="], "@modelcontextprotocol/sdk": ["@modelcontextprotocol/sdk@1.29.0", "", { "dependencies": { "@hono/node-server": "^1.19.9", "ajv": "^8.17.1", "ajv-formats": "^3.0.1", "content-type": "^1.0.5", "cors": "^2.8.5", "cross-spawn": "^7.0.5", "eventsource": "^3.0.2", "eventsource-parser": "^3.0.0", "express": "^5.2.1", "express-rate-limit": "^8.2.1", "hono": "^4.11.4", "jose": "^6.1.3", "json-schema-typed": "^8.0.2", "pkce-challenge": "^5.0.0", "raw-body": "^3.0.0", "zod": "^3.25 || ^4.0", "zod-to-json-schema": "^3.25.1" }, "peerDependencies": { "@cfworker/json-schema": "^4.1.1" }, "optionalPeers": ["@cfworker/json-schema"] }, "sha512-zo37mZA9hJWpULgkRpowewez1y6ML5GsXJPY8FI0tBBCd77HEvza4jDqRKOXgHNn867PVGCyTdzqpz0izu5ZjQ=="], @@ -495,6 +498,8 @@ "find-up": ["find-up@4.1.0", "", { "dependencies": { "locate-path": "^5.0.0", "path-exists": "^4.0.0" } 
}, "sha512-PpOwAdQ/YlXQ2vj8a3h8IipDuYRi3wceVQQGYWxNINccq40Anw7BlsEXCMbt1Zt+OLA6Fq9suIpIWD0OsnISlw=="], + "firecrawl": ["firecrawl@4.16.0", "", { "dependencies": { "axios": "^1.13.5", "typescript-event-target": "^1.1.1", "zod": "^3.23.8", "zod-to-json-schema": "^3.23.0" } }, "sha512-7SJ/FWhZBtW2gTCE/BsvU+gbfIpfTq+D9IH82l9MacauLVptaY6EdYAhrK3YSMC9yr5NxvxRcpZKcXG/nqjiiQ=="], + "follow-redirects": ["follow-redirects@1.15.11", "", {}, "sha512-deG2P0JfjrTxl50XGCDyfI97ZGVCxIpfKYmfyrQ54n5FO/0gfIES8C/Psl6kWVDolizcaaxZJnTS0QSMxvnsBQ=="], "form-data": ["form-data@4.0.5", "", { "dependencies": { "asynckit": "^0.4.0", "combined-stream": "^1.0.8", "es-set-tostringtag": "^2.1.0", "hasown": "^2.0.2", "mime-types": "^2.1.12" } }, "sha512-8RipRLol37bNs2bhoV67fiTEvdTrbMUYcFTiy3+wuuOnUog2QBHCZWXDRijWQfAkhBj2Uf5UnVaiWwA5vdd82w=="], @@ -767,6 +772,8 @@ "typescript": ["typescript@5.9.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw=="], + "typescript-event-target": ["typescript-event-target@1.1.2", "", {}, "sha512-TvkrTUpv7gCPlcnSoEwUVUBwsdheKm+HF5u2tPAKubkIGMfovdSizCTaZRY/NhR8+Ijy8iZZUapbVQAsNrkFrw=="], + "undici": ["undici@7.24.6", "", {}, "sha512-Xi4agocCbRzt0yYMZGMA6ApD7gvtUFaxm4ZmeacWI4cZxaF6C+8I8QfofC20NAePiB/IcvZmzkJ7XPa471AEtA=="], "undici-types": ["undici-types@7.18.2", "", {}, "sha512-AsuCzffGHJybSaRrmr5eHr81mwJU3kjw6M+uprWvCXiNeN9SOGwQ3Jn8jb8m3Z6izVgknn1R0FTCEAP2QrLY/w=="], diff --git a/package.json b/package.json index 1dc06b30..17deef55 100644 --- a/package.json +++ b/package.json @@ -112,7 +112,8 @@ "ws": "8.20.0", "xss": "1.0.15", "yaml": "2.8.3", - "zod": "3.25.76" + "zod": "3.25.76", + "@mendable/firecrawl-js": "^4.18.1" }, "devDependencies": { "@types/bun": "1.3.11", diff --git a/src/tools/WebFetchTool/WebFetchTool.ts b/src/tools/WebFetchTool/WebFetchTool.ts index b439f446..258e1c2b 100644 --- a/src/tools/WebFetchTool/WebFetchTool.ts +++ 
b/src/tools/WebFetchTool/WebFetchTool.ts
@@ -21,6 +21,26 @@ import {
   MAX_MARKDOWN_LENGTH,
 } from './utils.js'
 
+/** Whether a Firecrawl API key is present in the environment. */
+function isFirecrawlEnabled(): boolean {
+  return Boolean(process.env.FIRECRAWL_API_KEY)
+}
+
+/**
+ * Scrapes `url` through Firecrawl and returns the page as markdown.
+ *
+ * @param url - Page to scrape.
+ * @returns The markdown (empty string when the response carries none) and
+ *   its size in bytes.
+ */
+async function scrapeWithFirecrawl(url: string): Promise<{ markdown: string; bytes: number }> {
+  const { FirecrawlClient } = await import('@mendable/firecrawl-js')
+  const app = new FirecrawlClient({ apiKey: process.env.FIRECRAWL_API_KEY! })
+  const result = await app.scrape(url, { formats: ['markdown'] })
+  const markdown = (result as { markdown?: string }).markdown ?? ''
+  return { markdown, bytes: Buffer.byteLength(markdown) }
+}
+
 const inputSchema = lazySchema(() =>
   z.strictObject({
     url: z.string().url().describe('The URL to fetch content from'),
@@ -211,6 +231,32 @@ ${DESCRIPTION}`
   ) {
     const start = Date.now()
 
+    if (isFirecrawlEnabled()) {
+      const { markdown, bytes } = await scrapeWithFirecrawl(url)
+      // An empty scrape means Firecrawl produced nothing usable for this
+      // page; fall through to the built-in fetch instead of reporting an
+      // empty result as a successful 200.
+      if (markdown) {
+        const result = await applyPromptToMarkdown(
+          prompt,
+          markdown,
+          abortController.signal,
+          isNonInteractiveSession,
+          false,
+        )
+        return {
+          data: {
+            bytes,
+            code: 200,
+            codeText: 'OK',
+            result,
+            durationMs: Date.now() - start,
+            url,
+          } satisfies Output,
+        }
+      }
+    }
+
     const response = await getURLMarkdownContent(url, abortController)
 
     // Check if we got a redirect to a different host
diff --git a/src/tools/WebSearchTool/WebSearchTool.ts b/src/tools/WebSearchTool/WebSearchTool.ts
index 1410452a..e9eb373e 100644
--- a/src/tools/WebSearchTool/WebSearchTool.ts
+++ b/src/tools/WebSearchTool/WebSearchTool.ts
@@ -88,6 +88,92 @@ function makeToolSchema(input: Input): BetaWebSearchTool20250305 {
   }
 }
 
+/** Whether a Firecrawl API key is present in the environment. */
+function isFirecrawlEnabled(): boolean {
+  return Boolean(process.env.FIRECRAWL_API_KEY)
+}
+
+/**
+ * Whether searches should be routed through Firecrawl: a key must be set and
+ * the active provider must not already have a native search backend.
+ */
+function shouldUseFirecrawl(): boolean {
+  if (!isFirecrawlEnabled()) return false
+  // Don't override native search on providers that already have it
+  if (isCodexResponsesWebSearchEnabled()) return false
+  const provider = getAPIProvider()
+  if (provider === 'firstParty' || provider === 'vertex' || provider === 'foundry') return false
+  return true
+}
+
+/**
+ * Returns true when the hostname of `url` is `domain` itself or a subdomain
+ * of it. URLs that cannot be parsed never match.
+ */
+function urlMatchesDomain(url: string, domain: string): boolean {
+  try {
+    const host = new URL(url).hostname.toLowerCase()
+    const target = domain.toLowerCase()
+    // Require a label boundary so "notexample.com" does not match "example.com"
+    return host === target || host.endsWith(`.${target}`)
+  } catch {
+    return false
+  }
+}
+
+/**
+ * Runs a web search through Firecrawl and shapes the response as `Output`.
+ *
+ * `blocked_domains` are sent as `-site:` operators and also enforced locally;
+ * `allowed_domains` are enforced locally. Links and snippets are both built
+ * from the same filtered result list.
+ */
+async function runFirecrawlSearch(input: Input): Promise<Output> {
+  const startTime = performance.now()
+  const { FirecrawlClient } = await import('@mendable/firecrawl-js')
+  const app = new FirecrawlClient({ apiKey: process.env.FIRECRAWL_API_KEY! })
+
+  const allowed = input.allowed_domains ?? []
+  const blocked = input.blocked_domains ?? []
+
+  let query = input.query
+  if (blocked.length) {
+    const exclusions = blocked.map(d => `-site:${d}`).join(' ')
+    query = `${query} ${exclusions}`
+  }
+
+  const data = await app.search(query, { limit: 10 })
+
+  // Filter once so hits and snippets cannot disagree about which domains
+  // made it through.
+  const web = (data.web ?? []).filter((r: { url: string }) => {
+    if (blocked.some(d => urlMatchesDomain(r.url, d))) return false
+    return allowed.length === 0 || allowed.some(d => urlMatchesDomain(r.url, d))
+  })
+
+  const hits = web.map((r: { url: string; title?: string }) => ({
+    title: r.title ?? r.url,
+    url: r.url,
+  }))
+
+  const snippets = web
+    .filter((r: { description?: string }) => r.description)
+    .map((r: { url: string; title?: string; description?: string }) =>
+      `**${r.title ?? r.url}** — ${r.description} (${r.url})`,
+    )
+    .join('\n')
+
+  const results: Output['results'] = []
+  if (snippets) results.push(snippets)
+  results.push({ tool_use_id: 'firecrawl-search', content: hits })
+
+  return {
+    query: input.query,
+    results,
+    durationSeconds: (performance.now() - startTime) / 1000,
+  }
+}
+
 function isCodexResponsesWebSearchEnabled(): boolean {
   if (getAPIProvider() !== 'openai') {
     return false
@@ -378,6 +464,10 @@ export const WebSearchTool = buildTool({
     return summary ? `Searching for ${summary}` : 'Searching the web'
   },
   isEnabled() {
+    if (shouldUseFirecrawl()) {
+      return true
+    }
+
     const provider = getAPIProvider()
     const model = getMainLoopModel()
 
@@ -437,7 +527,7 @@ export const WebSearchTool = buildTool({
     }
   },
   async prompt() {
-    if (isCodexResponsesWebSearchEnabled()) {
+    if (shouldUseFirecrawl() || isCodexResponsesWebSearchEnabled()) {
       return getWebSearchPrompt().replace(
         /\n\s*-\s*Web search is only available in the US/,
         '',
@@ -474,6 +564,10 @@ export const WebSearchTool = buildTool({
     return { result: true }
   },
   async call(input, context, _canUseTool, _parentMessage, onProgress) {
+    if (shouldUseFirecrawl()) {
+      return { data: await runFirecrawlSearch(input) }
+    }
+
     if (isCodexResponsesWebSearchEnabled()) {
       return {
         data: await runCodexWebSearch(input, context.abortController.signal),