diff --git a/.env.example b/.env.example index 6f9f3b03..6565e101 100644 --- a/.env.example +++ b/.env.example @@ -248,3 +248,93 @@ ANTHROPIC_API_KEY=sk-ant-your-key-here # Enable debug logging # CLAUDE_DEBUG=1 + + +# ============================================================================= +# WEB SEARCH (OPTIONAL) +# ============================================================================= +# OpenClaude includes a web search tool. By default it uses DuckDuckGo (free) +# or the provider's native search (Anthropic firstParty / vertex). +# +# Set one API key below to enable a provider. That's it. + +# ── Provider API keys — set ONE of these ──────────────────────────── + +# Tavily (AI-optimized search, recommended) +# TAVILY_API_KEY=tvly-your-key-here + +# Exa (neural/semantic search) +# EXA_API_KEY=your-exa-key-here + +# You.com (RAG-ready snippets) +# YOU_API_KEY=your-you-key-here + +# Jina (s.jina.ai endpoint) +# JINA_API_KEY=your-jina-key-here + +# Bing Web Search +# BING_API_KEY=your-bing-key-here + +# Mojeek (privacy-focused) +# MOJEEK_API_KEY=your-mojeek-key-here + +# Linkup +# LINKUP_API_KEY=your-linkup-key-here + +# Firecrawl (premium, uses @mendable/firecrawl-js) +# FIRECRAWL_API_KEY=fc-your-key-here + +# ── Provider selection mode ───────────────────────────────────────── +# +# WEB_SEARCH_PROVIDER controls fallback behavior: +# +# "auto" (default) — try all configured providers, fall through on failure +# "custom" — custom API only, throw on failure (NOT in auto chain) +# "firecrawl" — firecrawl only +# "tavily" — tavily only +# "exa" — exa only +# "you" — you.com only +# "jina" — jina only +# "bing" — bing only +# "mojeek" — mojeek only +# "linkup" — linkup only +# "ddg" — duckduckgo only +# "native" — anthropic native / codex only +# +# Auto mode priority: firecrawl → tavily → exa → you → jina → bing → mojeek → +# linkup → ddg +# Note: "custom" is NOT in the auto chain. 
To use the custom API provider, +# you must explicitly set WEB_SEARCH_PROVIDER=custom. +# +# WEB_SEARCH_PROVIDER=auto + +# ── Built-in custom API presets ───────────────────────────────────── +# +# Use with WEB_KEY for the API key: +# WEB_PROVIDER=searxng|google|brave|serpapi +# WEB_KEY=your-api-key-here + +# ── Custom API endpoint (advanced) ────────────────────────────────── +# +# WEB_SEARCH_API — base URL of your search endpoint +# WEB_QUERY_PARAM — query parameter name (default: "q") +# WEB_METHOD — GET or POST (default: GET) +# WEB_PARAMS — extra static query params as JSON: {"lang":"en","count":"10"} +# WEB_URL_TEMPLATE — URL template with {query} for path embedding +# WEB_BODY_TEMPLATE — custom POST body with {query} placeholder +# WEB_AUTH_HEADER — header name for API key (default: "Authorization") +# WEB_AUTH_SCHEME — prefix before key (default: "Bearer") +# WEB_HEADERS — extra headers as "Name: value; Name2: value2" +# WEB_JSON_PATH — dot-path to results array in response + +# ── Custom API security guardrails ────────────────────────────────── +# +# The custom provider enforces security guardrails by default. +# Override these only if you understand the risks. +# +# WEB_CUSTOM_TIMEOUT_SEC=15 — request timeout in seconds (default 15) +# WEB_CUSTOM_MAX_BODY_KB=300 — max POST body size in KB (default 300) +# WEB_CUSTOM_ALLOW_ARBITRARY_HEADERS=false — set "true" to use non-standard headers +# WEB_CUSTOM_ALLOW_HTTP=false — set "true" to allow http:// URLs +# WEB_CUSTOM_ALLOW_PRIVATE=false — set "true" to target localhost/private IPs +# (needed for self-hosted SearXNG) diff --git a/src/tools/WebSearchTool/README_SEARCH_PROVIDERS.md b/src/tools/WebSearchTool/README_SEARCH_PROVIDERS.md new file mode 100644 index 00000000..2a992f82 --- /dev/null +++ b/src/tools/WebSearchTool/README_SEARCH_PROVIDERS.md @@ -0,0 +1,518 @@ +# Web Search Providers + +OpenClaude supports multiple search backends through a provider adapter system. 
+ +## Supported Providers + +| Provider | Env Var | Auth Header | Method | +|---|---|---|---| +| Custom API | `WEB_SEARCH_API` | Configurable | GET/POST | +| SearXNG | `WEB_PROVIDER=searxng` | — | GET | +| Google | `WEB_PROVIDER=google` | `Authorization: Bearer` | GET | +| Brave | `WEB_PROVIDER=brave` | `X-Subscription-Token` | GET | +| SerpAPI | `WEB_PROVIDER=serpapi` | `Authorization: Bearer` | GET | +| Firecrawl | `FIRECRAWL_API_KEY` | Internal | SDK | +| Tavily | `TAVILY_API_KEY` | `Authorization: Bearer` | POST | +| Exa | `EXA_API_KEY` | `x-api-key` | POST | +| You.com | `YOU_API_KEY` | `X-API-Key` | GET | +| Jina | `JINA_API_KEY` | `Authorization: Bearer` | GET | +| Bing | `BING_API_KEY` | `Ocp-Apim-Subscription-Key` | GET | +| Mojeek | `MOJEEK_API_KEY` | `Authorization: Bearer` | GET | +| Linkup | `LINKUP_API_KEY` | `Authorization: Bearer` | POST | +| DuckDuckGo | *(default)* | — | SDK | + +## Quick Start + +```bash +# Tavily (recommended for AI — fast, RAG-ready) +export TAVILY_API_KEY=tvly-your-key + +# Exa (neural search, semantic queries) +export EXA_API_KEY=your-exa-key + +# Brave (traditional web search, good coverage) +export WEB_PROVIDER=brave +export WEB_KEY=your-brave-key + +# Bing +export BING_API_KEY=your-bing-key + +# Self-hosted SearXNG (free, private) +export WEB_PROVIDER=searxng +export WEB_SEARCH_API=https://search.example.com/search +``` + +## Provider Selection Mode + +`WEB_SEARCH_PROVIDER` controls fallback behavior: + +| Mode | Behavior | +|---|---| +| `auto` (default) | Try all configured providers in order, fall through on failure | +| `tavily` | Tavily only — throws on failure | +| `exa` | Exa only — throws on failure | +| `custom` | Custom API only — throws on failure. 
**Not in the auto chain** — must be explicitly selected | +| `firecrawl` | Firecrawl only — throws on failure | +| `ddg` | DuckDuckGo only — throws on failure | +| `native` | Anthropic native / Codex only | + +**Auto mode priority:** firecrawl → tavily → exa → you → jina → bing → mojeek → linkup → ddg + +> **Note:** The `custom` provider is excluded from the `auto` chain. It is only used when `WEB_SEARCH_PROVIDER=custom` is explicitly set. This prevents the generic outbound provider from silently becoming the default backend. + +```bash +# Fail loudly if Tavily is down (don't silently switch backends) +export WEB_SEARCH_PROVIDER=tavily + +# Try everything, fall through gracefully +export WEB_SEARCH_PROVIDER=auto +``` + +## Provider Request & Response Formats + +### Tavily + +```bash +export TAVILY_API_KEY=tvly-your-key +``` + +**Request:** +``` +POST https://api.tavily.com/search +Authorization: Bearer tvly-your-key +Content-Type: application/json + +{"query": "search terms", "max_results": 10, "include_answer": false} +``` + +**Response:** +```json +{ + "results": [ + { + "title": "Result Title", + "url": "https://example.com/page", + "content": "Full text snippet from the page...", + "score": 0.95 + } + ] +} +``` + +### Exa + +```bash +export EXA_API_KEY=your-exa-key +``` + +**Request:** +``` +POST https://api.exa.ai/search +x-api-key: your-exa-key +Content-Type: application/json + +{"query": "search terms", "numResults": 10, "type": "auto"} +``` + +**Response:** +```json +{ + "results": [ + { + "title": "Result Title", + "url": "https://example.com/page", + "snippet": "A short summary of the page content...", + "score": 0.89 + } + ] +} +``` + +### You.com + +```bash +export YOU_API_KEY=your-you-key +``` + +**Request:** +``` +GET https://api.ydc-index.io/v1/search?query=search+terms +X-API-Key: your-you-key +``` + +**Response:** +```json +{ + "results": { + "web": [ + { + "title": "Result Title", + "url": "https://example.com/page", + "snippets": ["First snippet 
from the page...", "Second snippet..."], + "description": "Page description" + } + ] + } +} +``` + +### Jina + +```bash +export JINA_API_KEY=your-jina-key +``` + +**Request:** +``` +GET https://s.jina.ai/?q=search+terms +Authorization: Bearer your-jina-key +Accept: application/json +``` + +**Response:** +```json +{ + "data": [ + { + "title": "Result Title", + "url": "https://example.com/page", + "description": "Snippet from the page..." + } + ] +} +``` + +### Bing + +```bash +export BING_API_KEY=your-bing-key +``` + +**Request:** +``` +GET https://api.bing.microsoft.com/v7.0/search?q=search+terms&count=10 +Ocp-Apim-Subscription-Key: your-bing-key +``` + +**Response:** +```json +{ + "webPages": { + "value": [ + { + "name": "Result Title", + "url": "https://example.com/page", + "snippet": "A short excerpt from the page...", + "displayUrl": "example.com/page" + } + ] + } +} +``` + +### Mojeek + +```bash +export MOJEEK_API_KEY=your-mojeek-key +``` + +**Request:** +``` +GET https://www.mojeek.com/search?q=search+terms&fmt=json +Authorization: Bearer your-mojeek-key +``` + +**Response:** +```json +{ + "response": { + "results": [ + { + "title": "Result Title", + "url": "https://example.com/page", + "snippet": "Excerpt from the page..." + } + ] + } +} +``` + +### Linkup + +```bash +export LINKUP_API_KEY=your-linkup-key +``` + +**Request:** +``` +POST https://api.linkup.so/v1/search +Authorization: Bearer your-linkup-key +Content-Type: application/json + +{"q": "search terms", "search_type": "standard"} +``` + +**Response:** +```json +{ + "results": [ + { + "name": "Result Title", + "url": "https://example.com/page", + "snippet": "A short description of the result..." 
+ } + ] +} +``` + +### SearXNG (Built-in Preset) + +```bash +export WEB_PROVIDER=searxng +export WEB_SEARCH_API=https://search.example.com/search +``` + +**Request:** +``` +GET https://search.example.com/search?q=search+terms +``` + +**Response:** +```json +{ + "results": [ + { + "title": "Result Title", + "url": "https://example.com/page", + "content": "Snippet from the page...", + "engine": "google" + } + ] +} +``` + +### Google Custom Search (Built-in Preset) + +```bash +export WEB_PROVIDER=google +export WEB_KEY=your-google-api-key +``` + +**Request:** +``` +GET https://www.googleapis.com/customsearch/v1?q=search+terms +Authorization: Bearer your-google-api-key +``` + +**Response:** +```json +{ + "items": [ + { + "title": "Result Title", + "link": "https://example.com/page", + "snippet": "A short excerpt...", + "displayLink": "example.com" + } + ] +} +``` + +### Brave (Built-in Preset) + +```bash +export WEB_PROVIDER=brave +export WEB_KEY=your-brave-key +``` + +**Request:** +``` +GET https://api.search.brave.com/res/v1/web/search?q=search+terms +X-Subscription-Token: your-brave-key +``` + +**Response:** +```json +{ + "web": { + "results": [ + { + "title": "Result Title", + "url": "https://example.com/page", + "description": "Page description..." + } + ] + } +} +``` + +### SerpAPI (Built-in Preset) + +```bash +export WEB_PROVIDER=serpapi +export WEB_KEY=your-serpapi-key +``` + +**Request:** +``` +GET https://serpapi.com/search.json?q=search+terms +Authorization: Bearer your-serpapi-key +``` + +**Response:** +```json +{ + "organic_results": [ + { + "title": "Result Title", + "link": "https://example.com/page", + "snippet": "A short excerpt...", + "displayed_link": "example.com" + } + ] +} +``` + +### DuckDuckGo (Default Fallback) + +No configuration needed. Uses the `duck-duck-scrape` npm package. 
+ +```bash +# Set as explicit-only backend +export WEB_SEARCH_PROVIDER=ddg +``` + +--- + +## Custom API Configuration + +### Standard GET + +``` +GET https://api.example.com/search?q=hello +``` + +```bash +export WEB_SEARCH_API=https://api.example.com/search +export WEB_QUERY_PARAM=q +``` + +### Query in URL Path + +``` +GET https://api.example.com/v2/search/hello +``` + +```bash +export WEB_URL_TEMPLATE=https://api.example.com/v2/search/{query} +``` + +### POST with Custom Body + +``` +POST https://api.example.com/v1/query +Content-Type: application/json + +{"input": {"text": "hello"}} +``` + +```bash +export WEB_SEARCH_API=https://api.example.com/v1/query +export WEB_METHOD=POST +export WEB_BODY_TEMPLATE='{"input":{"text":"{query}"}}' +``` + +### Extra Static Params + +```bash +export WEB_PARAMS='{"lang":"en","count":"10"}' +``` + +## Auth + +API keys are sent in HTTP headers, **never** in query strings. + +```bash +# Default: Authorization: Bearer +export WEB_KEY=your-key + +# Custom header +export WEB_AUTH_HEADER=X-Api-Key +export WEB_AUTH_SCHEME="" + +# Extra headers (names must be on the header allowlist unless WEB_CUSTOM_ALLOW_ARBITRARY_HEADERS=true) +export WEB_HEADERS="X-Tenant-Id: acme; Accept: application/json" +``` + +## Response Parsing + +The tool auto-detects many response formats: + +```jsonc +{ "results": [{ "title": "...", "url": "..." }] } // flat array +{ "items": [{ "title": "...", "link": "..." }] } // Google-style +{ "results": { "engine": [{ "title": "...", "url": "..." }] } } // nested map +[{ "title": "...", "url": "..." }] // bare array +``` + +Field name aliases: `title`/`headline`/`name`, `url`/`link`/`href`, `description`/`snippet`/`content` + +For deeply nested responses: +```bash +export WEB_JSON_PATH=response.payload.results +``` + +## Retry + +Failed requests (network errors, 5xx) are retried once after 500ms. Client errors (4xx) are not retried. Custom requests have a default 15s timeout. 
+ +## Custom Provider Security Guardrails + +The custom provider enforces the following guardrails by default: + +| Guardrail | Default | Override | +|-----------|---------|----------| +| HTTPS-only | ✅ | `WEB_CUSTOM_ALLOW_HTTP=true` | +| Block private IPs / localhost | ✅ | `WEB_CUSTOM_ALLOW_PRIVATE=true` | +| Header allowlist | ✅ | `WEB_CUSTOM_ALLOW_ARBITRARY_HEADERS=true` | +| Max POST body | 300 KB | `WEB_CUSTOM_MAX_BODY_KB=<kilobytes>` | +| Request timeout | 15s | `WEB_CUSTOM_TIMEOUT_SEC=<seconds>` | +| Audit log (one-time warning) | ✅ | — | + +### Self-hosted SearXNG example + +```bash +export WEB_PROVIDER=searxng +export WEB_SEARCH_API=https://search.mydomain.com/search +export WEB_CUSTOM_ALLOW_PRIVATE=true # needed if SearXNG is on a private IP +``` + +### Header allowlist + +By default only these headers are permitted: +`accept`, `accept-encoding`, `accept-language`, `authorization`, `cache-control`, `content-type`, `if-modified-since`, `if-none-match`, `ocp-apim-subscription-key`, `user-agent`, `x-api-key`, `x-subscription-token`, `x-tenant-id` + +## Adding a Provider + +1. Create `providers/myprovider.ts`: + +```typescript +import type { SearchInput, SearchProvider } from './types.js' +import { applyDomainFilters, type ProviderOutput } from './types.js' + +export const myProvider: SearchProvider = { + name: 'myprovider', + isConfigured() { return Boolean(process.env.MYPROVIDER_API_KEY) }, + async search(input: SearchInput): Promise<ProviderOutput> { + const start = performance.now() + // ... call API, map to SearchHit[] ... + return { + hits: applyDomainFilters(hits, input), + providerName: 'myprovider', + durationSeconds: (performance.now() - start) / 1000, + } + }, +} +``` + +2. Register in `providers/index.ts` — add import and push to `ALL_PROVIDERS`. 
/**
 * Convert an adapter's `ProviderOutput` into the tool's `Output` shape.
 *
 * The `results` list holds, in this order:
 *   1. one markdown string with a line per hit that has a description
 *      (left out when no hit has one);
 *   2. one link block listing the title and URL of every hit, tagged
 *      `<providerName>-search` (left out when there are no hits).
 * When both are left out, the list is the single string 'No results found.'.
 *
 * @param po - hits, provider name and timing reported by the adapter
 * @param query - the query string, echoed back unchanged
 * @returns the query, the formatted results and the adapter's duration
 */
function formatProviderOutput(po: ProviderOutput, query: string): Output {
  // One summary line per hit that actually carries a description.
  const summaryLines: string[] = []
  for (const hit of po.hits) {
    if (hit.description) {
      summaryLines.push(`**${hit.title}** — ${hit.description} (${hit.url})`)
    }
  }
  const summary = summaryLines.join('\n')

  // Every hit, described or not, appears in the link block.
  const links = po.hits.map(hit => ({ title: hit.title, url: hit.url }))

  const results: (SearchResult | string)[] = []
  if (summary) {
    results.push(summary)
  }
  if (links.length > 0) {
    results.push({
      tool_use_id: `${po.providerName}-search`,
      content: links,
    })
  }
  if (results.length === 0) {
    results.push('No results found.')
  }

  return { query, results, durationSeconds: po.durationSeconds }
}
Boolean(process.env.FIRECRAWL_API_KEY) -} - -function shouldUseFirecrawl(): boolean { - if (!isFirecrawlEnabled()) return false - // Don't override native search on providers that already have it - if (isCodexResponsesWebSearchEnabled()) return false - const provider = getAPIProvider() - if (provider === 'firstParty' || provider === 'vertex' || provider === 'foundry') return false - return true -} - function isClaudeModel(model: string): boolean { return /claude/i.test(model) } -function shouldUseDuckDuckGo(): boolean { - if (isCodexResponsesWebSearchEnabled()) return false - - const provider = getAPIProvider() - // Don't override providers/models that have native web search support. - if (provider === 'firstParty' || provider === 'vertex' || provider === 'foundry') { - return false - } - - // Use free DDG search for non-Claude models by default. - return !isClaudeModel(getMainLoopModel()) -} - -async function runDuckDuckGoSearch(input: Input): Promise { - const startTime = performance.now() - - try { - const { search } = await import('duck-duck-scrape') - - const response = await search(input.query, { - safeSearch: 0, - }) - - let hits = response.results.map(r => ({ - title: r.title || r.url, - url: r.url, - snippet: r.description, - })) - - if (input.blocked_domains?.length) { - hits = hits.filter(h => { - try { - const host = new URL(h.url).hostname - return !input.blocked_domains!.some(d => host.endsWith(d)) - } catch { - return false - } - }) - } - - if (input.allowed_domains?.length) { - hits = hits.filter(h => { - try { - const host = new URL(h.url).hostname - return input.allowed_domains!.some(d => host.endsWith(d)) - } catch { - return false - } - }) - } - - const snippets = hits - .filter(h => h.snippet) - .map(h => `**${h.title}** — ${h.snippet} (${h.url})`) - .join('\n') - - const results: Output['results'] = [] - if (snippets) results.push(snippets) - results.push({ - tool_use_id: 'duckduckgo-search', - content: hits.map(({ title, url }) => ({ title, 
url })), - }) - - return { - query: input.query, - results, - durationSeconds: (performance.now() - startTime) / 1000, - } - } catch (error) { - const message = error instanceof Error ? error.message : String(error) - const isRateLimited = - message.includes('429') || - message.includes('rate') || - message.includes('CAPTCHA') || - message.includes('blocked') - - if (isRateLimited && isFirecrawlEnabled()) { - return runFirecrawlSearch(input) - } - - return { - query: input.query, - results: [ - 'Web search temporarily unavailable — try again or add a Firecrawl API key for reliable results.', - ], - durationSeconds: (performance.now() - startTime) / 1000, - } - } -} - -async function runFirecrawlSearch(input: Input): Promise { - const startTime = performance.now() - const { FirecrawlClient } = await import('@mendable/firecrawl-js') - const app = new FirecrawlClient({ apiKey: process.env.FIRECRAWL_API_KEY! }) - - let query = input.query - if (input.blocked_domains?.length) { - const exclusions = input.blocked_domains.map(d => `-site:${d}`).join(' ') - query = `${query} ${exclusions}` - } - - const data = await app.search(query, { limit: 10 }) - - let hits = (data.web ?? []).map((r: { url: string; title?: string }) => ({ - title: r.title ?? r.url, - url: r.url, - })) - - if (input.allowed_domains?.length) { - hits = hits.filter(h => - input.allowed_domains!.some(d => { - try { - return new URL(h.url).hostname.endsWith(d) - } catch { - return false - } - }), - ) - } - - const snippets = (data.web ?? []) - .filter((r: { description?: string }) => r.description) - .map((r: { url: string; title?: string; description?: string }) => - `**${r.title ?? 
/**
 * Decide whether a search should be routed through the adapter-based
 * providers instead of a native backend.
 *
 * - `native` mode: never.
 * - Any explicit adapter mode (tavily, ddg, custom, ...): always.
 * - `auto` mode: native paths win. The adapters are used only when the
 *   Codex Responses search is not enabled, the API provider is not
 *   firstParty / vertex / foundry, and at least one adapter is available.
 */
function shouldUseAdapterProvider(): boolean {
  switch (getProviderMode()) {
    case 'native':
      return false
    case 'auto':
      break
    default:
      // Explicit adapter mode (tavily, ddg, custom, etc.)
      return true
  }

  // Auto mode from here on: Codex is checked first, then the API provider.
  if (isCodexResponsesWebSearchEnabled()) {
    return false
  }

  const apiProvider = getAPIProvider()
  const hasNativeSearch =
    apiProvider === 'firstParty' ||
    apiProvider === 'vertex' ||
    apiProvider === 'foundry'
  if (hasNativeSearch) {
    return false
  }

  // No native path available — use the adapters if any is available.
  return getAvailableProviders().length > 0
}
`Searching for ${summary}` : 'Searching the web' }, isEnabled() { - if (shouldUseFirecrawl()) { - return true + const mode = getProviderMode() + + // Specific provider mode: enabled if any adapter is configured + if (mode !== 'auto' && mode !== 'native') { + return getAvailableProviders().length > 0 } - if (shouldUseDuckDuckGo()) { - return true - } + // Auto/native mode: check all paths + if (getAvailableProviders().length > 0) return true + if (isCodexResponsesWebSearchEnabled()) return true const provider = getAPIProvider() const model = getMainLoopModel() - if (isCodexResponsesWebSearchEnabled()) { - return true - } - // Enable for firstParty if (provider === 'firstParty') { return true @@ -601,11 +520,8 @@ export const WebSearchTool = buildTool({ } }, async prompt() { - if ( - shouldUseDuckDuckGo() || - shouldUseFirecrawl() || - isCodexResponsesWebSearchEnabled() - ) { + // Strip "US only" when using non-native backends + if (shouldUseAdapterProvider() || isCodexResponsesWebSearchEnabled()) { return getWebSearchPrompt().replace( /\n\s*-\s*Web search is only available in the US/, '', @@ -642,20 +558,30 @@ export const WebSearchTool = buildTool({ return { result: true } }, async call(input, context, _canUseTool, _parentMessage, onProgress) { - if (shouldUseFirecrawl()) { - return { data: await runFirecrawlSearch(input) } - } - - if (shouldUseDuckDuckGo()) { - return { data: await runDuckDuckGoSearch(input) } + // --- Adapter-based providers (custom, firecrawl, ddg) --- + // runSearch handles fallback semantics based on WEB_SEARCH_PROVIDER mode: + // - "auto": tries each provider, falls through on failure + // - specific mode: runs one provider, throws on failure + if (shouldUseAdapterProvider()) { + const providerOutput = await runSearch( + { + query: input.query, + allowed_domains: input.allowed_domains, + blocked_domains: input.blocked_domains, + }, + context.abortController.signal, + ) + return { data: formatProviderOutput(providerOutput, input.query) } } + 
// --- Codex / OpenAI Responses path --- if (isCodexResponsesWebSearchEnabled()) { return { data: await runCodexWebSearch(input, context.abortController.signal), } } + // --- Native Anthropic path (firstParty / vertex / foundry) --- const startTime = performance.now() const { query } = input const userMessage = createUserMessage({ @@ -715,8 +641,6 @@ export const WebSearchTool = buildTool({ if (contentBlock && contentBlock.type === 'server_tool_use') { currentToolUseId = contentBlock.id currentToolUseJson = '' - // Note: The ServerToolUseBlock doesn't contain input.query - // The actual query comes through input_json_delta events continue } } @@ -733,12 +657,10 @@ export const WebSearchTool = buildTool({ // Try to extract query from partial JSON for progress updates try { - // Look for a complete query field const queryMatch = currentToolUseJson.match( /"query"\s*:\s*"((?:[^"\\]|\\.)*)"/, ) if (queryMatch && queryMatch[1]) { - // The regex properly handles escaped characters const query = jsonParse('"' + queryMatch[1] + '"') if ( @@ -771,7 +693,6 @@ export const WebSearchTool = buildTool({ ) { const contentBlock = event.event.content_block if (contentBlock && contentBlock.type === 'web_search_tool_result') { - // Get the actual query that was used for this search const toolUseId = contentBlock.tool_use_id const actualQuery = toolUseQueries.get(toolUseId) || query const content = contentBlock.content diff --git a/src/tools/WebSearchTool/providers/bing.ts b/src/tools/WebSearchTool/providers/bing.ts new file mode 100644 index 00000000..72a0f92b --- /dev/null +++ b/src/tools/WebSearchTool/providers/bing.ts @@ -0,0 +1,47 @@ +/** + * Bing Web Search API adapter. + * GET https://api.bing.microsoft.com/v7.0/search?q=... 
/** Fields of a Bing `webPages.value[]` entry that this adapter reads. */
interface BingWebPage {
  name?: string
  url?: string
  snippet?: string
  displayUrl?: string
}

/**
 * Bing Web Search API adapter.
 *
 * Sends `GET https://api.bing.microsoft.com/v7.0/search?q=...&count=10`
 * with the key from `BING_API_KEY` in the `Ocp-Apim-Subscription-Key`
 * header, then maps `webPages.value[]` to search hits.
 */
export const bingProvider: SearchProvider = {
  name: 'bing',

  /** True when `BING_API_KEY` is set to a non-empty value. */
  isConfigured() {
    return Boolean(process.env.BING_API_KEY)
  },

  /**
   * Run one Bing query.
   *
   * @param input - query plus optional allowed/blocked domain filters
   * @param signal - optional abort signal forwarded to `fetch`
   * @returns the domain-filtered hits, provider name and elapsed seconds
   * @throws Error when `BING_API_KEY` is missing or Bing answers non-2xx
   */
  async search(input: SearchInput, signal?: AbortSignal): Promise<ProviderOutput> {
    const start = performance.now()

    // Fail with a clear message instead of sending the literal header
    // value "undefined" and surfacing an opaque 401 from Bing.
    const apiKey = process.env.BING_API_KEY
    if (!apiKey) {
      throw new Error('Bing search error: BING_API_KEY is not set')
    }

    const url = new URL('https://api.bing.microsoft.com/v7.0/search')
    url.searchParams.set('q', input.query)
    url.searchParams.set('count', '10')

    const res = await fetch(url.toString(), {
      headers: { 'Ocp-Apim-Subscription-Key': apiKey },
      signal,
    })

    if (!res.ok) {
      // Reading the body is best-effort; an unreadable body must not mask the status.
      throw new Error(`Bing search error ${res.status}: ${await res.text().catch(() => '')}`)
    }

    // The payload is untrusted JSON: describe only the fields read below and
    // tolerate a missing `webPages` section (Bing omits it when nothing matches).
    const data = (await res.json()) as { webPages?: { value?: BingWebPage[] } } | null
    const pages = data?.webPages?.value ?? []
    const hits = pages.map(page => ({
      title: page.name ?? '',
      url: page.url ?? '',
      description: page.snippet,
      source: page.displayUrl,
    }))

    return {
      hits: applyDomainFilters(hits, input),
      providerName: 'bing',
      durationSeconds: (performance.now() - start) / 1000,
    }
  },
}
() => { + const data = { items: [{ title: 'T', link: 'https://ex.com' }] } + const hits = extractHits(data) + expect(hits).toHaveLength(1) + expect(hits[0].url).toBe('https://ex.com') + }) + + test('extracts from data array', () => { + const data = { data: [{ title: 'T', url: 'https://ex.com' }] } + const hits = extractHits(data) + expect(hits).toHaveLength(1) + }) + + test('extracts from bare array', () => { + const data = [{ title: 'T', url: 'https://ex.com' }] + const hits = extractHits(data) + expect(hits).toHaveLength(1) + }) + + test('extracts from nested map (e.g. web.results)', () => { + const data = { + web: { + results: [{ title: 'T', url: 'https://ex.com' }], + }, + } + const hits = extractHits(data) + expect(hits).toHaveLength(1) + }) + + test('extracts with explicit jsonPath', () => { + const data = { + response: { + payload: [{ title: 'T', url: 'https://ex.com' }], + }, + } + const hits = extractHits(data, 'response.payload') + expect(hits).toHaveLength(1) + }) + + test('returns empty for empty object', () => { + expect(extractHits({})).toHaveLength(0) + }) + + test('returns empty for null', () => { + expect(extractHits(null)).toHaveLength(0) + }) + + test('returns empty for no array keys', () => { + expect(extractHits({ status: 'ok', count: 5 })).toHaveLength(0) + }) + + test('filters out hits with no title and no url', () => { + const data = { + results: [ + { title: 'Valid', url: 'https://ex.com' }, + { description: 'no title or url' }, + ], + } + const hits = extractHits(data) + expect(hits).toHaveLength(1) + }) + + test('extracts from organic_results (SerpAPI-style)', () => { + const data = { + organic_results: [{ title: 'T', link: 'https://ex.com' }], + } + const hits = extractHits(data) + expect(hits).toHaveLength(1) + }) +}) diff --git a/src/tools/WebSearchTool/providers/custom.ts b/src/tools/WebSearchTool/providers/custom.ts new file mode 100644 index 00000000..bc717733 --- /dev/null +++ b/src/tools/WebSearchTool/providers/custom.ts @@ -0,0 
/**
 * Request/response recipe for one built-in backend selectable through
 * WEB_PROVIDER.
 *
 * NOTE(review): the fields look like per-preset defaults for the matching
 * WEB_* settings (WEB_QUERY_PARAM, WEB_METHOD, WEB_AUTH_HEADER,
 * WEB_AUTH_SCHEME, WEB_JSON_PATH) — confirm against the code that consumes
 * the preset.
 */
interface ProviderPreset {
  /** Endpoint URL for the preset. */
  urlTemplate: string
  /** Name of the query-string parameter that carries the search terms. */
  queryParam: string
  method?: string
  /** Header name used to send the API key. */
  authHeader?: string
  /** Prefix placed before the key in the auth header (e.g. "Bearer"). */
  authScheme?: string
  /** Dot-path to the results array in the response. */
  jsonPath?: string
  /** Maps the provider's raw JSON payload to normalized hits. */
  // Payloads are untyped third-party JSON, hence `any`; adapters must
  // tolerate missing sections.
  responseAdapter?: (data: any) => SearchHit[]
}

/**
 * Presets keyed by WEB_PROVIDER value (searxng, google, brave, serpapi).
 *
 * Every adapter returns an empty list when the payload is null/undefined
 * or lacks the expected results section, rather than throwing.
 */
const BUILT_IN_PROVIDERS: Record<string, ProviderPreset> = {
  searxng: {
    // NOTE: default uses https://localhost — users must override WEB_SEARCH_API
    // for their actual instance. The http:// default was intentionally removed
    // to comply with the HTTPS-only guardrail.
    urlTemplate: 'https://localhost:8080/search',
    queryParam: 'q',
    jsonPath: 'results',
    responseAdapter(data: any) {
      return (data?.results ?? []).map((r: any) => ({
        // SearXNG entries may lack a title; fall back to the URL.
        title: r.title ?? r.url,
        url: r.url,
        description: r.content,
        source: r.engine ?? r.source,
      }))
    },
  },
  google: {
    urlTemplate: 'https://www.googleapis.com/customsearch/v1',
    queryParam: 'q',
    authHeader: 'Authorization',
    authScheme: 'Bearer',
    responseAdapter(data: any) {
      return (data?.items ?? []).map((r: any) => ({
        title: r.title ?? '',
        url: r.link ?? '',
        description: r.snippet,
        source: r.displayLink,
      }))
    },
  },
  brave: {
    urlTemplate: 'https://api.search.brave.com/res/v1/web/search',
    queryParam: 'q',
    authHeader: 'X-Subscription-Token',
    responseAdapter(data: any) {
      return (data?.web?.results ?? []).map((r: any) => ({
        title: r.title ?? '',
        url: r.url ?? '',
        description: r.description,
        source: safeHostname(r.url),
      }))
    },
  },
  serpapi: {
    urlTemplate: 'https://serpapi.com/search.json',
    queryParam: 'q',
    authHeader: 'Authorization',
    authScheme: 'Bearer',
    responseAdapter(data: any) {
      return (data?.organic_results ?? []).map((r: any) => ({
        title: r.title ?? '',
        url: r.link ?? '',
        description: r.snippet,
        source: r.displayed_link,
      }))
    },
  },
}
/**
 * Hostname patterns for loopback, private, link-local and otherwise reserved
 * targets that a search adapter must not reach (SSRF mitigation).
 *
 * Patterns are tested against a lower-cased hostname with any trailing dot
 * removed. IPv6 literals keep the square brackets that `URL.hostname` emits.
 *
 * This is a hostname-level check only: a public DNS name that resolves to a
 * private address is NOT caught here.
 */
const BLOCKED_HOSTNAME_PATTERNS: readonly RegExp[] = [
  /^localhost$/i,
  /\.localhost$/, // any name under the reserved "localhost." zone
  /^127\.\d+\.\d+\.\d+$/,
  /^10\.\d+\.\d+\.\d+$/,
  /^172\.(1[6-9]|2\d|3[01])\.\d+\.\d+$/,
  /^192\.168\.\d+\.\d+$/,
  /^169\.254\.\d+\.\d+$/, // IPv4 link-local, includes the 169.254.169.254 metadata endpoint
  /^0\.0\.0\.0$/,
  /^\[::1?\]$/i, // [::1] or [::]
  /^\[f[cd][0-9a-f]{2}:/, // IPv6 unique-local fc00::/7
  /^\[fe[89ab][0-9a-f]:/, // IPv6 link-local fe80::/10
  /^0x[0-9a-f]+$/i, // hex-encoded IPs
]

/**
 * Report whether `hostname` names a loopback / private / link-local target.
 *
 * The hostname is normalised first so trivial spellings cannot slip past the
 * pattern table: case is folded, a trailing root dot (`localhost.`) is
 * dropped, and IPv4-mapped IPv6 literals (`[::ffff:7f00:1]`, which is how the
 * URL parser serialises `[::ffff:127.0.0.1]`) are unwrapped to the dotted
 * IPv4 address they denote.
 *
 * @param hostname Host as returned by `URL.hostname` (IPv6 in brackets).
 * @returns `true` when the host matches a blocked pattern.
 */
function isPrivateHostname(hostname: string): boolean {
  let host = hostname.toLowerCase().replace(/\.$/, '')

  const mappedHex = /^\[::ffff:([0-9a-f]{1,4}):([0-9a-f]{1,4})\]$/.exec(host)
  const mappedDotted = /^\[::ffff:(\d+\.\d+\.\d+\.\d+)\]$/.exec(host)
  if (mappedHex) {
    // Two 16-bit groups carry the four IPv4 octets.
    const [, hiHex = '0', loHex = '0'] = mappedHex
    const hi = parseInt(hiHex, 16)
    const lo = parseInt(loHex, 16)
    host = `${hi >> 8}.${hi & 0xff}.${lo >> 8}.${lo & 0xff}`
  } else if (mappedDotted) {
    host = mappedDotted[1] ?? host
  }

  return BLOCKED_HOSTNAME_PATTERNS.some(re => re.test(host))
}
/**
 * Build the authentication header for the custom provider from `WEB_KEY`.
 *
 * Resolution order for both the header name and the scheme prefix is:
 * explicit env var (`WEB_AUTH_HEADER` / `WEB_AUTH_SCHEME`), then the preset,
 * then a default.
 *
 * The default scheme is `Bearer` only when the header is `Authorization`.
 * Key-style headers such as `X-Subscription-Token` or `X-API-Key` carry the
 * raw key, so they get no prefix unless one is configured explicitly. The
 * previous unconditional `Bearer` default made the Brave preset send
 * `X-Subscription-Token: Bearer <key>`.
 *
 * @param preset Optional preset; only `authHeader` and `authScheme` are read
 *   (the parameter type is the structural subset of the preset shape).
 * @returns A single-entry header map, or `{}` when `WEB_KEY` is unset.
 */
function buildAuthHeadersForPreset(
  preset?: { authHeader?: string; authScheme?: string },
): Record<string, string> {
  const apiKey = process.env.WEB_KEY
  if (!apiKey) return {}

  const headerName = process.env.WEB_AUTH_HEADER ?? preset?.authHeader ?? 'Authorization'
  const defaultScheme = headerName.toLowerCase() === 'authorization' ? 'Bearer' : ''
  const scheme = process.env.WEB_AUTH_SCHEME ?? preset?.authScheme ?? defaultScheme

  // trim() removes the leading space left behind by an empty scheme.
  return { [headerName]: `${scheme} ${apiKey}`.trim() }
}
/**
 * Assemble the outbound request for the custom search provider.
 *
 * Steps: resolve config, substitute `{query}` into the URL template, run the
 * URL guardrails, merge static params, build headers (auth + allow-listed
 * `WEB_HEADERS`), and for POST build the JSON body.
 *
 * Fixes relative to the previous version:
 * - A missing endpoint or malformed URL now fails with a descriptive error
 *   instead of the bare `TypeError` thrown by `new URL()`; validation runs
 *   before parsing.
 * - `WEB_METHOD` is restricted to the GET / POST that the adapter supports.
 * - The query is JSON-escaped before it is spliced into `WEB_BODY_TEMPLATE`
 *   (the body is always sent as `application/json`), and a replacer function
 *   is used so `$&`, `$1`, `$$` in the query are not interpreted as
 *   replacement patterns.
 * - The body-size guard also covers the default (non-template) body.
 *
 * @param query Raw user search query.
 * @returns The final URL, the `fetch` init object, and the resolved config.
 * @throws Error on guardrail violations, a disallowed header, an unsupported
 *   method, a missing endpoint, or an oversized POST body.
 */
function buildRequest(query: string) {
  const config = resolveConfig()
  const method = config.method.toUpperCase()
  if (method !== 'GET' && method !== 'POST') {
    throw new Error(`Custom search WEB_METHOD must be GET or POST (got "${config.method}").`)
  }

  // --- URL ---
  const rawTemplate = config.urlTemplate
  if (!rawTemplate) {
    throw new Error(
      'Custom search has no endpoint URL. Set WEB_SEARCH_API or WEB_URL_TEMPLATE, ' +
        'or set WEB_PROVIDER to a built-in preset (searxng, google, brave, serpapi).',
    )
  }
  const hasQueryPlaceholder = rawTemplate.includes('{query}')
  const encodedQuery = encodeURIComponent(query)
  const templateWithQuery = rawTemplate.replace(/\{query\}/g, () => encodedQuery)

  // --- Security validation ---
  // Runs before `new URL()` so a malformed URL surfaces validateUrl's message.
  // Query params added below cannot change the protocol or host checked here.
  validateUrl(templateWithQuery)
  const url = new URL(templateWithQuery)

  // Merge extra static params (values from JSON may be numbers/booleans).
  for (const [k, v] of Object.entries(parseExtraParams())) {
    url.searchParams.set(k, String(v))
  }

  // If {query} wasn't in the template, send the query as a parameter.
  if (!hasQueryPlaceholder) {
    url.searchParams.set(config.queryParam, query)
  }

  const urlString = url.toString()
  auditLogCustomSearch(urlString)

  // --- Headers ---
  const headers: Record<string, string> = {
    ...buildAuthHeadersForPreset(config.preset),
  }

  // Merge WEB_HEADERS ("Name: value; Name2: value2") with allowlist enforcement.
  const rawExtra = process.env.WEB_HEADERS
  if (rawExtra) {
    for (const pair of rawExtra.split(';')) {
      const i = pair.indexOf(':')
      if (i <= 0) continue
      const k = pair.slice(0, i).trim()
      const v = pair.slice(i + 1).trim()
      if (!k) continue
      if (!validateHeaderName(k)) {
        throw new Error(
          `Header "${k}" is not in the safe allowlist. ` +
            `Allowed: ${[...SAFE_HEADER_NAMES].join(', ')}. ` +
            `Set WEB_CUSTOM_ALLOW_ARBITRARY_HEADERS=true to override.`,
        )
      }
      headers[k] = v
    }
  }

  const init: RequestInit = { method, headers }

  if (method === 'POST') {
    headers['Content-Type'] = 'application/json'
    const bodyTemplate = process.env.WEB_BODY_TEMPLATE
    // JSON.stringify yields a quoted JSON string; strip the quotes to get the
    // escaped content that is safe inside the template's own string literal.
    const jsonEscapedQuery = JSON.stringify(query).slice(1, -1)
    const body = bodyTemplate
      ? bodyTemplate.replace(/\{query\}/g, () => jsonEscapedQuery)
      : JSON.stringify({ [config.queryParam]: query })

    const maxBodyBytes =
      (Number(process.env.WEB_CUSTOM_MAX_BODY_KB) || DEFAULT_MAX_BODY_KB) * 1024
    if (Buffer.byteLength(body) > maxBodyBytes) {
      throw new Error(
        `POST body exceeds ${maxBodyBytes} bytes. ` +
          `Increase WEB_CUSTOM_MAX_BODY_KB if needed.`,
      )
    }
    init.body = body
  }

  return { url: urlString, init, config }
}
/**
 * Fetch `url` and parse the JSON response, with a per-attempt timeout and
 * one automatic retry.
 *
 * - Timeout: `WEB_CUSTOM_TIMEOUT_SEC` seconds (falls back to the module
 *   default). It now stays armed while the body is read, so a stalled
 *   response body cannot hang past the timeout.
 * - Retry: once, after 500 ms, on network errors and 5xx responses. 4xx
 *   responses and timeouts are not retried.
 * - Cancellation: if the caller's `signal` is (or becomes) aborted the
 *   function rejects immediately. Previously a caller abort was mistaken for
 *   a network error, causing a 500 ms sleep and a second `fetch` call.
 * - The abort listener attached to the caller's signal is removed after each
 *   attempt so long-lived signals do not accumulate listeners.
 *
 * @param url Fully built request URL.
 * @param init Fetch options; its `signal` is replaced by an internal one.
 * @param signal Optional caller cancellation signal.
 * @returns The parsed JSON payload (shape is provider-specific).
 * @throws Error on non-OK status, timeout, or network failure; an
 *   `AbortError` when the caller cancelled.
 */
async function fetchWithRetry(url: string, init: RequestInit, signal?: AbortSignal): Promise<any> {
  const maxAttempts = 2
  const retryDelayMs = 500
  const timeoutSec = Number(process.env.WEB_CUSTOM_TIMEOUT_SEC) || DEFAULT_TIMEOUT_SECONDS
  const timeoutMs = timeoutSec * 1000
  let lastErr: Error | undefined

  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    // Never start (or restart) a request the caller has already cancelled.
    if (signal?.aborted) throw new DOMException('Aborted', 'AbortError')

    // Internal controller: aborted by either the timeout or the caller.
    const controller = new AbortController()
    const timer = setTimeout(() => controller.abort(), timeoutMs)
    const forwardAbort = () => controller.abort()
    signal?.addEventListener('abort', forwardAbort, { once: true })

    let status: number | undefined
    try {
      const res = await fetch(url, { ...init, signal: controller.signal })
      if (!res.ok) {
        status = res.status
        throw new Error(`Custom search API returned ${res.status}: ${res.statusText}`)
      }
      return await res.json()
    } catch (err) {
      lastErr = err instanceof Error ? err : new Error(String(err))

      // Caller cancelled: propagate as-is, no retry.
      if (signal?.aborted) throw lastErr

      // Abort without caller cancellation can only come from our timer.
      if (lastErr.name === 'AbortError') {
        throw new Error(`Custom search timed out after ${timeoutSec}s`)
      }

      // Retry on network errors (no status) and 5xx only.
      const retryable = status === undefined || status >= 500
      if (!retryable || attempt === maxAttempts - 1) throw lastErr
    } finally {
      clearTimeout(timer)
      signal?.removeEventListener('abort', forwardAbort)
    }

    await new Promise<void>(resolve => setTimeout(resolve, retryDelayMs))
  }

  throw lastErr ?? new Error('Custom search failed')
}
config.responseAdapter(raw) + : extractHits(raw, config.jsonPath) + + return { + hits: applyDomainFilters(hits, input), + providerName: 'custom', + durationSeconds: (performance.now() - start) / 1000, + } + }, +} diff --git a/src/tools/WebSearchTool/providers/duckduckgo.ts b/src/tools/WebSearchTool/providers/duckduckgo.ts new file mode 100644 index 00000000..2ab8b0f0 --- /dev/null +++ b/src/tools/WebSearchTool/providers/duckduckgo.ts @@ -0,0 +1,39 @@ +import type { SearchInput, SearchProvider } from './types.js' +import { applyDomainFilters, type ProviderOutput } from './types.js' + +export const duckduckgoProvider: SearchProvider = { + name: 'duckduckgo', + + isConfigured() { + // DDG is the default fallback — always available (duck-duck-scrape is a runtime dep) + return true + }, + + async search(input: SearchInput, signal?: AbortSignal): Promise { + const start = performance.now() + let search: typeof import('duck-duck-scrape').search + try { + ;({ search } = await import('duck-duck-scrape')) + } catch { + throw new Error('duck-duck-scrape package not installed. Run: npm install duck-duck-scrape') + } + if (signal?.aborted) throw new DOMException('Aborted', 'AbortError') + // TODO: duck-duck-scrape doesn't accept AbortSignal — can't cancel in-flight searches + const response = await search(input.query, { safeSearch: 0 }) + + const hits = applyDomainFilters( + response.results.map(r => ({ + title: r.title || r.url, + url: r.url, + description: r.description ?? undefined, + })), + input, + ) + + return { + hits, + providerName: 'duckduckgo', + durationSeconds: (performance.now() - start) / 1000, + } + }, +} diff --git a/src/tools/WebSearchTool/providers/exa.ts b/src/tools/WebSearchTool/providers/exa.ts new file mode 100644 index 00000000..879bf14a --- /dev/null +++ b/src/tools/WebSearchTool/providers/exa.ts @@ -0,0 +1,58 @@ +/** + * Exa Search API adapter. 
+ * POST https://api.exa.ai/search + * Auth: x-api-key: + */ + +import type { SearchInput, SearchProvider } from './types.js' +import { applyDomainFilters, safeHostname, type ProviderOutput } from './types.js' + +export const exaProvider: SearchProvider = { + name: 'exa', + + isConfigured() { + return Boolean(process.env.EXA_API_KEY) + }, + + async search(input: SearchInput, signal?: AbortSignal): Promise { + const start = performance.now() + + const body: Record = { + query: input.query, + numResults: 10, + type: 'auto', + } + + if (input.allowed_domains?.length) body.includeDomains = input.allowed_domains + if (input.blocked_domains?.length) body.excludeDomains = input.blocked_domains + + const res = await fetch('https://api.exa.ai/search', { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + 'x-api-key': process.env.EXA_API_KEY!, + }, + body: JSON.stringify(body), + signal, + }) + + if (!res.ok) { + throw new Error(`Exa search error ${res.status}: ${await res.text().catch(() => '')}`) + } + + const data = await res.json() + const hits = (data.results ?? []).map((r: any) => ({ + title: r.title ?? '', + url: r.url ?? '', + description: r.snippet ?? r.text, + source: r.url ? 
safeHostname(r.url) : undefined, + })) + + return { + // Exa handles domain filtering server-side via includeDomains/excludeDomains + hits, + providerName: 'exa', + durationSeconds: (performance.now() - start) / 1000, + } + }, +} diff --git a/src/tools/WebSearchTool/providers/firecrawl.ts b/src/tools/WebSearchTool/providers/firecrawl.ts new file mode 100644 index 00000000..235aefaf --- /dev/null +++ b/src/tools/WebSearchTool/providers/firecrawl.ts @@ -0,0 +1,41 @@ +import type { SearchInput, SearchProvider } from './types.js' +import { applyDomainFilters, type ProviderOutput } from './types.js' + +export const firecrawlProvider: SearchProvider = { + name: 'firecrawl', + + isConfigured() { + return Boolean(process.env.FIRECRAWL_API_KEY) + }, + + async search(input: SearchInput, signal?: AbortSignal): Promise { + const start = performance.now() + if (signal?.aborted) throw new DOMException('Aborted', 'AbortError') + // TODO: @mendable/firecrawl-js SDK doesn't accept AbortSignal — can't cancel in-flight searches + const { FirecrawlClient } = await import('@mendable/firecrawl-js') + const app = new FirecrawlClient({ apiKey: process.env.FIRECRAWL_API_KEY! }) + + let query = input.query + if (input.blocked_domains?.length) { + const exclusions = input.blocked_domains.map(d => `-site:${d}`).join(' ') + query = `${query} ${exclusions}` + } + + const data = await app.search(query, { limit: 10 }) + + const hits = applyDomainFilters( + (data.web ?? []).map((r: { url: string; title?: string; description?: string }) => ({ + title: r.title ?? 
r.url, + url: r.url, + description: r.description, + })), + input, + ) + + return { + hits, + providerName: 'firecrawl', + durationSeconds: (performance.now() - start) / 1000, + } + }, +} diff --git a/src/tools/WebSearchTool/providers/index.test.ts b/src/tools/WebSearchTool/providers/index.test.ts new file mode 100644 index 00000000..26934d2e --- /dev/null +++ b/src/tools/WebSearchTool/providers/index.test.ts @@ -0,0 +1,160 @@ +import { describe, expect, test, beforeEach, afterEach } from 'bun:test' +import { getProviderMode, getProviderChain, getAvailableProviders } from './index.js' +import type { ProviderMode } from './index.js' + +// --------------------------------------------------------------------------- +// getProviderMode +// --------------------------------------------------------------------------- + +describe('getProviderMode', () => { + const savedEnv = process.env.WEB_SEARCH_PROVIDER + + afterEach(() => { + if (savedEnv === undefined) { + delete process.env.WEB_SEARCH_PROVIDER + } else { + process.env.WEB_SEARCH_PROVIDER = savedEnv + } + }) + + test('returns auto by default', () => { + delete process.env.WEB_SEARCH_PROVIDER + expect(getProviderMode()).toBe('auto') + }) + + test('returns configured mode', () => { + process.env.WEB_SEARCH_PROVIDER = 'tavily' + expect(getProviderMode()).toBe('tavily') + }) + + test('returns ddg mode', () => { + process.env.WEB_SEARCH_PROVIDER = 'ddg' + expect(getProviderMode()).toBe('ddg') + }) + + test('returns native mode', () => { + process.env.WEB_SEARCH_PROVIDER = 'native' + expect(getProviderMode()).toBe('native') + }) + + test('falls back to auto for invalid mode', () => { + process.env.WEB_SEARCH_PROVIDER = 'nonexistent_provider' + expect(getProviderMode()).toBe('auto') + }) +}) + +// --------------------------------------------------------------------------- +// getProviderChain +// --------------------------------------------------------------------------- + +describe('getProviderChain', () => { + test('auto 
mode returns at least one configured provider', () => { + // DDG isAlways configured (no API key needed) + const chain = getProviderChain('auto') + expect(chain.length).toBeGreaterThan(0) + expect(chain.some(p => p.name === 'duckduckgo')).toBe(true) + }) + + test('auto mode does NOT include custom provider', () => { + const chain = getProviderChain('auto') + expect(chain.some(p => p.name === 'custom')).toBe(false) + }) + + test('custom mode explicitly returns custom provider', () => { + const chain = getProviderChain('custom' as ProviderMode) + expect(chain).toHaveLength(1) + expect(chain[0].name).toBe('custom') + }) + + test('specific mode returns exactly one provider', () => { + const chain = getProviderChain('tavily' as ProviderMode) + expect(chain).toHaveLength(1) + expect(chain[0].name).toBe('tavily') + }) + + test('ddg mode returns duckduckgo provider', () => { + const chain = getProviderChain('ddg' as ProviderMode) + expect(chain).toHaveLength(1) + expect(chain[0].name).toBe('duckduckgo') + }) + + test('native mode returns empty chain', () => { + expect(getProviderChain('native')).toHaveLength(0) + }) + + test('unknown mode returns empty chain', () => { + expect(getProviderChain('nonexistent' as ProviderMode)).toHaveLength(0) + }) +}) + +// --------------------------------------------------------------------------- +// AbortError stops the chain +// --------------------------------------------------------------------------- + +describe('runSearch', () => { + test('AbortError stops the chain immediately in auto mode', async () => { + // Use AbortController to cancel + const controller = new AbortController() + controller.abort() // cancel immediately + + await expect( + // Dynamic import to avoid circular issues + import('./index.js').then(m => + m.runSearch({ query: 'test' }, controller.signal), + ), + ).rejects.toThrow() + }) + + test('explicit mode fails fast when provider is not configured', async () => { + // Save and clear tavily key + const saved = 
process.env.TAVILY_API_KEY + delete process.env.TAVILY_API_KEY + const savedProvider = process.env.WEB_SEARCH_PROVIDER + process.env.WEB_SEARCH_PROVIDER = 'tavily' + + try { + const { runSearch } = await import('./index.js') + await expect(runSearch({ query: 'test' })).rejects.toThrow( + /not configured/i, + ) + } finally { + if (saved !== undefined) process.env.TAVILY_API_KEY = saved + else delete process.env.TAVILY_API_KEY + if (savedProvider !== undefined) process.env.WEB_SEARCH_PROVIDER = savedProvider + else delete process.env.WEB_SEARCH_PROVIDER + } + }) +}) + +// --------------------------------------------------------------------------- +// getAvailableProviders +// --------------------------------------------------------------------------- + +describe('getAvailableProviders', () => { + test('always includes duckduckgo (no API key required)', () => { + const providers = getAvailableProviders() + expect(providers.some(p => p.name === 'duckduckgo')).toBe(true) + }) + + test('does NOT include custom in available providers (auto chain)', () => { + const providers = getAvailableProviders() + expect(providers.some(p => p.name === 'custom')).toBe(false) + }) + + test('includes providers when API keys are set', () => { + const saved = process.env.TAVILY_API_KEY + process.env.TAVILY_API_KEY = 'test-key' + const providers = getAvailableProviders() + expect(providers.some(p => p.name === 'tavily')).toBe(true) + if (saved === undefined) delete process.env.TAVILY_API_KEY + else process.env.TAVILY_API_KEY = saved + }) + + test('excludes providers when API keys are missing', () => { + const saved = process.env.TAVILY_API_KEY + delete process.env.TAVILY_API_KEY + const providers = getAvailableProviders() + expect(providers.some(p => p.name === 'tavily')).toBe(false) + if (saved !== undefined) process.env.TAVILY_API_KEY = saved + }) +}) diff --git a/src/tools/WebSearchTool/providers/index.ts b/src/tools/WebSearchTool/providers/index.ts new file mode 100644 index 
00000000..a993a474 --- /dev/null +++ b/src/tools/WebSearchTool/providers/index.ts @@ -0,0 +1,192 @@ +/** + * Provider registry and selection logic. + * + * WEB_SEARCH_PROVIDER controls which backend to use: + * + * "auto" (default) — try providers in priority order, fall through on failure + * "custom" — use WEB_SEARCH_API / WEB_PROVIDER preset only (fail loudly) + * "firecrawl" — use Firecrawl only (fail loudly) + * "tavily" — use Tavily only (fail loudly) + * "exa" — use Exa only (fail loudly) + * "you" — use You.com only (fail loudly) + * "jina" — use Jina only (fail loudly) + * "bing" — use Bing only (fail loudly) + * "mojeek" — use Mojeek only (fail loudly) + * "linkup" — use Linkup only (fail loudly) + * "ddg" — use DuckDuckGo only (fail loudly) + * "native" — use Anthropic native / Codex only (fail loudly) + * + * "auto" mode is the only mode that silently falls through to the next provider. + * All other modes throw on failure — no silent backend switching. + * + * NOTE: "custom" is NOT included in the "auto" fallback chain. + * It is only used when WEB_SEARCH_PROVIDER=custom is explicitly selected. 
+ */ + +import type { SearchInput, SearchProvider } from './types.js' +import type { ProviderOutput } from './types.js' + +import { customProvider } from './custom.js' +import { duckduckgoProvider } from './duckduckgo.js' +import { firecrawlProvider } from './firecrawl.js' +import { tavilyProvider } from './tavily.js' +import { exaProvider } from './exa.js' +import { youProvider } from './you.js' +import { jinaProvider } from './jina.js' +import { bingProvider } from './bing.js' +import { mojeekProvider } from './mojeek.js' +import { linkupProvider } from './linkup.js' + +export { type SearchInput, type SearchProvider, type ProviderOutput, type SearchHit } from './types.js' +export { applyDomainFilters, safeHostname, hostMatchesDomain } from './types.js' +export { extractHits } from './custom.js' + +// --------------------------------------------------------------------------- +// All registered providers — order matters for auto mode +// --------------------------------------------------------------------------- +// Priority: firecrawl → tavily → exa → you → jina → bing → mojeek → linkup → ddg +// DDG is last because it's free but rate-limited. +// NOTE: customProvider is intentionally excluded from the auto chain. +// It is only available when WEB_SEARCH_PROVIDER=custom is explicitly set. +// This prevents the generic outbound provider from silently becoming the default backend. 
+ +const ALL_PROVIDERS: SearchProvider[] = [ + firecrawlProvider, + tavilyProvider, + exaProvider, + youProvider, + jinaProvider, + bingProvider, + mojeekProvider, + linkupProvider, + duckduckgoProvider, +] + +export function getAvailableProviders(): SearchProvider[] { + return ALL_PROVIDERS.filter(p => p.isConfigured()) +} + +// --------------------------------------------------------------------------- +// Selection +// --------------------------------------------------------------------------- + +export type ProviderMode = + | 'auto' + | 'custom' + | 'firecrawl' + | 'ddg' + | 'tavily' + | 'exa' + | 'you' + | 'jina' + | 'bing' + | 'mojeek' + | 'linkup' + | 'native' + +const PROVIDER_BY_NAME: Record = { + custom: customProvider, + firecrawl: firecrawlProvider, + ddg: duckduckgoProvider, + tavily: tavilyProvider, + exa: exaProvider, + you: youProvider, + jina: jinaProvider, + bing: bingProvider, + mojeek: mojeekProvider, + linkup: linkupProvider, +} + +const VALID_MODES = new Set(Object.keys(PROVIDER_BY_NAME).concat(['auto', 'native'])) + +export function getProviderMode(): ProviderMode { + const raw = process.env.WEB_SEARCH_PROVIDER ?? 'auto' + if (VALID_MODES.has(raw)) return raw as ProviderMode + return 'auto' +} + +/** + * Returns the list of providers to try, in order. + * - Specific mode → single provider + * - Auto → priority order (ALL_PROVIDERS, filtered by isConfigured) + */ +export function getProviderChain(mode: ProviderMode): SearchProvider[] { + if (mode === 'auto') { + return ALL_PROVIDERS.filter(p => p.isConfigured()) + } + if (mode === 'native') { + return [] + } + const provider = PROVIDER_BY_NAME[mode] + if (!provider) return [] + return [provider] +} + +/** + * Run a search using the configured provider chain. + * + * - Auto mode: tries each provider in order, falls through on failure. + * If ALL providers fail, throws the last error. + * - Specific mode: runs the single provider, throws immediately on failure. 
/**
 * Run a search using the configured provider chain.
 *
 * - Auto mode: tries each configured provider in order and falls through on
 *   failure. If every provider fails, throws the single error, or an
 *   aggregate message listing all of them.
 * - Specific mode: fails fast when the selected provider is not configured,
 *   otherwise runs it and rethrows its error without fallback.
 * - Cancellation (`AbortError` or an aborted `signal`) always stops the chain
 *   immediately.
 *
 * The "not configured" hint now names the variables the custom provider
 * actually reads (`WEB_SEARCH_API` / `WEB_PROVIDER`) instead of a derived
 * `CUSTOM_API_KEY` that does not exist.
 *
 * @param input Search query and optional domain filters.
 * @param signal Optional cancellation signal forwarded to each provider.
 * @returns The first successful provider's output.
 * @throws Error when no provider is available, the selected provider is not
 *   configured, or all attempted providers fail.
 */
export async function runSearch(
  input: SearchInput,
  signal?: AbortSignal,
): Promise<ProviderOutput> {
  const mode = getProviderMode()
  const chain = getProviderChain(mode)

  if (chain.length === 0) {
    throw new Error(
      mode === 'native'
        ? 'Native web search requires firstParty/vertex/foundry provider.'
        : `No search providers available for mode "${mode}". Check your env vars.`,
    )
  }

  // Explicit provider mode: fail fast if the provider isn't configured.
  // ("native" never reaches this point: its chain is empty and threw above.)
  if (mode !== 'auto') {
    const provider = chain[0]
    if (provider && !provider.isConfigured()) {
      const requiredEnv =
        mode === 'custom' ? 'WEB_SEARCH_API or WEB_PROVIDER' : `${mode.toUpperCase()}_API_KEY`
      throw new Error(
        `Search provider "${mode}" is not configured. ` +
          `Set the required environment variable (e.g. ${requiredEnv}) ` +
          `or switch to WEB_SEARCH_PROVIDER=auto.`,
      )
    }
  }

  const errors: Error[] = []

  for (const provider of chain) {
    try {
      return await provider.search(input, signal)
    } catch (err) {
      const error = err instanceof Error ? err : new Error(String(err))

      // Cancellation must stop immediately — don't fall through to other providers.
      if (error.name === 'AbortError' || signal?.aborted) {
        throw error
      }

      // Specific mode: fail loudly, no fallback.
      if (mode !== 'auto') {
        throw error
      }

      // Auto mode: record, log, and try the next provider.
      errors.push(error)
      console.error(`[web-search] ${provider.name} failed: ${error.message}`)
    }
  }

  // All providers failed in auto mode.
  const lastErr = errors[errors.length - 1]
  if (!lastErr) throw new Error('All search providers failed with no error details.')
  if (errors.length === 1) throw lastErr
  throw new Error(
    `All ${errors.length} search providers failed:\n` +
      errors.map((e, i) => `  ${i + 1}. ${e.message}`).join('\n'),
  )
}
/**
 * Linkup search adapter.
 *
 * Sends `POST https://api.linkup.so/v1/search` with a Bearer token taken
 * from `LINKUP_API_KEY`.
 *
 * The request body now carries `depth` and `outputType`, replacing the
 * previous `search_type` field.
 * NOTE(review): per the Linkup API reference both fields are required and
 * `search_type` is not a recognised parameter — confirm against the current
 * reference before merging.
 *
 * Result mapping reads `results[].name` / `title`, `url`, and
 * `snippet` / `description` / `content`. Domain filters are applied
 * client-side via `applyDomainFilters`.
 */
export const linkupProvider: SearchProvider = {
  name: 'linkup',

  isConfigured() {
    return Boolean(process.env.LINKUP_API_KEY)
  },

  async search(input: SearchInput, signal?: AbortSignal): Promise<ProviderOutput> {
    const start = performance.now()

    const res = await fetch('https://api.linkup.so/v1/search', {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        Authorization: `Bearer ${process.env.LINKUP_API_KEY}`,
      },
      body: JSON.stringify({
        q: input.query,
        depth: 'standard',
        // Ask for a plain result list rather than a generated answer.
        outputType: 'searchResults',
      }),
      signal,
    })

    if (!res.ok) {
      throw new Error(`Linkup search error ${res.status}: ${await res.text().catch(() => '')}`)
    }

    const data = await res.json()
    const hits = (data.results ?? []).map((r: any) => ({
      title: r.name ?? r.title ?? '',
      url: r.url ?? '',
      description: r.snippet ?? r.description ?? r.content,
      source: r.url ? safeHostname(r.url) : undefined,
    }))

    return {
      hits: applyDomainFilters(hits, input),
      providerName: 'linkup',
      durationSeconds: (performance.now() - start) / 1000,
    }
  },
}
/**
 * Mojeek Search adapter.
 *
 * Sends `GET https://www.mojeek.com/search?q=<query>&fmt=json` and maps the
 * JSON response onto `SearchHit`s. When `MOJEEK_API_KEY` is set it is sent as
 * a bearer token.
 *
 * NOTE(review): Mojeek's public API documentation appears to describe
 * `https://api.mojeek.com/search` with an `api_key` query parameter rather
 * than a bearer header on `www.mojeek.com` — confirm against a live key.
 */
export const mojeekProvider: SearchProvider = {
  name: 'mojeek',

  /** True when `MOJEEK_API_KEY` is set to a non-empty value. */
  isConfigured() {
    return Boolean(process.env.MOJEEK_API_KEY)
  },

  /**
   * Run one search against Mojeek.
   *
   * @param input  Query plus optional allow/block domain lists (applied
   *               client-side after the response is parsed).
   * @param signal Optional abort signal forwarded to `fetch`.
   * @returns Filtered hits, the provider name and the elapsed time in seconds.
   * @throws Error when the HTTP response status is not OK.
   */
  async search(input: SearchInput, signal?: AbortSignal): Promise<ProviderOutput> {
    const start = performance.now()

    const url = new URL('https://www.mojeek.com/search')
    url.searchParams.set('q', input.query)
    url.searchParams.set('fmt', 'json')

    // Headers are only populated when a key is available.
    const headers: Record<string, string> = {}
    const apiKey = process.env.MOJEEK_API_KEY
    if (apiKey) {
      headers['Accept'] = 'application/json'
      headers['Authorization'] = `Bearer ${apiKey}`
    }

    const res = await fetch(url.toString(), { headers, signal })

    if (!res.ok) {
      // Best effort: include the response body, but never let reading it mask the status.
      throw new Error(`Mojeek search error ${res.status}: ${await res.text().catch(() => '')}`)
    }

    const data = await res.json()
    // Results are read from `response.results`, falling back to `results`.
    // Anything that is not an array is treated as "no hits" instead of
    // crashing on `.map`.
    const payload = data?.response?.results ?? data?.results
    const rows: any[] = Array.isArray(payload) ? payload : []

    const hits = rows.map(r => ({
      title: r?.title ?? '',
      url: r?.url ?? '',
      description: r?.snippet ?? r?.desc,
      source: r?.url ? safeHostname(r.url) : undefined,
    }))

    return {
      hits: applyDomainFilters(hits, input),
      providerName: 'mojeek',
      durationSeconds: (performance.now() - start) / 1000,
    }
  },
}
/**
 * Tavily Search adapter.
 *
 * Sends `POST https://api.tavily.com/search` with the key from
 * `TAVILY_API_KEY` as a bearer token and maps the JSON response onto
 * `SearchHit`s.
 */
export const tavilyProvider: SearchProvider = {
  name: 'tavily',

  /** True when `TAVILY_API_KEY` is set to a non-empty value. */
  isConfigured() {
    return Boolean(process.env.TAVILY_API_KEY)
  },

  /**
   * Run one search against Tavily.
   *
   * @param input  Query plus optional allow/block domain lists (applied
   *               client-side after the response is parsed).
   * @param signal Optional abort signal forwarded to `fetch`.
   * @returns Filtered hits, the provider name and the elapsed time in seconds.
   * @throws Error when the HTTP response status is not OK.
   */
  async search(input: SearchInput, signal?: AbortSignal): Promise<ProviderOutput> {
    const start = performance.now()

    const res = await fetch('https://api.tavily.com/search', {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        Authorization: `Bearer ${process.env.TAVILY_API_KEY}`,
      },
      body: JSON.stringify({
        query: input.query,
        max_results: 10,
        include_answer: false,
      }),
      signal,
    })

    if (!res.ok) {
      // Best effort: include the response body, but never let reading it mask the status.
      throw new Error(`Tavily search error ${res.status}: ${await res.text().catch(() => '')}`)
    }

    const data = await res.json()
    // Treat anything that is not an array as "no hits" instead of crashing on `.map`.
    const rows: any[] = Array.isArray(data?.results) ? data.results : []

    const hits = rows.map(r => ({
      title: r?.title ?? '',
      url: r?.url ?? '',
      description: r?.content ?? r?.snippet,
      source: r?.url ? safeHostname(r.url) : undefined,
    }))

    return {
      hits: applyDomainFilters(hits, input),
      providerName: 'tavily',
      durationSeconds: (performance.now() - start) / 1000,
    }
  },
}
// ---------------------------------------------------------------------------
// normalizeHit
// ---------------------------------------------------------------------------

// Covers field-alias extraction, rejection of unusable input, and the
// url-as-title fallback.
describe('normalizeHit', () => {
  test('extracts standard fields', () => {
    const raw = { title: 'Test', url: 'https://example.com' }
    const normalized = normalizeHit(raw)
    expect(normalized).toEqual({ title: 'Test', url: 'https://example.com' })
  })

  test('extracts alternative field names (headline, link, snippet)', () => {
    const raw = {
      headline: 'Test',
      link: 'https://ex.com',
      snippet: 'desc',
    }
    const normalized = normalizeHit(raw)
    expect(normalized?.title).toBe('Test')
    expect(normalized?.url).toBe('https://ex.com')
    expect(normalized?.description).toBe('desc')
  })

  test('extracts source from various keys', () => {
    const raw = {
      title: 'T',
      url: 'https://example.com',
      displayLink: 'example.com',
    }
    expect(normalizeHit(raw)?.source).toBe('example.com')
  })

  test('returns null for empty object', () => {
    const normalized = normalizeHit({})
    expect(normalized).toBeNull()
  })

  test('returns null for null input', () => {
    const normalized = normalizeHit(null)
    expect(normalized).toBeNull()
  })

  test('returns null for non-object input', () => {
    // Primitives are checked in the same order as before: string, then number.
    for (const primitive of ['string', 42]) {
      expect(normalizeHit(primitive)).toBeNull()
    }
  })

  test('uses url as title when title missing', () => {
    const onlyUrl = { url: 'https://example.com' }
    const normalized = normalizeHit(onlyUrl)
    expect(normalized?.title).toBe('https://example.com')
    expect(normalized?.url).toBe('https://example.com')
  })
})
// ---------------------------------------------------------------------------
// applyDomainFilters
// ---------------------------------------------------------------------------

// Exercises blocked/allowed filtering, the handling of unparsable URLs,
// subdomain matching and the suffix-collision guard.
describe('applyDomainFilters', () => {
  test('filters blocked domains', () => {
    const candidates = [
      { title: 'good', url: 'https://example.com/page' },
      { title: 'bad', url: 'https://badsite.com/page' },
    ]
    const filters = { query: 'test', blocked_domains: ['badsite.com'] }
    const kept = applyDomainFilters(candidates, filters)
    expect(kept).toHaveLength(1)
    expect(kept[0].url).toBe('https://example.com/page')
  })

  test('keeps malformed URLs when filtering blocked (security)', () => {
    const candidates = [{ title: 'weird', url: 'not-a-valid-url' }]
    const filters = { query: 'test', blocked_domains: ['example.com'] }
    // An unparsable URL cannot be proven to be blocked, so it survives.
    const kept = applyDomainFilters(candidates, filters)
    expect(kept).toHaveLength(1)
  })

  test('filters allowed domains only', () => {
    const candidates = [
      { title: 'good', url: 'https://example.com/page' },
      { title: 'bad', url: 'https://other.com/page' },
    ]
    const filters = { query: 'test', allowed_domains: ['example.com'] }
    const kept = applyDomainFilters(candidates, filters)
    expect(kept).toHaveLength(1)
    expect(kept[0].url).toBe('https://example.com/page')
  })

  test('drops malformed URLs when filtering allowed (security)', () => {
    const candidates = [{ title: 'weird', url: 'not-a-valid-url' }]
    const filters = { query: 'test', allowed_domains: ['example.com'] }
    // An unparsable URL cannot be proven to be allowed, so it is removed.
    const kept = applyDomainFilters(candidates, filters)
    expect(kept).toHaveLength(0)
  })

  test('handles subdomain matching', () => {
    const candidates = [{ title: 't', url: 'https://sub.example.com/page' }]

    const afterBlock = applyDomainFilters(candidates, {
      query: 'test',
      blocked_domains: ['example.com'],
    })
    expect(afterBlock).toHaveLength(0)

    const afterAllow = applyDomainFilters(candidates, {
      query: 'test',
      allowed_domains: ['example.com'],
    })
    expect(afterAllow).toHaveLength(1)
  })

  test('returns all hits when no domain filters', () => {
    const candidates = [
      { title: 'a', url: 'https://a.com' },
      { title: 'b', url: 'https://b.com' },
    ]
    const kept = applyDomainFilters(candidates, { query: 'test' })
    expect(kept).toHaveLength(2)
  })

  test('combines blocked and allowed filters', () => {
    const candidates = [
      { title: 'good', url: 'https://example.com/page' },
      { title: 'blocked', url: 'https://badsite.com/page' },
      { title: 'other', url: 'https://other.com/page' },
    ]
    const filters = {
      query: 'test',
      blocked_domains: ['badsite.com'],
      allowed_domains: ['example.com'],
    }
    const kept = applyDomainFilters(candidates, filters)
    expect(kept).toHaveLength(1)
    expect(kept[0].url).toBe('https://example.com/page')
  })

  test('does NOT match suffix collision (badexample.com blocked does not affect example.com)', () => {
    const candidates = [
      { title: 'good', url: 'https://example.com/page' },
      { title: 'collision', url: 'https://badexample.com/page' },
    ]
    const filters = { query: 'test', blocked_domains: ['example.com'] }
    // Blocking example.com removes it (and its subdomains) but leaves badexample.com.
    const kept = applyDomainFilters(candidates, filters)
    expect(kept).toHaveLength(1)
    expect(kept[0].url).toBe('https://badexample.com/page')
  })

  test('allowed_domains does NOT match suffix collision', () => {
    const candidates = [
      { title: 'good', url: 'https://example.com/page' },
      { title: 'collision', url: 'https://badexample.com/page' },
    ]
    const filters = { query: 'test', allowed_domains: ['example.com'] }
    // Allowing example.com admits it (and its subdomains) but not badexample.com.
    const kept = applyDomainFilters(candidates, filters)
    expect(kept).toHaveLength(1)
    expect(kept[0].url).toBe('https://example.com/page')
  })
})
// ---------------------------------------------------------------------------
// Shared types
// ---------------------------------------------------------------------------

/** One search result in the provider-independent shape used by the tool layer. */
export interface SearchHit {
  title: string
  url: string
  description?: string
  source?: string
}

/** Query plus optional domain allow/block lists. */
export interface SearchInput {
  query: string
  allowed_domains?: string[]
  blocked_domains?: string[]
}

/** What a provider returns from a single `search` call. */
export interface ProviderOutput {
  hits: SearchHit[]
  /** Provider name for logging / tool_use_id */
  providerName: string
  /** Duration of the provider call in seconds */
  durationSeconds: number
}

/** Contract every search backend implements. */
export interface SearchProvider {
  /** Human-readable label (used in tool_use_id, logs) */
  readonly name: string
  /** Returns true when the env vars / config needed for this provider are present */
  isConfigured(): boolean
  /** Perform the search. Throw on unrecoverable errors. */
  search(input: SearchInput, signal?: AbortSignal): Promise<ProviderOutput>
}

// ---------------------------------------------------------------------------
// Flexible response parsing helpers
// ---------------------------------------------------------------------------

const TITLE_KEYS = ['title', 'headline', 'name', 'heading'] as const
const URL_KEYS = ['url', 'link', 'href', 'uri', 'permalink'] as const
const DESC_KEYS = [
  'description', 'snippet', 'content', 'preview', 'summary', 'text', 'body',
] as const
const SOURCE_KEYS = [
  'source', 'domain', 'displayLink', 'displayed_link', 'engine',
] as const

/** Return the first non-empty string found under any of `keys`, in key order. */
function firstMatch(
  obj: Record<string, unknown>,
  keys: readonly string[],
): string | undefined {
  for (const key of keys) {
    const value = obj[key]
    if (typeof value === 'string' && value !== '') return value
  }
  return undefined
}

/**
 * Extract a SearchHit from any object shape using well-known field aliases.
 *
 * @param raw Arbitrary value taken from a provider response.
 * @returns A hit, or `null` when `raw` is not an object or carries neither a
 *          title-like nor a url-like string. When only one of the two is
 *          present it is used for both `title` and `url`.
 */
export function normalizeHit(raw: unknown): SearchHit | null {
  if (raw === null || typeof raw !== 'object') return null
  const record = raw as Record<string, unknown>

  const title = firstMatch(record, TITLE_KEYS)
  const url = firstMatch(record, URL_KEYS)
  if (title === undefined && url === undefined) return null

  // At least one of title/url is defined here; the trailing '' only satisfies the checker.
  const hit: SearchHit = { title: title ?? url ?? '', url: url ?? title ?? '' }

  const description = firstMatch(record, DESC_KEYS)
  const source = firstMatch(record, SOURCE_KEYS)
  if (description) hit.description = description
  if (source) hit.source = source
  return hit
}

// ---------------------------------------------------------------------------
// Domain filtering — shared across ALL providers
// ---------------------------------------------------------------------------

/**
 * Safely extract the hostname from a URL string.
 *
 * @returns The hostname, or `undefined` when `url` is empty, cannot be parsed,
 *          or parses to a URL without a host (e.g. `mailto:`).
 */
export function safeHostname(url: string | undefined): string | undefined {
  if (!url) return undefined
  try {
    return new URL(url).hostname || undefined
  } catch {
    return undefined
  }
}

/** Lower-case a host/domain and strip surrounding whitespace and leading/trailing dots. */
function canonicalDomain(value: string): string {
  return value.trim().toLowerCase().replace(/^\.+/, '').replace(/\.+$/, '')
}

/**
 * Check if a hostname exactly matches a domain or is a subdomain of it.
 *
 * Both sides are compared case-insensitively and without leading/trailing
 * dots, so `Example.com`, `.example.com` and a fully-qualified `example.com.`
 * all behave like `example.com`. An empty host or domain never matches.
 *
 * Example: hostMatchesDomain('sub.example.com', 'example.com') → true
 *          hostMatchesDomain('badexample.com', 'example.com')  → false
 */
export function hostMatchesDomain(host: string, domain: string): boolean {
  const normalizedHost = canonicalDomain(host)
  const normalizedDomain = canonicalDomain(domain)
  if (normalizedHost === '' || normalizedDomain === '') return false
  if (normalizedHost === normalizedDomain) return true
  // Subdomain: must end with `.domain` (not just `domain`)
  return normalizedHost.endsWith('.' + normalizedDomain)
}

/**
 * Apply `blocked_domains` and `allowed_domains` from `input` to `hits`.
 *
 * Hits whose URL cannot be parsed are kept by the block list (cannot confirm
 * they are blocked) and dropped by the allow list (cannot confirm they are
 * allowed). When neither list is set the input array is returned unchanged.
 */
export function applyDomainFilters(
  hits: SearchHit[],
  input: SearchInput,
): SearchHit[] {
  const blocked = input.blocked_domains ?? []
  const allowed = input.allowed_domains ?? []
  if (blocked.length === 0 && allowed.length === 0) return hits

  return hits.filter(hit => {
    const host = safeHostname(hit.url)

    if (
      blocked.length > 0 &&
      host !== undefined &&
      blocked.some(domain => hostMatchesDomain(host, domain))
    ) {
      return false
    }

    if (allowed.length > 0) {
      if (host === undefined) return false // can't confirm allowed → drop
      return allowed.some(domain => hostMatchesDomain(host, domain))
    }

    return true
  })
}
}, + signal, + }) + + if (!res.ok) { + throw new Error(`You.com search error ${res.status}: ${await res.text().catch(() => '')}`) + } + + const data = await res.json() + const webResults = data?.results?.web ?? data?.results ?? [] + + const hits = webResults.map((r: any) => { + const snippet = Array.isArray(r.snippets) ? r.snippets[0] : r.snippet + return { + title: r.title ?? '', + url: r.url ?? '', + description: snippet ?? r.description, + source: r.url ? safeHostname(r.url) : undefined, + } + }) + + return { + hits: applyDomainFilters(hits, input), + providerName: 'you', + durationSeconds: (performance.now() - start) / 1000, + } + }, +}