package main import ( "bytes" "context" "encoding/json" "errors" "fmt" "io" "log/slog" "net/http" ) // web.go is the pluggable web-freshness layer (Phase 3). A WebProvider fetches a // grounded factual digest + source URLs for a query; the cascade then has Grok // synthesise the final answer in voice from that digest. Two providers, chosen by // WEB_PROVIDER: // // - grok_web_search (DEFAULT): the xAI Agent Tools `web_search` tool on the Responses // API (/v1/responses). NB the older chat/completions Live Search `search_parameters` // mechanism was RETIRED by xAI (now 410 Gone), and the web_search tool is not on // chat/completions — hence the Responses endpoint. Billed $5/1k tool calls + tokens. // - gemini_grounding: Gemini native v1beta google_search. Cheaper. Works on current // models INCLUDING gemini-2.5-flash-lite (verified against ai.google.dev — the 2.5 // family supports google_search; only legacy models use google_search_retrieval). // The F-EXT-3 "silently ungrounds" caveat is about the OpenAI-compat endpoint, NOT // the model version — so this provider uses the NATIVE v1beta path and runs behind a // citations verify-gate, degrading if no citations come back. // // The web call is bounded by a per-stage timeout (and gemini_grounding additionally by a // durable daily cap), and either provider failing degrades the request to grok_direct // with a staleness hedge (never silence, never stale-as-fresh). // // The grok_web_search Responses-API request/response shape was VALIDATED live against // /v1/responses (2026-06-01): output[].type=="message" → content[].output_text + inline // url_citation annotations; usage carries input/output tokens, cached subset, and the // web_search_calls count (one request can search several times — each billed). The // computed cost matched the API's own cost_in_usd_ticks to 4 dp. A parse miss still // degrades safely (empty digest → grok_direct). const ( webProviderGrokWebSearch = "grok_web_search" webProviderGeminiGrounding = "gemini_grounding" // grokWebSearchPerCall is xAI's Agent Tools fee: $5 per 1,000 web_search tool calls. grokWebSearchPerCall = 5.0 / 1000.0 // maxWebSearchCalls bounds the per-call fee in the reservation envelope (one Responses // request can search several times; the actual count is billed exactly at settle). maxWebSearchCalls = 4 ) // errGroundingCapped signals the daily web/grounded-prompt cap was hit, so the caller // degrades (with a hedge) rather than paying past the cap. var errGroundingCapped = errors.New("web grounding daily cap reached") // WebSource is one attributable source behind a web answer: a human label (the publisher // domain) and a link the END USER can open. For gemini grounding the URL is the // grounding-api-redirect (clicked by the user → the real article; never resolved // server-side, which Gemini's terms forbid); for grok_web_search it is the real publisher // URL. Surfaced to the user as a compact "Sources" footer (sources.go). type WebSource struct { Title string // publisher domain ("rbc.ru") — the citation's web.title / the URL host URL string // the link to open (gemini: redirect; grok: real article URL) } // WebContext is the result of a web fetch: a factual digest to feed the final model, // the sources behind it, the fetch's own token usage, and the cost the fetch incurred // (kept separate from the final synthesis tokens so each books to its own ledger // column). Cost is populated even when Digest is empty/failed, because the call was // still billed — the caller books it before degrading (§8.1 partial cascade). type WebContext struct { Digest string Citations []string // raw source URLs (the verify-gate + citation_count telemetry) Sources []WebSource // the same sources with display titles (the user-facing footer) Usage Usage Cost CostBreakdown } // WebProvider fetches grounded facts for a query. Stateless. It returns its cost in the // WebContext even on error (the call was billed), and an error when the digest is // unusable so the caller can degrade. type WebProvider interface { Fetch(ctx context.Context, query string) (WebContext, error) } // --- grok_web_search (default): xAI Agent Tools web_search on the Responses API ------- type grokWebSearch struct { base string key string model string cfg *Config httpc *http.Client logger *slog.Logger } func newGrokWebSearch(cfg *Config, logger *slog.Logger) *grokWebSearch { return &grokWebSearch{ base: cfg.XAIBaseURL, key: cfg.XAIAPIKey, model: cfg.XAIModel, cfg: cfg, httpc: &http.Client{}, logger: logger, } } type grokResponsesRequest struct { Model string `json:"model"` Input string `json:"input"` Tools []openAITool `json:"tools"` // Keep the fetch fast/cheap when the operator runs a unified model with effort // "none"; empty → not sent (provider default). Validated against /v1/responses. ReasoningEffort string `json:"reasoning_effort,omitempty"` } // grokResponsesResponse maps the xAI Responses API shape (verified live 2026-06-01): // output[] carries reasoning/web_search_call/message items; the message item's content // has output_text (with inline url_citation annotations); usage reports tokens, the // cached subset, and the count of server-side web_search calls (a single request can // make several, each billed). type grokResponsesResponse struct { Output []struct { Type string `json:"type"` Content []struct { Type string `json:"type"` Text string `json:"text"` Annotations []struct { Type string `json:"type"` URL string `json:"url"` } `json:"annotations"` } `json:"content"` } `json:"output"` Usage struct { InputTokens int `json:"input_tokens"` OutputTokens int `json:"output_tokens"` InputTokensDetails struct { CachedTokens int `json:"cached_tokens"` } `json:"input_tokens_details"` ServerSideToolUsageDetails struct { WebSearchCalls int `json:"web_search_calls"` } `json:"server_side_tool_usage_details"` } `json:"usage"` } func (p *grokWebSearch) Fetch(ctx context.Context, query string) (WebContext, error) { body, err := json.Marshal(grokResponsesRequest{ Model: p.model, Input: query, Tools: []openAITool{{Type: "web_search"}}, ReasoningEffort: p.cfg.GrokReasoningEffort, }) if err != nil { return WebContext{}, err } req, err := http.NewRequestWithContext(ctx, http.MethodPost, p.base+"/responses", bytes.NewReader(body)) if err != nil { return WebContext{}, err } req.Header.Set("Content-Type", "application/json") req.Header.Set("Authorization", "Bearer "+p.key) resp, err := p.httpc.Do(req) if err != nil { return WebContext{}, err } defer resp.Body.Close() data, _ := io.ReadAll(resp.Body) logLLMExchange(ctx, p.logger, "grok_web_search", body, resp.StatusCode, data) if resp.StatusCode < 200 || resp.StatusCode >= 300 { return WebContext{}, fmt.Errorf("grok web search http %d: %s", resp.StatusCode, snippet(data)) } var out grokResponsesResponse if err := json.Unmarshal(data, &out); err != nil { return WebContext{}, fmt.Errorf("grok web search decode: %w", err) } var digest string var citations []string var sources []WebSource for _, item := range out.Output { if item.Type != "message" { continue } for _, c := range item.Content { if c.Type == "output_text" { digest += c.Text } for _, a := range c.Annotations { if a.Type == "url_citation" && a.URL != "" { citations = append(citations, a.URL) // grok returns real publisher URLs, so the host IS the display domain. sources = append(sources, WebSource{Title: hostOf(a.URL), URL: a.URL}) } } } } usage := Usage{ PromptTokens: out.Usage.InputTokens, CachedTokens: out.Usage.InputTokensDetails.CachedTokens, CompletionTokens: out.Usage.OutputTokens, } // Cost = the call's tokens + the $5/1k fee times the ACTUAL number of web_search // calls the request made (one request can search several times). Booked even when the // digest is empty (the 2xx was billed), so the caller accounts for it before degrading. // Cross-checked live against the API's own cost_in_usd_ticks — matched to 4 dp. wc := WebContext{ Digest: digest, Citations: citations, Sources: sources, Usage: usage, Cost: CostBreakdown{ WebTool: computeUSD(p.model, usage, p.cfg) + float64(out.Usage.ServerSideToolUsageDetails.WebSearchCalls)*grokWebSearchPerCall, }, } if digest == "" { return wc, fmt.Errorf("grok web search: empty result") } return wc, nil } // --- gemini_grounding (native v1beta google_search; current models incl. 2.5) ------ type geminiGrounding struct { gem *geminiClient st *Store cfg *Config logger *slog.Logger } func (p *geminiGrounding) Fetch(ctx context.Context, query string) (WebContext, error) { // Durable, atomic daily cap FIRST: a grounded prompt is billed whether or not it // grounds, and the per-prompt overage ($35/1k on 2.5) is the cost this guard exists // to bound. Admit against the cap before spending. (grok_web_search needs no such // cap — its $5/1k per-call fee is fully reserved per request and bounded by the // per-user request cap + global ceiling.) if ok, err := p.st.IncrGroundingIfUnder(p.cfg.WebGroundingDailyCap); err != nil { return WebContext{}, err } else if !ok { return WebContext{}, errGroundingCapped // hit BEFORE billing → no fee, no slot consumed } res, err := p.gem.groundedSearch(ctx, query) // errors (incl. no-citations) → caller degrades // SG1: the prompt is admitted, so treat it as billed — book the token cost AND the // per-grounded-prompt fee, even on the error return. The fee is the money truth the // $10 ceiling must see; it is kept separate from the cap quota below. cost := CostBreakdown{ Grounding: computeUSD(p.cfg.GeminiModel, res.Usage, p.cfg), GroundingFee: p.cfg.GeminiGroundingPerPrompt, } if err != nil { // SG4: the admitted slot produced no usable grounding (no citations, or the call // failed). Refund the cap slot so over-routing / failed fetches don't burn the // day's grounded-answer budget — independent of the fee, which stays booked. // Best-effort: a failed refund only slightly tightens the cap, never money. if derr := p.st.DecrGrounding(); derr != nil && p.logger != nil { p.logger.WarnContext(ctx, "grounding cap refund failed (non-fatal)", "err", derr) } return WebContext{Cost: cost, Usage: res.Usage}, err } return WebContext{Digest: res.Digest, Citations: res.Citations, Sources: res.Sources, Usage: res.Usage, Cost: cost}, nil }