223 lines
8.3 KiB
Go
223 lines
8.3 KiB
Go
package main
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"log/slog"
|
|
"net/http"
|
|
)
|
|
|
|
// web.go is the pluggable web-freshness layer (Phase 3). A WebProvider fetches a
// grounded factual digest + source URLs for a query; the cascade then has Grok
// synthesise the final answer in voice from that digest. Two providers, chosen by
// WEB_PROVIDER:
//
//   - grok_web_search (DEFAULT): the xAI Agent Tools `web_search` tool on the Responses
//     API (/v1/responses). NB the older chat/completions Live Search `search_parameters`
//     mechanism was RETIRED by xAI (now 410 Gone), and the web_search tool is not on
//     chat/completions — hence the Responses endpoint. Billed $5/1k tool calls + tokens.
//   - gemini_grounding: Gemini native v1beta google_search. Cheaper, but Gemini-3 only
//     and silently ungrounds otherwise (F-EXT-3) — so it runs behind a citations
//     verify-gate and degrades if it fails.
//
// The web call is bounded by a per-stage timeout (and gemini_grounding additionally by a
// durable daily cap), and either provider failing degrades the request to grok_direct
// with a staleness hedge (never silence, never stale-as-fresh).
//
// The grok_web_search Responses-API request/response shape was VALIDATED live against
// /v1/responses (2026-06-01): output[].type=="message" → content[].output_text + inline
// url_citation annotations; usage carries input/output tokens, cached subset, and the
// web_search_calls count (one request can search several times — each billed). The
// computed cost matched the API's own cost_in_usd_ticks to 4 dp. A parse miss still
// degrades safely (empty digest → grok_direct).
|
|
const (
	// Accepted values of the WEB_PROVIDER setting.
	webProviderGrokWebSearch   = "grok_web_search"
	webProviderGeminiGrounding = "gemini_grounding"

	// grokWebSearchPerCall is xAI's Agent Tools fee in USD for a single web_search
	// tool call: $5 per 1,000 calls.
	grokWebSearchPerCall = 5.0 / 1000.0

	// maxWebSearchCalls bounds the per-call fee in the reservation envelope (one
	// Responses request can search several times; the actual count is billed exactly
	// at settle).
	maxWebSearchCalls = 4
)
|
|
|
|
// errGroundingCapped signals the daily web/grounded-prompt cap was hit, so the caller
// degrades (with a hedge) rather than paying past the cap. It is a sentinel value;
// compare against it with errors.Is.
var errGroundingCapped = errors.New("web grounding daily cap reached")
|
|
|
|
// WebContext is the result of a web fetch: a factual digest to feed the final model,
|
|
// the sources behind it, the fetch's own token usage, and the cost the fetch incurred
|
|
// (kept separate from the final synthesis tokens so each books to its own ledger
|
|
// column). Cost is populated even when Digest is empty/failed, because the call was
|
|
// still billed — the caller books it before degrading (§8.1 partial cascade).
|
|
type WebContext struct {
|
|
Digest string
|
|
Citations []string
|
|
Usage Usage
|
|
Cost CostBreakdown
|
|
}
|
|
|
|
// WebProvider fetches grounded facts for a query. Stateless. It returns its cost in the
|
|
// WebContext even on error (the call was billed), and an error when the digest is
|
|
// unusable so the caller can degrade.
|
|
type WebProvider interface {
|
|
Fetch(ctx context.Context, query string) (WebContext, error)
|
|
}
|
|
|
|
// --- grok_web_search (default): xAI Agent Tools web_search on the Responses API -------
|
|
|
|
type grokWebSearch struct {
|
|
base string
|
|
key string
|
|
model string
|
|
cfg *Config
|
|
httpc *http.Client
|
|
logger *slog.Logger
|
|
}
|
|
|
|
func newGrokWebSearch(cfg *Config, logger *slog.Logger) *grokWebSearch {
|
|
return &grokWebSearch{
|
|
base: cfg.XAIBaseURL, key: cfg.XAIAPIKey, model: cfg.XAIModel,
|
|
cfg: cfg, httpc: &http.Client{}, logger: logger,
|
|
}
|
|
}
|
|
|
|
type grokResponsesRequest struct {
|
|
Model string `json:"model"`
|
|
Input string `json:"input"`
|
|
Tools []openAITool `json:"tools"`
|
|
// Keep the fetch fast/cheap when the operator runs a unified model with effort
|
|
// "none"; empty → not sent (provider default). Validated against /v1/responses.
|
|
ReasoningEffort string `json:"reasoning_effort,omitempty"`
|
|
}
|
|
|
|
// grokResponsesResponse maps the xAI Responses API shape (verified live 2026-06-01):
// output[] carries reasoning/web_search_call/message items; the message item's content
// has output_text (with inline url_citation annotations); usage reports tokens, the
// cached subset, and the count of server-side web_search calls (a single request can
// make several, each billed).
type grokResponsesResponse struct {
	// Output is the ordered list of output items; only the fields read by the
	// provider are decoded.
	Output []struct {
		Type    string `json:"type"`
		Content []struct {
			Type        string `json:"type"`
			Text        string `json:"text"`
			Annotations []struct {
				Type string `json:"type"`
				URL  string `json:"url"`
			} `json:"annotations"`
		} `json:"content"`
	} `json:"output"`
	// Usage carries the token counts and the server-side tool-call count.
	Usage struct {
		InputTokens        int `json:"input_tokens"`
		OutputTokens       int `json:"output_tokens"`
		InputTokensDetails struct {
			CachedTokens int `json:"cached_tokens"`
		} `json:"input_tokens_details"`
		ServerSideToolUsageDetails struct {
			WebSearchCalls int `json:"web_search_calls"`
		} `json:"server_side_tool_usage_details"`
	} `json:"usage"`
}
|
|
|
|
func (p *grokWebSearch) Fetch(ctx context.Context, query string) (WebContext, error) {
|
|
body, err := json.Marshal(grokResponsesRequest{
|
|
Model: p.model, Input: query, Tools: []openAITool{{Type: "web_search"}},
|
|
ReasoningEffort: p.cfg.GrokReasoningEffort,
|
|
})
|
|
if err != nil {
|
|
return WebContext{}, err
|
|
}
|
|
req, err := http.NewRequestWithContext(ctx, http.MethodPost, p.base+"/responses", bytes.NewReader(body))
|
|
if err != nil {
|
|
return WebContext{}, err
|
|
}
|
|
req.Header.Set("Content-Type", "application/json")
|
|
req.Header.Set("Authorization", "Bearer "+p.key)
|
|
|
|
resp, err := p.httpc.Do(req)
|
|
if err != nil {
|
|
return WebContext{}, err
|
|
}
|
|
defer resp.Body.Close()
|
|
data, _ := io.ReadAll(resp.Body)
|
|
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
|
return WebContext{}, fmt.Errorf("grok web search http %d: %s", resp.StatusCode, snippet(data))
|
|
}
|
|
var out grokResponsesResponse
|
|
if err := json.Unmarshal(data, &out); err != nil {
|
|
return WebContext{}, fmt.Errorf("grok web search decode: %w", err)
|
|
}
|
|
|
|
var digest string
|
|
var citations []string
|
|
for _, item := range out.Output {
|
|
if item.Type != "message" {
|
|
continue
|
|
}
|
|
for _, c := range item.Content {
|
|
if c.Type == "output_text" {
|
|
digest += c.Text
|
|
}
|
|
for _, a := range c.Annotations {
|
|
if a.Type == "url_citation" && a.URL != "" {
|
|
citations = append(citations, a.URL)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
usage := Usage{
|
|
PromptTokens: out.Usage.InputTokens,
|
|
CachedTokens: out.Usage.InputTokensDetails.CachedTokens,
|
|
CompletionTokens: out.Usage.OutputTokens,
|
|
}
|
|
// Cost = the call's tokens + the $5/1k fee times the ACTUAL number of web_search
|
|
// calls the request made (one request can search several times). Booked even when the
|
|
// digest is empty (the 2xx was billed), so the caller accounts for it before degrading.
|
|
// Cross-checked live against the API's own cost_in_usd_ticks — matched to 4 dp.
|
|
wc := WebContext{
|
|
Digest: digest,
|
|
Citations: citations,
|
|
Usage: usage,
|
|
Cost: CostBreakdown{
|
|
WebTool: computeUSD(p.model, usage, p.cfg) +
|
|
float64(out.Usage.ServerSideToolUsageDetails.WebSearchCalls)*grokWebSearchPerCall,
|
|
},
|
|
}
|
|
if digest == "" {
|
|
return wc, fmt.Errorf("grok web search: empty result")
|
|
}
|
|
return wc, nil
|
|
}
|
|
|
|
// --- gemini_grounding (Gemini-3 native only) --------------------------------------
|
|
|
|
type geminiGrounding struct {
|
|
gem *geminiClient
|
|
st *Store
|
|
cfg *Config
|
|
}
|
|
|
|
func (p *geminiGrounding) Fetch(ctx context.Context, query string) (WebContext, error) {
|
|
// Durable, atomic daily cap FIRST: a grounded prompt is billed whether or not it
|
|
// grounds, and the per-prompt overage ($35/1k on 2.5) is the cost this guard exists
|
|
// to bound. Admit against the cap before spending. (grok_web_search needs no such
|
|
// cap — its $5/1k per-call fee is fully reserved per request and bounded by the
|
|
// per-user request cap + global ceiling.)
|
|
if ok, err := p.st.IncrGroundingIfUnder(p.cfg.WebGroundingDailyCap); err != nil {
|
|
return WebContext{}, err
|
|
} else if !ok {
|
|
return WebContext{}, errGroundingCapped
|
|
}
|
|
res, err := p.gem.groundedSearch(ctx, query) // errors (incl. no-citations) → caller degrades
|
|
cost := CostBreakdown{Grounding: computeUSD(p.cfg.GeminiModel, res.Usage, p.cfg)}
|
|
if err != nil {
|
|
return WebContext{Cost: cost, Usage: res.Usage}, err
|
|
}
|
|
return WebContext{Digest: res.Digest, Citations: res.Citations, Usage: res.Usage, Cost: cost}, nil
|
|
}
|