vojo/apps/ai-bot/web.go

223 lines
8.3 KiB
Go

package main
import (
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"io"
"log/slog"
"net/http"
)
// web.go is the pluggable web-freshness layer (Phase 3). A WebProvider fetches a
// grounded factual digest + source URLs for a query; the cascade then has Grok
// synthesise the final answer in voice from that digest. Two providers, chosen by
// WEB_PROVIDER:
//
// - grok_web_search (DEFAULT): the xAI Agent Tools `web_search` tool on the Responses
// API (/v1/responses). NB the older chat/completions Live Search `search_parameters`
// mechanism was RETIRED by xAI (now 410 Gone), and the web_search tool is not on
// chat/completions — hence the Responses endpoint. Billed $5/1k tool calls + tokens.
// - gemini_grounding: Gemini native v1beta google_search. Cheaper, but Gemini-3 only
// and silently ungrounds otherwise (F-EXT-3) — so it runs behind a citations
// verify-gate and degrades if it fails.
//
// The web call is bounded by a per-stage timeout (and gemini_grounding additionally by a
// durable daily cap), and either provider failing degrades the request to grok_direct
// with a staleness hedge (never silence, never stale-as-fresh).
//
// The grok_web_search Responses-API request/response shape was VALIDATED live against
// /v1/responses (2026-06-01): output[].type=="message" → content[].output_text + inline
// url_citation annotations; usage carries input/output tokens, cached subset, and the
// web_search_calls count (one request can search several times — each billed). The
// computed cost matched the API's own cost_in_usd_ticks to 4 dp. A parse miss still
// degrades safely (empty digest → grok_direct).
const (
// The two WEB_PROVIDER values described in the file comment above;
// grok_web_search is the default.
webProviderGrokWebSearch = "grok_web_search"
webProviderGeminiGrounding = "gemini_grounding"
// grokWebSearchPerCall is xAI's Agent Tools fee: $5 per 1,000 web_search tool calls,
// expressed here in USD per single call. grokWebSearch.Fetch multiplies it by the
// number of web_search calls the response reports.
grokWebSearchPerCall = 5.0 / 1000.0
// maxWebSearchCalls bounds the per-call fee in the reservation envelope (one Responses
// request can search several times; the actual count is billed exactly at settle).
maxWebSearchCalls = 4
)
// errGroundingCapped signals the daily web/grounded-prompt cap was hit, so the caller
// degrades (with a hedge) rather than paying past the cap. geminiGrounding.Fetch
// returns it, with a zero WebContext, when Store.IncrGroundingIfUnder reports that the
// request was not admitted.
var errGroundingCapped = errors.New("web grounding daily cap reached")
// WebContext is the result of a web fetch: a factual digest to feed the final model,
// the sources behind it, the fetch's own token usage, and the cost the fetch incurred
// (kept separate from the final synthesis tokens so each books to its own ledger
// column). Cost is populated even when Digest is empty/failed, because the call was
// still billed — the caller books it before degrading (§8.1 partial cascade).
type WebContext struct {
// Digest is the factual text produced by the fetch; empty when nothing usable came back.
Digest string
// Citations are the source URLs backing Digest.
Citations []string
// Usage is the token usage of the fetch call itself.
Usage Usage
// Cost is what the fetch call cost; the providers in this file fill it in on their
// billed failure paths as well as on success.
Cost CostBreakdown
}
// WebProvider fetches grounded facts for a query. Stateless. It returns its cost in the
// WebContext even on error (the call was billed), and an error when the digest is
// unusable so the caller can degrade. Both *grokWebSearch and *geminiGrounding in this
// file provide a Fetch method with this signature.
type WebProvider interface {
// Fetch looks up query and returns the resulting WebContext. A non-nil error means
// the digest must not be used; the returned WebContext may still carry Usage and
// Cost for the caller to book.
Fetch(ctx context.Context, query string) (WebContext, error)
}
// --- grok_web_search (default): xAI Agent Tools web_search on the Responses API -------
// grokWebSearch is the provider that calls the xAI Responses API with the web_search
// tool enabled. Build it with newGrokWebSearch.
type grokWebSearch struct {
// base is the API base URL; Fetch posts to base + "/responses".
base string
// key is sent as the Bearer token in the Authorization header.
key string
// model is the model named in the request and used when pricing the call's tokens.
model string
// cfg supplies GrokReasoningEffort for the request and is passed to computeUSD.
cfg *Config
// httpc performs the HTTP request.
httpc *http.Client
// logger is stored by the constructor; Fetch itself does not log through it.
logger *slog.Logger
}
// newGrokWebSearch builds the grok_web_search provider from cfg: the xAI base URL, API
// key and model are copied out of the config, cfg itself is retained for later lookups,
// and a fresh default http.Client is attached (no client-level timeout is set here).
func newGrokWebSearch(cfg *Config, logger *slog.Logger) *grokWebSearch {
	provider := new(grokWebSearch)
	provider.cfg = cfg
	provider.logger = logger
	provider.base = cfg.XAIBaseURL
	provider.key = cfg.XAIAPIKey
	provider.model = cfg.XAIModel
	provider.httpc = &http.Client{}
	return provider
}
// grokResponsesRequest is the JSON body grokWebSearch.Fetch posts to the Responses
// endpoint.
type grokResponsesRequest struct {
// Model is the model to run.
Model string `json:"model"`
// Input is the query, sent as a plain string.
Input string `json:"input"`
// Tools lists the tools offered to the model; Fetch sends a single web_search tool.
Tools []openAITool `json:"tools"`
// Keep the fetch fast/cheap when the operator runs a unified model with effort
// "none"; empty → not sent (provider default). Validated against /v1/responses.
ReasoningEffort string `json:"reasoning_effort,omitempty"`
}
// grokResponsesResponse maps the xAI Responses API shape (verified live 2026-06-01):
// output[] carries reasoning/web_search_call/message items; the message item's content
// has output_text (with inline url_citation annotations); usage reports tokens, the
// cached subset, and the count of server-side web_search calls (a single request can
// make several, each billed). Only the fields grokWebSearch.Fetch reads are declared.
type grokResponsesResponse struct {
// Output is the list of response items; Fetch reads only those whose Type is "message".
Output []struct {
Type string `json:"type"`
// Content holds the parts of an item.
Content []struct {
// Type is "output_text" for the parts whose Text is appended to the digest.
Type string `json:"type"`
Text string `json:"text"`
// Annotations are the inline annotations on a part; entries with Type
// "url_citation" and a non-empty URL become citations.
Annotations []struct {
Type string `json:"type"`
URL string `json:"url"`
} `json:"annotations"`
} `json:"content"`
} `json:"output"`
// Usage is the billing-relevant accounting for the request.
Usage struct {
InputTokens int `json:"input_tokens"`
OutputTokens int `json:"output_tokens"`
// InputTokensDetails.CachedTokens is the cached subset of the input tokens.
InputTokensDetails struct {
CachedTokens int `json:"cached_tokens"`
} `json:"input_tokens_details"`
// ServerSideToolUsageDetails.WebSearchCalls is how many web_search calls the
// request made; Fetch charges the per-call fee for each.
ServerSideToolUsageDetails struct {
WebSearchCalls int `json:"web_search_calls"`
} `json:"server_side_tool_usage_details"`
} `json:"usage"`
}
// Fetch posts query to the xAI Responses API (base + "/responses") with the web_search
// tool enabled. The digest is the concatenated output_text of the response's message
// items, and the citations are every non-empty url_citation URL found on them.
//
// Results:
//   - Request build, transport, body-read, non-2xx and decode failures return a zero
//     WebContext and an error; usage is unknown on those paths, so no cost is reported.
//   - A 2xx response with no output text returns a WebContext carrying Usage and Cost
//     (but an empty Digest) together with an error, so the caller can book the spend
//     before degrading.
//   - Otherwise the populated WebContext is returned with a nil error.
func (p *grokWebSearch) Fetch(ctx context.Context, query string) (WebContext, error) {
	body, err := json.Marshal(grokResponsesRequest{
		Model:           p.model,
		Input:           query,
		Tools:           []openAITool{{Type: "web_search"}},
		ReasoningEffort: p.cfg.GrokReasoningEffort,
	})
	if err != nil {
		return WebContext{}, err
	}
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, p.base+"/responses", bytes.NewReader(body))
	if err != nil {
		return WebContext{}, err
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", "Bearer "+p.key)

	resp, err := p.httpc.Do(req)
	if err != nil {
		return WebContext{}, err
	}
	defer resp.Body.Close()

	// Read first, judge the status second: a non-2xx is reported with whatever part of
	// the body arrived, so a read failure never hides the HTTP status.
	data, readErr := io.ReadAll(resp.Body)
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return WebContext{}, fmt.Errorf("grok web search http %d: %s", resp.StatusCode, snippet(data))
	}
	if readErr != nil {
		// A 2xx whose body could not be read in full (timeout, cancellation, dropped
		// connection) is reported as such, with the cause wrapped so errors.Is works,
		// instead of surfacing later as a misleading JSON decode error.
		return WebContext{}, fmt.Errorf("grok web search read: %w", readErr)
	}

	var out grokResponsesResponse
	if err := json.Unmarshal(data, &out); err != nil {
		return WebContext{}, fmt.Errorf("grok web search decode: %w", err)
	}

	// Only "message" items carry the answer; reasoning and web_search_call items are
	// skipped. Annotations are harvested from every content part of a message.
	var digest string
	var citations []string
	for _, item := range out.Output {
		if item.Type != "message" {
			continue
		}
		for _, c := range item.Content {
			if c.Type == "output_text" {
				digest += c.Text
			}
			for _, a := range c.Annotations {
				if a.Type == "url_citation" && a.URL != "" {
					citations = append(citations, a.URL)
				}
			}
		}
	}

	usage := Usage{
		PromptTokens:     out.Usage.InputTokens,
		CachedTokens:     out.Usage.InputTokensDetails.CachedTokens,
		CompletionTokens: out.Usage.OutputTokens,
	}
	// Cost = the call's tokens + the $5/1k fee times the ACTUAL number of web_search
	// calls the request made (one request can search several times). Booked even when the
	// digest is empty (the 2xx was billed), so the caller accounts for it before degrading.
	// Cross-checked live against the API's own cost_in_usd_ticks — matched to 4 dp.
	wc := WebContext{
		Digest:    digest,
		Citations: citations,
		Usage:     usage,
		Cost: CostBreakdown{
			WebTool: computeUSD(p.model, usage, p.cfg) +
				float64(out.Usage.ServerSideToolUsageDetails.WebSearchCalls)*grokWebSearchPerCall,
		},
	}
	if digest == "" {
		return wc, errors.New("grok web search: empty result")
	}
	return wc, nil
}
// --- gemini_grounding (Gemini-3 native only) --------------------------------------
// geminiGrounding is the provider that runs a Gemini grounded search, admitting each
// call against a daily cap first.
type geminiGrounding struct {
// gem runs the grounded search (groundedSearch).
gem *geminiClient
// st holds the grounding counter checked and incremented by IncrGroundingIfUnder.
st *Store
// cfg supplies WebGroundingDailyCap and GeminiModel, and is passed to computeUSD.
cfg *Config
}
// Fetch runs a Gemini grounded search for query, gated by the daily grounding cap.
//
// The cap is checked (and the counter incremented) before any spend: a grounded prompt
// is billed whether or not it actually grounds, and bounding that per-prompt overage is
// the reason the guard exists. grok_web_search has no equivalent gate because its
// per-call fee is reserved per request and bounded by other limits.
//
// A store error is returned as-is and a refused admission yields errGroundingCapped,
// both with a zero WebContext. Once the search has run, Usage and Cost are always
// returned; Digest and Citations are filled in only when the search succeeded.
func (p *geminiGrounding) Fetch(ctx context.Context, query string) (WebContext, error) {
	admitted, capErr := p.st.IncrGroundingIfUnder(p.cfg.WebGroundingDailyCap)
	switch {
	case capErr != nil:
		return WebContext{}, capErr
	case !admitted:
		return WebContext{}, errGroundingCapped
	}

	// Any search error (including a missing-citations verdict) is handed back so the
	// caller can degrade; the cost is priced from the reported usage either way.
	grounded, searchErr := p.gem.groundedSearch(ctx, query)
	result := WebContext{
		Usage: grounded.Usage,
		Cost:  CostBreakdown{Grounding: computeUSD(p.cfg.GeminiModel, grounded.Usage, p.cfg)},
	}
	if searchErr != nil {
		return result, searchErr
	}
	result.Digest = grounded.Digest
	result.Citations = grounded.Citations
	return result, nil
}