vojo/apps/ai-bot/web.go

256 lines
10 KiB
Go

package main
import (
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"io"
"log/slog"
"net/http"
)
// web.go is the pluggable web-freshness layer (Phase 3). A WebProvider fetches a
// grounded factual digest + source URLs for a query; the cascade then has Grok
// synthesise the final answer in voice from that digest. Two providers, chosen by
// WEB_PROVIDER:
//
// - grok_web_search (DEFAULT): the xAI Agent Tools `web_search` tool on the Responses
// API (/v1/responses). NB the older chat/completions Live Search `search_parameters`
// mechanism was RETIRED by xAI (now 410 Gone), and the web_search tool is not on
// chat/completions — hence the Responses endpoint. Billed $5/1k tool calls + tokens.
// - gemini_grounding: Gemini native v1beta google_search. Cheaper. Works on current
// models INCLUDING gemini-2.5-flash-lite (verified against ai.google.dev — the 2.5
// family supports google_search; only legacy models use google_search_retrieval).
// The F-EXT-3 "silently ungrounds" caveat is about the OpenAI-compat endpoint, NOT
// the model version — so this provider uses the NATIVE v1beta path and runs behind a
// citations verify-gate, degrading if no citations come back.
//
// The web call is bounded by a per-stage timeout (and gemini_grounding additionally by a
// durable daily cap), and either provider failing degrades the request to grok_direct
// with a staleness hedge (never silence, never stale-as-fresh).
//
// The grok_web_search Responses-API request/response shape was VALIDATED live against
// /v1/responses (2026-06-01): output[].type=="message" → content[].output_text + inline
// url_citation annotations; usage carries input/output tokens, cached subset, and the
// web_search_calls count (one request can search several times — each billed). The
// computed cost matched the API's own cost_in_usd_ticks to 4 dp. A parse miss still
// degrades safely (empty digest → grok_direct).
const (
	// Values accepted for the WEB_PROVIDER setting.
	webProviderGrokWebSearch   = "grok_web_search"
	webProviderGeminiGrounding = "gemini_grounding"
)

// Pricing parameters for the xAI web_search tool.
const (
	// grokWebSearchPerCall is the USD fee for a single web_search tool call
	// (xAI Agent Tools pricing: $5 per 1,000 calls).
	grokWebSearchPerCall = 5.0 / 1000.0

	// maxWebSearchCalls is the number of tool calls assumed when sizing the
	// per-call fee in the reservation envelope. One Responses request can search
	// several times; the actual count is billed exactly at settle.
	maxWebSearchCalls = 4
)
// errGroundingCapped is the sentinel returned when the daily web/grounded-prompt
// cap has already been reached. Callers degrade (with a hedge) instead of paying
// past the cap.
var errGroundingCapped = errors.New("web grounding daily cap reached")
// WebSource describes a single attributable source behind a web answer. It pairs a
// human-readable label (the publisher domain) with a link the END USER can open.
//
// The meaning of URL depends on the provider:
//   - gemini grounding: the grounding-api-redirect link. The user clicks it to reach
//     the real article; it is never resolved server-side, which Gemini's terms forbid.
//   - grok_web_search: the real publisher URL.
//
// Sources are surfaced to the user as a compact "Sources" footer (sources.go).
type WebSource struct {
	// Title is the publisher domain (e.g. "rbc.ru"): the citation's web.title or
	// the URL host.
	Title string
	// URL is the link to open (gemini: redirect; grok: real article URL).
	URL string
}
// WebContext carries everything a web fetch produced: the factual digest handed to
// the final model, the sources behind it, and the fetch's own usage and cost.
//
// Usage and Cost belong to the fetch alone. They are kept apart from the final
// synthesis tokens so each books to its own ledger column. Cost is filled in even
// when Digest is empty or the fetch failed, because the call was still billed; the
// caller books it before degrading (§8.1 partial cascade).
type WebContext struct {
	// Digest is the factual text to feed the final model.
	Digest string
	// Citations holds the raw source URLs (the verify-gate + citation_count telemetry).
	Citations []string
	// Sources holds the same sources with display titles (the user-facing footer).
	Sources []WebSource
	// Usage is the token usage of the fetch itself.
	Usage Usage
	// Cost is what the fetch incurred.
	Cost CostBreakdown
}
// WebProvider is the contract every web-freshness backend implements: given a query,
// fetch grounded facts for it. Implementations are stateless.
//
// Fetch reports its cost in the returned WebContext even when it also returns an
// error (the call was billed). A non-nil error means the digest is unusable and the
// caller should degrade.
type WebProvider interface {
	Fetch(ctx context.Context, query string) (WebContext, error)
}
// --- grok_web_search (default): xAI Agent Tools web_search on the Responses API -------

// grokWebSearch is the WebProvider backed by the xAI Responses API web_search tool.
type grokWebSearch struct {
	// base is the API base URL ("/responses" is appended per request), key the
	// bearer token, and model the model name sent in the request.
	base, key, model string
	// cfg supplies the reasoning-effort setting and is passed to the cost computation.
	cfg *Config
	// httpc performs the HTTP call.
	httpc *http.Client
	// logger receives the request/response exchange log.
	logger *slog.Logger
}
// newGrokWebSearch builds a grokWebSearch provider from cfg's xAI settings (base URL,
// API key, model) and the given logger.
//
// The HTTP client is created without a client-level timeout.
// NOTE(review): callers presumably bound each call through the request context — confirm.
func newGrokWebSearch(cfg *Config, logger *slog.Logger) *grokWebSearch {
	p := new(grokWebSearch)
	p.base = cfg.XAIBaseURL
	p.key = cfg.XAIAPIKey
	p.model = cfg.XAIModel
	p.cfg = cfg
	p.httpc = &http.Client{}
	p.logger = logger
	return p
}
// grokResponsesRequest is the JSON body posted to the xAI Responses endpoint.
type grokResponsesRequest struct {
	// Model is the model name to run.
	Model string `json:"model"`
	// Input is the user query.
	Input string `json:"input"`
	// Tools lists the tools made available to the model.
	Tools []openAITool `json:"tools"`

	// ReasoningEffort keeps the fetch fast/cheap when the operator runs a unified
	// model with effort "none". An empty value is omitted from the JSON, leaving the
	// provider default in place. Validated against /v1/responses.
	ReasoningEffort string `json:"reasoning_effort,omitempty"`
}
// grokResponsesResponse is the subset of the xAI Responses API reply that this file
// decodes (shape verified live 2026-06-01).
//
// Output is a list of items: reasoning, web_search_call and message. The message
// item's content carries output_text parts, each with inline url_citation
// annotations. Usage reports tokens, the cached subset of the input tokens, and how
// many server-side web_search calls the request made (a single request can make
// several, each billed).
type grokResponsesResponse struct {
	Output []struct {
		// Type is the item kind, e.g. "message".
		Type string `json:"type"`
		// Content lists the parts of the item.
		Content []struct {
			// Type is the part kind, e.g. "output_text".
			Type string `json:"type"`
			// Text is the part's text.
			Text string `json:"text"`
			// Annotations are the inline annotations attached to the part.
			Annotations []struct {
				// Type is the annotation kind, e.g. "url_citation".
				Type string `json:"type"`
				// URL is the cited address.
				URL string `json:"url"`
			} `json:"annotations"`
		} `json:"content"`
	} `json:"output"`

	Usage struct {
		InputTokens  int `json:"input_tokens"`
		OutputTokens int `json:"output_tokens"`

		// InputTokensDetails breaks down the input tokens; only the cached count is read.
		InputTokensDetails struct {
			CachedTokens int `json:"cached_tokens"`
		} `json:"input_tokens_details"`

		// ServerSideToolUsageDetails reports the number of web_search calls made.
		ServerSideToolUsageDetails struct {
			WebSearchCalls int `json:"web_search_calls"`
		} `json:"server_side_tool_usage_details"`
	} `json:"usage"`
}
// Fetch sends query to the xAI Responses endpoint (base + "/responses") with the
// web_search tool enabled and turns the reply into a WebContext.
//
// The digest is the concatenation of every output_text part found in "message"
// output items. Every non-empty url_citation annotation on those items becomes both
// a raw citation and a WebSource titled with the URL's host.
//
// Returns:
//   - a zero WebContext and an error when the request cannot be built or sent, the
//     response body cannot be read, the status is not 2xx, or the body is not valid
//     JSON;
//   - a WebContext with Usage and Cost populated plus an "empty result" error when
//     the 2xx reply carried no digest text, so the caller can book the cost before
//     degrading;
//   - the full WebContext and a nil error otherwise.
func (p *grokWebSearch) Fetch(ctx context.Context, query string) (WebContext, error) {
	body, err := json.Marshal(grokResponsesRequest{
		Model:           p.model,
		Input:           query,
		Tools:           []openAITool{{Type: "web_search"}},
		ReasoningEffort: p.cfg.GrokReasoningEffort,
	})
	if err != nil {
		return WebContext{}, fmt.Errorf("grok web search encode: %w", err)
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodPost, p.base+"/responses", bytes.NewReader(body))
	if err != nil {
		return WebContext{}, fmt.Errorf("grok web search build request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", "Bearer "+p.key)

	resp, err := p.httpc.Do(req)
	if err != nil {
		return WebContext{}, fmt.Errorf("grok web search request: %w", err)
	}
	defer resp.Body.Close()

	// Keep whatever was read even if the read failed: the exchange log and the
	// non-2xx error snippet are still useful with a partial body.
	data, readErr := io.ReadAll(resp.Body)
	logLLMExchange(ctx, p.logger, "grok_web_search", body, resp.StatusCode, data)
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return WebContext{}, fmt.Errorf("grok web search http %d: %s", resp.StatusCode, snippet(data))
	}
	// A truncated 2xx body must be reported as a read failure, not as a confusing
	// JSON decode error further down.
	if readErr != nil {
		return WebContext{}, fmt.Errorf("grok web search read body: %w", readErr)
	}

	var out grokResponsesResponse
	if err := json.Unmarshal(data, &out); err != nil {
		return WebContext{}, fmt.Errorf("grok web search decode: %w", err)
	}

	var digest string
	var citations []string
	var sources []WebSource
	for _, item := range out.Output {
		// Only "message" items carry answer text; reasoning and web_search_call
		// items are skipped.
		if item.Type != "message" {
			continue
		}
		for _, c := range item.Content {
			if c.Type == "output_text" {
				digest += c.Text
			}
			for _, a := range c.Annotations {
				if a.Type == "url_citation" && a.URL != "" {
					citations = append(citations, a.URL)
					// grok returns real publisher URLs, so the host IS the display domain.
					sources = append(sources, WebSource{Title: hostOf(a.URL), URL: a.URL})
				}
			}
		}
	}

	usage := Usage{
		PromptTokens:     out.Usage.InputTokens,
		CachedTokens:     out.Usage.InputTokensDetails.CachedTokens,
		CompletionTokens: out.Usage.OutputTokens,
	}
	// Cost = the call's tokens + the $5/1k fee times the ACTUAL number of web_search
	// calls the request made (one request can search several times). Booked even when
	// the digest is empty (the 2xx was billed), so the caller accounts for it before
	// degrading. Cross-checked live against the API's own cost_in_usd_ticks — matched
	// to 4 dp.
	searchFee := float64(out.Usage.ServerSideToolUsageDetails.WebSearchCalls) * grokWebSearchPerCall
	wc := WebContext{
		Digest:    digest,
		Citations: citations,
		Sources:   sources,
		Usage:     usage,
		Cost: CostBreakdown{
			WebTool: computeUSD(p.model, usage, p.cfg) + searchFee,
		},
	}
	if digest == "" {
		return wc, errors.New("grok web search: empty result")
	}
	return wc, nil
}
// --- gemini_grounding (native v1beta google_search; current models incl. 2.5) ------

// geminiGrounding is the WebProvider backed by Gemini grounded search, gated by a
// daily cap kept in the store.
type geminiGrounding struct {
	// gem runs the grounded search.
	gem *geminiClient
	// st holds the daily grounding counter (admit / refund).
	st *Store
	// cfg supplies the daily cap, the Gemini model name and the per-prompt fee.
	cfg *Config
	// logger is optional; when nil, a failed cap refund is not logged.
	logger *slog.Logger
}
// Fetch admits the query against the daily grounding cap, runs the grounded search,
// and returns the digest together with the cost of the attempt.
//
// Returns:
//   - a zero WebContext and the store error when the cap check itself fails;
//   - a zero WebContext and errGroundingCapped when the cap is already reached
//     (nothing was spent and no slot was consumed);
//   - a WebContext holding only Usage and Cost, plus the search error, when the
//     grounded search fails (including the no-citations case) — the cap slot is
//     refunded but the cost stays booked;
//   - the full WebContext and a nil error on success.
func (p *geminiGrounding) Fetch(ctx context.Context, query string) (WebContext, error) {
	// Durable, atomic daily cap FIRST: a grounded prompt is billed whether or not it
	// grounds, and the per-prompt overage ($35/1k on 2.5) is the cost this guard
	// exists to bound, so admission happens before any spend. (grok_web_search needs
	// no such cap — its $5/1k per-call fee is fully reserved per request and bounded
	// by the per-user request cap + global ceiling.)
	admitted, err := p.st.IncrGroundingIfUnder(p.cfg.WebGroundingDailyCap)
	switch {
	case err != nil:
		return WebContext{}, err
	case !admitted:
		// Rejected BEFORE billing → no fee, no slot consumed.
		return WebContext{}, errGroundingCapped
	}

	// Any error here (incl. no-citations) makes the caller degrade.
	res, fetchErr := p.gem.groundedSearch(ctx, query)

	// SG1: once admitted, the prompt is treated as billed. Both the token cost and
	// the per-grounded-prompt fee are booked, on the error path too. The fee is the
	// money truth the $10 ceiling must see; it is tracked separately from the cap
	// quota handled below.
	wc := WebContext{
		Usage: res.Usage,
		Cost: CostBreakdown{
			Grounding:    computeUSD(p.cfg.GeminiModel, res.Usage, p.cfg),
			GroundingFee: p.cfg.GeminiGroundingPerPrompt,
		},
	}

	if fetchErr == nil {
		wc.Digest = res.Digest
		wc.Citations = res.Citations
		wc.Sources = res.Sources
		return wc, nil
	}

	// SG4: the admitted slot yielded no usable grounding (no citations, or the call
	// failed). Hand the cap slot back so over-routing / failed fetches don't burn the
	// day's grounded-answer budget — independent of the fee, which stays booked.
	// Best-effort: a failed refund only slightly tightens the cap, never money.
	if refundErr := p.st.DecrGrounding(); refundErr != nil && p.logger != nil {
		p.logger.WarnContext(ctx, "grounding cap refund failed (non-fatal)", "err", refundErr)
	}
	return wc, fetchErr
}