// vojo/apps/ai-bot/web.go

package main

import (
	"bytes"
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"log/slog"
	"net/http"
)

// web.go is the pluggable web-freshness layer (Phase 3). A WebProvider fetches a
// grounded factual digest + source URLs for a query; the cascade then has Grok
// synthesise the final answer in voice from that digest. Two providers, chosen by
// WEB_PROVIDER:
//
//   - grok_web_search (DEFAULT): the xAI Agent Tools `web_search` tool on the Responses
//     API (/v1/responses). NB the older chat/completions Live Search `search_parameters`
//     mechanism was RETIRED by xAI (now 410 Gone), and the web_search tool is not on
//     chat/completions — hence the Responses endpoint. Billed $5/1k tool calls + tokens.
//   - gemini_grounding: Gemini native v1beta google_search. Cheaper. Works on current
//     models INCLUDING gemini-2.5-flash-lite (verified against ai.google.dev — the 2.5
//     family supports google_search; only legacy models use google_search_retrieval).
//     The F-EXT-3 "silently ungrounds" caveat is about the OpenAI-compat endpoint, NOT
//     the model version — so this provider uses the NATIVE v1beta path and runs behind a
//     citations verify-gate, degrading if no citations come back.
//
// The web call is bounded by a per-stage timeout (and gemini_grounding additionally by a
// durable daily cap), and either provider failing degrades the request to grok_direct
// with a staleness hedge (never silence, never stale-as-fresh).
//
// The grok_web_search Responses-API request/response shape was VALIDATED live against
// /v1/responses (2026-06-01): output[].type=="message" → content[].output_text + inline
// url_citation annotations; usage carries input/output tokens, cached subset, and the
// web_search_calls count (one request can search several times — each billed). The
// computed cost matched the API's own cost_in_usd_ticks to 4 dp. A parse miss still
// degrades safely (empty digest → grok_direct).
const (
	// Provider identifiers: the two values WEB_PROVIDER selects between
	// (grok_web_search is the default).
	webProviderGrokWebSearch   = "grok_web_search"
	webProviderGeminiGrounding = "gemini_grounding"

	// grokWebSearchPerCall is xAI's Agent Tools fee for ONE web_search tool call, derived
	// from the published $5 per 1,000 calls. grokWebSearch.Fetch multiplies it by the
	// web_search_calls count the response reports.
	grokWebSearchPerCall = 5.0 / 1000.0

	// maxWebSearchCalls bounds the per-call fee in the reservation envelope (one Responses
	// request can search several times; the actual count is billed exactly at settle).
	maxWebSearchCalls = 4
)

// errGroundingCapped signals the daily web/grounded-prompt cap was hit, so the caller
// degrades (with a hedge) rather than paying past the cap. geminiGrounding.Fetch returns
// it, together with a zero WebContext, when the store refuses to admit another grounded
// prompt — i.e. before any provider call is made.
var errGroundingCapped = errors.New("web grounding daily cap reached")

// WebSource is one attributable source behind a web answer: a human-readable label (the
// publisher domain) and a link the END USER can open.
//
// What URL holds depends on the provider:
//   - gemini grounding: the grounding-api-redirect link. The user's click leads to the
//     real article; it is never resolved server-side, which Gemini's terms forbid.
//   - grok_web_search: the real publisher URL.
//
// Surfaced to the user as a compact "Sources" footer (sources.go).
type WebSource struct {
	Title string // publisher domain ("rbc.ru") — the citation's web.title / the URL host
	URL   string // the link to open (gemini: redirect; grok: real article URL)
}

// WebContext is the result of a web fetch: a factual digest to feed the final model,
// the sources behind it, the fetch's own token usage, and the cost the fetch incurred
// (kept separate from the final synthesis tokens so each books to its own ledger
// column).
//
// Cost is populated even when Digest is empty/failed, because the call was still
// billed — the caller books it before degrading (§8.1 partial cascade). The exception
// is a fetch that never produced a parsed response: grokWebSearch.Fetch returns a zero
// WebContext for transport errors, non-2xx statuses and undecodable bodies, and
// geminiGrounding.Fetch returns one when the daily-cap check fails or refuses admission.
type WebContext struct {
	Digest    string
	Citations []string    // raw source URLs (the verify-gate + citation_count telemetry)
	Sources   []WebSource // the same sources with display titles (the user-facing footer)
	Usage     Usage
	Cost      CostBreakdown
}

// WebProvider fetches grounded facts for a query. Stateless.
//
// Fetch returns a non-nil error whenever the digest is unusable, so the caller can
// degrade. Unlike the usual Go convention, the returned WebContext is still meaningful
// on error: when the provider call was billed, its Usage and Cost are filled in so the
// caller can book them before degrading. When nothing billable is known (the request
// never completed, or admission was refused) the WebContext is the zero value.
type WebProvider interface {
	Fetch(ctx context.Context, query string) (WebContext, error)
}

// --- grok_web_search (default): xAI Agent Tools web_search on the Responses API -------

// grokWebSearch is the WebProvider backed by the xAI web_search tool on the Responses
// API. Build it with newGrokWebSearch.
type grokWebSearch struct {
	base   string       // xAI API base URL; Fetch appends "/responses"
	key    string       // API key, sent as "Authorization: Bearer <key>"
	model  string       // model name sent in the request and used for token pricing
	cfg    *Config      // read for GrokReasoningEffort and passed to computeUSD
	httpc  *http.Client // client used for the POST
	logger *slog.Logger // handed to logLLMExchange for the request/response log
}

// newGrokWebSearch wires a grokWebSearch provider from the xAI settings in cfg
// (base URL, API key, model) and keeps cfg and logger for use during Fetch.
//
// The HTTP client is created without a timeout of its own, so any deadline on the
// request comes from the context passed to Fetch.
func newGrokWebSearch(cfg *Config, logger *slog.Logger) *grokWebSearch {
	p := new(grokWebSearch)
	p.cfg = cfg
	p.logger = logger
	p.httpc = &http.Client{}

	// Connection settings are snapshotted from cfg at construction time.
	p.base = cfg.XAIBaseURL
	p.key = cfg.XAIAPIKey
	p.model = cfg.XAIModel
	return p
}

// grokResponsesRequest is the JSON body grokWebSearch.Fetch POSTs to <base>/responses.
// Fetch fills Model from the provider's configured model, Input with the query as given,
// and Tools with a single entry of type "web_search".
type grokResponsesRequest struct {
	Model string       `json:"model"`
	Input string       `json:"input"`
	Tools []openAITool `json:"tools"`
	// ReasoningEffort is copied from Config.GrokReasoningEffort. It keeps the fetch
	// fast/cheap when the operator runs a unified model with effort "none"; an empty
	// value is omitted from the JSON (provider default). Validated against /v1/responses.
	ReasoningEffort string `json:"reasoning_effort,omitempty"`
}

// grokResponsesResponse maps the xAI Responses API shape (verified live 2026-06-01):
// output[] carries reasoning/web_search_call/message items; the message item's content
// has output_text (with inline url_citation annotations); usage reports tokens, the
// cached subset, and the count of server-side web_search calls (a single request can
// make several, each billed).
//
// Only the fields grokWebSearch.Fetch consumes are declared; everything else in the
// payload is ignored by the JSON decoder. Fetch reads:
//   - Output[].Type: only "message" items are inspected.
//   - Output[].Content[].Type / Text: the text of every "output_text" part is
//     concatenated into the digest.
//   - Output[].Content[].Annotations[]: each "url_citation" with a non-empty URL
//     becomes a citation and a source.
//   - Usage: input/output/cached token counts feed Usage; WebSearchCalls drives the
//     per-call fee.
type grokResponsesResponse struct {
	Output []struct {
		Type    string `json:"type"`
		Content []struct {
			Type        string `json:"type"`
			Text        string `json:"text"`
			Annotations []struct {
				Type string `json:"type"`
				URL  string `json:"url"`
			} `json:"annotations"`
		} `json:"content"`
	} `json:"output"`
	Usage struct {
		InputTokens        int `json:"input_tokens"`
		OutputTokens       int `json:"output_tokens"`
		InputTokensDetails struct {
			CachedTokens int `json:"cached_tokens"`
		} `json:"input_tokens_details"`
		ServerSideToolUsageDetails struct {
			WebSearchCalls int `json:"web_search_calls"`
		} `json:"server_side_tool_usage_details"`
	} `json:"usage"`
}

// Fetch asks the xAI Responses API (POST <base>/responses) to answer query with the
// web_search tool enabled and folds the reply into a WebContext.
//
// Outcomes:
//   - (wc, nil): a 2xx reply decoded and carried non-empty output_text. wc holds the
//     digest, the cited URLs (as raw citations and as titled sources), token usage and
//     cost.
//   - (wc, error): a 2xx reply decoded but produced no output_text. Usage and Cost are
//     still populated (the 2xx was billed) so the caller can book them before degrading.
//   - (WebContext{}, error): the request could not be built or sent, the status was not
//     2xx, the body could not be read in full, or the body was not valid JSON. No usage
//     is known in these cases.
//
// The HTTP client has no timeout of its own; cancellation and deadlines come from ctx.
func (p *grokWebSearch) Fetch(ctx context.Context, query string) (WebContext, error) {
	body, err := json.Marshal(grokResponsesRequest{
		Model: p.model, Input: query, Tools: []openAITool{{Type: "web_search"}},
		ReasoningEffort: p.cfg.GrokReasoningEffort,
	})
	if err != nil {
		return WebContext{}, err
	}
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, p.base+"/responses", bytes.NewReader(body))
	if err != nil {
		return WebContext{}, err
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", "Bearer "+p.key)

	resp, err := p.httpc.Do(req)
	if err != nil {
		return WebContext{}, err
	}
	defer resp.Body.Close()

	// io.ReadAll returns whatever it managed to read alongside the error, so the partial
	// body is still passed to the exchange log and to the non-2xx snippet below.
	data, readErr := io.ReadAll(resp.Body)
	logLLMExchange(ctx, p.logger, "grok_web_search", body, resp.StatusCode, data)
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return WebContext{}, fmt.Errorf("grok web search http %d: %s", resp.StatusCode, snippet(data))
	}
	if readErr != nil {
		// Report the real cause (e.g. ctx expiring or the connection dropping mid-body)
		// rather than letting a truncated payload surface as a JSON decode error. The
		// wrap keeps errors.Is/errors.As working for the caller.
		return WebContext{}, fmt.Errorf("grok web search read body: %w", readErr)
	}
	var out grokResponsesResponse
	if err := json.Unmarshal(data, &out); err != nil {
		return WebContext{}, fmt.Errorf("grok web search decode: %w", err)
	}

	// Only "message" output items carry the answer; reasoning and web_search_call items
	// are skipped. Citations and sources are appended in the order the API lists them.
	var digest string
	var citations []string
	var sources []WebSource
	for _, item := range out.Output {
		if item.Type != "message" {
			continue
		}
		for _, c := range item.Content {
			if c.Type == "output_text" {
				digest += c.Text
			}
			for _, a := range c.Annotations {
				if a.Type == "url_citation" && a.URL != "" {
					citations = append(citations, a.URL)
					// grok returns real publisher URLs, so the host IS the display domain.
					sources = append(sources, WebSource{Title: hostOf(a.URL), URL: a.URL})
				}
			}
		}
	}
	usage := Usage{
		PromptTokens:     out.Usage.InputTokens,
		CachedTokens:     out.Usage.InputTokensDetails.CachedTokens,
		CompletionTokens: out.Usage.OutputTokens,
	}
	// Cost = the call's tokens + the $5/1k fee times the ACTUAL number of web_search
	// calls the request made (one request can search several times). Booked even when the
	// digest is empty (the 2xx was billed), so the caller accounts for it before degrading.
	// Cross-checked live against the API's own cost_in_usd_ticks — matched to 4 dp.
	wc := WebContext{
		Digest:    digest,
		Citations: citations,
		Sources:   sources,
		Usage:     usage,
		Cost: CostBreakdown{
			WebTool: computeUSD(p.model, usage, p.cfg) +
				float64(out.Usage.ServerSideToolUsageDetails.WebSearchCalls)*grokWebSearchPerCall,
		},
	}
	if digest == "" {
		// wc is returned WITH the error on purpose: it carries the billed usage/cost.
		return wc, errors.New("grok web search: empty result")
	}
	return wc, nil
}

// --- gemini_grounding (native v1beta google_search; current models incl. 2.5) ------

// geminiGrounding is the WebProvider backed by Gemini's grounded search. Each Fetch is
// first admitted against a daily cap kept in the store, then delegated to the Gemini
// client.
type geminiGrounding struct {
	gem    *geminiClient // performs the grounded search (groundedSearch)
	st     *Store        // daily-cap counter: IncrGroundingIfUnder / DecrGrounding
	cfg    *Config       // read for WebGroundingDailyCap, GeminiModel, GeminiGroundingPerPrompt
	logger *slog.Logger  // may be nil; only used to warn when a cap refund fails
}

// Fetch runs one grounded Gemini search for query, guarded by the durable daily cap.
//
// Outcomes:
//   - cap check failed or cap reached: zero WebContext plus the store error or
//     errGroundingCapped. Nothing was sent to Gemini, so no cost is reported.
//   - search failed (including "no citations"): WebContext carrying only Usage and Cost,
//     plus the search error; the cap slot is handed back.
//   - success: the digest, citations, sources, usage and cost; the cap slot stays used.
func (p *geminiGrounding) Fetch(ctx context.Context, query string) (WebContext, error) {
	// Admit against the cap BEFORE spending: a grounded prompt is billed whether or not
	// it grounds, and the per-prompt overage ($35/1k on 2.5) is what this guard bounds.
	// (grok_web_search needs no such cap — its $5/1k per-call fee is fully reserved per
	// request and bounded by the per-user request cap + global ceiling.)
	admitted, capErr := p.st.IncrGroundingIfUnder(p.cfg.WebGroundingDailyCap)
	if capErr != nil {
		return WebContext{}, capErr
	}
	if !admitted {
		// Refused before billing → no fee, no slot consumed.
		return WebContext{}, errGroundingCapped
	}

	found, searchErr := p.gem.groundedSearch(ctx, query)

	// SG1: once admitted, the prompt counts as billed. The token cost AND the flat
	// per-grounded-prompt fee are booked on every path from here on, error included;
	// the fee is the money truth the $10 ceiling must see and is tracked separately
	// from the cap quota.
	result := WebContext{
		Usage: found.Usage,
		Cost: CostBreakdown{
			Grounding:    computeUSD(p.cfg.GeminiModel, found.Usage, p.cfg),
			GroundingFee: p.cfg.GeminiGroundingPerPrompt,
		},
	}
	if searchErr == nil {
		result.Digest = found.Digest
		result.Citations = found.Citations
		result.Sources = found.Sources
		return result, nil
	}

	// SG4: the admitted slot yielded no usable grounding (no citations, or the call
	// failed), so give the cap slot back — over-routing and failed fetches must not burn
	// the day's grounded-answer budget. The fee above stays booked regardless.
	// Best-effort: a failed refund only makes the cap slightly tighter, never costs money.
	if refundErr := p.st.DecrGrounding(); refundErr != nil && p.logger != nil {
		p.logger.WarnContext(ctx, "grounding cap refund failed (non-fatal)", "err", refundErr)
	}
	return result, searchErr
}