vojo/apps/ai-bot/cascade_test.go

535 lines
23 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package main
import (
"context"
"errors"
"io"
"log/slog"
"strings"
"testing"
)
// discardLog returns a text-format slog logger that writes to io.Discard, so code
// under test can log freely without producing any test output.
func discardLog() *slog.Logger {
	sink := slog.NewTextHandler(io.Discard, nil)
	return slog.New(sink)
}
// fakeLLM is a scriptable LLMClient for dispatch/degrade tests. Its Complete method
// replays the scripted fields below and records how it was called.
type fakeLLM struct {
// text is the reply text Complete places in the response.
text string
// usage is the token usage Complete places in the response.
usage Usage
// err, when non-nil, is returned by Complete instead of a response.
err error
// calls counts how many times Complete has been invoked.
calls int
// lastReq is the request passed to the most recent Complete call.
lastReq LLMRequest
}
// Complete records the call (count and request) and then replays the scripted
// outcome: the scripted error when one is set, otherwise a response carrying the
// scripted text and usage with the fixed provider request ID "fake".
// The context argument is ignored.
func (f *fakeLLM) Complete(_ context.Context, req LLMRequest) (*LLMResponse, error) {
	f.lastReq = req
	f.calls++

	if scripted := f.err; scripted != nil {
		return nil, scripted
	}

	resp := LLMResponse{Text: f.text, Usage: f.usage, ProviderRequestID: "fake"}
	return &resp, nil
}
// fakeWeb is a scriptable web fetcher for the web-route tests. Its Fetch method
// replays the scripted fields below and records how it was called.
type fakeWeb struct {
// wc is the WebContext Fetch returns on success.
wc WebContext
// err, when non-nil, is returned by Fetch together with an empty WebContext.
err error
// calls counts how many times Fetch has been invoked.
calls int
// lastQuery is the query string passed to the most recent Fetch call.
lastQuery string
}
// Fetch records the call (count and query) and then replays the scripted outcome:
// an empty WebContext plus the scripted error when one is set, otherwise the
// scripted WebContext. The context argument is ignored.
func (f *fakeWeb) Fetch(_ context.Context, q string) (WebContext, error) {
	f.lastQuery = q
	f.calls++

	if scripted := f.err; scripted != nil {
		var none WebContext
		return none, scripted
	}
	return f.wc, nil
}
// cascadeCfg returns the baseline test config: the model names, reasoning settings,
// web provider and price table are populated, and no cascade feature flag is set
// (they stay at their zero value, i.e. off). It returns by value, so each test
// enables only the flags it needs on its own copy.
func cascadeCfg() Config {
	prices := map[string]ModelPrice{
		"grok-x":   {InputPerM: 1, CachedPerM: 0.2, OutputPerM: 2},
		"gemini-x": {InputPerM: 0.1, CachedPerM: 0.1, OutputPerM: 0.4},
	}
	return Config{
		XAIModel:         "grok-x",
		GeminiModel:      "gemini-x",
		ReasoningModel:   "grok-reason",
		MaxOutTok:        100,
		XAITemp:          0.5,
		ReasoningTrigger: "подумай глубже",
		ReasoningEffort:  "high",
		WebProvider:      webProviderGrokWebSearch,
		Prices:           prices,
	}
}
// msgs builds the minimal two-message conversation used by most tests: a fixed
// system prompt ("SYS") followed by a single user message containing body.
func msgs(body string) []Message {
	system := Message{Role: "system", Content: "SYS"}
	user := Message{Role: "user", Content: body}
	return []Message{system, user}
}
// TestGenerateAllFlagsOffIsGrokDirect is the cascade-off parity invariant: even a
// "trivial"-looking message goes to Grok, and Gemini is never touched, when the router
// is off.
func TestGenerateAllFlagsOffIsGrokDirect(t *testing.T) {
grok := &fakeLLM{text: "grok answer"}
gem := &fakeLLM{text: "should not run"}
cfg := cascadeCfg()
b := &Bot{cfg: &cfg, llm: grok, gemini: gem, log: discardLog()}
res, err := b.generate(context.Background(), "привет", msgs("привет"), "", true)
if err != nil {
t.Fatalf("generate: %v", err)
}
if res.route != routeGrokDirect || res.text != "grok answer" {
t.Fatalf("res = (%q,%q), want grok_direct/\"grok answer\"", res.route, res.text)
}
if res.decision.Source != "default" {
t.Fatalf("router source = %q, want default (router off)", res.decision.Source)
}
if grok.calls != 1 || gem.calls != 0 {
t.Fatalf("calls grok=%d gem=%d, want 1/0", grok.calls, gem.calls)
}
}
func TestGenerateTrivialOffload(t *testing.T) {
grok := &fakeLLM{text: "grok"}
gem := &fakeLLM{text: "gemini trivial"}
cfg := cascadeCfg()
cfg.RouterEnabled, cfg.TrivialOffloadEnabled = true, true
b := &Bot{cfg: &cfg, llm: grok, gemini: gem, log: discardLog()}
res, err := b.generate(context.Background(), "привет", msgs("привет"), "", true)
if err != nil {
t.Fatalf("generate: %v", err)
}
if res.route != routeTrivial || res.text != "gemini trivial" || res.finalModel != "gemini-x" {
t.Fatalf("res = (%q,%q,%q), want trivial/gemini", res.route, res.text, res.finalModel)
}
if gem.calls != 1 || grok.calls != 0 {
t.Fatalf("calls grok=%d gem=%d, want 0/1 (Gemini answered)", grok.calls, gem.calls)
}
}
// TestGenerateTrivialDegradesToGrok: Gemini failing on the trivial route must fall back
// to Grok, never go silent.
func TestGenerateTrivialDegradesToGrok(t *testing.T) {
grok := &fakeLLM{text: "grok fallback"}
gem := &fakeLLM{err: errors.New("gemini down")}
cfg := cascadeCfg()
cfg.RouterEnabled, cfg.TrivialOffloadEnabled = true, true
b := &Bot{cfg: &cfg, llm: grok, gemini: gem, log: discardLog()}
res, err := b.generate(context.Background(), "привет", msgs("привет"), "", true)
if err != nil {
t.Fatalf("generate: %v", err)
}
if res.route != routeGrokDirect || res.text != "grok fallback" {
t.Fatalf("res = (%q,%q), want grok_direct fallback", res.route, res.text)
}
if !res.fallback || res.degraded != degradeTrivial {
t.Fatalf("fallback=%v degraded=%q, want true/trivial_failed", res.fallback, res.degraded)
}
if gem.calls != 1 || grok.calls != 1 {
t.Fatalf("calls grok=%d gem=%d, want 1/1", grok.calls, gem.calls)
}
}
// TestGenerateWebThenGrok: a freshness query (classifier off → Layer-0 web) is fetched
// once and then synthesised by Grok. The result must be web-grounded with one citation
// and must book both a positive token cost and the web fee reported by the fetch.
//
// The web fee is compared with a small absolute tolerance instead of ==, so the
// assertion does not depend on how generate accumulates floating-point costs.
func TestGenerateWebThenGrok(t *testing.T) {
	const (
		body   = "какие новости сегодня"
		webFee = 0.1
		eps    = 1e-9 // absolute tolerance for the float cost comparison
	)
	grok := &fakeLLM{text: "synthesised", usage: Usage{PromptTokens: 100, CompletionTokens: 50}}
	web := &fakeWeb{wc: WebContext{Digest: "fresh facts", Citations: []string{"http://src"}, Cost: CostBreakdown{WebTool: webFee}}}
	cfg := cascadeCfg()
	cfg.RouterEnabled, cfg.WebEnabled = true, true
	b := &Bot{cfg: &cfg, llm: grok, web: web, log: discardLog()}

	res, err := b.generate(context.Background(), body, msgs(body), "", true)
	if err != nil {
		t.Fatalf("generate: %v", err)
	}
	if res.route != routeWebThenGrok || res.text != "synthesised" {
		t.Fatalf("res = (%q,%q), want web_then_grok/synthesised", res.route, res.text)
	}
	if diff := res.cost.WebTool - webFee; diff > eps || diff < -eps || res.cost.Token <= 0 {
		t.Fatalf("cost = %+v, want WebTool 0.1 + Token>0", res.cost)
	}
	if !res.webGrounded || res.citationCount != 1 {
		t.Fatalf("webGrounded=%v citations=%d, want true/1", res.webGrounded, res.citationCount)
	}
	if web.calls != 1 || grok.calls != 1 {
		t.Fatalf("calls web=%d grok=%d, want 1/1", web.calls, grok.calls)
	}
}
// TestGenerateWebDegradesToGrok: a web fetch failure (cap hit) degrades to grok_direct,
// books no web cost, and — being a RECENCY query — uses the staleness hedge, not abstain.
func TestGenerateWebDegradesToGrok(t *testing.T) {
grok := &fakeLLM{text: "grok fallback"}
web := &fakeWeb{err: errGroundingCapped}
cfg := cascadeCfg()
cfg.RouterEnabled, cfg.WebEnabled = true, true
b := &Bot{cfg: &cfg, llm: grok, web: web, log: discardLog()}
res, err := b.generate(context.Background(), "новости сегодня", msgs("новости сегодня"), "", true)
if err != nil {
t.Fatalf("generate: %v", err)
}
if res.route != routeGrokDirect || res.text != "grok fallback" || !res.fallback {
t.Fatalf("res = (%q,%q,fallback=%v), want grok_direct fallback", res.route, res.text, res.fallback)
}
if res.degraded != degradeGroundCap {
t.Fatalf("degraded = %q, want grounding_cap (the specific reason)", res.degraded)
}
if res.cost.WebTool != 0 || res.cost.Grounding != 0 {
t.Fatalf("web cost = %+v, want 0 (fetch failed before billing)", res.cost)
}
// Recency miss → staleness hedge ("устаревшими"), not the factual-abstain hedge.
if !hedgeContains(grok.lastReq.Messages, "устаревш") {
t.Fatalf("freshness degrade should use the staleness hedge; messages = %+v", grok.lastReq.Messages)
}
}
// TestGenerateReasoningForced: the manual trigger routes to the reasoning model with
// reasoning_effort, independent of ROUTER_ENABLED.
func TestGenerateReasoningForced(t *testing.T) {
grok := &fakeLLM{text: "deep answer"}
cfg := cascadeCfg()
cfg.ReasoningEnabled = true // ROUTER_ENABLED deliberately left off
b := &Bot{cfg: &cfg, llm: grok, log: discardLog()}
res, err := b.generate(context.Background(), "подумай глубже про сознание", msgs("подумай глубже про сознание"), "", true)
if err != nil {
t.Fatalf("generate: %v", err)
}
if res.route != routeReason || res.decision.Source != "forced" {
t.Fatalf("res route=%q source=%q, want reason/forced", res.route, res.decision.Source)
}
if grok.lastReq.ReasoningEffort != "high" || grok.lastReq.Model != "grok-reason" {
t.Fatalf("reasoning req = (effort %q, model %q), want high/grok-reason", grok.lastReq.ReasoningEffort, grok.lastReq.Model)
}
}
// TestClassifyTrivialAgreementGate: the trivial route needs agreement between the
// Layer-0 candidate and the classifier (trivial=true) at a high enough confidence.
// A low-confidence "trivial" verdict, or a "trivial" verdict on a substantive body,
// stays on grok_direct (no voice leak).
func TestClassifyTrivialAgreementGate(t *testing.T) {
	cfg := cascadeCfg()
	cfg.RouterEnabled = true
	cfg.RouterClassifierEnabled = true
	classifier := &fakeLLM{}
	bot := &Bot{cfg: &cfg, gemini: classifier, log: discardLog()}
	ctx := context.Background()
	var spent CostBreakdown

	// Trivial body + confident trivial verdict → trivial.
	classifier.text = `{"trivial":true,"needs_web":false,"confidence":0.95}`
	d := bot.classify(ctx, "привет", "USER: привет", &spent)
	if d.Route != routeTrivial {
		t.Fatalf("agreed high-confidence trivial = %q, want trivial", d.Route)
	}

	// Same body, but the verdict's confidence is too low → grok_direct.
	classifier.text = `{"trivial":true,"needs_web":false,"confidence":0.5}`
	d = bot.classify(ctx, "привет", "USER: привет", &spent)
	if d.Route != routeGrokDirect {
		t.Fatalf("low-confidence trivial = %q, want grok_direct (no leak)", d.Route)
	}

	// A non-trivial body can never be trivial, even if the classifier claims so.
	const substantive = "напиши подробное эссе про историю римской империи"
	classifier.text = `{"trivial":true,"needs_web":false,"confidence":0.99}`
	d = bot.classify(ctx, substantive, "USER: …", &spent)
	if d.Route != routeGrokDirect {
		t.Fatalf("classifier.trivial on a substantive body = %q, want grok_direct", d.Route)
	}
}
// TestClassifyClassifierErrorFallsBackToLayer0: when the classifier fails (transport
// error) or returns unparseable output, classify falls back to the deterministic
// Layer-0 verdict — grok_direct for a substantive body, web for a freshness body.
func TestClassifyClassifierErrorFallsBackToLayer0(t *testing.T) {
	const (
		essay      = "напиши эссе про рим"
		news       = "новости сегодня"
		transcript = "USER: …"
	)
	cfg := cascadeCfg()
	cfg.RouterEnabled = true
	cfg.RouterClassifierEnabled = true
	cfg.WebParanoid = true
	classifier := &fakeLLM{}
	bot := &Bot{cfg: &cfg, gemini: classifier, log: discardLog()}
	ctx := context.Background()
	var spent CostBreakdown

	// Transport error → Layer-0.
	classifier.err = errors.New("gemini down")
	d := bot.classify(ctx, essay, transcript, &spent)
	if d.Route != routeGrokDirect {
		t.Fatalf("classifier error on substantive body = %q, want grok_direct (Layer-0)", d.Route)
	}
	d = bot.classify(ctx, news, transcript, &spent)
	if d.Route != routeWebThenGrok {
		t.Fatalf("classifier error on freshness body = %q, want web (deterministic Layer-0 survives)", d.Route)
	}

	// Garbage JSON (no transport error) → also Layer-0.
	classifier.err = nil
	classifier.text = "not json at all"
	d = bot.classify(ctx, essay, transcript, &spent)
	if d.Route != routeGrokDirect {
		t.Fatalf("garbage classifier JSON = %q, want grok_direct (Layer-0)", d.Route)
	}
}
// TestGenerateRoadHouseWebParanoidDM is the headline regression: an obscure-entity factual
// lookup in a DM, with the classifier + WEB_PARANOID on, routes to web AND the fetch uses
// the classifier's context-resolved search_query (the follow-up rewrite). With paranoid
// off it correctly stays grok_direct (the canary-neutral baseline).
func TestGenerateRoadHouseWebParanoidDM(t *testing.T) {
const verdict = `{"needs_web":true,"verifiable":true,"entity_obscure":true,"time_sensitive":false,"trivial":false,"search_query":"Дом у дороги 2024 фильм актёрский состав","confidence":0.7}`
mk := func(paranoid bool) (*fakeLLM, *fakeWeb, genResult) {
grok := &fakeLLM{text: "voiced", usage: Usage{PromptTokens: 10, CompletionTokens: 5}}
gem := &fakeLLM{text: verdict}
web := &fakeWeb{wc: WebContext{Digest: "cast: Patrick Swayze…", Citations: []string{"http://imdb"}}}
cfg := cascadeCfg()
cfg.RouterEnabled, cfg.RouterClassifierEnabled, cfg.WebEnabled, cfg.WebParanoid = true, true, true, paranoid
b := &Bot{cfg: &cfg, llm: grok, gemini: gem, web: web, log: discardLog()}
res, err := b.generate(context.Background(), "2024 года", []Message{
{Role: "system", Content: "SYS"},
{Role: "user", Content: "кто снимался в фильме дом у дороги"},
{Role: "assistant", Content: "В фильме 1989 года…"},
{Role: "user", Content: "2024 года"},
}, "", true)
if err != nil {
t.Fatalf("generate: %v", err)
}
return grok, web, res
}
_, web, res := mk(true)
if res.route != routeWebThenGrok {
t.Fatalf("paranoid DM road-house = %q, want web_then_grok (the fix)", res.route)
}
if !res.rewriteUsed || web.lastQuery != "Дом у дороги 2024 фильм актёрский состав" {
t.Fatalf("fetch should use the rewritten query: rewriteUsed=%v lastQuery=%q", res.rewriteUsed, web.lastQuery)
}
_, _, resOff := mk(false)
if resOff.route != routeGrokDirect {
t.Fatalf("paranoid OFF road-house = %q, want grok_direct (baseline)", resOff.route)
}
}
// TestGenerateFollowupGroupUsesBareBody: in a GROUP the context-resolved rewrite is
// suppressed — the fetch uses the bare (sanitised) body, never the classifier's
// search_query, so a member's follow-up can't ground the wrong prior subject.
func TestGenerateFollowupGroupUsesBareBody(t *testing.T) {
const verdict = `{"needs_web":true,"verifiable":true,"entity_obscure":true,"search_query":"какой-то чужой фильм 2024","confidence":0.7}`
grok := &fakeLLM{text: "voiced"}
gem := &fakeLLM{text: verdict}
web := &fakeWeb{wc: WebContext{Digest: "d", Citations: []string{"http://s"}}}
cfg := cascadeCfg()
cfg.RouterEnabled, cfg.RouterClassifierEnabled, cfg.WebEnabled, cfg.WebParanoid = true, true, true, true
b := &Bot{cfg: &cfg, llm: grok, gemini: gem, web: web, log: discardLog()}
res, err := b.generate(context.Background(), "2024 года", msgs("2024 года"), "", false /* group */)
if err != nil {
t.Fatalf("generate: %v", err)
}
if res.route != routeWebThenGrok {
t.Fatalf("group route = %q, want web_then_grok", res.route)
}
if res.rewriteUsed || web.lastQuery != "2024 года" {
t.Fatalf("group must use the bare body, not the rewrite: rewriteUsed=%v lastQuery=%q", res.rewriteUsed, web.lastQuery)
}
}
// TestGenerateWebEmptySearchQueryFallsBackToBody: the rewrite-with-fallback contract's
// empty arm (§6/§12). A DM web route whose classifier returned an empty search_query must
// fetch the bare (sanitised) body and report rewriteUsed=false — never an empty query.
func TestGenerateWebEmptySearchQueryFallsBackToBody(t *testing.T) {
// verifiable:true so it genuinely routes web (the needs_web arm requires verifiable);
// search_query empty is the point — the fetch must fall back to the bare body.
const verdict = `{"needs_web":true,"verifiable":true,"entity_obscure":false,"search_query":"","confidence":0.7}`
grok := &fakeLLM{text: "voiced"}
gem := &fakeLLM{text: verdict}
web := &fakeWeb{wc: WebContext{Digest: "d", Citations: []string{"http://s"}}}
cfg := cascadeCfg()
cfg.RouterEnabled, cfg.RouterClassifierEnabled, cfg.WebEnabled, cfg.WebParanoid = true, true, true, true
b := &Bot{cfg: &cfg, llm: grok, gemini: gem, web: web, log: discardLog()}
const body = "в каком году основан Рим"
res, err := b.generate(context.Background(), body, msgs(body), "", true /* DM */)
if err != nil {
t.Fatalf("generate: %v", err)
}
if res.route != routeWebThenGrok {
t.Fatalf("route = %q, want web_then_grok", res.route)
}
if res.rewriteUsed || web.lastQuery != body {
t.Fatalf("empty search_query must fall back to the bare body: rewriteUsed=%v lastQuery=%q", res.rewriteUsed, web.lastQuery)
}
}
// TestGenerateFreshnessTrapDesignedWeb: a freshness lexeme in a rumination
// ("сегодня…") still hard-routes to web (the accepted, designed cheap false-web, §14.1).
func TestGenerateFreshnessTrapDesignedWeb(t *testing.T) {
grok := &fakeLLM{text: "x"}
web := &fakeWeb{wc: WebContext{Digest: "d", Citations: []string{"http://s"}}}
cfg := cascadeCfg()
cfg.RouterEnabled, cfg.WebEnabled = true, true // classifier off — freshness alone routes
b := &Bot{cfg: &cfg, llm: grok, web: web, log: discardLog()}
res, err := b.generate(context.Background(), "сегодня я думаю о смысле жизни", msgs("сегодня я думаю о смысле жизни"), "", true)
if err != nil {
t.Fatalf("generate: %v", err)
}
if res.route != routeWebThenGrok {
t.Fatalf("freshness rumination = %q, want web_then_grok (designed)", res.route)
}
}
// TestGenerateWebDegradeFactualAbstain: a STATIC verifiable-fact web miss uses the
// factual-abstain hedge (not the staleness caveat), so Grok abstains on names/dates
// rather than shipping a confident guess.
func TestGenerateWebDegradeFactualAbstain(t *testing.T) {
const verdict = `{"needs_web":true,"verifiable":true,"entity_obscure":true,"time_sensitive":false,"search_query":"q","confidence":0.7}`
grok := &fakeLLM{text: "honest answer"}
gem := &fakeLLM{text: verdict}
web := &fakeWeb{err: errors.New("fetch boom")}
cfg := cascadeCfg()
cfg.RouterEnabled, cfg.RouterClassifierEnabled, cfg.WebEnabled, cfg.WebParanoid = true, true, true, true
b := &Bot{cfg: &cfg, llm: grok, gemini: gem, web: web, log: discardLog()}
res, err := b.generate(context.Background(), "кто снимался в фильме дом у дороги", msgs("кто снимался в фильме дом у дороги"), "", true)
if err != nil {
t.Fatalf("generate: %v", err)
}
if res.route != routeGrokDirect || !res.fallback {
t.Fatalf("res route=%q fallback=%v, want grok_direct fallback", res.route, res.fallback)
}
if !hedgeContains(grok.lastReq.Messages, "Не удалось проверить") {
t.Fatalf("factual miss should use the abstain hedge; messages = %+v", grok.lastReq.Messages)
}
if hedgeContains(grok.lastReq.Messages, "устаревш") {
t.Fatalf("factual miss must NOT use the staleness hedge")
}
}
// TestFactualMissHedge covers the web-degrade hedge selection made by
// RouterDecision.factualMiss. A recency signal (Freshness or TimeSensitive) selects
// the staleness hedge (factualMiss=false); a static checkable-fact signal (Verifiable,
// EntityObscure, or a non-recency NeedsWeb) selects the abstain hedge
// (factualMiss=true).
//
// Each case runs as a named subtest, so a failure names the scenario directly and a
// single case can be selected with -run.
func TestFactualMissHedge(t *testing.T) {
	cases := []struct {
		name string
		d    RouterDecision
		want bool // true => abstain hedge
	}{
		{name: "freshness_recent", d: RouterDecision{Freshness: "recent"}, want: false},
		{name: "time_sensitive", d: RouterDecision{TimeSensitive: true}, want: false},
		{name: "verifiable", d: RouterDecision{Verifiable: true}, want: true},
		{name: "entity_obscure", d: RouterDecision{EntityObscure: true}, want: true},
		// Off-spec needs_web-only verdict → abstain (Q3).
		{name: "needs_web_only", d: RouterDecision{NeedsWeb: true}, want: true},
		// Recency still wins over needs_web.
		{name: "needs_web_time_sensitive", d: RouterDecision{NeedsWeb: true, TimeSensitive: true}, want: false},
		{name: "zero_value", d: RouterDecision{}, want: false},
	}
	for _, c := range cases {
		// Subtests run synchronously (no t.Parallel), so capturing c is safe on
		// every Go version.
		t.Run(c.name, func(t *testing.T) {
			if got := c.d.factualMiss(); got != c.want {
				t.Errorf("factualMiss(%+v) = %v, want %v", c.d, got, c.want)
			}
		})
	}
}
// TestReserveEstimate checks the reservation envelope in three steps:
//   - flags off → exactly grok_direct's estimate;
//   - gemini grounding + classifier on → Grok estimate + Gemini fetch estimate + the
//     per-prompt grounding fee + the always-on classifier leg (§7);
//   - zeroing the fee must lower the envelope by about the fee (regression guard for
//     an unbooked fee).
func TestReserveEstimate(t *testing.T) {
	offCfg := cascadeCfg()
	offBot := &Bot{cfg: &offCfg, log: discardLog()}
	wantOff := offBot.estimateUSD("grok-x")
	gotOff := offBot.reserveEstimate()
	if !approxEq(gotOff, wantOff) {
		t.Fatalf("flags-off reserve = %v, want grok_direct estimate %v", gotOff, wantOff)
	}

	feeCfg := cascadeCfg()
	feeCfg.WebEnabled = true
	feeCfg.WebProvider = webProviderGeminiGrounding
	feeCfg.RouterEnabled = true
	feeCfg.RouterClassifierEnabled = true
	feeCfg.GeminiGroundingPerPrompt = 0.035
	feeBot := &Bot{cfg: &feeCfg, log: discardLog()}
	wantFee := feeBot.estimateUSD("grok-x") + feeBot.estimateUSD("gemini-x") + 0.035 + feeBot.estimateUSD("gemini-x")
	gotFee := feeBot.reserveEstimate()
	if !approxEq(gotFee, wantFee) {
		t.Fatalf("web+classifier reserve = %v, want %v (XAI + gemini fetch + $0.035 fee + classifier leg)", gotFee, wantFee)
	}

	// The fee must actually move the envelope (regression guard for an unbooked fee).
	noFeeCfg := feeCfg
	noFeeCfg.GeminiGroundingPerPrompt = 0
	noFeeBot := &Bot{cfg: &noFeeCfg, log: discardLog()}
	delta := feeBot.reserveEstimate() - noFeeBot.reserveEstimate()
	if delta < 0.0349 {
		t.Fatalf("the grounding fee must raise the reservation by ~0.035")
	}
}
// TestGrokReasoningEffort: GROK_REASONING_EFFORT is sent on grok_direct (so grok-4.3
// can be kept fast with "none"), an empty value means the field is not sent (compat
// with grok-4.20-non-reasoning), and the reason route always sends "high" regardless
// of that setting.
func TestGrokReasoningEffort(t *testing.T) {
	// effortSent runs a single generate call against a fresh bot configured with the
	// given Grok effort and reasoning flag, and returns the ReasoningEffort carried by
	// the request the fake Grok received.
	effortSent := func(configured string, reasoning bool, reply, body string) string {
		grok := &fakeLLM{text: reply}
		cfg := cascadeCfg()
		cfg.GrokReasoningEffort = configured
		cfg.ReasoningEnabled = reasoning
		bot := &Bot{cfg: &cfg, llm: grok, log: discardLog()}
		_, err := bot.generate(context.Background(), body, msgs(body), "", true)
		if err != nil {
			t.Fatal(err)
		}
		return grok.lastReq.ReasoningEffort
	}

	if got := effortSent("none", false, "ok", "hello"); got != "none" {
		t.Fatalf("grok_direct effort = %q, want none", got)
	}
	if got := effortSent("", false, "ok", "hello"); got != "" {
		t.Fatalf("default effort = %q, want empty (not sent)", got)
	}
	if got := effortSent("none", true, "deep", "подумай глубже про X"); got != "high" {
		t.Fatalf("reason route effort = %q, want high (overrides GROK_REASONING_EFFORT)", got)
	}
}
// TestGenerateTerminalErrorPropagates: if even grok_direct fails, generate returns the
// error (respond turns it into refund + react), not a silent empty success.
func TestGenerateTerminalErrorPropagates(t *testing.T) {
grok := &fakeLLM{err: errors.New("xai down")}
cfg := cascadeCfg()
b := &Bot{cfg: &cfg, llm: grok, log: discardLog()}
if _, err := b.generate(context.Background(), "hello", msgs("hello"), "", true); err == nil {
t.Fatal("want terminal error when grok_direct fails, got nil")
}
}
// TestWebSynthMessagesNoRawURLs guards the source-leak fix: webSynthMessages must
// inject the grounded digest into a system note, but the raw gemini-grounding redirect
// URLs from the citations must NOT reach the synth prompt (Grok was pasting
// vertexaisearch.../grounding-api-redirect/... links into the reply).
func TestWebSynthMessagesNoRawURLs(t *testing.T) {
	const redirect = "https://vertexaisearch.cloud.google.com/grounding-api-redirect/AUZIYQabc123"
	grounded := WebContext{
		Digest:    "Титаник вышел в 1997, режиссёр Джеймс Кэмерон.",
		Citations: []string{redirect},
	}
	synth := webSynthMessages(msgs("в каком году титаник"), grounded)

	// Pick the system message carrying the fresh-data marker; if several match, the
	// last one wins.
	note := ""
	for _, m := range synth {
		if m.Role != "system" || !strings.Contains(m.Content, "Свежие данные") {
			continue
		}
		note = m.Content
	}
	if note == "" {
		t.Fatal("web synth note missing")
	}
	if !strings.Contains(note, "Титаник вышел в 1997") {
		t.Fatalf("digest not injected: %q", note)
	}
	for _, fragment := range []string{"vertexaisearch", "grounding-api-redirect", "http"} {
		if strings.Contains(note, fragment) {
			t.Fatalf("raw citation URL leaked into the synth prompt: %q", note)
		}
	}
}
// hedgeContains reports whether the Content of any message in ms contains sub.
// It returns false for an empty or nil slice.
func hedgeContains(ms []Message, sub string) bool {
	found := false
	for i := 0; i < len(ms) && !found; i++ {
		found = strings.Contains(ms[i].Content, sub)
	}
	return found
}
// approxEq reports whether a and b differ by strictly less than 1e-9 (absolute
// tolerance). A NaN difference compares as not equal.
func approxEq(a, b float64) bool {
	const eps = 1e-9
	diff := a - b
	if diff < 0 {
		diff = -diff
	}
	return diff < eps
}