vojo/apps/ai-bot/cascade_test.go

787 lines
34 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package main
import (
"context"
"errors"
"io"
"log/slog"
"strings"
"testing"
)
// discardLog builds a logger whose text handler writes to io.Discard, giving tests a
// usable *slog.Logger that produces no output.
func discardLog() *slog.Logger {
	sink := slog.NewTextHandler(io.Discard, nil)
	return slog.New(sink)
}
// fakeLLM is a scriptable LLM client double for the dispatch/degrade tests: it replays
// a canned reply (or error) and records how it was called.
type fakeLLM struct {
	// Scripted outcome.
	text  string // reply text returned on success
	usage Usage  // usage reported alongside the reply
	err   error  // when non-nil, Complete returns this instead of a reply

	// Recorded by Complete.
	calls   int        // number of Complete invocations so far
	lastReq LLMRequest // most recent request received
}

// Complete counts the call, remembers the request, and then returns either the scripted
// error or a response built from the scripted text and usage.
func (f *fakeLLM) Complete(_ context.Context, req LLMRequest) (*LLMResponse, error) {
	f.calls, f.lastReq = f.calls+1, req
	if f.err == nil {
		reply := &LLMResponse{Text: f.text, Usage: f.usage, ProviderRequestID: "fake"}
		return reply, nil
	}
	return nil, f.err
}
// fakeWeb is a scriptable web-fetch double: it returns a canned WebContext (or error)
// and records the queries it was asked for.
type fakeWeb struct {
	// Scripted outcome.
	wc  WebContext // context returned on success
	err error      // when non-nil, Fetch returns this with an empty WebContext

	// Recorded by Fetch.
	calls     int    // number of Fetch invocations so far
	lastQuery string // query string of the most recent Fetch
}

// Fetch counts the call, remembers the query, and then returns either the scripted
// error (with a zero WebContext) or the scripted WebContext.
func (f *fakeWeb) Fetch(_ context.Context, q string) (WebContext, error) {
	f.calls, f.lastQuery = f.calls+1, q
	if f.err == nil {
		return f.wc, nil
	}
	return WebContext{}, f.err
}
// cascadeCfg returns the baseline Config for these tests: model names, limits and the
// price table are filled in, while no enable flag is set (they keep their zero value).
// Each test switches on the flags it needs on its own copy.
func cascadeCfg() Config {
	priceTable := map[string]ModelPrice{
		"grok-x":   {InputPerM: 1, CachedPerM: 0.2, OutputPerM: 2},
		"gemini-x": {InputPerM: 0.1, CachedPerM: 0.1, OutputPerM: 0.4},
	}

	var cfg Config
	cfg.XAIModel = "grok-x"
	cfg.GeminiModel = "gemini-x"
	cfg.ReasoningModel = "grok-reason"
	cfg.MaxOutTok = 100
	cfg.XAITemp = 0.5
	cfg.ReasoningTrigger = "подумай глубже"
	cfg.ReasoningEffort = "high"
	cfg.WebProvider = webProviderGrokWebSearch
	cfg.Prices = priceTable
	return cfg
}
// msgs builds the minimal two-message conversation used by most tests: a fixed "SYS"
// system message followed by a user message carrying body.
func msgs(body string) []Message {
	system := Message{Role: "system", Content: "SYS"}
	user := Message{Role: "user", Content: body}
	return []Message{system, user}
}
// TestGenerateAllFlagsOffIsGrokDirect is the cascade-off parity invariant: even a
// "trivial"-looking message goes to Grok, and Gemini is never touched, when the router
// is off.
func TestGenerateAllFlagsOffIsGrokDirect(t *testing.T) {
grok := &fakeLLM{text: "grok answer"}
gem := &fakeLLM{text: "should not run"}
cfg := cascadeCfg()
b := &Bot{cfg: &cfg, llm: grok, gemini: gem, log: discardLog()}
res, err := b.generate(context.Background(), "привет", msgs("привет"), "", true)
if err != nil {
t.Fatalf("generate: %v", err)
}
if res.route != routeGrokDirect || res.text != "grok answer" {
t.Fatalf("res = (%q,%q), want grok_direct/\"grok answer\"", res.route, res.text)
}
if res.decision.Source != "default" {
t.Fatalf("router source = %q, want default (router off)", res.decision.Source)
}
if grok.calls != 1 || gem.calls != 0 {
t.Fatalf("calls grok=%d gem=%d, want 1/0", grok.calls, gem.calls)
}
}
func TestGenerateTrivialOffload(t *testing.T) {
grok := &fakeLLM{text: "grok"}
gem := &fakeLLM{text: "gemini trivial"}
cfg := cascadeCfg()
cfg.RouterEnabled, cfg.TrivialOffloadEnabled = true, true
b := &Bot{cfg: &cfg, llm: grok, gemini: gem, log: discardLog()}
res, err := b.generate(context.Background(), "привет", msgs("привет"), "", true)
if err != nil {
t.Fatalf("generate: %v", err)
}
if res.route != routeTrivial || res.text != "gemini trivial" || res.finalModel != "gemini-x" {
t.Fatalf("res = (%q,%q,%q), want trivial/gemini", res.route, res.text, res.finalModel)
}
if gem.calls != 1 || grok.calls != 0 {
t.Fatalf("calls grok=%d gem=%d, want 0/1 (Gemini answered)", grok.calls, gem.calls)
}
}
// TestGenerateTrivialDegradesToGrok covers the trivial-route failure path: when the
// Gemini client errors, generate must still answer through Grok and mark the result as
// a trivial-route fallback instead of going silent.
func TestGenerateTrivialDegradesToGrok(t *testing.T) {
	const body = "привет"

	brokenGemini := &fakeLLM{err: errors.New("gemini down")}
	grokStub := &fakeLLM{text: "grok fallback"}
	cfg := cascadeCfg()
	cfg.RouterEnabled = true
	cfg.TrivialOffloadEnabled = true
	bot := &Bot{cfg: &cfg, llm: grokStub, gemini: brokenGemini, log: discardLog()}

	got, err := bot.generate(context.Background(), body, msgs(body), "", true)
	if err != nil {
		t.Fatalf("generate: %v", err)
	}

	if got.route != routeGrokDirect || got.text != "grok fallback" {
		t.Fatalf("res = (%q,%q), want grok_direct fallback", got.route, got.text)
	}
	if flagged := got.fallback && got.degraded == degradeTrivial; !flagged {
		t.Fatalf("fallback=%v degraded=%q, want true/trivial_failed", got.fallback, got.degraded)
	}
	// Both clients are hit exactly once: the failed Gemini attempt, then the Grok fallback.
	if brokenGemini.calls != 1 || grokStub.calls != 1 {
		t.Fatalf("calls grok=%d gem=%d, want 1/1", grokStub.calls, brokenGemini.calls)
	}
}
// TestGenerateWebThenGrok: a freshness query (classifier off → Layer-0 web) fetches then
// has Grok synthesise, booking both calls' tokens + the web fee.
func TestGenerateWebThenGrok(t *testing.T) {
grok := &fakeLLM{text: "synthesised", usage: Usage{PromptTokens: 100, CompletionTokens: 50}}
web := &fakeWeb{wc: WebContext{Digest: "fresh facts", Citations: []string{"http://src"}, Cost: CostBreakdown{WebTool: 0.1}}}
cfg := cascadeCfg()
cfg.RouterEnabled, cfg.WebEnabled = true, true
b := &Bot{cfg: &cfg, llm: grok, web: web, log: discardLog()}
res, err := b.generate(context.Background(), "какие новости сегодня", msgs("какие новости сегодня"), "", true)
if err != nil {
t.Fatalf("generate: %v", err)
}
if res.route != routeWebThenGrok || res.text != "synthesised" {
t.Fatalf("res = (%q,%q), want web_then_grok/synthesised", res.route, res.text)
}
if res.cost.WebTool != 0.1 || res.cost.Token <= 0 {
t.Fatalf("cost = %+v, want WebTool 0.1 + Token>0", res.cost)
}
if !res.webGrounded || res.citationCount != 1 {
t.Fatalf("webGrounded=%v citations=%d, want true/1", res.webGrounded, res.citationCount)
}
if web.calls != 1 || grok.calls != 1 {
t.Fatalf("calls web=%d grok=%d, want 1/1", web.calls, grok.calls)
}
}
// TestGenerateWebDegradesToGrok: a web fetch failure (cap hit) degrades to grok_direct,
// books no web cost, and — being a RECENCY query — uses the staleness hedge, not abstain.
func TestGenerateWebDegradesToGrok(t *testing.T) {
grok := &fakeLLM{text: "grok fallback"}
web := &fakeWeb{err: errGroundingCapped}
cfg := cascadeCfg()
cfg.RouterEnabled, cfg.WebEnabled = true, true
b := &Bot{cfg: &cfg, llm: grok, web: web, log: discardLog()}
res, err := b.generate(context.Background(), "новости сегодня", msgs("новости сегодня"), "", true)
if err != nil {
t.Fatalf("generate: %v", err)
}
if res.route != routeGrokDirect || res.text != "grok fallback" || !res.fallback {
t.Fatalf("res = (%q,%q,fallback=%v), want grok_direct fallback", res.route, res.text, res.fallback)
}
if res.degraded != degradeGroundCap {
t.Fatalf("degraded = %q, want grounding_cap (the specific reason)", res.degraded)
}
if res.cost.WebTool != 0 || res.cost.Grounding != 0 {
t.Fatalf("web cost = %+v, want 0 (fetch failed before billing)", res.cost)
}
// Recency miss → staleness hedge ("out of date"), not the factual-abstain hedge.
if !hedgeContains(grok.lastReq.Messages, "out of date") {
t.Fatalf("freshness degrade should use the staleness hedge; messages = %+v", grok.lastReq.Messages)
}
}
// TestGenerateReasoningForced: the manual trigger routes to the reasoning model with
// reasoning_effort, independent of ROUTER_ENABLED.
func TestGenerateReasoningForced(t *testing.T) {
grok := &fakeLLM{text: "deep answer"}
cfg := cascadeCfg()
cfg.ReasoningEnabled = true // ROUTER_ENABLED deliberately left off
b := &Bot{cfg: &cfg, llm: grok, log: discardLog()}
res, err := b.generate(context.Background(), "подумай глубже про сознание", msgs("подумай глубже про сознание"), "", true)
if err != nil {
t.Fatalf("generate: %v", err)
}
if res.route != routeReason || res.decision.Source != "forced" {
t.Fatalf("res route=%q source=%q, want reason/forced", res.route, res.decision.Source)
}
if grok.lastReq.ReasoningEffort != "high" || grok.lastReq.Model != "grok-reason" {
t.Fatalf("reasoning req = (effort %q, model %q), want high/grok-reason", grok.lastReq.ReasoningEffort, grok.lastReq.Model)
}
}
// TestClassifyTrivialAgreementGate exercises the agreement gate for the trivial route:
// the body must look trivial to Layer-0 AND the classifier must say trivial AND its
// confidence must clear the trivial floor. A low-confidence verdict, or a "trivial"
// verdict on a substantive body, must stay on grok_direct.
func TestClassifyTrivialAgreementGate(t *testing.T) {
	cfg := cascadeCfg()
	cfg.RouterEnabled = true
	cfg.RouterClassifierEnabled = true
	classifier := &fakeLLM{}
	bot := &Bot{cfg: &cfg, gemini: classifier, log: discardLog()}

	ctx := context.Background()
	var spent CostBreakdown

	// Trivial body, confident classifier: trivial.
	classifier.text = `{"trivial":true,"needs_web":false,"confidence":0.95}`
	d := bot.classify(ctx, "привет", "USER: привет", &spent)
	if d.Route != routeTrivial {
		t.Fatalf("agreed high-confidence trivial = %q, want trivial", d.Route)
	}

	// Same body, classifier unsure: grok_direct.
	classifier.text = `{"trivial":true,"needs_web":false,"confidence":0.5}`
	d = bot.classify(ctx, "привет", "USER: привет", &spent)
	if d.Route != routeGrokDirect {
		t.Fatalf("low-confidence trivial = %q, want grok_direct (no leak)", d.Route)
	}

	// Substantive body: the classifier's "trivial" claim alone is not enough.
	const substantive = "напиши подробное эссе про историю римской империи"
	classifier.text = `{"trivial":true,"needs_web":false,"confidence":0.99}`
	d = bot.classify(ctx, substantive, "USER: …", &spent)
	if d.Route != routeGrokDirect {
		t.Fatalf("classifier.trivial on a substantive body = %q, want grok_direct", d.Route)
	}
}
// TestClassifyClassifierErrorFallsBackToLayer0: when the classifier call errors or
// returns unparseable text, classify must fall back to the deterministic Layer-0
// verdict: grok_direct for a substantive body, web for a freshness body.
func TestClassifyClassifierErrorFallsBackToLayer0(t *testing.T) {
	const (
		essayBody = "напиши эссе про рим"
		newsBody  = "новости сегодня"
		convo     = "USER: …"
	)

	cfg := cascadeCfg()
	cfg.RouterEnabled = true
	cfg.RouterClassifierEnabled = true
	cfg.WebParanoid = true
	classifier := &fakeLLM{}
	bot := &Bot{cfg: &cfg, gemini: classifier, log: discardLog()}

	ctx := context.Background()
	var spent CostBreakdown

	// Case 1: the classifier call itself fails.
	classifier.err = errors.New("gemini down")
	d := bot.classify(ctx, essayBody, convo, &spent)
	if d.Route != routeGrokDirect {
		t.Fatalf("classifier error on substantive body = %q, want grok_direct (Layer-0)", d.Route)
	}
	d = bot.classify(ctx, newsBody, convo, &spent)
	if d.Route != routeWebThenGrok {
		t.Fatalf("classifier error on freshness body = %q, want web (deterministic Layer-0 survives)", d.Route)
	}

	// Case 2: the call succeeds but the reply is not JSON.
	classifier.err = nil
	classifier.text = "not json at all"
	d = bot.classify(ctx, essayBody, convo, &spent)
	if d.Route != routeGrokDirect {
		t.Fatalf("garbage classifier JSON = %q, want grok_direct (Layer-0)", d.Route)
	}
}
// TestGenerateRoadHouseWebParanoidDM is the headline regression: an obscure-entity factual
// lookup in a DM, with the classifier + WEB_PARANOID on, routes to web AND the fetch uses
// the classifier's context-resolved search_query (the follow-up rewrite). With paranoid
// off it correctly stays grok_direct (the canary-neutral baseline).
func TestGenerateRoadHouseWebParanoidDM(t *testing.T) {
const verdict = `{"needs_web":true,"verifiable":true,"entity_obscure":true,"time_sensitive":false,"trivial":false,"search_query":"Дом у дороги 2024 фильм актёрский состав","confidence":0.7}`
mk := func(paranoid bool) (*fakeLLM, *fakeWeb, genResult) {
grok := &fakeLLM{text: "voiced", usage: Usage{PromptTokens: 10, CompletionTokens: 5}}
gem := &fakeLLM{text: verdict}
web := &fakeWeb{wc: WebContext{Digest: "cast: Patrick Swayze…", Citations: []string{"http://imdb"}}}
cfg := cascadeCfg()
cfg.RouterEnabled, cfg.RouterClassifierEnabled, cfg.WebEnabled, cfg.WebParanoid = true, true, true, paranoid
b := &Bot{cfg: &cfg, llm: grok, gemini: gem, web: web, log: discardLog()}
res, err := b.generate(context.Background(), "2024 года", []Message{
{Role: "system", Content: "SYS"},
{Role: "user", Content: "кто снимался в фильме дом у дороги"},
{Role: "assistant", Content: "В фильме 1989 года…"},
{Role: "user", Content: "2024 года"},
}, "", true)
if err != nil {
t.Fatalf("generate: %v", err)
}
return grok, web, res
}
_, web, res := mk(true)
if res.route != routeWebThenGrok {
t.Fatalf("paranoid DM road-house = %q, want web_then_grok (the fix)", res.route)
}
if !res.rewriteUsed || web.lastQuery != "Дом у дороги 2024 фильм актёрский состав" {
t.Fatalf("fetch should use the rewritten query: rewriteUsed=%v lastQuery=%q", res.rewriteUsed, web.lastQuery)
}
_, _, resOff := mk(false)
if resOff.route != routeGrokDirect {
t.Fatalf("paranoid OFF road-house = %q, want grok_direct (baseline)", resOff.route)
}
}
// TestGenerateFollowupGroupUsesBareBody: in a GROUP the context-resolved rewrite is
// suppressed — the fetch uses the bare (sanitised) body, never the classifier's
// search_query, so a member's follow-up can't ground the wrong prior subject.
func TestGenerateFollowupGroupUsesBareBody(t *testing.T) {
const verdict = `{"needs_web":true,"verifiable":true,"entity_obscure":true,"search_query":"какой-то чужой фильм 2024","confidence":0.7}`
grok := &fakeLLM{text: "voiced"}
gem := &fakeLLM{text: verdict}
web := &fakeWeb{wc: WebContext{Digest: "d", Citations: []string{"http://s"}}}
cfg := cascadeCfg()
cfg.RouterEnabled, cfg.RouterClassifierEnabled, cfg.WebEnabled, cfg.WebParanoid = true, true, true, true
b := &Bot{cfg: &cfg, llm: grok, gemini: gem, web: web, log: discardLog()}
res, err := b.generate(context.Background(), "2024 года", msgs("2024 года"), "", false /* group */)
if err != nil {
t.Fatalf("generate: %v", err)
}
if res.route != routeWebThenGrok {
t.Fatalf("group route = %q, want web_then_grok", res.route)
}
if res.rewriteUsed || web.lastQuery != "2024 года" {
t.Fatalf("group must use the bare body, not the rewrite: rewriteUsed=%v lastQuery=%q", res.rewriteUsed, web.lastQuery)
}
}
// TestGenerateWebEmptySearchQueryFallsBackToBody: the rewrite-with-fallback contract's
// empty arm (§6/§12). A DM web route whose classifier returned an empty search_query must
// fetch the bare (sanitised) body and report rewriteUsed=false — never an empty query.
func TestGenerateWebEmptySearchQueryFallsBackToBody(t *testing.T) {
// verifiable:true so it genuinely routes web (the needs_web arm requires verifiable);
// search_query empty is the point — the fetch must fall back to the bare body.
const verdict = `{"needs_web":true,"verifiable":true,"entity_obscure":false,"search_query":"","confidence":0.7}`
grok := &fakeLLM{text: "voiced"}
gem := &fakeLLM{text: verdict}
web := &fakeWeb{wc: WebContext{Digest: "d", Citations: []string{"http://s"}}}
cfg := cascadeCfg()
cfg.RouterEnabled, cfg.RouterClassifierEnabled, cfg.WebEnabled, cfg.WebParanoid = true, true, true, true
b := &Bot{cfg: &cfg, llm: grok, gemini: gem, web: web, log: discardLog()}
const body = "в каком году основан Рим"
res, err := b.generate(context.Background(), body, msgs(body), "", true /* DM */)
if err != nil {
t.Fatalf("generate: %v", err)
}
if res.route != routeWebThenGrok {
t.Fatalf("route = %q, want web_then_grok", res.route)
}
if res.rewriteUsed || web.lastQuery != body {
t.Fatalf("empty search_query must fall back to the bare body: rewriteUsed=%v lastQuery=%q", res.rewriteUsed, web.lastQuery)
}
}
// TestGenerateFreshnessTrapDesignedWeb: a freshness lexeme in a rumination
// ("сегодня…") still hard-routes to web (the accepted, designed cheap false-web, §14.1).
func TestGenerateFreshnessTrapDesignedWeb(t *testing.T) {
grok := &fakeLLM{text: "x"}
web := &fakeWeb{wc: WebContext{Digest: "d", Citations: []string{"http://s"}}}
cfg := cascadeCfg()
cfg.RouterEnabled, cfg.WebEnabled = true, true // classifier off — freshness alone routes
b := &Bot{cfg: &cfg, llm: grok, web: web, log: discardLog()}
res, err := b.generate(context.Background(), "сегодня я думаю о смысле жизни", msgs("сегодня я думаю о смысле жизни"), "", true)
if err != nil {
t.Fatalf("generate: %v", err)
}
if res.route != routeWebThenGrok {
t.Fatalf("freshness rumination = %q, want web_then_grok (designed)", res.route)
}
}
// TestGenerateWebDegradeFactualAbstain: a STATIC verifiable-fact web miss uses the
// factual-abstain hedge (not the staleness caveat), so Grok abstains on names/dates
// rather than shipping a confident guess.
func TestGenerateWebDegradeFactualAbstain(t *testing.T) {
const verdict = `{"needs_web":true,"verifiable":true,"entity_obscure":true,"time_sensitive":false,"search_query":"q","confidence":0.7}`
grok := &fakeLLM{text: "honest answer"}
gem := &fakeLLM{text: verdict}
web := &fakeWeb{err: errors.New("fetch boom")}
cfg := cascadeCfg()
cfg.RouterEnabled, cfg.RouterClassifierEnabled, cfg.WebEnabled, cfg.WebParanoid = true, true, true, true
b := &Bot{cfg: &cfg, llm: grok, gemini: gem, web: web, log: discardLog()}
res, err := b.generate(context.Background(), "кто снимался в фильме дом у дороги", msgs("кто снимался в фильме дом у дороги"), "", true)
if err != nil {
t.Fatalf("generate: %v", err)
}
if res.route != routeGrokDirect || !res.fallback {
t.Fatalf("res route=%q fallback=%v, want grok_direct fallback", res.route, res.fallback)
}
if !hedgeContains(grok.lastReq.Messages, "Couldn't verify the facts") {
t.Fatalf("factual miss should use the abstain hedge; messages = %+v", grok.lastReq.Messages)
}
if hedgeContains(grok.lastReq.Messages, "out of date") {
t.Fatalf("factual miss must NOT use the staleness hedge")
}
}
// TestFactualMissHedge tabulates the web-degrade hedge selection made by
// RouterDecision.factualMiss. A recency signal (Freshness or TimeSensitive) yields
// false (staleness hedge); a static checkable-fact signal (Verifiable, EntityObscure,
// or a NeedsWeb without recency) yields true (abstain hedge).
func TestFactualMissHedge(t *testing.T) {
	type hedgeCase struct {
		decision RouterDecision
		abstain  bool // expected factualMiss result; true selects the abstain hedge
	}
	table := []hedgeCase{
		{decision: RouterDecision{Freshness: "recent"}, abstain: false},
		{decision: RouterDecision{TimeSensitive: true}, abstain: false},
		{decision: RouterDecision{Verifiable: true}, abstain: true},
		{decision: RouterDecision{EntityObscure: true}, abstain: true},
		// Off-spec verdict with only needs_web set: abstain (Q3).
		{decision: RouterDecision{NeedsWeb: true}, abstain: true},
		// Recency still wins over needs_web.
		{decision: RouterDecision{NeedsWeb: true, TimeSensitive: true}, abstain: false},
		{decision: RouterDecision{}, abstain: false},
	}

	for i := range table {
		tc := table[i]
		got := tc.decision.factualMiss()
		if got == tc.abstain {
			continue
		}
		t.Errorf("factualMiss(%+v) = %v, want %v", tc.decision, got, tc.abstain)
	}
}
// TestReserveEstimate checks the reservation envelope. With every flag off it equals
// the grok_direct estimate; with Gemini grounding and the classifier on it also
// includes the Gemini fetch, the per-prompt grounding fee and the classifier leg (§7).
func TestReserveEstimate(t *testing.T) {
	// Flags off: the reservation is exactly the Grok estimate.
	plainCfg := cascadeCfg()
	plain := &Bot{cfg: &plainCfg, log: discardLog()}
	grokOnly := plain.estimateUSD("grok-x")
	if got := plain.reserveEstimate(); !approxEq(got, grokOnly) {
		t.Fatalf("flags-off reserve = %v, want grok_direct estimate %v", got, grokOnly)
	}

	// Web (Gemini grounding) + classifier on, with a per-prompt fee configured.
	fullCfg := cascadeCfg()
	fullCfg.WebEnabled = true
	fullCfg.WebProvider = webProviderGeminiGrounding
	fullCfg.RouterEnabled = true
	fullCfg.RouterClassifierEnabled = true
	fullCfg.GeminiGroundingPerPrompt = 0.035
	full := &Bot{cfg: &fullCfg, log: discardLog()}
	want := full.estimateUSD("grok-x") + full.estimateUSD("gemini-x") + 0.035 + full.estimateUSD("gemini-x")
	if got := full.reserveEstimate(); !approxEq(got, want) {
		t.Fatalf("web+classifier reserve = %v, want %v (XAI + gemini fetch + $0.035 fee + classifier leg)", got, want)
	}

	// Regression guard for an unbooked fee: zeroing the fee must shrink the envelope.
	noFeeCfg := fullCfg
	noFeeCfg.GeminiGroundingPerPrompt = 0
	noFee := &Bot{cfg: &noFeeCfg, log: discardLog()}
	if feeDelta := full.reserveEstimate() - noFee.reserveEstimate(); feeDelta < 0.0349 {
		t.Fatalf("the grounding fee must raise the reservation by ~0.035")
	}
}
// TestGrokReasoningEffort: GROK_REASONING_EFFORT is sent on grok_direct (so grok-4.3 can
// be kept fast with "none"), empty means not sent (compat with grok-4.20-non-reasoning),
// and the reason route always overrides to "high" regardless.
func TestGrokReasoningEffort(t *testing.T) {
grok := &fakeLLM{text: "ok"}
cfg := cascadeCfg()
cfg.GrokReasoningEffort = "none"
b := &Bot{cfg: &cfg, llm: grok, log: discardLog()}
if _, err := b.generate(context.Background(), "hello", msgs("hello"), "", true); err != nil {
t.Fatal(err)
}
if grok.lastReq.ReasoningEffort != "none" {
t.Fatalf("grok_direct effort = %q, want none", grok.lastReq.ReasoningEffort)
}
grokDef := &fakeLLM{text: "ok"}
cfgDef := cascadeCfg() // GrokReasoningEffort == ""
bDef := &Bot{cfg: &cfgDef, llm: grokDef, log: discardLog()}
if _, err := bDef.generate(context.Background(), "hello", msgs("hello"), "", true); err != nil {
t.Fatal(err)
}
if grokDef.lastReq.ReasoningEffort != "" {
t.Fatalf("default effort = %q, want empty (not sent)", grokDef.lastReq.ReasoningEffort)
}
grokR := &fakeLLM{text: "deep"}
cfgR := cascadeCfg()
cfgR.GrokReasoningEffort = "none"
cfgR.ReasoningEnabled = true
bR := &Bot{cfg: &cfgR, llm: grokR, log: discardLog()}
if _, err := bR.generate(context.Background(), "подумай глубже про X", msgs("подумай глубже про X"), "", true); err != nil {
t.Fatal(err)
}
if grokR.lastReq.ReasoningEffort != "high" {
t.Fatalf("reason route effort = %q, want high (overrides GROK_REASONING_EFFORT)", grokR.lastReq.ReasoningEffort)
}
}
// TestGenerateTerminalErrorPropagates: if even grok_direct fails, generate returns the
// error (respond turns it into refund + react), not a silent empty success.
func TestGenerateTerminalErrorPropagates(t *testing.T) {
grok := &fakeLLM{err: errors.New("xai down")}
cfg := cascadeCfg()
b := &Bot{cfg: &cfg, llm: grok, log: discardLog()}
if _, err := b.generate(context.Background(), "hello", msgs("hello"), "", true); err == nil {
t.Fatal("want terminal error when grok_direct fails, got nil")
}
}
// TestWebSynthMessagesNoRawURLs guards the system note built by webSynthMessages. The
// grounded digest must be injected, and the raw grounding redirect URLs must not reach
// the prompt, because Grok was pasting vertexaisearch.../grounding-api-redirect/...
// links into replies. The note must also address the "no internet access" rule so Grok
// uses the supplied data instead of denying web access.
func TestWebSynthMessagesNoRawURLs(t *testing.T) {
	grounded := WebContext{
		Digest:    "Титаник вышел в 1997, режиссёр Джеймс Кэмерон.",
		Citations: []string{"https://vertexaisearch.cloud.google.com/grounding-api-redirect/AUZIYQabc123"},
	}
	built := webSynthMessages(msgs("в каком году титаник"), grounded)

	// Pick the system message that carries the web-search note (the last one, if several).
	note := ""
	for i := range built {
		if built[i].Role != "system" {
			continue
		}
		if strings.Contains(built[i].Content, "web-search results") {
			note = built[i].Content
		}
	}
	if note == "" {
		t.Fatal("web synth note missing")
	}

	if !strings.Contains(note, "Титаник вышел в 1997") {
		t.Fatalf("digest not injected: %q", note)
	}
	for _, banned := range []string{"vertexaisearch", "grounding-api-redirect", "http"} {
		if strings.Contains(note, banned) {
			t.Fatalf("raw citation URL leaked into the synth prompt: %q", note)
		}
	}
	// The note has to mention the "no internet access" rule in order to lift it.
	if !strings.Contains(note, "no internet access") {
		t.Fatalf("note must lift the no-internet rule for the web turn: %q", note)
	}
}
// failFirstLLM is an LLM client double whose first Complete call fails and whose later
// calls succeed. The project degrade test uses it: the KB-injecting Grok call fails
// while the grok_direct fallback that follows works.
type failFirstLLM struct {
	failErr error  // returned by the first Complete call
	okText  string // reply text for every call after the first

	calls   int        // number of Complete invocations so far
	lastReq LLMRequest // most recent request received
}

// Complete records the call and request, returns failErr on the first invocation and a
// response carrying okText on every subsequent one.
func (f *failFirstLLM) Complete(_ context.Context, req LLMRequest) (*LLMResponse, error) {
	f.calls, f.lastReq = f.calls+1, req
	if firstCall := f.calls == 1; firstCall {
		return nil, f.failErr
	}
	reply := &LLMResponse{Text: f.okText, ProviderRequestID: "fake"}
	return reply, nil
}
// TestGenerateProjectFlagOffByteIdentical: even with a KB loaded and a product question,
// all cascade flags off → grok_direct, Gemini untouched, the KB never reaches the prompt.
func TestGenerateProjectFlagOffByteIdentical(t *testing.T) {
grok := &fakeLLM{text: "grok answer"}
gem := &fakeLLM{text: "should not run"}
cfg := cascadeCfg()
cfg.ProjectKB = "VOJO FACTS" // present but the flag is off
b := &Bot{cfg: &cfg, llm: grok, gemini: gem, log: discardLog()}
res, err := b.generate(context.Background(), "что такое vojo", msgs("что такое vojo"), "", true)
if err != nil {
t.Fatalf("generate: %v", err)
}
if res.route != routeGrokDirect {
t.Fatalf("route=%q, want grok_direct (all flags off)", res.route)
}
if gem.calls != 0 {
t.Fatalf("gemini called %d, want 0 (router off)", gem.calls)
}
if hedgeContains(grok.lastReq.Messages, "VOJO FACTS") {
t.Fatalf("KB leaked into the grok prompt with flags off: %+v", grok.lastReq.Messages)
}
}
// TestGenerateProjectFlagOffFallsThrough is the canary-clean property: with the classifier
// on but PROJECT_KB_ENABLED off, Combine still DECIDES project (so about_project is recorded
// for "would-have-fired" measurement) but EXECUTION falls through to grok_direct — the KB is
// never injected and the answer is byte-identical to today's grok_direct.
func TestGenerateProjectFlagOffFallsThrough(t *testing.T) {
const verdict = `{"about_project":true,"confidence":0.9}`
grok := &fakeLLM{text: "grok answer"}
gem := &fakeLLM{text: verdict}
cfg := cascadeCfg()
cfg.RouterEnabled, cfg.RouterClassifierEnabled = true, true // PROJECT_KB_ENABLED deliberately OFF
cfg.ProjectKB = "VOJO FACTS"
b := &Bot{cfg: &cfg, llm: grok, gemini: gem, log: discardLog()}
res, err := b.generate(context.Background(), "что умеет vojo", msgs("что умеет vojo"), "", true)
if err != nil {
t.Fatalf("generate: %v", err)
}
if res.decision.Route != routeProject {
t.Fatalf("decision.Route=%q, want project_then_grok (the would-have-fired signal)", res.decision.Route)
}
if !res.decision.AboutProject {
t.Fatalf("about_project must be recorded for telemetry even with the flag off")
}
if res.route != routeGrokDirect {
t.Fatalf("route=%q, want grok_direct (flag off → fall through)", res.route)
}
if hedgeContains(grok.lastReq.Messages, "VOJO FACTS") {
t.Fatalf("KB injected despite the flag being off: %+v", grok.lastReq.Messages)
}
if grok.calls != 1 || gem.calls != 1 {
t.Fatalf("calls grok=%d gem=%d, want 1/1 (classifier + grok_direct)", grok.calls, gem.calls)
}
}
// TestGenerateProjectThenGrok: with the gate on, an about_project verdict routes to
// project_then_grok, injects the KB as a system note, and Grok voices it — one Grok call,
// Gemini only as the classifier.
func TestGenerateProjectThenGrok(t *testing.T) {
const verdict = `{"about_project":true,"needs_web":false,"confidence":0.9}`
grok := &fakeLLM{text: "voiced from KB", usage: Usage{PromptTokens: 20, CompletionTokens: 8}}
gem := &fakeLLM{text: verdict}
cfg := cascadeCfg()
cfg.RouterEnabled, cfg.RouterClassifierEnabled, cfg.ProjectKBEnabled = true, true, true
cfg.ProjectKB = "VOJO FACTS: encrypted DMs, voice calls; no group calls yet."
b := &Bot{cfg: &cfg, llm: grok, gemini: gem, log: discardLog()}
res, err := b.generate(context.Background(), "что умеет vojo", msgs("что умеет vojo"), "", true)
if err != nil {
t.Fatalf("generate: %v", err)
}
if res.route != routeProject || res.text != "voiced from KB" || res.finalModel != "grok-x" {
t.Fatalf("res=(%q,%q,%q), want project_then_grok/voiced/grok-x", res.route, res.text, res.finalModel)
}
if !hedgeContains(grok.lastReq.Messages, "VOJO FACTS: encrypted DMs") {
t.Fatalf("KB not injected into the grok prompt: %+v", grok.lastReq.Messages)
}
if grok.calls != 1 || gem.calls != 1 {
t.Fatalf("calls grok=%d gem=%d, want 1/1 (classifier + one project synth)", grok.calls, gem.calls)
}
}
// TestGenerateAboutProjectFalseNoKB: when the classifier says about_project=false, the KB is
// NOT injected even with the flag on — the route trusts the classifier in both directions.
func TestGenerateAboutProjectFalseNoKB(t *testing.T) {
const verdict = `{"about_project":false,"confidence":0.9}`
grok := &fakeLLM{text: "grok answer"}
gem := &fakeLLM{text: verdict}
cfg := cascadeCfg()
cfg.RouterEnabled, cfg.RouterClassifierEnabled, cfg.ProjectKBEnabled = true, true, true
cfg.ProjectKB = "VOJO FACTS"
b := &Bot{cfg: &cfg, llm: grok, gemini: gem, log: discardLog()}
res, err := b.generate(context.Background(), "расскажи про телеграм", msgs("расскажи про телеграм"), "", true)
if err != nil {
t.Fatalf("generate: %v", err)
}
if res.decision.Route == routeProject || res.route != routeGrokDirect {
t.Fatalf("about_project=false routed to project: decision=%q route=%q, want grok_direct", res.decision.Route, res.route)
}
if hedgeContains(grok.lastReq.Messages, "VOJO FACTS") {
t.Fatalf("KB injected when the classifier said not-about-project: %+v", grok.lastReq.Messages)
}
if grok.calls != 1 {
t.Fatalf("grok calls=%d, want 1 (no project synth attempt)", grok.calls)
}
}
// TestGenerateProjectContextFollowup: the headline live case — a context-resolved follow-up
// ("Про этот", no literal "vojo") that the classifier flags about_project=true routes to the
// KB. This is what the old regex-hint gate wrongly blocked.
func TestGenerateProjectContextFollowup(t *testing.T) {
const verdict = `{"about_project":true,"confidence":1.0}`
grok := &fakeLLM{text: "voiced from KB"}
gem := &fakeLLM{text: verdict}
cfg := cascadeCfg()
cfg.RouterEnabled, cfg.RouterClassifierEnabled, cfg.ProjectKBEnabled = true, true, true
cfg.ProjectKB = "VOJO FACTS: messaging, calls, channels."
b := &Bot{cfg: &cfg, llm: grok, gemini: gem, log: discardLog()}
res, err := b.generate(context.Background(), "Про этот", []Message{
{Role: "system", Content: "SYS"},
{Role: "user", Content: "знаешь что-нибудь про мессенджер?"},
{Role: "assistant", Content: "Знаю. Про какой именно?"},
{Role: "user", Content: "Про этот"},
}, "", true)
if err != nil {
t.Fatalf("generate: %v", err)
}
if res.route != routeProject {
t.Fatalf("context follow-up = %q, want project_then_grok (no literal 'vojo' needed)", res.route)
}
if !hedgeContains(grok.lastReq.Messages, "VOJO FACTS: messaging") {
t.Fatalf("KB not injected on the context-resolved follow-up: %+v", grok.lastReq.Messages)
}
}
// TestGenerateProjectDegradesToGrok: the KB-injecting Grok call fails → degrade to
// grok_direct with the project-abstain hedge (never silent, never a Vojo guess from empty
// memory).
func TestGenerateProjectDegradesToGrok(t *testing.T) {
const verdict = `{"about_project":true,"confidence":0.9}`
grok := &failFirstLLM{failErr: errors.New("grok boom on KB turn"), okText: "honest fallback"}
gem := &fakeLLM{text: verdict}
cfg := cascadeCfg()
cfg.RouterEnabled, cfg.RouterClassifierEnabled, cfg.ProjectKBEnabled = true, true, true
cfg.ProjectKB = "VOJO FACTS"
b := &Bot{cfg: &cfg, llm: grok, gemini: gem, log: discardLog()}
res, err := b.generate(context.Background(), "что умеет vojo", msgs("что умеет vojo"), "", true)
if err != nil {
t.Fatalf("generate: %v", err)
}
if res.route != routeGrokDirect || res.text != "honest fallback" || !res.fallback {
t.Fatalf("res=(%q,%q,fallback=%v), want grok_direct/honest fallback/true", res.route, res.text, res.fallback)
}
if res.degraded != degradeProject {
t.Fatalf("degraded=%q, want %q", res.degraded, degradeProject)
}
if !hedgeContains(grok.lastReq.Messages, "Couldn't load the Vojo product info") {
t.Fatalf("project degrade should inject the abstain hedge; messages=%+v", grok.lastReq.Messages)
}
if grok.calls != 2 {
t.Fatalf("grok calls=%d, want 2 (failed KB attempt + grok_direct fallback)", grok.calls)
}
}
// TestProjectKBMessagesScoped guards the anti-hallucination system note built by
// projectKBMessages: the KB is injected between <FACTS> tags, Vojo claims are
// restricted to those FACTS, the general (non-Vojo) part of an answer stays licensed,
// and an explicit abstain clause is present.
func TestProjectKBMessagesScoped(t *testing.T) {
	built := projectKBMessages(msgs("что умеет vojo"), "VOJO FACT: chats and calls")

	// Pick the system message that carries the FACTS note (the last one, if several).
	note := ""
	for i := range built {
		if built[i].Role != "system" {
			continue
		}
		if strings.Contains(built[i].Content, "FACTS") {
			note = built[i].Content
		}
	}
	if note == "" {
		t.Fatal("project KB note missing")
	}

	// Each entry is a phrase the note must contain, checked in this order.
	required := []struct {
		needle  string
		failFmt string
	}{
		{"VOJO FACT: chats and calls", "KB not injected: %q"},
		{"<FACTS>", "note must delimit the KB with <FACTS> tags (tagged-context grounding): %q"},
		// The load-bearing hard-scoping clause. Without this check it could be softened
		// silently and the route would stop grounding.
		{"use ONLY the FACTS", "note must restrict Vojo claims to the FACTS (entity-scoping): %q"},
		// Lifts the base prompt's "no file/document access" rule for this turn, so Grok
		// does not claim it cannot access Vojo docs despite the injected FACTS.
		{"do NOT say you lack access", "note must lift the no-file-access rule so Grok treats the FACTS as available: %q"},
		{"general", "note must license the general (non-Vojo) part — entity-scoped: %q"},
		{"don't have that information", "note must carry the explicit abstain clause: %q"},
	}
	for _, req := range required {
		if !strings.Contains(note, req.needle) {
			t.Fatalf(req.failFmt, note)
		}
	}
}
// TestReserveEstimateProjectNoBump: switching ProjectKBEnabled on must leave the
// reservation unchanged. Per the design, the project route is a single Grok call on a
// prompt already capped at maxPromptTokens, which the grok_direct base already covers.
func TestReserveEstimateProjectNoBump(t *testing.T) {
	withoutKB := cascadeCfg()
	withoutKB.RouterEnabled = true
	withoutKB.RouterClassifierEnabled = true

	withKB := withoutKB
	withKB.ProjectKBEnabled = true
	withKB.ProjectKB = "facts"

	baseBot := &Bot{cfg: &withoutKB, log: discardLog()}
	projBot := &Bot{cfg: &withKB, log: discardLog()}

	if same := approxEq(baseBot.reserveEstimate(), projBot.reserveEstimate()); !same {
		t.Fatalf("PROJECT_KB_ENABLED changed reserveEstimate: %v vs %v", baseBot.reserveEstimate(), projBot.reserveEstimate())
	}
}
// hedgeContains reports whether the Content of any message in ms contains sub as a
// substring. It returns false for an empty slice.
func hedgeContains(ms []Message, sub string) bool {
	for i := range ms {
		if strings.Contains(ms[i].Content, sub) {
			return true
		}
	}
	return false
}
// approxEq reports whether a and b differ by strictly less than 1e-9, which is the
// tolerance the cost assertions in this file use for float64 comparisons.
func approxEq(a, b float64) bool {
	const tolerance = 1e-9
	diff := a - b
	if diff < 0 {
		diff = -diff
	}
	return diff < tolerance
}