787 lines
34 KiB
Go
787 lines
34 KiB
Go
package main
|
||
|
||
import (
|
||
"context"
|
||
"errors"
|
||
"io"
|
||
"log/slog"
|
||
"strings"
|
||
"testing"
|
||
)
|
||
|
||
// discardLog builds a *slog.Logger whose text handler writes to io.Discard, so
// code under test can log freely without producing any output.
func discardLog() *slog.Logger {
	sink := slog.NewTextHandler(io.Discard, nil)
	return slog.New(sink)
}
|
||
|
||
// fakeLLM is a scriptable LLMClient for dispatch/degrade tests.
|
||
type fakeLLM struct {
|
||
text string
|
||
usage Usage
|
||
err error
|
||
calls int
|
||
lastReq LLMRequest
|
||
}
|
||
|
||
func (f *fakeLLM) Complete(_ context.Context, req LLMRequest) (*LLMResponse, error) {
|
||
f.calls++
|
||
f.lastReq = req
|
||
if f.err != nil {
|
||
return nil, f.err
|
||
}
|
||
return &LLMResponse{Text: f.text, Usage: f.usage, ProviderRequestID: "fake"}, nil
|
||
}
|
||
|
||
type fakeWeb struct {
|
||
wc WebContext
|
||
err error
|
||
calls int
|
||
lastQuery string
|
||
}
|
||
|
||
func (f *fakeWeb) Fetch(_ context.Context, q string) (WebContext, error) {
|
||
f.calls++
|
||
f.lastQuery = q
|
||
if f.err != nil {
|
||
return WebContext{}, f.err
|
||
}
|
||
return f.wc, nil
|
||
}
|
||
|
||
// cascadeCfg is a config with the model/price table set and EVERY cascade flag off.
|
||
// Tests flip individual flags on a copy.
|
||
func cascadeCfg() Config {
|
||
return Config{
|
||
XAIModel: "grok-x", GeminiModel: "gemini-x", ReasoningModel: "grok-reason",
|
||
MaxOutTok: 100, XAITemp: 0.5,
|
||
ReasoningTrigger: "подумай глубже",
|
||
ReasoningEffort: "high",
|
||
WebProvider: webProviderGrokWebSearch,
|
||
Prices: map[string]ModelPrice{
|
||
"grok-x": {InputPerM: 1, CachedPerM: 0.2, OutputPerM: 2},
|
||
"gemini-x": {InputPerM: 0.1, CachedPerM: 0.1, OutputPerM: 0.4},
|
||
},
|
||
}
|
||
}
|
||
|
||
func msgs(body string) []Message {
|
||
return []Message{{Role: "system", Content: "SYS"}, {Role: "user", Content: body}}
|
||
}
|
||
|
||
// TestGenerateAllFlagsOffIsGrokDirect is the cascade-off parity invariant: even a
|
||
// "trivial"-looking message goes to Grok, and Gemini is never touched, when the router
|
||
// is off.
|
||
func TestGenerateAllFlagsOffIsGrokDirect(t *testing.T) {
|
||
grok := &fakeLLM{text: "grok answer"}
|
||
gem := &fakeLLM{text: "should not run"}
|
||
cfg := cascadeCfg()
|
||
b := &Bot{cfg: &cfg, llm: grok, gemini: gem, log: discardLog()}
|
||
|
||
res, err := b.generate(context.Background(), "привет", msgs("привет"), "", true)
|
||
if err != nil {
|
||
t.Fatalf("generate: %v", err)
|
||
}
|
||
if res.route != routeGrokDirect || res.text != "grok answer" {
|
||
t.Fatalf("res = (%q,%q), want grok_direct/\"grok answer\"", res.route, res.text)
|
||
}
|
||
if res.decision.Source != "default" {
|
||
t.Fatalf("router source = %q, want default (router off)", res.decision.Source)
|
||
}
|
||
if grok.calls != 1 || gem.calls != 0 {
|
||
t.Fatalf("calls grok=%d gem=%d, want 1/0", grok.calls, gem.calls)
|
||
}
|
||
}
|
||
|
||
func TestGenerateTrivialOffload(t *testing.T) {
|
||
grok := &fakeLLM{text: "grok"}
|
||
gem := &fakeLLM{text: "gemini trivial"}
|
||
cfg := cascadeCfg()
|
||
cfg.RouterEnabled, cfg.TrivialOffloadEnabled = true, true
|
||
b := &Bot{cfg: &cfg, llm: grok, gemini: gem, log: discardLog()}
|
||
|
||
res, err := b.generate(context.Background(), "привет", msgs("привет"), "", true)
|
||
if err != nil {
|
||
t.Fatalf("generate: %v", err)
|
||
}
|
||
if res.route != routeTrivial || res.text != "gemini trivial" || res.finalModel != "gemini-x" {
|
||
t.Fatalf("res = (%q,%q,%q), want trivial/gemini", res.route, res.text, res.finalModel)
|
||
}
|
||
if gem.calls != 1 || grok.calls != 0 {
|
||
t.Fatalf("calls grok=%d gem=%d, want 0/1 (Gemini answered)", grok.calls, gem.calls)
|
||
}
|
||
}
|
||
|
||
// TestGenerateTrivialDegradesToGrok: Gemini failing on the trivial route must fall back
|
||
// to Grok, never go silent.
|
||
func TestGenerateTrivialDegradesToGrok(t *testing.T) {
|
||
grok := &fakeLLM{text: "grok fallback"}
|
||
gem := &fakeLLM{err: errors.New("gemini down")}
|
||
cfg := cascadeCfg()
|
||
cfg.RouterEnabled, cfg.TrivialOffloadEnabled = true, true
|
||
b := &Bot{cfg: &cfg, llm: grok, gemini: gem, log: discardLog()}
|
||
|
||
res, err := b.generate(context.Background(), "привет", msgs("привет"), "", true)
|
||
if err != nil {
|
||
t.Fatalf("generate: %v", err)
|
||
}
|
||
if res.route != routeGrokDirect || res.text != "grok fallback" {
|
||
t.Fatalf("res = (%q,%q), want grok_direct fallback", res.route, res.text)
|
||
}
|
||
if !res.fallback || res.degraded != degradeTrivial {
|
||
t.Fatalf("fallback=%v degraded=%q, want true/trivial_failed", res.fallback, res.degraded)
|
||
}
|
||
if gem.calls != 1 || grok.calls != 1 {
|
||
t.Fatalf("calls grok=%d gem=%d, want 1/1", grok.calls, gem.calls)
|
||
}
|
||
}
|
||
|
||
// TestGenerateWebThenGrok: a freshness query (classifier off → Layer-0 web) fetches then
|
||
// has Grok synthesise, booking both calls' tokens + the web fee.
|
||
func TestGenerateWebThenGrok(t *testing.T) {
|
||
grok := &fakeLLM{text: "synthesised", usage: Usage{PromptTokens: 100, CompletionTokens: 50}}
|
||
web := &fakeWeb{wc: WebContext{Digest: "fresh facts", Citations: []string{"http://src"}, Cost: CostBreakdown{WebTool: 0.1}}}
|
||
cfg := cascadeCfg()
|
||
cfg.RouterEnabled, cfg.WebEnabled = true, true
|
||
b := &Bot{cfg: &cfg, llm: grok, web: web, log: discardLog()}
|
||
|
||
res, err := b.generate(context.Background(), "какие новости сегодня", msgs("какие новости сегодня"), "", true)
|
||
if err != nil {
|
||
t.Fatalf("generate: %v", err)
|
||
}
|
||
if res.route != routeWebThenGrok || res.text != "synthesised" {
|
||
t.Fatalf("res = (%q,%q), want web_then_grok/synthesised", res.route, res.text)
|
||
}
|
||
if res.cost.WebTool != 0.1 || res.cost.Token <= 0 {
|
||
t.Fatalf("cost = %+v, want WebTool 0.1 + Token>0", res.cost)
|
||
}
|
||
if !res.webGrounded || res.citationCount != 1 {
|
||
t.Fatalf("webGrounded=%v citations=%d, want true/1", res.webGrounded, res.citationCount)
|
||
}
|
||
if web.calls != 1 || grok.calls != 1 {
|
||
t.Fatalf("calls web=%d grok=%d, want 1/1", web.calls, grok.calls)
|
||
}
|
||
}
|
||
|
||
// TestGenerateWebDegradesToGrok: a web fetch failure (cap hit) degrades to grok_direct,
|
||
// books no web cost, and — being a RECENCY query — uses the staleness hedge, not abstain.
|
||
func TestGenerateWebDegradesToGrok(t *testing.T) {
|
||
grok := &fakeLLM{text: "grok fallback"}
|
||
web := &fakeWeb{err: errGroundingCapped}
|
||
cfg := cascadeCfg()
|
||
cfg.RouterEnabled, cfg.WebEnabled = true, true
|
||
b := &Bot{cfg: &cfg, llm: grok, web: web, log: discardLog()}
|
||
|
||
res, err := b.generate(context.Background(), "новости сегодня", msgs("новости сегодня"), "", true)
|
||
if err != nil {
|
||
t.Fatalf("generate: %v", err)
|
||
}
|
||
if res.route != routeGrokDirect || res.text != "grok fallback" || !res.fallback {
|
||
t.Fatalf("res = (%q,%q,fallback=%v), want grok_direct fallback", res.route, res.text, res.fallback)
|
||
}
|
||
if res.degraded != degradeGroundCap {
|
||
t.Fatalf("degraded = %q, want grounding_cap (the specific reason)", res.degraded)
|
||
}
|
||
if res.cost.WebTool != 0 || res.cost.Grounding != 0 {
|
||
t.Fatalf("web cost = %+v, want 0 (fetch failed before billing)", res.cost)
|
||
}
|
||
// Recency miss → staleness hedge ("out of date"), not the factual-abstain hedge.
|
||
if !hedgeContains(grok.lastReq.Messages, "out of date") {
|
||
t.Fatalf("freshness degrade should use the staleness hedge; messages = %+v", grok.lastReq.Messages)
|
||
}
|
||
}
|
||
|
||
// TestGenerateReasoningForced: the manual trigger routes to the reasoning model with
|
||
// reasoning_effort, independent of ROUTER_ENABLED.
|
||
func TestGenerateReasoningForced(t *testing.T) {
|
||
grok := &fakeLLM{text: "deep answer"}
|
||
cfg := cascadeCfg()
|
||
cfg.ReasoningEnabled = true // ROUTER_ENABLED deliberately left off
|
||
b := &Bot{cfg: &cfg, llm: grok, log: discardLog()}
|
||
|
||
res, err := b.generate(context.Background(), "подумай глубже про сознание", msgs("подумай глубже про сознание"), "", true)
|
||
if err != nil {
|
||
t.Fatalf("generate: %v", err)
|
||
}
|
||
if res.route != routeReason || res.decision.Source != "forced" {
|
||
t.Fatalf("res route=%q source=%q, want reason/forced", res.route, res.decision.Source)
|
||
}
|
||
if grok.lastReq.ReasoningEffort != "high" || grok.lastReq.Model != "grok-reason" {
|
||
t.Fatalf("reasoning req = (effort %q, model %q), want high/grok-reason", grok.lastReq.ReasoningEffort, grok.lastReq.Model)
|
||
}
|
||
}
|
||
|
||
// TestClassifyTrivialAgreementGate: a trivial route requires the Layer-0 candidate AND
|
||
// classifier.trivial AND confidence ≥ trivialFloor. A low-confidence "trivial" or a
|
||
// classifier that disagrees stays on grok_direct (no voice leak).
|
||
func TestClassifyTrivialAgreementGate(t *testing.T) {
|
||
cfg := cascadeCfg()
|
||
cfg.RouterEnabled, cfg.RouterClassifierEnabled = true, true
|
||
gem := &fakeLLM{}
|
||
b := &Bot{cfg: &cfg, gemini: gem, log: discardLog()}
|
||
var cost CostBreakdown
|
||
|
||
gem.text = `{"trivial":true,"needs_web":false,"confidence":0.95}`
|
||
if d := b.classify(context.Background(), "привет", "USER: привет", &cost); d.Route != routeTrivial {
|
||
t.Fatalf("agreed high-confidence trivial = %q, want trivial", d.Route)
|
||
}
|
||
gem.text = `{"trivial":true,"needs_web":false,"confidence":0.5}`
|
||
if d := b.classify(context.Background(), "привет", "USER: привет", &cost); d.Route != routeGrokDirect {
|
||
t.Fatalf("low-confidence trivial = %q, want grok_direct (no leak)", d.Route)
|
||
}
|
||
// A non-trivial body can never be trivial even if the classifier claims so.
|
||
gem.text = `{"trivial":true,"needs_web":false,"confidence":0.99}`
|
||
const substantive = "напиши подробное эссе про историю римской империи"
|
||
if d := b.classify(context.Background(), substantive, "USER: …", &cost); d.Route != routeGrokDirect {
|
||
t.Fatalf("classifier.trivial on a substantive body = %q, want grok_direct", d.Route)
|
||
}
|
||
}
|
||
|
||
// TestClassifyClassifierErrorFallsBackToLayer0: a classifier error/garbage degrades to the
|
||
// deterministic Layer-0 verdict — grok_direct for a substantive body, web for a freshness
|
||
// body — never an ungrounded confident answer, never a degrade-to-web.
|
||
func TestClassifyClassifierErrorFallsBackToLayer0(t *testing.T) {
|
||
cfg := cascadeCfg()
|
||
cfg.RouterEnabled, cfg.RouterClassifierEnabled, cfg.WebParanoid = true, true, true
|
||
gem := &fakeLLM{}
|
||
b := &Bot{cfg: &cfg, gemini: gem, log: discardLog()}
|
||
var cost CostBreakdown
|
||
|
||
// Transport error → Layer-0.
|
||
gem.err = errors.New("gemini down")
|
||
if d := b.classify(context.Background(), "напиши эссе про рим", "USER: …", &cost); d.Route != routeGrokDirect {
|
||
t.Fatalf("classifier error on substantive body = %q, want grok_direct (Layer-0)", d.Route)
|
||
}
|
||
if d := b.classify(context.Background(), "новости сегодня", "USER: …", &cost); d.Route != routeWebThenGrok {
|
||
t.Fatalf("classifier error on freshness body = %q, want web (deterministic Layer-0 survives)", d.Route)
|
||
}
|
||
// Garbage JSON (no transport error) → also Layer-0.
|
||
gem.err, gem.text = nil, "not json at all"
|
||
if d := b.classify(context.Background(), "напиши эссе про рим", "USER: …", &cost); d.Route != routeGrokDirect {
|
||
t.Fatalf("garbage classifier JSON = %q, want grok_direct (Layer-0)", d.Route)
|
||
}
|
||
}
|
||
|
||
// TestGenerateRoadHouseWebParanoidDM is the headline regression: an obscure-entity factual
|
||
// lookup in a DM, with the classifier + WEB_PARANOID on, routes to web AND the fetch uses
|
||
// the classifier's context-resolved search_query (the follow-up rewrite). With paranoid
|
||
// off it correctly stays grok_direct (the canary-neutral baseline).
|
||
func TestGenerateRoadHouseWebParanoidDM(t *testing.T) {
|
||
const verdict = `{"needs_web":true,"verifiable":true,"entity_obscure":true,"time_sensitive":false,"trivial":false,"search_query":"Дом у дороги 2024 фильм актёрский состав","confidence":0.7}`
|
||
mk := func(paranoid bool) (*fakeLLM, *fakeWeb, genResult) {
|
||
grok := &fakeLLM{text: "voiced", usage: Usage{PromptTokens: 10, CompletionTokens: 5}}
|
||
gem := &fakeLLM{text: verdict}
|
||
web := &fakeWeb{wc: WebContext{Digest: "cast: Patrick Swayze…", Citations: []string{"http://imdb"}}}
|
||
cfg := cascadeCfg()
|
||
cfg.RouterEnabled, cfg.RouterClassifierEnabled, cfg.WebEnabled, cfg.WebParanoid = true, true, true, paranoid
|
||
b := &Bot{cfg: &cfg, llm: grok, gemini: gem, web: web, log: discardLog()}
|
||
res, err := b.generate(context.Background(), "2024 года", []Message{
|
||
{Role: "system", Content: "SYS"},
|
||
{Role: "user", Content: "кто снимался в фильме дом у дороги"},
|
||
{Role: "assistant", Content: "В фильме 1989 года…"},
|
||
{Role: "user", Content: "2024 года"},
|
||
}, "", true)
|
||
if err != nil {
|
||
t.Fatalf("generate: %v", err)
|
||
}
|
||
return grok, web, res
|
||
}
|
||
|
||
_, web, res := mk(true)
|
||
if res.route != routeWebThenGrok {
|
||
t.Fatalf("paranoid DM road-house = %q, want web_then_grok (the fix)", res.route)
|
||
}
|
||
if !res.rewriteUsed || web.lastQuery != "Дом у дороги 2024 фильм актёрский состав" {
|
||
t.Fatalf("fetch should use the rewritten query: rewriteUsed=%v lastQuery=%q", res.rewriteUsed, web.lastQuery)
|
||
}
|
||
|
||
_, _, resOff := mk(false)
|
||
if resOff.route != routeGrokDirect {
|
||
t.Fatalf("paranoid OFF road-house = %q, want grok_direct (baseline)", resOff.route)
|
||
}
|
||
}
|
||
|
||
// TestGenerateFollowupGroupUsesBareBody: in a GROUP the context-resolved rewrite is
|
||
// suppressed — the fetch uses the bare (sanitised) body, never the classifier's
|
||
// search_query, so a member's follow-up can't ground the wrong prior subject.
|
||
func TestGenerateFollowupGroupUsesBareBody(t *testing.T) {
|
||
const verdict = `{"needs_web":true,"verifiable":true,"entity_obscure":true,"search_query":"какой-то чужой фильм 2024","confidence":0.7}`
|
||
grok := &fakeLLM{text: "voiced"}
|
||
gem := &fakeLLM{text: verdict}
|
||
web := &fakeWeb{wc: WebContext{Digest: "d", Citations: []string{"http://s"}}}
|
||
cfg := cascadeCfg()
|
||
cfg.RouterEnabled, cfg.RouterClassifierEnabled, cfg.WebEnabled, cfg.WebParanoid = true, true, true, true
|
||
b := &Bot{cfg: &cfg, llm: grok, gemini: gem, web: web, log: discardLog()}
|
||
|
||
res, err := b.generate(context.Background(), "2024 года", msgs("2024 года"), "", false /* group */)
|
||
if err != nil {
|
||
t.Fatalf("generate: %v", err)
|
||
}
|
||
if res.route != routeWebThenGrok {
|
||
t.Fatalf("group route = %q, want web_then_grok", res.route)
|
||
}
|
||
if res.rewriteUsed || web.lastQuery != "2024 года" {
|
||
t.Fatalf("group must use the bare body, not the rewrite: rewriteUsed=%v lastQuery=%q", res.rewriteUsed, web.lastQuery)
|
||
}
|
||
}
|
||
|
||
// TestGenerateWebEmptySearchQueryFallsBackToBody: the rewrite-with-fallback contract's
|
||
// empty arm (§6/§12). A DM web route whose classifier returned an empty search_query must
|
||
// fetch the bare (sanitised) body and report rewriteUsed=false — never an empty query.
|
||
func TestGenerateWebEmptySearchQueryFallsBackToBody(t *testing.T) {
|
||
// verifiable:true so it genuinely routes web (the needs_web arm requires verifiable);
|
||
// search_query empty is the point — the fetch must fall back to the bare body.
|
||
const verdict = `{"needs_web":true,"verifiable":true,"entity_obscure":false,"search_query":"","confidence":0.7}`
|
||
grok := &fakeLLM{text: "voiced"}
|
||
gem := &fakeLLM{text: verdict}
|
||
web := &fakeWeb{wc: WebContext{Digest: "d", Citations: []string{"http://s"}}}
|
||
cfg := cascadeCfg()
|
||
cfg.RouterEnabled, cfg.RouterClassifierEnabled, cfg.WebEnabled, cfg.WebParanoid = true, true, true, true
|
||
b := &Bot{cfg: &cfg, llm: grok, gemini: gem, web: web, log: discardLog()}
|
||
|
||
const body = "в каком году основан Рим"
|
||
res, err := b.generate(context.Background(), body, msgs(body), "", true /* DM */)
|
||
if err != nil {
|
||
t.Fatalf("generate: %v", err)
|
||
}
|
||
if res.route != routeWebThenGrok {
|
||
t.Fatalf("route = %q, want web_then_grok", res.route)
|
||
}
|
||
if res.rewriteUsed || web.lastQuery != body {
|
||
t.Fatalf("empty search_query must fall back to the bare body: rewriteUsed=%v lastQuery=%q", res.rewriteUsed, web.lastQuery)
|
||
}
|
||
}
|
||
|
||
// TestGenerateFreshnessTrapDesignedWeb: a freshness lexeme in a rumination
|
||
// ("сегодня…") still hard-routes to web (the accepted, designed cheap false-web, §14.1).
|
||
func TestGenerateFreshnessTrapDesignedWeb(t *testing.T) {
|
||
grok := &fakeLLM{text: "x"}
|
||
web := &fakeWeb{wc: WebContext{Digest: "d", Citations: []string{"http://s"}}}
|
||
cfg := cascadeCfg()
|
||
cfg.RouterEnabled, cfg.WebEnabled = true, true // classifier off — freshness alone routes
|
||
b := &Bot{cfg: &cfg, llm: grok, web: web, log: discardLog()}
|
||
|
||
res, err := b.generate(context.Background(), "сегодня я думаю о смысле жизни", msgs("сегодня я думаю о смысле жизни"), "", true)
|
||
if err != nil {
|
||
t.Fatalf("generate: %v", err)
|
||
}
|
||
if res.route != routeWebThenGrok {
|
||
t.Fatalf("freshness rumination = %q, want web_then_grok (designed)", res.route)
|
||
}
|
||
}
|
||
|
||
// TestGenerateWebDegradeFactualAbstain: a STATIC verifiable-fact web miss uses the
|
||
// factual-abstain hedge (not the staleness caveat), so Grok abstains on names/dates
|
||
// rather than shipping a confident guess.
|
||
func TestGenerateWebDegradeFactualAbstain(t *testing.T) {
|
||
const verdict = `{"needs_web":true,"verifiable":true,"entity_obscure":true,"time_sensitive":false,"search_query":"q","confidence":0.7}`
|
||
grok := &fakeLLM{text: "honest answer"}
|
||
gem := &fakeLLM{text: verdict}
|
||
web := &fakeWeb{err: errors.New("fetch boom")}
|
||
cfg := cascadeCfg()
|
||
cfg.RouterEnabled, cfg.RouterClassifierEnabled, cfg.WebEnabled, cfg.WebParanoid = true, true, true, true
|
||
b := &Bot{cfg: &cfg, llm: grok, gemini: gem, web: web, log: discardLog()}
|
||
|
||
res, err := b.generate(context.Background(), "кто снимался в фильме дом у дороги", msgs("кто снимался в фильме дом у дороги"), "", true)
|
||
if err != nil {
|
||
t.Fatalf("generate: %v", err)
|
||
}
|
||
if res.route != routeGrokDirect || !res.fallback {
|
||
t.Fatalf("res route=%q fallback=%v, want grok_direct fallback", res.route, res.fallback)
|
||
}
|
||
if !hedgeContains(grok.lastReq.Messages, "Couldn't verify the facts") {
|
||
t.Fatalf("factual miss should use the abstain hedge; messages = %+v", grok.lastReq.Messages)
|
||
}
|
||
if hedgeContains(grok.lastReq.Messages, "out of date") {
|
||
t.Fatalf("factual miss must NOT use the staleness hedge")
|
||
}
|
||
}
|
||
|
||
// TestFactualMissHedge: the web-degrade hedge selection. A recency signal (Freshness or
|
||
// time_sensitive) → staleness (factualMiss=false); a static checkable-fact signal
|
||
// (verifiable / entity_obscure / a non-recency needs_web) → abstain (factualMiss=true).
|
||
func TestFactualMissHedge(t *testing.T) {
|
||
cases := []struct {
|
||
d RouterDecision
|
||
want bool // true => abstain hedge
|
||
}{
|
||
{RouterDecision{Freshness: "recent"}, false},
|
||
{RouterDecision{TimeSensitive: true}, false},
|
||
{RouterDecision{Verifiable: true}, true},
|
||
{RouterDecision{EntityObscure: true}, true},
|
||
{RouterDecision{NeedsWeb: true}, true}, // off-spec needs_web-only → abstain (Q3)
|
||
{RouterDecision{NeedsWeb: true, TimeSensitive: true}, false}, // recency still wins
|
||
{RouterDecision{}, false},
|
||
}
|
||
for _, c := range cases {
|
||
if got := c.d.factualMiss(); got != c.want {
|
||
t.Errorf("factualMiss(%+v) = %v, want %v", c.d, got, c.want)
|
||
}
|
||
}
|
||
}
|
||
|
||
// TestReserveEstimate: flags off → exactly grok_direct's estimate; with gemini grounding +
|
||
// classifier on, it includes the per-prompt fee AND the always-on classifier leg (§7).
|
||
func TestReserveEstimate(t *testing.T) {
|
||
cfg := cascadeCfg()
|
||
b := &Bot{cfg: &cfg, log: discardLog()}
|
||
base := b.estimateUSD("grok-x")
|
||
if got := b.reserveEstimate(); !approxEq(got, base) {
|
||
t.Fatalf("flags-off reserve = %v, want grok_direct estimate %v", got, base)
|
||
}
|
||
|
||
cfg2 := cascadeCfg()
|
||
cfg2.WebEnabled, cfg2.WebProvider = true, webProviderGeminiGrounding
|
||
cfg2.RouterEnabled, cfg2.RouterClassifierEnabled = true, true
|
||
cfg2.GeminiGroundingPerPrompt = 0.035
|
||
b2 := &Bot{cfg: &cfg2, log: discardLog()}
|
||
want := b2.estimateUSD("grok-x") + b2.estimateUSD("gemini-x") + 0.035 + b2.estimateUSD("gemini-x")
|
||
if got := b2.reserveEstimate(); !approxEq(got, want) {
|
||
t.Fatalf("web+classifier reserve = %v, want %v (XAI + gemini fetch + $0.035 fee + classifier leg)", got, want)
|
||
}
|
||
// The fee must actually move the envelope (regression guard for an unbooked fee).
|
||
cfg3 := cfg2
|
||
cfg3.GeminiGroundingPerPrompt = 0
|
||
b3 := &Bot{cfg: &cfg3, log: discardLog()}
|
||
if b2.reserveEstimate()-b3.reserveEstimate() < 0.0349 {
|
||
t.Fatalf("the grounding fee must raise the reservation by ~0.035")
|
||
}
|
||
}
|
||
|
||
// TestGrokReasoningEffort: GROK_REASONING_EFFORT is sent on grok_direct (so grok-4.3 can
|
||
// be kept fast with "none"), empty means not sent (compat with grok-4.20-non-reasoning),
|
||
// and the reason route always overrides to "high" regardless.
|
||
func TestGrokReasoningEffort(t *testing.T) {
|
||
grok := &fakeLLM{text: "ok"}
|
||
cfg := cascadeCfg()
|
||
cfg.GrokReasoningEffort = "none"
|
||
b := &Bot{cfg: &cfg, llm: grok, log: discardLog()}
|
||
if _, err := b.generate(context.Background(), "hello", msgs("hello"), "", true); err != nil {
|
||
t.Fatal(err)
|
||
}
|
||
if grok.lastReq.ReasoningEffort != "none" {
|
||
t.Fatalf("grok_direct effort = %q, want none", grok.lastReq.ReasoningEffort)
|
||
}
|
||
|
||
grokDef := &fakeLLM{text: "ok"}
|
||
cfgDef := cascadeCfg() // GrokReasoningEffort == ""
|
||
bDef := &Bot{cfg: &cfgDef, llm: grokDef, log: discardLog()}
|
||
if _, err := bDef.generate(context.Background(), "hello", msgs("hello"), "", true); err != nil {
|
||
t.Fatal(err)
|
||
}
|
||
if grokDef.lastReq.ReasoningEffort != "" {
|
||
t.Fatalf("default effort = %q, want empty (not sent)", grokDef.lastReq.ReasoningEffort)
|
||
}
|
||
|
||
grokR := &fakeLLM{text: "deep"}
|
||
cfgR := cascadeCfg()
|
||
cfgR.GrokReasoningEffort = "none"
|
||
cfgR.ReasoningEnabled = true
|
||
bR := &Bot{cfg: &cfgR, llm: grokR, log: discardLog()}
|
||
if _, err := bR.generate(context.Background(), "подумай глубже про X", msgs("подумай глубже про X"), "", true); err != nil {
|
||
t.Fatal(err)
|
||
}
|
||
if grokR.lastReq.ReasoningEffort != "high" {
|
||
t.Fatalf("reason route effort = %q, want high (overrides GROK_REASONING_EFFORT)", grokR.lastReq.ReasoningEffort)
|
||
}
|
||
}
|
||
|
||
// TestGenerateTerminalErrorPropagates: if even grok_direct fails, generate returns the
|
||
// error (respond turns it into refund + react), not a silent empty success.
|
||
func TestGenerateTerminalErrorPropagates(t *testing.T) {
|
||
grok := &fakeLLM{err: errors.New("xai down")}
|
||
cfg := cascadeCfg()
|
||
b := &Bot{cfg: &cfg, llm: grok, log: discardLog()}
|
||
|
||
if _, err := b.generate(context.Background(), "hello", msgs("hello"), "", true); err == nil {
|
||
t.Fatal("want terminal error when grok_direct fails, got nil")
|
||
}
|
||
}
|
||
|
||
// TestWebSynthMessagesNoRawURLs guards the web-synth note: the grounded digest is injected,
|
||
// the raw gemini-grounding redirect URLs must NOT reach the prompt (Grok was pasting
|
||
// vertexaisearch.../grounding-api-redirect/... links into the reply), and the note is
|
||
// authoritative enough that Grok uses the data instead of denying web access ("I don't
|
||
// have live web access" despite being handed fresh news).
|
||
func TestWebSynthMessagesNoRawURLs(t *testing.T) {
|
||
wc := WebContext{
|
||
Digest: "Титаник вышел в 1997, режиссёр Джеймс Кэмерон.",
|
||
Citations: []string{"https://vertexaisearch.cloud.google.com/grounding-api-redirect/AUZIYQabc123"},
|
||
}
|
||
out := webSynthMessages(msgs("в каком году титаник"), wc)
|
||
var note string
|
||
for _, m := range out {
|
||
if m.Role == "system" && strings.Contains(m.Content, "web-search results") {
|
||
note = m.Content
|
||
}
|
||
}
|
||
if note == "" {
|
||
t.Fatal("web synth note missing")
|
||
}
|
||
if !strings.Contains(note, "Титаник вышел в 1997") {
|
||
t.Fatalf("digest not injected: %q", note)
|
||
}
|
||
if strings.Contains(note, "vertexaisearch") || strings.Contains(note, "grounding-api-redirect") || strings.Contains(note, "http") {
|
||
t.Fatalf("raw citation URL leaked into the synth prompt: %q", note)
|
||
}
|
||
// The note must counter the "no internet access" rule so Grok actually uses the data.
|
||
if !strings.Contains(note, "no internet access") {
|
||
t.Fatalf("note must lift the no-internet rule for the web turn: %q", note)
|
||
}
|
||
}
|
||
|
||
// failFirstLLM errors on its first Complete call and succeeds after — for the project
|
||
// degrade test, where the KB-injecting Grok call fails but the grok_direct fallback works.
|
||
type failFirstLLM struct {
|
||
failErr error
|
||
okText string
|
||
calls int
|
||
lastReq LLMRequest
|
||
}
|
||
|
||
func (f *failFirstLLM) Complete(_ context.Context, req LLMRequest) (*LLMResponse, error) {
|
||
f.calls++
|
||
f.lastReq = req
|
||
if f.calls == 1 {
|
||
return nil, f.failErr
|
||
}
|
||
return &LLMResponse{Text: f.okText, ProviderRequestID: "fake"}, nil
|
||
}
|
||
|
||
// TestGenerateProjectFlagOffByteIdentical: even with a KB loaded and a product question,
|
||
// all cascade flags off → grok_direct, Gemini untouched, the KB never reaches the prompt.
|
||
func TestGenerateProjectFlagOffByteIdentical(t *testing.T) {
|
||
grok := &fakeLLM{text: "grok answer"}
|
||
gem := &fakeLLM{text: "should not run"}
|
||
cfg := cascadeCfg()
|
||
cfg.ProjectKB = "VOJO FACTS" // present but the flag is off
|
||
b := &Bot{cfg: &cfg, llm: grok, gemini: gem, log: discardLog()}
|
||
|
||
res, err := b.generate(context.Background(), "что такое vojo", msgs("что такое vojo"), "", true)
|
||
if err != nil {
|
||
t.Fatalf("generate: %v", err)
|
||
}
|
||
if res.route != routeGrokDirect {
|
||
t.Fatalf("route=%q, want grok_direct (all flags off)", res.route)
|
||
}
|
||
if gem.calls != 0 {
|
||
t.Fatalf("gemini called %d, want 0 (router off)", gem.calls)
|
||
}
|
||
if hedgeContains(grok.lastReq.Messages, "VOJO FACTS") {
|
||
t.Fatalf("KB leaked into the grok prompt with flags off: %+v", grok.lastReq.Messages)
|
||
}
|
||
}
|
||
|
||
// TestGenerateProjectFlagOffFallsThrough is the canary-clean property: with the classifier
|
||
// on but PROJECT_KB_ENABLED off, Combine still DECIDES project (so about_project is recorded
|
||
// for "would-have-fired" measurement) but EXECUTION falls through to grok_direct — the KB is
|
||
// never injected and the answer is byte-identical to today's grok_direct.
|
||
func TestGenerateProjectFlagOffFallsThrough(t *testing.T) {
|
||
const verdict = `{"about_project":true,"confidence":0.9}`
|
||
grok := &fakeLLM{text: "grok answer"}
|
||
gem := &fakeLLM{text: verdict}
|
||
cfg := cascadeCfg()
|
||
cfg.RouterEnabled, cfg.RouterClassifierEnabled = true, true // PROJECT_KB_ENABLED deliberately OFF
|
||
cfg.ProjectKB = "VOJO FACTS"
|
||
b := &Bot{cfg: &cfg, llm: grok, gemini: gem, log: discardLog()}
|
||
|
||
res, err := b.generate(context.Background(), "что умеет vojo", msgs("что умеет vojo"), "", true)
|
||
if err != nil {
|
||
t.Fatalf("generate: %v", err)
|
||
}
|
||
if res.decision.Route != routeProject {
|
||
t.Fatalf("decision.Route=%q, want project_then_grok (the would-have-fired signal)", res.decision.Route)
|
||
}
|
||
if !res.decision.AboutProject {
|
||
t.Fatalf("about_project must be recorded for telemetry even with the flag off")
|
||
}
|
||
if res.route != routeGrokDirect {
|
||
t.Fatalf("route=%q, want grok_direct (flag off → fall through)", res.route)
|
||
}
|
||
if hedgeContains(grok.lastReq.Messages, "VOJO FACTS") {
|
||
t.Fatalf("KB injected despite the flag being off: %+v", grok.lastReq.Messages)
|
||
}
|
||
if grok.calls != 1 || gem.calls != 1 {
|
||
t.Fatalf("calls grok=%d gem=%d, want 1/1 (classifier + grok_direct)", grok.calls, gem.calls)
|
||
}
|
||
}
|
||
|
||
// TestGenerateProjectThenGrok: with the gate on, an about_project verdict routes to
|
||
// project_then_grok, injects the KB as a system note, and Grok voices it — one Grok call,
|
||
// Gemini only as the classifier.
|
||
func TestGenerateProjectThenGrok(t *testing.T) {
|
||
const verdict = `{"about_project":true,"needs_web":false,"confidence":0.9}`
|
||
grok := &fakeLLM{text: "voiced from KB", usage: Usage{PromptTokens: 20, CompletionTokens: 8}}
|
||
gem := &fakeLLM{text: verdict}
|
||
cfg := cascadeCfg()
|
||
cfg.RouterEnabled, cfg.RouterClassifierEnabled, cfg.ProjectKBEnabled = true, true, true
|
||
cfg.ProjectKB = "VOJO FACTS: encrypted DMs, voice calls; no group calls yet."
|
||
b := &Bot{cfg: &cfg, llm: grok, gemini: gem, log: discardLog()}
|
||
|
||
res, err := b.generate(context.Background(), "что умеет vojo", msgs("что умеет vojo"), "", true)
|
||
if err != nil {
|
||
t.Fatalf("generate: %v", err)
|
||
}
|
||
if res.route != routeProject || res.text != "voiced from KB" || res.finalModel != "grok-x" {
|
||
t.Fatalf("res=(%q,%q,%q), want project_then_grok/voiced/grok-x", res.route, res.text, res.finalModel)
|
||
}
|
||
if !hedgeContains(grok.lastReq.Messages, "VOJO FACTS: encrypted DMs") {
|
||
t.Fatalf("KB not injected into the grok prompt: %+v", grok.lastReq.Messages)
|
||
}
|
||
if grok.calls != 1 || gem.calls != 1 {
|
||
t.Fatalf("calls grok=%d gem=%d, want 1/1 (classifier + one project synth)", grok.calls, gem.calls)
|
||
}
|
||
}
|
||
|
||
// TestGenerateAboutProjectFalseNoKB: when the classifier says about_project=false, the KB is
|
||
// NOT injected even with the flag on — the route trusts the classifier in both directions.
|
||
func TestGenerateAboutProjectFalseNoKB(t *testing.T) {
|
||
const verdict = `{"about_project":false,"confidence":0.9}`
|
||
grok := &fakeLLM{text: "grok answer"}
|
||
gem := &fakeLLM{text: verdict}
|
||
cfg := cascadeCfg()
|
||
cfg.RouterEnabled, cfg.RouterClassifierEnabled, cfg.ProjectKBEnabled = true, true, true
|
||
cfg.ProjectKB = "VOJO FACTS"
|
||
b := &Bot{cfg: &cfg, llm: grok, gemini: gem, log: discardLog()}
|
||
|
||
res, err := b.generate(context.Background(), "расскажи про телеграм", msgs("расскажи про телеграм"), "", true)
|
||
if err != nil {
|
||
t.Fatalf("generate: %v", err)
|
||
}
|
||
if res.decision.Route == routeProject || res.route != routeGrokDirect {
|
||
t.Fatalf("about_project=false routed to project: decision=%q route=%q, want grok_direct", res.decision.Route, res.route)
|
||
}
|
||
if hedgeContains(grok.lastReq.Messages, "VOJO FACTS") {
|
||
t.Fatalf("KB injected when the classifier said not-about-project: %+v", grok.lastReq.Messages)
|
||
}
|
||
if grok.calls != 1 {
|
||
t.Fatalf("grok calls=%d, want 1 (no project synth attempt)", grok.calls)
|
||
}
|
||
}
|
||
|
||
// TestGenerateProjectContextFollowup: the headline live case — a context-resolved follow-up
|
||
// ("Про этот", no literal "vojo") that the classifier flags about_project=true routes to the
|
||
// KB. This is what the old regex-hint gate wrongly blocked.
|
||
func TestGenerateProjectContextFollowup(t *testing.T) {
|
||
const verdict = `{"about_project":true,"confidence":1.0}`
|
||
grok := &fakeLLM{text: "voiced from KB"}
|
||
gem := &fakeLLM{text: verdict}
|
||
cfg := cascadeCfg()
|
||
cfg.RouterEnabled, cfg.RouterClassifierEnabled, cfg.ProjectKBEnabled = true, true, true
|
||
cfg.ProjectKB = "VOJO FACTS: messaging, calls, channels."
|
||
b := &Bot{cfg: &cfg, llm: grok, gemini: gem, log: discardLog()}
|
||
|
||
res, err := b.generate(context.Background(), "Про этот", []Message{
|
||
{Role: "system", Content: "SYS"},
|
||
{Role: "user", Content: "знаешь что-нибудь про мессенджер?"},
|
||
{Role: "assistant", Content: "Знаю. Про какой именно?"},
|
||
{Role: "user", Content: "Про этот"},
|
||
}, "", true)
|
||
if err != nil {
|
||
t.Fatalf("generate: %v", err)
|
||
}
|
||
if res.route != routeProject {
|
||
t.Fatalf("context follow-up = %q, want project_then_grok (no literal 'vojo' needed)", res.route)
|
||
}
|
||
if !hedgeContains(grok.lastReq.Messages, "VOJO FACTS: messaging") {
|
||
t.Fatalf("KB not injected on the context-resolved follow-up: %+v", grok.lastReq.Messages)
|
||
}
|
||
}
|
||
|
||
// TestGenerateProjectDegradesToGrok: the KB-injecting Grok call fails → degrade to
|
||
// grok_direct with the project-abstain hedge (never silent, never a Vojo guess from empty
|
||
// memory).
|
||
func TestGenerateProjectDegradesToGrok(t *testing.T) {
|
||
const verdict = `{"about_project":true,"confidence":0.9}`
|
||
grok := &failFirstLLM{failErr: errors.New("grok boom on KB turn"), okText: "honest fallback"}
|
||
gem := &fakeLLM{text: verdict}
|
||
cfg := cascadeCfg()
|
||
cfg.RouterEnabled, cfg.RouterClassifierEnabled, cfg.ProjectKBEnabled = true, true, true
|
||
cfg.ProjectKB = "VOJO FACTS"
|
||
b := &Bot{cfg: &cfg, llm: grok, gemini: gem, log: discardLog()}
|
||
|
||
res, err := b.generate(context.Background(), "что умеет vojo", msgs("что умеет vojo"), "", true)
|
||
if err != nil {
|
||
t.Fatalf("generate: %v", err)
|
||
}
|
||
if res.route != routeGrokDirect || res.text != "honest fallback" || !res.fallback {
|
||
t.Fatalf("res=(%q,%q,fallback=%v), want grok_direct/honest fallback/true", res.route, res.text, res.fallback)
|
||
}
|
||
if res.degraded != degradeProject {
|
||
t.Fatalf("degraded=%q, want %q", res.degraded, degradeProject)
|
||
}
|
||
if !hedgeContains(grok.lastReq.Messages, "Couldn't load the Vojo product info") {
|
||
t.Fatalf("project degrade should inject the abstain hedge; messages=%+v", grok.lastReq.Messages)
|
||
}
|
||
if grok.calls != 2 {
|
||
t.Fatalf("grok calls=%d, want 2 (failed KB attempt + grok_direct fallback)", grok.calls)
|
||
}
|
||
}
|
||
|
||
// TestProjectKBMessagesScoped guards the anti-hallucination note: the KB is injected
|
||
// delimited, Vojo claims are restricted to the FACTS, the general part is explicitly
|
||
// licensed (entity-scoped, NOT "answer only from KB"), and the abstain clause is present.
|
||
func TestProjectKBMessagesScoped(t *testing.T) {
|
||
out := projectKBMessages(msgs("что умеет vojo"), "VOJO FACT: chats and calls")
|
||
var note string
|
||
for _, m := range out {
|
||
if m.Role == "system" && strings.Contains(m.Content, "FACTS") {
|
||
note = m.Content
|
||
}
|
||
}
|
||
if note == "" {
|
||
t.Fatal("project KB note missing")
|
||
}
|
||
if !strings.Contains(note, "VOJO FACT: chats and calls") {
|
||
t.Fatalf("KB not injected: %q", note)
|
||
}
|
||
if !strings.Contains(note, "<FACTS>") {
|
||
t.Fatalf("note must delimit the KB with <FACTS> tags (tagged-context grounding): %q", note)
|
||
}
|
||
// The load-bearing hard-scoping clause: Vojo claims restricted to the FACTS. Without this
|
||
// assertion the clause could be silently softened (mutation-proven) and the route would
|
||
// stop grounding — re-opening the hallucination hole.
|
||
if !strings.Contains(note, "use ONLY the FACTS") {
|
||
t.Fatalf("note must restrict Vojo claims to the FACTS (entity-scoping): %q", note)
|
||
}
|
||
// Lifts the base prompt's "no file/document access" honesty rule for this turn (like the
|
||
// web note lifts "no internet access") — else a fast Grok can hedge "I can't access Vojo
|
||
// docs" despite the injected FACTS. The doc comment claims this lift; assert the wire does it.
|
||
if !strings.Contains(note, "do NOT say you lack access") {
|
||
t.Fatalf("note must lift the no-file-access rule so Grok treats the FACTS as available: %q", note)
|
||
}
|
||
if !strings.Contains(note, "general") {
|
||
t.Fatalf("note must license the general (non-Vojo) part — entity-scoped: %q", note)
|
||
}
|
||
if !strings.Contains(note, "don't have that information") {
|
||
t.Fatalf("note must carry the explicit abstain clause: %q", note)
|
||
}
|
||
}
|
||
|
||
// TestReserveEstimateProjectNoBump: enabling PROJECT_KB_ENABLED must NOT raise the
|
||
// reservation — the project route is one Grok call on a prompt already capped at
|
||
// maxPromptTokens, ≤ the grok_direct base already counted.
|
||
func TestReserveEstimateProjectNoBump(t *testing.T) {
|
||
base := cascadeCfg()
|
||
base.RouterEnabled, base.RouterClassifierEnabled = true, true
|
||
bBase := &Bot{cfg: &base, log: discardLog()}
|
||
|
||
proj := base
|
||
proj.ProjectKBEnabled = true
|
||
proj.ProjectKB = "facts"
|
||
bProj := &Bot{cfg: &proj, log: discardLog()}
|
||
|
||
if !approxEq(bBase.reserveEstimate(), bProj.reserveEstimate()) {
|
||
t.Fatalf("PROJECT_KB_ENABLED changed reserveEstimate: %v vs %v", bBase.reserveEstimate(), bProj.reserveEstimate())
|
||
}
|
||
}
|
||
|
||
func hedgeContains(ms []Message, sub string) bool {
|
||
for _, m := range ms {
|
||
if strings.Contains(m.Content, sub) {
|
||
return true
|
||
}
|
||
}
|
||
return false
|
||
}
|
||
|
||
// approxEq reports whether a and b differ by strictly less than 1e-9 in absolute
// terms. The bound is exclusive. A NaN difference (a NaN operand, or two
// same-signed infinities) compares false.
func approxEq(a, b float64) bool {
	const tol = 1e-9
	d := a - b
	return d < tol && d > -tol
}
|