vojo/apps/ai-bot/internal/routedecide/routedecide_test.go

263 lines
12 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package routedecide
import "testing"
// TestClassifyLayer0 pins the golden set for the free Layer-0 heuristic. Freshness
// phrasing must come back as RouteWeb with WebForce set; short greetings, acks and
// bare arithmetic must come back as trivial candidates; every other input — the
// empty body and ack-prefixed substantive messages included — must come back as
// RouteGrokDirect and never be marked trivial.
func TestClassifyLayer0(t *testing.T) {
	type golden struct {
		body     string
		route    string
		webForce bool
		trivial  bool
	}
	table := []golden{
		// Greetings, acknowledgements and bare arithmetic: trivial candidates.
		{"привет", RouteTrivial, false, true},
		{"спасибо", RouteTrivial, false, true},
		{"2+2", RouteTrivial, false, true},
		{"12 / 4 - 1", RouteTrivial, false, true},
		{"hello", RouteTrivial, false, true},

		// Freshness phrasing: web, with WebForce set.
		{"какие новости сегодня?", RouteWeb, true, false},
		{"курс доллара сегодня", RouteWeb, true, false},
		{"what's the weather today", RouteWeb, true, false},

		// Substantive requests: RouteGrokDirect, not trivial.
		{"посоветуй фильм на вечер", RouteGrokDirect, false, false},
		{"explain how TCP works", RouteGrokDirect, false, false},

		// An ack that leads into a real question must not be treated as trivial.
		{"спасибо, а теперь подробно объясни квантовую запутанность", RouteGrokDirect, false, false},

		// Empty body.
		{"", RouteGrokDirect, false, false},
	}
	for i := range table {
		want := table[i]
		got := ClassifyLayer0(want.body)
		if got.Route == want.route && got.WebForce == want.webForce && got.Trivial == want.trivial {
			continue
		}
		t.Errorf("ClassifyLayer0(%q) = {route:%q webForce:%v trivial:%v}, want {%q %v %v}",
			want.body, got.Route, got.WebForce, got.Trivial, want.route, want.webForce, want.trivial)
	}
}
// TestFreshnessWordBoundaries covers the §7-#7 \b tightening. English freshness
// tokens must match as whole words only, so words that merely contain one
// (scoreboard, concurrent, weathering, newsletter) must not set WebForce, while real
// freshness phrases — and the stem-matched Russian ones — still must.
func TestFreshnessWordBoundaries(t *testing.T) {
	genuine := []string{
		"what's the weather today",
		"latest news on AI",
		"current bitcoin price",
		// Russian stems keep their stem matching.
		"какие новости сегодня",
		"курс доллара сегодня",
	}
	for _, phrase := range genuine {
		if ClassifyLayer0(phrase).WebForce {
			continue
		}
		t.Errorf("expected WebForce on freshness phrase: %q", phrase)
	}

	embedded := []string{
		// "score" inside "scoreboard".
		"the scoreboard shows 3:1",
		// "current" inside "concurrent".
		"concurrent programming in Go",
		// "weather" inside "weathering".
		"weathering the storm, metaphorically",
		// "news" inside "newsletter".
		"subscribe to my newsletter please",
	}
	for _, phrase := range embedded {
		if !ClassifyLayer0(phrase).WebForce {
			continue
		}
		t.Errorf("freshness false-positive (substring match) on: %q", phrase)
	}
}
// TestLookupHintFalsePositiveCorpus enforces the §5 guarantee on the soft
// lookup-intent signal: it keys on an interrogative followed by a lookup verb, and
// must stay silent on greetings, vocatives, idioms and non-lookup questions — a
// capitalised word or a guillemet is never enough. A stray LookupHint can only bias
// the classifier (and only under WEB_PARANOID with a verifiable verdict), yet the
// regex itself is still held to near-zero false positives. The second half checks
// that the hint does fire on real lookup questions, so it is not vacuous.
func TestLookupHintFalsePositiveCorpus(t *testing.T) {
	traps := []string{
		// Vocative with no interrogative.
		"Привет, Москва!",
		// Guillemets alone are not a trigger.
		"«Война и мир» — топ",
		// Proper noun with no "кто такой" phrasing.
		"ну ты прям Эйнштейн",
		// "кто" with no lookup verb after it.
		"кто это сделал?",
		// "когда" without a release/birth verb.
		"когда ты придёшь?",
		// Plain acknowledgement.
		"спасибо большое",
		// Imperative without a lookup interrogative.
		"расскажи что-нибудь",
		// English statement, no interrogative.
		"I love this movie",
		// "who" not followed by is/was/starred/….
		"who cares",
	}
	for _, phrase := range traps {
		if ClassifyLayer0(phrase).LookupHint {
			t.Errorf("lookupHint fired on a false-positive trap: %q", phrase)
		}
	}

	// Genuine lookup intent has to be detected, otherwise the hint is worthless.
	lookups := []string{
		"кто снимался в фильме дом у дороги",
		"кто написал войну и мир",
		"в каком году вышел фильм матрица",
		"who directed Inception",
		"in what year was the Matrix released",
		"how many seasons of breaking bad",
	}
	for _, phrase := range lookups {
		if ClassifyLayer0(phrase).LookupHint {
			continue
		}
		t.Errorf("lookupHint should fire on genuine lookup intent: %q", phrase)
	}
}
// TestCombineFreshnessAlwaysWeb: a Layer-0 freshness hit (WebForce) must route to
// web whatever WEB_PARANOID is set to and whatever the classifier says. It is the
// deterministic signal that still works when the classifier is down (§4.4).
func TestCombineFreshnessAlwaysWeb(t *testing.T) {
	fresh := Layer0{Route: RouteWeb, WebForce: true, Freshness: "recent"}
	// The classifier verdict deliberately disagrees with Layer 0.
	dissent := Verdict{NeedsWeb: false, Confidence: 0.1}

	for _, paranoid := range []bool{true, false} {
		route := Combine(fresh, dissent, paranoid).Route
		if route == RouteWeb {
			continue
		}
		t.Errorf("freshness with paranoid=%v = %q, want web", paranoid, route)
	}
}
// TestCombineParanoidGating checks the Design-X invariant (§15). With WEB_PARANOID
// off, freshness is the only path to web: the classifier's needs_web, entity, time
// and lookup signals are recorded but leave the route alone. With WEB_PARANOID on,
// each of those arms is expected to route to web.
func TestCombineParanoidGating(t *testing.T) {
	// Lookup hint present, but no freshness signal.
	hinted := Layer0{Route: RouteGrokDirect, LookupHint: true}
	verdicts := []Verdict{
		// classifier_needs_web arm (requires verifiable).
		{NeedsWeb: true, Verifiable: true, Confidence: 0.9},
		// entity_obscure arm.
		{EntityObscure: true, Confidence: 0.4},
		// time_sensitive arm.
		{TimeSensitive: true, Confidence: 0.4},
		// lookup_hint && verifiable arm.
		{Verifiable: true, Confidence: 0.4},
	}
	for idx := range verdicts {
		verdict := verdicts[idx]

		relaxed := Combine(hinted, verdict, false).Route
		if relaxed != RouteGrokDirect {
			t.Errorf("arm %d with paranoid OFF = %q, want grok_direct (web is freshness-only)", idx, relaxed)
		}

		strict := Combine(hinted, verdict, true).Route
		if strict != RouteWeb {
			t.Errorf("arm %d with paranoid ON = %q, want web", idx, strict)
		}
	}
}
// TestCombineWebFloor: under WEB_PARANOID the needs_web arm fires only when the
// classifier confidence is at or above WebNeedsWebFloor; just below it the route
// stays on grok_direct.
func TestCombineWebFloor(t *testing.T) {
	base := Layer0{Route: RouteGrokDirect}
	verdict := Verdict{NeedsWeb: true, Verifiable: true}

	// Just under the floor: the arm must stay quiet.
	verdict.Confidence = WebNeedsWebFloor - 0.01
	if route := Combine(base, verdict, true).Route; route != RouteGrokDirect {
		t.Errorf("needs_web below floor = %q, want grok_direct", route)
	}

	// Exactly on the floor: the arm must fire.
	verdict.Confidence = WebNeedsWebFloor
	if route := Combine(base, verdict, true).Route; route != RouteWeb {
		t.Errorf("needs_web at floor = %q, want web", route)
	}
}
// TestCombineNeedsWebRequiresVerifiable locks in the false-web fix observed live.
// The needs_web arm may fire only when the classifier also marked the query as a
// checkable named-entity fact (verifiable). A confident needs_web on something
// non-verifiable — an opinion or explanation that the small flash-lite model eagerly
// tagged needs_web=true ("посоветуй фильм", "объясни goroutines") — must stay on
// grok_direct. Recency (time_sensitive/freshness) and obscurity (entity_obscure)
// have their own arms, so genuine grounding is not lost.
func TestCombineNeedsWebRequiresVerifiable(t *testing.T) {
	base := Layer0{Route: RouteGrokDirect}

	unverifiable := Verdict{NeedsWeb: true, Verifiable: false, Confidence: 1.0}
	if route := Combine(base, unverifiable, true).Route; route != RouteGrokDirect {
		t.Errorf("needs_web && !verifiable = %q, want grok_direct (false-web fix)", route)
	}

	verifiable := Verdict{NeedsWeb: true, Verifiable: true, Confidence: 0.6}
	if route := Combine(base, verifiable, true).Route; route != RouteWeb {
		t.Errorf("needs_web && verifiable = %q, want web", route)
	}

	// Non-verifiable needs_web that is also entity_obscure: the obscure arm still
	// grounds it.
	obscure := Verdict{NeedsWeb: true, Verifiable: false, EntityObscure: true, Confidence: 0.1}
	if route := Combine(base, obscure, true).Route; route != RouteWeb {
		t.Errorf("entity_obscure must still route web regardless of verifiable, got %q", route)
	}
}
// TestCombineTrivialAgreementGate: the trivial route needs three things at once —
// the Layer-0 trivial candidate, classifier.trivial, and confidence ≥ TrivialFloor.
// The cases below drop one of them at a time; none of those may route to trivial.
func TestCombineTrivialAgreementGate(t *testing.T) {
	candidate := Layer0{Route: RouteTrivial, Trivial: true}
	plain := Layer0{Route: RouteGrokDirect}

	// All three conditions hold.
	agreed := Combine(candidate, Verdict{Trivial: true, Confidence: 0.95}, true).Route
	if agreed != RouteTrivial {
		t.Errorf("agreed high-confidence trivial = %q, want trivial", agreed)
	}

	// Classifier agrees but is not confident enough.
	unsure := Combine(candidate, Verdict{Trivial: true, Confidence: 0.5}, true).Route
	if unsure != RouteGrokDirect {
		t.Errorf("low-confidence trivial = %q, want grok_direct (no voice leak)", unsure)
	}

	// Classifier is confident but says not trivial.
	vetoed := Combine(candidate, Verdict{Trivial: false, Confidence: 0.95}, true).Route
	if vetoed != RouteGrokDirect {
		t.Errorf("classifier disagrees on trivial = %q, want grok_direct", vetoed)
	}

	// classifier.trivial on its own is never trusted: with no Layer-0 candidate the
	// route must not become trivial.
	alone := Combine(plain, Verdict{Trivial: true, Confidence: 0.99}, true).Route
	if alone == RouteTrivial {
		t.Errorf("classifier.trivial alone routed to trivial; must require the Layer-0 candidate")
	}
}
// TestCombineRoadHouse is the regression: the hallucinated-cast bug. With WEB_PARANOID on
// and the classifier flagging the (obscure, verifiable) entity, both the first turn and
// the resolved follow-up route to web; with paranoid off they fall to grok_direct (the
// canary-neutral baseline).
func TestCombineRoadHouse(t *testing.T) {
first := ClassifyLayer0("кто снимался в фильме дом у дороги")
followup := ClassifyLayer0("2024 года") // bare; the classifier resolves via context
v := Verdict{NeedsWeb: true, Verifiable: true, EntityObscure: true, Confidence: 0.7}
for _, l0 := range []Layer0{first, followup} {
if got := Combine(l0, v, true).Route; got != RouteWeb {
t.Errorf("road house with paranoid ON = %q, want web (the hallucination fix)", got)
}
if got := Combine(l0, v, false).Route; got != RouteGrokDirect {
t.Errorf("road house with paranoid OFF = %q, want grok_direct (baseline)", got)
}
}
}
// TestWebDecidedByAttribution: with WEB_PARANOID on, WebDecidedBy must name the arm
// that made the decision, in switch order (used for tuning 0.55).
func TestWebDecidedByAttribution(t *testing.T) {
	type attribution struct {
		layer   Layer0
		verdict Verdict
		arm     string
	}
	table := []attribution{
		{layer: Layer0{WebForce: true}, verdict: Verdict{}, arm: WebByFreshness},
		{layer: Layer0{}, verdict: Verdict{NeedsWeb: true, Verifiable: true, Confidence: 0.9}, arm: WebByNeedsWeb},
		{layer: Layer0{}, verdict: Verdict{EntityObscure: true, Confidence: 0.1}, arm: WebByObscure},
		{layer: Layer0{}, verdict: Verdict{TimeSensitive: true, Confidence: 0.1}, arm: WebByTime},
		{layer: Layer0{LookupHint: true}, verdict: Verdict{Verifiable: true, Confidence: 0.1}, arm: WebByLookupHint},
		{layer: Layer0{Route: RouteGrokDirect}, verdict: Verdict{Confidence: 0.1}, arm: WebByNone},
	}
	for i := range table {
		row := table[i]
		decidedBy := Combine(row.layer, row.verdict, true).WebDecidedBy
		if decidedBy == row.arm {
			continue
		}
		t.Errorf("web_decided_by(%+v,%+v) = %q, want %q", row.layer, row.verdict, decidedBy, row.arm)
	}
}
// TestProjectGateOnAboutProject: the project route follows the classifier alone. It
// is taken when AboutProject is set and not taken otherwise, with no Layer-0 hint
// required (live traffic showed such a requirement blocked correct context-resolved
// follow-ups). The outcome must not depend on WEB_PARANOID.
func TestProjectGateOnAboutProject(t *testing.T) {
	base := Layer0{Route: RouteGrokDirect}

	for _, paranoid := range []bool{true, false} {
		flagged := Combine(base, Verdict{AboutProject: true}, paranoid).Route
		if flagged != RouteProject {
			t.Errorf("AboutProject=true (paranoid=%v) = %q, want project_then_grok", paranoid, flagged)
		}

		unflagged := Combine(base, Verdict{AboutProject: false}, paranoid).Route
		if unflagged == RouteProject {
			t.Errorf("AboutProject=false (paranoid=%v) routed to project; must not", paranoid)
		}
	}
}
// TestProjectBeatsWebArms: the project arm is case #0, ahead of the hard freshness
// (WebForce) arm and every classifier web arm. Product facts come from the curated
// KB rather than the web — "какие новости у Vojo" trips freshness but is still a
// product question. The decision must also report no web attribution.
func TestProjectBeatsWebArms(t *testing.T) {
	// Layer 0 reports a freshness hit.
	fresh := Layer0{Route: RouteWeb, WebForce: true}
	verdict := Verdict{AboutProject: true, NeedsWeb: true, Verifiable: true, TimeSensitive: true, Confidence: 0.9}

	for _, paranoid := range []bool{true, false} {
		decision := Combine(fresh, verdict, paranoid)
		if decision.Route != RouteProject {
			t.Errorf("project must beat web arms (paranoid=%v) = %q, want project_then_grok", paranoid, decision.Route)
		}
		if decision.WebDecidedBy != WebByNone {
			t.Errorf("project route web_decided_by = %q, want none", decision.WebDecidedBy)
		}
	}
}