263 lines
12 KiB
Go
263 lines
12 KiB
Go
package routedecide
|
||
|
||
import "testing"
|
||
|
||
// TestClassifyLayer0 is the free-heuristic golden set: freshness → web (WebForce),
|
||
// short greetings/acks/bare-arithmetic → trivial candidate, everything else →
|
||
// grok_direct, with substantive messages never trivial.
|
||
func TestClassifyLayer0(t *testing.T) {
|
||
cases := []struct {
|
||
body string
|
||
wantRoute string
|
||
wantWebForce bool
|
||
wantTrivial bool
|
||
}{
|
||
{"привет", RouteTrivial, false, true},
|
||
{"спасибо", RouteTrivial, false, true},
|
||
{"2+2", RouteTrivial, false, true},
|
||
{"12 / 4 - 1", RouteTrivial, false, true},
|
||
{"hello", RouteTrivial, false, true},
|
||
{"какие новости сегодня?", RouteWeb, true, false},
|
||
{"курс доллара сегодня", RouteWeb, true, false},
|
||
{"what's the weather today", RouteWeb, true, false},
|
||
{"посоветуй фильм на вечер", RouteGrokDirect, false, false},
|
||
{"explain how TCP works", RouteGrokDirect, false, false},
|
||
{"спасибо, а теперь подробно объясни квантовую запутанность", RouteGrokDirect, false, false},
|
||
{"", RouteGrokDirect, false, false},
|
||
}
|
||
for _, c := range cases {
|
||
l0 := ClassifyLayer0(c.body)
|
||
if l0.Route != c.wantRoute || l0.WebForce != c.wantWebForce || l0.Trivial != c.wantTrivial {
|
||
t.Errorf("ClassifyLayer0(%q) = {route:%q webForce:%v trivial:%v}, want {%q %v %v}",
|
||
c.body, l0.Route, l0.WebForce, l0.Trivial, c.wantRoute, c.wantWebForce, c.wantTrivial)
|
||
}
|
||
}
|
||
}
|
||
|
||
// TestFreshnessWordBoundaries guards the §7-#7 \b tightening: English freshness tokens
|
||
// fire on whole words only — never inside scoreboard / concurrent / weathering — while
|
||
// genuine freshness phrases still force web, and Russian stems stay stem-matched.
|
||
func TestFreshnessWordBoundaries(t *testing.T) {
|
||
shouldForceWeb := []string{
|
||
"what's the weather today",
|
||
"latest news on AI",
|
||
"current bitcoin price",
|
||
"какие новости сегодня", // RU stems unchanged
|
||
"курс доллара сегодня",
|
||
}
|
||
for _, s := range shouldForceWeb {
|
||
if !ClassifyLayer0(s).WebForce {
|
||
t.Errorf("expected WebForce on freshness phrase: %q", s)
|
||
}
|
||
}
|
||
shouldNotForceWeb := []string{
|
||
"the scoreboard shows 3:1", // score inside scoreboard
|
||
"concurrent programming in Go", // current inside concurrent
|
||
"weathering the storm, metaphorically", // weather inside weathering
|
||
"subscribe to my newsletter please", // news inside newsletter
|
||
}
|
||
for _, s := range shouldNotForceWeb {
|
||
if ClassifyLayer0(s).WebForce {
|
||
t.Errorf("freshness false-positive (substring match) on: %q", s)
|
||
}
|
||
}
|
||
}
|
||
|
||
// TestLookupHintFalsePositiveCorpus is the §5 guarantee: the soft lookup-intent regex
|
||
// must NOT fire on greetings/vocatives/idioms/non-lookup interrogatives — it is anchored
|
||
// on interrogative + lookup-verb, never on a capitalised word or a guillemet. A false
|
||
// LookupHint can only ever bias the classifier (and only when WEB_PARANOID + verifiable),
|
||
// but we still hold the regex itself to near-zero false positives.
|
||
func TestLookupHintFalsePositiveCorpus(t *testing.T) {
|
||
falsePositives := []string{
|
||
"Привет, Москва!", // vocative, no interrogative
|
||
"«Война и мир» — топ", // guillemets are not a trigger
|
||
"ну ты прям Эйнштейн", // proper noun, no «кто такой»
|
||
"кто это сделал?", // «кто» not followed by a lookup-verb
|
||
"когда ты придёшь?", // «когда» needs a release/birth verb
|
||
"спасибо большое", // ack
|
||
"расскажи что-нибудь", // imperative, no lookup interrogative
|
||
"I love this movie", // English, no interrogative
|
||
"who cares", // «who» not followed by is/was/starred/…
|
||
}
|
||
for _, s := range falsePositives {
|
||
if l0 := ClassifyLayer0(s); l0.LookupHint {
|
||
t.Errorf("lookupHint fired on a false-positive trap: %q", s)
|
||
}
|
||
}
|
||
// And it MUST fire on genuine lookup intent (otherwise it's useless).
|
||
truePositives := []string{
|
||
"кто снимался в фильме дом у дороги",
|
||
"кто написал войну и мир",
|
||
"в каком году вышел фильм матрица",
|
||
"who directed Inception",
|
||
"in what year was the Matrix released",
|
||
"how many seasons of breaking bad",
|
||
}
|
||
for _, s := range truePositives {
|
||
if l0 := ClassifyLayer0(s); !l0.LookupHint {
|
||
t.Errorf("lookupHint should fire on genuine lookup intent: %q", s)
|
||
}
|
||
}
|
||
}
|
||
|
||
// TestCombineFreshnessAlwaysWeb: a freshnessRe hit (WebForce) routes to web regardless of
|
||
// WEB_PARANOID and regardless of the classifier verdict — the deterministic signal that
|
||
// survives the classifier being down (§4.4).
|
||
func TestCombineFreshnessAlwaysWeb(t *testing.T) {
|
||
l0 := Layer0{Route: RouteWeb, WebForce: true, Freshness: "recent"}
|
||
v := Verdict{NeedsWeb: false, Confidence: 0.1} // classifier disagrees
|
||
for _, paranoid := range []bool{true, false} {
|
||
if got := Combine(l0, v, paranoid).Route; got != RouteWeb {
|
||
t.Errorf("freshness with paranoid=%v = %q, want web", paranoid, got)
|
||
}
|
||
}
|
||
}
|
||
|
||
// TestCombineParanoidGating is the Design-X invariant (§15): with WEB_PARANOID OFF, only
|
||
// freshness routes to web — the classifier's needs_web/entity/time/lookup signals are
|
||
// recorded but do NOT change the route. With it ON, those arms activate.
|
||
func TestCombineParanoidGating(t *testing.T) {
|
||
l0 := Layer0{Route: RouteGrokDirect, LookupHint: true} // no freshness
|
||
arms := []Verdict{
|
||
{NeedsWeb: true, Verifiable: true, Confidence: 0.9}, // classifier_needs_web (needs verifiable)
|
||
{EntityObscure: true, Confidence: 0.4}, // entity_obscure
|
||
{TimeSensitive: true, Confidence: 0.4}, // time_sensitive
|
||
{Verifiable: true, Confidence: 0.4}, // lookup_hint && verifiable
|
||
}
|
||
for i, v := range arms {
|
||
if got := Combine(l0, v, false).Route; got != RouteGrokDirect {
|
||
t.Errorf("arm %d with paranoid OFF = %q, want grok_direct (web is freshness-only)", i, got)
|
||
}
|
||
if got := Combine(l0, v, true).Route; got != RouteWeb {
|
||
t.Errorf("arm %d with paranoid ON = %q, want web", i, got)
|
||
}
|
||
}
|
||
}
|
||
|
||
// TestCombineWebFloor: the needs_web arm only fires at/above WebNeedsWebFloor (paranoid).
|
||
func TestCombineWebFloor(t *testing.T) {
|
||
l0 := Layer0{Route: RouteGrokDirect}
|
||
below := Verdict{NeedsWeb: true, Verifiable: true, Confidence: WebNeedsWebFloor - 0.01}
|
||
atFloor := Verdict{NeedsWeb: true, Verifiable: true, Confidence: WebNeedsWebFloor}
|
||
if got := Combine(l0, below, true).Route; got != RouteGrokDirect {
|
||
t.Errorf("needs_web below floor = %q, want grok_direct", got)
|
||
}
|
||
if got := Combine(l0, atFloor, true).Route; got != RouteWeb {
|
||
t.Errorf("needs_web at floor = %q, want web", got)
|
||
}
|
||
}
|
||
|
||
// TestCombineNeedsWebRequiresVerifiable is the false-web fix (observed live): the needs_web
|
||
// arm fires ONLY when the classifier also flagged a checkable named-entity fact
|
||
// (verifiable). A high-confidence needs_web on a non-verifiable query — an opinion or
|
||
// explanation the small flash-lite over-eagerly marked needs_web=true ("посоветуй фильм",
|
||
// "объясни goroutines") — stays on grok_direct. Recency (time_sensitive/freshness) and
|
||
// obscurity (entity_obscure) keep their own arms, so no genuine grounding is lost.
|
||
func TestCombineNeedsWebRequiresVerifiable(t *testing.T) {
|
||
l0 := Layer0{Route: RouteGrokDirect}
|
||
if got := Combine(l0, Verdict{NeedsWeb: true, Verifiable: false, Confidence: 1.0}, true).Route; got != RouteGrokDirect {
|
||
t.Errorf("needs_web && !verifiable = %q, want grok_direct (false-web fix)", got)
|
||
}
|
||
if got := Combine(l0, Verdict{NeedsWeb: true, Verifiable: true, Confidence: 0.6}, true).Route; got != RouteWeb {
|
||
t.Errorf("needs_web && verifiable = %q, want web", got)
|
||
}
|
||
// A non-verifiable needs_web that is ALSO entity_obscure still grounds (obscure arm).
|
||
if got := Combine(l0, Verdict{NeedsWeb: true, Verifiable: false, EntityObscure: true, Confidence: 0.1}, true).Route; got != RouteWeb {
|
||
t.Errorf("entity_obscure must still route web regardless of verifiable, got %q", got)
|
||
}
|
||
}
|
||
|
||
// TestCombineTrivialAgreementGate: trivial requires BOTH the Layer-0 candidate AND
|
||
// classifier.trivial AND confidence ≥ TrivialFloor. A lone signal stays on grok_direct.
|
||
func TestCombineTrivialAgreementGate(t *testing.T) {
|
||
trivialL0 := Layer0{Route: RouteTrivial, Trivial: true}
|
||
nonTrivialL0 := Layer0{Route: RouteGrokDirect}
|
||
|
||
if got := Combine(trivialL0, Verdict{Trivial: true, Confidence: 0.95}, true).Route; got != RouteTrivial {
|
||
t.Errorf("agreed high-confidence trivial = %q, want trivial", got)
|
||
}
|
||
if got := Combine(trivialL0, Verdict{Trivial: true, Confidence: 0.5}, true).Route; got != RouteGrokDirect {
|
||
t.Errorf("low-confidence trivial = %q, want grok_direct (no voice leak)", got)
|
||
}
|
||
if got := Combine(trivialL0, Verdict{Trivial: false, Confidence: 0.95}, true).Route; got != RouteGrokDirect {
|
||
t.Errorf("classifier disagrees on trivial = %q, want grok_direct", got)
|
||
}
|
||
// Never trust classifier.trivial alone: without the Layer-0 candidate it stays grok.
|
||
if got := Combine(nonTrivialL0, Verdict{Trivial: true, Confidence: 0.99}, true).Route; got == RouteTrivial {
|
||
t.Errorf("classifier.trivial alone routed to trivial; must require the Layer-0 candidate")
|
||
}
|
||
}
|
||
|
||
// TestCombineRoadHouse is the regression: the hallucinated-cast bug. With WEB_PARANOID on
|
||
// and the classifier flagging the (obscure, verifiable) entity, both the first turn and
|
||
// the resolved follow-up route to web; with paranoid off they fall to grok_direct (the
|
||
// canary-neutral baseline).
|
||
func TestCombineRoadHouse(t *testing.T) {
|
||
first := ClassifyLayer0("кто снимался в фильме дом у дороги")
|
||
followup := ClassifyLayer0("2024 года") // bare; the classifier resolves via context
|
||
v := Verdict{NeedsWeb: true, Verifiable: true, EntityObscure: true, Confidence: 0.7}
|
||
|
||
for _, l0 := range []Layer0{first, followup} {
|
||
if got := Combine(l0, v, true).Route; got != RouteWeb {
|
||
t.Errorf("road house with paranoid ON = %q, want web (the hallucination fix)", got)
|
||
}
|
||
if got := Combine(l0, v, false).Route; got != RouteGrokDirect {
|
||
t.Errorf("road house with paranoid OFF = %q, want grok_direct (baseline)", got)
|
||
}
|
||
}
|
||
}
|
||
|
||
// TestWebDecidedByAttribution: the switch order attributes the right arm (for tuning 0.55).
|
||
func TestWebDecidedByAttribution(t *testing.T) {
|
||
cases := []struct {
|
||
l0 Layer0
|
||
v Verdict
|
||
want string
|
||
}{
|
||
{Layer0{WebForce: true}, Verdict{}, WebByFreshness},
|
||
{Layer0{}, Verdict{NeedsWeb: true, Verifiable: true, Confidence: 0.9}, WebByNeedsWeb},
|
||
{Layer0{}, Verdict{EntityObscure: true, Confidence: 0.1}, WebByObscure},
|
||
{Layer0{}, Verdict{TimeSensitive: true, Confidence: 0.1}, WebByTime},
|
||
{Layer0{LookupHint: true}, Verdict{Verifiable: true, Confidence: 0.1}, WebByLookupHint},
|
||
{Layer0{Route: RouteGrokDirect}, Verdict{Confidence: 0.1}, WebByNone},
|
||
}
|
||
for _, c := range cases {
|
||
if got := Combine(c.l0, c.v, true).WebDecidedBy; got != c.want {
|
||
t.Errorf("web_decided_by(%+v,%+v) = %q, want %q", c.l0, c.v, got, c.want)
|
||
}
|
||
}
|
||
}
|
||
|
||
// TestProjectGateOnAboutProject: the project route trusts the classifier — it fires when
|
||
// AboutProject is set and not otherwise. There is no Layer-0 hint requirement (live traffic
|
||
// showed it blocked correct context-resolved follow-ups). Independent of WEB_PARANOID.
|
||
func TestProjectGateOnAboutProject(t *testing.T) {
|
||
l0 := Layer0{Route: RouteGrokDirect}
|
||
for _, paranoid := range []bool{true, false} {
|
||
if got := Combine(l0, Verdict{AboutProject: true}, paranoid).Route; got != RouteProject {
|
||
t.Errorf("AboutProject=true (paranoid=%v) = %q, want project_then_grok", paranoid, got)
|
||
}
|
||
if got := Combine(l0, Verdict{AboutProject: false}, paranoid).Route; got == RouteProject {
|
||
t.Errorf("AboutProject=false (paranoid=%v) routed to project; must not", paranoid)
|
||
}
|
||
}
|
||
}
|
||
|
||
// TestProjectBeatsWebArms: the project arm is case #0 — it out-prioritizes even the hard
|
||
// freshness (WebForce) arm and the classifier web arms, because the curated KB, not the
|
||
// web, is the authoritative source for product facts ("какие новости у Vojo" trips
|
||
// freshness yet is a product question).
|
||
func TestProjectBeatsWebArms(t *testing.T) {
|
||
l0 := Layer0{Route: RouteWeb, WebForce: true} // freshness hit
|
||
v := Verdict{AboutProject: true, NeedsWeb: true, Verifiable: true, TimeSensitive: true, Confidence: 0.9}
|
||
for _, paranoid := range []bool{true, false} {
|
||
got := Combine(l0, v, paranoid)
|
||
if got.Route != RouteProject {
|
||
t.Errorf("project must beat web arms (paranoid=%v) = %q, want project_then_grok", paranoid, got.Route)
|
||
}
|
||
if got.WebDecidedBy != WebByNone {
|
||
t.Errorf("project route web_decided_by = %q, want none", got.WebDecidedBy)
|
||
}
|
||
}
|
||
}
|