// Command routereval is the OFFLINE router-replay harness for the §11 P1 gate. It reads
// a golden set of (message, recorded classifier verdict, expected route, factual flag),
// replays each item through the REAL decision functions (routedecide.ClassifyLayer0 +
// CombineWithFloors — the same code package main uses, never a copy), and reports the
// confusion matrix + the four P1 metrics: false-grok-on-factual (the lie metric),
// false-web, trivial-leak, misroute. It is fully deterministic and needs no network: it
// measures the ROUTING LAYER given a verdict, so you can sweep WEB_PARANOID and the
// floors instantly. (Classifier accuracy itself is a separate LIVE check — §11 P2.)
//
// The lie label on the web path uses the citation-presence proxy by convention: a golden
// item's `factual:true` + `expected_route:web_then_grok` marks "this MUST ground"; an
// LLM-judge over query+answer is the higher-fidelity option to wire later (§14.6/§15).
//
// Usage:
//
//	go run ./cmd/routereval -golden cmd/routereval/golden_sample.json
//	go run ./cmd/routereval -golden set.json -web-floor 0.7   # sweep the needs_web floor
//
// NOTE: golden_sample.json is labelled for the PRODUCTION config (paranoid ON) — its
// expected_route values assume the epistemic web arms are active. Running -paranoid=false
// against it is a what-if sweep that WILL report NO-GO (the entity facts fall to grok by
// design); it is NOT a passing baseline. To evaluate the paranoid-off behaviour, label a
// separate set whose expected_route reflects freshness-only web routing.
package main
|
|
|
|
import (
|
|
"encoding/json"
|
|
"flag"
|
|
"fmt"
|
|
"os"
|
|
|
|
rd "vojo.chat/ai-bot/internal/routedecide"
|
|
)
|
|
|
|
// goldenItem is one labelled row. Message drives the free Layer-0; Verdict is the
|
|
// recorded classifier output; ExpectedRoute + Factual are the ground-truth labels.
|
|
type goldenItem struct {
|
|
Name string `json:"name"`
|
|
Message string `json:"message"`
|
|
Verdict rd.Verdict `json:"verdict"`
|
|
ExpectedRoute string `json:"expected_route"`
|
|
Factual bool `json:"factual"` // a checkable-fact query that MUST ground
|
|
}
|
|
|
|
func main() {
|
|
goldenPath := flag.String("golden", "cmd/routereval/golden_sample.json", "path to the golden-set JSON")
|
|
paranoid := flag.Bool("paranoid", true, "apply the WEB_PARANOID classifier-driven web arms")
|
|
webFloor := flag.Float64("web-floor", rd.WebNeedsWebFloor, "needs_web confidence floor to sweep")
|
|
trivialFloor := flag.Float64("trivial-floor", rd.TrivialFloor, "trivial confidence floor")
|
|
verbose := flag.Bool("v", false, "print every item, not just the mismatches")
|
|
flag.Parse()
|
|
|
|
raw, err := os.ReadFile(*goldenPath)
|
|
if err != nil {
|
|
fmt.Fprintf(os.Stderr, "read golden set: %v\n", err)
|
|
os.Exit(2)
|
|
}
|
|
var items []goldenItem
|
|
if err := json.Unmarshal(raw, &items); err != nil {
|
|
fmt.Fprintf(os.Stderr, "parse golden set: %v\n", err)
|
|
os.Exit(2)
|
|
}
|
|
if len(items) == 0 {
|
|
fmt.Fprintln(os.Stderr, "golden set is empty")
|
|
os.Exit(2)
|
|
}
|
|
|
|
floors := rd.Floors{WebNeedsWeb: *webFloor, Trivial: *trivialFloor}
|
|
fmt.Printf("routereval: %d items | paranoid=%v web-floor=%.2f trivial-floor=%.2f\n\n",
|
|
len(items), *paranoid, *webFloor, *trivialFloor)
|
|
|
|
var (
|
|
correct int
|
|
factualWeb, factualWebMissed int // denominator/numerator of false-grok-on-factual
|
|
nonWebExpected, falseWeb int
|
|
nonTrivialExpected, trivialLeak int
|
|
)
|
|
roadHouseSeen := false
|
|
roadHousePass := true
|
|
for _, it := range items {
|
|
l0 := rd.ClassifyLayer0(it.Message)
|
|
got := rd.CombineWithFloors(l0, it.Verdict, *paranoid, floors).Route
|
|
ok := got == it.ExpectedRoute
|
|
if ok {
|
|
correct++
|
|
}
|
|
if it.Factual && it.ExpectedRoute == rd.RouteWeb {
|
|
factualWeb++
|
|
if got == rd.RouteGrokDirect {
|
|
factualWebMissed++ // a confident-lie risk: a checkable fact answered from memory
|
|
}
|
|
}
|
|
if it.ExpectedRoute != rd.RouteWeb {
|
|
nonWebExpected++
|
|
if got == rd.RouteWeb {
|
|
falseWeb++
|
|
}
|
|
}
|
|
if it.ExpectedRoute != rd.RouteTrivial {
|
|
nonTrivialExpected++
|
|
if got == rd.RouteTrivial {
|
|
trivialLeak++
|
|
}
|
|
}
|
|
// The Road House regression pair must pass (its name carries "road house").
|
|
if contains(it.Name, "road house") {
|
|
roadHouseSeen = true
|
|
if !ok {
|
|
roadHousePass = false
|
|
}
|
|
}
|
|
if *verbose || !ok {
|
|
flag := "ok "
|
|
if !ok {
|
|
flag = "MISS"
|
|
}
|
|
fmt.Printf(" [%s] %-40s want=%-16s got=%-16s\n", flag, trunc(it.Name, 40), it.ExpectedRoute, got)
|
|
}
|
|
}
|
|
|
|
rate := func(num, den int) float64 {
|
|
if den == 0 {
|
|
return 0
|
|
}
|
|
return float64(num) / float64(den)
|
|
}
|
|
misroute := 1 - rate(correct, len(items))
|
|
lie := rate(factualWebMissed, factualWeb)
|
|
fw := rate(falseWeb, nonWebExpected)
|
|
leak := rate(trivialLeak, nonTrivialExpected)
|
|
|
|
fmt.Printf("\n— metrics (§11 P1 gates) —\n")
|
|
fmt.Printf(" false-grok-on-FACTUAL : %5.1f%% (%d/%d) gate < 5%% %s\n", lie*100, factualWebMissed, factualWeb, pass(lie < 0.05))
|
|
fmt.Printf(" false-web : %5.1f%% (%d/%d) gate ≤ 15%% %s\n", fw*100, falseWeb, nonWebExpected, pass(fw <= 0.15))
|
|
fmt.Printf(" trivial-leak : %5.1f%% (%d/%d) gate ~ 0%% %s\n", leak*100, trivialLeak, nonTrivialExpected, pass(leak == 0))
|
|
fmt.Printf(" misroute : %5.1f%% (%d/%d) gate < 3%% %s\n", misroute*100, len(items)-correct, len(items), pass(misroute < 0.03))
|
|
if roadHouseSeen {
|
|
fmt.Printf(" road-house pair : %s\n", pass(roadHousePass))
|
|
}
|
|
|
|
// Exit non-zero if any gate fails, so the harness is CI/owner-runnable as a go/no-go.
|
|
if lie >= 0.05 || fw > 0.15 || leak > 0 || misroute >= 0.03 || (roadHouseSeen && !roadHousePass) {
|
|
fmt.Println("\nRESULT: NO-GO (a P1 gate failed)")
|
|
os.Exit(1)
|
|
}
|
|
fmt.Println("\nRESULT: GO")
|
|
}
|
|
|
|
// pass renders a gate outcome as the label shown in the metrics report:
// "PASS" when ok is true, "FAIL" otherwise.
func pass(ok bool) string {
	label := "FAIL"
	if ok {
		label = "PASS"
	}
	return label
}
|
|
|
|
func contains(s, sub string) bool {
|
|
return len(sub) == 0 || indexFold(s, sub) >= 0
|
|
}
|
|
|
|
// indexFold is a tiny case-insensitive substring search (avoids importing strings just
|
|
// for ToLower+Index in this small tool).
|
|
func indexFold(s, sub string) int {
|
|
ls, lsub := toLower(s), toLower(sub)
|
|
for i := 0; i+len(lsub) <= len(ls); i++ {
|
|
if ls[i:i+len(lsub)] == lsub {
|
|
return i
|
|
}
|
|
}
|
|
return -1
|
|
}
|
|
|
|
// toLower returns a copy of s with the ASCII letters 'A'..'Z' replaced by their
// lowercase forms. It works byte by byte: every other byte, including the bytes of
// multi-byte UTF-8 sequences, is copied through unchanged.
func toLower(s string) string {
	out := make([]byte, len(s))
	for i := 0; i < len(s); i++ {
		ch := s[i]
		if ch >= 'A' && ch <= 'Z' {
			ch += 'a' - 'A'
		}
		out[i] = ch
	}
	return string(out)
}
|
|
|
|
// trunc shortens s to at most n runes for fixed-width report columns. A string that
// already fits is returned unchanged; a longer one is cut to its first n-1 runes
// followed by "…". Counting is done in runes so multi-byte characters are never split.
//
// A non-positive n yields the empty string: without that guard the n-1 slice bound
// would be negative and the slice expression would panic.
func trunc(s string, n int) string {
	if n <= 0 {
		return ""
	}
	r := []rune(s)
	if len(r) <= n {
		return s
	}
	return string(r[:n-1]) + "…"
}
|