// vojo/apps/ai-bot/cascade.go

package main

import (
	"context"
	"errors"
	"fmt"
	"strings"
	"time"
)

// cascade.go is the generation half of the bot: given an admitted request, it routes
// (router.go), runs the chosen route's provider(s), and ALWAYS degrades to grok_direct
// on any layer being off or failing (§8.2). It returns a genResult the business logic
// (respond) settles, sends, and logs — keeping ledger/never-silent/telemetry in one
// place and the routing here. With every cascade flag off, classify returns grok_direct
// and this collapses to exactly today's single Grok call.

// genResult is everything respond needs from a generation: the answer, the model's
// usage (for token billing), the FULL cost breakdown (router + web + final), and the
// routing metadata for telemetry. cost accumulates across stages, so a partial cascade
// (a paid web fetch that then degraded) still books what it actually spent.
//
// How the fields are filled in this file:
//   - text, usage, providerID: copied from the provider response by whichever gen*
//     stage produced the final answer (the web route reports the fetch usage and the
//     synthesis usage summed together).
//   - cost: incremented in place by each stage (Token, Grounding, GroundingFee, WebTool).
//   - finalModel: initialised by generate to the configured Grok model and overwritten
//     by the stage that actually answers.
//   - decision / route: decision is the classifier's verdict; route starts as
//     decision.Route and is overwritten by the stage that succeeds.
//   - fallback / degraded: set through degradeTo; degraded keeps the first reason written.
//   - stageMS: elapsed milliseconds per stage, keyed "router", "web" and "final".
type genResult struct {
	text       string
	usage      Usage
	cost       CostBreakdown
	finalModel string
	providerID string
	decision   RouterDecision
	route      string // the route actually taken (may differ from decision on degrade)
	fallback   bool   // true if we degraded off the decided route
	degraded   string // degrade reason for request_log
	stageMS    map[string]int

	// Web-route outcome (for request_log §8): the resolved query actually sent to Fetch,
	// whether the context-resolved rewrite was used (vs the bare body), and whether the
	// fetch came back grounded with citations (a zero-citation synth is a silent false-web).
	searchQuery   string
	rewriteUsed   bool
	webGrounded   bool
	citationCount int
}

// msSince reports how many whole milliseconds have elapsed since t, as an int. A t in
// the future yields a negative value.
func msSince(t time.Time) int {
	elapsed := time.Since(t)
	return int(elapsed.Milliseconds())
}

// reserveEstimate computes the admission envelope: an estimate sized for the most
// expensive route that is currently ENABLED, so the reservation covers whichever route
// the router ends up choosing after admission. When every cascade flag is off the
// result is just the single-Grok-call estimate. Erring on the generous side is
// acceptable because Settle books the authoritative actual afterward.
//
// The estimate is assembled in three steps:
//  1. one call to the main Grok model, plus the web leg when web is enabled;
//  2. raised to twice the reasoning model's estimate if that is larger;
//  3. plus one Gemini call for the Layer-1 classifier when it is enabled.
func (b *Bot) reserveEstimate() float64 {
	// Base: a single call to the main Grok model (grok_direct, or the synthesis leg).
	total := b.estimateUSD(b.cfg.XAIModel)

	if b.cfg.WebEnabled {
		// web_then_grok = a web fetch on top of the Grok synthesis counted above.
		var webExtra float64
		switch b.cfg.WebProvider {
		case webProviderGrokWebSearch:
			// The fetch may search several times and pull a large context, so reserve
			// the maximum per-call search fees plus another full Grok estimate.
			webExtra = float64(maxWebSearchCalls)*grokWebSearchPerCall + b.estimateUSD(b.cfg.XAIModel)
		default:
			// Gemini grounding: the fetch's tokens plus the per-grounded-prompt fee
			// (§7 SG2), keeping the envelope an upper bound once the fee is booked.
			webExtra = b.estimateUSD(b.cfg.GeminiModel) + b.cfg.GeminiGroundingPerPrompt
		}
		total += webExtra
	}

	if b.cfg.ReasoningEnabled {
		// Higher reasoning effort can burn more output tokens, so reserve double.
		doubled := 2 * b.estimateUSD(b.cfg.ReasoningModel)
		total = max(total, doubled)
	}

	// The always-on Layer-1 classifier leg (§7 Finding 4) is added AFTER the max above
	// so the reasoning branch can never swallow it.
	if b.cfg.RouterClassifierEnabled {
		total += b.estimateUSD(b.cfg.GeminiModel)
	}
	return total
}

// generate classifies the request, runs the route the router picked, and falls back to
// grok_direct whenever that route is switched off, has no client, or fails. A non-nil
// error comes back ONLY when the grok_direct call itself fails; res is returned next to
// it, so whatever was already recorded (decision, timings, cost) stays visible.
//
// body is the triggering message text, msgs the prompt window, convID the conversation
// id forwarded to the Grok calls, and isDM gates the DM-only behaviour (classifier
// context, search-query rewrite).
func (b *Bot) generate(ctx context.Context, body string, msgs []Message, convID string, isDM bool) (genResult, error) {
	res := genResult{
		stageMS:    map[string]int{},
		finalModel: b.cfg.XAIModel,
	}

	// The privacy-minimised window for the classifier and follow-up rewrite: resolved
	// from recent turns in a DM, the bare trigger in groups.
	rcx := routerContext(msgs, isDM)

	routerStart := time.Now()
	res.decision = b.classify(ctx, body, rcx, &res.cost) // may add to res.cost if Layer-1 runs
	res.stageMS["router"] = msSince(routerStart)
	res.route = res.decision.Route

	// Log the router's verdict before dispatch. If the route degrades, the caller logs
	// the route that actually ran, and the two lines together show "wanted X, ran Y".
	// DEBUG-level and content-free: the resolved search query is deliberately left out.
	dec := &res.decision
	b.log.DebugContext(ctx, "route decided",
		"route", dec.Route,
		"source", dec.Source,
		"confidence", dec.Confidence,
		"needs_web", dec.NeedsWeb,
		"web_decided_by", dec.WebDecidedBy,
		"verifiable", dec.Verifiable,
		"entity_obscure", dec.EntityObscure,
		"time_sensitive", dec.TimeSensitive,
		"trivial", dec.TrivialScore,
		"lookup_hint", dec.LookupHint,
		"reasoning_level", dec.ReasoningLevel)

	// prompt is what grok_direct receives; only a failed web route replaces it.
	prompt := msgs
	switch dec.Route {
	case routeTrivial:
		if b.cfg.TrivialOffloadEnabled && b.gemini != nil {
			err := b.genTrivial(ctx, msgs, &res)
			if err == nil {
				return res, nil
			}
			b.log.WarnContext(ctx, "trivial offload failed; degrading to grok_direct", "err", err)
			b.degradeTo(&res, degradeTrivial)
		}

	case routeWebThenGrok:
		if b.cfg.WebEnabled && b.web != nil {
			err := b.genWebThenGrok(ctx, body, isDM, msgs, convID, &res)
			if err == nil {
				return res, nil
			}
			// Logged before degradeTo, so "reason" is whatever the web stage itself set.
			b.log.WarnContext(ctx, "web route failed; degrading to grok_direct", "err", err, "reason", res.degraded)
			b.degradeTo(&res, degradeWeb)
			// No fresh facts are available. A static verifiable-fact miss gets the
			// abstain instruction (§4.4), because a staleness caveat on a wrong fact
			// still ships the wrong fact; everything else gets the honest staleness
			// hedge (§8.2.1).
			if dec.factualMiss() {
				prompt = factualAbstainMessages(msgs)
			} else {
				prompt = hedgeMessages(msgs)
			}
		}

	case routeReason:
		if b.cfg.ReasoningEnabled {
			err := b.genReason(ctx, msgs, convID, &res)
			if err == nil {
				return res, nil
			}
			b.log.WarnContext(ctx, "reasoning route failed; degrading to grok_direct", "err", err)
			b.degradeTo(&res, degradeReasoning)
		}
	}

	// grok_direct: both the default route and the universal fallback, and the only
	// call whose error is terminal. Cost already accumulated in res.cost is kept.
	err := b.genGrokDirect(ctx, prompt, convID, &res)
	return res, err
}

// degradeTo flags res as a fallback and records reason as the degrade reason unless
// one is already present — the first reason written wins, so a more specific reason
// set earlier by a stage (for example inside genWebThenGrok) is not overwritten.
func (b *Bot) degradeTo(res *genResult, reason string) {
	res.fallback = true
	if res.degraded != "" {
		return
	}
	res.degraded = reason
}

// genGrokDirect performs a single completion against the main Grok model. It is both
// the default route and the fallback every other route degrades to.
//
// The elapsed time is written to stageMS["final"] whether or not the call succeeds.
// On error, the error is returned and nothing else in res is modified. On success res
// receives the route (routeGrokDirect), final model, text, usage and provider request
// id, and the call's token cost is added to res.cost.Token.
func (b *Bot) genGrokDirect(ctx context.Context, msgs []Message, convID string, res *genResult) error {
	started := time.Now()
	req := LLMRequest{
		Model:           b.cfg.XAIModel,
		Messages:        msgs,
		MaxTokens:       b.cfg.MaxOutTok,
		Temperature:     b.cfg.XAITemp,
		ConvID:          convID,
		ReasoningEffort: b.cfg.GrokReasoningEffort, // "" → not sent; "none" keeps grok-4.3 fast
	}
	resp, err := b.llm.Complete(ctx, req)
	res.stageMS["final"] = msSince(started)
	if err != nil {
		return err
	}

	res.route = routeGrokDirect
	res.finalModel = b.cfg.XAIModel
	res.text = resp.Text
	res.usage = resp.Usage
	res.providerID = resp.ProviderRequestID
	res.cost.Token += computeUSD(b.cfg.XAIModel, resp.Usage, b.cfg)
	return nil
}

// genTrivial answers a trivial message through the Gemini client using the configured
// Gemini model (with the same temperature setting the Grok calls use).
//
// The elapsed time is written to stageMS["final"] whether or not the call succeeds.
// A call error, or a reply that is blank after trimming, is returned as an error so the
// caller degrades to Grok instead of sending nothing; in both cases the answer fields
// of res are left untouched. On success res receives the route (routeTrivial), final
// model, text, usage and provider request id, and the token cost is added.
//
// NOTE(review): a blank reply returns before computeUSD runs, so that call's usage is
// never added to res.cost — confirm this is intended.
func (b *Bot) genTrivial(ctx context.Context, msgs []Message, res *genResult) error {
	started := time.Now()
	req := LLMRequest{
		Model:       b.cfg.GeminiModel,
		Messages:    msgs,
		MaxTokens:   b.cfg.MaxOutTok,
		Temperature: b.cfg.XAITemp,
	}
	resp, err := b.gemini.Complete(ctx, req)
	res.stageMS["final"] = msSince(started)

	switch {
	case err != nil:
		return err
	case strings.TrimSpace(resp.Text) == "":
		return fmt.Errorf("trivial: empty Gemini reply")
	}

	res.route = routeTrivial
	res.finalModel = b.cfg.GeminiModel
	res.text = resp.Text
	res.usage = resp.Usage
	res.providerID = resp.ProviderRequestID
	res.cost.Token += computeUSD(b.cfg.GeminiModel, resp.Usage, b.cfg)
	return nil
}

// genReason answers through the Grok client at the configured higher reasoning effort,
// using the configured reasoning-capable model (the default grok-4.20-non-reasoning
// would reject the reasoning-effort parameter).
//
// The elapsed time is written to stageMS["final"] whether or not the call succeeds.
// A call error, or a reply that is blank after trimming, is returned as an error and
// leaves the answer fields of res untouched. On success res receives the route
// (routeReason), final model, text, usage and provider request id, and the token cost
// is added.
//
// NOTE(review): a blank reply returns before computeUSD runs, so that call's usage is
// never added to res.cost — confirm this is intended.
func (b *Bot) genReason(ctx context.Context, msgs []Message, convID string, res *genResult) error {
	started := time.Now()
	req := LLMRequest{
		Model:           b.cfg.ReasoningModel,
		Messages:        msgs,
		MaxTokens:       b.cfg.MaxOutTok,
		Temperature:     b.cfg.XAITemp,
		ReasoningEffort: b.cfg.ReasoningEffort, // "think harder" level (default high)
		ConvID:          convID,
	}
	resp, err := b.llm.Complete(ctx, req)
	res.stageMS["final"] = msSince(started)

	switch {
	case err != nil:
		return err
	case strings.TrimSpace(resp.Text) == "":
		return fmt.Errorf("reason: empty reply")
	}

	res.route = routeReason
	res.finalModel = b.cfg.ReasoningModel
	res.text = resp.Text
	res.usage = resp.Usage
	res.providerID = resp.ProviderRequestID
	res.cost.Token += computeUSD(b.cfg.ReasoningModel, resp.Usage, b.cfg)
	return nil
}

// webStageTimeout bounds the web/grounding fetch independently of the overall budget
// (§8.2.2): a slow search must not eat the whole request before synthesis. It is the
// timeout genWebThenGrok puts on the context handed to the web provider's Fetch; the
// synthesis call that follows runs on the caller's original context.
const webStageTimeout = 15 * time.Second

// genWebThenGrok runs the two-call web route: fetch a digest of fresh facts from the
// web provider, then have Grok synthesise the answer in voice from that digest.
//
// Accounting: the fetch's grounding / grounding-fee / web-tool costs and its citation
// count are written to res BEFORE the fetch error is inspected, because the call was
// billed either way — so a failed fetch or a failed synthesis still books the web spend
// before the caller degrades to grok_direct (the partial cascade case, §8.1). The
// per-stage deadline is applied here, uniformly for every provider.
//
// It returns the fetch error, the synthesis error, or an "empty reply" error; on any of
// them the answer fields of res are left untouched. When the fetch error is
// errGroundingCapped, res.degraded is set to degradeGroundCap first.
func (b *Bot) genWebThenGrok(ctx context.Context, body string, isDM bool, msgs []Message, convID string, res *genResult) error {
	// Choose the query (§6). The classifier's follow-up-resolved rewrite is used ONLY in
	// a DM (a group buffer interleaves members' topics) and only when it is non-blank
	// and at most 200 runes; otherwise the bare body is used. The chosen text is then
	// sanitised, since it may be model-authored and is about to leave for an external
	// search API.
	query := body
	if isDM {
		rewritten := strings.TrimSpace(res.decision.SearchQuery)
		if rewritten != "" && len([]rune(rewritten)) <= 200 {
			query = rewritten
			res.rewriteUsed = true
		}
	}
	if query = sanitizeSearchQuery(query); query == "" {
		// never send an empty query: fall back to the sanitised bare body
		query = sanitizeSearchQuery(body)
		res.rewriteUsed = false
	}
	res.searchQuery = query

	// Fetch under its own deadline; the timer is released as soon as Fetch returns.
	fetchCtx, cancelFetch := context.WithTimeout(ctx, webStageTimeout)
	fetchStart := time.Now()
	wc, fetchErr := b.web.Fetch(fetchCtx, query)
	cancelFetch()
	res.stageMS["web"] = msSince(fetchStart)

	// Book the fetch spend and grounding outcome regardless of fetchErr. GroundingFee
	// is the per-grounded-prompt overage (§7 SG1).
	res.cost.Grounding += wc.Cost.Grounding
	res.cost.GroundingFee += wc.Cost.GroundingFee
	res.cost.WebTool += wc.Cost.WebTool
	cited := len(wc.Citations)
	res.citationCount = cited
	res.webGrounded = cited > 0

	if fetchErr != nil {
		if errors.Is(fetchErr, errGroundingCapped) {
			res.degraded = degradeGroundCap
		}
		return fetchErr // web fee already booked; caller degrades to grok_direct
	}

	// A digest with no citations means the answer would be synthesised from an
	// ungrounded fetch — a silent false-web. Surface it at WARN so it shows up at the
	// default log level (§8).
	if cited == 0 {
		b.log.WarnContext(ctx, "web no-citation synth (ungrounded digest)", "provider", b.cfg.WebProvider)
	}

	// Synthesis: same model and reasoning effort as grok_direct, on the caller's context.
	synthStart := time.Now()
	req := LLMRequest{
		Model:           b.cfg.XAIModel,
		Messages:        webSynthMessages(msgs, wc),
		MaxTokens:       b.cfg.MaxOutTok,
		Temperature:     b.cfg.XAITemp,
		ConvID:          convID,
		ReasoningEffort: b.cfg.GrokReasoningEffort,
	}
	resp, err := b.llm.Complete(ctx, req)
	res.stageMS["final"] = msSince(synthStart)

	switch {
	case err != nil:
		return err
	case strings.TrimSpace(resp.Text) == "":
		return fmt.Errorf("web synth: empty reply")
	}

	res.route = routeWebThenGrok
	res.finalModel = b.cfg.XAIModel
	res.text = resp.Text
	res.providerID = resp.ProviderRequestID
	// Report the tokens of BOTH calls so analytics totals match the two-call route.
	res.usage = Usage{
		PromptTokens:     resp.Usage.PromptTokens + wc.Usage.PromptTokens,
		CachedTokens:     resp.Usage.CachedTokens + wc.Usage.CachedTokens,
		CompletionTokens: resp.Usage.CompletionTokens + wc.Usage.CompletionTokens,
	}
	res.cost.Token += computeUSD(b.cfg.XAIModel, resp.Usage, b.cfg)
	return nil
}

// webSynthMessages returns base with the web digest inserted as a system note right
// after the system prompt, so Grok answers in voice from current facts.
//
// Only wc.Digest goes into the prompt. The citation URLs are deliberately NOT passed,
// and Grok is NOT asked to cite sources: gemini grounding returns opaque
// vertexaisearch.../grounding-api-redirect/... redirect links rather than publisher
// URLs, and a cite instruction made Grok paste those redirects verbatim into the reply
// and mis-attribute them (as "links from your message"). Grounding has already
// happened and the citation count is recorded for telemetry; resolving redirects into
// real source attribution is a separate, deferred feature.
func webSynthMessages(base []Message, wc WebContext) []Message {
	const header = "Свежие данные из веба — ответь на их основе, кратко и по делу, без URL и ссылок:\n"
	return insertSystemNote(base, header+wc.Digest)
}

// hedgeMessages returns base with a staleness caveat inserted as a system note. It is
// the hedge for a web→grok_direct degrade on a RECENCY query: fresh facts were wanted
// but could not be fetched, so the model is told to answer from training-time knowledge
// and to warn honestly that the data may be out of date.
func hedgeMessages(base []Message) []Message {
	const caveat = "Нет доступа к свежим источникам прямо сейчас — отвечай по знаниям на момент обучения и честно предупреди, что данные могут быть устаревшими."
	return insertSystemNote(base, caveat)
}

// factualAbstainMessages returns base with an abstain instruction inserted as a system
// note. It is the degrade hedge for a STATIC verifiable-fact miss (§4.4): the fact is
// not stale, it is checkable and the model may simply not know it, so a staleness
// caveat would be the wrong message. The note tells Grok to admit uncertainty about
// specific names, dates, years, numbers or line-ups instead of presenting a guess as
// fact — the hallucinated-film-cast failure this hedge exists to stop.
func factualAbstainMessages(base []Message) []Message {
	const instruction = "Не удалось проверить факты через веб. Если ответ зависит от конкретных имён, дат, годов, чисел или состава — честно скажи, что не уверен в точной фактуре и можешь ошибаться; НЕ выдавай догадку за факт."
	return insertSystemNote(base, instruction)
}

// factualMiss reports whether a web degrade should use the abstain hedge (a static,
// checkable-fact question) instead of the staleness hedge (a recency question).
//
// Any recency signal — a non-empty Freshness or TimeSensitive — always selects
// staleness, so the result is false. Without one, the result is true when the question
// is Verifiable, concerns an obscure entity, or merely carries a NeedsWeb verdict (so
// an off-spec needs_web-only verdict still abstains rather than guessing confidently).
func (d RouterDecision) factualMiss() bool {
	recency := d.Freshness != "" || d.TimeSensitive
	checkable := d.Verifiable || d.EntityObscure || d.NeedsWeb
	return !recency && checkable
}

// sanitizeSearchQuery prepares a (possibly model-authored) query for egress to an
// external search API. It never trusts the input to be clean or bounded:
//   - whitespace-like control characters (newline, carriage return, tab, vertical tab,
//     form feed, U+0085 NEL) become a space, so the words around them stay separate;
//   - every other C0/C1 control character (U+0000–U+001F, U+007F–U+009F) is dropped;
//   - runs of whitespace collapse to a single space and both ends are trimmed;
//   - the result is capped at 200 runes (never splitting a rune) and trimmed again, so
//     a cut that lands right after a space leaves no trailing blank.
//
// It returns "" when the input holds nothing but whitespace and control characters.
func sanitizeSearchQuery(q string) string {
	const maxRunes = 200

	q = strings.Map(func(r rune) rune {
		switch {
		case r == '\n', r == '\r', r == '\t', r == '\v', r == '\f', r == 0x85:
			return ' ' // separators: keep the word boundary instead of gluing words
		case r < 0x20, r >= 0x7f && r <= 0x9f:
			return -1 // remaining C0/C1 controls carry no query meaning
		}
		return r
	}, q)
	q = strings.Join(strings.Fields(q), " ") // collapse whitespace runs, trim ends

	// Walk rune start offsets so the cap is applied without allocating a []rune.
	seen := 0
	for i := range q {
		if seen == maxRunes {
			return strings.TrimSpace(q[:i])
		}
		seen++
	}
	return q
}

// insertSystemNote returns a new slice holding base with an extra system message
// carrying content placed at index 1 — directly after base[0], which is expected to be
// the system prompt from buildContext. The remaining messages keep their order and base
// itself is not modified. For an empty base the result is just the note.
func insertSystemNote(base []Message, content string) []Message {
	note := Message{Role: "system", Content: content}
	if len(base) == 0 {
		return []Message{note}
	}
	out := make([]Message, len(base)+1)
	out[0] = base[0]
	out[1] = note
	copy(out[2:], base[1:])
	return out
}