package main

import (
	"context"
	"errors"
	"fmt"
	"strings"
	"time"
)

// cascade.go is the generation half of the bot: given an admitted request, it routes
// (router.go), runs the chosen route's provider(s), and ALWAYS degrades to grok_direct
// on any layer being off or failing (§8.2). It returns a genResult the business logic
// (respond) settles, sends, and logs — keeping ledger/never-silent/telemetry in one
// place and the routing here. With every cascade flag off, classify returns grok_direct
// and this collapses to exactly today's single Grok call.

// genResult carries everything respond needs from one generation: the answer text, the
// model usage (for token billing), the FULL cost breakdown (router + web + final), and
// the routing metadata for telemetry. cost is accumulated stage by stage, so a partial
// cascade (a paid web fetch that then degraded) still books what it actually spent.
type genResult struct {
	text       string
	usage      Usage
	cost       CostBreakdown
	finalModel string
	providerID string
	decision   RouterDecision

	// route is the route actually taken (may differ from decision on degrade).
	route string
	// fallback is true if we degraded off the decided route.
	fallback bool
	// degraded is the degrade reason for request_log.
	degraded string
	// stageMS holds per-stage elapsed milliseconds, keyed by stage name.
	stageMS map[string]int

	// Web-route outcome (for request_log §8): the resolved query actually sent to Fetch,
	// whether the context-resolved rewrite was used (vs the bare body), and whether the
	// fetch came back grounded with citations (a zero-citation synth is a silent false-web).
	searchQuery   string
	rewriteUsed   bool
	webGrounded   bool
	citationCount int
}

// msSince returns the whole number of milliseconds elapsed since t.
func msSince(t time.Time) int {
	elapsed := time.Since(t)
	return int(elapsed / time.Millisecond)
}

// reserveEstimate is the admission envelope: the most expensive ENABLED route's cost,
// so whichever route the router picks is covered by the reservation (the ceiling can't
// be slipped by routing to a pricier path after admission). With every cascade flag
// off it equals grok_direct's estimate — byte-for-byte today's reservation.
Slightly // generous is fine: Settle books the authoritative actual afterward. func (b *Bot) reserveEstimate() float64 { est := b.estimateUSD(b.cfg.XAIModel) // grok_direct / trivial(cheaper)/synthesis base if b.cfg.WebEnabled { // web_then_grok = a web fetch fee + the Grok synthesis already counted above. if b.cfg.WebProvider == webProviderGrokWebSearch { // fetch can search several times and pull large context; reserve generously. est += float64(maxWebSearchCalls)*grokWebSearchPerCall + b.estimateUSD(b.cfg.XAIModel) } else { // gemini grounding: the fetch's tokens PLUS the per-grounded-prompt fee (§7 // SG2), so the admission envelope is a true upper bound once the fee is booked. est += b.estimateUSD(b.cfg.GeminiModel) + b.cfg.GeminiGroundingPerPrompt } } if b.cfg.ReasoningEnabled { // Higher reasoning effort can burn more output tokens; reserve double. est = max(est, 2*b.estimateUSD(b.cfg.ReasoningModel)) } // The always-on Layer-1 classifier leg (§7 Finding 4): a cheap Gemini call on every // message when the classifier is enabled, so reserved ≥ actual stays true. Added after // the max() so it is never swallowed by the reasoning branch. if b.cfg.RouterClassifierEnabled { est += b.estimateUSD(b.cfg.GeminiModel) } return est } // generate routes and produces an answer, degrading to grok_direct on any failure. // It returns a terminal error ONLY if even grok_direct fails; every other route falls // through to grok_direct rather than erroring. func (b *Bot) generate(ctx context.Context, body string, msgs []Message, convID string, isDM bool) (genResult, error) { res := genResult{stageMS: map[string]int{}, finalModel: b.cfg.XAIModel} // The privacy-minimised conversation window for the classifier + follow-up rewrite. // DM-resolved (last ≤2 turns); bare trigger in groups (no cross-member subject bleed). 
rcx := routerContext(msgs, isDM) t0 := time.Now() res.decision = b.classify(ctx, body, rcx, &res.cost) // accumulates cost.Router if Layer-1 runs res.stageMS["router"] = msSince(t0) res.route = res.decision.Route // The router's pre-dispatch verdict (what it chose, why, how sure). On a degrade the // route that actually runs differs from this — respond logs that final outcome — so // the two lines together show "router wanted X, we ran Y". DEBUG: routing diagnostics, // content-free (the resolved search_query is NOT logged here — it's a gated path, §8). b.log.DebugContext(ctx, "route decided", "route", res.decision.Route, "source", res.decision.Source, "confidence", res.decision.Confidence, "needs_web", res.decision.NeedsWeb, "web_decided_by", res.decision.WebDecidedBy, "verifiable", res.decision.Verifiable, "entity_obscure", res.decision.EntityObscure, "time_sensitive", res.decision.TimeSensitive, "trivial", res.decision.TrivialScore, "lookup_hint", res.decision.LookupHint, "reasoning_level", res.decision.ReasoningLevel) finalMsgs := msgs switch res.decision.Route { case routeTrivial: if b.cfg.TrivialOffloadEnabled && b.gemini != nil { if err := b.genTrivial(ctx, msgs, &res); err == nil { return res, nil } else { b.log.WarnContext(ctx, "trivial offload failed; degrading to grok_direct", "err", err) b.degradeTo(&res, degradeTrivial) } } case routeWebThenGrok: if b.cfg.WebEnabled && b.web != nil { if err := b.genWebThenGrok(ctx, body, isDM, msgs, convID, &res); err == nil { return res, nil } else { b.log.WarnContext(ctx, "web route failed; degrading to grok_direct", "err", err, "reason", res.degraded) b.degradeTo(&res, degradeWeb) // We have no fresh facts. For a RECENCY miss, hedge with an honest staleness // caveat (§8.2.1). 
For a STATIC verifiable-fact miss (a film cast, a date), // the staleness caveat is wrong — a stale caveat on a wrong cast still ships // the wrong cast — so instruct Grok to ABSTAIN on specific names/dates/numbers // instead of emitting a confident guess (§4.4). if res.decision.factualMiss() { finalMsgs = factualAbstainMessages(msgs) } else { finalMsgs = hedgeMessages(msgs) } } } case routeReason: if b.cfg.ReasoningEnabled { if err := b.genReason(ctx, msgs, convID, &res); err == nil { return res, nil } else { b.log.WarnContext(ctx, "reasoning route failed; degrading to grok_direct", "err", err) b.degradeTo(&res, degradeReasoning) } } } // grok_direct — the default route AND the universal fallback. The only path that // can return a terminal error (even Grok failed). It preserves any cost already // spent (router classifier, a partial web fetch) in res.cost. if err := b.genGrokDirect(ctx, finalMsgs, convID, &res); err != nil { return res, err } return res, nil } // degradeTo marks res as a fallback to grok_direct, keeping the first/most-specific // degrade reason (e.g. a web provider's grounding_cap set inside genWebThenGrok). func (b *Bot) degradeTo(res *genResult, reason string) { res.fallback = true if res.degraded == "" { res.degraded = reason } } // genGrokDirect is today's path: one Grok call. Also the fallback for every other // route. On success it fills res (route, final model, text, usage, provider id) and // adds the token cost. 
func (b *Bot) genGrokDirect(ctx context.Context, msgs []Message, convID string, res *genResult) error { t := time.Now() resp, err := b.llm.Complete(ctx, LLMRequest{ Model: b.cfg.XAIModel, Messages: msgs, MaxTokens: b.cfg.MaxOutTok, Temperature: b.cfg.XAITemp, ConvID: convID, ReasoningEffort: b.cfg.GrokReasoningEffort, // "" → not sent; "none" keeps grok-4.3 fast }) res.stageMS["final"] = msSince(t) if err != nil { return err } res.route, res.finalModel = routeGrokDirect, b.cfg.XAIModel res.text, res.usage, res.providerID = resp.Text, resp.Usage, resp.ProviderRequestID res.cost.Token += computeUSD(b.cfg.XAIModel, resp.Usage, b.cfg) return nil } // genTrivial answers a trivial message with the cheap Gemini model. An empty reply is // treated as a failure so the caller degrades to Grok rather than sending nothing. func (b *Bot) genTrivial(ctx context.Context, msgs []Message, res *genResult) error { t := time.Now() resp, err := b.gemini.Complete(ctx, LLMRequest{ Model: b.cfg.GeminiModel, Messages: msgs, MaxTokens: b.cfg.MaxOutTok, Temperature: b.cfg.XAITemp, }) res.stageMS["final"] = msSince(t) if err != nil { return err } if strings.TrimSpace(resp.Text) == "" { return fmt.Errorf("trivial: empty Gemini reply") } res.route, res.finalModel = routeTrivial, b.cfg.GeminiModel res.text, res.usage, res.providerID = resp.Text, resp.Usage, resp.ProviderRequestID res.cost.Token += computeUSD(b.cfg.GeminiModel, resp.Usage, b.cfg) return nil } // genReason answers with Grok at a higher reasoning effort. Uses the configured // reasoning-capable model (the default grok-4.20-non-reasoning would reject the param). 
func (b *Bot) genReason(ctx context.Context, msgs []Message, convID string, res *genResult) error { t := time.Now() resp, err := b.llm.Complete(ctx, LLMRequest{ Model: b.cfg.ReasoningModel, Messages: msgs, MaxTokens: b.cfg.MaxOutTok, Temperature: b.cfg.XAITemp, ReasoningEffort: b.cfg.ReasoningEffort, // "think harder" level (default high) ConvID: convID, }) res.stageMS["final"] = msSince(t) if err != nil { return err } if strings.TrimSpace(resp.Text) == "" { return fmt.Errorf("reason: empty reply") } res.route, res.finalModel = routeReason, b.cfg.ReasoningModel res.text, res.usage, res.providerID = resp.Text, resp.Usage, resp.ProviderRequestID res.cost.Token += computeUSD(b.cfg.ReasoningModel, resp.Usage, b.cfg) return nil } // webStageTimeout bounds the web/grounding fetch independently of the overall budget // (§8.2.2): a slow search must not eat the whole request before synthesis. const webStageTimeout = 15 * time.Second // genWebThenGrok fetches fresh facts via the web provider, then has Grok synthesise the // answer in voice from that digest. The web fetch's cost+tokens are booked into res // EVEN ON FAILURE — the call was billed — so a synth failure or empty fetch still // accounts for the spend before the caller degrades to grok_direct (the partial cascade // case, §8.1). The daily cap and per-stage deadline are applied here, uniformly for both // providers. func (b *Bot) genWebThenGrok(ctx context.Context, body string, isDM bool, msgs []Message, convID string, res *genResult) error { // DM-gated rewrite-with-fallback (§6): use the classifier's self-contained, // follow-up-resolved query, but ONLY in a DM (a group buffer interleaves members' // topics) and only when it's present and not over-long; otherwise the bare body — so // the fetch is never worse than today. Sanitise before egress (it is model-authored // text going to an external search API): collapse control chars/whitespace, cap length. 
q := body if isDM { if sq := strings.TrimSpace(res.decision.SearchQuery); sq != "" && len([]rune(sq)) <= 200 { q, res.rewriteUsed = sq, true } } q = sanitizeSearchQuery(q) if q == "" { q, res.rewriteUsed = sanitizeSearchQuery(body), false // never send an empty query } res.searchQuery = q // Per-stage web/grounding deadline, independent of the overall budget. wctx, cancelW := context.WithTimeout(ctx, webStageTimeout) tw := time.Now() wc, ferr := b.web.Fetch(wctx, q) cancelW() res.stageMS["web"] = msSince(tw) // Book the fetch's fee + tokens whether or not it produced a usable digest — the call // was billed (the daily cap, if any, is enforced inside the provider). GroundingFee is // the per-grounded-prompt overage (§7 SG1), booked even on the error return. res.cost.Grounding += wc.Cost.Grounding res.cost.GroundingFee += wc.Cost.GroundingFee res.cost.WebTool += wc.Cost.WebTool res.citationCount = len(wc.Citations) res.webGrounded = len(wc.Citations) > 0 webUsage := wc.Usage if ferr != nil { if errors.Is(ferr, errGroundingCapped) { res.degraded = degradeGroundCap } return ferr // web fee already booked; caller degrades to grok_direct (with hedge) } // A non-empty digest with NO citations is a silent false-web (the answer is synthesised // from an ungrounded fetch). gemini_grounding errors out before here; grok_web_search // can reach this — surface it at WARN so it's visible at the default level (§8). 
if len(wc.Citations) == 0 { b.log.WarnContext(ctx, "web no-citation synth (ungrounded digest)", "provider", b.cfg.WebProvider) } tf := time.Now() resp, err := b.llm.Complete(ctx, LLMRequest{ Model: b.cfg.XAIModel, Messages: webSynthMessages(msgs, wc), MaxTokens: b.cfg.MaxOutTok, Temperature: b.cfg.XAITemp, ConvID: convID, ReasoningEffort: b.cfg.GrokReasoningEffort, // same voice, same effort as grok_direct }) res.stageMS["final"] = msSince(tf) if err != nil { return err } if strings.TrimSpace(resp.Text) == "" { return fmt.Errorf("web synth: empty reply") } res.route, res.finalModel = routeWebThenGrok, b.cfg.XAIModel res.text, res.providerID = resp.Text, resp.ProviderRequestID // Report BOTH calls' tokens so the analytics token totals match the two-call route. res.usage = Usage{ PromptTokens: resp.Usage.PromptTokens + webUsage.PromptTokens, CachedTokens: resp.Usage.CachedTokens + webUsage.CachedTokens, CompletionTokens: resp.Usage.CompletionTokens + webUsage.CompletionTokens, } res.cost.Token += computeUSD(b.cfg.XAIModel, resp.Usage, b.cfg) return nil } // webSynthMessages inserts the fresh web digest as a system note just after the system // prompt, so Grok answers in voice using current facts. It deliberately does NOT pass the // raw citation URLs into the prompt, nor ask Grok to "cite sources": gemini grounding // returns opaque vertexaisearch.../grounding-api-redirect/... redirect links (not publisher // URLs), and instructing Grok to cite made it paste those ugly redirects verbatim into the // reply and mis-attribute them ("ссылок из твоего сообщения"). The grounding already // happened (citation_count is recorded for telemetry); the user wants the answer, not // Google's internal redirect links. Real source attribution (resolving redirects to // domains) is a separate, deferred feature. 
// // The note is also AUTHORITATIVE about the data being current and provided: the system // prompt's "don't claim you have internet access if you don't" rule otherwise wins on a // fast (reasoning_effort=none) Grok call, so it ignored the injected digest and replied // "I don't have live web access" despite being handed fresh news. The note now explicitly // lifts that rule for this turn (the data IS provided), so Grok answers from it instead of // denying it. The grok_direct "no internet" honesty is untouched — only this web turn. func webSynthMessages(base []Message, wc WebContext) []Message { facts := "Fresh web-search results for the user's request (current as of now) — answer strictly from them as up-to-date facts, briefly and to the point, with no URLs or links. The data is provided to you, so do NOT say you have no internet access or that you can't fetch anything fresh:\n" + wc.Digest return insertSystemNote(base, facts) } // hedgeMessages adds an honest staleness caveat for a web→grok_direct degrade on a // RECENCY query: the user wanted fresh facts but we couldn't fetch them, so the model // must flag that its answer is from training knowledge and may be out of date. func hedgeMessages(base []Message) []Message { return insertSystemNote(base, "No access to fresh sources right now — answer from your training knowledge and honestly warn that the data may be out of date.") } // factualAbstainMessages is the degrade hedge for a STATIC verifiable-fact miss (§4.4): // a staleness caveat is wrong here (the fact isn't stale, it's checkable and the model // may simply not know it), so instruct Grok to ABSTAIN on specific names/dates/numbers // rather than ship a confident guess — the exact failure (the hallucinated film cast) // this redesign exists to stop. func factualAbstainMessages(base []Message) []Message { return insertSystemNote(base, "Couldn't verify the facts via the web. 
If the answer depends on specific names, dates, years, numbers, or a cast, honestly say you're not sure of the exact details and may be wrong; do NOT pass a guess off as fact.") } // factualMiss reports whether a web degrade should use the abstain hedge (a static // checkable-fact question) rather than the staleness hedge (a recency question). A // recency signal (freshnessRe or the classifier's time_sensitive) always means // staleness; otherwise a verifiable / obscure-entity question — OR any non-recency // needs_web verdict (so an off-spec needs_web-only verdict still abstains rather than // emit a confident guess) — means abstain. func (d RouterDecision) factualMiss() bool { if d.Freshness != "" || d.TimeSensitive { return false } return d.Verifiable || d.EntityObscure || d.NeedsWeb } // sanitizeSearchQuery prepares a (possibly model-authored) query for egress to an // external search API: collapse newlines/control chars/runs of whitespace to single // spaces and cap the rune length. Never trusts the model to have produced clean, // bounded text. func sanitizeSearchQuery(q string) string { q = strings.Map(func(r rune) rune { if r == '\n' || r == '\r' || r == '\t' { return ' ' } if r < 0x20 || r == 0x7f { return -1 // drop other control chars } return r }, q) q = strings.Join(strings.Fields(q), " ") // collapse whitespace runs if r := []rune(q); len(r) > 200 { q = strings.TrimSpace(string(r[:200])) } return q } // insertSystemNote inserts an extra system message right after the system prompt // (base[0] from buildContext), preserving the rest of the window. func insertSystemNote(base []Message, content string) []Message { note := Message{Role: "system", Content: content} if len(base) == 0 { return []Message{note} } out := make([]Message, 0, len(base)+1) out = append(out, base[0], note) out = append(out, base[1:]...) return out }