fix(ai-bot): keep recommendation requests on grok_direct instead of force-routing them to web on a freshness word

This commit is contained in:
heaven 2026-06-06 02:27:05 +03:00
parent ef5a9f5013
commit 9beb5a19bd
3 changed files with 55 additions and 5 deletions

View file

@ -118,6 +118,16 @@ var (
// The leading [\s«"„(] class is only an OPTIONAL left boundary, never a trigger.
lookupIntentRe_RU = regexp.MustCompile(`(?i)(^|[\s«"„(])(кто\s+(так(ой|ая|ие)|снимал(ся|ась|ись)|играл|написал|основал|изобрёл|изобрел|режисс[её]р|автор)|в\s+как(ом|ой)\s+(год[уе]|фильм[еа]|сериал[еа]|книг[еи]|игр[еы])|когда\s+(вышел|вышла|вышло|выйдет|основан[аы]?|родил(ся|ась)|умер(ла)?|состоял(ся|ась)|был[аои]?\s+выпущен)|в\s+каком\s+году|сколько\s+(лет|стоит\s+бил|серий|сезонов|эпизодов)|чем\s+(закончил|известен|знаменит))`)
lookupIntentRe_EN = regexp.MustCompile(`(?i)(^|[\s"'(])(who\s+(is|are|was|were|starred|played|directed|wrote|founded|invented|created)\s|in\s+(what|which)\s+(year|film|movie|show|series|book|game)\b|when\s+(did|was|were|does|is)\b.*\b(release|released|come\s+out|came\s+out|born|die|died|found|founded|launch|launched|air|aired)\b|what\s+year\b|how\s+many\s+(seasons|episodes|films|movies|books))`)
// recommendationRe — a recommendation/advice request ("посоветуй фильм", "что посмотреть",
// "what to watch"). Used ONLY to suppress the freshness WebForce (see ClassifyLayer0): such
// requests are answered from the model's own taste/knowledge, and force-routing them to web
// is actively harmful — the web synth ("answer strictly from the digest") makes Grok parrot a
// generic SEO listicle and recommend nothing (observed live: "посоветуй фильм … в этот вечер"
// → a "домашний спа/почитать книгу" non-answer). Kept tight: only explicit recommend/advice
// verbs and "что/чем/во что/куда + activity", never bare interrogatives, so it can't swallow a
// genuine fresh lookup. Cyrillic stems unanchored (lowercased input), English \b-anchored.
recommendationRe = regexp.MustCompile(`(посовету|порекоменд|что\s+(посмотреть|глянуть|почитать|приготовить|послушать|подарить|поиграть)|чем\s+(себя\s+)?заня|во\s+что\s+(поиграть|сыграть)|куда\s+(сходить|пойти)|\brecommend|\bsuggest|what\s+(to|should\s+i)\s+(watch|read|cook|do|play|listen|make|see)|what\s+(movie|film|book|show|series|game)s?\s+(to|should|do\s+you))`)
)
// NOTE: the project route used to require a Layer-0 lexical hint (literal "vojo" / an
@ -140,7 +150,14 @@ func ClassifyLayer0(body string) Layer0 {
return Layer0{Route: RouteGrokDirect}
}
lookupHint := lookupIntentRe_RU.MatchString(s) || lookupIntentRe_EN.MatchString(s)
if freshnessRe.MatchString(s) {
// Freshness forces web — EXCEPT for a recommendation/advice request that merely happens to
// carry a freshness lexeme ("посоветуй фильм … сегодня вечером"). Those are answered from the
// model's own knowledge; force-routing them to web makes the synth parrot an SEO listicle and
// recommend nothing (see recommendationRe). They fall through to the classifier, which keeps
// them on grok_direct and still sends genuine "новинки"/"latest" recommendations to web via
// time_sensitive. A non-recommendation freshness rumination ("сегодня я думаю…") still
// force-routes — the accepted, designed cheap false-web.
if freshnessRe.MatchString(s) && !recommendationRe.MatchString(s) {
return Layer0{Route: RouteWeb, WebForce: true, Freshness: "recent", LookupHint: lookupHint}
}
if IsTrivial(s) {
@ -182,7 +199,9 @@ type Combined struct {
// gates EXECUTION on PROJECT_KB_ENABLED (mirroring how WebEnabled gates the web route), so
// with the flag off a RouteProject decision cleanly falls through to grok_direct.
// - freshnessRe (WebForce) is a HARD web signal, always honoured (it survives the
// classifier being down).
// classifier being down). The ONE carve-out is applied upstream in ClassifyLayer0:
// a recommendation/advice request ("посоветуй фильм … сегодня") does NOT set WebForce,
// because force-routing a recommendation to web makes the synth parrot an SEO listicle.
// - Every OTHER web arm (the classifier's needs_web≥floor AND verifiable,
// entity_obscure, time_sensitive, lookupHint && verifiable) is gated by `paranoid`
// (WEB_PARANOID). The needs_web arm additionally requires `verifiable`: on a small

View file

@ -101,6 +101,37 @@ func TestLookupHintFalsePositiveCorpus(t *testing.T) {
}
}
// TestRecommendationFreshnessCarveOut pins the Layer-0 carve-out for recommendation/advice
// requests: when such a request also carries a freshness lexeme ("сегодня"/"today"/
// "right now"), ClassifyLayer0 must leave WebForce unset, while freshness inputs that are
// not recommendations must keep WebForce set.
func TestRecommendationFreshnessCarveOut(t *testing.T) {
	// One table for both expectations; wantForce is the WebForce value ClassifyLayer0
	// is expected to report for body.
	cases := []struct {
		body      string
		wantForce bool
	}{
		// Recommendation/advice requests with a freshness word: no forced web route.
		{"посоветуй фильм на сегодня вечер", false},
		{"что посмотреть сегодня вечером", false},
		{"чем заняться сегодня", false},
		{"что приготовить сегодня на ужин", false},
		{"recommend a movie today", false},
		{"what to watch right now", false},
		// Non-recommendation freshness inputs: the forced web route must remain.
		{"какие новости сегодня", true},
		{"курс доллара сейчас", true},
		{"what's the weather today", true},
		{"сегодня я думаю о смысле жизни", true}, // non-recommendation rumination — designed cheap false-web
	}
	for _, tc := range cases {
		gotForce := ClassifyLayer0(tc.body).WebForce
		switch {
		case gotForce && !tc.wantForce:
			t.Errorf("recommendation with a freshness lexeme must NOT force web: %q", tc.body)
		case !gotForce && tc.wantForce:
			t.Errorf("non-recommendation freshness must still force web: %q", tc.body)
		}
	}
}
// TestCombineFreshnessAlwaysWeb: a freshnessRe hit (WebForce) routes to web regardless of
// WEB_PARANOID and regardless of the classifier verdict — the deterministic signal that
// survives the classifier being down (§4.4).

View file

@ -60,10 +60,10 @@ const classifierPrompt = `You are a routing classifier for a multilingual chat a
Your main job is an EPISTEMIC judgement, not a topic label: if the assistant answered the LAST message purely from its own memory (no web), how likely is it to state a WRONG checkable fact a name, a film/book cast, a date or release year, a number, a price, a score, a population, a who-did-what about a SPECIFIC named person/film/company/place/event? Such facts are exactly what a model misremembers and states confidently.
Decide:
- "needs_web": true if a correct answer DEPENDS on such a checkable external fact, OR on anything time-sensitive (news, "сегодня"/today, "сейчас", latest, current price/rate/weather/score). Recency is sufficient but NOT necessary a STATIC fact like a film's cast or a country's capital also counts. When in doubt, prefer TRUE: grounding is cheap, a confident wrong fact is not. FALSE for opinions, explanations, advice, casual chat, creative writing, code help, or transforming text the user already gave you.
- "needs_web": true if a correct answer DEPENDS on such a checkable external fact, OR on anything time-sensitive (news, "сегодня"/today, "сейчас", latest, current price/rate/weather/score). Recency is sufficient but NOT necessary a STATIC fact like a film's cast or a country's capital also counts. When in doubt, prefer TRUE: grounding is cheap, a confident wrong fact is not. FALSE for opinions, explanations, advice, casual chat, creative writing, code help, or transforming text the user already gave you. Recommendations and suggestions what to watch, read, cook, play, or do ("посоветуй фильм", "что посмотреть", "чем заняться вечером") are ADVICE: answer from your own knowledge, so needs_web=FALSE even when the user says "сегодня"/"tonight"/"this evening" (that is WHEN they will act, not a need for fresh data). The ONLY exception is a request explicitly about NEW or CURRENT releases / what is on right now ("новинки", "что вышло", "what's new", "now playing", "latest") that is needs_web=TRUE AND time_sensitive=TRUE (so a new-release recommendation actually routes to fresh web results).
- "verifiable": true if the message is specifically a checkable fact about a NAMED entity (who acted in <film>, who is CEO of <company>, what year <event>, population of <place>) even if not about "today". A bare follow-up like "2024 года" inherits the entity from the previous turn.
- "entity_obscure": true if the salient entity is plausibly long-tail / not a household name (a minor film, a non-famous person, a niche product) these are where memory fails hardest.
- "time_sensitive": true if the answer can change over time (news, prices, weather, standings, "current"/"latest"/"now").
- "time_sensitive": true if the answer can change over time (news, prices, weather, standings, "current"/"latest"/"now"). But a plan to DO or WATCH something "tonight"/"this evening"/"сегодня вечером" is NOT time-sensitive the timeframe is when the user acts, not a fact that changes.
- "trivial": true ONLY for a bare greeting, acknowledgement, or tiny arithmetic with no real question.
- "about_project": true ONLY if the user is asking about THIS chat app itself, called Vojo its concrete features, how to do something inside the app (calls, encryption, settings, rooms, channels), its limits, privacy, or pricing. Examples: "что ты умеешь", "what can this app do", "как включить шифрование здесь", "does Vojo support video calls". FALSE for any general-knowledge question that merely mentions a product or place name (including one coincidentally called Vojo that is not this app), and FALSE for a generic "what can an AI assistant do". When unsure, prefer FALSE.
- "search_query": a SELF-CONTAINED web search query for this message, written in the LANGUAGE of the user's latest message (an English message an English query; a Russian one a Russian query) so the results match the user's language and region instead of defaulting to one country. Resolve follow-ups from context (a bare "2024 года" after discussing a film becomes "<film name> 2024 фильм актёрский состав"). For broad/region-neutral requests (e.g. "interesting news") keep it general and international, don't narrow it to a single country. Empty string ONLY if both needs_web and verifiable are false.
@ -139,7 +139,7 @@ func (b *Bot) routeLayer1(ctx context.Context, rcx string, l0 rd.Layer0, cost *C
resp, err := b.gemini.Complete(ctx, LLMRequest{
Model: b.cfg.GeminiModel,
Messages: []Message{{Role: "user", Content: classifierPrompt + rcx}},
MaxTokens: 110, // was 80; the schema grew (about_project added) — must not truncate
MaxTokens: 160, // headroom for a long Cyrillic context-resolved search_query; a cut mid-query yields invalid JSON → safe degrade to the Layer-0 heuristic, but we'd lose the verdict, so leave slack
Temperature: 0,
})
if err != nil {