From c12c228eb81deb50d196ff373453cc8ccce1d67d Mon Sep 17 00:00:00 2001 From: heaven Date: Thu, 4 Jun 2026 22:44:56 +0300 Subject: [PATCH] fix(ai-bot): strip the bot mention from the search query and append a clickable sources footer to web answers --- apps/ai-bot/bot.go | 29 ++++++++--- apps/ai-bot/bot_test.go | 23 +++++++++ apps/ai-bot/cascade.go | 10 ++-- apps/ai-bot/mentions.go | 29 +++++++++++ apps/ai-bot/provider_gemini.go | 9 +++- apps/ai-bot/sources.go | 92 ++++++++++++++++++++++++++++++++++ apps/ai-bot/sources_test.go | 52 +++++++++++++++++++ apps/ai-bot/web.go | 19 ++++++- docs/ai/ai-bot.md | 30 +++++++++++ 9 files changed, 279 insertions(+), 14 deletions(-) create mode 100644 apps/ai-bot/sources.go create mode 100644 apps/ai-bot/sources_test.go diff --git a/apps/ai-bot/bot.go b/apps/ai-bot/bot.go index 607568c1..4e9f716b 100644 --- a/apps/ai-bot/bot.go +++ b/apps/ai-bot/bot.go @@ -389,6 +389,14 @@ const unlimitedCap = 1 << 30 func (b *Bot) respond(ctx context.Context, roomID, threadRoot string, isDM bool, ev *Event, mc *MessageContent, history []bufferedMsg) { started := time.Now() + // Clean the trigger ONCE, before it reaches the search query, the prompt, the buffer, + // or telemetry. Two egress hazards both flow from the raw body: the bot's own mention + // ("@ai:vojo.chat …", which the grounding provider mis-read as the subject "vojo.chat"), + // and the Matrix rich-reply fallback (the quoted parent, "> <@ai> …"), which would + // otherwise be googled verbatim on a reply-style follow-up. Both are stripped here so + // every downstream consumer sees only what the user actually asked. Detection already + // happened (m.mentions / replyParentIsBot), so this never affects whether we answer. 
+ mc.Body = stripBotMention(stripReplyFallback(mc.Body), b.cfg.BotMXID) // One telemetry row per request, populated as the flow decides its outcome and // emitted once via defer — so every exit (deny, error, empty, paid silence, success) // is recorded without scattering writes (F-FUNC-5). It starts as route=none/ok=false; @@ -565,7 +573,11 @@ func (b *Bot) respond(ctx context.Context, roomID, threadRoot string, isDM bool, } b.log.InfoContext(ctx, "answered", "room", roomID, "sender", ev.Sender, "dm", isDM, "route", res.route, "usd", res.cost.Total(), "prompt_tokens", res.usage.PromptTokens, "completion_tokens", res.usage.CompletionTokens) - if err := b.sendReply(ctx, roomID, threadRoot, ev, mc, text); err != nil { + // Append the source attribution to the SENT message only — not to the buffered answer: + // the gemini redirect links are ephemeral, so stale links must not pollute the history + // that feeds later turns (sendReply buffers `text`, sends `text+footer`). + footer := sourcesFooter(text, res.sources) + if err := b.sendReply(ctx, roomID, threadRoot, ev, mc, text, footer); err != nil { // Paid silence (§8.1): the spend is real (USD is kept — refunding it would // under-count the ceiling), but the reply never landed. Refund the request SLOT // so the user can retry, and react ⚠️ so the failure isn't silent. @@ -679,12 +691,15 @@ func (b *Bot) reactEncryptedOnce(ctx context.Context, roomID, eventID string) bo return true } -// sendReply sends the model's actual answer and records the completed exchange in the -// conversation buffer so the next turn has context. It RETURNS the send error so the -// caller can handle paid silence (§8.1): a billed answer that failed to deliver must -// refund the slot and react, not vanish. 
-func (b *Bot) sendReply(ctx context.Context, roomID, threadRoot string, trigger *Event, triggerMC *MessageContent, body string) error { - if err := b.sendMessage(ctx, roomID, threadRoot, trigger, triggerMC, body); err != nil { +// sendReply sends the answer (plus an optional source-attribution footer) and records the +// completed exchange in the conversation buffer so the next turn has context. `body` is the +// model's prose — what gets BUFFERED as the assistant turn; `footer` is the server-built +// "Sources" line, appended only to the SENT message so its ephemeral grounding links never +// enter the history that feeds later turns. It RETURNS the send error so the caller can +// handle paid silence (§8.1): a billed answer that failed to deliver must refund the slot +// and react, not vanish. +func (b *Bot) sendReply(ctx context.Context, roomID, threadRoot string, trigger *Event, triggerMC *MessageContent, body, footer string) error { + if err := b.sendMessage(ctx, roomID, threadRoot, trigger, triggerMC, body+footer); err != nil { return err } // Record the user trigger AND the assistant answer together, only AFTER the answer diff --git a/apps/ai-bot/bot_test.go b/apps/ai-bot/bot_test.go index 5fa3bb73..158cab8a 100644 --- a/apps/ai-bot/bot_test.go +++ b/apps/ai-bot/bot_test.go @@ -70,6 +70,29 @@ func TestStripReplyFallback(t *testing.T) { } } +func TestStripBotMention(t *testing.T) { + cases := []struct{ in, want string }{ + // The headline regression: the full-mxid pill fallback cinny writes must not reach + // the search query (it made the grounding provider search for "vojo.chat"). + {"@ai:vojo.chat мессенджер макс удалили из эппстора?", "мессенджер макс удалили из эппстора?"}, + // Bare "@localpart" fallback some clients write, with trailing address punctuation. + {"@ai, какая погода в Москве", "какая погода в Москве"}, + // Mention mid-message is still removed (it is never user content). 
+ {"скажи @ai:vojo.chat кто выиграл", "скажи кто выиграл"}, + // No mention → unchanged (DMs, where the bot isn't addressed by name). + {"кто выиграл вчера", "кто выиграл вчера"}, + // The product name in a real question must survive (we never strip the display name). + {"@ai:vojo.chat что умеет Vojo AI", "что умеет Vojo AI"}, + // A longer handle that merely contains the localpart is kept. + {"@ai:vojo.chat пинг @aibot", "пинг @aibot"}, + } + for _, c := range cases { + if got := stripBotMention(c.in, botID); got != c.want { + t.Errorf("stripBotMention(%q) = %q, want %q", c.in, got, c.want) + } + } +} + func TestComputeUSD(t *testing.T) { const model = "grok-test" cfg := &Config{XAIModel: model, Prices: map[string]ModelPrice{ diff --git a/apps/ai-bot/cascade.go b/apps/ai-bot/cascade.go index e14f365b..0c11f997 100644 --- a/apps/ai-bot/cascade.go +++ b/apps/ai-bot/cascade.go @@ -38,6 +38,7 @@ type genResult struct { rewriteUsed bool webGrounded bool citationCount int + sources []WebSource // user-facing source attribution (web route only; sources.go) } func msSince(t time.Time) int { return int(time.Since(t).Milliseconds()) } @@ -319,6 +320,7 @@ func (b *Bot) genWebThenGrok(ctx context.Context, body string, isDM bool, msgs [ res.cost.WebTool += wc.Cost.WebTool res.citationCount = len(wc.Citations) res.webGrounded = len(wc.Citations) > 0 + res.sources = wc.Sources // carried to the user-facing "Sources" footer on success webUsage := wc.Usage if ferr != nil { if errors.Is(ferr, errGroundingCapped) { @@ -366,10 +368,10 @@ func (b *Bot) genWebThenGrok(ctx context.Context, body string, isDM bool, msgs [ // raw citation URLs into the prompt, nor ask Grok to "cite sources": gemini grounding // returns opaque vertexaisearch.../grounding-api-redirect/... redirect links (not publisher // URLs), and instructing Grok to cite made it paste those ugly redirects verbatim into the -// reply and mis-attribute them ("ссылок из твоего сообщения"). 
The grounding already -// happened (citation_count is recorded for telemetry); the user wants the answer, not -// Google's internal redirect links. Real source attribution (resolving redirects to -// domains) is a separate, deferred feature. +// reply and mis-attribute them ("ссылок из твоего сообщения"). Source attribution is instead +// built SERVER-SIDE and appended after the prose (sourcesFooter, sources.go) using the +// citations' publisher-domain titles — controlled format, honest links — so the prompt keeps +// telling Grok "no URLs or links". // // The note is also AUTHORITATIVE about the data being current and provided: the system // prompt's "don't claim you have internet access if you don't" rule otherwise wins on a diff --git a/apps/ai-bot/mentions.go b/apps/ai-bot/mentions.go index b4e967d5..83dde1e7 100644 --- a/apps/ai-bot/mentions.go +++ b/apps/ai-bot/mentions.go @@ -46,6 +46,35 @@ func mentionsBot(mc *MessageContent, botMXID string, replyParentIsBot bool) bool return pillTargetsBot(mc.FormattedBody, botMXID) } +// stripBotMention removes the bot's own mention text from a trigger body before it is +// used as a web-search query, a prompt turn, a buffer entry, or telemetry. cinny writes +// the plain-text fallback of a mention pill as the bot's FULL mxid ("@ai:vojo.chat …"), +// and that literal mxid, sent verbatim to the grounding provider as the search query, made +// it treat "vojo.chat" as the SUBJECT entity — it searched "was the Vojo.chat messenger +// removed?", found nothing, and confabulated "no, it's available", the exact first-ask +// hallucination + same-question/different-answer the "Max" thread showed (the second ask +// happened to anchor on "макс" instead, hence two opposite grounded answers). Mention +// DETECTION already ran upstream via m.mentions (MSC3952), so dropping the body text never +// changes routing. 
We strip only the UNAMBIGUOUS mxid forms — the full mxid and a
+// standalone "@localpart"; the human display name is deliberately left intact so a real
+// question that names the product ("что умеет Vojo AI") is never mangled.
+func stripBotMention(body, botMXID string) string {
+	at := "@" + localpartOf(botMXID)
+	if !strings.Contains(strings.ToLower(body), strings.ToLower(at)) {
+		return body // no mention anywhere: keep the message as written (newlines, a leading "-" or "—")
+	}
+	var kept []string
+	for _, f := range strings.Fields(strings.ReplaceAll(body, botMXID, " ")) {
+		// Drop a standalone "@ai" pill fallback (with trailing address punctuation), but
+		// keep "@aibot" or any word that merely contains it.
+		if strings.EqualFold(strings.Trim(f, ",.:;!?–—-"), at) {
+			continue
+		}
+		kept = append(kept, f)
+	}
+	return strings.TrimLeft(strings.Join(kept, " "), " ,:–—-") // leftover leading address punctuation ("@ai, …")
+}
+
 // pillTargetsBot looks for an <a> mention pill addressing the bot in the
 // HTML body. Matrix pills use either matrix.to/#/ or a matrix: URI.
 func pillTargetsBot(formattedBody, botMXID string) bool {
diff --git a/apps/ai-bot/provider_gemini.go b/apps/ai-bot/provider_gemini.go
index 42eac2fa..be5c872e 100644
--- a/apps/ai-bot/provider_gemini.go
+++ b/apps/ai-bot/provider_gemini.go
@@ -78,7 +78,8 @@ func (c *geminiClient) Complete(ctx context.Context, req LLMRequest) (*LLMRespon
 
 type geminiGroundResult struct {
 	Digest    string
-	Citations []string
+	Citations []string    // redirect URIs — the verify-gate + citation_count
+	Sources   []WebSource // the same chunks with their publisher-domain titles (web.title)
 	Usage     Usage
 }
 
@@ -170,9 +171,14 @@ func (c *geminiClient) groundedSearch(ctx context.Context, query string) (gemini
 		sb.WriteString(p.Text)
 	}
 	var citations []string
+	var sources []WebSource
 	for _, ch := range out.Candidates[0].GroundingMetadata.GroundingChunks {
 		if ch.Web.URI != "" {
 			citations = append(citations, ch.Web.URI)
+			// web.uri is the grounding-api-redirect (NOT the publisher URL — and Gemini's
+			// terms forbid resolving it server-side); web.title is the
publisher domain + // ("rbc.ru"). Keep both: the user clicks the redirect to reach the real article. + sources = append(sources, WebSource{Title: ch.Web.Title, URL: ch.Web.URI}) } } // The verify-gate: no citations ⇒ not actually grounded ⇒ degrade. @@ -182,6 +188,7 @@ func (c *geminiClient) groundedSearch(ctx context.Context, query string) (gemini return geminiGroundResult{ Digest: strings.TrimSpace(sb.String()), Citations: citations, + Sources: sources, Usage: Usage{ PromptTokens: out.UsageMetadata.PromptTokenCount, CachedTokens: out.UsageMetadata.CachedContentTokenCount, diff --git a/apps/ai-bot/sources.go b/apps/ai-bot/sources.go new file mode 100644 index 00000000..91b89dd3 --- /dev/null +++ b/apps/ai-bot/sources.go @@ -0,0 +1,92 @@ +package main + +import ( + "net/url" + "strings" + "unicode" +) + +// sources.go renders the user-facing "Sources" attribution for a web answer. It is built +// SERVER-SIDE and appended AFTER the model's prose — never handed to the model. The model +// was deliberately told to write "no URLs or links" (webSynthMessages) because instructing +// it to cite made it paste the opaque grounding-api-redirect links uglily and mis-attribute +// them. Doing the attribution here keeps the format controlled and the links honest. +// +// Compliance notes (Gemini Grounding terms, verified against ai.google.dev/gemini-api/terms): +// - We NEVER resolve the grounding redirect server-side ("no programmatic/automated access +// to Grounded Results"). We emit the redirect as a link the END USER clicks — the +// intended direct-access flow — and it lands them on the real article. +// - We label with the publisher domain (web.title), which is stable and ToS-neutral. +// - The strict terms also ask for the Search-Suggestions chip (searchEntryPoint), which a +// sanitised Matrix bubble can't render; that gap is pre-existing (the bot already shows +// grounded prose without it) and out of scope here. + +// maxSourcesShown caps the appended attribution. 
A handful of domains is plenty and keeps
+// the message tidy — gemini grounding routinely returns a dozen near-duplicate chunks.
+const maxSourcesShown = 3
+
+// sourcesFooter renders a compact, deduped "Sources" line from a web route's sources, or ""
+// when there's nothing usable. Each entry is a markdown link whose LABEL is the publisher
+// domain and whose HREF is the source link (markdownToHTML promotes it to a clickable <a>;
+// the plain body keeps the readable "[domain](url)" fallback). Dedup is by domain so several
+// chunks from one outlet collapse to one link. The label language follows the answer
+// (Cyrillic → Russian), since the bot replies in the user's language.
+func sourcesFooter(answer string, sources []WebSource) string {
+	seen := make(map[string]bool, len(sources))
+	var links []string
+	for _, s := range sources {
+		dom := sourceDomain(s.Title)
+		u := strings.NewReplacer("(", "%28", ")", "%29").Replace(strings.TrimSpace(s.URL)) // a raw ")" would end the markdown link early
+		if dom == "" || u == "" {
+			continue
+		}
+		key := strings.ToLower(dom)
+		if seen[key] {
+			continue
+		}
+		seen[key] = true
+		links = append(links, "["+dom+"]("+u+")")
+		if len(links) >= maxSourcesShown {
+			break
+		}
+	}
+	if len(links) == 0 {
+		return ""
+	}
+	label := "Sources"
+	if hasCyrillic(answer) {
+		label = "Источники"
+	}
+	return "\n\n" + label + ": " + strings.Join(links, ", ")
+}
+
+// sourceDomain normalises a citation's display label to a bare publisher domain: it trims a
+// leading "www." and surrounding space. gemini grounding already returns the domain in
+// web.title; this just tidies it. Returns "" for an empty/garbage label.
+func sourceDomain(title string) string {
+	t := strings.TrimSpace(title)
+	t = strings.TrimPrefix(t, "www.")
+	return strings.TrimSpace(t)
+}
+
+// hostOf extracts the host (minus a leading "www.") from a real URL — used to label
+// grok_web_search citations, which carry the actual publisher URL rather than a domain.
+// Returns "" if the URL doesn't parse to a host.
+func hostOf(rawURL string) string {
+	u, err := url.Parse(strings.TrimSpace(rawURL))
+	if err != nil || u.Hostname() == "" {
+		return ""
+	}
+	return strings.TrimPrefix(u.Hostname(), "www.") // Hostname, not Host: a ":port" must not leak into the label
+}
+
+// hasCyrillic reports whether s contains any Cyrillic letter — a cheap proxy for "the bot
+// answered in Russian", used only to localise the Sources label.
+func hasCyrillic(s string) bool {
+	for _, r := range s {
+		if unicode.Is(unicode.Cyrillic, r) {
+			return true
+		}
+	}
+	return false
+}
diff --git a/apps/ai-bot/sources_test.go b/apps/ai-bot/sources_test.go
new file mode 100644
index 00000000..f883138f
--- /dev/null
+++ b/apps/ai-bot/sources_test.go
@@ -0,0 +1,52 @@
+package main
+
+import (
+	"strings"
+	"testing"
+)
+
+func TestSourcesFooter(t *testing.T) {
+	redirect := "https://vertexaisearch.cloud.google.com/grounding-api-redirect/abc"
+	src := []WebSource{
+		{Title: "rbc.ru", URL: redirect + "1"},
+		{Title: "www.tass.ru", URL: redirect + "2"},
+		{Title: "rbc.ru", URL: redirect + "3"}, // duplicate domain → collapsed
+		{Title: "lenta.ru", URL: redirect + "4"},
+		{Title: "vedomosti.ru", URL: redirect + "5"}, // beyond maxSourcesShown → dropped
+	}
+
+	// Russian answer → Russian label, deduped, capped, www stripped, clickable.
+	got := sourcesFooter("Да, удалили 3 июня.", src)
+	want := "\n\nИсточники: [rbc.ru](" + redirect + "1), [tass.ru](" + redirect + "2), [lenta.ru](" + redirect + "4)"
+	if got != want {
+		t.Fatalf("sourcesFooter ru =\n %q\nwant\n %q", got, want)
+	}
+
+	// English answer → English label.
+	if got := sourcesFooter("Yes, removed on June 3.", src[:1]); !strings.HasPrefix(got, "\n\nSources: [rbc.ru](") {
+		t.Fatalf("sourcesFooter en = %q", got)
+	}
+
+	// No usable sources → empty (no trailing label on a grok_direct/empty answer).
+	if got := sourcesFooter("привет", nil); got != "" {
+		t.Fatalf("empty sources should yield no footer, got %q", got)
+	}
+	// A source missing a title or URL is skipped.
+ if got := sourcesFooter("hi", []WebSource{{Title: "", URL: redirect}, {Title: "x.com", URL: ""}}); got != "" { + t.Fatalf("incomplete sources should yield no footer, got %q", got) + } +} + +func TestHostOf(t *testing.T) { + cases := map[string]string{ + "https://www.reuters.com/world/article-123": "reuters.com", + "https://rbc.ru/politics/03/06/2026": "rbc.ru", + "not a url": "", + "": "", + } + for in, want := range cases { + if got := hostOf(in); got != want { + t.Errorf("hostOf(%q) = %q, want %q", in, got, want) + } + } +} diff --git a/apps/ai-bot/web.go b/apps/ai-bot/web.go index 5922680d..21491f12 100644 --- a/apps/ai-bot/web.go +++ b/apps/ai-bot/web.go @@ -53,6 +53,16 @@ const ( // degrades (with a hedge) rather than paying past the cap. var errGroundingCapped = errors.New("web grounding daily cap reached") +// WebSource is one attributable source behind a web answer: a human label (the publisher +// domain) and a link the END USER can open. For gemini grounding the URL is the +// grounding-api-redirect (clicked by the user → the real article; never resolved +// server-side, which Gemini's terms forbid); for grok_web_search it is the real publisher +// URL. Surfaced to the user as a compact "Sources" footer (sources.go). +type WebSource struct { + Title string // publisher domain ("rbc.ru") — the citation's web.title / the URL host + URL string // the link to open (gemini: redirect; grok: real article URL) +} + // WebContext is the result of a web fetch: a factual digest to feed the final model, // the sources behind it, the fetch's own token usage, and the cost the fetch incurred // (kept separate from the final synthesis tokens so each books to its own ledger @@ -60,7 +70,8 @@ var errGroundingCapped = errors.New("web grounding daily cap reached") // still billed — the caller books it before degrading (§8.1 partial cascade). 
type WebContext struct { Digest string - Citations []string + Citations []string // raw source URLs (the verify-gate + citation_count telemetry) + Sources []WebSource // the same sources with display titles (the user-facing footer) Usage Usage Cost CostBreakdown } @@ -160,6 +171,7 @@ func (p *grokWebSearch) Fetch(ctx context.Context, query string) (WebContext, er var digest string var citations []string + var sources []WebSource for _, item := range out.Output { if item.Type != "message" { continue @@ -171,6 +183,8 @@ func (p *grokWebSearch) Fetch(ctx context.Context, query string) (WebContext, er for _, a := range c.Annotations { if a.Type == "url_citation" && a.URL != "" { citations = append(citations, a.URL) + // grok returns real publisher URLs, so the host IS the display domain. + sources = append(sources, WebSource{Title: hostOf(a.URL), URL: a.URL}) } } } @@ -187,6 +201,7 @@ func (p *grokWebSearch) Fetch(ctx context.Context, query string) (WebContext, er wc := WebContext{ Digest: digest, Citations: citations, + Sources: sources, Usage: usage, Cost: CostBreakdown{ WebTool: computeUSD(p.model, usage, p.cfg) + @@ -237,5 +252,5 @@ func (p *geminiGrounding) Fetch(ctx context.Context, query string) (WebContext, } return WebContext{Cost: cost, Usage: res.Usage}, err } - return WebContext{Digest: res.Digest, Citations: res.Citations, Usage: res.Usage, Cost: cost}, nil + return WebContext{Digest: res.Digest, Citations: res.Citations, Sources: res.Sources, Usage: res.Usage, Cost: cost}, nil } diff --git a/docs/ai/ai-bot.md b/docs/ai/ai-bot.md index 53012337..cd85dbb3 100644 --- a/docs/ai/ai-bot.md +++ b/docs/ai/ai-bot.md @@ -89,6 +89,36 @@ grounding is free under the daily RPD, guarded by `WEB_GROUNDING_DAILY_CAP`. `XA + `GROK_REASONING_EFFORT=none` (4.3 otherwise reasons on every reply). Full flag table in the [README](../../apps/ai-bot/README.md). 
+## Trigger hygiene (what reaches the search query) + +The raw event body is **cleaned once** at the top of `respond` ([bot.go](../../apps/ai-bot/bot.go), +`stripBotMention(stripReplyFallback(...))`) before it is used as the web-search query, the prompt +trigger, the buffer entry, or telemetry. Two egress hazards both rode the raw body: the bot's own +mention pill fallback (cinny writes the **full mxid** `@ai:vojo.chat` into the plain `body`), and +the rich-reply quoted parent. The mxid was the worse one — sent verbatim to gemini grounding it +made the provider treat **`vojo.chat`** as the subject entity ("was the *Vojo.chat* messenger +removed?") and confabulate a confident wrong answer; the same question without the mention (e.g. in +a DM, which has no mention) grounded correctly. Mention **detection** is unaffected — it runs +upstream on `m.mentions`/`replyParentIsBot` ([mentions.go](../../apps/ai-bot/mentions.go)), not on +body text. The human display name is deliberately **not** stripped, so "что умеет Vojo AI" survives. + +## Source attribution (the "Sources" footer) + +Web answers append a compact, deduped **`Источники: [rbc.ru](…), …`** line built **server-side** +after Grok's prose ([sources.go](../../apps/ai-bot/sources.go) `sourcesFooter`), never via the Grok +prompt (the synth note still says "no URLs or links" — instructing Grok to cite made it paste ugly +redirects and mis-attribute them). The label is the publisher **domain** (`web.title`); the link is +the citation's URL — for `gemini_grounding` that is the opaque `grounding-api-redirect` URL, which +the **end user clicks** to reach the real article. 
**Gemini Grounding terms** (verified against +`ai.google.dev/gemini-api/terms`) constrain this: the redirect must **not** be resolved +server-side (no "programmatic/automated access to Grounded Results"), and a strict reading also +requires showing the **Search-Suggestions chip** (`searchEntryPoint.renderedContent`, HTML/CSS) — +which a sanitised Matrix bubble can't render, so that part stays unmet (pre-existing gap; the bot +already shows grounded prose without it). The footer is appended to the **sent** message only, not +the buffered turn — the redirect links are ephemeral, so they must not pollute the history that +feeds later prompts. `grok_web_search` returns **real** publisher URLs (no Google display ToS), so +switching `WEB_PROVIDER` is the path to true article links — at ~17× the cost. + ## Observability (logs + per-request trace) `log/slog` to stderr (`LOG_LEVEL`, `LOG_FORMAT=text|json`). A context-aware handler