vojo/apps/ai-bot/sources.go

109 lines
4.5 KiB
Go

package main
import (
"net/url"
"strings"
"unicode"
"golang.org/x/net/idna"
)
// sources.go renders the user-facing "Sources" attribution for a web answer. It is built
// SERVER-SIDE and appended AFTER the model's prose — never handed to the model. The model
// was deliberately told to write "no URLs or links" (webSynthMessages) because instructing
// it to cite made it paste the opaque, ugly grounding-api-redirect links into its prose and
// mis-attribute them. Doing the attribution here keeps the format controlled and the links
// honest.
//
// Compliance notes (Gemini Grounding terms, verified against ai.google.dev/gemini-api/terms):
// - We NEVER resolve the grounding redirect server-side ("no programmatic/automated access
// to Grounded Results"). We emit the redirect as a link the END USER clicks — the
// intended direct-access flow — and it lands them on the real article.
// - We label with the publisher domain (web.title), which is stable and ToS-neutral.
// - The strict terms also ask for the Search-Suggestions chip (searchEntryPoint), which a
// sanitised Matrix bubble can't render; that gap is pre-existing (the bot already shows
// grounded prose without it) and out of scope here.
// maxSourcesShown caps how many links the appended "Sources" attribution may contain. A
// handful of domains is plenty and keeps the message tidy — Gemini grounding routinely
// returns a dozen near-duplicate chunks.
const maxSourcesShown = 3
// sourcesFooter renders a compact, deduplicated "Sources" line from a web route's sources,
// or "" when there is nothing usable.
//
// Each entry is a markdown link whose LABEL is the publisher domain (displayDomain of the
// source title) and whose HREF is the source link; markdownToHTML promotes it to a
// clickable <a>, while the plain body keeps the readable "[domain](url)" fallback. Sources
// with an empty label or an empty URL are skipped. Dedup is by lower-cased domain, so
// several chunks from one outlet collapse to one link, and at most maxSourcesShown links
// are emitted, in input order.
//
// The href is passed through linkDestEscaper first: a raw ")" or space inside the URL would
// otherwise terminate the markdown link destination early and produce a broken link.
//
// The label language follows the answer (Cyrillic → Russian), since the bot replies in the
// user's language. The returned string starts with a blank-line separator so it can be
// appended directly after the model's prose.
//
// NOTE(review): the label itself is not escaped; this assumes titles are bare domains with
// no "[" or "]" — confirm against the callers that fill WebSource.Title.
func sourcesFooter(answer string, sources []WebSource) string {
	seen := make(map[string]bool, len(sources))
	var links []string
	for _, s := range sources {
		dom := displayDomain(s.Title)
		u := linkDestEscaper.Replace(strings.TrimSpace(s.URL))
		if dom == "" || u == "" {
			continue
		}
		// Domains are case-insensitive, so dedup on the lower-cased form while keeping the
		// original casing for display.
		key := strings.ToLower(dom)
		if seen[key] {
			continue
		}
		seen[key] = true
		links = append(links, "["+dom+"]("+u+")")
		if len(links) >= maxSourcesShown {
			break
		}
	}
	if len(links) == 0 {
		return ""
	}
	label := "Sources"
	if hasCyrillic(answer) {
		label = "Источники"
	}
	return "\n\n" + label + ": " + strings.Join(links, ", ")
}

// linkDestEscaper percent-encodes the characters that would end or split a markdown
// inline-link destination. Existing percent-escapes are left untouched, so an
// already-encoded URL passes through unchanged.
var linkDestEscaper = strings.NewReplacer(
	"(", "%28",
	")", "%29",
	" ", "%20",
)
// displayDomain converts a host/domain string into a human-readable label. It strips
// surrounding whitespace and a leading "www.", then decodes a punycode IDN into its Unicode
// form. It returns "" when nothing is left after trimming.
//
// Why decode: Gemini grounding puts the publisher domain in web.title, but for a non-ASCII
// host (e.g. a .рф site) that title arrives as punycode ("xn--…"), which reads as gibberish
// in the Sources footer. idna.ToUnicode is punycode-only: ASCII domains pass through
// unchanged, and when decoding fails the trimmed input is returned as-is (never worse than
// before).
//
// Why not idna.Display: it was tried and gives byte-identical output here. It adds no
// homograph protection over the basic decode (that lives in TR39 script-mixing rules, not
// UTS#46), and the label is not the click target anyway (the href is the source URL), so
// the simpler profile is used.
//
// Shared by both citation label paths (sourceDomain for Gemini titles, hostOf for
// grok_web_search URLs).
func displayDomain(s string) string {
	host := strings.TrimSpace(strings.TrimPrefix(strings.TrimSpace(s), "www."))
	if host == "" {
		return ""
	}
	decoded, err := idna.ToUnicode(host)
	if err != nil || decoded == "" {
		// Decoding failed or produced nothing: fall back to the trimmed raw form.
		return host
	}
	return decoded
}
// hostOf extracts the readable host from a real URL — used to label grok_web_search
// citations, which carry the actual publisher URL rather than a domain. The host is run
// through displayDomain so a "www." prefix is dropped and an IDN host decodes to Unicode,
// matching the Gemini-title path.
//
// Any ":port" suffix and IPv6 brackets are removed before labelling. It returns "" if the
// URL does not parse or has no host name (including a port-only authority such as
// "http://:8080").
func hostOf(rawURL string) string {
	u, err := url.Parse(strings.TrimSpace(rawURL))
	if err != nil {
		return ""
	}
	// Hostname, not Host: Host keeps ":port", which would leak into the label and may
	// prevent displayDomain from decoding an IDN host.
	host := u.Hostname()
	if host == "" {
		return ""
	}
	return displayDomain(host)
}
// hasCyrillic reports whether s contains at least one rune from the Cyrillic script. It is
// a cheap proxy for "the bot answered in Russian", used only to localise the Sources label.
// Scanning stops at the first Cyrillic rune; an empty string yields false.
func hasCyrillic(s string) bool {
	found := false
	for _, ch := range s {
		if found = unicode.In(ch, unicode.Cyrillic); found {
			break
		}
	}
	return found
}