vojo/apps/ai-bot/sources.go

92 lines
3.6 KiB
Go

package main
import (
"net/url"
"strings"
"unicode"
)
// sources.go renders the user-facing "Sources" attribution for a web answer. It is built
// SERVER-SIDE and appended AFTER the model's prose — never handed to the model. The model
// was deliberately told to write "no URLs or links" (webSynthMessages) because instructing
// it to cite made it paste the raw, opaque grounding-api-redirect links and mis-attribute
// them. Doing the attribution here keeps the format controlled and the links honest.
//
// Compliance notes (Gemini Grounding terms, verified against ai.google.dev/gemini-api/terms):
//   - We NEVER resolve the grounding redirect server-side ("no programmatic/automated access
//     to Grounded Results"). We emit the redirect as a link the END USER clicks — the
//     intended direct-access flow — and it lands them on the real article.
//   - We label with the publisher domain (web.title), which is stable and ToS-neutral.
//   - The strict terms also ask for the Search-Suggestions chip (searchEntryPoint), which a
//     sanitised Matrix bubble can't render; that gap is pre-existing (the bot already shows
//     grounded prose without it) and out of scope here.
// maxSourcesShown caps how many links the appended attribution lists. A handful of domains
// is plenty and keeps the message tidy — Gemini grounding routinely returns a dozen
// near-duplicate chunks.
const maxSourcesShown = 3
// sourcesFooter builds the compact "Sources" line appended after a web answer, or returns
// "" when no source is usable.
//
// Each usable source becomes a markdown link "[domain](url)": the label is the tidied
// publisher domain taken from the source's Title, the target is the source's URL with
// surrounding whitespace removed. Sources missing either part are skipped. Entries are
// deduplicated by lower-cased domain, so several chunks from one outlet collapse to a
// single link, and at most maxSourcesShown links are emitted, in input order.
//
// The heading is "Источники" when answer contains Cyrillic text and "Sources" otherwise,
// following the language the bot replied in. The result starts with a blank-line separator
// ("\n\n") so it can be concatenated directly onto the answer.
func sourcesFooter(answer string, sources []WebSource) string {
	emitted := make(map[string]struct{}, len(sources))
	var list strings.Builder
	count := 0

	for _, src := range sources {
		domain := sourceDomain(src.Title)
		href := strings.TrimSpace(src.URL)
		if domain == "" || href == "" {
			continue
		}

		// Dedup is case-insensitive, but the link keeps the label's original casing.
		dedupKey := strings.ToLower(domain)
		if _, dup := emitted[dedupKey]; dup {
			continue
		}
		emitted[dedupKey] = struct{}{}

		if count > 0 {
			list.WriteString(", ")
		}
		list.WriteString("[")
		list.WriteString(domain)
		list.WriteString("](")
		list.WriteString(href)
		list.WriteString(")")
		count++

		if count >= maxSourcesShown {
			break
		}
	}

	if count == 0 {
		return ""
	}

	heading := "Sources"
	if hasCyrillic(answer) {
		heading = "Источники"
	}
	return "\n\n" + heading + ": " + list.String()
}
// sourceDomain normalises a citation's display label to a bare publisher domain: it trims
// surrounding whitespace and one leading "www.". The prefix is matched case-insensitively
// (host names are case-insensitive, and sourcesFooter dedups on the lower-cased label, so
// "WWW.example.com" and "example.com" must collapse to the same entry). The rest of the
// label keeps its original casing.
//
// No validation is performed: only an empty, whitespace-only, or bare "www." label
// yields "".
func sourceDomain(title string) string {
	const prefix = "www."
	t := strings.TrimSpace(title)
	// Byte-slicing is safe: four bytes that case-fold to the ASCII "www." are themselves
	// ASCII, so the cut can never split a multi-byte rune.
	if len(t) >= len(prefix) && strings.EqualFold(t[:len(prefix)], prefix) {
		t = t[len(prefix):]
	}
	return strings.TrimSpace(t)
}
// hostOf extracts the publisher host name from a real URL — used to label grok_web_search
// citations, which carry the actual publisher URL rather than a domain.
//
// The result excludes any port (and the square brackets of an IPv6 literal), so
// "https://example.com:8443/x" labels as "example.com" rather than "example.com:8443",
// and one leading "www." is removed, matched case-insensitively. The remaining host keeps
// its original casing. Returns "" if the URL doesn't parse to a host (including
// scheme-less input such as "example.com/path").
func hostOf(rawURL string) string {
	u, err := url.Parse(strings.TrimSpace(rawURL))
	if err != nil || u.Host == "" {
		return ""
	}
	const prefix = "www."
	// Hostname, unlike Host, strips a trailing ":port".
	host := u.Hostname()
	// Byte-slicing is safe: four bytes that case-fold to the ASCII "www." are ASCII.
	if len(host) >= len(prefix) && strings.EqualFold(host[:len(prefix)], prefix) {
		host = host[len(prefix):]
	}
	return host
}
// hasCyrillic reports whether s contains at least one rune from the Cyrillic script. It is
// a cheap stand-in for "the bot answered in Russian" and is used only to pick the language
// of the Sources label. An empty string yields false.
func hasCyrillic(s string) bool {
	found := false
	for _, ch := range s {
		if unicode.In(ch, unicode.Cyrillic) {
			// One hit is enough; stop scanning.
			found = true
			break
		}
	}
	return found
}