92 lines
3.6 KiB
Go
92 lines
3.6 KiB
Go
package main
|
|
|
|
import (
|
|
"net/url"
|
|
"strings"
|
|
"unicode"
|
|
)
|
|
|
|
// sources.go renders the user-facing "Sources" attribution for a web answer. It is built
|
|
// SERVER-SIDE and appended AFTER the model's prose — never handed to the model. The model
|
|
// was deliberately told to write "no URLs or links" (webSynthMessages) because instructing
// it to cite made it paste the opaque grounding-api-redirect links verbatim, which looked
// ugly, and mis-attribute them. Doing the attribution here keeps the format controlled and
// the links honest.
|
|
//
|
|
// Compliance notes (Gemini Grounding terms, verified against ai.google.dev/gemini-api/terms):
//   - We NEVER resolve the grounding redirect server-side ("no programmatic/automated access
//     to Grounded Results"). We emit the redirect as a link the END USER clicks — the
//     intended direct-access flow — and it lands them on the real article.
//   - We label with the publisher domain (web.title), which is stable and ToS-neutral.
//   - The strict terms also ask for the Search-Suggestions chip (searchEntryPoint), which a
//     sanitised Matrix bubble can't render; that gap is pre-existing (the bot already shows
//     grounded prose without it) and out of scope here.
|
|
|
|
// maxSourcesShown caps how many links sourcesFooter puts in the appended attribution. A
// handful of domains is plenty and keeps the message tidy — Gemini grounding routinely
// returns a dozen near-duplicate chunks.
const maxSourcesShown = 3
|
|
|
|
// sourcesFooter renders a compact, deduped "Sources" line from a web route's sources, or ""
|
|
// when there's nothing usable. Each entry is a markdown link whose LABEL is the publisher
|
|
// domain and whose HREF is the source link (markdownToHTML promotes it to a clickable <a>;
|
|
// the plain body keeps the readable "[domain](url)" fallback). Dedup is by domain so several
|
|
// chunks from one outlet collapse to one link. The label language follows the answer
|
|
// (Cyrillic → Russian), since the bot replies in the user's language.
|
|
func sourcesFooter(answer string, sources []WebSource) string {
|
|
seen := make(map[string]bool, len(sources))
|
|
var links []string
|
|
for _, s := range sources {
|
|
dom := sourceDomain(s.Title)
|
|
u := strings.TrimSpace(s.URL)
|
|
if dom == "" || u == "" {
|
|
continue
|
|
}
|
|
key := strings.ToLower(dom)
|
|
if seen[key] {
|
|
continue
|
|
}
|
|
seen[key] = true
|
|
links = append(links, "["+dom+"]("+u+")")
|
|
if len(links) >= maxSourcesShown {
|
|
break
|
|
}
|
|
}
|
|
if len(links) == 0 {
|
|
return ""
|
|
}
|
|
label := "Sources"
|
|
if hasCyrillic(answer) {
|
|
label = "Источники"
|
|
}
|
|
return "\n\n" + label + ": " + strings.Join(links, ", ")
|
|
}
|
|
|
|
// sourceDomain tidies a citation's display label into a bare publisher domain. Gemini
// grounding already puts the domain in web.title, so this only removes surrounding
// whitespace and a single leading "www." (matched case-sensitively).
//
// It returns "" when nothing is left after tidying, e.g. for an empty or all-whitespace
// title or a title that is just "www.".
func sourceDomain(title string) string {
	const prefix = "www."

	label := strings.TrimSpace(title)
	if strings.HasPrefix(label, prefix) {
		// Trim again: dropping the prefix can expose whitespace that followed it.
		label = strings.TrimSpace(label[len(prefix):])
	}
	return label
}
|
|
|
|
// hostOf returns the publisher host of a real URL for use as a citation label — needed for
// grok_web_search citations, which carry the actual publisher URL rather than a domain.
//
// The host is normalised so that equivalent URLs produce the same label: any ":port" suffix
// is dropped, the name is lower-cased (host names are case-insensitive), and a leading
// "www." is removed.
//
// It returns "" if rawURL does not parse or has no host — for example a scheme-less
// "example.com/path", which url.Parse treats as a bare path.
func hostOf(rawURL string) string {
	u, err := url.Parse(strings.TrimSpace(rawURL))
	if err != nil {
		return ""
	}
	// Hostname, unlike the Host field, strips the port (and IPv6 brackets), neither of
	// which belongs in a display label. It is "" when the URL has no host.
	host := strings.ToLower(u.Hostname())
	return strings.TrimPrefix(host, "www.")
}
|
|
|
|
// hasCyrillic reports whether s contains at least one rune from the Unicode Cyrillic
// script. It is a cheap stand-in for "the bot answered in Russian" and is used only to
// localise the Sources label. An empty string yields false.
func hasCyrillic(s string) bool {
	found := false
	for _, r := range s {
		if unicode.In(r, unicode.Cyrillic) {
			// One match is enough; no need to scan the rest.
			found = true
			break
		}
	}
	return found
}
|