// vojo/apps/ai-bot/sources.go

package main

import (
	"net/url"
	"strings"
	"unicode"
)

// sources.go renders the user-facing "Sources" attribution for a web answer. It is built
// SERVER-SIDE and appended AFTER the model's prose — never handed to the model. The model
// was deliberately told to write "no URLs or links" (webSynthMessages) because instructing
// it to cite made it paste the opaque grounding-api-redirect links in an ugly form and
// mis-attribute them. Doing the attribution here keeps the format controlled and the links
// honest.
//
// Compliance notes (Gemini Grounding terms, verified against ai.google.dev/gemini-api/terms):
//   - We NEVER resolve the grounding redirect server-side ("no programmatic/automated access
//     to Grounded Results"). We emit the redirect as a link the END USER clicks — the
//     intended direct-access flow — and it lands them on the real article.
//   - We label with the publisher domain (web.title), which is stable and ToS-neutral.
//   - The strict terms also ask for the Search-Suggestions chip (searchEntryPoint), which a
//     sanitised Matrix bubble can't render; that gap is pre-existing (the bot already shows
//     grounded prose without it) and out of scope here.

// maxSourcesShown caps how many deduplicated source links sourcesFooter appends to an
// answer. A handful of domains is plenty and keeps the message tidy — Gemini grounding
// routinely returns a dozen near-duplicate chunks.
const maxSourcesShown = 3

// sourcesFooter renders a compact, deduped "Sources" line from a web route's sources, or ""
// when there's nothing usable. Each entry is a markdown link whose LABEL is the publisher
// domain and whose HREF is the source link (markdownToHTML promotes it to a clickable <a>;
// the plain body keeps the readable "[domain](url)" fallback).
//
//   - answer is only inspected to pick the label language: any Cyrillic letter selects the
//     Russian label, since the bot replies in the user's language.
//   - sources are walked in order; entries with an empty label or URL are skipped, and
//     dedup is by lower-cased domain so several chunks from one outlet collapse to one link.
//     At most maxSourcesShown links are emitted.
//
// The returned string starts with a blank-line separator ("\n\n") so it can be appended
// directly to the answer.
//
// Titles and URLs come from an external search backend, so they are neutralised before
// being spliced into markdown: see sourceLinkLabelEscaper and sourceLinkDestEscaper.
func sourcesFooter(answer string, sources []WebSource) string {
	seen := make(map[string]bool, len(sources))
	links := make([]string, 0, maxSourcesShown)
	for _, s := range sources {
		dom := sourceDomain(s.Title)
		u := strings.TrimSpace(s.URL)
		if dom == "" || u == "" {
			continue
		}
		// Dedup on the raw (unescaped) domain so escaping can never merge distinct outlets.
		key := strings.ToLower(dom)
		if seen[key] {
			continue
		}
		seen[key] = true
		links = append(links,
			"["+sourceLinkLabelEscaper.Replace(dom)+"]("+sourceLinkDestEscaper.Replace(u)+")")
		if len(links) >= maxSourcesShown {
			break
		}
	}
	if len(links) == 0 {
		return ""
	}
	label := "Sources"
	if hasCyrillic(answer) {
		label = "Источники"
	}
	return "\n\n" + label + ": " + strings.Join(links, ", ")
}

// sourceLinkLabelEscaper keeps a label from terminating or nesting the markdown link text:
// square brackets become parentheses and line breaks become spaces. Real publisher domains
// contain none of these, so ordinary labels pass through unchanged.
var sourceLinkLabelEscaper = strings.NewReplacer(
	"[", "(",
	"]", ")",
	"\r", " ",
	"\n", " ",
)

// sourceLinkDestEscaper percent-encodes the characters that would end or break a markdown
// link destination (whitespace, parentheses, angle brackets), e.g. the ")" in a
// ".../wiki/Mercury_(planet)" URL. The renderer is not visible from this file, so this
// targets the CommonMark link grammar — TODO confirm markdownToHTML leaves percent-escapes
// intact.
var sourceLinkDestEscaper = strings.NewReplacer(
	" ", "%20",
	"\t", "%09",
	"\r", "%0D",
	"\n", "%0A",
	"(", "%28",
	")", "%29",
	"<", "%3C",
	">", "%3E",
)

// sourceDomain normalises a citation's display label to a bare publisher domain: it trims
// surrounding space and a leading "www.". The prefix is matched case-insensitively so that
// "WWW.Example.com" and "example.com" end up with the same lower-cased dedup key in
// sourcesFooter. The rest of the label keeps its original case. Gemini grounding already
// returns the domain in web.title; this just tidies it.
//
// No further validation is done: the result is "" only when nothing is left after trimming.
func sourceDomain(title string) string {
	const prefix = "www."
	t := strings.TrimSpace(title)
	// Byte-slicing is safe here: the prefix is pure ASCII, so a slice that cut through a
	// multi-byte rune can never fold-equal it.
	if len(t) >= len(prefix) && strings.EqualFold(t[:len(prefix)], prefix) {
		t = t[len(prefix):]
	}
	return strings.TrimSpace(t)
}

// hostOf extracts the publisher host from a real URL — used to label grok_web_search
// citations, which carry the actual publisher URL rather than a domain.
//
// The result is the URL's hostname only: any port is dropped, the name is lower-cased
// (host names are case-insensitive), and a leading "www." is removed. Returns "" if the
// URL doesn't parse or has no hostname (e.g. a scheme-less "example.com/path").
func hostOf(rawURL string) string {
	u, err := url.Parse(strings.TrimSpace(rawURL))
	if err != nil {
		return ""
	}
	// Hostname() strips the ":port" suffix (and IPv6 brackets) that u.Host would keep, so
	// the label is just the publisher name.
	host := strings.ToLower(u.Hostname())
	return strings.TrimPrefix(host, "www.")
}

// hasCyrillic reports whether s contains at least one rune from the Cyrillic script. It is
// a cheap stand-in for "the bot answered in Russian" and is meant only for choosing the
// language of the Sources label. An empty string yields false.
func hasCyrillic(s string) bool {
	found := false
	for _, ch := range s {
		if unicode.In(ch, unicode.Cyrillic) {
			// One hit is enough; stop scanning.
			found = true
			break
		}
	}
	return found
}