fix(ai-bot): decode punycode IDN domains to Unicode in the Sources footer so .рф citations read instead of xn--

This commit is contained in:
heaven 2026-06-07 02:47:10 +03:00
parent 674616f398
commit 91899f56fb
3 changed files with 50 additions and 13 deletions

View file

@ -6,6 +6,7 @@ require (
github.com/jackc/pgx/v5 v5.9.2 github.com/jackc/pgx/v5 v5.9.2
github.com/microcosm-cc/bluemonday v1.0.27 github.com/microcosm-cc/bluemonday v1.0.27
github.com/yuin/goldmark v1.8.2 github.com/yuin/goldmark v1.8.2
golang.org/x/net v0.26.0
gopkg.in/yaml.v3 v3.0.1 gopkg.in/yaml.v3 v3.0.1
) )
@ -17,7 +18,6 @@ require (
github.com/jackc/puddle/v2 v2.2.2 // indirect github.com/jackc/puddle/v2 v2.2.2 // indirect
github.com/kr/text v0.2.0 // indirect github.com/kr/text v0.2.0 // indirect
github.com/rogpeppe/go-internal v1.15.0 // indirect github.com/rogpeppe/go-internal v1.15.0 // indirect
golang.org/x/net v0.26.0 // indirect
golang.org/x/sync v0.20.0 // indirect golang.org/x/sync v0.20.0 // indirect
golang.org/x/text v0.29.0 // indirect golang.org/x/text v0.29.0 // indirect
) )

View file

@ -4,6 +4,8 @@ import (
"net/url" "net/url"
"strings" "strings"
"unicode" "unicode"
"golang.org/x/net/idna"
) )
// sources.go renders the user-facing "Sources" attribution for a web answer. It is built // sources.go renders the user-facing "Sources" attribution for a web answer. It is built
@ -35,7 +37,7 @@ func sourcesFooter(answer string, sources []WebSource) string {
seen := make(map[string]bool, len(sources)) seen := make(map[string]bool, len(sources))
var links []string var links []string
for _, s := range sources { for _, s := range sources {
dom := sourceDomain(s.Title) dom := displayDomain(s.Title)
u := strings.TrimSpace(s.URL) u := strings.TrimSpace(s.URL)
if dom == "" || u == "" { if dom == "" || u == "" {
continue continue
@ -60,24 +62,39 @@ func sourcesFooter(answer string, sources []WebSource) string {
return "\n\n" + label + ": " + strings.Join(links, ", ") return "\n\n" + label + ": " + strings.Join(links, ", ")
} }
// sourceDomain normalises a citation's display label to a bare publisher domain: it trims a // displayDomain turns a host/domain into a readable label: it trims a leading "www." and
// leading "www." and surrounding space. gemini grounding already returns the domain in // surrounding space, then decodes a punycode IDN to its Unicode form. gemini grounding returns
// web.title; this just tidies it. Returns "" for an empty/garbage label. // the publisher domain in web.title, but for a non-ASCII host (e.g. a .рф site) that title is
func sourceDomain(title string) string { // punycode ("xn--…"), which renders as gibberish in the Sources footer. idna.ToUnicode is
t := strings.TrimSpace(title) // punycode-only: ASCII domains pass through unchanged and a label that fails to decode keeps
t = strings.TrimPrefix(t, "www.") // its raw form (never worse than before). idna.Display was tried and gives byte-identical
return strings.TrimSpace(t) // output here — it adds no homograph protection over the basic decode (that lives in TR39
// script-mixing rules, not UTS#46), and the label isn't the click target anyway (the href is
// the source URL), so the simpler profile is used. Shared by both citation label paths
// (sourcesFooter for gemini titles, hostOf for grok_web_search URLs). Returns "" for empty.
func displayDomain(s string) string {
s = strings.TrimSpace(s)
s = strings.TrimPrefix(s, "www.")
s = strings.TrimSpace(s)
if s == "" {
return ""
}
if u, err := idna.ToUnicode(s); err == nil && u != "" {
s = u
}
return s
} }
// hostOf extracts the host (minus a leading "www.") from a real URL — used to label // hostOf extracts the readable host from a real URL — used to label grok_web_search citations,
// grok_web_search citations, which carry the actual publisher URL rather than a domain. // which carry the actual publisher URL rather than a domain. Runs the host through
// Returns "" if the URL doesn't parse to a host. // displayDomain so a "www." prefix is dropped and an IDN host decodes to Unicode, matching the
// gemini-title path. Returns "" if the URL doesn't parse to a host.
func hostOf(rawURL string) string { func hostOf(rawURL string) string {
u, err := url.Parse(strings.TrimSpace(rawURL)) u, err := url.Parse(strings.TrimSpace(rawURL))
if err != nil || u.Host == "" { if err != nil || u.Host == "" {
return "" return ""
} }
return strings.TrimPrefix(u.Host, "www.") return displayDomain(u.Host)
} }
// hasCyrillic reports whether s contains any Cyrillic letter — a cheap proxy for "the bot // hasCyrillic reports whether s contains any Cyrillic letter — a cheap proxy for "the bot

View file

@ -37,10 +37,30 @@ func TestSourcesFooter(t *testing.T) {
} }
} }
// TestDisplayDomain checks the label tidying and punycode decoding done by displayDomain,
// one input/expected pair per row.
func TestDisplayDomain(t *testing.T) {
	table := []struct {
		input    string
		expected string
	}{
		// Hyphenated .рф domain taken from a real grounding result.
		{"xn----7sbbtpiaccnexupfs6l4c.xn--p1ai", "крымская-косметика.рф"},
		// A .рф IDN must come out as readable Cyrillic rather than "xn--…".
		{"xn--80aswg.xn--p1ai", "сайт.рф"},
		// The "www." prefix is removed before decoding.
		{"www.xn--80aswg.xn--p1ai", "сайт.рф"},
		// Plain ASCII domains are returned as they are.
		{"wikipedia.org", "wikipedia.org"},
		{"www.youtube.com", "youtube.com"},
		// Surrounding whitespace is trimmed.
		{" rbc.ru ", "rbc.ru"},
		// A homograph decodes as well; acceptable because the label is not the click target.
		{"xn--80ak6aa92e.com", "аррӏе.com"},
		// Expected to come back without the "xn--" prefix and the trailing hyphen.
		{"xn--invalid-punycode-", "invalid-punycode"},
		// Empty input yields an empty label.
		{"", ""},
	}
	for _, row := range table {
		got := displayDomain(row.input)
		if got == row.expected {
			continue
		}
		t.Errorf("displayDomain(%q) = %q, want %q", row.input, got, row.expected)
	}
}
func TestHostOf(t *testing.T) { func TestHostOf(t *testing.T) {
cases := map[string]string{ cases := map[string]string{
"https://www.reuters.com/world/article-123": "reuters.com", "https://www.reuters.com/world/article-123": "reuters.com",
"https://rbc.ru/politics/03/06/2026": "rbc.ru", "https://rbc.ru/politics/03/06/2026": "rbc.ru",
"https://xn--80aswg.xn--p1ai/p": "сайт.рф", // IDN host decoded to Unicode for display
"not a url": "", "not a url": "",
"": "", "": "",
} }