From 91899f56fbd5ae6750f9e5b4fc63b540b3c8da9a Mon Sep 17 00:00:00 2001 From: heaven Date: Sun, 7 Jun 2026 02:47:10 +0300 Subject: [PATCH] =?UTF-8?q?fix(ai-bot):=20decode=20punycode=20IDN=20domain?= =?UTF-8?q?s=20to=20Unicode=20in=20the=20Sources=20footer=20so=20.=D1=80?= =?UTF-8?q?=D1=84=20citations=20read=20instead=20of=20xn--?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- apps/ai-bot/go.mod | 2 +- apps/ai-bot/sources.go | 41 ++++++++++++++++++++++++++----------- apps/ai-bot/sources_test.go | 20 ++++++++++++++++++ 3 files changed, 50 insertions(+), 13 deletions(-) diff --git a/apps/ai-bot/go.mod b/apps/ai-bot/go.mod index 57b90b5c..79c55302 100644 --- a/apps/ai-bot/go.mod +++ b/apps/ai-bot/go.mod @@ -6,6 +6,7 @@ require ( github.com/jackc/pgx/v5 v5.9.2 github.com/microcosm-cc/bluemonday v1.0.27 github.com/yuin/goldmark v1.8.2 + golang.org/x/net v0.26.0 gopkg.in/yaml.v3 v3.0.1 ) @@ -17,7 +18,6 @@ require ( github.com/jackc/puddle/v2 v2.2.2 // indirect github.com/kr/text v0.2.0 // indirect github.com/rogpeppe/go-internal v1.15.0 // indirect - golang.org/x/net v0.26.0 // indirect golang.org/x/sync v0.20.0 // indirect golang.org/x/text v0.29.0 // indirect ) diff --git a/apps/ai-bot/sources.go b/apps/ai-bot/sources.go index 91b89dd3..305c1cc5 100644 --- a/apps/ai-bot/sources.go +++ b/apps/ai-bot/sources.go @@ -4,6 +4,8 @@ import ( "net/url" "strings" "unicode" + + "golang.org/x/net/idna" ) // sources.go renders the user-facing "Sources" attribution for a web answer. 
It is built @@ -35,7 +37,7 @@ func sourcesFooter(answer string, sources []WebSource) string { seen := make(map[string]bool, len(sources)) var links []string for _, s := range sources { - dom := sourceDomain(s.Title) + dom := displayDomain(s.Title) u := strings.TrimSpace(s.URL) if dom == "" || u == "" { continue @@ -60,24 +62,39 @@ func sourcesFooter(answer string, sources []WebSource) string { return "\n\n" + label + ": " + strings.Join(links, ", ") } -// sourceDomain normalises a citation's display label to a bare publisher domain: it trims a -// leading "www." and surrounding space. gemini grounding already returns the domain in -// web.title; this just tidies it. Returns "" for an empty/garbage label. -func sourceDomain(title string) string { - t := strings.TrimSpace(title) - t = strings.TrimPrefix(t, "www.") - return strings.TrimSpace(t) +// displayDomain turns a host/domain into a readable label: it trims a leading "www." and +// surrounding space, then decodes a punycode IDN to its Unicode form. gemini grounding returns +// the publisher domain in web.title, but for a non-ASCII host (e.g. a .рф site) that title is +// punycode ("xn--…"), which renders as gibberish in the Sources footer. idna.ToUnicode is +// punycode-only: ASCII domains pass through unchanged and a label that fails to decode keeps +// its raw form (never worse than before). idna.Display was tried and gives byte-identical +// output here — it adds no homograph protection over the basic decode (that lives in TR39 +// script-mixing rules, not UTS#46), and the label isn't the click target anyway (the href is +// the source URL), so the simpler profile is used. Shared by both citation label paths +// (sourcesFooter for gemini titles, hostOf for grok_web_search URLs). Returns "" for empty. 
+func displayDomain(s string) string { + s = strings.TrimSpace(s) + s = strings.TrimPrefix(s, "www.") + s = strings.TrimSpace(s) + if s == "" { + return "" + } + if u, err := idna.ToUnicode(s); err == nil && u != "" { + s = u + } + return s } -// hostOf extracts the host (minus a leading "www.") from a real URL — used to label -// grok_web_search citations, which carry the actual publisher URL rather than a domain. -// Returns "" if the URL doesn't parse to a host. +// hostOf extracts the readable host from a real URL — used to label grok_web_search citations, +// which carry the actual publisher URL rather than a domain. Runs the host through +// displayDomain so a "www." prefix is dropped and an IDN host decodes to Unicode, matching the +// gemini-title path. Returns "" if the URL doesn't parse to a host. func hostOf(rawURL string) string { u, err := url.Parse(strings.TrimSpace(rawURL)) if err != nil || u.Host == "" { return "" } - return strings.TrimPrefix(u.Host, "www.") + return displayDomain(u.Host) } // hasCyrillic reports whether s contains any Cyrillic letter — a cheap proxy for "the bot diff --git a/apps/ai-bot/sources_test.go b/apps/ai-bot/sources_test.go index f883138f..89f4c30d 100644 --- a/apps/ai-bot/sources_test.go +++ b/apps/ai-bot/sources_test.go @@ -37,10 +37,30 @@ func TestSourcesFooter(t *testing.T) { } } +func TestDisplayDomain(t *testing.T) { + cases := map[string]string{ + "xn----7sbbtpiaccnexupfs6l4c.xn--p1ai": "крымская-косметика.рф", // real hyphenated .рф from a grounding result + "xn--80aswg.xn--p1ai": "сайт.рф", // .рф IDN → readable Cyrillic, not "xn--…" + "www.xn--80aswg.xn--p1ai": "сайт.рф", // www stripped first, then decoded + "wikipedia.org": "wikipedia.org", // ASCII passes through unchanged + "www.youtube.com": "youtube.com", + " rbc.ru ": "rbc.ru", + "xn--80ak6aa92e.com": "аррӏе.com", // homograph decodes too — fine, the label is not the click target + "xn--invalid-punycode-": "invalid-punycode", + "": "", + } + for in, want 
:= range cases { + if got := displayDomain(in); got != want { + t.Errorf("displayDomain(%q) = %q, want %q", in, got, want) + } + } +} + func TestHostOf(t *testing.T) { cases := map[string]string{ "https://www.reuters.com/world/article-123": "reuters.com", "https://rbc.ru/politics/03/06/2026": "rbc.ru", + "https://xn--80aswg.xn--p1ai/p": "сайт.рф", // IDN host decoded to Unicode for display "not a url": "", "": "", }