fix(ai-bot): decode punycode IDN domains to Unicode in the Sources footer so .рф citations are readable instead of showing as xn--…
This commit is contained in:
parent
674616f398
commit
91899f56fb
3 changed files with 50 additions and 13 deletions
|
|
@ -6,6 +6,7 @@ require (
|
|||
github.com/jackc/pgx/v5 v5.9.2
|
||||
github.com/microcosm-cc/bluemonday v1.0.27
|
||||
github.com/yuin/goldmark v1.8.2
|
||||
golang.org/x/net v0.26.0
|
||||
gopkg.in/yaml.v3 v3.0.1
|
||||
)
|
||||
|
||||
|
|
@ -17,7 +18,6 @@ require (
|
|||
github.com/jackc/puddle/v2 v2.2.2 // indirect
|
||||
github.com/kr/text v0.2.0 // indirect
|
||||
github.com/rogpeppe/go-internal v1.15.0 // indirect
|
||||
golang.org/x/net v0.26.0 // indirect
|
||||
golang.org/x/sync v0.20.0 // indirect
|
||||
golang.org/x/text v0.29.0 // indirect
|
||||
)
|
||||
|
|
|
|||
|
|
@ -4,6 +4,8 @@ import (
|
|||
"net/url"
|
||||
"strings"
|
||||
"unicode"
|
||||
|
||||
"golang.org/x/net/idna"
|
||||
)
|
||||
|
||||
// sources.go renders the user-facing "Sources" attribution for a web answer. It is built
|
||||
|
|
@ -35,7 +37,7 @@ func sourcesFooter(answer string, sources []WebSource) string {
|
|||
seen := make(map[string]bool, len(sources))
|
||||
var links []string
|
||||
for _, s := range sources {
|
||||
dom := sourceDomain(s.Title)
|
||||
dom := displayDomain(s.Title)
|
||||
u := strings.TrimSpace(s.URL)
|
||||
if dom == "" || u == "" {
|
||||
continue
|
||||
|
|
@ -60,24 +62,39 @@ func sourcesFooter(answer string, sources []WebSource) string {
|
|||
return "\n\n" + label + ": " + strings.Join(links, ", ")
|
||||
}
|
||||
|
||||
// sourceDomain normalises a citation's display label to a bare publisher domain: it trims a
|
||||
// leading "www." and surrounding space. gemini grounding already returns the domain in
|
||||
// web.title; this just tidies it. Returns "" for an empty/garbage label.
|
||||
func sourceDomain(title string) string {
|
||||
t := strings.TrimSpace(title)
|
||||
t = strings.TrimPrefix(t, "www.")
|
||||
return strings.TrimSpace(t)
|
||||
// displayDomain turns a host/domain into a readable label: it trims a leading "www." and
|
||||
// surrounding space, then decodes a punycode IDN to its Unicode form. gemini grounding returns
|
||||
// the publisher domain in web.title, but for a non-ASCII host (e.g. a .рф site) that title is
|
||||
// punycode ("xn--…"), which renders as gibberish in the Sources footer. idna.ToUnicode is
|
||||
// punycode-only: ASCII domains pass through unchanged and a domain that fails to decode keeps
|
||||
// its raw form (never worse than before). idna.Display was tried and gives byte-identical
|
||||
// output here — it adds no homograph protection over the basic decode (that lives in TR39
|
||||
// script-mixing rules, not UTS#46), and the label isn't the click target anyway (the href is
|
||||
// the source URL), so the simpler profile is used. Shared by both citation label paths
|
||||
// (sourcesFooter for gemini titles, hostOf for grok_web_search URLs). Returns "" for empty.
|
||||
func displayDomain(s string) string {
|
||||
s = strings.TrimSpace(s)
|
||||
s = strings.TrimPrefix(s, "www.")
|
||||
s = strings.TrimSpace(s)
|
||||
if s == "" {
|
||||
return ""
|
||||
}
|
||||
if u, err := idna.ToUnicode(s); err == nil && u != "" {
|
||||
s = u
|
||||
}
|
||||
return s
|
||||
}
|
||||
|
||||
// hostOf extracts the host (minus a leading "www.") from a real URL — used to label
|
||||
// grok_web_search citations, which carry the actual publisher URL rather than a domain.
|
||||
// Returns "" if the URL doesn't parse to a host.
|
||||
// hostOf extracts the readable host from a real URL — used to label grok_web_search citations,
|
||||
// which carry the actual publisher URL rather than a domain. Runs the host through
|
||||
// displayDomain so a "www." prefix is dropped and an IDN host decodes to Unicode, matching the
|
||||
// gemini-title path. Returns "" if the URL doesn't parse to a host.
|
||||
func hostOf(rawURL string) string {
|
||||
u, err := url.Parse(strings.TrimSpace(rawURL))
|
||||
if err != nil || u.Host == "" {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimPrefix(u.Host, "www.")
|
||||
return displayDomain(u.Host)
|
||||
}
|
||||
|
||||
// hasCyrillic reports whether s contains any Cyrillic letter — a cheap proxy for "the bot
|
||||
|
|
|
|||
|
|
@ -37,10 +37,30 @@ func TestSourcesFooter(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestDisplayDomain(t *testing.T) {
|
||||
cases := map[string]string{
|
||||
"xn----7sbbtpiaccnexupfs6l4c.xn--p1ai": "крымская-косметика.рф", // real hyphenated .рф from a grounding result
|
||||
"xn--80aswg.xn--p1ai": "сайт.рф", // .рф IDN → readable Cyrillic, not "xn--…"
|
||||
"www.xn--80aswg.xn--p1ai": "сайт.рф", // www stripped first, then decoded
|
||||
"wikipedia.org": "wikipedia.org", // ASCII passes through unchanged
|
||||
"www.youtube.com": "youtube.com",
|
||||
" rbc.ru ": "rbc.ru",
|
||||
"xn--80ak6aa92e.com": "аррӏе.com", // homograph decodes too — fine, the label is not the click target
|
||||
"xn--invalid-punycode-": "invalid-punycode",
|
||||
"": "",
|
||||
}
|
||||
for in, want := range cases {
|
||||
if got := displayDomain(in); got != want {
|
||||
t.Errorf("displayDomain(%q) = %q, want %q", in, got, want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestHostOf(t *testing.T) {
|
||||
cases := map[string]string{
|
||||
"https://www.reuters.com/world/article-123": "reuters.com",
|
||||
"https://rbc.ru/politics/03/06/2026": "rbc.ru",
|
||||
"https://xn--80aswg.xn--p1ai/p": "сайт.рф", // IDN host decoded to Unicode for display
|
||||
"not a url": "",
|
||||
"": "",
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue