fix(ai-bot): decode punycode IDN domains to Unicode in the Sources footer so .рф citations are readable instead of showing raw xn-- labels
This commit is contained in:
parent
674616f398
commit
91899f56fb
3 changed files with 50 additions and 13 deletions
|
|
@ -6,6 +6,7 @@ require (
|
||||||
github.com/jackc/pgx/v5 v5.9.2
|
github.com/jackc/pgx/v5 v5.9.2
|
||||||
github.com/microcosm-cc/bluemonday v1.0.27
|
github.com/microcosm-cc/bluemonday v1.0.27
|
||||||
github.com/yuin/goldmark v1.8.2
|
github.com/yuin/goldmark v1.8.2
|
||||||
|
golang.org/x/net v0.26.0
|
||||||
gopkg.in/yaml.v3 v3.0.1
|
gopkg.in/yaml.v3 v3.0.1
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -17,7 +18,6 @@ require (
|
||||||
github.com/jackc/puddle/v2 v2.2.2 // indirect
|
github.com/jackc/puddle/v2 v2.2.2 // indirect
|
||||||
github.com/kr/text v0.2.0 // indirect
|
github.com/kr/text v0.2.0 // indirect
|
||||||
github.com/rogpeppe/go-internal v1.15.0 // indirect
|
github.com/rogpeppe/go-internal v1.15.0 // indirect
|
||||||
golang.org/x/net v0.26.0 // indirect
|
|
||||||
golang.org/x/sync v0.20.0 // indirect
|
golang.org/x/sync v0.20.0 // indirect
|
||||||
golang.org/x/text v0.29.0 // indirect
|
golang.org/x/text v0.29.0 // indirect
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,8 @@ import (
|
||||||
"net/url"
|
"net/url"
|
||||||
"strings"
|
"strings"
|
||||||
"unicode"
|
"unicode"
|
||||||
|
|
||||||
|
"golang.org/x/net/idna"
|
||||||
)
|
)
|
||||||
|
|
||||||
// sources.go renders the user-facing "Sources" attribution for a web answer. It is built
|
// sources.go renders the user-facing "Sources" attribution for a web answer. It is built
|
||||||
|
|
@ -35,7 +37,7 @@ func sourcesFooter(answer string, sources []WebSource) string {
|
||||||
seen := make(map[string]bool, len(sources))
|
seen := make(map[string]bool, len(sources))
|
||||||
var links []string
|
var links []string
|
||||||
for _, s := range sources {
|
for _, s := range sources {
|
||||||
dom := sourceDomain(s.Title)
|
dom := displayDomain(s.Title)
|
||||||
u := strings.TrimSpace(s.URL)
|
u := strings.TrimSpace(s.URL)
|
||||||
if dom == "" || u == "" {
|
if dom == "" || u == "" {
|
||||||
continue
|
continue
|
||||||
|
|
@ -60,24 +62,39 @@ func sourcesFooter(answer string, sources []WebSource) string {
|
||||||
return "\n\n" + label + ": " + strings.Join(links, ", ")
|
return "\n\n" + label + ": " + strings.Join(links, ", ")
|
||||||
}
|
}
|
||||||
|
|
||||||
// sourceDomain normalises a citation's display label to a bare publisher domain: it trims a
|
// displayDomain turns a host/domain into a readable label: it trims a leading "www." and
|
||||||
// leading "www." and surrounding space. gemini grounding already returns the domain in
|
// surrounding space, then decodes a punycode IDN to its Unicode form. gemini grounding returns
|
||||||
// web.title; this just tidies it. Returns "" for an empty/garbage label.
|
// the publisher domain in web.title, but for a non-ASCII host (e.g. a .рф site) that title is
|
||||||
func sourceDomain(title string) string {
|
// punycode ("xn--…"), which renders as gibberish in the Sources footer. idna.ToUnicode is
|
||||||
t := strings.TrimSpace(title)
|
// punycode-only: ASCII domains pass through unchanged and, if any label fails to decode, the whole domain keeps
|
||||||
t = strings.TrimPrefix(t, "www.")
|
// its raw form (never worse than before). idna.Display was tried and gives byte-identical
|
||||||
return strings.TrimSpace(t)
|
// output here — it adds no homograph protection over the basic decode (that lives in TR39
|
||||||
|
// script-mixing rules, not UTS#46), and the label isn't the click target anyway (the href is
|
||||||
|
// the source URL), so the simpler profile is used. Shared by both citation label paths
|
||||||
|
// (gemini web.title labels in sourcesFooter, hostOf for grok_web_search URLs). Returns "" for empty.
|
||||||
|
func displayDomain(s string) string {
|
||||||
|
s = strings.TrimSpace(s)
|
||||||
|
s = strings.TrimPrefix(s, "www.")
|
||||||
|
s = strings.TrimSpace(s)
|
||||||
|
if s == "" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
if u, err := idna.ToUnicode(s); err == nil && u != "" {
|
||||||
|
s = u
|
||||||
|
}
|
||||||
|
return s
|
||||||
}
|
}
|
||||||
|
|
||||||
// hostOf extracts the host (minus a leading "www.") from a real URL — used to label
|
// hostOf extracts the readable host from a real URL — used to label grok_web_search citations,
|
||||||
// grok_web_search citations, which carry the actual publisher URL rather than a domain.
|
// which carry the actual publisher URL rather than a domain. Runs the host through
|
||||||
// Returns "" if the URL doesn't parse to a host.
|
// displayDomain so a "www." prefix is dropped and an IDN host decodes to Unicode, matching the
|
||||||
|
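// gemini-title path. Returns "" if the URL doesn't parse to a host. NOTE(review): u.Host keeps any ":port", which likely makes the IDN decode fail and leaves the raw host — consider u.Hostname().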
|
||||||
func hostOf(rawURL string) string {
|
func hostOf(rawURL string) string {
|
||||||
u, err := url.Parse(strings.TrimSpace(rawURL))
|
u, err := url.Parse(strings.TrimSpace(rawURL))
|
||||||
if err != nil || u.Host == "" {
|
if err != nil || u.Host == "" {
|
||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
return strings.TrimPrefix(u.Host, "www.")
|
return displayDomain(u.Host)
|
||||||
}
|
}
|
||||||
|
|
||||||
// hasCyrillic reports whether s contains any Cyrillic letter — a cheap proxy for "the bot
|
// hasCyrillic reports whether s contains any Cyrillic letter — a cheap proxy for "the bot
|
||||||
|
|
|
||||||
|
|
@ -37,10 +37,30 @@ func TestSourcesFooter(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestDisplayDomain(t *testing.T) {
|
||||||
|
cases := map[string]string{
|
||||||
|
"xn----7sbbtpiaccnexupfs6l4c.xn--p1ai": "крымская-косметика.рф", // real hyphenated .рф from a grounding result
|
||||||
|
"xn--80aswg.xn--p1ai": "сайт.рф", // .рф IDN → readable Cyrillic, not "xn--…"
|
||||||
|
"www.xn--80aswg.xn--p1ai": "сайт.рф", // www stripped first, then decoded
|
||||||
|
"wikipedia.org": "wikipedia.org", // ASCII passes through unchanged
|
||||||
|
"www.youtube.com": "youtube.com",
|
||||||
|
" rbc.ru ": "rbc.ru",
|
||||||
|
"xn--80ak6aa92e.com": "аррӏе.com", // homograph decodes too — fine, the label is not the click target
|
||||||
|
"xn--invalid-punycode-": "invalid-punycode",
|
||||||
|
"": "",
|
||||||
|
}
|
||||||
|
for in, want := range cases {
|
||||||
|
if got := displayDomain(in); got != want {
|
||||||
|
t.Errorf("displayDomain(%q) = %q, want %q", in, got, want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestHostOf(t *testing.T) {
|
func TestHostOf(t *testing.T) {
|
||||||
cases := map[string]string{
|
cases := map[string]string{
|
||||||
"https://www.reuters.com/world/article-123": "reuters.com",
|
"https://www.reuters.com/world/article-123": "reuters.com",
|
||||||
"https://rbc.ru/politics/03/06/2026": "rbc.ru",
|
"https://rbc.ru/politics/03/06/2026": "rbc.ru",
|
||||||
|
"https://xn--80aswg.xn--p1ai/p": "сайт.рф", // IDN host decoded to Unicode for display
|
||||||
"not a url": "",
|
"not a url": "",
|
||||||
"": "",
|
"": "",
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue