package main import ( "bytes" "strings" "github.com/microcosm-cc/bluemonday" "github.com/yuin/goldmark" "github.com/yuin/goldmark/ast" "github.com/yuin/goldmark/extension" "github.com/yuin/goldmark/renderer" ghtml "github.com/yuin/goldmark/renderer/html" "github.com/yuin/goldmark/util" ) // matrixHTMLFormat is the `format` value that flags `formatted_body` as // org.matrix.custom.html (the only rich format Matrix clients render). const matrixHTMLFormat = "org.matrix.custom.html" const ( // maxInputBytes / maxFormattedBytes bound the model reply and the rendered // HTML; beyond either we fall back to the plain body (no formatted_body). maxInputBytes = 512 * 1024 maxFormattedBytes = 64 * 1024 ) // mdParser converts the model's CommonMark + GFM (tables, strikethrough, // autolink, task lists) answer to HTML. WithUnsafe stays OFF (goldmark's default) // so raw HTML and dangerous URLs are escaped, never rendered; WithHardWraps keeps // the answer's line breaks as
; images are rendered as links, not (see // imageLinkRenderer). goldmark depends only on the standard library, so the static // (CGO-free) build is preserved. var mdParser = goldmark.New( goldmark.WithExtensions(extension.GFM), goldmark.WithRendererOptions( ghtml.WithHardWraps(), // Priority < the default renderer's 1000 → registered last → overrides // goldmark's rendering with imageLinkRenderer. renderer.WithNodeRenderers(util.Prioritized(imageLinkRenderer{}, 100)), ), ) // imageLinkRenderer overrides goldmark's image rendering to emit a link instead of // an , so a markdown image stays functional (a clickable link to its source) // without ever putting a remote in the event — which a client could // auto-load, leaking the viewer's IP to a URL a prompt-injected reply chose. type imageLinkRenderer struct{} func (imageLinkRenderer) RegisterFuncs(reg renderer.NodeRendererFuncRegisterer) { reg.Register(ast.KindImage, renderImageAsLink) } // renderImageAsLink renders ![alt](src) as alt: the alt content // (the node's children) becomes the link label. Mirrors goldmark's own URL escape // + dangerous-URL guard; bluemonday re-checks the scheme afterwards. func renderImageAsLink(w util.BufWriter, _ []byte, node ast.Node, entering bool) (ast.WalkStatus, error) { n := node.(*ast.Image) if entering { _, _ = w.WriteString(``) } else { _, _ = w.WriteString("") } return ast.WalkContinue, nil } // htmlPolicy strips goldmark's output to the tags/attributes Cinny's renderer // keeps (src/app/utils/sanitize.ts: permittedHtmlTags / urlSchemes) — defence in // depth over goldmark's own escaping, and the single allowlist a crafted reply // can't get around. Anything else (script/style/img/on*-handlers/unknown URL // schemes) is removed. var htmlPolicy = buildHTMLPolicy() func buildHTMLPolicy() *bluemonday.Policy { p := bluemonday.NewPolicy() p.AllowElements( "p", "br", "hr", "h1", "h2", "h3", "h4", "h5", "h6", "strong", "em", "del", "s", "code", "pre", "blockquote", "ul", "ol", "li", "table", "thead", "tbody", "tr", "th", "td", ) p.AllowAttrs("href").OnElements("a") p.AllowURLSchemes("https", "http", "ftp", "mailto", "magnet") p.RequireParseableURLs(true) p.AllowAttrs("class").OnElements("code", "pre") // language-xxx on code blocks p.AllowAttrs("start").OnElements("ol") return p } // markdownToHTML converts the model's markdown answer to sanitized // org.matrix.custom.html and reports whether any rich formatting was emitted. // When false the caller MUST omit formatted_body so a plain answer renders from // the bare `body` exactly as before (Matrix convention: only attach // formatted_body when it adds formatting the plain body can't carry). func markdownToHTML(md string) (string, bool) { if len(md) > maxInputBytes { return "", false // implausibly large; just send the plain body } var buf bytes.Buffer if err := mdParser.Convert([]byte(md), &buf); err != nil { return "", false } html := strings.TrimSpace(string(htmlPolicy.SanitizeBytes(buf.Bytes()))) if len(html) > maxFormattedBytes { return "", false // too large to be worth sending as a Matrix event } if !hasRichMarkup(html) { return "", false // just a paragraph of text — the plain body is enough } return html, true } // hasRichMarkup reports whether the HTML carries formatting beyond the paragraph // wrapper and soft line breaks goldmark emits for plain text, so a plain reply // keeps rendering from the bare body. Model text is HTML-escaped (a literal '<' // becomes "<"), so any remaining raw '<' is a tag the converter emitted. func hasRichMarkup(html string) bool { stripped := html for _, t := range []string{"

", "

", "
", "
", "
"} { stripped = strings.ReplaceAll(stripped, t, "") } return strings.Contains(stripped, "<") }