128 lines
4.9 KiB
Go
128 lines
4.9 KiB
Go
package main
|
|
|
|
import (
|
|
"bytes"
|
|
"strings"
|
|
|
|
"github.com/microcosm-cc/bluemonday"
|
|
"github.com/yuin/goldmark"
|
|
"github.com/yuin/goldmark/ast"
|
|
"github.com/yuin/goldmark/extension"
|
|
"github.com/yuin/goldmark/renderer"
|
|
ghtml "github.com/yuin/goldmark/renderer/html"
|
|
"github.com/yuin/goldmark/util"
|
|
)
|
|
|
|
// matrixHTMLFormat is the value of a message's `format` field that marks its
// `formatted_body` as org.matrix.custom.html (the only rich format Matrix
// clients render).
const matrixHTMLFormat = "org.matrix.custom.html"
|
|
|
|
// Size limits applied by markdownToHTML. Past either one the reply is sent as
// the plain body only, with no formatted_body.
const (
	maxInputBytes     = 512 << 10 // 512 KiB cap on the model reply handed to the converter
	maxFormattedBytes = 64 << 10  // 64 KiB cap on the rendered, sanitized HTML
)
|
|
|
|
// mdParser converts the model's CommonMark + GFM (tables, strikethrough,
// autolink, task lists) answer to HTML.
//
//   - WithUnsafe is deliberately not set (goldmark's default), so raw HTML and
//     dangerous URLs are escaped, never rendered.
//   - WithHardWraps keeps the answer's line breaks as <br>.
//   - Images are rendered as links, not <img> (see imageLinkRenderer).
//
// goldmark depends only on the standard library, so the static (CGO-free)
// build is preserved.
var mdParser = goldmark.New(
	goldmark.WithExtensions(extension.GFM),
	goldmark.WithRendererOptions(
		ghtml.WithHardWraps(),
		// Priority < the default renderer's 1000 → registered last → overrides
		// goldmark's <img> rendering with imageLinkRenderer.
		renderer.WithNodeRenderers(util.Prioritized(imageLinkRenderer{}, 100)),
	),
)
|
|
|
|
// imageLinkRenderer overrides goldmark's image rendering to emit a link instead
// of an <img>, so a markdown image stays functional (a clickable link to its
// source) without ever putting a remote <img> in the event — which a client
// could auto-load, leaking the viewer's IP to a URL a prompt-injected reply
// chose.
type imageLinkRenderer struct{}

// RegisterFuncs registers renderImageAsLink as the render function for
// ast.KindImage nodes; no other node kind is touched.
func (imageLinkRenderer) RegisterFuncs(reg renderer.NodeRendererFuncRegisterer) {
	reg.Register(ast.KindImage, renderImageAsLink)
}
|
|
|
|
// renderImageAsLink renders  as <a href="src">alt</a>: the alt content
|
|
// (the node's children) becomes the link label. Mirrors goldmark's own URL escape
|
|
// + dangerous-URL guard; bluemonday re-checks the scheme afterwards.
|
|
func renderImageAsLink(w util.BufWriter, _ []byte, node ast.Node, entering bool) (ast.WalkStatus, error) {
|
|
n := node.(*ast.Image)
|
|
if entering {
|
|
_, _ = w.WriteString(`<a href="`)
|
|
dest := util.URLEscape(n.Destination, true)
|
|
if !ghtml.IsDangerousURL(dest) {
|
|
_, _ = w.Write(util.EscapeHTML(dest))
|
|
}
|
|
_, _ = w.WriteString(`">`)
|
|
} else {
|
|
_, _ = w.WriteString("</a>")
|
|
}
|
|
return ast.WalkContinue, nil
|
|
}
|
|
|
|
// htmlPolicy strips goldmark's output down to the tags/attributes Cinny's
// renderer keeps (src/app/utils/sanitize.ts: permittedHtmlTags / urlSchemes) —
// defence in depth over goldmark's own escaping, and the single allowlist a
// crafted reply can't get around. Anything else (script/style/img/on*-handlers/
// unknown URL schemes) is removed. The policy is built once, at package
// initialisation, by buildHTMLPolicy.
var htmlPolicy = buildHTMLPolicy()
|
|
|
|
func buildHTMLPolicy() *bluemonday.Policy {
|
|
p := bluemonday.NewPolicy()
|
|
p.AllowElements(
|
|
"p", "br", "hr",
|
|
"h1", "h2", "h3", "h4", "h5", "h6",
|
|
"strong", "em", "del", "s", "code", "pre",
|
|
"blockquote", "ul", "ol", "li",
|
|
"table", "thead", "tbody", "tr", "th", "td",
|
|
)
|
|
p.AllowAttrs("href").OnElements("a")
|
|
p.AllowURLSchemes("https", "http", "ftp", "mailto", "magnet")
|
|
p.RequireParseableURLs(true)
|
|
p.AllowAttrs("class").OnElements("code", "pre") // language-xxx on code blocks
|
|
p.AllowAttrs("start").OnElements("ol")
|
|
return p
|
|
}
|
|
|
|
// markdownToHTML converts the model's markdown answer to sanitized
|
|
// org.matrix.custom.html and reports whether any rich formatting was emitted.
|
|
// When false the caller MUST omit formatted_body so a plain answer renders from
|
|
// the bare `body` exactly as before (Matrix convention: only attach
|
|
// formatted_body when it adds formatting the plain body can't carry).
|
|
func markdownToHTML(md string) (string, bool) {
|
|
if len(md) > maxInputBytes {
|
|
return "", false // implausibly large; just send the plain body
|
|
}
|
|
var buf bytes.Buffer
|
|
if err := mdParser.Convert([]byte(md), &buf); err != nil {
|
|
return "", false
|
|
}
|
|
html := strings.TrimSpace(string(htmlPolicy.SanitizeBytes(buf.Bytes())))
|
|
if len(html) > maxFormattedBytes {
|
|
return "", false // too large to be worth sending as a Matrix event
|
|
}
|
|
if !hasRichMarkup(html) {
|
|
return "", false // just a paragraph of text — the plain body is enough
|
|
}
|
|
return html, true
|
|
}
|
|
|
|
// hasRichMarkup reports whether html contains any tag other than the paragraph
// wrappers and line breaks goldmark emits for plain text, so that a plain reply
// keeps rendering from the bare body. Model text is HTML-escaped (a literal '<'
// becomes "&lt;"), so any raw '<' left after removing those tags belongs to a
// tag the converter emitted.
func hasRichMarkup(html string) bool {
	plainTags := [...]string{"<p>", "</p>", "<br>", "<br/>", "<br />"}

	rest := html
	for i := 0; i < len(plainTags); i++ {
		rest = strings.ReplaceAll(rest, plainTags[i], "")
	}
	return strings.IndexByte(rest, '<') >= 0
}
|