vojo/apps/ai-bot/markdown.go

128 lines
4.9 KiB
Go

package main
import (
"bytes"
"strings"
"github.com/microcosm-cc/bluemonday"
"github.com/yuin/goldmark"
"github.com/yuin/goldmark/ast"
"github.com/yuin/goldmark/extension"
"github.com/yuin/goldmark/renderer"
ghtml "github.com/yuin/goldmark/renderer/html"
"github.com/yuin/goldmark/util"
)
const (
	// matrixHTMLFormat is the `format` value marking `formatted_body` as
	// org.matrix.custom.html, the only rich format Matrix clients render.
	matrixHTMLFormat = "org.matrix.custom.html"

	// maxInputBytes caps the size of the model reply that will be converted;
	// a larger reply is sent as the plain body only (no formatted_body).
	maxInputBytes = 512 << 10 // 512 KiB

	// maxFormattedBytes caps the size of the rendered HTML; larger output is
	// dropped in favour of the plain body as well.
	maxFormattedBytes = 64 << 10 // 64 KiB
)
// mdParser turns the model's answer — CommonMark plus the GFM extension set
// (tables, strikethrough, autolinks, task lists) — into HTML.
//
// Rendering choices:
//   - goldmark's WithUnsafe option is deliberately left unset (off is the
//     default), so raw HTML and dangerous URLs are escaped, never rendered.
//   - WithHardWraps keeps the answer's line breaks as <br>.
//   - Images come out as links rather than <img>; see imageLinkRenderer.
//
// goldmark depends only on the standard library, so the static (CGO-free)
// build is preserved.
var mdParser = goldmark.New(
	goldmark.WithRendererOptions(
		// Priority 100 is below the default renderer's 1000, so this renderer
		// is registered last and its image handler overrides goldmark's <img>
		// rendering.
		renderer.WithNodeRenderers(util.Prioritized(imageLinkRenderer{}, 100)),
		ghtml.WithHardWraps(),
	),
	goldmark.WithExtensions(extension.GFM),
)
// imageLinkRenderer replaces goldmark's image rendering with a plain link.
// A markdown image therefore remains usable (a clickable link to its source)
// while the event never carries a remote <img> — something a client could
// auto-load, leaking the viewer's IP to a URL chosen by a prompt-injected reply.
type imageLinkRenderer struct{}

// RegisterFuncs installs renderImageAsLink as the handler for image nodes.
func (imageLinkRenderer) RegisterFuncs(registerer renderer.NodeRendererFuncRegisterer) {
	registerer.Register(ast.KindImage, renderImageAsLink)
}
// renderImageAsLink renders ![alt](src) as <a href="src">alt</a>: the alt
// content (the node's children, rendered by the walker between the entering
// and leaving calls) becomes the link label. The href mirrors goldmark's own
// URL escaping and dangerous-URL guard; bluemonday re-checks the scheme
// afterwards.
//
// Two edge cases are handled explicitly:
//   - ![](src) has no alt content, which would produce an empty, invisible
//     link; the destination itself is used as the label instead.
//   - An image nested in a link ([![alt](src)](href)) would produce an <a>
//     inside an <a>, which is invalid HTML; only the label is emitted and the
//     enclosing link stays the single anchor.
//
// It is called twice per image node (entering=true, then entering=false) and
// always returns ast.WalkContinue with a nil error.
func renderImageAsLink(w util.BufWriter, _ []byte, node ast.Node, entering bool) (ast.WalkStatus, error) {
	// Registered for ast.KindImage only (see imageLinkRenderer.RegisterFuncs),
	// so the node is expected to always be an *ast.Image.
	n := node.(*ast.Image)

	// Detect an enclosing link so we never open a second anchor inside it.
	insideLink := false
	for p := n.Parent(); p != nil; p = p.Parent() {
		if p.Kind() == ast.KindLink {
			insideLink = true
			break
		}
	}

	// Write results are intentionally discarded throughout, matching the
	// handling in the original implementation of this renderer.
	if !entering {
		if !insideLink {
			_, _ = w.WriteString("</a>")
		}
		return ast.WalkContinue, nil
	}

	if !insideLink {
		_, _ = w.WriteString(`<a href="`)
		dest := util.URLEscape(n.Destination, true)
		// A dangerous URL leaves href empty rather than being rendered.
		if !ghtml.IsDangerousURL(dest) {
			_, _ = w.Write(util.EscapeHTML(dest))
		}
		_, _ = w.WriteString(`">`)
	}
	if !n.HasChildren() {
		// No alt text: fall back to the destination (as escaped text) so the
		// link has something visible to click.
		_, _ = w.Write(util.EscapeHTML(n.Destination))
	}
	return ast.WalkContinue, nil
}
// htmlPolicy is the allowlist applied to goldmark's output. It keeps only the
// tags/attributes Cinny's renderer keeps (src/app/utils/sanitize.ts:
// permittedHtmlTags / urlSchemes) — defence in depth on top of goldmark's own
// escaping, and the one allowlist a crafted reply cannot get around. Everything
// else (script, style, img, on* handlers, unknown URL schemes) is removed.
var htmlPolicy = buildHTMLPolicy()

// buildHTMLPolicy constructs the bluemonday policy behind htmlPolicy, starting
// from an empty policy and adding each permitted element and attribute.
func buildHTMLPolicy() *bluemonday.Policy {
	var (
		blockTags  = []string{"p", "br", "hr", "blockquote", "pre"}
		headings   = []string{"h1", "h2", "h3", "h4", "h5", "h6"}
		inlineTags = []string{"strong", "em", "del", "s", "code"}
		listTags   = []string{"ul", "ol", "li"}
		tableTags  = []string{"table", "thead", "tbody", "tr", "th", "td"}
	)

	policy := bluemonday.NewPolicy()
	for _, group := range [][]string{blockTags, headings, inlineTags, listTags, tableTags} {
		policy.AllowElements(group...)
	}

	// Links: href is the only attribute kept on <a>, and its URL must parse
	// and use one of the listed schemes.
	policy.RequireParseableURLs(true)
	policy.AllowURLSchemes("https", "http", "ftp", "mailto", "magnet")
	policy.AllowAttrs("href").OnElements("a")

	// class carries the language-xxx marker on code blocks; start carries the
	// first number of an ordered list.
	policy.AllowAttrs("class").OnElements("code", "pre")
	policy.AllowAttrs("start").OnElements("ol")

	return policy
}
// markdownToHTML renders the model's markdown answer to sanitized
// org.matrix.custom.html. The boolean reports whether the result carries any
// rich formatting; when it is false the returned string is empty and the caller
// MUST omit formatted_body, so that a plain answer keeps rendering from the
// bare `body` (Matrix convention: attach formatted_body only when it adds
// formatting the plain body cannot carry).
//
// It returns ("", false) when the input exceeds maxInputBytes, when conversion
// fails, when the sanitized HTML exceeds maxFormattedBytes, or when the HTML
// is nothing more than plain paragraphs.
func markdownToHTML(md string) (string, bool) {
	// Implausibly large reply: not worth converting, send the plain body.
	if len(md) > maxInputBytes {
		return "", false
	}

	rendered := new(bytes.Buffer)
	if mdParser.Convert([]byte(md), rendered) != nil {
		return "", false
	}

	sanitized := htmlPolicy.SanitizeBytes(rendered.Bytes())
	out := strings.TrimSpace(string(sanitized))

	switch {
	case len(out) > maxFormattedBytes:
		// Too large to be worth sending as a Matrix event.
		return "", false
	case !hasRichMarkup(out):
		// Only paragraph text — the plain body already says it all.
		return "", false
	}
	return out, true
}
// hasRichMarkup reports whether html contains any formatting other than the
// paragraph wrappers and line breaks goldmark produces for plain text, so that
// a plain reply can keep rendering from the bare body. Model text is
// HTML-escaped (a literal '<' becomes "&lt;"), so once those wrapper tags are
// removed any raw '<' still present belongs to a tag the converter emitted.
func hasRichMarkup(html string) bool {
	plainTags := [...]string{"<p>", "</p>", "<br>", "<br/>", "<br />"}

	rest := html
	for i := range plainTags {
		// Removed one tag at a time, in order, over the whole string.
		rest = strings.ReplaceAll(rest, plainTags[i], "")
	}
	return strings.IndexByte(rest, '<') >= 0
}