From 0c3a0d2f2f81aa87738c3304251eaa4a23240b36 Mon Sep 17 00:00:00 2001 From: Jeremy Daer Date: Thu, 9 Apr 2026 13:37:51 -0700 Subject: [PATCH] Replace hand-rolled MarkdownToHTML with goldmark MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the ~350-line regex pipeline (convertInline, placeholder extraction/restoration, line-by-line parser) with goldmark configured for Trix editor compatibility via three custom components: - trixTransformer: forces tight lists, converts soft breaks to hard inside list items and blockquotes, inserts
between blank-line-separated blocks for Trix paragraph spacing - trixRenderer: compact blockquote output, HTML escaping for raw HTML blocks (vs ), TrixBreak and EscapedAt node rendering - escapedAtParser: intercepts \@ before goldmark's standard escape handling to preserve mention-suppression syntax HTMLToMarkdown gains
-aware round-trip support: blockquote and list handlers now normalize
tags before splitting, so multiline blockquotes and list continuations survive the edit loop (MarkdownToHTML → HTMLToMarkdown) faithfully. Also fixes multiline blockquote/paragraph regex matching ((?i) → (?is)) and trims trailing newlines from code fence content. Promotes goldmark from indirect to direct dependency (v1.7.13, already present via glamour). --- go.mod | 2 +- internal/richtext/richtext.go | 962 +++++++++++++++-------------- internal/richtext/richtext_test.go | 519 +++++++++++++++- 3 files changed, 995 insertions(+), 488 deletions(-) diff --git a/go.mod b/go.mod index 97ef1cf2..957cd115 100644 --- a/go.mod +++ b/go.mod @@ -19,6 +19,7 @@ require ( github.com/spf13/cobra v1.10.2 github.com/spf13/pflag v1.0.10 github.com/stretchr/testify v1.11.1 + github.com/yuin/goldmark v1.7.13 github.com/zalando/go-keyring v0.2.8 golang.org/x/mod v0.34.0 golang.org/x/sys v0.42.0 @@ -70,7 +71,6 @@ require ( github.com/rivo/uniseg v0.4.7 // indirect github.com/rogpeppe/go-internal v1.14.1 // indirect github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect - github.com/yuin/goldmark v1.7.13 // indirect github.com/yuin/goldmark-emoji v1.0.6 // indirect golang.org/x/net v0.38.0 // indirect golang.org/x/sync v0.20.0 // indirect diff --git a/internal/richtext/richtext.go b/internal/richtext/richtext.go index 04f3f210..730a0b5b 100644 --- a/internal/richtext/richtext.go +++ b/internal/richtext/richtext.go @@ -3,6 +3,7 @@ package richtext import ( + "bytes" "errors" "fmt" "html" @@ -15,40 +16,18 @@ import ( "unicode/utf8" "github.com/charmbracelet/glamour" + "github.com/yuin/goldmark" + "github.com/yuin/goldmark/ast" + "github.com/yuin/goldmark/extension" + "github.com/yuin/goldmark/parser" + "github.com/yuin/goldmark/renderer" + gmhtml "github.com/yuin/goldmark/renderer/html" + "github.com/yuin/goldmark/text" + "github.com/yuin/goldmark/util" ) -// Pre-compiled regexes for MarkdownToHTML list detection -var ( - ulPattern = regexp.MustCompile(`^(\s*)[-*+]\s+(.*)$`) - olPattern = regexp.MustCompile(`^(\s*)\d+\.\s+(.*)$`) -) - -// CommonMark §2.4: any ASCII punctuation may be backslash-escaped. -// Exact set: !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ -// -// We intentionally omit @ from the set: in Basecamp context \@ is the -// idiomatic way to suppress a mention ping, so it must pass through -// literally and not be unescaped into a bare @ that ResolveMentions -// would convert into a mention. -const commonMarkEscapablePunctuation = "!\"#$%&'()*+,-./:;<=>?[\\]^_`{|}~" - -// Pre-compiled regexes for convertInline (Markdown → HTML inline elements) -var ( - reCodeSpan = regexp.MustCompile("`([^`]+)`") - reBoldStar = regexp.MustCompile(`\*\*([^*]+)\*\*`) - reBoldUnder = regexp.MustCompile(`__([^_]+)__`) - reItalicStar = regexp.MustCompile(`\*([^*]+)\*`) - reItalicUnder = regexp.MustCompile(`(?:^|[^a-zA-Z0-9])_([^_]+)_(?:[^a-zA-Z0-9]|$)`) - reItalicInner = regexp.MustCompile(`_([^_]+)_`) - reImage = regexp.MustCompile(`!\[([^\]]*)\]\(([^)]+)\)`) - reLink = regexp.MustCompile(`\[([^\]]+)\]\(([^)]+)\)`) - reStrikethrough = regexp.MustCompile(`~~([^~]+)~~`) - - // Protect escaped backticks before code-span detection so \` does not start a code span. - reEscapedBacktick = regexp.MustCompile("\\\\`") - // Matches a backslash followed by any CommonMark-escapable ASCII punctuation character. - reBackslashEscape = regexp.MustCompile(`\\([` + regexp.QuoteMeta(commonMarkEscapablePunctuation) + `])`) -) +// Pre-compiled regexes for IsHTML detection (code span stripping) +var reCodeSpan = regexp.MustCompile("`([^`]+)`") // Pre-compiled regexes for HTMLToMarkdown (HTML → Markdown block elements) var ( @@ -58,32 +37,34 @@ var ( reH4 = regexp.MustCompile(`(?i)]*>(.*?)`) reH5 = regexp.MustCompile(`(?i)]*>(.*?)`) reH6 = regexp.MustCompile(`(?i)]*>(.*?)`) - reBlockquote = regexp.MustCompile(`(?i)]*>(.*?)`) + reBlockquote = regexp.MustCompile(`(?is)]*>(.*?)`) reCodeBlock = regexp.MustCompile(`(?is)]*>]*(?:class="language-([^"]*)")?[^>]*>(.*?)`) reCodeLang = regexp.MustCompile(`class="language-([^"]*)"`) + rePreLang = regexp.MustCompile(`(?i)]*\blanguage="([^"]*)"`) reCodeInner = regexp.MustCompile(`(?is)]*>([\s\S]*?)`) - reUL = regexp.MustCompile(`(?is)]*>(.*?)`) - reOL = regexp.MustCompile(`(?is)]*>(.*?)`) - reLI = regexp.MustCompile(`(?is)]*>(.*?)`) - reP = regexp.MustCompile(`(?i)]*>(.*?)

`) - reBR = regexp.MustCompile(`(?i)`) - reHR = regexp.MustCompile(`(?i)`) + // Tag-match patterns use (?:\s[^>]*)? to require whitespace or `>` after the + // tag name, preventing false matches against longer tag names with the same + // prefix (e.g.

vs

,  vs 
, vs , vs , + // vs ", + expected: "

<script>alert(1)</script>

", + }, + { + name: "multiline script tag", + input: "", + expected: "

<script> alert(1) </script>

", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := MarkdownToHTML(tt.input) + if result != tt.expected { + t.Errorf("MarkdownToHTML(%q)\ngot: %q\nwant: %q", tt.input, result, tt.expected) + } + }) + } +} + +// Tag-match regexes use (?:\s[^>]*)? to require whitespace or `>` after the +// tag name. Without that, `]*>` false-matches `
`, `]*>` matches
+// `
`, `]*>` matches ``, etc. — leading to garbled output when +// such tag prefixes coexist with their matching close tags elsewhere. +func TestHTMLToMarkdownTagBoundaries(t *testing.T) { + tests := []struct { + name string + input string + expected string + }{ + { + // Without the boundary, ]*>.*?

would match + // "
" ... "

" across the pre block. + name: "p does not match pre", + input: "
keep

tail

", + expected: "```\nkeep\n```\n\ntail", + }, + { + // Without the boundary, ]*>.*?
would match + // "
" ... "" eating the line break. + name: "b does not match br", + input: "text
and bold", + expected: "text\nand **bold**", + }, + { + // Without the boundary, ]*>.*? would match + // "". + name: "em does not match embed", + input: "real", + expected: "*real*", + }, + { + // Without the boundary, ]*>.*?
would match + // "". + name: "i does not match img", + input: "\"a\" then italic", + expected: "![a](x.png) then *italic*", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := HTMLToMarkdown(tt.input) + if result != tt.expected { + t.Errorf("HTMLToMarkdown(%q)\ngot: %q\nwant: %q", tt.input, result, tt.expected) + } + }) + } +} + +// BC5's SyntaxHighlightFilter converts
 into a Stimulus
+// controller that triggers Prism.js. The CommonMark convention
+// () does not trigger highlighting.
+func TestMarkdownToHTMLCodeBlockSyntaxHighlight(t *testing.T) {
+	tests := []struct {
+		name     string
+		input    string
+		expected string
+	}{
+		{
+			name:     "language emits pre[language] not code[class]",
+			input:    "```ruby\ndef hello; end\n```",
+			expected: "
def hello; end\n
", + }, + { + name: "language with hyphen", + input: "```objective-c\nreturn nil;\n```", + expected: "
return nil;\n
", + }, + { + name: "no language omits attribute", + input: "```\nplain\n```", + expected: "
plain\n
", + }, + { + name: "html content escaped", + input: "```html\n
hi
\n```", + expected: "
<div>hi</div>\n
", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := MarkdownToHTML(tt.input) + if result != tt.expected { + t.Errorf("MarkdownToHTML(%q)\ngot: %q\nwant: %q", tt.input, result, tt.expected) + } + }) + } +} + +// HTMLToMarkdown must recognize both the Trix/BC5 format (
)
+// and the legacy CommonMark format () so round-trips
+// work for content stored in either form.
+func TestHTMLToMarkdownCodeBlockLanguageFormats(t *testing.T) {
+	tests := []struct {
+		name     string
+		input    string
+		expected string
+	}{
+		{
+			name:     "pre language attribute",
+			input:    `
func main() {}
`, + expected: "```go\nfunc main() {}\n```", + }, + { + name: "code class attribute (legacy)", + input: `
func main() {}
`, + expected: "```go\nfunc main() {}\n```", + }, + { + name: "pre language preferred over code class", + input: `
x
`, + expected: "```ruby\nx\n```", + }, + { + name: "no language", + input: `
plain
`, + expected: "```\nplain\n```", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := HTMLToMarkdown(tt.input) + if result != tt.expected { + t.Errorf("HTMLToMarkdown(%q)\ngot: %q\nwant: %q", tt.input, result, tt.expected) + } + }) + } +} + +func TestHTMLToMarkdownMultilineBlockquote(t *testing.T) { + tests := []struct { + name string + input string + expected string + }{ + { + name: "single paragraph", + input: "
\n

text

\n
", + expected: "> text", + }, + { + name: "adjacent paragraphs", + input: "

para1

para2

", + expected: "> para1\n>\n> para2", + }, + { + name: "paragraph then list", + input: "

intro

  • one
  • two
", + expected: "> intro\n>\n> - one\n> - two", + }, + { + name: "paragraph then code block", + input: "

intro

code
", + expected: "> intro\n>\n> ```\n> code\n> ```", + }, + { + name: "code block then paragraph", + input: "
code

tail

", + expected: "> ```\n> code\n> ```\n>\n> tail", + }, + { + name: "code block then nested blockquote", + input: "
code
nested
", + expected: "> ```\n> code\n> ```\n>\n> > nested", + }, + { + name: "whitespace-separated paragraphs", + input: "
\n

para1

\n

para2

\n
", + expected: "> para1\n>\n> para2", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := HTMLToMarkdown(tt.input) + if result != tt.expected { + t.Errorf("HTMLToMarkdown(%q)\ngot: %q\nwant: %q", tt.input, result, tt.expected) + } + }) + } +} + +func TestHTMLToMarkdownMultilineParagraph(t *testing.T) { + input := "

line1\nline2

" + result := HTMLToMarkdown(input) + if !strings.Contains(result, "line1") || !strings.Contains(result, "line2") { + t.Errorf("HTMLToMarkdown(%q)\ngot: %q\nmissing content", input, result) + } +} + +func TestHTMLToMarkdownCodeFenceNewline(t *testing.T) { + input := "
func main() {}\n
" + result := HTMLToMarkdown(input) + if strings.Contains(result, "\n\n```") { + t.Errorf("HTMLToMarkdown(%q) has extra blank line before closing fence\ngot: %q", input, result) + } + if !strings.Contains(result, "func main() {}") { + t.Errorf("HTMLToMarkdown(%q) missing code content\ngot: %q", input, result) + } +} + +func TestHTMLToMarkdownCodePreservesHTMLEntities(t *testing.T) { + tests := []struct { + name string + input string + contains string + }{ + { + name: "p tags in code block survive reP and reStripTags", + input: "
<p>\nhi\n</p>\n
", + contains: "

\nhi\n

", + }, + { + name: "div tags in code block survive reStripTags", + input: "
<div>hello</div>
", + contains: "
hello
", + }, + { + name: "p tags in blockquoted code block", + input: "
<p>\nhi\n</p>\n
", + contains: "

\n> hi\n>

", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := HTMLToMarkdown(tt.input) + if !strings.Contains(result, tt.contains) { + t.Errorf("HTMLToMarkdown(%q)\ngot: %q\nmissing: %q", tt.input, result, tt.contains) + } + }) + } +} + +func TestHTMLToMarkdownNestedLists(t *testing.T) { + tests := []struct { + name string + input string + expected string + }{ + { + name: "nested ul compact", + input: "
  • parent
    • child
", + expected: "- parent\n - child", + }, + { + name: "nested ul with whitespace", + input: "
    \n
  • parent\n
      \n
    • child
    • \n
    \n
  • \n
", + expected: "- parent\n - child", + }, + { + name: "nested ol", + input: "
  1. parent
    1. child
", + expected: "1. parent\n 1. child", + }, + { + name: "mixed nesting ul then ol", + input: "
  • parent
    1. child
", + expected: "- parent\n 1. child", + }, + { + name: "mixed nesting ol then ul", + input: "
  1. parent
    • child
", + expected: "1. parent\n - child", + }, + { + name: "3-level nesting", + input: "
  • a
    • b
      • c
", + expected: "- a\n - b\n - c", + }, + { + name: "uppercase tags", + input: "
  • one
  • two
", + expected: "- one\n- two", + }, + { + name: "nested blockquote", + input: "
nested
", + expected: "> > nested", + }, + { + name: "sibling lists preserved", + input: "
  • a

text

  • b
", + expected: "- a\n\ntext\n\n- b", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := HTMLToMarkdown(tt.input) + if result != tt.expected { + t.Errorf("HTMLToMarkdown(%q)\ngot: %q\nwant: %q", tt.input, result, tt.expected) + } + }) + } +} + +func TestEditLoopRoundTrip(t *testing.T) { + tests := []struct { + name string + markdown string + expected string // exact expected round-trip output + }{ + { + name: "blockquote", + markdown: "> A quote", + expected: "> A quote", + }, + { + name: "multiline blockquote", + markdown: "> line1\n> line2", + expected: "> line1\n> line2", + }, + { + name: "multi-paragraph blockquote", + markdown: "> para1\n>\n> para2", + expected: "> para1\n>\n> para2", + }, + { + name: "unordered list", + markdown: "- One\n- Two\n- Three", + expected: "- One\n- Two\n- Three", + }, + { + name: "list with continuation", + markdown: "1. First\n Desc\n\n2. Second\n More", + expected: "1. First\n Desc\n2. Second\n More", + }, + { + name: "code fence", + markdown: "```go\nfunc main() {}\n```", + expected: "```go\nfunc main() {}\n```", + }, + { + name: "heading", + markdown: "# Title", + expected: "# Title", + }, + { + name: "quoted list", + markdown: "> - One\n> Two", + expected: "> - One\n> Two", + }, + { + name: "quoted code fence", + markdown: "> ```\n> code\n> ```", + expected: "> ```\n> code\n> ```", + }, + { + name: "quoted ordered list", + markdown: "> 1. First\n> 2. Second", + expected: "> 1. First\n> 2. Second", + }, + { + name: "nested unordered list", + markdown: "- parent\n - child", + expected: "- parent\n - child", + }, + { + name: "nested ordered list", + markdown: "1. parent\n 1. child", + expected: "1. parent\n 1. child", + }, + { + name: "nested blockquote", + markdown: "> > nested", + expected: "> > nested", + }, + { + name: "mixed content", + markdown: "# Title\n\nSome **bold** text.\n\n- Item 1\n- Item 2\n\n> A quote\n\n```\ncode\n```", + expected: "# Title\n\nSome **bold** text.\n\n- Item 1\n- Item 2\n\n> A quote\n\n```\ncode\n```", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + html := MarkdownToHTML(tt.markdown) + back := HTMLToMarkdown(html) + if back != tt.expected { + t.Errorf("round-trip mismatch\nmarkdown: %q\nhtml: %q\ngot: %q\nwant: %q", tt.markdown, html, back, tt.expected) + } + }) + } +} + func TestHTMLToMarkdown(t *testing.T) { tests := []struct { name string