Skip to content

Commit d5aaf05

Browse files
feat(plugin): add RobustCodeBlock plugin for syntax highlighter support (#7)
Adds a new plugin that provides robust handling of code blocks from various syntax highlighting libraries. Features: - Language detection from `language-*` and `lang-*` CSS classes - Gutter/line-number stripping (skips elements with `gutter` or `line-numbers` classes) - Recursive text extraction from nested divs/spans (handles highlight.js, prism.js, etc.) - Proper newline handling for block elements and <br> tags - Safe inline code fencing with backtick escaping This consolidates code block handling logic that was duplicated across multiple Firecrawl services. Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 25e9840 commit d5aaf05

File tree

2 files changed

+363
-0
lines changed

2 files changed

+363
-0
lines changed

plugin/robust_code_block.go

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
package plugin
2+
3+
import (
4+
"strings"
5+
"unicode/utf8"
6+
7+
"github.com/PuerkitoBio/goquery"
8+
md "github.com/firecrawl/html-to-markdown"
9+
"golang.org/x/net/html"
10+
)
11+
12+
// RobustCodeBlock adds a robust PRE/CODE handler that extracts nested code text
13+
// (e.g., from syntax highlighters with tables/rows/gutters) and outputs fenced
14+
// blocks with detected language. This is useful for scraping code blocks from
15+
// various websites that use different syntax highlighting libraries.
16+
func RobustCodeBlock() md.Plugin {
17+
return func(c *md.Converter) []md.Rule {
18+
isGutter := func(class string) bool {
19+
lower := strings.ToLower(class)
20+
return strings.Contains(lower, "gutter") || strings.Contains(lower, "line-numbers")
21+
}
22+
23+
detectLang := func(sel *goquery.Selection) string {
24+
classes := sel.AttrOr("class", "")
25+
lower := strings.ToLower(classes)
26+
for _, part := range strings.Fields(lower) {
27+
if strings.HasPrefix(part, "language-") {
28+
return strings.TrimPrefix(part, "language-")
29+
}
30+
if strings.HasPrefix(part, "lang-") {
31+
return strings.TrimPrefix(part, "lang-")
32+
}
33+
}
34+
return ""
35+
}
36+
37+
// collect extracts text recursively, inserting newlines after block elements and br
38+
var collect func(n *html.Node, b *strings.Builder)
39+
collect = func(n *html.Node, b *strings.Builder) {
40+
if n == nil {
41+
return
42+
}
43+
switch n.Type {
44+
case html.TextNode:
45+
b.WriteString(n.Data)
46+
case html.ElementNode:
47+
name := strings.ToLower(n.Data)
48+
// Skip gutters
49+
if name != "" {
50+
for _, a := range n.Attr {
51+
if a.Key == "class" && isGutter(a.Val) {
52+
return
53+
}
54+
}
55+
}
56+
57+
if name == "br" {
58+
b.WriteString("\n")
59+
}
60+
61+
for child := n.FirstChild; child != nil; child = child.NextSibling {
62+
collect(child, b)
63+
}
64+
65+
// Newline after block-ish wrappers to preserve lines
66+
switch name {
67+
case "p", "div", "li", "tr", "table", "thead", "tbody", "tfoot", "section", "article", "blockquote", "pre", "h1", "h2", "h3", "h4", "h5", "h6":
68+
b.WriteString("\n")
69+
}
70+
}
71+
}
72+
73+
preRule := md.Rule{
74+
Filter: []string{"pre"},
75+
Replacement: func(_ string, selec *goquery.Selection, opt *md.Options) *string {
76+
// Find inner <code> if present for language detection
77+
codeSel := selec.Find("code").First()
78+
lang := detectLang(codeSel)
79+
if lang == "" {
80+
lang = detectLang(selec)
81+
}
82+
83+
var b strings.Builder
84+
for _, n := range selec.Nodes {
85+
collect(n, &b)
86+
}
87+
content := strings.TrimRight(b.String(), "\n")
88+
89+
fenceChar, _ := utf8.DecodeRuneInString(opt.Fence)
90+
fence := md.CalculateCodeFence(fenceChar, content)
91+
text := "\n\n" + fence + lang + "\n" + content + "\n" + fence + "\n\n"
92+
return md.String(text)
93+
},
94+
}
95+
96+
codeRule := md.Rule{
97+
Filter: []string{"code"},
98+
Replacement: func(_ string, selec *goquery.Selection, opt *md.Options) *string {
99+
// If inside pre, let the PRE rule handle it
100+
if selec.ParentsFiltered("pre").Length() > 0 {
101+
return nil
102+
}
103+
104+
var b strings.Builder
105+
for _, n := range selec.Nodes {
106+
collect(n, &b)
107+
}
108+
code := b.String()
109+
// Collapse multiple newlines for inline code
110+
code = md.TrimTrailingSpaces(strings.ReplaceAll(code, "\r\n", "\n"))
111+
112+
// Choose fence length safely
113+
fence := "`"
114+
if strings.Contains(code, "`") {
115+
fence = "``"
116+
if strings.Contains(code, "``") {
117+
fence = "```"
118+
}
119+
}
120+
out := fence + code + fence
121+
return md.String(out)
122+
},
123+
}
124+
125+
return []md.Rule{preRule, codeRule}
126+
}
127+
}

plugin/robust_code_block_test.go

Lines changed: 236 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,236 @@
1+
package plugin
2+
3+
import (
4+
"strings"
5+
"testing"
6+
7+
md "github.com/firecrawl/html-to-markdown"
8+
)
9+
10+
func TestRobustCodeBlock_LanguageDetection(t *testing.T) {
11+
tests := []struct {
12+
name string
13+
html string
14+
expected string
15+
}{
16+
{
17+
name: "language- prefix on code",
18+
html: `<pre><code class="language-javascript">const x = 1;</code></pre>`,
19+
expected: "```javascript",
20+
},
21+
{
22+
name: "lang- prefix on code",
23+
html: `<pre><code class="lang-python">print("hello")</code></pre>`,
24+
expected: "```python",
25+
},
26+
{
27+
name: "language on pre element",
28+
html: `<pre class="language-go"><code>func main() {}</code></pre>`,
29+
expected: "```go",
30+
},
31+
{
32+
name: "no language class",
33+
html: `<pre><code>plain code</code></pre>`,
34+
expected: "```\nplain code",
35+
},
36+
}
37+
38+
conv := md.NewConverter("", true, nil)
39+
conv.Use(RobustCodeBlock())
40+
41+
for _, tt := range tests {
42+
t.Run(tt.name, func(t *testing.T) {
43+
markdown, err := conv.ConvertString(tt.html)
44+
if err != nil {
45+
t.Fatalf("ConvertString failed: %v", err)
46+
}
47+
48+
if !strings.Contains(markdown, tt.expected) {
49+
t.Errorf("expected output to contain %q\nGot:\n%s", tt.expected, markdown)
50+
}
51+
})
52+
}
53+
}
54+
55+
func TestRobustCodeBlock_GutterStripping(t *testing.T) {
56+
tests := []struct {
57+
name string
58+
html string
59+
shouldHave []string
60+
shouldntHave []string
61+
}{
62+
{
63+
name: "strips gutter class",
64+
html: `<pre><code><table><tr><td class="gutter">1
65+
2
66+
3</td><td class="code">const a = 1;
67+
const b = 2;
68+
const c = 3;</td></tr></table></code></pre>`,
69+
shouldHave: []string{"const a = 1", "const b = 2", "const c = 3"},
70+
shouldntHave: []string{"\n1\n2\n3"},
71+
},
72+
{
73+
name: "strips line-numbers class",
74+
html: `<pre><code><div class="line-numbers">1
75+
2</div><div class="content">hello
76+
world</div></code></pre>`,
77+
shouldHave: []string{"hello", "world"},
78+
shouldntHave: []string{},
79+
},
80+
}
81+
82+
conv := md.NewConverter("", true, nil)
83+
conv.Use(RobustCodeBlock())
84+
85+
for _, tt := range tests {
86+
t.Run(tt.name, func(t *testing.T) {
87+
markdown, err := conv.ConvertString(tt.html)
88+
if err != nil {
89+
t.Fatalf("ConvertString failed: %v", err)
90+
}
91+
92+
for _, exp := range tt.shouldHave {
93+
if !strings.Contains(markdown, exp) {
94+
t.Errorf("expected output to contain %q\nGot:\n%s", exp, markdown)
95+
}
96+
}
97+
for _, notExp := range tt.shouldntHave {
98+
if strings.Contains(markdown, notExp) {
99+
t.Errorf("expected output NOT to contain %q\nGot:\n%s", notExp, markdown)
100+
}
101+
}
102+
})
103+
}
104+
}
105+
106+
func TestRobustCodeBlock_SyntaxHighlighterDivs(t *testing.T) {
107+
tests := []struct {
108+
name string
109+
html string
110+
expected []string
111+
}{
112+
{
113+
name: "highlight.js style",
114+
html: `<pre><code class="language-json"><div class="hljs">{
115+
"key": "value"
116+
}</div></code></pre>`,
117+
expected: []string{`"key"`, `"value"`},
118+
},
119+
{
120+
name: "prism.js style with spans",
121+
html: `<pre><code class="language-javascript"><span class="token keyword">const</span> <span class="token variable">x</span> <span class="token operator">=</span> <span class="token number">42</span><span class="token punctuation">;</span></code></pre>`,
122+
expected: []string{"const", "x", "=", "42"},
123+
},
124+
{
125+
name: "nested divs with token-line",
126+
html: `<pre><code><div class="token-line">line1</div><div class="token-line">line2</div></code></pre>`,
127+
expected: []string{"line1", "line2"},
128+
},
129+
}
130+
131+
conv := md.NewConverter("", true, nil)
132+
conv.Use(RobustCodeBlock())
133+
134+
for _, tt := range tests {
135+
t.Run(tt.name, func(t *testing.T) {
136+
markdown, err := conv.ConvertString(tt.html)
137+
if err != nil {
138+
t.Fatalf("ConvertString failed: %v", err)
139+
}
140+
141+
for _, exp := range tt.expected {
142+
if !strings.Contains(markdown, exp) {
143+
t.Errorf("expected output to contain %q\nGot:\n%s", exp, markdown)
144+
}
145+
}
146+
})
147+
}
148+
}
149+
150+
func TestRobustCodeBlock_InlineCode(t *testing.T) {
151+
tests := []struct {
152+
name string
153+
html string
154+
expected string
155+
}{
156+
{
157+
name: "simple inline code",
158+
html: `<p>Use <code>fmt.Println</code> to print</p>`,
159+
expected: "`fmt.Println`",
160+
},
161+
{
162+
name: "inline code with backticks",
163+
html: `<p>Use <code>echo ` + "`hello`" + `</code> command</p>`,
164+
expected: "``echo `hello```",
165+
},
166+
{
167+
name: "inline code not affected by pre rule",
168+
html: `<p>The <code>main</code> function is important</p>`,
169+
expected: "`main`",
170+
},
171+
}
172+
173+
conv := md.NewConverter("", true, nil)
174+
conv.Use(RobustCodeBlock())
175+
176+
for _, tt := range tests {
177+
t.Run(tt.name, func(t *testing.T) {
178+
markdown, err := conv.ConvertString(tt.html)
179+
if err != nil {
180+
t.Fatalf("ConvertString failed: %v", err)
181+
}
182+
183+
if !strings.Contains(markdown, tt.expected) {
184+
t.Errorf("expected output to contain %q\nGot:\n%s", tt.expected, markdown)
185+
}
186+
})
187+
}
188+
}
189+
190+
func TestRobustCodeBlock_PreservesNewlines(t *testing.T) {
191+
html := `<pre><code class="language-python">def hello():
192+
print("world")
193+
194+
hello()</code></pre>`
195+
196+
conv := md.NewConverter("", true, nil)
197+
conv.Use(RobustCodeBlock())
198+
199+
markdown, err := conv.ConvertString(html)
200+
if err != nil {
201+
t.Fatalf("ConvertString failed: %v", err)
202+
}
203+
204+
// Check structure is preserved
205+
if !strings.Contains(markdown, "def hello():") {
206+
t.Error("missing function definition")
207+
}
208+
if !strings.Contains(markdown, `print("world")`) {
209+
t.Error("missing print statement")
210+
}
211+
if !strings.Contains(markdown, "hello()") {
212+
t.Error("missing function call")
213+
}
214+
}
215+
216+
func TestRobustCodeBlock_BrTags(t *testing.T) {
217+
html := `<pre><code>line1<br>line2<br/>line3</code></pre>`
218+
219+
conv := md.NewConverter("", true, nil)
220+
conv.Use(RobustCodeBlock())
221+
222+
markdown, err := conv.ConvertString(html)
223+
if err != nil {
224+
t.Fatalf("ConvertString failed: %v", err)
225+
}
226+
227+
if !strings.Contains(markdown, "line1") {
228+
t.Error("missing line1")
229+
}
230+
if !strings.Contains(markdown, "line2") {
231+
t.Error("missing line2")
232+
}
233+
if !strings.Contains(markdown, "line3") {
234+
t.Error("missing line3")
235+
}
236+
}

0 commit comments

Comments
 (0)