Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 127 additions & 0 deletions plugin/robust_code_block.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
package plugin

import (
"strings"
"unicode/utf8"

"github.com/PuerkitoBio/goquery"
md "github.com/firecrawl/html-to-markdown"
"golang.org/x/net/html"
)

// RobustCodeBlock returns a plugin that registers replacement rules for PRE
// and CODE elements. The PRE rule walks the raw HTML nodes below the element
// (e.g. syntax-highlighter output built from tables/rows/gutters), skips
// line-number gutters, and emits a fenced block tagged with the language found
// in a "language-*" or "lang-*" class. The CODE rule renders code that is not
// inside a PRE as an inline code span.
func RobustCodeBlock() md.Plugin {
	return func(c *md.Converter) []md.Rule {
		// isGutter reports whether a class attribute value marks a
		// line-number gutter (case-insensitive substring match).
		isGutter := func(class string) bool {
			lower := strings.ToLower(class)
			return strings.Contains(lower, "gutter") || strings.Contains(lower, "line-numbers")
		}

		// detectLang returns the suffix of the first "language-*" or "lang-*"
		// class on sel, lower-cased, or "" when there is none. A bare prefix
		// with no suffix (class="language-") is skipped so that a later,
		// usable class token can still be found.
		detectLang := func(sel *goquery.Selection) string {
			classes := strings.ToLower(sel.AttrOr("class", ""))
			for _, part := range strings.Fields(classes) {
				for _, prefix := range []string{"language-", "lang-"} {
					if !strings.HasPrefix(part, prefix) {
						continue
					}
					if lang := strings.TrimPrefix(part, prefix); lang != "" {
						return lang
					}
				}
			}
			return ""
		}

		// collect appends the text below n to b, writing a newline for <br>
		// and after block-level wrappers, and skipping any element whose
		// class marks it as a gutter.
		var collect func(n *html.Node, b *strings.Builder)
		collect = func(n *html.Node, b *strings.Builder) {
			if n == nil {
				return
			}
			switch n.Type {
			case html.TextNode:
				b.WriteString(n.Data)
			case html.ElementNode:
				// Skip gutters entirely, including their descendants.
				for _, a := range n.Attr {
					if a.Key == "class" && isGutter(a.Val) {
						return
					}
				}

				name := strings.ToLower(n.Data)
				if name == "br" {
					b.WriteString("\n")
				}

				for child := n.FirstChild; child != nil; child = child.NextSibling {
					collect(child, b)
				}

				// Newline after block-ish wrappers to preserve lines.
				switch name {
				case "p", "div", "li", "tr", "table", "thead", "tbody", "tfoot", "section", "article", "blockquote", "pre", "h1", "h2", "h3", "h4", "h5", "h6":
					b.WriteString("\n")
				}
			}
		}

		// collectChildren gathers the text of root's descendants without
		// applying the gutter check to root itself. Highlighters such as
		// Prism's line-numbers plugin put "line-numbers" on the <pre>
		// element; treating that root as a gutter would drop the whole block.
		collectChildren := func(root *html.Node, b *strings.Builder) {
			if root == nil {
				return
			}
			for child := root.FirstChild; child != nil; child = child.NextSibling {
				collect(child, b)
			}
		}

		preRule := md.Rule{
			Filter: []string{"pre"},
			Replacement: func(_ string, selec *goquery.Selection, opt *md.Options) *string {
				// Prefer the language declared on the inner <code>, then
				// fall back to the <pre> itself.
				lang := detectLang(selec.Find("code").First())
				if lang == "" {
					lang = detectLang(selec)
				}

				var b strings.Builder
				for _, n := range selec.Nodes {
					collectChildren(n, &b)
				}
				// Trailing newlines are dropped; exactly one is re-added
				// before the closing fence below.
				content := strings.TrimRight(b.String(), "\n")

				fenceChar, _ := utf8.DecodeRuneInString(opt.Fence)
				fence := md.CalculateCodeFence(fenceChar, content)
				text := "\n\n" + fence + lang + "\n" + content + "\n" + fence + "\n\n"
				return md.String(text)
			},
		}

		codeRule := md.Rule{
			Filter: []string{"code"},
			Replacement: func(_ string, selec *goquery.Selection, opt *md.Options) *string {
				// If inside pre, let the PRE rule handle it.
				if selec.ParentsFiltered("pre").Length() > 0 {
					return nil
				}

				var b strings.Builder
				for _, n := range selec.Nodes {
					collectChildren(n, &b)
				}
				// Normalize CRLF line endings, then hand the text to
				// md.TrimTrailingSpaces. Newlines are not collapsed here.
				code := md.TrimTrailingSpaces(strings.ReplaceAll(b.String(), "\r\n", "\n"))

				// The delimiter must be longer than the longest run of
				// backticks inside the code, otherwise the span would end
				// early. This matches the previous output for runs of 0-2.
				longest, run := 0, 0
				for _, r := range code {
					if r != '`' {
						run = 0
						continue
					}
					run++
					if run > longest {
						longest = run
					}
				}
				fence := strings.Repeat("`", longest+1)

				// NOTE(review): code that starts or ends with a backtick is
				// not space-padded, so the delimiter can merge with it;
				// existing tests pin the unpadded output.
				return md.String(fence + code + fence)
			},
		}

		return []md.Rule{preRule, codeRule}
	}
}
236 changes: 236 additions & 0 deletions plugin/robust_code_block_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,236 @@
package plugin

import (
"strings"
"testing"

md "github.com/firecrawl/html-to-markdown"
)

// TestRobustCodeBlock_LanguageDetection converts small <pre>/<code> snippets
// and checks that the output contains the expected fence opener, with the
// language taken from a "language-"/"lang-" class on <code> or <pre>.
func TestRobustCodeBlock_LanguageDetection(t *testing.T) {
	converter := md.NewConverter("", true, nil)
	converter.Use(RobustCodeBlock())

	type langCase struct {
		name     string
		html     string
		expected string
	}

	cases := []langCase{
		{
			name:     "language- prefix on code",
			html:     `<pre><code class="language-javascript">const x = 1;</code></pre>`,
			expected: "```javascript",
		},
		{
			name:     "lang- prefix on code",
			html:     `<pre><code class="lang-python">print("hello")</code></pre>`,
			expected: "```python",
		},
		{
			name:     "language on pre element",
			html:     `<pre class="language-go"><code>func main() {}</code></pre>`,
			expected: "```go",
		},
		{
			name:     "no language class",
			html:     `<pre><code>plain code</code></pre>`,
			expected: "```\nplain code",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got, err := converter.ConvertString(tc.html)
			if err != nil {
				t.Fatalf("ConvertString failed: %v", err)
			}
			if strings.Contains(got, tc.expected) {
				return
			}
			t.Errorf("expected output to contain %q\nGot:\n%s", tc.expected, got)
		})
	}
}

// TestRobustCodeBlock_GutterStripping checks that elements whose class marks
// them as a line-number gutter ("gutter" or "line-numbers") are left out of
// the converted code block while the actual code text is kept.
func TestRobustCodeBlock_GutterStripping(t *testing.T) {
	tests := []struct {
		name         string
		html         string
		shouldHave   []string
		shouldntHave []string
	}{
		{
			name: "strips gutter class",
			html: `<pre><code><table><tr><td class="gutter">1
2
3</td><td class="code">const a = 1;
const b = 2;
const c = 3;</td></tr></table></code></pre>`,
			shouldHave:   []string{"const a = 1", "const b = 2", "const c = 3"},
			shouldntHave: []string{"\n1\n2\n3"},
		},
		{
			name: "strips line-numbers class",
			html: `<pre><code><div class="line-numbers">1
2</div><div class="content">hello
world</div></code></pre>`,
			shouldHave: []string{"hello", "world"},
			// The gutter text must not leak into the output; without this
			// negative check the case would not verify any stripping.
			shouldntHave: []string{"1\n2"},
		},
	}

	conv := md.NewConverter("", true, nil)
	conv.Use(RobustCodeBlock())

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			markdown, err := conv.ConvertString(tt.html)
			if err != nil {
				t.Fatalf("ConvertString failed: %v", err)
			}

			for _, exp := range tt.shouldHave {
				if !strings.Contains(markdown, exp) {
					t.Errorf("expected output to contain %q\nGot:\n%s", exp, markdown)
				}
			}
			for _, notExp := range tt.shouldntHave {
				if strings.Contains(markdown, notExp) {
					t.Errorf("expected output NOT to contain %q\nGot:\n%s", notExp, markdown)
				}
			}
		})
	}
}

// TestRobustCodeBlock_SyntaxHighlighterDivs feeds markup shaped like common
// syntax-highlighter output (wrapper divs, token spans, per-line divs) through
// the converter and checks that every expected code fragment is present.
func TestRobustCodeBlock_SyntaxHighlighterDivs(t *testing.T) {
	converter := md.NewConverter("", true, nil)
	converter.Use(RobustCodeBlock())

	type highlighterCase struct {
		name     string
		html     string
		expected []string
	}

	cases := []highlighterCase{
		{
			name: "highlight.js style",
			html: `<pre><code class="language-json"><div class="hljs">{
"key": "value"
}</div></code></pre>`,
			expected: []string{`"key"`, `"value"`},
		},
		{
			name:     "prism.js style with spans",
			html:     `<pre><code class="language-javascript"><span class="token keyword">const</span> <span class="token variable">x</span> <span class="token operator">=</span> <span class="token number">42</span><span class="token punctuation">;</span></code></pre>`,
			expected: []string{"const", "x", "=", "42"},
		},
		{
			name:     "nested divs with token-line",
			html:     `<pre><code><div class="token-line">line1</div><div class="token-line">line2</div></code></pre>`,
			expected: []string{"line1", "line2"},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got, err := converter.ConvertString(tc.html)
			if err != nil {
				t.Fatalf("ConvertString failed: %v", err)
			}

			for _, fragment := range tc.expected {
				if strings.Contains(got, fragment) {
					continue
				}
				t.Errorf("expected output to contain %q\nGot:\n%s", fragment, got)
			}
		})
	}
}

// TestRobustCodeBlock_InlineCode checks that <code> elements outside of <pre>
// are rendered as backtick-delimited inline code, using a longer delimiter
// when the code itself contains a backtick.
func TestRobustCodeBlock_InlineCode(t *testing.T) {
	converter := md.NewConverter("", true, nil)
	converter.Use(RobustCodeBlock())

	type inlineCase struct {
		name     string
		html     string
		expected string
	}

	cases := []inlineCase{
		{
			name:     "simple inline code",
			html:     `<p>Use <code>fmt.Println</code> to print</p>`,
			expected: "`fmt.Println`",
		},
		{
			name:     "inline code with backticks",
			html:     `<p>Use <code>echo ` + "`hello`" + `</code> command</p>`,
			expected: "``echo `hello```",
		},
		{
			name:     "inline code not affected by pre rule",
			html:     `<p>The <code>main</code> function is important</p>`,
			expected: "`main`",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got, err := converter.ConvertString(tc.html)
			if err != nil {
				t.Fatalf("ConvertString failed: %v", err)
			}
			if strings.Contains(got, tc.expected) {
				return
			}
			t.Errorf("expected output to contain %q\nGot:\n%s", tc.expected, got)
		})
	}
}

// TestRobustCodeBlock_PreservesNewlines converts a multi-line Python snippet
// and checks that each of its statements is still present in the output.
func TestRobustCodeBlock_PreservesNewlines(t *testing.T) {
	input := `<pre><code class="language-python">def hello():
print("world")

hello()</code></pre>`

	converter := md.NewConverter("", true, nil)
	converter.Use(RobustCodeBlock())

	got, err := converter.ConvertString(input)
	if err != nil {
		t.Fatalf("ConvertString failed: %v", err)
	}

	// Each entry pairs a fragment that must appear with the failure message
	// reported when it does not.
	checks := []struct {
		fragment string
		failure  string
	}{
		{"def hello():", "missing function definition"},
		{`print("world")`, "missing print statement"},
		{"hello()", "missing function call"},
	}
	for _, chk := range checks {
		if !strings.Contains(got, chk.fragment) {
			t.Error(chk.failure)
		}
	}
}

// TestRobustCodeBlock_BrTags converts a code block whose lines are separated
// by <br> tags and checks that the text of every line is present in the output.
func TestRobustCodeBlock_BrTags(t *testing.T) {
	input := `<pre><code>line1<br>line2<br/>line3</code></pre>`

	converter := md.NewConverter("", true, nil)
	converter.Use(RobustCodeBlock())

	got, err := converter.ConvertString(input)
	if err != nil {
		t.Fatalf("ConvertString failed: %v", err)
	}

	for _, line := range []string{"line1", "line2", "line3"} {
		if strings.Contains(got, line) {
			continue
		}
		t.Error("missing " + line)
	}
}
Loading