Skip to content

Commit c035ce0

Browse files
perf: Replace expensive calls in Table Plugin (#5)
* perf: Replace expensive calls in Table Plugin
* add test
1 parent d4db4da commit c035ce0

File tree

3 files changed

+5099
-11
lines changed

3 files changed

+5099
-11
lines changed

perf_big_html_test.go

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"testing"
88

99
md "github.com/firecrawl/html-to-markdown"
10+
"github.com/firecrawl/html-to-markdown/plugin"
1011
)
1112

1213
// TestPerfBigHTML_Smoke runs a large HTML file (random-ish content),
@@ -61,3 +62,31 @@ func TestPerfBigList_Smoke(t *testing.T) {
6162
t.Fatalf("expected non-empty markdown output")
6263
}
6364
}
65+
66+
// TestPerfBigTable_GitHubFlavored runs a large table with randomized data,
67+
// converts it once using the GitHubFlavored plugin, and asserts we get non-empty output.
68+
//
69+
// This is intentionally a "smoke" perf test: no golden output, just "it renders".
70+
// Run it alone to track timings over time:
71+
//
72+
// go test -run '^TestPerfBigTable_GitHubFlavored$' -count=1
73+
//
74+
// The input file is a large table with 5,000 rows and 12 columns.
75+
func TestPerfBigTable_GitHubFlavored(t *testing.T) {
76+
p := filepath.Join("testdata", "Perf", "table.html")
77+
b, err := os.ReadFile(p)
78+
if err != nil {
79+
t.Fatalf("read %s: %v", p, err)
80+
}
81+
html := string(b)
82+
83+
conv := md.NewConverter("", true, nil)
84+
conv.Use(plugin.GitHubFlavored())
85+
out, err := conv.ConvertString(html)
86+
if err != nil {
87+
t.Fatalf("convert: %v", err)
88+
}
89+
if strings.TrimSpace(out) == "" {
90+
t.Fatalf("expected non-empty markdown output")
91+
}
92+
}

plugin/table.go

Lines changed: 43 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66

77
md "github.com/firecrawl/html-to-markdown"
88
"github.com/PuerkitoBio/goquery"
9+
"golang.org/x/net/html"
910
)
1011

1112
// TableCompat is a compatibility plugin for environments where
@@ -145,17 +146,37 @@ func isHeadingRow(s *goquery.Selection) bool {
145146

146147
isTableOrBody := parent.Is("table") || isFirstTbody(parent)
147148

149+
// Check if every cell is a TH - break early if we find a non-TH
148150
everyTH := true
149-
s.Children().Each(func(i int, s *goquery.Selection) {
150-
if goquery.NodeName(s) != "th" {
151+
children := s.Children()
152+
for i := 0; i < children.Length(); i++ {
153+
if goquery.NodeName(children.Eq(i)) != "th" {
151154
everyTH = false
155+
break
152156
}
153-
})
157+
}
154158

155-
if parent.Children().First().IsSelection(s) && isTableOrBody && everyTH {
156-
return true
159+
// Optimize: return early when the row cannot be a heading row, so the first-child check
160+
// (a direct node pointer comparison instead of parent.Children().First()) only runs when needed.
161+
if !everyTH || !isTableOrBody {
162+
return false
157163
}
158164

165+
// Check if s is the first element child by comparing nodes directly
166+
if len(s.Nodes) == 0 || len(parent.Nodes) == 0 {
167+
return false
168+
}
169+
170+
parentNode := parent.Nodes[0]
171+
sNode := s.Nodes[0]
172+
173+
// Find the first element child (skip text, comment, and other non-element nodes)
174+
for child := parentNode.FirstChild; child != nil; child = child.NextSibling {
175+
if child.Type == html.ElementNode {
176+
return child == sNode
177+
}
178+
}
179+
159180
return false
160181
}
161182
func isFirstTbody(s *goquery.Selection) bool {
@@ -175,15 +196,26 @@ func getCellContent(content string, s *goquery.Selection) string {
175196
// nested tables not found
176197
content = newLineRe.ReplaceAllString(content, "<br>")
177198
}
178-
index := -1
179-
for i, node := range s.Parent().Children().Nodes {
180-
if s.IsNodes(node) {
181-
index = i
182-
break
199+
200+
// Optimize: Check if this is the first element child by comparing node pointers directly
201+
// instead of a linear search through all children
202+
parent := s.Parent()
203+
isFirst := false
204+
if len(s.Nodes) > 0 && len(parent.Nodes) > 0 {
205+
parentNode := parent.Nodes[0]
206+
sNode := s.Nodes[0]
207+
208+
// Find the first element child (skip text, comment, and other non-element nodes)
209+
for child := parentNode.FirstChild; child != nil; child = child.NextSibling {
210+
if child.Type == html.ElementNode {
211+
isFirst = (child == sNode)
212+
break
213+
}
183214
}
184215
}
216+
185217
prefix := " "
186-
if index == 0 {
218+
if isFirst {
187219
prefix = "| "
188220
}
189221
return prefix + content + " |"

0 commit comments

Comments
 (0)