Skip to content

Commit 090acb7

Browse files
perf: avoid goquery PrevAll/Text in list whitespace heuristic (#2)
* perf: avoid goquery PrevAll/Text in list whitespace heuristic (huge speedup on large HTML)
* add perf testcase
1 parent 32a7ad4 commit 090acb7

File tree

4 files changed

+1591
-10
lines changed

4 files changed

+1591
-10
lines changed

commonmark.go

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -100,9 +100,9 @@ func (c *Converter) InitializeCommonMarkRules() []Rule {
100 100
// if its inside a list, trim the spaces to not mess up the indentation
101 101
parent := selec.Parent()
102 102
next := selec.Next()
103-
if IndexWithText(selec) == 0 &&
104-
(parent.Is("li") || parent.Is("ol") || parent.Is("ul")) &&
105-
(next.Is("ul") || next.Is("ol")) {
103+
if (parent.Is("li") || parent.Is("ol") || parent.Is("ul")) &&
104+
(next.Is("ul") || next.Is("ol")) &&
105+
isFirstNonEmptyListTextNode(selec) {
106 106
// trim only spaces and not new lines
107 107
text = strings.Trim(text, ` `)
108 108
}

perf_big_html_test.go

Lines changed: 39 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,39 @@
1+
package md_test
2+
3+
import (
4+
"os"
5+
"path/filepath"
6+
"strings"
7+
"testing"
8+
9+
md "github.com/firecrawl/html-to-markdown"
10+
)
11+
12+
// TestPerfBigHTML_Smoke runs a large HTML file (random-ish content),
13+
// converts it once, and asserts we get non-empty output.
14+
//
15+
// This is intentionally a "smoke" perf test: no golden output, just "it renders".
16+
// Run it alone to track timings over time:
17+
//
18+
// go test -run '^TestPerfBigHTML_Smoke$' -count=1
19+
func TestPerfBigHTML_Smoke(t *testing.T) {
20+
p := filepath.Join("testdata", "Perf", "big.html")
21+
22+
if _, err := os.Stat(p); err != nil {
23+
p = filepath.Join("testdata", "Perf", "test.html")
24+
}
25+
b, err := os.ReadFile(p)
26+
if err != nil {
27+
t.Fatalf("read %s: %v", p, err)
28+
}
29+
html := string(b)
30+
31+
conv := md.NewConverter("", true, nil)
32+
out, err := conv.ConvertString(html)
33+
if err != nil {
34+
t.Fatalf("convert: %v", err)
35+
}
36+
if strings.TrimSpace(out) == "" {
37+
t.Fatalf("expected non-empty markdown output")
38+
}
39+
}

0 commit comments

Comments
 (0)