Skip to content

Commit d4db4da

Browse files
Perf: precompute list indentation metadata to avoid per-<li> goquery traversal (#3)
* Perf: precompute list indentation metadata to avoid per-<li> goquery traversal * test regression * remove unused * add perf test * test file
1 parent debb56e commit d4db4da

File tree

6 files changed

+39758
-72
lines changed

6 files changed

+39758
-72
lines changed

commonmark.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,8 @@ func (c *Converter) InitializeCommonMarkRules() []Rule {
6565

6666
// `prefixCount` is not nessesarily the length of the empty string `prefix`
6767
// but how much space is reserved for the prefixes of the siblings.
68-
prefixCount, previousPrefixCounts := countListParents(opt, selec)
68+
prefixCount, _ := strconv.Atoi(selec.AttrOr(attrListPrefixCount, "0"))
69+
previousPrefixCounts, _ := strconv.Atoi(selec.AttrOr(attrListPrevPrefixCounts, "0"))
6970

7071
// if the prefix is not needed, balance it by adding the usual prefix spaces
7172
if prefix == "" {

from.go

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,10 @@ func validateOptions(opt Options) error {
100100

101101
var (
102102
attrListPrefix = "data-converter-list-prefix"
103+
// Cached list indentation metadata, computed once in a pre-pass over the DOM.
104+
// This avoids extremely expensive goquery traversal per <li> on large documents.
105+
attrListPrefixCount = "data-converter-prefix-count"
106+
attrListPrevPrefixCounts = "data-converter-prev-prefix-counts"
103107
)
104108

105109
// NewConverter initializes a new converter and holds all the rules.
@@ -120,13 +124,6 @@ func NewConverter(domain string, enableCommonmark bool, options *Options) *Conve
120124
s.SetAttr("data-index", strconv.Itoa(i+1))
121125
})
122126
})
123-
conv.before = append(conv.before, func(selec *goquery.Selection) {
124-
selec.Find("li").Each(func(i int, s *goquery.Selection) {
125-
prefix := getListPrefix(options, s)
126-
127-
s.SetAttr(attrListPrefix, prefix)
128-
})
129-
})
130127
conv.after = append(conv.after, func(markdown string) string {
131128
markdown = strings.TrimSpace(markdown)
132129
markdown = multipleNewLinesRegex.ReplaceAllString(markdown, "\n\n")
@@ -382,6 +379,10 @@ func (conv *Converter) Convert(selec *goquery.Selection) string {
382379
hook(selec)
383380
}
384381

382+
// Precompute list indentation metadata *after* before hooks (so any DOM mutations
383+
// performed by user hooks are reflected).
384+
annotateListIndentation(selec, &options)
385+
385386
res := conv.selecToMD(selec, &options)
386387
markdown := res.Markdown
387388

list_regression_test.go

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
package md_test
2+
3+
import (
4+
"strings"
5+
"testing"
6+
7+
"github.com/PuerkitoBio/goquery"
8+
md "github.com/firecrawl/html-to-markdown"
9+
)
10+
11+
func TestListRegression_NonLiChildInOrderedListSkipsIndex(t *testing.T) {
12+
conv := md.NewConverter("", true, nil)
13+
14+
html := `<ol><li>one</li><div></div><li>two</li></ol>`
15+
out, err := conv.ConvertString(html)
16+
if err != nil {
17+
t.Fatal(err)
18+
}
19+
20+
// goquery's Index counts element siblings, so the second <li> becomes index 2 => "3."
21+
if !strings.Contains(out, "1. one") {
22+
t.Fatalf("expected output to contain %q, got:\n%s", "1. one", out)
23+
}
24+
if !strings.Contains(out, "3. two") {
25+
t.Fatalf("expected output to contain %q (due to non-li sibling), got:\n%s", "3. two", out)
26+
}
27+
if strings.Contains(out, "2. two") {
28+
t.Fatalf("did not expect output to contain %q, got:\n%s", "2. two", out)
29+
}
30+
}
31+
32+
func TestListRegression_WrapperListItemDoesNotEmitEmptyBullet(t *testing.T) {
33+
conv := md.NewConverter("", true, nil)
34+
35+
html := `<ul><li><ul><li>Nested</li></ul></li></ul>`
36+
out, err := conv.ConvertString(html)
37+
if err != nil {
38+
t.Fatal(err)
39+
}
40+
41+
if !strings.Contains(out, "Nested") {
42+
t.Fatalf("expected output to contain Nested, got:\n%s", out)
43+
}
44+
for _, line := range strings.Split(out, "\n") {
45+
if strings.TrimSpace(line) == "-" {
46+
t.Fatalf("unexpected empty list marker line, got:\n%s", out)
47+
}
48+
}
49+
}
50+
51+
func TestListRegression_MalformedLiOutsideListDoesNotPanic(t *testing.T) {
52+
conv := md.NewConverter("", true, nil)
53+
54+
html := `<div><li>Item</li></div>`
55+
out, err := conv.ConvertString(html)
56+
if err != nil {
57+
t.Fatal(err)
58+
}
59+
if !strings.Contains(out, "Item") {
60+
t.Fatalf("expected output to contain Item, got:\n%s", out)
61+
}
62+
}
63+
64+
func TestListRegression_BeforeHookMutationIsIncludedInListMetadata(t *testing.T) {
65+
conv := md.NewConverter("", true, nil)
66+
conv.Before(func(selec *goquery.Selection) {
67+
selec.Find("ol").First().AppendHtml("<li>b</li>")
68+
})
69+
70+
out, err := conv.ConvertString(`<ol><li>a</li></ol>`)
71+
if err != nil {
72+
t.Fatal(err)
73+
}
74+
75+
if !strings.Contains(out, "1. a") || !strings.Contains(out, "2. b") {
76+
t.Fatalf("expected mutated list items to be numbered, got:\n%s", out)
77+
}
78+
}
79+
80+

perf_big_html_test.go

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,33 @@ import (
1919
func TestPerfBigHTML_Smoke(t *testing.T) {
2020
p := filepath.Join("testdata", "Perf", "big.html")
2121

22-
if _, err := os.Stat(p); err != nil {
23-
p = filepath.Join("testdata", "Perf", "test.html")
22+
b, err := os.ReadFile(p)
23+
if err != nil {
24+
t.Fatalf("read %s: %v", p, err)
2425
}
26+
html := string(b)
27+
28+
conv := md.NewConverter("", true, nil)
29+
out, err := conv.ConvertString(html)
30+
if err != nil {
31+
t.Fatalf("convert: %v", err)
32+
}
33+
if strings.TrimSpace(out) == "" {
34+
t.Fatalf("expected non-empty markdown output")
35+
}
36+
}
37+
38+
// TestPerfBigList_Smoke runs a large list (random-ish content),
39+
// converts it once, and asserts we get non-empty output.
40+
//
41+
// This is intentionally a "smoke" perf test: no golden output, just "it renders".
42+
// Run it alone to track timings over time:
43+
//
44+
// go test -run '^TestPerfBigList_Smoke$' -count=1
45+
//
46+
// The input file is a large list with 100,000 items.
47+
func TestPerfBigList_Smoke(t *testing.T) {
48+
p := filepath.Join("testdata", "Perf", "big_list.html")
2549
b, err := os.ReadFile(p)
2650
if err != nil {
2751
t.Fatalf("read %s: %v", p, err)

0 commit comments

Comments
 (0)