Skip to content

Commit 275d63f

Browse files
committed
fix test
1 parent 5f13beb commit 275d63f

File tree

2 files changed

+10
-9
lines changed

2 files changed

+10
-9
lines changed

common/pagetypeclassifier/pagetypeclassifier.go

Lines changed: 8 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -4,10 +4,11 @@ import (
44
_ "embed"
55
"sync"
66

7-
"github.com/microcosm-cc/bluemonday"
7+
"fmt"
8+
89
htmltomarkdown "github.com/JohannesKaufmann/html-to-markdown/v2"
10+
"github.com/microcosm-cc/bluemonday"
911
"github.com/projectdiscovery/utils/ml/naive_bayes"
10-
"fmt"
1112
)
1213

1314
//go:embed clf.gob
@@ -63,28 +64,28 @@ func getSanitizerPolicy() *bluemonday.Policy {
6364
// Strategy:
6465
// 1. Always sanitize HTML with bluemonday first to remove useless elements and reduce nesting
6566
// 2. Convert sanitized HTML to markdown
66-
// 3. If conversion panics, recover and return empty string
67+
// 3. If conversion panics, recover and return empty string with error
6768
func htmlToText(html string) (text string, err error) {
6869
defer func() {
6970
if r := recover(); r != nil {
7071
err = fmt.Errorf("html parser panic: %v", r)
7172
text = ""
7273
}
7374
}()
74-
75+
7576
// First, sanitize HTML with bluemonday to strip useless elements and reduce nesting
7677
sanitizedHTML := getSanitizerPolicy().Sanitize(html)
77-
78+
7879
// If sanitization failed or produced empty result, return empty
7980
if sanitizedHTML == "" {
8081
return "", nil
8182
}
82-
83+
8384
// Convert sanitized HTML to markdown
8485
text, err = htmltomarkdown.ConvertString(sanitizedHTML)
8586
if err != nil || text == "" {
8687
return "", err
8788
}
88-
89+
8990
return
9091
}

common/pagetypeclassifier/pagetypeclassifier_test.go

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -90,9 +90,9 @@ func TestPageTypeClassifier(t *testing.T) {
9090
}
9191
deeplyNestedHTML += "</div>"
9292

93-
// Should not panic and should return empty string on panic
93+
// Should not panic and should return empty string with error on panic
9494
result, err := htmlToText(deeplyNestedHTML)
95-
require.NoError(t, err)
95+
require.Error(t, err)
9696
require.Equal(t, "", result)
9797
})
9898

0 commit comments

Comments
 (0)