File tree Expand file tree Collapse file tree 2 files changed +10
-9
lines changed
common/pagetypeclassifier Expand file tree Collapse file tree 2 files changed +10
-9
lines changed Original file line number Diff line number Diff line change @@ -4,10 +4,11 @@ import (
44 _ "embed"
55 "sync"
66
7- "github.com/microcosm-cc/bluemonday"
7+ "fmt"
8+
89 htmltomarkdown "github.com/JohannesKaufmann/html-to-markdown/v2"
10+ "github.com/microcosm-cc/bluemonday"
911 "github.com/projectdiscovery/utils/ml/naive_bayes"
10- "fmt"
1112)
1213
1314//go:embed clf.gob
@@ -63,28 +64,28 @@ func getSanitizerPolicy() *bluemonday.Policy {
6364// Strategy:
6465// 1. Always sanitize HTML with bluemonday first to remove useless elements and reduce nesting
6566// 2. Convert sanitized HTML to markdown
66- // 3. If conversion panics, recover and return empty string
67+ // 3. If conversion panics, recover and return empty string with error
6768func htmlToText (html string ) (text string , err error ) {
6869 defer func () {
6970 if r := recover (); r != nil {
7071 err = fmt .Errorf ("html parser panic: %v" , r )
7172 text = ""
7273 }
7374 }()
74-
75+
7576 // First, sanitize HTML with bluemonday to strip useless elements and reduce nesting
7677 sanitizedHTML := getSanitizerPolicy ().Sanitize (html )
77-
78+
7879 // If sanitization failed or produced an empty result, return an empty string and no error
7980 if sanitizedHTML == "" {
8081 return "" , nil
8182 }
82-
83+
8384 // Convert sanitized HTML to markdown
8485 text , err = htmltomarkdown .ConvertString (sanitizedHTML )
8586 if err != nil || text == "" {
8687 return "" , err
8788 }
88-
89+
8990 return
9091}
Original file line number Diff line number Diff line change @@ -90,9 +90,9 @@ func TestPageTypeClassifier(t *testing.T) {
9090 }
9191 deeplyNestedHTML += "</div>"
9292
93- // Should not panic and should return empty string on panic
93+ // The parser panic must not propagate: htmlToText should recover and return an empty string together with an error
9494 result , err := htmlToText (deeplyNestedHTML )
95- require .NoError (t , err )
95+ require .Error (t , err )
9696 require .Equal (t , "" , result )
9797 })
9898
You can’t perform that action at this time.
0 commit comments