Skip to content

Commit c6948ed

Browse files
authored
Merge pull request #2309 from GDATTACKER-RESEARCHER/dev
Improve error handling in htmlToText function
2 parents 45d8476 + 05c6364 commit c6948ed

File tree

4 files changed

+26
-20
lines changed

4 files changed

+26
-20
lines changed

common/httpx/httpx.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package httpx
22

33
import (
4+
"context"
45
"crypto/tls"
56
"fmt"
67
"io"
@@ -25,7 +26,6 @@ import (
2526
pdhttputil "github.com/projectdiscovery/utils/http"
2627
stringsutil "github.com/projectdiscovery/utils/strings"
2728
urlutil "github.com/projectdiscovery/utils/url"
28-
"golang.org/x/net/context"
2929
"golang.org/x/net/http2"
3030
)
3131

common/pagetypeclassifier/pagetypeclassifier.go

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -14,26 +14,23 @@ type PageTypeClassifier struct {
1414
classifier *naive_bayes.NaiveBayesClassifier
1515
}
1616

17-
func New() *PageTypeClassifier {
17+
func New() (*PageTypeClassifier, error) {
1818
classifier, err := naive_bayes.NewClassifierFromFileData(classifierData)
1919
if err != nil {
20-
panic(err)
20+
return nil, err
2121
}
22-
return &PageTypeClassifier{classifier: classifier}
22+
return &PageTypeClassifier{classifier: classifier}, nil
2323
}
2424

2525
func (n *PageTypeClassifier) Classify(html string) string {
26-
text := htmlToText(html)
27-
if text == "" {
26+
text, err := htmlToText(html)
27+
if err != nil || text == "" {
2828
return "other"
2929
}
3030
return n.classifier.Classify(text)
3131
}
3232

33-
func htmlToText(html string) string {
34-
text, err := htmltomarkdown.ConvertString(html)
35-
if err != nil {
36-
panic(err)
37-
}
38-
return text
33+
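// htmlToText converts HTML to Markdown text and returns any conversion error to the caller instead of panicking on it; it does not itself recover from panics raised inside the underlying parser.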
34+
func htmlToText(html string) (string, error) {
35+
return htmltomarkdown.ConvertString(html)
3936
}

common/pagetypeclassifier/pagetypeclassifier_test.go

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,19 +3,22 @@ package pagetypeclassifier
33
import (
44
"testing"
55

6-
"github.com/stretchr/testify/assert"
6+
"github.com/stretchr/testify/require"
77
)
88

99
func TestPageTypeClassifier(t *testing.T) {
1010

1111
t.Run("test creation of new PageTypeClassifier", func(t *testing.T) {
12-
epc := New()
13-
assert.NotNil(t, epc)
12+
epc, err := New()
13+
require.NoError(t, err)
14+
require.NotNil(t, epc)
1415
})
1516

1617
t.Run("test classification non error page text", func(t *testing.T) {
17-
epc := New()
18-
assert.Equal(t, "nonerror", epc.Classify(`<!DOCTYPE html>
18+
epc, err := New()
19+
require.NoError(t, err)
20+
require.NotNil(t, epc)
21+
require.Equal(t, "nonerror", epc.Classify(`<!DOCTYPE html>
1922
<html lang="en">
2023
<head>
2124
<meta charset="UTF-8">
@@ -30,8 +33,10 @@ func TestPageTypeClassifier(t *testing.T) {
3033
})
3134

3235
t.Run("test classification on error page text", func(t *testing.T) {
33-
epc := New()
34-
assert.Equal(t, "error", epc.Classify(`<!DOCTYPE html>
36+
epc, err := New()
37+
require.NoError(t, err)
38+
require.NotNil(t, epc)
39+
require.Equal(t, "error", epc.Classify(`<!DOCTYPE html>
3540
<html>
3641
<head>
3742
<title>Error 403: Forbidden</title>

runner/runner.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -385,7 +385,11 @@ func New(options *Options) (*Runner, error) {
385385
}
386386

387387
runner.simHashes = gcache.New[uint64, struct{}](1000).ARC().Build()
388-
runner.pageTypeClassifier = pagetypeclassifier.New()
388+
pageTypeClassifier, err := pagetypeclassifier.New()
389+
if err != nil {
390+
return nil, err
391+
}
392+
runner.pageTypeClassifier = pageTypeClassifier
389393

390394
if options.HttpApiEndpoint != "" {
391395
apiServer := NewServer(options.HttpApiEndpoint, options)

0 commit comments

Comments
 (0)