Skip to content

Commit eeb495e

Browse files
committed
Fix UA robots check
1 parent 7fd06b9 commit eeb495e

File tree

7 files changed

+46
-9
lines changed

7 files changed

+46
-9
lines changed

go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ require (
1212
github.com/gorilla/sessions v1.4.0
1313
github.com/gorilla/websocket v1.5.3
1414
github.com/microcosm-cc/bluemonday v1.0.27
15+
github.com/mileusna/useragent v1.3.5
1516
github.com/oxffaa/gopher-parse-sitemap v0.0.0-20191021113419-005d2eb1def4
1617
github.com/slyrz/warc v0.0.0-20150806225202-a50edd19b690
1718
github.com/spf13/viper v1.20.1

go.sum

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,8 @@ github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw=
7878
github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o=
7979
github.com/microcosm-cc/bluemonday v1.0.27 h1:MpEUotklkwCSLeH+Qdx1VJgNqLlpY2KXwXFM08ygZfk=
8080
github.com/microcosm-cc/bluemonday v1.0.27/go.mod h1:jFi9vgW+H7c3V0lb6nR74Ib/DIB5OBs92Dimizgw2cA=
81+
github.com/mileusna/useragent v1.3.5 h1:SJM5NzBmh/hO+4LGeATKpaEX9+b4vcGg2qXGLiNGDws=
82+
github.com/mileusna/useragent v1.3.5/go.mod h1:3d8TOmwL/5I8pJjyVDteHtgDGcefrFUX4ccGOMKNYYc=
8183
github.com/moby/docker-image-spec v1.3.1 h1:jMKff3w6PgbfSa69GfNg+zN/XLhfXJGnEx3Nl2EsFP0=
8284
github.com/moby/docker-image-spec v1.3.1/go.mod h1:eKmb5VW8vQEh/BAr2yvVNvuiJuY6UIocYsFu/DxxRpo=
8385
github.com/moby/term v0.5.0 h1:xt8Q1nalod/v7BqbG21f8mQPqH+xAaC9C3N3wfWbVP0=

internal/crawler/basic_client.go

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ import (
66
"net/http/httptrace"
77
"net/url"
88
"time"
9+
10+
"github.com/mileusna/useragent"
911
)
1012

1113
type HTTPRequester interface {
@@ -15,6 +17,7 @@ type HTTPRequester interface {
1517
type BasicClient struct {
1618
Options *ClientOptions
1719
client HTTPRequester
20+
uaName string
1821
}
1922

2023
type ClientOptions struct {
@@ -25,10 +28,15 @@ type ClientOptions struct {
2528
}
2629

2730
func NewBasicClient(options *ClientOptions, client HTTPRequester) *BasicClient {
28-
return &BasicClient{
31+
bc := &BasicClient{
2932
Options: options,
3033
client: client,
3134
}
35+
36+
parsedUA := useragent.Parse(options.UserAgent)
37+
bc.uaName = parsedUA.Name
38+
39+
return bc
3240
}
3341

3442
// Makes a request with the method specified in the method parameter to the specified URL.
@@ -97,7 +105,7 @@ func (c *BasicClient) do(req *http.Request) (*ClientResponse, error) {
97105
return cr, nil
98106
}
99107

100-
// GetUA returns the user-agent set for this client.
101-
func (c *BasicClient) GetUA() string {
102-
return c.Options.UserAgent
108+
// GetUAName returns the user-agent name for this client.
109+
func (c *BasicClient) GetUAName() string {
110+
return c.uaName
103111
}

internal/crawler/basic_client_test.go

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,3 +145,29 @@ func TestHTTPError(t *testing.T) {
145145
t.Fatal("expected an error, got none")
146146
}
147147
}
148+
149+
// Test UA name
150+
func TestUAName(t *testing.T) {
151+
table := []struct {
152+
UA string
153+
Name string
154+
}{
155+
{"Mozilla/5.0 (compatible; SEOnautBot/1.0; +https://seonaut.org/bot)", "SEOnautBot"},
156+
{"Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Chrome/W.X.Y.Z Safari/537.36", "Googlebot"},
157+
{"Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Bingbot/2.0; +http://www.bing.com/bingbot.htm) Chrome/W.X.Y.Z Safari/537.36", "Bingbot"},
158+
}
159+
160+
for _, e := range table {
161+
options := &crawler.ClientOptions{
162+
UserAgent: e.UA,
163+
}
164+
165+
mockClient := &mockClient{ForceError: true}
166+
client := crawler.NewBasicClient(options, mockClient)
167+
168+
if client.GetUAName() != e.Name {
169+
t.Errorf("Expected UA name: %s got: %s", e.Name, client.GetUAName())
170+
}
171+
172+
}
173+
}

internal/crawler/crawler.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ var ErrDomainNotAllowed = errors.New("domain not allowed")
3636
type Client interface {
3737
Get(urlStr string) (*ClientResponse, error)
3838
Head(urlStr string) (*ClientResponse, error)
39-
GetUA() string
39+
GetUAName() string
4040
}
4141

4242
type ResponseCallback func(r *ResponseMessage)

internal/crawler/robots_checker.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ func (r *RobotsChecker) IsBlocked(u *url.URL) bool {
3434
path += "?" + u.Query().Encode()
3535
}
3636

37-
return !robot.TestAgent(path, r.client.GetUA())
37+
return !robot.TestAgent(path, r.client.GetUAName())
3838
}
3939

4040
// Returns true if the robots.txt file exists and is valid

internal/crawler/robots_checker_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ func (t *MockClient) Get(u string) (*crawler.ClientResponse, error) {
2020
r := &http.Response{}
2121
if strings.HasPrefix(u, "https://example.com/") {
2222
body := `
23-
User-Agent: *
23+
User-Agent: SEOnautBot
2424
Disallow: /disallowed
2525
Sitemap: /sitemap.xml
2626
`
@@ -34,8 +34,8 @@ func (t *MockClient) Get(u string) (*crawler.ClientResponse, error) {
3434
return &crawler.ClientResponse{Response: r}, nil
3535
}
3636

37-
func (t *MockClient) GetUA() string {
38-
return "TEST UA"
37+
func (t *MockClient) GetUAName() string {
38+
return "SEOnautBot"
3939
}
4040

4141
// TestIsBlocked tests URLs allowed and disallowed in the robots.txt file.

0 commit comments

Comments (0)