Skip to content

Commit d68423d

Browse files
committed
Update README, align features
- Remove the SkipRobots feature; we always want to be friendly, and there is no realistic scenario where I would want to have this. - Remove the SkipSitemapDiscovery feature; instead, plan to implement discovery sources later on, which better fits the original use case. - Rename the domains config to aliases, as this is its purpose. - Path constraints are now controlled via ignores.
1 parent 136e17b commit d68423d

File tree

10 files changed

+232
-243
lines changed

10 files changed

+232
-243
lines changed

README.md

Lines changed: 173 additions & 155 deletions
Large diffs are not rendered by default.

api.go

Lines changed: 21 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -66,19 +66,10 @@ type APIRequest struct {
6666
URL string `json:"url"`
6767
URLs []string `json:"urls"`
6868

69-
AllowDomains []string `json:"domains"`
70-
AllowPaths []string `json:"paths"`
71-
DenyPaths []string `json:"!paths"`
69+
AliasedDomains []string `json:"aliases"`
70+
IgnorePaths []string `json:"ignores"`
7271

73-
UserAgent string `json:"user_agent"`
74-
75-
// If true, we'll bypass the robots.txt check, however we'll still
76-
// download the file to look for sitemaps.
77-
SkipRobots bool `json:"skip_robots"`
78-
79-
// If true we'll not use any sitemaps found automatically, only those that
80-
// have been explicitly provided.
81-
SkipSitemapDiscovery bool `json:"skip_sitemap_discovery"`
72+
UserAgent string `json:"ua"`
8273

8374
// A list of authentication configurations, that are used in the run.
8475
AuthConfigs []*AuthConfig `json:"auth"`
@@ -126,37 +117,33 @@ func (req *APIRequest) GetURLs(clean bool) []string {
126117
}
127118

128119
func (req *APIRequest) GetAllowDomains() []string {
129-
// Ensure at least the URL host is in allowed domains.
130120
var domains []string
131-
if req.AllowDomains != nil {
132-
domains = req.AllowDomains
133-
} else {
134-
for _, u := range req.GetURLs(false) {
135-
p, err := url.Parse(u)
136-
if err != nil {
137-
slog.Error("Failed to parse URL from request, not allowing that domain.", "url", u, "error", err)
138-
continue
139-
}
140-
domains = append(domains, p.Hostname())
121+
122+
// The domains of the targets are always allowed.
123+
for _, u := range req.GetURLs(false) {
124+
p, err := url.Parse(u)
125+
if err != nil {
126+
slog.Error("Failed to parse URL from request, not allowing that domain.", "url", u, "error", err)
127+
continue
141128
}
129+
domains = append(domains, p.Hostname())
142130
}
143-
return domains
144-
}
145131

146-
func (req *APIRequest) GetAllowPaths() []string {
147-
var paths []string
148-
149-
if req.AllowPaths != nil {
150-
paths = req.AllowPaths
132+
if req.AliasedDomains != nil {
133+
domains = append(domains, req.AliasedDomains...)
151134
}
152-
return paths
135+
136+
return domains
153137
}
154138

155-
func (req *APIRequest) GetDenyPaths() []string {
139+
func (req *APIRequest) GetIgnorePaths() []string {
156140
var paths []string
157141

158-
if req.DenyPaths != nil {
159-
paths = req.DenyPaths
142+
if req.IgnorePaths == nil {
143+
return paths
144+
}
145+
for _, p := range req.IgnorePaths {
146+
paths = append(paths, p)
160147
}
161148
return paths
162149
}
@@ -228,14 +215,3 @@ type APIResponse struct {
228215
type APIError struct {
229216
Message string `json:"message"`
230217
}
231-
232-
type CloudEventJson struct {
233-
Specversion string `json:"specversion"`
234-
Event_type string `json:"type"`
235-
Source string `json:"source"`
236-
Subject string `json:"subject"`
237-
Id string `json:"id"`
238-
Time string `json:"time"`
239-
Datacontenttype string `json:"datacontenttype"`
240-
Data string `json:"data"`
241-
}

go.mod

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ require (
4646
github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect
4747
github.com/beorn7/perks v1.0.1 // indirect
4848
github.com/bits-and-blooms/bitset v1.13.0 // indirect
49+
github.com/bmatcuk/doublestar/v4 v4.8.1 // indirect
4950
github.com/cespare/xxhash/v2 v2.3.0 // indirect
5051
github.com/charmbracelet/lipgloss v0.13.0 // indirect
5152
github.com/charmbracelet/x/ansi v0.1.4 // indirect
@@ -75,6 +76,7 @@ require (
7576
github.com/prometheus/procfs v0.15.1 // indirect
7677
github.com/redis/go-redis/extra/rediscmd/v9 v9.5.3 // indirect
7778
github.com/rivo/uniseg v0.4.7 // indirect
79+
github.com/sabhiram/go-gitignore v0.0.0-20210923224102-525f6e181f06 // indirect
7880
github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect
7981
github.com/yuin/gopher-lua v1.1.1 // indirect
8082
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.28.0 // indirect

go.sum

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
1818
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
1919
github.com/bits-and-blooms/bitset v1.13.0 h1:bAQ9OPNFYbGHV6Nez0tmNI0RiEu7/hxlYJRUA0wFAVE=
2020
github.com/bits-and-blooms/bitset v1.13.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
21+
github.com/bmatcuk/doublestar/v4 v4.8.1 h1:54Bopc5c2cAvhLRAzqOGCYHYyhcDHsFF4wWIR5wKP38=
22+
github.com/bmatcuk/doublestar/v4 v4.8.1/go.mod h1:xBQ8jztBU6kakFMg+8WGxn0c6z1fTSPVIjEY1Wr7jzc=
2123
github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs=
2224
github.com/bsm/ginkgo/v2 v2.12.0/go.mod h1:SwYbGRRDovPVboqFv0tPTcG1sN61LM1Z4ARdbAV9g4c=
2325
github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA=
@@ -129,10 +131,13 @@ github.com/redis/go-redis/v9 v9.5.4/go.mod h1:hdY0cQFCN4fnSYT6TkisLufl/4W5UIXyv0
129131
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
130132
github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
131133
github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
134+
github.com/sabhiram/go-gitignore v0.0.0-20210923224102-525f6e181f06 h1:OkMGxebDjyw0ULyrTYWeN0UNCCkmCWfjPnIA2W6oviI=
135+
github.com/sabhiram/go-gitignore v0.0.0-20210923224102-525f6e181f06/go.mod h1:+ePHsJ1keEjQtpvf9HHw0f4ZeJ0TLRsxhunSI2hYJSs=
132136
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d h1:hrujxIzL1woJ7AwssoOcM/tq5JjjG2yYOc8odClEiXA=
133137
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU=
134138
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
135139
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
140+
github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
136141
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
137142
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
138143
github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg=
@@ -233,5 +238,7 @@ google.golang.org/grpc v1.64.0 h1:KH3VH9y/MgNQg1dE7b3XfVK0GsPSIzJwdF617gUSbvY=
233238
google.golang.org/grpc v1.64.0/go.mod h1:oxjF8E3FBnjp+/gVFYdWacaLDx9na1aqy9oovLpxQYg=
234239
google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg=
235240
google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw=
241+
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
242+
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
236243
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
237244
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

internal/collector/collector.go

Lines changed: 3 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,10 @@ import (
1313
"log/slog"
1414
"net/http"
1515
"net/url"
16-
"regexp"
1716
"strings"
1817

1918
whatwgUrl "github.com/nlnwa/whatwg-url/url"
19+
ignore "github.com/sabhiram/go-gitignore"
2020
)
2121

2222
type EnqueueFn func(ctx context.Context, c *Collector, u string) error // Enqueues a scrape.
@@ -87,8 +87,7 @@ func NewCollector(
8787

8888
type Collector struct {
8989
AllowDomains []string // AllowedDomains is a domain allowlist.
90-
AllowPaths []string
91-
DenyPaths []string
90+
IgnorePaths []string
9291

9392
// UserAgent is the User-Agent string used by HTTP requests
9493
UserAgent string
@@ -308,17 +307,7 @@ func (c *Collector) IsVisitAllowed(in string) (bool, error) {
308307
return true
309308
}
310309
}
311-
for _, allow := range c.AllowPaths {
312-
if ok, err := regexp.MatchString(allow, u.Path); !ok || err != nil {
313-
return false
314-
}
315-
}
316-
for _, deny := range c.DenyPaths {
317-
if ok, err := regexp.MatchString(deny, u.Path); ok || err != nil {
318-
return false
319-
}
320-
}
321-
return true
310+
return !ignore.CompileIgnoreLines(c.IgnorePaths...).MatchesPath(u.Path)
322311
}
323312
if !checkPath(p) {
324313
return false, ErrForbiddenPath

results.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,14 @@ import (
1212
"tobey/internal/collector"
1313
)
1414

15+
type DiscoverySource string
16+
17+
const (
18+
DiscoverySourceSitemap DiscoverySource = "sitemap"
19+
DiscoverySourceRobots DiscoverySource = "robots"
20+
DiscoverySourceLink DiscoverySource = "link"
21+
)
22+
1523
// ResultReporter is a function type that can be used to report the result of a crawl. It comes
1624
// with a preconfigured config.
1725
type ResultReporter func(ctx context.Context, run *Run, res *collector.Response) error

results_disk.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ import (
2020
)
2121

2222
type diskResult struct {
23-
Run string `json:"run_uuid"`
23+
Run string `json:"run"`
2424
RunMetadata interface{} `json:"run_metadata,omitempty"`
2525
RequestURL string `json:"request_url"`
2626
ResponseBody []byte `json:"response_body"` // Will be base64 encoded when JSON marshalled.

results_webhook.go

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,12 @@ import (
1919
var webhookHTTPClient = CreateRetryingHTTPClient(NoAuthFn, UserAgent)
2020

2121
type webhookResult struct {
22-
Run string `json:"run_uuid"`
23-
RunMetadata interface{} `json:"run_metadata,omitempty"`
24-
RequestURL string `json:"request_url"`
25-
ResponseBody []byte `json:"response_body"` // Will be base64 encoded when JSON marshalled.
26-
ResponseStatusCode int `json:"response_status_code"`
22+
Run string `json:"run_uuid"`
23+
RunMetadata interface{} `json:"run_metadata,omitempty"`
24+
// DiscoveredBy []DiscoverySource `json:"discovered_by"` // TODO: Implement.
25+
RequestURL string `json:"request_url"`
26+
ResponseBody []byte `json:"response_body"` // Will be base64 encoded when JSON marshalled.
27+
ResponseStatusCode int `json:"response_status_code"`
2728
}
2829

2930
// WebhookResultReporterConfig defines the configuration for webhook endpoints

routes.go

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,12 @@ func setupRoutes(runs *RunManager, queue ctrlq.VisitWorkQueue, rr ResultReporter
4040

4141
var req APIRequest
4242
if bytes.HasPrefix(body, []byte("http://")) || bytes.HasPrefix(body, []byte("https://")) {
43-
// As a special case, and to support minimalism, we allow directly
44-
// posting a single URL.
45-
req.URL = string(body)
43+
// Support both single URL and newline-delimited URLs in plaintext for minimalism.
44+
urls := bytes.Split(bytes.TrimSpace(body), []byte("\n"))
45+
46+
for _, url := range urls {
47+
req.URLs = append(req.URLs, string(bytes.TrimSpace(url)))
48+
}
4649
} else {
4750
err := json.Unmarshal(body, &req)
4851
if err != nil {
@@ -89,14 +92,10 @@ func setupRoutes(runs *RunManager, queue ctrlq.VisitWorkQueue, rr ResultReporter
8992
AuthConfigs: req.GetAuthConfigs(),
9093

9194
AllowDomains: req.GetAllowDomains(),
92-
AllowPaths: req.GetAllowPaths(),
93-
DenyPaths: req.GetDenyPaths(),
95+
IgnorePaths: req.GetIgnorePaths(),
9496

9597
UserAgent: req.GetUserAgent(),
9698

97-
SkipRobots: req.SkipRobots,
98-
SkipSitemapDiscovery: req.SkipSitemapDiscovery,
99-
10099
ResultReporterDSN: req.ResultReporterDSN,
101100
},
102101
}

run.go

Lines changed: 4 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -42,14 +42,10 @@ type SerializableRun struct {
4242
AuthConfigs []*AuthConfig
4343

4444
AllowDomains []string
45-
AllowPaths []string
46-
DenyPaths []string
45+
IgnorePaths []string
4746

4847
UserAgent string
4948

50-
SkipRobots bool
51-
SkipSitemapDiscovery bool
52-
5349
ResultReporterDSN string
5450
}
5551

@@ -179,9 +175,6 @@ func (r *Run) GetCollector(ctx context.Context, q ctrlq.VisitWorkQueue, rr Resul
179175
// must ensure that this Client isn't shared with i.e. the Robots instance.
180176
r.GetClient(),
181177
func(a string, u string) (bool, error) {
182-
if r.SkipRobots {
183-
return true, nil
184-
}
185178
return r.robots.Check(u, r.getAuthFn(), a)
186179
},
187180
getEnqueueFn(r, q, p),
@@ -190,8 +183,7 @@ func (r *Run) GetCollector(ctx context.Context, q ctrlq.VisitWorkQueue, rr Resul
190183

191184
c.UserAgent = r.UserAgent
192185
c.AllowDomains = r.AllowDomains
193-
c.AllowPaths = r.AllowPaths
194-
c.DenyPaths = r.DenyPaths
186+
c.IgnorePaths = r.IgnorePaths
195187

196188
return c
197189
}
@@ -213,11 +205,8 @@ func (r *Run) Start(ctx context.Context, q ctrlq.VisitWorkQueue, rr ResultReport
213205
}
214206
}
215207

216-
// This only skips *automatic* sitemap discovery, if the user provided sitemaps we still want to crawl them.
217-
if !r.SkipSitemapDiscovery {
218-
for _, u := range r.sitemaps.Discover(ctx, r.getAuthFn(), r.UserAgent, urls) {
219-
r.sitemaps.Drain(context.WithoutCancel(ctx), r.getAuthFn(), r.UserAgent, u, c.Enqueue)
220-
}
208+
for _, u := range r.sitemaps.Discover(ctx, r.getAuthFn(), r.UserAgent, urls) {
209+
r.sitemaps.Drain(context.WithoutCancel(ctx), r.getAuthFn(), r.UserAgent, u, c.Enqueue)
221210
}
222211
}
223212

0 commit comments

Comments
 (0)