Skip to content

Commit e14423b

Browse files
committed
Work on cli mode, add memory progress reporter
1 parent 182bf22 commit e14423b

File tree

13 files changed

+366
-38
lines changed

13 files changed

+366
-38
lines changed

README.md

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,27 @@
22

33
Tobey is a throughput-optimizing but friendly web crawler that is scalable from a single instance to a cluster. It features intelligent rate limiting, distributed coordination, and flexible deployment options. Tobey honors resource exclusions in a robots.txt and tries its best not to overwhelm a host.
44

5-
## Getting Started
5+
## Quickstart
66

77
```sh
8-
go run . # Start the crawler.
9-
curl -X POST http://127.0.0.1:8080 -d 'https://www.example.org/' # Submit a crawl request.
8+
go run . # Start the crawler as a server.
9+
curl -X POST http://127.0.0.1:8080 -d 'https://www.example.org' # Submit a crawl request.
10+
```
11+
12+
```sh
13+
go build -o /usr/local/bin/tobey
14+
tobey -u https://example.org # Use the cli interface to submit a crawl request.
15+
```
16+
17+
## CLI Mode
18+
19+
Tobey offers an - albeit limited - CLI mode that allows you to run ad hoc crawls. Target URLs can be provided via the `-u` flag. By default, results
20+
will be saved in the current directory. Use the `-o` flag to specify a different output directory. For all remaining options, please review the CLI help via `-h`.
21+
22+
```sh
23+
tobey -h
24+
tobey -u https://example.org
25+
tobey -u https://example.org/blog,https://example.org/values -o results
1026
```
1127

1228
## Submitting Crawl Requests
@@ -279,7 +295,7 @@ example configuration.
279295
| `TOBEY_PORT`, `-port` | `8080` | `1-65535` | Port to bind the HTTP server to. Alternatively you can use the `-port` command line flag. |
280296
| `TOBEY_WORKERS`, `-w`| `5` | `1-128` | Number of workers to start. |
281297
| `TOBEY_REDIS_DSN` | empty | e.g. `redis://localhost:6379` | DSN to reach a Redis instance for coordinating multiple instances. |
282-
| `TOBEY_PROGRESS_DSN` | `noop://` | `factorial://host:port`, `console://`, `noop://` | DSN for progress reporting service. |
298+
| `TOBEY_PROGRESS_DSN` | `noop://` | `memory://`, `factorial://host:port`, `console://`, `noop://` | DSN for progress reporting service. |
283299
| `TOBEY_RESULT_REPORTER_DSN` | `disk://results` | `disk:///path`, `webhook://host/path`, `noop://` | DSN specifying where crawl results should be stored. |
284300
| `TOBEY_TELEMETRY`, `-telemetry` | empty | `metrics`, `traces`, `pulse` | Space separated list of what kind of telemetry is emitted. |
285301

api.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -253,7 +253,7 @@ func (req *ConsoleRequest) GetOutputDir() string {
253253
if req.OutputDir != "" {
254254
p = req.OutputDir
255255
} else {
256-
p = "results"
256+
p = "."
257257
}
258258

259259
abs, _ := filepath.Abs(p)

configure.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ func configure() (flagDaemon bool, req ConsoleRequest) {
3838
var flagOutputDir string
3939

4040
flag.StringVar(&flagURLs, "u", "", "Comma separated list of URls to crawl (cli only)")
41-
flag.StringVar(&flagOutputDir, "o", "results", "Directory to store results (cli only)")
41+
flag.StringVar(&flagOutputDir, "o", ".", "Directory to store results (cli only, defaults to current directory)")
4242

4343
// parse
4444
flag.Parse()
@@ -77,7 +77,7 @@ func configure() (flagDaemon bool, req ConsoleRequest) {
7777
ListenPort = p
7878
}
7979

80-
if isFlagPassed("workers") {
80+
if isFlagPassed("w") {
8181
NumVisitWorkers = flagWorkers
8282
} else if v := os.Getenv("TOBEY_WORKERS"); v != "" {
8383
p, err := strconv.Atoi(v)

go.mod

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -66,14 +66,16 @@ require (
6666
github.com/bits-and-blooms/bitset v1.13.0 // indirect
6767
github.com/bmatcuk/doublestar/v4 v4.8.1 // indirect
6868
github.com/cespare/xxhash/v2 v2.3.0 // indirect
69-
github.com/charmbracelet/lipgloss v0.13.0 // indirect
70-
github.com/charmbracelet/x/ansi v0.1.4 // indirect
69+
github.com/charmbracelet/lipgloss v1.0.0 // indirect
70+
github.com/charmbracelet/log v0.4.1 // indirect
71+
github.com/charmbracelet/x/ansi v0.4.2 // indirect
7172
github.com/charmbracelet/x/input v0.1.0 // indirect
7273
github.com/charmbracelet/x/term v0.1.1 // indirect
7374
github.com/charmbracelet/x/windows v0.1.0 // indirect
7475
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect
7576
github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f // indirect
7677
github.com/felixge/httpsnoop v1.0.4 // indirect
78+
github.com/go-logfmt/logfmt v0.6.0 // indirect
7779
github.com/go-logr/logr v1.4.2 // indirect
7880
github.com/go-logr/stdr v1.2.2 // indirect
7981
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
@@ -87,7 +89,7 @@ require (
8789
github.com/mattn/go-runewidth v0.0.16 // indirect
8890
github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 // indirect
8991
github.com/muesli/cancelreader v0.2.2 // indirect
90-
github.com/muesli/termenv v0.15.2 // indirect
92+
github.com/muesli/termenv v0.16.0 // indirect
9193
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
9294
github.com/prometheus/client_model v0.6.1 // indirect
9395
github.com/prometheus/common v0.55.0 // indirect
@@ -102,7 +104,7 @@ require (
102104
go.opentelemetry.io/proto/otlp v1.3.1 // indirect
103105
golang.org/x/exp v0.0.0-20240823005443-9b4947da3948 // indirect
104106
golang.org/x/sync v0.8.0 // indirect
105-
golang.org/x/sys v0.24.0 // indirect
107+
golang.org/x/sys v0.30.0 // indirect
106108
golang.org/x/text v0.16.0 // indirect
107109
google.golang.org/genproto/googleapis/api v0.0.0-20240701130421-f6361c86f094 // indirect
108110
google.golang.org/genproto/googleapis/rpc v0.0.0-20240701130421-f6361c86f094 // indirect

go.sum

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,8 +70,14 @@ github.com/charmbracelet/bubbletea v0.27.1 h1:/yhaJKX52pxG4jZVKCNWj/oq0QouPdXycr
7070
github.com/charmbracelet/bubbletea v0.27.1/go.mod h1:xc4gm5yv+7tbniEvQ0naiG9P3fzYhk16cTgDZQQW6YE=
7171
github.com/charmbracelet/lipgloss v0.13.0 h1:4X3PPeoWEDCMvzDvGmTajSyYPcZM4+y8sCA/SsA3cjw=
7272
github.com/charmbracelet/lipgloss v0.13.0/go.mod h1:nw4zy0SBX/F/eAO1cWdcvy6qnkDUxr8Lw7dvFrAIbbY=
73+
github.com/charmbracelet/lipgloss v1.0.0 h1:O7VkGDvqEdGi93X+DeqsQ7PKHDgtQfF8j8/O2qFMQNg=
74+
github.com/charmbracelet/lipgloss v1.0.0/go.mod h1:U5fy9Z+C38obMs+T+tJqst9VGzlOYGj4ri9reL3qUlo=
75+
github.com/charmbracelet/log v0.4.1 h1:6AYnoHKADkghm/vt4neaNEXkxcXLSV2g1rdyFDOpTyk=
76+
github.com/charmbracelet/log v0.4.1/go.mod h1:pXgyTsqsVu4N9hGdHmQ0xEA4RsXof402LX9ZgiITn2I=
7377
github.com/charmbracelet/x/ansi v0.1.4 h1:IEU3D6+dWwPSgZ6HBH+v6oUuZ/nVawMiWj5831KfiLM=
7478
github.com/charmbracelet/x/ansi v0.1.4/go.mod h1:dk73KoMTT5AX5BsX0KrqhsTqAnhZZoCBjs7dGWp4Ktw=
79+
github.com/charmbracelet/x/ansi v0.4.2 h1:0JM6Aj/g/KC154/gOP4vfxun0ff6itogDYk41kof+qk=
80+
github.com/charmbracelet/x/ansi v0.4.2/go.mod h1:dk73KoMTT5AX5BsX0KrqhsTqAnhZZoCBjs7dGWp4Ktw=
7581
github.com/charmbracelet/x/input v0.1.0 h1:TEsGSfZYQyOtp+STIjyBq6tpRaorH0qpwZUj8DavAhQ=
7682
github.com/charmbracelet/x/input v0.1.0/go.mod h1:ZZwaBxPF7IG8gWWzPUVqHEtWhc1+HXJPNuerJGRGZ28=
7783
github.com/charmbracelet/x/term v0.1.1 h1:3cosVAiPOig+EV4X9U+3LDgtwwAoEzJjNdwbXDjF6yI=
@@ -89,6 +95,8 @@ github.com/fatih/color v1.16.0 h1:zmkK9Ngbjj+K0yRhTVONQh1p/HknKYSlNT+vZCzyokM=
8995
github.com/fatih/color v1.16.0/go.mod h1:fL2Sau1YI5c0pdGEVCbKQbLXB6edEj1ZgiY4NijnWvE=
9096
github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
9197
github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
98+
github.com/go-logfmt/logfmt v0.6.0 h1:wGYYu3uicYdqXVgoYbvnkrPVXkuLM1p1ifugDMEdRi4=
99+
github.com/go-logfmt/logfmt v0.6.0/go.mod h1:WYhtIu8zTZfxdn5+rREduYbwxfcBr/Vr6KEVveWlfTs=
92100
github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
93101
github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY=
94102
github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
@@ -140,6 +148,8 @@ github.com/muesli/cancelreader v0.2.2 h1:3I4Kt4BQjOR54NavqnDogx/MIoWBFa0StPA8ELU
140148
github.com/muesli/cancelreader v0.2.2/go.mod h1:3XuTXfFS2VjM+HTLZY9Ak0l6eUKfijIfMUZ4EgX0QYo=
141149
github.com/muesli/termenv v0.15.2 h1:GohcuySI0QmI3wN8Ok9PtKGkgkFIk7y6Vpb5PvrY+Wo=
142150
github.com/muesli/termenv v0.15.2/go.mod h1:Epx+iuz8sNs7mNKhxzH4fWXGNpZwUaJKRS1noLXviQ8=
151+
github.com/muesli/termenv v0.16.0 h1:S5AlUN9dENB57rsbnkPyfdGuWIlkmzJjbFf0Tf5FWUc=
152+
github.com/muesli/termenv v0.16.0/go.mod h1:ZRfOIKPFDYQoDFF4Olj7/QJbW60Ol/kL1pU3VfY/Cnk=
143153
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
144154
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
145155
github.com/nlnwa/whatwg-url v0.4.1 h1:m0+XWylS9IuCPd5GMW2lzmSI9ssSwynT3nug0p3bUIo=
@@ -176,6 +186,7 @@ github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UV
176186
github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
177187
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
178188
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
189+
github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
179190
github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg=
180191
github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
181192
github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e h1:JVG44RsyaB9T2KIHavMF/ppJZNG9ZpyihvCd0w101no=
@@ -243,6 +254,8 @@ golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
243254
golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
244255
golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg=
245256
golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
257+
golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc=
258+
golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
246259
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
247260
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
248261
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=

main.go

Lines changed: 29 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import (
1717
"time"
1818
"tobey/internal/ctrlq"
1919

20+
charmlog "github.com/charmbracelet/log"
2021
"github.com/mariuswilms/tears"
2122
_ "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
2223
"go.opentelemetry.io/contrib/instrumentation/runtime"
@@ -200,7 +201,23 @@ func service() {
200201
down(context.Background())
201202
}
202203

204+
// cli runs tobey in cli mode. This preconfigures tobey in a way
205+
// that makes sense for the cli, i.e. outputting logs to the console.
203206
func cli(req ConsoleRequest) {
207+
// Sync log levels, limited.
208+
var charmlogLevel charmlog.Level
209+
if Debug {
210+
charmlogLevel = charmlog.DebugLevel
211+
} else {
212+
charmlogLevel = charmlog.InfoLevel
213+
}
214+
charmLogger := charmlog.NewWithOptions(os.Stdout, charmlog.Options{
215+
ReportTimestamp: false,
216+
ReportCaller: false,
217+
Level: charmlogLevel,
218+
})
219+
slog.SetDefault(slog.New(charmLogger))
220+
204221
tear, down := tears.New()
205222
ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt)
206223
defer stop()
@@ -229,10 +246,7 @@ func cli(req ConsoleRequest) {
229246
if err != nil {
230247
panic(err)
231248
}
232-
progress, err := CreateProgressReporter("noop://")
233-
if err != nil {
234-
panic(err)
235-
}
249+
progress := NewMemoryProgressReporter()
236250

237251
vpool := NewVisitorPool(
238252
ctx,
@@ -259,12 +273,18 @@ func cli(req ConsoleRequest) {
259273
},
260274
}
261275

262-
slog.Info("Performing run...")
263-
runs.Add(ctx, run)
264-
run.Start(ctx, queue, rr, progress, req.GetURLs(true))
276+
slog.Info("Performing run, please hit STRG+C to cancel and exit.")
265277

266-
<-ctx.Done()
278+
runs.Add(ctx, run)
279+
go run.Start(ctx, queue, rr, progress, req.GetURLs(true))
280+
281+
select {
282+
case <-ctx.Done():
283+
slog.Info("Cancelled, exiting...")
284+
case <-progress.IsRunFinished(run.ID):
285+
slog.Info("Run finished, exiting...")
286+
stop() // Safe to call multiple times.
287+
}
267288

268-
slog.Info("Exiting...")
269289
down(context.Background())
270290
}

progress.go

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"fmt"
1111
"log/slog"
1212
"net/url"
13+
"time"
1314
)
1415

1516
// ProgressStatus represents the valid states for progress updates.
@@ -56,6 +57,9 @@ func CreateProgressReporter(dsn string) (ProgressReporter, error) {
5657
host: u.Host,
5758
client: CreateRetryingHTTPClient(NoAuthFn, UserAgent),
5859
}, nil
60+
case "memory":
61+
slog.Info("Progress Reporter: Using Memory for progress updates.")
62+
return &MemoryProgressReporter{}, nil
5963
case "noop":
6064
slog.Info("Progress Reporter: Disabled, not sharing progress updates.")
6165
return &NoopProgressReporter{}, nil
@@ -82,18 +86,22 @@ type Progress struct {
8286
}
8387

8488
type ProgressUpdate struct {
85-
Run *Run
86-
URL string
87-
Stage string
88-
Status ProgressStatus
89+
Run *Run
90+
URL string
91+
Stage string
92+
Status ProgressStatus
93+
Created time.Time
8994
}
9095

9196
// Update updates the progress with a new status
9297
func (p *Progress) Update(ctx context.Context, status ProgressStatus) error {
98+
slog.Debug("Progress: Updating status", "run", p.Run.ID, "url", p.URL, "stage", p.Stage, "status", status)
99+
93100
return p.reporter.Call(ctx, ProgressUpdate{
94-
Run: p.Run,
95-
URL: p.URL,
96-
Stage: p.Stage,
97-
Status: status,
101+
Run: p.Run,
102+
URL: p.URL,
103+
Stage: p.Stage,
104+
Status: status,
105+
Created: time.Now(),
98106
})
99107
}

0 commit comments

Comments
 (0)