Skip to content

Commit 172b1e2

Browse files
committed
Refactor failed visit handling
1 parent a836c1c commit 172b1e2

File tree

2 files changed

+61
-50
lines changed

2 files changed

+61
-50
lines changed

api.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ import (
1111
"fmt"
1212
"log/slog"
1313
"net/url"
14-
"os"
14+
"slices"
1515

1616
"github.com/google/uuid"
1717
)

visitor.go

Lines changed: 60 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ package main
88
import (
99
"context"
1010
"errors"
11+
"fmt"
1112
"log/slog"
1213
"sync"
1314
"sync/atomic"
@@ -26,6 +27,8 @@ const (
2627
MaxJobRetries = 3
2728
)
2829

30+
// Visitor is the main "work horse" of the crawler. It consumes jobs from the
31+
// work queue and processes them with the help of the Collector.
2932
type Visitor struct {
3033
id int
3134
runs *RunManager
@@ -35,6 +38,17 @@ type Visitor struct {
3538
logger *slog.Logger
3639
}
3740

41+
// Code is used when handling failed visits.
42+
type Code uint32
43+
44+
const (
45+
CodeUnknown Code = iota
46+
CodeUnhandled
47+
CodePermanent
48+
CodeTemporary
49+
CodeIgnore
50+
)
51+
3852
// Start launches the worker in a new goroutine.
3953
func (w *Visitor) Start(ctx context.Context, wg *sync.WaitGroup) {
4054
go func() {
@@ -135,79 +149,76 @@ func (w *Visitor) process(ctx context.Context, job *ctrlq.VisitJob) error {
135149
return nil
136150
}
137151

138-
if !w.handleError(jctx, span, job, res, err, p, jlogger) {
152+
span.AddEvent("Visitor: URL visit failed.", t)
153+
jlogger.Debug("Visitor: URL visit failed, handling ...", "error", err)
154+
155+
code, herr := w.failed(jctx, job, res, err)
156+
switch code {
157+
case CodeIgnore:
158+
p.Update(jctx, ProgressStateCancelled)
159+
span.End()
139160
return nil
161+
case CodeTemporary:
162+
// Leave job in "Crawling" state, and process it later.
163+
span.End()
164+
return nil // continue processing next job.
165+
default: // covers CodeUnknown, CodeUnhandled, CodePermanent
166+
jlogger.Error(
167+
"Visitor: Handling the failed visit error'ed.",
168+
"code", code,
169+
"fail", err,
170+
"error", herr,
171+
)
172+
p.Update(jctx, ProgressStateErrored)
173+
span.RecordError(herr)
174+
span.End()
175+
return herr // stop retrying processing this job.
140176
}
141-
142-
p.Update(jctx, ProgressStateErrored)
143-
jlogger.Error("Visitor: Error visiting URL.", "error", err)
144-
span.RecordError(err)
145-
146-
return nil
147177
}
148178

149-
// handleError handles errors that occur during URL visits
150-
func (w *Visitor) handleError(jctx context.Context, span trace.Span, job *ctrlq.VisitJob, res *collector.Response, err error, p *Progress, jlogger *slog.Logger) bool {
151-
t := trace.WithAttributes(attribute.String("Url", job.URL))
179+
// failed handles failed URL visits. This handler has two return values.
180+
// One is a status code indicating whether and *how* the failure was handled. The other return value
181+
// is an error, if one occurred during handling. An error means the fail was not handled
182+
// properly and the handling failed.
183+
func (w *Visitor) failed(ctx context.Context, job *ctrlq.VisitJob, res *collector.Response, err error) (Code, error) {
184+
attemptRepublish := func() (Code, error) {
185+
if job.Retries >= MaxJobRetries {
186+
return CodeUnhandled, fmt.Errorf("maximum retries reached")
187+
}
188+
if err := w.queue.Republish(ctx, job); err != nil {
189+
return CodeUnhandled, fmt.Errorf("failed to republish job: %w", err)
190+
}
191+
return CodeTemporary, nil
192+
}
152193

153194
if res != nil {
154195
// We have response information, use it to determine the correct error handling in detail.
155-
156196
switch res.StatusCode {
157197
case 302: // Redirect
158198
// When a redirect is encountered, the visit errors out. This is in fact
159199
// not an actual error, but just a skip.
160-
p.Update(jctx, ProgressStateCancelled)
161-
162-
jlogger.Info("Visitor: Skipped URL, got redirected.")
163-
span.AddEvent("Cancelled visiting URL", t)
164-
165-
span.End()
166-
return true
200+
return CodeIgnore, nil
167201
case 404:
168-
// FIXME: Probably want to lower the log level to Info for 404s here.
202+
return CodeIgnore, nil
169203
case 429: // Too Many Requests
170-
fallthrough
204+
return attemptRepublish()
171205
case 503: // Service Unavailable
172206
// Additionally want to retrieve the Retry-After header here and wait for that amount of time, if
173207
// we just have to wait, then don't error out but reschedule the job and wait. In order to not do
174-
// that infinitely, we should have a maximum number of retries.
175-
176-
// Handling of Retry-After header is optional, so errors here
208+
// that infinitely, we should have a maximum number of retries. Handling of Retry-After header is optional, so errors here
177209
// are not critical.
178210
if v := res.Headers.Get("Retry-After"); v != "" {
179211
d, _ := time.ParseDuration(v + "s")
180-
w.queue.Pause(jctx, res.Request.URL.String(), d)
181-
}
182-
183-
if job.Retries < MaxJobRetries {
184-
if err := w.queue.Republish(jctx, job); err != nil {
185-
jlogger.Warn("Visitor: Republish failed, stopping retrying.", "error", err)
186-
} else {
187-
// Leave job in "Crawling" state.
188-
span.End()
189-
return true
190-
}
191-
} else {
192-
jlogger.Warn("Visitor: Maximum number of retries reached.")
212+
w.queue.Pause(ctx, res.Request.URL.String(), d)
193213
}
214+
return attemptRepublish()
194215
default:
195-
// Noop, fallthrough to generic error handling.
216+
return CodeUnhandled, fmt.Errorf("unhandled failed visit, unknown status code: %d", res.StatusCode)
196217
}
197218
} else if errors.Is(err, context.DeadlineExceeded) {
198219
// We react to timeouts as a temporary issue and retry the job, similarly
199220
// to 429 and 503 errors.
200-
if job.Retries < MaxJobRetries {
201-
if err := w.queue.Republish(jctx, job); err != nil {
202-
jlogger.Warn("Visitor: Republish failed, stopping retrying.", "error", err)
203-
} else {
204-
// Leave job in "Crawling" state.
205-
span.End()
206-
return true
207-
}
208-
} else {
209-
jlogger.Warn("Visitor: Maximum number of retries reached.")
210-
}
221+
return attemptRepublish()
211222
}
212-
return true
223+
return CodeUnhandled, fmt.Errorf("unhandled failed visit")
213224
}

0 commit comments

Comments
 (0)