@@ -8,6 +8,7 @@ package main
88import (
99 "context"
1010 "errors"
11+ "fmt"
1112 "log/slog"
1213 "sync"
1314 "sync/atomic"
@@ -26,6 +27,8 @@ const (
2627 MaxJobRetries = 3
2728)
2829
30+ // Visitor is the main workhorse of the crawler. It consumes jobs from the
31+ // work queue and processes them with the help of the Collector.
2932type Visitor struct {
3033 id int
3134 runs * RunManager
@@ -35,6 +38,17 @@ type Visitor struct {
3538 logger * slog.Logger
3639}
3740
41+ // Code classifies how a failed visit was handled; failed returns it and process switches on it to decide the job's final state.
42+ type Code uint32
43+
44+ const (
45+ CodeUnknown Code = iota // Zero value; process treats it like an unhandled failure (default branch).
46+ CodeUnhandled // The failure could not be handled; failed returns it together with a non-nil error.
47+ CodePermanent // Handled by process like CodeUnhandled; NOTE(review): not returned by failed in the visible code.
48+ CodeTemporary // The job was republished for a later retry; process leaves it in the "Crawling" state.
49+ CodeIgnore // The failure is a skip rather than an error (302, 404); process marks the job as cancelled.
50+ )
51+
3852// Start launches the worker in a new goroutine.
3953func (w * Visitor ) Start (ctx context.Context , wg * sync.WaitGroup ) {
4054 go func () {
@@ -135,79 +149,76 @@ func (w *Visitor) process(ctx context.Context, job *ctrlq.VisitJob) error {
135149 return nil
136150 }
137151
138- if ! w .handleError (jctx , span , job , res , err , p , jlogger ) {
152+ span .AddEvent ("Visitor: URL visit failed." , t )
153+ jlogger .Debug ("Visitor: URL visit failed, handling ..." , "error" , err )
154+
155+ code , herr := w .failed (jctx , job , res , err )
156+ switch code {
157+ case CodeIgnore :
158+ p .Update (jctx , ProgressStateCancelled )
159+ span .End ()
139160 return nil
161+ case CodeTemporary :
162+ // Leave job in "Crawling" state, and process it later.
163+ span .End ()
164+ return nil // continue processing next job.
165+ default : // covers CodeUnknown, CodeUnhandled, CodePermanent
166+ jlogger .Error (
167+ "Visitor: Handling the failed visit error'ed." ,
168+ "code" , code ,
169+ "fail" , err ,
170+ "error" , herr ,
171+ )
172+ p .Update (jctx , ProgressStateErrored )
173+ span .RecordError (herr )
174+ span .End ()
175+ return herr // stop retrying processing this job.
140176 }
141-
142- p .Update (jctx , ProgressStateErrored )
143- jlogger .Error ("Visitor: Error visiting URL." , "error" , err )
144- span .RecordError (err )
145-
146- return nil
147177}
148178
149- // handleError handles errors that occur during URL visits
150- func (w * Visitor ) handleError (jctx context.Context , span trace.Span , job * ctrlq.VisitJob , res * collector.Response , err error , p * Progress , jlogger * slog.Logger ) bool {
151- t := trace .WithAttributes (attribute .String ("Url" , job .URL ))
179+ // failed handles failed URL visits. This handler has two return values.
180+ // One returns the status code indicating *how* and if the fail was handled. The other return value
181+ // is an error, if one occurred during handling. An error means the fail was not handled
182+ // properly and the handling failed.
183+ func (w * Visitor ) failed (ctx context.Context , job * ctrlq.VisitJob , res * collector.Response , err error ) (Code , error ) {
184+ attemptRepublish := func () (Code , error ) {
185+ if job .Retries >= MaxJobRetries {
186+ return CodeUnhandled , fmt .Errorf ("maximum retries reached" )
187+ }
188+ if err := w .queue .Republish (ctx , job ); err != nil {
189+ return CodeUnhandled , fmt .Errorf ("failed to republish job: %w" , err )
190+ }
191+ return CodeTemporary , nil
192+ }
152193
153194 if res != nil {
154195 // We have response information, use it to determine the correct error handling in detail.
155-
156196 switch res .StatusCode {
157197 case 302 : // Redirect
158198 // When a redirect is encountered, the visit errors out. This is in fact
159199 // no an actual error, but just a skip.
160- p .Update (jctx , ProgressStateCancelled )
161-
162- jlogger .Info ("Visitor: Skipped URL, got redirected." )
163- span .AddEvent ("Cancelled visiting URL" , t )
164-
165- span .End ()
166- return true
200+ return CodeIgnore , nil
167201 case 404 :
168- // FIXME: Probably want to lower the log level to Info for 404s here.
202+ return CodeIgnore , nil
169203 case 429 : // Too Many Requests
170- fallthrough
204+ return attemptRepublish ()
171205 case 503 : // Service Unavailable
172206 // Additionally want to retrieve the Retry-After header here and wait for that amount of time, if
173207 // we just have to wait than don't error out but reschedule the job and wait. In order to not do
174- // that infinitely, we should have a maximum number of retries.
175-
176- // Handling of Retry-After header is optional, so errors here
208+ // that infinitely, we should have a maximum number of retries. Handling of Retry-After header is optional, so errors here
177209 // are not critical.
178210 if v := res .Headers .Get ("Retry-After" ); v != "" {
179211 d , _ := time .ParseDuration (v + "s" )
180- w .queue .Pause (jctx , res .Request .URL .String (), d )
181- }
182-
183- if job .Retries < MaxJobRetries {
184- if err := w .queue .Republish (jctx , job ); err != nil {
185- jlogger .Warn ("Visitor: Republish failed, stopping retrying." , "error" , err )
186- } else {
187- // Leave job in "Crawling" state.
188- span .End ()
189- return true
190- }
191- } else {
192- jlogger .Warn ("Visitor: Maximum number of retries reached." )
212+ w .queue .Pause (ctx , res .Request .URL .String (), d )
193213 }
214+ return attemptRepublish ()
194215 default :
195- // Noop, fallthrough to generic error handling.
216+ return CodeUnhandled , fmt . Errorf ( "unhandled failed visit, unknown status code: %d" , res . StatusCode )
196217 }
197218 } else if errors .Is (err , context .DeadlineExceeded ) {
198219 // We react to timeouts as a temporary issue and retry the job, similary
199220 // to 429 and 503 errors.
200- if job .Retries < MaxJobRetries {
201- if err := w .queue .Republish (jctx , job ); err != nil {
202- jlogger .Warn ("Visitor: Republish failed, stopping retrying." , "error" , err )
203- } else {
204- // Leave job in "Crawling" state.
205- span .End ()
206- return true
207- }
208- } else {
209- jlogger .Warn ("Visitor: Maximum number of retries reached." )
210- }
221+ return attemptRepublish ()
211222 }
212- return true
223+ return CodeUnhandled , fmt . Errorf ( "unhandled failed visit" )
213224}
0 commit comments