@@ -48,7 +48,6 @@ type Source struct {
 	concurrency int
 	conn        *sourcespb.S3

-	checkpointer *Checkpointer
 	sources.Progress
 	metricsCollector metricsCollector

@@ -95,7 +94,6 @@ func (s *Source) Init(
 	}
 	s.conn = &conn

-	s.checkpointer = NewCheckpointer(ctx, &s.Progress)
 	s.metricsCollector = metricsInstance

 	s.setMaxObjectSize(conn.GetMaxObjectSize())
@@ -299,7 +297,8 @@ func (s *Source) scanBuckets(
 	}
 	var totalObjectCount uint64

-	pos := determineResumePosition(ctx, s.checkpointer, bucketsToScan)
+	checkpointer := NewCheckpointer(ctx, &s.Progress, false)
+	pos := determineResumePosition(ctx, checkpointer, bucketsToScan)
 	switch {
 	case pos.isNewScan:
 		ctx.Logger().Info("Starting new scan from beginning")
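Annotation: NewCheckpointer now takes a third argument. Read together with the ChunkUnit hunk further down (which replaces SetIsUnitScan(true) with NewCheckpointer(ctx, &s.Progress, true)), the boolean appears to fold the old unit-scan toggle into construction. A minimal sketch of the assumed constructor; the field names are invented for illustration, and the Checkpointer struct is sketched after the pageChunker hunks below:

func NewCheckpointer(ctx context.Context, progress *sources.Progress, isUnitScan bool) *Checkpointer {
	// isUnitScan replaces the former SetIsUnitScan setter (an assumption
	// based on the ChunkUnit hunk); progress is where resume info lives.
	return &Checkpointer{progress: progress, isUnitScan: isUnitScan}
}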
@@ -340,7 +339,7 @@ func (s *Source) scanBuckets(
 			)
 		}

-		objectCount := s.scanBucket(ctx, client, role, bucket, sources.ChanReporter{Ch: chunksChan}, startAfter)
+		objectCount := s.scanBucket(ctx, client, role, bucket, sources.ChanReporter{Ch: chunksChan}, startAfter, checkpointer)
 		totalObjectCount += objectCount
 	}

@@ -359,6 +358,7 @@ func (s *Source) scanBucket(
 	bucket string,
 	reporter sources.ChunkReporter,
 	startAfter *string,
+	checkpointer *Checkpointer,
 ) uint64 {
 	s.metricsCollector.RecordBucketForRole(role)

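Annotation: with the new trailing parameter, the full scanBucket signature implied by the call sites in this diff looks like the following. The types of the first two parameters are assumptions; the hunks only show the later ones:

func (s *Source) scanBucket(
	ctx context.Context, // trufflehog's context wrapper
	client *s3.Client,   // assumed: aws-sdk-go-v2 S3 client
	role string,
	bucket string,
	reporter sources.ChunkReporter,
	startAfter *string,
	checkpointer *Checkpointer, // added by this change
) uint64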
@@ -412,7 +412,7 @@ func (s *Source) scanBucket(
 			errorCount:  &errorCount,
 			objectCount: &objectCount,
 		}
-		s.pageChunker(ctx, pageMetadata, processingState, reporter)
+		s.pageChunker(ctx, pageMetadata, processingState, reporter, checkpointer)

 		pageNumber++
 	}
@@ -458,8 +458,9 @@ func (s *Source) pageChunker(
 	metadata pageMetadata,
 	state processingState,
 	reporter sources.ChunkReporter,
+	checkpointer *Checkpointer,
 ) {
-	s.checkpointer.Reset() // Reset the checkpointer for each PAGE
+	checkpointer.Reset() // Reset the checkpointer for each PAGE
 	ctx = context.WithValues(ctx, "bucket", metadata.bucket, "page_number", metadata.pageNumber)
 	for objIdx, obj := range metadata.page.Contents {
 		ctx = context.WithValues(ctx, "key", *obj.Key, "size", *obj.Size)
@@ -471,7 +472,7 @@
 		if obj.StorageClass == s3types.ObjectStorageClassGlacier || obj.StorageClass == s3types.ObjectStorageClassGlacierIr {
 			ctx.Logger().V(5).Info("Skipping object in storage class", "storage_class", obj.StorageClass)
 			s.metricsCollector.RecordObjectSkipped(metadata.bucket, "storage_class", float64(*obj.Size))
-			if err := s.checkpointer.UpdateObjectCompletion(ctx, objIdx, metadata.bucket, metadata.role, metadata.page.Contents); err != nil {
+			if err := checkpointer.UpdateObjectCompletion(ctx, objIdx, metadata.bucket, metadata.role, metadata.page.Contents); err != nil {
 				ctx.Logger().Error(err, "could not update progress for glacier object")
 			}
 			continue
@@ -481,7 +482,7 @@
 		if *obj.Size > s.maxObjectSize {
 			ctx.Logger().V(5).Info("Skipping large file", "max_object_size", s.maxObjectSize)
 			s.metricsCollector.RecordObjectSkipped(metadata.bucket, "size_limit", float64(*obj.Size))
-			if err := s.checkpointer.UpdateObjectCompletion(ctx, objIdx, metadata.bucket, metadata.role, metadata.page.Contents); err != nil {
+			if err := checkpointer.UpdateObjectCompletion(ctx, objIdx, metadata.bucket, metadata.role, metadata.page.Contents); err != nil {
 				ctx.Logger().Error(err, "could not update progress for large file")
 			}
 			continue
@@ -491,7 +492,7 @@
 		if *obj.Size == 0 {
 			ctx.Logger().V(5).Info("Skipping empty file")
 			s.metricsCollector.RecordObjectSkipped(metadata.bucket, "empty_file", 0)
-			if err := s.checkpointer.UpdateObjectCompletion(ctx, objIdx, metadata.bucket, metadata.role, metadata.page.Contents); err != nil {
+			if err := checkpointer.UpdateObjectCompletion(ctx, objIdx, metadata.bucket, metadata.role, metadata.page.Contents); err != nil {
 				ctx.Logger().Error(err, "could not update progress for empty file")
 			}
 			continue
@@ -501,7 +502,7 @@
 		if common.SkipFile(*obj.Key) {
 			ctx.Logger().V(5).Info("Skipping file with incompatible extension")
 			s.metricsCollector.RecordObjectSkipped(metadata.bucket, "incompatible_extension", float64(*obj.Size))
-			if err := s.checkpointer.UpdateObjectCompletion(ctx, objIdx, metadata.bucket, metadata.role, metadata.page.Contents); err != nil {
+			if err := checkpointer.UpdateObjectCompletion(ctx, objIdx, metadata.bucket, metadata.role, metadata.page.Contents); err != nil {
 				ctx.Logger().Error(err, "could not update progress for incompatible file")
 			}
 			continue
@@ -613,7 +614,7 @@ func (s *Source) pageChunker(
 			state.errorCount.Store(prefix, 0)
 		}
 		// Update progress after successful processing.
-		if err := s.checkpointer.UpdateObjectCompletion(ctx, objIdx, metadata.bucket, metadata.role, metadata.page.Contents); err != nil {
+		if err := checkpointer.UpdateObjectCompletion(ctx, objIdx, metadata.bucket, metadata.role, metadata.page.Contents); err != nil {
 			ctx.Logger().Error(err, "could not update progress for scanned object")
 		}
 		s.metricsCollector.RecordObjectScanned(metadata.bucket, float64(*obj.Size))
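Annotation: every skip path (Glacier storage class, size limit, empty file, incompatible extension) and the success path now checkpoint through the parameter rather than the removed field. A minimal sketch of the Checkpointer surface these hunks rely on; the completion-tracking internals are assumptions, not taken from this diff:

type Checkpointer struct {
	progress   *sources.Progress
	isUnitScan bool
	completed  []bool // assumed: per-object completion for the current page
}

// Reset clears per-page state; pageChunker calls it once at the top of
// each page.
func (c *Checkpointer) Reset() { c.completed = nil }

// UpdateObjectCompletion marks the object at objIdx on the current page as
// handled (scanned or skipped) and persists resume info into c.progress.
func (c *Checkpointer) UpdateObjectCompletion(
	ctx context.Context,
	objIdx int,
	bucket, role string,
	contents []s3types.Object,
) error {
	if c.completed == nil {
		c.completed = make([]bool, len(contents))
	}
	c.completed[objIdx] = true
	// ... encode bucket/role and the last contiguous key into c.progress
	// (elided; not visible in this diff)
	return nil
}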
@@ -744,7 +745,7 @@ func (s *Source) ChunkUnit(ctx context.Context, unit sources.SourceUnit, reporte
 		return fmt.Errorf("could not create s3 client for bucket %s and role %s: %w", s3unit.Bucket, s3unit.Role, err)
 	}

-	s.checkpointer.SetIsUnitScan(true)
+	checkpointer := NewCheckpointer(ctx, &s.Progress, true)

 	var startAfterPtr *string
 	startAfter := s.Progress.GetEncodedResumeInfoFor(unitID)
@@ -757,7 +758,7 @@ func (s *Source) ChunkUnit(ctx context.Context, unit sources.SourceUnit, reporte
 		startAfterPtr = &startAfter
 	}
 	defer s.Progress.ClearEncodedResumeInfoFor(unitID)
-	s.scanBucket(ctx, defaultClient, s3unit.Role, s3unit.Bucket, reporter, startAfterPtr)
+	s.scanBucket(ctx, defaultClient, s3unit.Role, s3unit.Bucket, reporter, startAfterPtr, checkpointer)
 	return nil
 }

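Annotation: the net effect is an ownership change. The checkpointer moves from a long-lived field on Source to a value scoped to each scan: scanBuckets builds one per enumeration scan, and every ChunkUnit call builds its own, flagged as a unit scan. That removes the shared mutable s.checkpointer (and the SetIsUnitScan call) that concurrent unit scans would otherwise race on. A hypothetical caller illustrating why that matters; units, src, and reporter are stand-ins, not names from this diff:

var wg sync.WaitGroup
for _, unit := range units {
	wg.Add(1)
	go func(u sources.SourceUnit) {
		defer wg.Done()
		// Each call constructs its own Checkpointer, so these goroutines
		// no longer mutate a single s.checkpointer field.
		_ = src.ChunkUnit(ctx, u, reporter)
	}(unit)
}
wg.Wait()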