Skip to content

Commit 9a6cad9

Browse files
authored
[refactor] - Rename S3 ProgressTracker (#3652)
* rename * update * fix typo
1 parent 3c69bbc commit 9a6cad9

File tree

2 files changed

+136
-102
lines changed

2 files changed

+136
-102
lines changed

pkg/sources/s3/progress_tracker.go renamed to pkg/sources/s3/checkpointer.go

Lines changed: 63 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -11,16 +11,12 @@ import (
1111
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
1212
)
1313

14-
// ProgressTracker maintains scan progress state for S3 bucket scanning,
14+
// Checkpointer maintains resumption state for S3 bucket scanning,
1515
// enabling resumable scans by tracking which objects have been successfully processed.
1616
// It provides checkpoints that can be used to resume interrupted scans without missing objects.
1717
//
1818
// S3 buckets are organized as flat namespaces of objects identified by unique keys.
19-
// When listing objects, S3 returns paginated results with a maximum of 1000 objects per page.
20-
// The ListObjectsV2 API accepts a 'StartAfter' parameter that allows resuming the listing
21-
// from a specific object key.
22-
//
23-
// The tracker maintains state for the current page of objects (up to 1000) using a boolean array
19+
// The checkpointer maintains state for the current page of objects (up to 1000) using a boolean array
2420
// to track completion status and an ordered list to record the sequence of completions.
2521
// This enables finding the highest consecutive completed index as a "low water mark".
2622
//
@@ -41,28 +37,38 @@ import (
4137
// Page 1 (objects 0-999): Fully processed, checkpoint saved at object 999
4238
// Page 2 (objects 1000-1999): Partially processed through 1600, but only consecutive through 1499
4339
// On resume: StartAfter=object1499 in saved bucket, scanning continues from object 1500
44-
type ProgressTracker struct {
40+
//
41+
// Important constraints:
42+
// - Only tracks completion state for a single page of objects (up to 1000)
43+
// - Supports concurrent object processing within a page
44+
// - Does NOT support concurrent page processing
45+
// - Must be Reset() between pages
46+
type Checkpointer struct {
4547
enabled bool
4648

4749
// completedObjects tracks which indices in the current page have been processed.
48-
sync.Mutex
50+
mu sync.Mutex // protects concurrent access to completion state.
4951
completedObjects []bool
5052
completionOrder []int // Track the order in which objects complete
5153

54+
// lowestIncompleteIdx tracks the first index that hasn't been completed.
55+
// This optimizes checkpoint creation by avoiding recalculation.
56+
lowestIncompleteIdx int
57+
5258
// progress holds the scan's overall progress state and enables persistence.
5359
// The EncodedResumeInfo field stores the JSON-encoded ResumeInfo checkpoint.
5460
progress *sources.Progress // Reference to source's Progress
5561
}
5662

5763
const defaultMaxObjectsPerPage = 1000
5864

59-
// NewProgressTracker creates a new progress tracker for S3 scanning operations.
60-
// The enabled parameter determines if progress tracking is active, and progress
65+
// NewCheckpointer creates a new checkpointer for S3 scanning operations.
66+
// The enabled parameter determines if checkpointing is active, and progress
6167
// provides the underlying mechanism for persisting scan state.
62-
func NewProgressTracker(ctx context.Context, enabled bool, progress *sources.Progress) *ProgressTracker {
63-
ctx.Logger().Info("Creating progress tracker")
68+
func NewCheckpointer(ctx context.Context, enabled bool, progress *sources.Progress) *Checkpointer {
69+
ctx.Logger().Info("Creating checkpointer")
6470

65-
return &ProgressTracker{
71+
return &Checkpointer{
6672
// Pre-allocate completion state sized for a single page of objects.
6773
completedObjects: make([]bool, defaultMaxObjectsPerPage),
6874
completionOrder: make([]int, 0, defaultMaxObjectsPerPage),
@@ -72,16 +78,18 @@ func NewProgressTracker(ctx context.Context, enabled bool, progress *sources.Pro
7278
}
7379

7480
// Reset prepares the checkpointer for a new page of objects by clearing the completion state.
75-
func (p *ProgressTracker) Reset() {
81+
// Must be called before processing each new page of objects.
82+
func (p *Checkpointer) Reset() {
7683
if !p.enabled {
7784
return
7885
}
7986

80-
p.Lock()
81-
defer p.Unlock()
87+
p.mu.Lock()
88+
defer p.mu.Unlock()
8289
// Allocate fresh completion state for the next page.
8390
p.completedObjects = make([]bool, defaultMaxObjectsPerPage)
8491
p.completionOrder = make([]int, 0, defaultMaxObjectsPerPage)
92+
p.lowestIncompleteIdx = 0
8593
}
8694

8795
// ResumeInfo represents the state needed to resume an interrupted operation.
@@ -92,11 +100,11 @@ type ResumeInfo struct {
92100
StartAfter string `json:"start_after"` // Last processed object key
93101
}
94102

95-
// GetResumePoint retrieves the last saved checkpoint state if one exists.
103+
// ResumePoint retrieves the last saved checkpoint state if one exists.
96104
// It returns an empty ResumeInfo if checkpointing is disabled or no resume state exists.
97105
// This method decodes the stored resume information and validates it contains
98106
// the minimum required data to enable resumption.
99-
func (p *ProgressTracker) GetResumePoint(ctx context.Context) (ResumeInfo, error) {
107+
func (p *Checkpointer) ResumePoint(ctx context.Context) (ResumeInfo, error) {
100108
resume := ResumeInfo{}
101109

102110
if !p.enabled || p.progress.EncodedResumeInfo == "" {
@@ -118,7 +126,7 @@ func (p *ProgressTracker) GetResumePoint(ctx context.Context) (ResumeInfo, error
118126

119127
// Complete marks the entire scanning operation as finished and clears the resume state.
120128
// This should only be called once all scanning operations are complete.
121-
func (p *ProgressTracker) Complete(_ context.Context, message string) error {
129+
func (p *Checkpointer) Complete(_ context.Context, message string) error {
122130
// Preserve existing progress counters while clearing resume state.
123131
p.progress.SetProgressComplete(
124132
int(p.progress.SectionsCompleted),
@@ -129,14 +137,11 @@ func (p *ProgressTracker) Complete(_ context.Context, message string) error {
129137
return nil
130138
}
131139

132-
// UpdateObjectProgress records successfully processed objects within the current page
140+
// UpdateObjectCompletion records successfully processed objects within the current page
133141
// and maintains fine-grained resumption checkpoints. It uses a conservative tracking
134142
// strategy that ensures no objects are missed by only checkpointing consecutively
135143
// completed objects.
136144
//
137-
// This method manages the detailed object-level progress tracking and creates
138-
// checkpoints that enable resumption of interrupted scans.
139-
//
140145
// This approach ensures scan reliability by only checkpointing consecutively completed
141146
// objects. While this may result in re-scanning some objects when resuming, it guarantees
142147
// no objects are missed in case of interruption.
@@ -146,10 +151,13 @@ func (p *ProgressTracker) Complete(_ context.Context, message string) error {
146151
// - Objects completed: [0,1,2,3,4,5,7,8]
147152
// - The checkpoint will only include objects 0-5 since they are consecutive
148153
// - If scanning is interrupted and resumed:
149-
// - Scan resumes after object 5 (the last checkpoint)
150-
// - Objects 7-8 will be re-scanned even though they completed before
151-
// - This ensures object 6 is not missed
152-
func (p *ProgressTracker) UpdateObjectProgress(
154+
// -- Scan resumes after object 5 (the last checkpoint)
155+
// -- Objects 7-8 will be re-scanned even though they completed before
156+
// -- This ensures object 6 is not missed
157+
//
158+
// Thread-safe for concurrent object processing within a single page.
159+
// WARNING: Not safe for concurrent page processing.
160+
func (p *Checkpointer) UpdateObjectCompletion(
153161
ctx context.Context,
154162
completedIdx int,
155163
bucket string,
@@ -166,46 +174,48 @@ func (p *ProgressTracker) UpdateObjectProgress(
166174
return fmt.Errorf("completed index %d exceeds maximum page size", completedIdx)
167175
}
168176

169-
p.Lock()
170-
defer p.Unlock()
177+
p.mu.Lock()
178+
defer p.mu.Unlock()
171179

172-
// Only track completion if this is the first time this index is marked complete.
180+
// Only process if this is the first time this index is marked complete.
173181
if !p.completedObjects[completedIdx] {
174182
p.completedObjects[completedIdx] = true
175183
p.completionOrder = append(p.completionOrder, completedIdx)
176-
}
177-
178-
// Find the highest safe checkpoint we can create.
179-
lastSafeIdx := -1
180-
var safeIndices [defaultMaxObjectsPerPage]bool
181184

182-
// Mark all completed indices.
183-
for _, idx := range p.completionOrder {
184-
safeIndices[idx] = true
185+
// If we completed the lowest incomplete index, scan forward to find the new lowest.
186+
if completedIdx == p.lowestIncompleteIdx {
187+
p.advanceLowestIncompleteIdx()
188+
}
185189
}
186190

187-
// Find the highest consecutive completed index.
188-
for i := range len(p.completedObjects) {
189-
if !safeIndices[i] {
190-
break
191-
}
192-
lastSafeIdx = i
191+
// lowestIncompleteIdx points to first incomplete object, so everything before
192+
// it is complete. We want to checkpoint at the last complete object.
193+
checkpointIdx := p.lowestIncompleteIdx - 1
194+
if checkpointIdx < 0 {
195+
return nil // No completed objects yet
193196
}
197+
obj := pageContents[checkpointIdx]
194198

195-
// Update progress if we have at least one completed object.
196-
if lastSafeIdx < 0 {
197-
return nil
199+
return p.updateCheckpoint(bucket, *obj.Key)
200+
}
201+
202+
// advanceLowestIncompleteIdx moves the lowest incomplete index forward to the next incomplete object.
203+
// Must be called with lock held.
204+
func (p *Checkpointer) advanceLowestIncompleteIdx() {
205+
for p.lowestIncompleteIdx < len(p.completedObjects) &&
206+
p.completedObjects[p.lowestIncompleteIdx] {
207+
p.lowestIncompleteIdx++
198208
}
209+
}
199210

200-
obj := pageContents[lastSafeIdx]
201-
info := &ResumeInfo{CurrentBucket: bucket, StartAfter: *obj.Key}
202-
encoded, err := json.Marshal(info)
211+
// updateCheckpoint persists the current resumption state.
212+
// Must be called with lock held.
213+
func (p *Checkpointer) updateCheckpoint(bucket string, lastKey string) error {
214+
encoded, err := json.Marshal(&ResumeInfo{CurrentBucket: bucket, StartAfter: lastKey})
203215
if err != nil {
204-
return err
216+
return fmt.Errorf("failed to encode resume info: %w", err)
205217
}
206218

207-
// Purposefully avoid updating any progress counts.
208-
// Only update resume info.
209219
p.progress.SetProgressComplete(
210220
int(p.progress.SectionsCompleted),
211221
int(p.progress.SectionsRemaining),

0 commit comments

Comments
 (0)