@@ -11,16 +11,12 @@ import (
11
11
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
12
12
)
13
13
14
- // ProgressTracker maintains scan progress state for S3 bucket scanning,
14
+ // Checkpointer maintains resumption state for S3 bucket scanning,
15
15
// enabling resumable scans by tracking which objects have been successfully processed.
16
16
// It provides checkpoints that can be used to resume interrupted scans without missing objects.
17
17
//
18
18
// S3 buckets are organized as flat namespaces of objects identified by unique keys.
19
- // When listing objects, S3 returns paginated results with a maximum of 1000 objects per page.
20
- // The ListObjectsV2 API accepts a 'StartAfter' parameter that allows resuming the listing
21
- // from a specific object key.
22
- //
23
- // The tracker maintains state for the current page of objects (up to 1000) using a boolean array
19
+ // The checkpointer maintains state for the current page of objects (up to 1000) using a boolean array
24
20
// to track completion status and an ordered list to record the sequence of completions.
25
21
// This enables finding the highest consecutive completed index as a "low water mark".
26
22
//
@@ -41,28 +37,38 @@ import (
41
37
// Page 1 (objects 0-999): Fully processed, checkpoint saved at object 999
42
38
// Page 2 (objects 1000-1999): Partially processed through 1600, but only consecutive through 1499
43
39
// On resume: StartAfter=object1499 in saved bucket, scanning continues from object 1500
44
- type ProgressTracker struct {
40
+ //
41
+ // Important constraints:
42
+ // - Only tracks completion state for a single page of objects (up to 1000)
43
+ // - Supports concurrent object processing within a page
44
+ // - Does NOT support concurrent page processing
45
+ // - Must be Reset() between pages
46
+ type Checkpointer struct {
45
47
enabled bool
46
48
47
49
// completedObjects tracks which indices in the current page have been processed.
48
- sync.Mutex
50
+ mu sync.Mutex // protects concurrent access to completion state.
49
51
completedObjects []bool
50
52
completionOrder []int // Track the order in which objects complete
51
53
54
+ // lowestIncompleteIdx tracks the first index that hasn't been completed.
55
+ // This optimizes checkpoint creation by avoiding recalculation.
56
+ lowestIncompleteIdx int
57
+
52
58
// progress holds the scan's overall progress state and enables persistence.
53
59
// The EncodedResumeInfo field stores the JSON-encoded ResumeInfo checkpoint.
54
60
progress * sources.Progress // Reference to source's Progress
55
61
}
56
62
57
63
// defaultMaxObjectsPerPage is the size used for the per-page completion
// tracking slices (completedObjects and completionOrder).
const defaultMaxObjectsPerPage = 1000
58
64
59
- // NewProgressTracker creates a new progress tracker for S3 scanning operations.
60
- // The enabled parameter determines if progress tracking is active, and progress
65
+ // NewCheckpointer creates a new checkpointer for S3 scanning operations.
66
+ // The enabled parameter determines if checkpointing is active, and progress
61
67
// provides the underlying mechanism for persisting scan state.
62
- func NewProgressTracker (ctx context.Context , enabled bool , progress * sources.Progress ) * ProgressTracker {
63
- ctx .Logger ().Info ("Creating progress tracker " )
68
+ func NewCheckpointer (ctx context.Context , enabled bool , progress * sources.Progress ) * Checkpointer {
69
+ ctx .Logger ().Info ("Creating checkpointer " )
64
70
65
- return & ProgressTracker {
71
+ return & Checkpointer {
66
72
// We are resuming if we have completed objects from a previous scan.
67
73
completedObjects : make ([]bool , defaultMaxObjectsPerPage ),
68
74
completionOrder : make ([]int , 0 , defaultMaxObjectsPerPage ),
@@ -72,16 +78,18 @@ func NewProgressTracker(ctx context.Context, enabled bool, progress *sources.Pro
72
78
}
73
79
74
80
// Reset prepares the tracker for a new page of objects by clearing the completion state.
75
- func (p * ProgressTracker ) Reset () {
81
+ // Must be called before processing each new page of objects.
82
+ func (p * Checkpointer ) Reset () {
76
83
if ! p .enabled {
77
84
return
78
85
}
79
86
80
- p .Lock ()
81
- defer p .Unlock ()
87
+ p .mu . Lock ()
88
+ defer p .mu . Unlock ()
82
89
// Store the current completed count before moving to next page.
83
90
p .completedObjects = make ([]bool , defaultMaxObjectsPerPage )
84
91
p .completionOrder = make ([]int , 0 , defaultMaxObjectsPerPage )
92
+ p .lowestIncompleteIdx = 0
85
93
}
86
94
87
95
// ResumeInfo represents the state needed to resume an interrupted operation.
@@ -92,11 +100,11 @@ type ResumeInfo struct {
92
100
StartAfter string `json:"start_after"` // Last processed object key
93
101
}
94
102
95
- // GetResumePoint retrieves the last saved checkpoint state if one exists.
103
+ // ResumePoint retrieves the last saved checkpoint state if one exists.
96
104
// It returns an empty ResumeInfo and a nil error if checkpointing is disabled or no resume state exists.
97
105
// This method decodes the stored resume information and validates it contains
98
106
// the minimum required data to enable resumption.
99
- func (p * ProgressTracker ) GetResumePoint (ctx context.Context ) (ResumeInfo , error ) {
107
+ func (p * Checkpointer ) ResumePoint (ctx context.Context ) (ResumeInfo , error ) {
100
108
resume := ResumeInfo {}
101
109
102
110
if ! p .enabled || p .progress .EncodedResumeInfo == "" {
@@ -118,7 +126,7 @@ func (p *ProgressTracker) GetResumePoint(ctx context.Context) (ResumeInfo, error
118
126
119
127
// Complete marks the entire scanning operation as finished and clears the resume state.
120
128
// This should only be called once all scanning operations are complete.
121
- func (p * ProgressTracker ) Complete (_ context.Context , message string ) error {
129
+ func (p * Checkpointer ) Complete (_ context.Context , message string ) error {
122
130
// Preserve existing progress counters while clearing resume state.
123
131
p .progress .SetProgressComplete (
124
132
int (p .progress .SectionsCompleted ),
@@ -129,14 +137,11 @@ func (p *ProgressTracker) Complete(_ context.Context, message string) error {
129
137
return nil
130
138
}
131
139
132
- // UpdateObjectProgress records successfully processed objects within the current page
140
+ // UpdateObjectCompletion records successfully processed objects within the current page
133
141
// and maintains fine-grained resumption checkpoints. It uses a conservative tracking
134
142
// strategy that ensures no objects are missed by only checkpointing consecutively
135
143
// completed objects.
136
144
//
137
- // This method manages the detailed object-level progress tracking and creates
138
- // checkpoints that enable resumption of interrupted scans.
139
- //
140
145
// This approach ensures scan reliability by only checkpointing consecutively completed
141
146
// objects. While this may result in re-scanning some objects when resuming, it guarantees
142
147
// no objects are missed in case of interruption.
@@ -146,10 +151,13 @@ func (p *ProgressTracker) Complete(_ context.Context, message string) error {
146
151
// - Objects completed: [0,1,2,3,4,5,7,8]
147
152
// - The checkpoint will only include objects 0-5 since they are consecutive
148
153
// - If scanning is interrupted and resumed:
149
- // - Scan resumes after object 5 (the last checkpoint)
150
- // - Objects 7-8 will be re-scanned even though they completed before
151
- // - This ensures object 6 is not missed
152
- func (p * ProgressTracker ) UpdateObjectProgress (
154
+ // -- Scan resumes after object 5 (the last checkpoint)
155
+ // -- Objects 7-8 will be re-scanned even though they completed before
156
+ // -- This ensures object 6 is not missed
157
+ //
158
+ // Thread-safe for concurrent object processing within a single page.
159
+ // WARNING: Not safe for concurrent page processing.
160
+ func (p * Checkpointer ) UpdateObjectCompletion (
153
161
ctx context.Context ,
154
162
completedIdx int ,
155
163
bucket string ,
@@ -166,46 +174,48 @@ func (p *ProgressTracker) UpdateObjectProgress(
166
174
return fmt .Errorf ("completed index %d exceeds maximum page size" , completedIdx )
167
175
}
168
176
169
- p .Lock ()
170
- defer p .Unlock ()
177
+ p .mu . Lock ()
178
+ defer p .mu . Unlock ()
171
179
172
- // Only track completion if this is the first time this index is marked complete.
180
+ // Only process if this is the first time this index is marked complete.
173
181
if ! p .completedObjects [completedIdx ] {
174
182
p .completedObjects [completedIdx ] = true
175
183
p .completionOrder = append (p .completionOrder , completedIdx )
176
- }
177
-
178
- // Find the highest safe checkpoint we can create.
179
- lastSafeIdx := - 1
180
- var safeIndices [defaultMaxObjectsPerPage ]bool
181
184
182
- // Mark all completed indices.
183
- for _ , idx := range p .completionOrder {
184
- safeIndices [idx ] = true
185
+ // If we completed the lowest incomplete index, scan forward to find the new lowest.
186
+ if completedIdx == p .lowestIncompleteIdx {
187
+ p .advanceLowestIncompleteIdx ()
188
+ }
185
189
}
186
190
187
- // Find the highest consecutive completed index.
188
- for i := range len (p .completedObjects ) {
189
- if ! safeIndices [i ] {
190
- break
191
- }
192
- lastSafeIdx = i
191
+ // lowestIncompleteIdx points to first incomplete object, so everything before
192
+ // it is complete. We want to checkpoint at the last complete object.
193
+ checkpointIdx := p .lowestIncompleteIdx - 1
194
+ if checkpointIdx < 0 {
195
+ return nil // No completed objects yet
193
196
}
197
+ obj := pageContents [checkpointIdx ]
194
198
195
- // Update progress if we have at least one completed object.
196
- if lastSafeIdx < 0 {
197
- return nil
199
+ return p .updateCheckpoint (bucket , * obj .Key )
200
+ }
201
+
202
+ // advanceLowestIncompleteIdx moves the lowest incomplete index forward to the next incomplete object.
203
+ // Must be called with lock held.
204
+ func (p * Checkpointer ) advanceLowestIncompleteIdx () {
205
+ for p .lowestIncompleteIdx < len (p .completedObjects ) &&
206
+ p .completedObjects [p .lowestIncompleteIdx ] {
207
+ p .lowestIncompleteIdx ++
198
208
}
209
+ }
199
210
200
- obj := pageContents [lastSafeIdx ]
201
- info := & ResumeInfo {CurrentBucket : bucket , StartAfter : * obj .Key }
202
- encoded , err := json .Marshal (info )
211
+ // updateCheckpoint persists the current resumption state.
212
+ // Must be called with lock held.
213
+ func (p * Checkpointer ) updateCheckpoint (bucket string , lastKey string ) error {
214
+ encoded , err := json .Marshal (& ResumeInfo {CurrentBucket : bucket , StartAfter : lastKey })
203
215
if err != nil {
204
- return err
216
+ return fmt . Errorf ( "failed to encode resume info: %w" , err )
205
217
}
206
218
207
- // Purposefully avoid updating any progress counts.
208
- // Only update resume info.
209
219
p .progress .SetProgressComplete (
210
220
int (p .progress .SectionsCompleted ),
211
221
int (p .progress .SectionsRemaining ),
0 commit comments