@@ -10,6 +10,7 @@ import { DeleteLocalFileTask } from './tasks'
 import { TaskProcessorSpawner, WorkingStack } from '../processing/workingProcess'
 import { DataObjectWithBagDetailsFragment } from '../queryNode/generated/queries'
 import { Logger } from 'winston'
+import pLimit from 'p-limit'
 
 /**
  * The maximum allowed threshold by which the QN processor can lag behind
@@ -41,21 +42,21 @@ export const MINIMUM_REPLICATION_THRESHOLD = parseInt(process.env.CLEANUP_MIN_RE
  * - If the asset being pruned from this storage-node is currently being downloaded
  *   by some external actors, then the cleanup action for this asset would be postponed
  *
- * @param api - (optional) runtime API promise
- * @param workerId - current storage provider ID
- * @param buckets - Selected storage buckets
+ * @param buckets - selected storage buckets
  * @param asyncWorkersNumber - maximum parallel cleanups number
- * @param asyncWorkersTimeout - downloading asset timeout
+ * @param api - runtime API promise
  * @param qnApi - Query Node API
  * @param uploadDirectory - local directory to get file names from
- * @param tempDirectory - local directory for temporary data uploading
+ * @param batchSize - max. number of data objects to process in a single batch
+ * @param hostId - host id of the current node
  */
 export async function performCleanup(
   buckets: string[],
   asyncWorkersNumber: number,
   api: ApiPromise,
   qnApi: QueryNodeApi,
   uploadDirectory: string,
+  batchSize: number,
   hostId: string
 ): Promise<void> {
   const logger = rootLogger.child({ label: 'Cleanup' })
@@ -98,11 +99,11 @@ export async function performCleanup(
     const workingStack = new WorkingStack()
     const processSpawner = new TaskProcessorSpawner(workingStack, asyncWorkersNumber)
 
-    // Execute deleted objects removal tasks in batches of 10_000
+    // Execute deleted objects removal tasks in batches
     if (deletedDataObjectIds.size) {
       let deletedProcessed = 0
       logger.info(`removing ${deletedDataObjectIds.size} deleted objects...`)
-      for (let deletedObjectsIdsBatch of _.chunk([...deletedDataObjectIds], 10_000)) {
+      for (let deletedObjectsIdsBatch of _.chunk([...deletedDataObjectIds], batchSize)) {
         // Confirm whether the objects were actually deleted by fetching the related deletion events
         const dataObjectDeletedEvents = await qnApi.getDataObjectDeletedEvents(deletedObjectsIdsBatch)
         const confirmedIds = new Set(dataObjectDeletedEvents.map((e) => e.data.dataObjectId))
@@ -120,26 +121,35 @@ export async function performCleanup(
         deletedProcessed += deletedObjectsIdsBatch.length
         logger.debug(`${deletedProcessed} / ${deletedDataObjectIds.size} deleted objects processed...`)
       }
+      logger.info(`${deletedProcessed}/${deletedDataObjectIds.size} deleted data objects successfully cleared.`)
     }
 
-    // Execute moved objects removal tasks in batches of 10_000
+    // Execute moved objects removal tasks in batches
     if (movedObjectIds.size) {
       let movedProcessed = 0
       logger.info(`removing ${movedObjectIds.size} moved objects...`)
-      for (const movedObjectsIdsBatch of _.chunk([...movedObjectIds], 10_000)) {
+      for (const movedObjectsIdsBatch of _.chunk([...movedObjectIds], batchSize)) {
        const movedDataObjectsBatch = await qnApi.getDataObjectsWithBagDetails(movedObjectsIdsBatch)
         const deletionTasksOfMovedDataObjects = await getDeletionTasksFromMovedDataObjects(
           logger,
           uploadDirectory,
           model,
           movedDataObjectsBatch,
+          asyncWorkersNumber,
           hostId
         )
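+        // Objects that fail the replication check are skipped by getDeletionTasksFromMovedDataObjects,
+        // so this batch may yield fewer deletion tasks than it has ids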
+        const numberOfTasks = deletionTasksOfMovedDataObjects.length
+        if (numberOfTasks !== movedObjectsIdsBatch.length) {
+          logger.warn(
+            `Only ${numberOfTasks} / ${movedObjectsIdsBatch.length} moved objects will be removed in this batch...`
+          )
+        }
         await workingStack.add(deletionTasksOfMovedDataObjects)
         await processSpawner.process()
-        movedProcessed += movedDataObjectsBatch.length
+        movedProcessed += numberOfTasks
         logger.debug(`${movedProcessed} / ${movedObjectIds.size} moved objects processed...`)
       }
+      logger.info(`${movedProcessed}/${movedObjectIds.size} moved data objects successfully cleared.`)
     }
   } else {
     logger.info('No objects to prune, skipping...')
@@ -155,40 +165,79 @@ export async function performCleanup(
  * @param uploadDirectory - local directory for data uploading
  * @param dataObligations - defines the current data obligations for the node
  * @param movedDataObjects - obsolete (no longer assigned) data objects that have been moved to other buckets
+ * @param asyncWorkersNumber - number of async workers assigned for cleanup tasks
  * @param hostId - host id of the current node
  */
 async function getDeletionTasksFromMovedDataObjects(
   logger: Logger,
   uploadDirectory: string,
   dataObligations: DataObligations,
   movedDataObjects: DataObjectWithBagDetailsFragment[],
+  asyncWorkersNumber: number,
   hostId: string
 ): Promise<DeleteLocalFileTask[]> {
   const timeoutMs = 60 * 1000 // 1 minute since it's only a HEAD request
   const deletionTasks: DeleteLocalFileTask[] = []
 
   const { bucketOperatorUrlById } = dataObligations
-  await Promise.allSettled(
-    movedDataObjects.map(async (movedDataObject) => {
-      let dataObjectReplicationCount = 0
-
-      for (const { storageBucket } of movedDataObject.storageBag.storageBuckets) {
-        const nodeUrl = bucketOperatorUrlById.get(storageBucket.id)
-        if (nodeUrl) {
-          const fileUrl = urljoin(nodeUrl, 'api/v1/files', movedDataObject.id)
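+  // p-limit caps the number of replication checks running concurrently at asyncWorkersNumber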
+  const limit = pLimit(asyncWorkersNumber)
+  let checkedObjects = 0
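+  // For a single moved object: confirm it is still served by at least MINIMUM_REPLICATION_THRESHOLD
+  // other operators before a local deletion task is created for it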
+  const checkReplicationThreshold = async (movedDataObject: DataObjectWithBagDetailsFragment) => {
+    ++checkedObjects
+    if (checkedObjects % asyncWorkersNumber === 0) {
+      logger.debug(
+        `Checking replication: ${checkedObjects}/${movedDataObjects.length} (active: ${limit.activeCount}, pending: ${limit.pendingCount})`
+      )
+    }
+
+    const externalBucketEndpoints = movedDataObject.storageBag.storageBuckets
+      .map(({ storageBucket: { id } }) => {
+        return bucketOperatorUrlById.get(id)
+      })
+      .filter((url): url is string => !!url)
+    let lastErr = ''
+    let successes = 0
+    let failures = 0
+
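+    // Probe the other operators with HEAD requests, stopping early once the replication threshold
+    // is met; skip probing entirely when there are too few endpoints to ever reach it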
+    if (externalBucketEndpoints.length >= MINIMUM_REPLICATION_THRESHOLD) {
+      for (const nodeUrl of externalBucketEndpoints) {
+        const fileUrl = urljoin(nodeUrl, 'api/v1/files', movedDataObject.id)
+        try {
           await superagent.head(fileUrl).timeout(timeoutMs).set('X-COLOSSUS-HOST-ID', hostId)
-          dataObjectReplicationCount++
+          ++successes
+        } catch (e) {
+          ++failures
+          lastErr = e instanceof Error ? e.message : e.toString()
+        }
+        if (successes >= MINIMUM_REPLICATION_THRESHOLD) {
+          break
         }
       }
+    }
 
-      if (dataObjectReplicationCount < MINIMUM_REPLICATION_THRESHOLD) {
-        logger.warn(`data object replication threshold unmet - file deletion canceled: ${movedDataObject.id}`)
-        return
-      }
+    if (successes < MINIMUM_REPLICATION_THRESHOLD) {
+      logger.debug(
+        `Replication threshold unmet for object ${movedDataObject.id} ` +
+          `(buckets: ${externalBucketEndpoints.length}, successes: ${successes}, failures: ${failures}). ` +
+          (lastErr ? `Last error: ${lastErr}. ` : '') +
+          `File deletion canceled...`
+      )
+      return
+    }
+
+    deletionTasks.push(new DeleteLocalFileTask(uploadDirectory, movedDataObject.id))
+  }
+
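+  // Run every per-object check through the limiter so that at most asyncWorkersNumber execute at once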
+  await Promise.all(movedDataObjects.map((movedDataObject) => limit(() => checkReplicationThreshold(movedDataObject))))
+
+  const failedCount = movedDataObjects.length - deletionTasks.length
+  if (failedCount > 0) {
+    logger.warn(
+      `Replication threshold was unmet or couldn't be verified for ${failedCount} / ${movedDataObjects.length} objects in the current batch.`
+    )
+  }
 
-      deletionTasks.push(new DeleteLocalFileTask(uploadDirectory, movedDataObject.id))
-    })
-  )
+  logger.debug('Checking replication: Done')
 
   return deletionTasks
 }