Commit 6fcd6a7

Configurable cleanup/sync batch size and workersNum, limit async requests during cleanup, improve logging
1 parent 1e76ce8 commit 6fcd6a7

9 files changed (+133, -36 lines)

storage-node/CHANGELOG.md

Lines changed: 4 additions & 2 deletions
```diff
@@ -1,10 +1,12 @@
 ### 4.4.0

 - **Optimizations:** The way data objects / data object ids are queried and processed during sync and cleanup has been optimized:
-  - Sync and cleanup services now process tasks in batches of `10_000` to avoid overflowing the memory.
+  - Sync and cleanup services now process tasks in batches of configurable size (`--syncBatchSize`, `--cleanupBatchSize`) to avoid overflowing the memory.
   - Synchronous operations like `sort` or `filter` on larger arrays of data objects have been optimized (for example, by replacing `.filter(Array.includes(...))` with `.filter(Set.has(...))`).
+  - Enforced a limit of max. `10,000` results per single GraphQL query and max. `1,000` input arguments per query.
+  - Added `--cleanupWorkersNumber` flag to limit the number of concurrent async requests during cleanup.
 - A safety mechanism was added to avoid removing "deleted" objects for which a related `DataObjectDeleted` event cannot be found in storage squid.
-- Improved logging during cleanup.
+- Improved logging during sync and cleanup.

 ### 4.3.0
```
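One of the optimizations listed above is replacing `Array.includes` lookups with `Set.has`. A minimal sketch of why that matters (the id arrays here are illustrative, not from the codebase):

```ts
// Illustrative inputs only.
const localIds: string[] = ['1', '2', '3']
const obligedIds: string[] = ['2', '3', '4']

// Before: `includes` rescans the whole array for every element, O(n * m).
const unsyncedSlow = obligedIds.filter((id) => !localIds.includes(id))

// After: a Set membership test is O(1) per element, O(n + m) overall.
const localIdSet = new Set(localIds)
const unsyncedFast = obligedIds.filter((id) => !localIdSet.has(id))
```

With hundreds of thousands of data object ids, the quadratic version dominates a run; the Set version is a single pass.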

storage-node/package.json

Lines changed: 1 addition & 0 deletions
```diff
@@ -54,6 +54,7 @@
     "multihashes": "^4.0.2",
     "node-cache": "^5.1.2",
     "openapi-editor": "^0.3.0",
+    "p-limit": "^3",
     "promise-timeout": "^1.3.0",
     "proper-lockfile": "^4.1.2",
     "react": "^18.2.0",
```
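The new `p-limit` dependency is what backs the `--cleanupWorkersNumber` cap on concurrent async requests. A minimal sketch of the pattern as used in the cleanup service further down (the `checkObject` body is a hypothetical stand-in):

```ts
import pLimit from 'p-limit'

// Allow at most 100 tasks in flight at once (the --cleanupWorkersNumber default).
const limit = pLimit(100)

// Hypothetical stand-in for a per-object replication check.
async function checkObject(id: string): Promise<void> {
  // e.g. a HEAD request against a remote storage node
}

export async function checkAll(ids: string[]): Promise<void> {
  // Wrapping every task with `limit` queues the excess instead of firing them all at once.
  const tasks = ids.map((id) => limit(() => checkObject(id)))
  // While tasks run, p-limit exposes the counters used by the new debug logging:
  console.log(`active: ${limit.activeCount}, pending: ${limit.pendingCount}`)
  await Promise.all(tasks)
}
```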
storage-node/src/commands/server.ts

Lines changed: 29 additions & 3 deletions
```diff
@@ -78,16 +78,29 @@ export default class Server extends ApiCommandBase {
       description: 'Interval before retrying failed synchronization run (in minutes)',
       default: 3,
     }),
+    syncBatchSize: flags.integer({
+      description: 'Maximum number of objects to process in a single batch during synchronization.',
+      default: 10_000,
+    }),
     cleanup: flags.boolean({
       char: 'c',
       description: 'Enable cleanup/pruning of no-longer assigned assets.',
       default: false,
     }),
+    cleanupBatchSize: flags.integer({
+      description: 'Maximum number of objects to process in a single batch during cleanup.',
+      default: 10_000,
+    }),
     cleanupInterval: flags.integer({
       char: 'i',
       description: 'Interval between periodic cleanup actions (in minutes)',
       default: 360,
     }),
+    cleanupWorkersNumber: flags.integer({
+      required: false,
+      description: 'Cleanup workers number (max async operations in progress).',
+      default: 100,
+    }),
     storageSquidEndpoint: flags.string({
       char: 'q',
       required: true,
@@ -299,6 +312,7 @@ Supported values: warn, error, debug, info. Default:debug`,
         flags.syncWorkersTimeout,
         flags.syncInterval,
         flags.syncRetryInterval,
+        flags.syncBatchSize,
         X_HOST_ID
       ),
       0
@@ -319,8 +333,9 @@
         api,
         qnApi,
         flags.uploads,
-        flags.syncWorkersNumber,
+        flags.cleanupWorkersNumber,
         flags.cleanupInterval,
+        flags.cleanupBatchSize,
         X_HOST_ID
       ),
       0
@@ -397,14 +412,24 @@ async function runSyncWithInterval(
   syncWorkersTimeout: number,
   syncIntervalMinutes: number,
   syncRetryIntervalMinutes: number,
+  syncBatchSize: number,
   hostId: string
 ) {
   const sleepInterval = syncIntervalMinutes * 60 * 1000
   const retrySleepInterval = syncRetryIntervalMinutes * 60 * 1000
   while (true) {
     try {
       logger.info(`Resume syncing....`)
-      await performSync(buckets, syncWorkersNumber, syncWorkersTimeout, qnApi, uploadsDirectory, tempDirectory, hostId)
+      await performSync(
+        buckets,
+        syncWorkersNumber,
+        syncWorkersTimeout,
+        qnApi,
+        uploadsDirectory,
+        tempDirectory,
+        syncBatchSize,
+        hostId
+      )
       logger.info(`Sync run complete. Next run in ${syncIntervalMinutes} minute(s).`)
       await sleep(sleepInterval)
     } catch (err) {
@@ -434,6 +459,7 @@ async function runCleanupWithInterval(
   uploadsDirectory: string,
   syncWorkersNumber: number,
   cleanupIntervalMinutes: number,
+  cleanupBatchSize: number,
   hostId: string
 ) {
   const sleepInterval = cleanupIntervalMinutes * 60 * 1000
@@ -442,7 +468,7 @@
     await sleep(sleepInterval)
     try {
       logger.info(`Resume cleanup....`)
-      await performCleanup(buckets, syncWorkersNumber, api, qnApi, uploadsDirectory, hostId)
+      await performCleanup(buckets, syncWorkersNumber, api, qnApi, uploadsDirectory, cleanupBatchSize, hostId)
     } catch (err) {
       logger.error(`Critical cleanup error: ${err}`)
     }
```
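For reference, the control flow that `runSyncWithInterval` implements reduces to the loop below. This is a simplified sketch: `syncOnce` stands in for the real `performSync` call with all its arguments, and the assumption that the catch branch sleeps for the retry interval follows from the `--syncRetryInterval` flag description, since the catch body itself is outside this diff:

```ts
const sleep = (ms: number) => new Promise<void>((resolve) => setTimeout(resolve, ms))

// Simplified sketch of the run/sleep/retry cycle.
async function runWithInterval(
  syncOnce: () => Promise<void>,
  syncIntervalMinutes: number,
  syncRetryIntervalMinutes: number
): Promise<never> {
  const sleepInterval = syncIntervalMinutes * 60 * 1000
  const retrySleepInterval = syncRetryIntervalMinutes * 60 * 1000
  while (true) {
    try {
      await syncOnce()
      await sleep(sleepInterval) // normal cadence after a successful run
    } catch (err) {
      console.error(`Critical sync error: ${err}`)
      await sleep(retrySleepInterval) // shorter back-off before retrying a failed run
    }
  }
}
```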

storage-node/src/commands/util/cleanup.ts

Lines changed: 13 additions & 1 deletion
```diff
@@ -27,6 +27,10 @@ export default class Cleanup extends ApiCommandBase {
       required: true,
       description: 'The bucketId to prune/cleanup',
     }),
+    cleanupBatchSize: flags.integer({
+      description: 'Maximum number of objects to process in a single batch during cleanup.',
+      default: 10_000,
+    }),
     cleanupWorkersNumber: flags.integer({
       char: 'p',
       required: false,
@@ -57,7 +61,15 @@
     logger.info('Cleanup...')

     try {
-      await performCleanup([bucketId], flags.cleanupWorkersNumber, api, qnApi, flags.uploads, '')
+      await performCleanup(
+        [bucketId],
+        flags.cleanupWorkersNumber,
+        api,
+        qnApi,
+        flags.uploads,
+        flags.cleanupBatchSize,
+        ''
+      )
     } catch (err) {
       logger.error(err)
       logger.error(stringify(err))
```

storage-node/src/commands/util/fetch-bucket.ts

Lines changed: 5 additions & 0 deletions
```diff
@@ -36,6 +36,10 @@ export default class FetchBucket extends Command {
       description: 'Asset downloading timeout for the synchronization (in minutes).',
       default: 30,
     }),
+    syncBatchSize: flags.integer({
+      description: 'Maximum number of objects to process in a single batch.',
+      default: 10_000,
+    }),
     queryNodeEndpoint: flags.string({
       char: 'q',
       required: false,
@@ -74,6 +78,7 @@ export default class FetchBucket extends Command {
       qnApi,
       flags.uploads,
       flags.tempFolder ? flags.tempFolder : path.join(flags.uploads, 'temp'),
+      flags.syncBatchSize,
       '',
       flags.dataSourceOperatorUrl
     )
```

storage-node/src/services/sync/cleanupService.ts

Lines changed: 75 additions & 26 deletions
```diff
@@ -10,6 +10,7 @@ import { DeleteLocalFileTask } from './tasks'
 import { TaskProcessorSpawner, WorkingStack } from '../processing/workingProcess'
 import { DataObjectWithBagDetailsFragment } from '../queryNode/generated/queries'
 import { Logger } from 'winston'
+import pLimit from 'p-limit'

 /**
  * The maximum allowed threshold by which the QN processor can lag behind
@@ -41,21 +42,21 @@ export const MINIMUM_REPLICATION_THRESHOLD = parseInt(process.env.CLEANUP_MIN_RE
  * - If the asset being pruned from this storage-node is currently being downloaded
  *   by some external actors, then the cleanup action for this asset would be postponed
  *
- * @param api - (optional) runtime API promise
- * @param workerId - current storage provider ID
- * @param buckets - Selected storage buckets
+ * @param buckets - selected storage buckets
  * @param asyncWorkersNumber - maximum parallel cleanups number
- * @param asyncWorkersTimeout - downloading asset timeout
+ * @param api - runtime API promise
  * @param qnApi - Query Node API
  * @param uploadDirectory - local directory to get file names from
- * @param tempDirectory - local directory for temporary data uploading
+ * @param batchSize - max. number of data objects to process in a single batch
+ * @param hostId
  */
 export async function performCleanup(
   buckets: string[],
   asyncWorkersNumber: number,
   api: ApiPromise,
   qnApi: QueryNodeApi,
   uploadDirectory: string,
+  batchSize: number,
   hostId: string
 ): Promise<void> {
   const logger = rootLogger.child({ label: 'Cleanup' })
@@ -98,11 +99,11 @@ export async function performCleanup(
     const workingStack = new WorkingStack()
     const processSpawner = new TaskProcessorSpawner(workingStack, asyncWorkersNumber)

-    // Execute deleted objects removal tasks in batches of 10_000
+    // Execute deleted objects removal tasks in batches
     if (deletedDataObjectIds.size) {
       let deletedProcessed = 0
       logger.info(`removing ${deletedDataObjectIds.size} deleted objects...`)
-      for (let deletedObjectsIdsBatch of _.chunk([...deletedDataObjectIds], 10_000)) {
+      for (let deletedObjectsIdsBatch of _.chunk([...deletedDataObjectIds], batchSize)) {
         // Confirm whether the objects were actually deleted by fetching the related deletion events
         const dataObjectDeletedEvents = await qnApi.getDataObjectDeletedEvents(deletedObjectsIdsBatch)
         const confirmedIds = new Set(dataObjectDeletedEvents.map((e) => e.data.dataObjectId))
@@ -120,26 +121,35 @@ export async function performCleanup(
         deletedProcessed += deletedObjectsIdsBatch.length
         logger.debug(`${deletedProcessed} / ${deletedDataObjectIds.size} deleted objects processed...`)
       }
+      logger.info(`${deletedProcessed}/${deletedDataObjectIds.size} deleted data objects successfully cleared.`)
     }

-    // Execute moved objects removal tasks in batches of 10_000
+    // Execute moved objects removal tasks in batches
     if (movedObjectIds.size) {
       let movedProcessed = 0
       logger.info(`removing ${movedObjectIds.size} moved objects...`)
-      for (const movedObjectsIdsBatch of _.chunk([...movedObjectIds], 10_000)) {
+      for (const movedObjectsIdsBatch of _.chunk([...movedObjectIds], batchSize)) {
         const movedDataObjectsBatch = await qnApi.getDataObjectsWithBagDetails(movedObjectsIdsBatch)
         const deletionTasksOfMovedDataObjects = await getDeletionTasksFromMovedDataObjects(
           logger,
           uploadDirectory,
           model,
           movedDataObjectsBatch,
+          asyncWorkersNumber,
           hostId
         )
+        const numberOfTasks = deletionTasksOfMovedDataObjects.length
+        if (numberOfTasks !== movedObjectsIdsBatch.length) {
+          logger.warn(
+            `Only ${numberOfTasks} / ${movedObjectsIdsBatch.length} moved objects will be removed in this batch...`
+          )
+        }
         await workingStack.add(deletionTasksOfMovedDataObjects)
         await processSpawner.process()
-        movedProcessed += movedDataObjectsBatch.length
+        movedProcessed += numberOfTasks
         logger.debug(`${movedProcessed} / ${movedObjectIds.size} moved objects processed...`)
       }
+      logger.info(`${movedProcessed}/${movedObjectIds.size} moved data objects successfully cleared.`)
     }
   } else {
     logger.info('No objects to prune, skipping...')
@@ -155,40 +165,79 @@ export async function performCleanup(
  * @param uploadDirectory - local directory for data uploading
  * @param dataObligations - defines the current data obligations for the node
  * @param movedDataObjects - obsolete (no longer assigned) data objects that have been moved to other buckets
+ * @param asyncWorkersNumber - number of async workers assigned for cleanup tasks
  * @param hostId - host id of the current node
  */
 async function getDeletionTasksFromMovedDataObjects(
   logger: Logger,
   uploadDirectory: string,
   dataObligations: DataObligations,
   movedDataObjects: DataObjectWithBagDetailsFragment[],
+  asyncWorkersNumber: number,
   hostId: string
 ): Promise<DeleteLocalFileTask[]> {
   const timeoutMs = 60 * 1000 // 1 minute since it's only a HEAD request
   const deletionTasks: DeleteLocalFileTask[] = []

   const { bucketOperatorUrlById } = dataObligations
-  await Promise.allSettled(
-    movedDataObjects.map(async (movedDataObject) => {
-      let dataObjectReplicationCount = 0
-
-      for (const { storageBucket } of movedDataObject.storageBag.storageBuckets) {
-        const nodeUrl = bucketOperatorUrlById.get(storageBucket.id)
-        if (nodeUrl) {
-          const fileUrl = urljoin(nodeUrl, 'api/v1/files', movedDataObject.id)
-          await superagent.head(fileUrl).timeout(timeoutMs).set('X-COLOSSUS-HOST-ID', hostId)
-          dataObjectReplicationCount++
-        }
-      }
-
-      if (dataObjectReplicationCount < MINIMUM_REPLICATION_THRESHOLD) {
-        logger.warn(`data object replication threshold unmet - file deletion canceled: ${movedDataObject.id}`)
-        return
-      }
-
-      deletionTasks.push(new DeleteLocalFileTask(uploadDirectory, movedDataObject.id))
-    })
-  )
+  const limit = pLimit(asyncWorkersNumber)
+  let checkedObjects = 0
+  const checkReplicationThreshold = async (movedDataObject: DataObjectWithBagDetailsFragment) => {
+    ++checkedObjects
+    if (checkedObjects % asyncWorkersNumber === 0) {
+      logger.debug(
+        `Checking replication: ${checkedObjects}/${movedDataObjects.length} (active: ${limit.activeCount}, pending: ${limit.pendingCount})`
+      )
+    }
+
+    const externalBucketEndpoints = movedDataObject.storageBag.storageBuckets
+      .map(({ storageBucket: { id } }) => {
+        return bucketOperatorUrlById.get(id)
+      })
+      .filter((url): url is string => !!url)
+    let lastErr = ''
+    let successes = 0
+    let failures = 0
+
+    if (externalBucketEndpoints.length >= MINIMUM_REPLICATION_THRESHOLD) {
+      for (const nodeUrl of externalBucketEndpoints) {
+        const fileUrl = urljoin(nodeUrl, 'api/v1/files', movedDataObject.id)
+        try {
+          await superagent.head(fileUrl).timeout(timeoutMs).set('X-COLOSSUS-HOST-ID', hostId)
+          ++successes
+        } catch (e) {
+          ++failures
+          lastErr = e instanceof Error ? e.message : e.toString()
+        }
+        if (successes >= MINIMUM_REPLICATION_THRESHOLD) {
+          break
+        }
+      }
+    }
+
+    if (successes < MINIMUM_REPLICATION_THRESHOLD) {
+      logger.debug(
+        `Replication threshold unmet for object ${movedDataObject.id} ` +
+          `(buckets: ${externalBucketEndpoints.length}, successes: ${successes}, failures: ${failures}). ` +
+          (lastErr ? `Last error: ${lastErr}. ` : '') +
+          `File deletion canceled...`
+      )
+      return
+    }
+
+    deletionTasks.push(new DeleteLocalFileTask(uploadDirectory, movedDataObject.id))
+  }
+
+  await Promise.all(movedDataObjects.map((movedDataObject) => limit(() => checkReplicationThreshold(movedDataObject))))
+
+  const failedCount = movedDataObjects.length - deletionTasks.length
+  if (failedCount > 0) {
+    logger.warn(
+      `Replication threshold was unmet or couldn't be verified for ${failedCount} / ${movedDataObjects.length} objects in the current batch.`
+    )
+  }
+
+  logger.debug('Checking replication: Done')

   return deletionTasks
 }
```
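Two design points in this rewrite are worth isolating. Per object, the replica probes now run sequentially and stop as soon as `MINIMUM_REPLICATION_THRESHOLD` successes are observed, instead of probing every bucket; across objects, `pLimit` bounds the number of concurrent checks. A sketch of the early-exit probe, with a hypothetical `headOk` standing in for the `superagent.head(...)` call:

```ts
// Count confirmed replicas, stopping as soon as the threshold is reached.
// `headOk` is a hypothetical stand-in that resolves to success/failure
// instead of throwing, as superagent does.
async function countReplicas(
  endpoints: string[],
  threshold: number,
  headOk: (url: string) => Promise<boolean>
): Promise<number> {
  let successes = 0
  for (const url of endpoints) {
    if (await headOk(url)) {
      successes++
    }
    if (successes >= threshold) {
      break // the remaining replicas never need to be probed
    }
  }
  return successes
}
```

Combined with the `externalBucketEndpoints.length >= MINIMUM_REPLICATION_THRESHOLD` pre-check, an object whose bag doesn't even have enough candidate buckets is rejected without issuing a single HEAD request.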

storage-node/src/services/sync/storageObligations.ts

Lines changed: 1 addition & 1 deletion
```diff
@@ -194,7 +194,7 @@ async function getAllBuckets(api: QueryNodeApi): Promise<StorageBucketDetailsFra
   return await getAllObjectsWithPaging(async (offset, limit) => {
     const idsPart = ids.slice(offset, offset + limit)
     if (!_.isEmpty(idsPart)) {
-      logger.debug(`Sync - getting all storage buckets: offset = ${offset}, limit = ${limit}`)
+      logger.debug(`Getting all storage buckets: offset = ${offset}, limit = ${limit}`)
       return await api.getStorageBucketDetails(idsPart)
     } else {
       return false
```
storage-node/src/services/sync/synchronizer.ts

Lines changed: 4 additions & 2 deletions
```diff
@@ -35,6 +35,7 @@ export const PendingDirName = 'pending'
  * @param qnApi - Query Node API
  * @param uploadDirectory - local directory to get file names from
  * @param tempDirectory - local directory for temporary data uploading
+ * @param batchSize - maximum number of data objects to process in a single batch
  * @param selectedOperatorUrl - (optional) defines the data source URL. If not set
  *   the source URL is resolved for each data object separately using the Query
  *   Node information about the storage providers.
@@ -46,6 +47,7 @@ export async function performSync(
   qnApi: QueryNodeApi,
   uploadDirectory: string,
   tempDirectory: string,
+  batchSize: number,
   hostId: string,
   selectedOperatorUrl?: string
 ): Promise<void> {
@@ -64,10 +66,10 @@ export async function performSync(
   const workingStack = new WorkingStack()
   const processSpawner = new TaskProcessorSpawner(workingStack, asyncWorkersNumber)

-  // Process unsynced objects in batches od 10_000
+  // Process unsynced objects in batches
   logger.debug(`Sync - started processing...`)
   let processed = 0
-  for (const unsyncedIdsBatch of _.chunk(unsyncedObjectIds, 10_000)) {
+  for (const unsyncedIdsBatch of _.chunk(unsyncedObjectIds, batchSize)) {
     const objectsBatch = await getDataObjectsByIDs(qnApi, unsyncedIdsBatch)
     const syncTasks = await getDownloadTasks(
       model,
```
yarn.lock

Lines changed: 1 addition & 1 deletion
```diff
@@ -18274,7 +18274,7 @@ p-is-promise@^2.0.0:
   resolved "https://registry.npmjs.org/p-is-promise/-/p-is-promise-2.1.0.tgz"
   integrity sha512-Y3W0wlRPK8ZMRbNq97l4M5otioeA5lm1z7bkNkxCka8HSPjR0xRWmpCmc9utiaLP9Jb1eD8BgeIxTW4AIF45Pg==

-p-limit@3.1.0, p-limit@^3.0.2:
+p-limit@3.1.0, p-limit@^3, p-limit@^3.0.2:
   version "3.1.0"
   resolved "https://registry.npmjs.org/p-limit/-/p-limit-3.1.0.tgz"
   integrity sha512-TYOanM3wGwNGsZN2cVTYPArw454xnXj5qmWF1bEoAc4+cU/ol7GVh7odevjp1FNHduHc3KZMcFduxU5Xc6uJRQ==
```
