Skip to content

Commit 7144462

Browse files
authored
Merge pull request #5194 from Lezek123/rework-sync-and-cleanup
Colossus: Rework sync and cleanup
2 parents 9308382 + 03ff3ba commit 7144462

File tree

15 files changed

+777
-613
lines changed

15 files changed

+777
-613
lines changed

storage-node/CHANGELOG.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,13 @@
1+
### 4.4.0
2+
3+
- **Optimizations:** The way data objects / data object ids are queried and processed during sync and cleanup has been optimized:
4+
- Sync and cleanup services now process tasks in batches of configurable size (`--syncBatchSize`, `--cleanupBatchSize`) to avoid overflowing the memory.
5+
- Synchronous operations like `sort` or `filter` on larger arrays of data objects have been optimized (for example, by replacing `.filter(Array.includes(...))` with `.filter(Set.has(...))`).
6+
- Enforced a limit of max. `10,000` results per single GraphQL query and max. `1,000` input arguments per query.
7+
- Added `--cleanupWorkersNumber` flag to limit the number of concurrent async requests during cleanup.
8+
- A safety mechanism was added to avoid removing "deleted" objects for which a related `DataObjectDeleted` event cannot be found in storage squid.
9+
- Improved logging during sync and cleanup.
10+
111
### 4.3.0
212

313
- Adds `archive` mode / command, which allows downloading, compressing and uploading all data objects to an external S3 bucket that can be used as a backup.

storage-node/README.md

Lines changed: 260 additions & 292 deletions
Large diffs are not rendered by default.

storage-node/package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"name": "storage-node",
33
"description": "Joystream storage subsystem.",
4-
"version": "4.3.0",
4+
"version": "4.4.0",
55
"author": "Joystream contributors",
66
"bin": {
77
"storage-node": "./bin/run"
@@ -54,6 +54,7 @@
5454
"multihashes": "^4.0.2",
5555
"node-cache": "^5.1.2",
5656
"openapi-editor": "^0.3.0",
57+
"p-limit": "^3",
5758
"promise-timeout": "^1.3.0",
5859
"proper-lockfile": "^4.1.2",
5960
"react": "^18.2.0",

storage-node/src/commands/server.ts

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -78,16 +78,29 @@ export default class Server extends ApiCommandBase {
7878
description: 'Interval before retrying failed synchronization run (in minutes)',
7979
default: 3,
8080
}),
81+
syncBatchSize: flags.integer({
82+
description: 'Maximum number of objects to process in a single batch during synchronization.',
83+
default: 10_000,
84+
}),
8185
cleanup: flags.boolean({
8286
char: 'c',
8387
description: 'Enable cleanup/pruning of no-longer assigned assets.',
8488
default: false,
8589
}),
90+
cleanupBatchSize: flags.integer({
91+
description: 'Maximum number of objects to process in a single batch during cleanup.',
92+
default: 10_000,
93+
}),
8694
cleanupInterval: flags.integer({
8795
char: 'i',
8896
description: 'Interval between periodic cleanup actions (in minutes)',
8997
default: 360,
9098
}),
99+
cleanupWorkersNumber: flags.integer({
100+
required: false,
101+
description: 'Number of cleanup workers (maximum number of concurrent async operations in progress).',
102+
default: 100,
103+
}),
91104
storageSquidEndpoint: flags.string({
92105
char: 'q',
93106
required: true,
@@ -299,6 +312,7 @@ Supported values: warn, error, debug, info. Default:debug`,
299312
flags.syncWorkersTimeout,
300313
flags.syncInterval,
301314
flags.syncRetryInterval,
315+
flags.syncBatchSize,
302316
X_HOST_ID
303317
),
304318
0
@@ -319,8 +333,9 @@ Supported values: warn, error, debug, info. Default:debug`,
319333
api,
320334
qnApi,
321335
flags.uploads,
322-
flags.syncWorkersNumber,
336+
flags.cleanupWorkersNumber,
323337
flags.cleanupInterval,
338+
flags.cleanupBatchSize,
324339
X_HOST_ID
325340
),
326341
0
@@ -397,14 +412,24 @@ async function runSyncWithInterval(
397412
syncWorkersTimeout: number,
398413
syncIntervalMinutes: number,
399414
syncRetryIntervalMinutes: number,
415+
syncBatchSize: number,
400416
hostId: string
401417
) {
402418
const sleepInterval = syncIntervalMinutes * 60 * 1000
403419
const retrySleepInterval = syncRetryIntervalMinutes * 60 * 1000
404420
while (true) {
405421
try {
406422
logger.info(`Resume syncing....`)
407-
await performSync(buckets, syncWorkersNumber, syncWorkersTimeout, qnApi, uploadsDirectory, tempDirectory, hostId)
423+
await performSync(
424+
buckets,
425+
syncWorkersNumber,
426+
syncWorkersTimeout,
427+
qnApi,
428+
uploadsDirectory,
429+
tempDirectory,
430+
syncBatchSize,
431+
hostId
432+
)
408433
logger.info(`Sync run complete. Next run in ${syncIntervalMinutes} minute(s).`)
409434
await sleep(sleepInterval)
410435
} catch (err) {
@@ -434,6 +459,7 @@ async function runCleanupWithInterval(
434459
uploadsDirectory: string,
435460
syncWorkersNumber: number,
436461
cleanupIntervalMinutes: number,
462+
cleanupBatchSize: number,
437463
hostId: string
438464
) {
439465
const sleepInterval = cleanupIntervalMinutes * 60 * 1000
@@ -442,7 +468,7 @@ async function runCleanupWithInterval(
442468
await sleep(sleepInterval)
443469
try {
444470
logger.info(`Resume cleanup....`)
445-
await performCleanup(buckets, syncWorkersNumber, api, qnApi, uploadsDirectory, hostId)
471+
await performCleanup(buckets, syncWorkersNumber, api, qnApi, uploadsDirectory, cleanupBatchSize, hostId)
446472
} catch (err) {
447473
logger.error(`Critical cleanup error: ${err}`)
448474
}

storage-node/src/commands/util/cleanup.ts

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,10 @@ export default class Cleanup extends ApiCommandBase {
2727
required: true,
2828
description: 'The bucketId to prune/cleanup',
2929
}),
30+
cleanupBatchSize: flags.integer({
31+
description: 'Maximum number of objects to process in a single batch during cleanup.',
32+
default: 10_000,
33+
}),
3034
cleanupWorkersNumber: flags.integer({
3135
char: 'p',
3236
required: false,
@@ -57,7 +61,15 @@ export default class Cleanup extends ApiCommandBase {
5761
logger.info('Cleanup...')
5862

5963
try {
60-
await performCleanup([bucketId], flags.cleanupWorkersNumber, api, qnApi, flags.uploads, '')
64+
await performCleanup(
65+
[bucketId],
66+
flags.cleanupWorkersNumber,
67+
api,
68+
qnApi,
69+
flags.uploads,
70+
flags.cleanupBatchSize,
71+
''
72+
)
6173
} catch (err) {
6274
logger.error(err)
6375
logger.error(stringify(err))

storage-node/src/commands/util/fetch-bucket.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,10 @@ export default class FetchBucket extends Command {
3636
description: 'Asset downloading timeout for the synchronization (in minutes).',
3737
default: 30,
3838
}),
39+
syncBatchSize: flags.integer({
40+
description: 'Maximum number of objects to process in a single batch.',
41+
default: 10_000,
42+
}),
3943
queryNodeEndpoint: flags.string({
4044
char: 'q',
4145
required: false,
@@ -74,6 +78,7 @@ export default class FetchBucket extends Command {
7478
qnApi,
7579
flags.uploads,
7680
flags.tempFolder ? flags.tempFolder : path.join(flags.uploads, 'temp'),
81+
flags.syncBatchSize,
7782
'',
7883
flags.dataSourceOperatorUrl
7984
)

storage-node/src/services/archive/ArchiveService.ts

Lines changed: 40 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ import {
1313
OBJECTS_TRACKING_FILENAME,
1414
} from './tracking'
1515
import { QueryNodeApi } from '../queryNode/api'
16-
import { getStorageObligationsFromRuntime } from '../sync/storageObligations'
16+
import { getDataObjectsByIDs, getStorageObligationsFromRuntime } from '../sync/storageObligations'
1717
import { getDownloadTasks } from '../sync/synchronizer'
1818
import sleep from 'sleep-promise'
1919
import { Logger } from 'winston'
@@ -369,40 +369,49 @@ export class ArchiveService {
369369
public async performSync(): Promise<void> {
370370
const model = await getStorageObligationsFromRuntime(this.queryNodeApi)
371371

372-
const assignedObjects = model.dataObjects
373-
const added = assignedObjects.filter((obj) => !this.objectTrackingService.isTracked(obj.id))
374-
added.sort((a, b) => parseInt(b.id) - parseInt(a.id))
372+
const unsyncedIds = (await model.getAssignedDataObjectIds(true))
373+
.filter((id) => !this.objectTrackingService.isTracked(id))
374+
.map((id) => parseInt(id))
375+
// Sort unsynced ids in ASCENDING order (oldest first)
376+
.sort((a, b) => a - b)
375377

376-
this.logger.info(`Sync - new objects: ${added.length}`)
378+
this.logger.info(`Sync - new objects: ${unsyncedIds.length}`)
377379

378-
// Add new download tasks while the upload dir size limit allows
379-
while (added.length) {
380-
const uploadDirectorySize = await this.getUploadDirSize()
381-
while (true) {
382-
const object = added.pop()
383-
if (!object) {
384-
break
385-
}
386-
if (object.size + uploadDirectorySize + this.syncQueueObjectsSize > this.uploadDirSizeLimit) {
387-
this.logger.debug(
388-
`Waiting for some disk space to free ` +
389-
`(upload_dir: ${uploadDirectorySize} / ${this.uploadDirSizeLimit}, ` +
390-
`sync_q=${this.syncQueueObjectsSize}, obj_size=${object.size})... `
380+
// Sync objects in batches of 10_000
381+
for (const unsyncedIdsBatch of _.chunk(unsyncedIds, 10_000)) {
382+
const objectIdsBatch = unsyncedIdsBatch.map((id) => id.toString())
383+
// Sort objectsBatch by ids in DESCENDING order (because we're using .pop() to get the next object)
384+
const objectsBatch = (await getDataObjectsByIDs(this.queryNodeApi, objectIdsBatch)).sort(
385+
(a, b) => parseInt(b.id) - parseInt(a.id)
386+
)
387+
// Add new download tasks while the upload dir size limit allows
388+
while (objectsBatch.length) {
389+
const uploadDirectorySize = await this.getUploadDirSize()
390+
while (true) {
391+
const object = objectsBatch.pop()
392+
if (!object) {
393+
break
394+
}
395+
if (object.size + uploadDirectorySize + this.syncQueueObjectsSize > this.uploadDirSizeLimit) {
396+
this.logger.debug(
397+
`Waiting for some disk space to free ` +
398+
`(upload_dir: ${uploadDirectorySize} / ${this.uploadDirSizeLimit}, ` +
399+
`sync_q=${this.syncQueueObjectsSize}, obj_size=${object.size})... `
400+
)
401+
objectsBatch.push(object)
402+
await sleep(60_000)
403+
break
404+
}
405+
const [downloadTask] = await getDownloadTasks(
406+
model,
407+
[object],
408+
this.uploadQueueDir,
409+
this.tmpDownloadDir,
410+
this.syncWorkersTimeout,
411+
this.hostId
391412
)
392-
added.push(object)
393-
await sleep(60_000)
394-
break
413+
await this.addDownloadTask(downloadTask, object.size)
395414
}
396-
const [downloadTask] = await getDownloadTasks(
397-
model,
398-
[],
399-
[object],
400-
this.uploadQueueDir,
401-
this.tmpDownloadDir,
402-
this.syncWorkersTimeout,
403-
this.hostId
404-
)
405-
await this.addDownloadTask(downloadTask, object.size)
406415
}
407416
}
408417
}

0 commit comments

Comments
 (0)