Skip to content

Commit 05ec3b7

Browse files
authored
Merge pull request #547 from RADAR-base/addStorageIndex
Added a StorageIndex for the source storage to reduce LIST calls
2 parents 0de47d2 + c0f5373 commit 05ec3b7

40 files changed

+592
-104
lines changed

.editorconfig

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1 @@
11
root = true
2-
3-
[*.kt]
4-
ktlint_standard_no-wildcard-imports = disabled

README.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,8 +118,16 @@ source:
118118
# only actually needed if source type is hdfs
119119
azure:
120120
# azure options
121+
index:
122+
# Interval, in seconds, at which to fully synchronize the index with the source storage
123+
fullSyncInterval: 3600
124+
# Interval, in seconds, at which to sync empty directories.
125+
# They are also synced during a full sync.
126+
emptyDirectorySyncInterval: 900
121127
```
122128

129+
The index is built by scanning the source storage before any other operation runs. Subsequent list operations are served from the index only. This is especially relevant for S3 storage, where list operations are billed.
130+
123131
The target is similar, and in addition supports the local file system (`local`).
124132

125133
```yaml

restructure.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,12 @@ source:
3434
# only actually needed if source type is hdfs
3535
hdfs:
3636
nameNodes: [hdfs-namenode-1, hdfs-namenode-2]
37+
index:
38+
# Interval, in seconds, at which to fully synchronize the index with the storage
39+
fullSyncInterval: 3600
40+
# Interval, in seconds, at which to sync empty directories.
41+
# They are also synced during a full sync.
42+
emptyDirectorySyncInterval: 900
3743

3844
# Target data resource
3945
# @since: 0.7.0

src/integrationTest/java/org/radarbase/output/RestructureS3IntegrationTest.kt

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,28 @@
11
package org.radarbase.output
22

3-
import io.minio.*
3+
import io.minio.BucketExistsArgs
4+
import io.minio.GetObjectArgs
5+
import io.minio.ListObjectsArgs
6+
import io.minio.MakeBucketArgs
47
import io.minio.ObjectWriteArgs.MAX_PART_SIZE
5-
import kotlinx.coroutines.*
8+
import io.minio.PutObjectArgs
9+
import io.minio.RemoveBucketArgs
10+
import io.minio.RemoveObjectArgs
11+
import kotlinx.coroutines.Dispatchers
12+
import kotlinx.coroutines.coroutineScope
13+
import kotlinx.coroutines.joinAll
14+
import kotlinx.coroutines.launch
615
import kotlinx.coroutines.test.runTest
16+
import kotlinx.coroutines.withContext
717
import org.junit.jupiter.api.Assertions.assertEquals
818
import org.junit.jupiter.api.Test
9-
import org.radarbase.output.config.*
19+
import org.radarbase.output.config.PathConfig
20+
import org.radarbase.output.config.PathFormatterConfig
21+
import org.radarbase.output.config.ResourceConfig
22+
import org.radarbase.output.config.RestructureConfig
23+
import org.radarbase.output.config.S3Config
24+
import org.radarbase.output.config.TopicConfig
25+
import org.radarbase.output.config.WorkerConfig
1026
import org.radarbase.output.util.SuspendedCloseable.Companion.useSuspended
1127
import org.radarbase.output.util.Timer
1228
import org.radarbase.output.util.bucketBuild

src/integrationTest/java/org/radarbase/output/accounting/OffsetRangeRedisTest.kt

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,11 @@ package org.radarbase.output.accounting
22

33
import kotlinx.coroutines.test.runTest
44
import org.junit.jupiter.api.AfterEach
5-
import org.junit.jupiter.api.Assertions.*
5+
import org.junit.jupiter.api.Assertions.assertEquals
6+
import org.junit.jupiter.api.Assertions.assertFalse
7+
import org.junit.jupiter.api.Assertions.assertNotNull
8+
import org.junit.jupiter.api.Assertions.assertNull
9+
import org.junit.jupiter.api.Assertions.assertTrue
610
import org.junit.jupiter.api.BeforeEach
711
import org.junit.jupiter.api.Test
812
import org.radarbase.output.accounting.OffsetRedisPersistence.Companion.redisOffsetReader

src/main/java/org/radarbase/output/Application.kt

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,18 +18,26 @@ package org.radarbase.output
1818

1919
import com.beust.jcommander.JCommander
2020
import com.beust.jcommander.ParameterException
21-
import kotlinx.coroutines.*
21+
import kotlinx.coroutines.launch
22+
import kotlinx.coroutines.runBlocking
2223
import kotlinx.coroutines.sync.Mutex
2324
import kotlinx.coroutines.sync.Semaphore
24-
import org.radarbase.output.accounting.*
25+
import org.radarbase.output.accounting.Accountant
26+
import org.radarbase.output.accounting.OffsetPersistenceFactory
27+
import org.radarbase.output.accounting.OffsetRedisPersistence
28+
import org.radarbase.output.accounting.RedisHolder
29+
import org.radarbase.output.accounting.RedisRemoteLockManager
30+
import org.radarbase.output.accounting.RemoteLockManager
2531
import org.radarbase.output.cleaner.SourceDataCleaner
2632
import org.radarbase.output.compression.Compression
2733
import org.radarbase.output.config.CommandLineArgs
2834
import org.radarbase.output.config.RestructureConfig
2935
import org.radarbase.output.format.RecordConverterFactory
3036
import org.radarbase.output.path.RecordPathFactory
37+
import org.radarbase.output.source.InMemoryStorageIndex
3138
import org.radarbase.output.source.SourceStorage
3239
import org.radarbase.output.source.SourceStorageFactory
40+
import org.radarbase.output.source.StorageIndexManager
3341
import org.radarbase.output.target.TargetStorage
3442
import org.radarbase.output.target.TargetStorageFactory
3543
import org.radarbase.output.util.Timer
@@ -39,7 +47,9 @@ import org.radarbase.output.worker.RadarKafkaRestructure
3947
import org.slf4j.LoggerFactory
4048
import redis.clients.jedis.JedisPool
4149
import java.io.IOException
50+
import java.nio.file.Path
4251
import java.text.NumberFormat
52+
import java.time.Duration
4353
import java.time.LocalDateTime
4454
import java.time.format.DateTimeFormatter
4555
import java.util.concurrent.atomic.LongAdder
@@ -78,9 +88,27 @@ class Application(
7888

7989
override val workerSemaphore = Semaphore(config.worker.numThreads * 2)
8090

91+
override val storageIndexManagers: Map<Path, StorageIndexManager>
92+
8193
private val jobs: List<Job>
8294

8395
init {
96+
val indexConfig = config.source.index
97+
val (fullScan, emptyScan) = if (indexConfig == null) {
98+
listOf(3600L, 900L)
99+
} else {
100+
listOf(indexConfig.fullSyncInterval, indexConfig.emptyDirectorySyncInterval)
101+
}.map { Duration.ofSeconds(it) }
102+
103+
storageIndexManagers = config.paths.inputs.associateWith { input ->
104+
StorageIndexManager(
105+
InMemoryStorageIndex(),
106+
sourceStorage,
107+
input,
108+
fullScan,
109+
emptyScan,
110+
)
111+
}
84112
val serviceMutex = Mutex()
85113
jobs = listOfNotNull(
86114
RadarKafkaRestructure.job(config, serviceMutex),

src/main/java/org/radarbase/output/FileStoreFactory.kt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,11 @@ import org.radarbase.output.config.RestructureConfig
2626
import org.radarbase.output.format.RecordConverterFactory
2727
import org.radarbase.output.path.RecordPathFactory
2828
import org.radarbase.output.source.SourceStorage
29+
import org.radarbase.output.source.StorageIndexManager
2930
import org.radarbase.output.target.TargetStorage
3031
import org.radarbase.output.worker.FileCacheStore
3132
import java.io.IOException
33+
import java.nio.file.Path
3234

3335
/** Factory for all factory classes and settings. */
3436
interface FileStoreFactory {
@@ -42,6 +44,7 @@ interface FileStoreFactory {
4244
val redisHolder: RedisHolder
4345
val offsetPersistenceFactory: OffsetPersistenceFactory
4446
val workerSemaphore: Semaphore
47+
val storageIndexManagers: Map<Path, StorageIndexManager>
4548

4649
@Throws(IOException::class)
4750
fun newFileCacheStore(accountant: Accountant): FileCacheStore

src/main/java/org/radarbase/output/cleaner/SourceDataCleaner.kt

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ import org.radarbase.output.FileStoreFactory
1111
import org.radarbase.output.accounting.Accountant
1212
import org.radarbase.output.accounting.AccountantImpl
1313
import org.radarbase.output.config.RestructureConfig
14+
import org.radarbase.output.source.StorageIndex
15+
import org.radarbase.output.source.StorageNode
1416
import org.radarbase.output.util.ResourceContext.Companion.resourceContext
1517
import org.radarbase.output.util.SuspendedCloseable.Companion.useSuspended
1618
import org.radarbase.output.util.Timer
@@ -43,11 +45,11 @@ class SourceDataCleaner(
4345
private val supervisor = SupervisorJob()
4446

4547
@Throws(IOException::class, InterruptedException::class)
46-
suspend fun process(directoryName: String) {
48+
suspend fun process(storageIndex: StorageIndex, directoryName: String) {
4749
// Get files and directories
4850
val absolutePath = Paths.get(directoryName)
4951

50-
val paths = topicPaths(absolutePath)
52+
val paths = topicPaths(storageIndex, absolutePath)
5153

5254
logger.info("{} topics found", paths.size)
5355

@@ -56,7 +58,7 @@ class SourceDataCleaner(
5658
launch {
5759
try {
5860
val deleteCount = fileStoreFactory.workerSemaphore.withPermit {
59-
mapTopic(p)
61+
mapTopic(storageIndex, p)
6062
}
6163
if (deleteCount > 0) {
6264
logger.info("Removed {} files in topic {}", deleteCount, p.fileName)
@@ -70,7 +72,7 @@ class SourceDataCleaner(
7072
}
7173
}
7274

73-
private suspend fun mapTopic(topicPath: Path): Long {
75+
private suspend fun mapTopic(storageIndex: StorageIndex, topicPath: Path): Long {
7476
val topic = topicPath.fileName.toString()
7577
return try {
7678
lockManager.tryWithLock(topic) {
@@ -84,7 +86,7 @@ class SourceDataCleaner(
8486
fileStoreFactory,
8587
)
8688
}
87-
deleteOldFiles(accountant, extractionCheck, topic, topicPath).toLong()
89+
deleteOldFiles(storageIndex, accountant, extractionCheck, topic, topicPath).toLong()
8890
}
8991
}
9092
}
@@ -95,14 +97,15 @@ class SourceDataCleaner(
9597
}
9698

9799
private suspend fun deleteOldFiles(
100+
storageIndex: StorageIndex,
98101
accountant: Accountant,
99102
extractionCheck: ExtractionCheck,
100103
topic: String,
101104
topicPath: Path,
102105
): Int {
103106
val offsets = accountant.offsets.copyForTopic(topic)
104107

105-
val paths = sourceStorage.listTopicFiles(topic, topicPath, maxFilesPerTopic) { f ->
108+
val paths = sourceStorage.listTopicFiles(storageIndex, topic, topicPath, maxFilesPerTopic) { f ->
106109
f.lastModified.isBefore(deleteThreshold) &&
107110
// ensure that there is a file with a larger offset also
108111
// processed, so the largest offset is never removed.
@@ -115,6 +118,7 @@ class SourceDataCleaner(
115118
logger.info("Removing {}", file.path)
116119
Timer.time("cleaner.delete") {
117120
sourceStorage.delete(file.path)
121+
storageIndex.remove(StorageNode.StorageFile(file.path, Instant.MIN))
118122
}
119123
true
120124
} else {
@@ -127,8 +131,8 @@ class SourceDataCleaner(
127131
}
128132
}
129133

130-
private suspend fun topicPaths(path: Path): List<Path> =
131-
sourceStorage.listTopics(path, excludeTopics)
134+
private suspend fun topicPaths(storageIndex: StorageIndex, path: Path): List<Path> =
135+
sourceStorage.listTopics(storageIndex, path, excludeTopics)
132136
// different services start on different topics to decrease lock contention
133137
.shuffled()
134138

@@ -147,9 +151,10 @@ class SourceDataCleaner(
147151

148152
private suspend fun runCleaner(factory: FileStoreFactory) {
149153
SourceDataCleaner(factory).useSuspended { cleaner ->
150-
for (input in factory.config.paths.inputs) {
154+
for ((input, indexManager) in factory.storageIndexManagers) {
155+
indexManager.update()
151156
logger.info("Cleaning {}", input)
152-
cleaner.process(input.toString())
157+
cleaner.process(indexManager.storageIndex, input.toString())
153158
}
154159
logger.info("Cleaned up {} files", cleaner.deletedFileCount.format())
155160
}

src/main/java/org/radarbase/output/config/ResourceConfig.kt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ data class ResourceConfig(
1111
val hdfs: HdfsConfig? = null,
1212
val local: LocalConfig? = null,
1313
val azure: AzureConfig? = null,
14+
val index: StorageIndexConfig? = null,
1415
) {
1516
@get:JsonIgnore
1617
val sourceType: ResourceType by lazy {
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
package org.radarbase.output.config
2+
3+
data class StorageIndexConfig(
4+
/** How often to fully sync the storage index, in seconds. */
5+
val fullSyncInterval: Long = 3600L,
6+
/**
7+
* How often to sync empty directories with the storage index, in seconds.
8+
* If this is very large, empty directories will only be scanned during
9+
* full sync.
10+
*/
11+
val emptyDirectorySyncInterval: Long = 900L,
12+
)

0 commit comments

Comments
 (0)