diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 0000000..78b36ca
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1 @@
+root = true
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
new file mode 100644
index 0000000..ce2214a
--- /dev/null
+++ b/.github/workflows/codeql.yml
@@ -0,0 +1,89 @@
+# For most projects, this workflow file will not need changing; you simply need
+# to commit it to your repository.
+#
+# You may wish to alter this file to override the set of languages analyzed,
+# or to provide custom queries or build logic.
+#
+# ******** NOTE ********
+# We have attempted to detect the languages in your repository. Please check
+# the `language` matrix defined below to confirm you have the correct set of
+# supported CodeQL languages.
+#
+name: "CodeQL"
+
+on:
+  push:
+    branches: [ "main", "dev" ]
+  pull_request:
+    branches: [ "main", "dev" ]
+  schedule:
+    - cron: '24 21 * * 0'
+
+jobs:
+  analyze:
+    name: Analyze
+    # Runner size impacts CodeQL analysis time. To learn more, please see:
+    #   - https://gh.io/recommended-hardware-resources-for-running-codeql
+    #   - https://gh.io/supported-runners-and-hardware-resources
+    #   - https://gh.io/using-larger-runners
+    # Consider using larger runners for possible analysis time improvements.
+    runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }}
+    timeout-minutes: ${{ (matrix.language == 'swift' && 120) || 360 }}
+    permissions:
+      # required for all workflows
+      security-events: write
+
+      # only required for workflows in private repositories
+      actions: read
+      contents: read
+
+    strategy:
+      fail-fast: false
+      matrix:
+        language: [ 'java-kotlin' ]
+        # CodeQL supports [ 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift' ]
+        # Use only 'java-kotlin' to analyze code written in Java, Kotlin or both
+        # Use only 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both
+        # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support
+
+    steps:
+    - name: Checkout repository
+      uses: actions/checkout@v4
+
+    - uses: actions/setup-java@v4
+      with:
+        distribution: 'temurin' # See 'Supported distributions' for available options
+        java-version: '17'
+
+    # Initializes the CodeQL tools for scanning.
+    - name: Initialize CodeQL
+      uses: github/codeql-action/init@v3
+      with:
+        languages: ${{ matrix.language }}
+        # If you wish to specify custom queries, you can do so here or in a config file.
+        # By default, queries listed here will override any specified in a config file.
+        # Prefix the list here with "+" to use these queries and those in the config file.
+
+        # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
+        # queries: security-extended,security-and-quality
+
+
+    # Autobuild attempts to build any compiled languages (C/C++, C#, Go, Java, or Swift).
+    # If this step fails, then you should remove it and run the build manually (see below)
+    - name: Autobuild
+      uses: github/codeql-action/autobuild@v3
+
+    # ℹ️ Command-line programs to run using the OS shell.
+    # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
+
+    #   If the Autobuild fails above, remove it and uncomment the following three lines.
+    #   Modify them (or add more) to build your code; if your project requires a different build process, refer to the example below for guidance.
+
+    # - run: |
+    #     echo "Run, Build Application using script"
+    #     ./location_of_script_within_repo/buildscript.sh
+
+    - name: Perform CodeQL Analysis
+      uses: github/codeql-action/analyze@v3
+      with:
+        category: "/language:${{matrix.language}}"
\ No newline at end of file
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 8279283..7c03db5 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -95,8 +95,8 @@ jobs:
           # Use runtime labels from docker_meta as well as fixed labels
           labels: |
             ${{ steps.docker_meta.outputs.labels }}
-            maintainer=Joris Borgdorff
-            org.opencontainers.image.authors=Joris Borgdorff
+            maintainer=Bastiaan de Graaf
+            org.opencontainers.image.authors=Bastiaan de Graaf
             org.opencontainers.image.vendor=RADAR-base
             org.opencontainers.image.licenses=Apache-2.0
diff --git a/.github/workflows/publish_snapshots.yml b/.github/workflows/publish_snapshots.yml
index e9eb6fb..5459a8e 100644
--- a/.github/workflows/publish_snapshots.yml
+++ b/.github/workflows/publish_snapshots.yml
@@ -17,10 +17,6 @@ jobs:
       # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
       - uses: actions/checkout@v3
 
-      - name: Has SNAPSHOT version
-        id: is-snapshot
-        run: grep 'version = ".*-SNAPSHOT"' build.gradle.kts
-
       - uses: actions/setup-java@v3
         with:
          distribution: temurin
@@ -29,6 +25,11 @@ jobs:
       - name: Setup Gradle
         uses: gradle/gradle-build-action@v2
 
+      - name: Has SNAPSHOT version
+        id: is-snapshot
+        run: |
+          ./gradlew properties | grep 'version: .*-SNAPSHOT'
+
       - name: Install gpg secret key
         run: |
           cat <(echo -e "${{ secrets.OSSRH_GPG_SECRET_KEY }}") | gpg --batch --import
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index d48e3dd..d224bdf 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -91,8 +91,8 @@ jobs:
           # Use runtime labels from docker_meta as well as fixed labels
           labels: |
             ${{ steps.docker_meta.outputs.labels }}
-            maintainer=Joris Borgdorff
-            org.opencontainers.image.authors=Joris Borgdorff
+            maintainer=Bastiaan de Graaf
+            org.opencontainers.image.authors=Bastiaan de Graaf
             org.opencontainers.image.vendor=RADAR-base
             org.opencontainers.image.licenses=Apache-2.0
diff --git a/.github/workflows/snyk.yaml b/.github/workflows/snyk.yaml
index 06f5c8b..2fe88dc 100644
--- a/.github/workflows/snyk.yaml
+++ b/.github/workflows/snyk.yaml
@@ -3,6 +3,7 @@ on:
   pull_request:
     branches:
       - main
+      - dev
 
 jobs:
   security:
@@ -29,3 +30,6 @@ jobs:
             --configuration-matching='^runtimeClasspath$'
             --org=radar-base
             --policy-path=$PWD/.snyk
+            --all-projects
+            --severity-threshold=high
+            --fail-on=upgradable
diff --git a/Dockerfile b/Dockerfile
index 04e1eff..9077fd1 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -10,20 +10,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-FROM --platform=$BUILDPLATFORM gradle:7.5-jdk17 AS builder
+FROM --platform=$BUILDPLATFORM gradle:8.4-jdk17 AS builder
 
 RUN mkdir /code
 WORKDIR /code
 
 ENV GRADLE_USER_HOME=/code/.gradlecache \
-    GRADLE_OPTS=-Djdk.lang.Process.launchMechanism=vfork
+    GRADLE_OPTS="-Djdk.lang.Process.launchMechanism=vfork -Dorg.gradle.vfs.watch=false"
 
 COPY ./build.gradle.kts ./gradle.properties ./settings.gradle.kts /code/
+COPY ./buildSrc /code/buildSrc
 
-RUN gradle downloadDependencies copyDependencies startScripts --no-watch-fs
+RUN gradle downloadDependencies copyDependencies startScripts
 
 COPY ./src /code/src
 
-RUN gradle jar --no-watch-fs
+RUN gradle jar
 
 FROM eclipse-temurin:17-jre
diff --git a/README.md b/README.md
index e1f8ab6..7e56876 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 # Restructure Kafka connector output files
 
 Data streamed by a Kafka Connector will be converted to a RADAR-base oriented output directory, by organizing it by project, user and collection date.
-It supports data written by [RADAR S3 sink connector](https://github.com/RADAR-base/RADAR-S3-Connector) is streamed to files based on topic name only. This package transforms that output to a local directory structure as follows: `projectId/userId/topic/date_hour.csv`. The date and hour are extracted from the `time` field of each record, and is formatted in UTC time. This package is included in the [RADAR-Docker](https://github.com/RADAR-base/RADAR-Docker) repository, in the `dcompose/radar-cp-hadoop-stack/bin/hdfs-restructure` script.
+It supports data written by the [RADAR S3 sink connector](https://github.com/RADAR-base/RADAR-S3-Connector), which streams data to files based on topic name only. This package transforms that output to a local directory structure as follows: `projectId/userId/topic/date_hour.csv`. The date and hour are extracted from the `time` field of each record and are formatted in UTC time.
 
 ## Upgrade instructions
 
@@ -90,7 +90,7 @@ By default, this will output the data in CSV format. If JSON format is preferred
 radar-output-restructure --format json --output-directory <output_folder> <input_path_1> [<input_path_2> ...]
 ```
 
-By default, files records are not deduplicated after writing. To enable this behaviour, specify the option `--deduplicate` or `-d`. This set to false by default because of an issue with Biovotion data. Please see - [issue #16](https://github.com/RADAR-base/Restructure-HDFS-topic/issues/16) before enabling it. Deduplication can also be enabled or disabled per topic using the config file. If lines should be deduplicated using a subset of fields, e.g. only `sourceId` and `time` define a unique record and only the last record with duplicate values should be kept, then specify `topics: <topicName>: deduplication: distinctFields: [key.sourceId, value.time]`.
+By default, file records are not deduplicated after writing. To enable this behaviour, specify the option `--deduplicate` or `-d`. This is set to false by default because of an issue with Biovotion data; please see [issue #16](https://github.com/RADAR-base/radar-output-restructure/issues/16) before enabling it. Deduplication can also be enabled or disabled per topic using the config file. If lines should be deduplicated using a subset of fields, e.g. only `sourceId` and `time` define a unique record and only the last record with duplicate values should be kept, then specify `topics: <topicName>: deduplication: distinctFields: [key.sourceId, value.time]`.
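+
+For example, a minimal per-topic sketch of this setting in the config file (the topic name `connect_fitbit_source` is only a placeholder; substitute a topic from your own deployment):
+
+```yaml
+topics:
+  connect_fitbit_source:
+    deduplication:
+      enable: true
+      # only compare these fields when deciding whether two records are duplicates
+      distinctFields: [key.sourceId, value.time]
+```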
 
 ### Compression
@@ -118,8 +118,16 @@ source:
   # only actually needed if source type is hdfs
   azure:
     # azure options
+  index:
+    # Interval (in seconds) to fully synchronize the index with the source storage
+    fullSyncInterval: 3600
+    # Interval (in seconds) to sync empty directories with.
+    # They are also synced during a full sync.
+    emptyDirectorySyncInterval: 900
 ```
 
+The index scans the source storage before any other operations take place. All subsequent list operations are served from the index rather than from the source itself. This is especially relevant for S3 storage, where list operations are billed per request.
+
 The target is similar, and in addition supports the local file system (`local`).
 
 ```yaml
diff --git a/build.gradle.kts b/build.gradle.kts
index 391a360..9f40c6b 100644
--- a/build.gradle.kts
+++ b/build.gradle.kts
@@ -1,32 +1,43 @@
-import com.github.benmanes.gradle.versions.updates.DependencyUpdatesTask
-import org.gradle.api.tasks.testing.logging.TestExceptionFormat.FULL
-import org.jetbrains.kotlin.gradle.tasks.KotlinCompile
+import org.radarbase.gradle.plugin.radarKotlin
+import org.radarbase.gradle.plugin.radarPublishing
 import java.time.Duration
 
 plugins {
-    kotlin("jvm")
-    application
-    `maven-publish`
-    signing
-    id("org.jetbrains.dokka")
-    id("com.avast.gradle.docker-compose")
-    id("com.github.ben-manes.versions")
-    id("io.github.gradle-nexus.publish-plugin")
-    id("org.jlleitschuh.gradle.ktlint") version "11.0.0"
+    id("application")
+    id("org.radarbase.radar-root-project") version Versions.radarCommons
+    id("org.radarbase.radar-dependency-management") version Versions.radarCommons
+    id("org.radarbase.radar-kotlin") version Versions.radarCommons
+    id("org.radarbase.radar-publishing") version Versions.radarCommons
+    id("com.avast.gradle.docker-compose") version Versions.dockerCompose
 }
 
-group = "org.radarbase"
-version = "2.3.2"
+description = "RADAR-base output restructuring"
 
-repositories {
-    mavenCentral()
+radarRootProject {
+    projectVersion.set(Versions.project)
+    gradleVersion.set(Versions.wrapper)
 }
 
-description = "RADAR-base output restructuring"
-val website = "https://radar-base.org"
-val githubRepoName = "RADAR-base/radar-output-restructure"
-val githubUrl = "https://github.com/$githubRepoName"
-val issueUrl = "$githubUrl/issues"
+radarKotlin {
+    kotlinVersion.set(Versions.kotlin)
+    javaVersion.set(Versions.java)
+    log4j2Version.set(Versions.log4j2)
+    slf4jVersion.set(Versions.slf4j)
+    junitVersion.set(Versions.junit)
+}
+
+radarPublishing {
+    val githubRepoName = "RADAR-base/radar-output-restructure"
+    githubUrl.set("https://github.com/$githubRepoName.git")
+    developers {
+        developer {
+            id.set("bdegraaf1234")
+            name.set("Bastiaan de Graaf")
+            email.set("bastiaan@thehyve.nl")
+            organization.set("The Hyve")
+        }
+    }
+}
 
 sourceSets {
     create("integrationTest") {
@@ -37,101 +48,59 @@
     configurations["integrationTestImplementation"].extendsFrom(
         configurations.implementation.get(),
-        configurations.testImplementation.get()
+        configurations.testImplementation.get(),
     )
 
     configurations["integrationTestRuntimeOnly"].extendsFrom(
         configurations.runtimeOnly.get(),
-        configurations.testRuntimeOnly.get()
+        configurations.testRuntimeOnly.get(),
     )
 
 dependencies {
-    val avroVersion: String by project
-    api("org.apache.avro:avro:$avroVersion")
-    val snappyVersion: String by project
-    runtimeOnly("org.xerial.snappy:snappy-java:$snappyVersion")
+    api("org.apache.avro:avro:${Versions.avro}")
+    runtimeOnly("org.xerial.snappy:snappy-java:${Versions.snappy}")
 
     implementation(kotlin("reflect"))
-    val coroutinesVersion: String by project
-
implementation("org.jetbrains.kotlinx:kotlinx-coroutines-core:$coroutinesVersion") + implementation("org.jetbrains.kotlinx:kotlinx-coroutines-core:${Versions.coroutines}") - val jacksonVersion: String by project - api(platform("com.fasterxml.jackson:jackson-bom:$jacksonVersion")) + api(platform("com.fasterxml.jackson:jackson-bom:${Versions.jackson}")) implementation("com.fasterxml.jackson.core:jackson-databind") implementation("com.fasterxml.jackson.dataformat:jackson-dataformat-yaml") { - val snakeYamlVersion: String by project - runtimeOnly("org.yaml:snakeyaml:$snakeYamlVersion") + runtimeOnly("org.yaml:snakeyaml:${Versions.snakeYaml}") } implementation("com.fasterxml.jackson.dataformat:jackson-dataformat-csv") implementation("com.fasterxml.jackson.module:jackson-module-kotlin") implementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310") - val jedisVersion: String by project - implementation("redis.clients:jedis:$jedisVersion") - - val jCommanderVersion: String by project - implementation("com.beust:jcommander:$jCommanderVersion") + implementation("redis.clients:jedis:${Versions.jedis}") - val almworksVersion: String by project - implementation("com.almworks.integers:integers:$almworksVersion") + implementation("com.beust:jcommander:${Versions.jCommander}") - val minioVersion: String by project - implementation("io.minio:minio:$minioVersion") { - val guavaVersion: String by project - runtimeOnly("com.google.guava:guava:$guavaVersion") + implementation("com.almworks.integers:integers:${Versions.almworks}") - val okhttpVersion: String by project - runtimeOnly("com.squareup.okhttp3:okhttp:$okhttpVersion") + implementation("io.minio:minio:${Versions.minio}") { + runtimeOnly("com.google.guava:guava:${Versions.guava}") + runtimeOnly("com.squareup.okhttp3:okhttp:${Versions.okhttp}") } - val azureStorageVersion: String by project - implementation("com.azure:azure-storage-blob:$azureStorageVersion") { - val nettyVersion: String by project - runtimeOnly(platform("io.netty:netty-bom:$nettyVersion")) - val projectReactorNettyVersion: String by project - runtimeOnly("io.projectreactor.netty:reactor-netty-http:$projectReactorNettyVersion") + implementation("com.azure:azure-storage-blob:${Versions.azureStorage}") { + runtimeOnly(platform("io.netty:netty-bom:${Versions.netty}")) + runtimeOnly("io.projectreactor.netty:reactor-netty-http:${Versions.projectReactorNetty}") } - val opencsvVersion: String by project - implementation("com.opencsv:opencsv:$opencsvVersion") { - val apacheCommonsTextVersion: String by project - runtimeOnly("org.apache.commons:commons-text:$apacheCommonsTextVersion") + implementation("com.opencsv:opencsv:${Versions.opencsv}") { + runtimeOnly("org.apache.commons:commons-text:${Versions.apacheCommonsText}") } + implementation("org.radarbase:managementportal-client:${Versions.managementPortal}") + implementation("org.radarbase:radar-commons-kotlin:${Versions.radarCommons}") - val slf4jVersion: String by project - implementation("org.slf4j:slf4j-api:$slf4jVersion") + testImplementation("org.radarbase:radar-schemas-commons:${Versions.radarSchemas}") + testImplementation("org.jetbrains.kotlinx:kotlinx-coroutines-test:${Versions.coroutines}") - val log4jVersion: String by project - runtimeOnly("org.apache.logging.log4j:log4j-core:$log4jVersion") - runtimeOnly("org.apache.logging.log4j:log4j-slf4j2-impl:$log4jVersion") - runtimeOnly("org.apache.logging.log4j:log4j-jul:$log4jVersion") - - val radarSchemasVersion: String by project - 
testImplementation("org.radarbase:radar-schemas-commons:$radarSchemasVersion")
-    testImplementation("org.jetbrains.kotlinx:kotlinx-coroutines-test:$coroutinesVersion")
-
-    val junitVersion: String by project
-    testImplementation("org.junit.jupiter:junit-jupiter-api:$junitVersion")
-    testImplementation("org.junit.jupiter:junit-jupiter-params:$junitVersion")
-    testRuntimeOnly("org.junit.jupiter:junit-jupiter-engine:$junitVersion")
-    testImplementation("org.hamcrest:hamcrest:2.2")
-    val mockitoKotlinVersion: String by project
-    testImplementation("org.mockito.kotlin:mockito-kotlin:$mockitoKotlinVersion")
-
-    val dokkaVersion: String by project
-    dokkaHtmlPlugin("org.jetbrains.dokka:kotlin-as-java-plugin:$dokkaVersion")
-
-    val jsoupVersion: String by project
-    dokkaPlugin("org.jsoup:jsoup:$jsoupVersion")
-    dokkaRuntime("org.jsoup:jsoup:$jsoupVersion")
-    dokkaPlugin(platform("com.fasterxml.jackson:jackson-bom:$jacksonVersion"))
-    dokkaRuntime(platform("com.fasterxml.jackson:jackson-bom:$jacksonVersion"))
+    testImplementation("org.hamcrest:hamcrest:${Versions.hamcrest}")
+    testImplementation("org.mockito.kotlin:mockito-kotlin:${Versions.mockitoKotlin}")
 }
 
 application {
     mainClass.set("org.radarbase.output.Application")
-    applicationDefaultJvmArgs = listOf(
-        "-Djava.security.egd=file:/dev/./urandom",
-        "-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager",
-    )
 }
 
 distributions {
@@ -144,42 +113,6 @@
     }
 }
 
-tasks.withType<KotlinCompile> {
-    kotlinOptions {
-        jvmTarget = "17"
-        apiVersion = "1.6"
-        languageVersion = "1.6"
-        freeCompilerArgs = listOf("-opt-in=kotlin.RequiresOptIn")
-    }
-}
-
-// custom tasks for creating source/javadoc jars
-val sourcesJar by tasks.registering(Jar::class) {
-    archiveClassifier.set("sources")
-    from(sourceSets.main.get().allSource)
-    dependsOn(tasks.classes)
-}
-
-val dokkaJar by tasks.registering(Jar::class) {
-    archiveClassifier.set("javadoc")
-    from("$buildDir/dokka/javadoc/")
-    dependsOn(tasks.dokkaJavadoc)
-}
-
-tasks.withType<Tar> {
-    compression = Compression.GZIP
-    archiveExtension.set("tar.gz")
-}
-
-tasks.withType<Jar> {
-    manifest {
-        attributes(
-            "Implementation-Title" to project.name,
-            "Implementation-Version" to project.version
-        )
-    }
-}
-
 tasks.startScripts {
     classpath = classpath?.let {
         it + files("lib/PlaceHolderForPluginPath")
     }
@@ -189,77 +122,6 @@
     }
 }
 
-publishing {
-    publications {
-        create<MavenPublication>("mavenJar") {
-            from(components["java"])
-            artifact(sourcesJar)
-            artifact(dokkaJar)
-            pom {
-                name.set(project.name)
-                url.set(githubUrl)
-                description.set(project.description)
-
-                licenses {
-                    license {
-                        name.set("The Apache Software License, Version 2.0")
-                        url.set("https://www.apache.org/licenses/LICENSE-2.0.txt")
-                        distribution.set("repo")
-                    }
-                }
-                developers {
-                    developer {
-                        id.set("blootsvoets")
-                        name.set("Joris Borgdorff")
-                        email.set("joris@thehyve.nl")
-                        organization.set("The Hyve")
-                    }
-                }
-                issueManagement {
-                    system.set("GitHub")
-                    url.set(issueUrl)
-                }
-                organization {
-                    name.set("RADAR-base")
-                    url.set(website)
-                }
-                scm {
-                    connection.set("scm:git:$githubUrl")
-                    url.set(githubUrl)
-                }
-            }
-        }
-    }
-}
-
-signing {
-    useGpgCmd()
-    isRequired = true
-    sign(tasks["sourcesJar"], tasks["dokkaJar"])
-    sign(publishing.publications["mavenJar"])
-}
-
-tasks.withType<Sign> {
-    onlyIf { gradle.taskGraph.hasTask(project.tasks["publish"]) }
-}
-
-fun Project.propertyOrEnv(propertyName: String, envName: String): String? {
-    return if (hasProperty(propertyName)) {
-        property(propertyName)?.toString()
-    } else {
-        System.getenv(envName)
-    }
-}
-
-nexusPublishing {
-    repositories {
-        sonatype {
-            username.set(propertyOrEnv("ossrh.user", "OSSRH_USER"))
-            password.set(propertyOrEnv("ossrh.password", "OSSRH_PASSWORD"))
-        }
-    }
-}
-
 val integrationTest by tasks.registering(Test::class) {
     description = "Runs integration tests."
     group = "verification"
@@ -280,48 +142,3 @@ dockerCompose {
 tasks["composeUp"].dependsOn("composePull")
 tasks["check"].dependsOn(integrationTest)
-
-tasks.withType<Test> {
-    useJUnitPlatform()
-    testLogging {
-        events("passed", "skipped", "failed")
-        showStandardStreams = true
-        exceptionFormat = FULL
-    }
-}
-
-tasks.register("downloadDependencies") {
-    doLast {
-        description = "Pre-downloads dependencies"
-        configurations.compileClasspath.get().files
-        configurations.runtimeClasspath.get().files
-    }
-    outputs.upToDateWhen { false }
-}
-
-tasks.register<Copy>("copyDependencies") {
-    from(configurations.runtimeClasspath.get().files)
-    into("$buildDir/third-party/")
-}
-
-fun isNonStable(version: String): Boolean {
-    val stableKeyword = listOf("RELEASE", "FINAL", "GA", "JRE").any { version.toUpperCase().contains(it) }
-    val regex = "^[0-9,.v-]+(-r)?$".toRegex()
-    val isStable = stableKeyword || regex.matches(version)
-    return isStable.not()
-}
-
-tasks.named<DependencyUpdatesTask>("dependencyUpdates").configure {
-    rejectVersionIf {
-        isNonStable(candidate.version)
-    }
-}
-
-ktlint {
-    version.set("0.45.2")
-    disabledRules.set(setOf("no-wildcard-imports"))
-}
-
-tasks.wrapper {
-    gradleVersion = "7.6"
-}
diff --git a/buildSrc/build.gradle.kts b/buildSrc/build.gradle.kts
new file mode 100644
index 0000000..7663976
--- /dev/null
+++ b/buildSrc/build.gradle.kts
@@ -0,0 +1,21 @@
+import org.jetbrains.kotlin.gradle.dsl.JvmTarget
+import org.jetbrains.kotlin.gradle.tasks.KotlinCompile
+
+plugins {
+    kotlin("jvm") version "1.9.22"
+}
+
+repositories {
+    mavenCentral()
+}
+
+tasks.withType<JavaCompile> {
+    sourceCompatibility = "17"
+    targetCompatibility = "17"
+}
+
+tasks.withType<KotlinCompile> {
+    compilerOptions {
+        jvmTarget.set(JvmTarget.JVM_17)
+    }
+}
diff --git a/buildSrc/src/main/kotlin/Versions.kt b/buildSrc/src/main/kotlin/Versions.kt
new file mode 100644
index 0000000..9e86d43
--- /dev/null
+++ b/buildSrc/src/main/kotlin/Versions.kt
@@ -0,0 +1,37 @@
+@Suppress("ConstPropertyName")
+object Versions {
+    const val project = "3.0.0"
+
+    const val java = 17
+    const val kotlin = "1.9.22"
+    const val dockerCompose = "0.17.5"
+
+    const val radarCommons = "1.1.2"
+    const val radarSchemas = "0.8.7"
+    const val jackson = "2.15.3"
+    const val slf4j = "2.0.9"
+    const val log4j2 = "2.21.0"
+    const val junit = "5.10.0"
+    const val avro = "1.11.3"
+
+    const val mockitoKotlin = "5.1.0"
+    const val hamcrest = "2.2"
+
+    const val wrapper = "8.4"
+
+    const val managementPortal = "2.1.1"
+    const val coroutines = "1.7.3"
+    const val snappy = "1.1.10.5"
+    const val jCommander = "1.82"
+    const val almworks = "1.1.2"
+    const val minio = "8.5.10"
+    const val guava = "31.1-jre"
+    const val opencsv = "5.8"
+    const val okhttp = "4.12.0"
+    const val jedis = "3.6.2"
+    const val azureStorage = "12.25.1"
+    const val netty = "4.1.100.Final"
+    const val snakeYaml = "2.2"
+    const val apacheCommonsText = "1.10.0"
+    const val projectReactorNetty = "1.1.13"
+}
diff --git a/gradle.properties b/gradle.properties
index cd4fcfd..7fc6f1f 100644
--- a/gradle.properties
+++ b/gradle.properties
@@ -1,31 +1 @@
 kotlin.code.style=official
-
-kotlinVersion=1.7.22
-dokkaVersion=1.7.20
-dockerComposeVersion=0.16.11 -dependencyUpdateVersion=0.44.0 -nexusPublishVersion=1.1.0 -jsoupVersion=1.15.3 - -coroutinesVersion=1.6.4 -avroVersion=1.11.1 -snappyVersion=1.1.8.4 -jacksonVersion=2.14.1 -jCommanderVersion=1.82 -almworksVersion=1.1.2 -minioVersion=8.4.6 -guavaVersion=31.1-jre -opencsvVersion=5.7.1 -okhttpVersion=4.10.0 -jedisVersion=4.3.1 -slf4jVersion=2.0.5 -log4jVersion=2.19.0 -azureStorageVersion=12.20.1 -nettyVersion=4.1.85.Final -snakeYamlVersion=1.33 -apacheCommonsTextVersion=1.10.0 -projectReactorNettyVersion=1.0.24 - -junitVersion=5.9.1 -mockitoKotlinVersion=4.1.0 -radarSchemasVersion=0.8.2 diff --git a/gradle/wrapper/gradle-wrapper.jar b/gradle/wrapper/gradle-wrapper.jar index 943f0cb..7f93135 100644 Binary files a/gradle/wrapper/gradle-wrapper.jar and b/gradle/wrapper/gradle-wrapper.jar differ diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties index f398c33..3fa8f86 100644 --- a/gradle/wrapper/gradle-wrapper.properties +++ b/gradle/wrapper/gradle-wrapper.properties @@ -1,6 +1,7 @@ distributionBase=GRADLE_USER_HOME distributionPath=wrapper/dists -distributionUrl=https\://services.gradle.org/distributions/gradle-7.6-bin.zip +distributionUrl=https\://services.gradle.org/distributions/gradle-8.4-bin.zip networkTimeout=10000 +validateDistributionUrl=true zipStoreBase=GRADLE_USER_HOME zipStorePath=wrapper/dists diff --git a/gradlew b/gradlew index 65dcd68..0adc8e1 100755 --- a/gradlew +++ b/gradlew @@ -83,10 +83,8 @@ done # This is normally unused # shellcheck disable=SC2034 APP_BASE_NAME=${0##*/} -APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit - -# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. -DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' +# Discard cd standard output in case $CDPATH is set (https://github.com/gradle/gradle/issues/25036) +APP_HOME=$( cd "${APP_HOME:-./}" > /dev/null && pwd -P ) || exit # Use the maximum available, or set MAX_FD != -1 to use that value. MAX_FD=maximum @@ -133,10 +131,13 @@ location of your Java installation." fi else JAVACMD=java - which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. + if ! command -v java >/dev/null 2>&1 + then + die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. Please set the JAVA_HOME variable in your environment to match the location of your Java installation." + fi fi # Increase the maximum file descriptors if we can. @@ -144,7 +145,7 @@ if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then case $MAX_FD in #( max*) # In POSIX sh, ulimit -H is undefined. That's why the result is checked to see if it worked. - # shellcheck disable=SC3045 + # shellcheck disable=SC3045 MAX_FD=$( ulimit -H -n ) || warn "Could not query maximum file descriptor limit" esac @@ -152,7 +153,7 @@ if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then '' | soft) :;; #( *) # In POSIX sh, ulimit -n is undefined. That's why the result is checked to see if it worked. - # shellcheck disable=SC3045 + # shellcheck disable=SC3045 ulimit -n "$MAX_FD" || warn "Could not set maximum file descriptor limit to $MAX_FD" esac @@ -197,6 +198,10 @@ if "$cygwin" || "$msys" ; then done fi + +# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 
+DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' + # Collect all arguments for the java command; # * $DEFAULT_JVM_OPTS, $JAVA_OPTS, and $GRADLE_OPTS can contain fragments of # shell script including quotes and variable substitutions, so put them in diff --git a/restructure.yml b/restructure.yml index d8aa837..ae57190 100644 --- a/restructure.yml +++ b/restructure.yml @@ -7,7 +7,7 @@ service: # Source data resource # @since: 0.7.0 source: - type: s3 # hdfs, azure or s3 + type: s3 # azure or s3 s3: endpoint: http://localhost:9000 # using AWS S3 endpoint is also possible. bucket: radar @@ -31,9 +31,12 @@ source: # If true, try to read the metadata property "endOffset" to determine the # final offset of an input object. #endOffsetFromMetadata: false - # only actually needed if source type is hdfs - hdfs: - nameNodes: [hdfs-namenode-1, hdfs-namenode-2] + index: + # Interval to fully synchronize the index with the storage + fullSyncInterval: 3600 + # Interval to sync empty directories with. + # They are also synced during a full sync. + emptyDirectorySyncInterval: 900 # Target data resource # @since: 0.7.0 @@ -74,7 +77,7 @@ compression: # Compression type: none, zip or gzip type: gzip # Compression Factory class - # factory: org.radarbase.hdfs.compression.CompressionFactory + # factory: org.radarbase.output.compression.CompressionFactory # Additional compression properties # properties: {} @@ -90,9 +93,11 @@ format: # Ignore specific fields to consider records distinct. Disregarded if empty. # ignoreFields: [] # Format factory class - # factory: org.radarbase.hdfs.format.FormatFactory + # factory: org.radarbase.output.format.FormatFactory # Additional format properties # properties: {} + # Do not write certain fields to file + # excludeFields: [] # Worker settings. Each worker thread has its own cache and topic, so the # settings only apply to a single thread. @@ -147,6 +152,11 @@ topics: enable: true # deduplicate this topic only using given fields. 
distinctFields: [key.sourceId, value.time] + # Do not write certain fields to file + # In this case, exclude user and project ID since they are always the same + excludeFields: + - key.userId + - key.projectId # topic name connect_fitbit_source2: # deduplicate this topic, regardless of the format settings diff --git a/settings.gradle.kts b/settings.gradle.kts index 1df5164..8d5c2e7 100644 --- a/settings.gradle.kts +++ b/settings.gradle.kts @@ -1,20 +1,8 @@ -pluginManagement { - plugins { - val kotlinVersion: String by settings - kotlin("jvm") version kotlinVersion - - val dokkaVersion: String by settings - id("org.jetbrains.dokka") version dokkaVersion - - val dockerComposeVersion: String by settings - id("com.avast.gradle.docker-compose") version dockerComposeVersion - - val dependencyUpdateVersion: String by settings - id("com.github.ben-manes.versions") version dependencyUpdateVersion +rootProject.name = "radar-output-restructure" - val nexusPublishVersion: String by settings - id("io.github.gradle-nexus.publish-plugin") version nexusPublishVersion +pluginManagement { + repositories { + gradlePluginPortal() + mavenCentral() } } - -rootProject.name = "radar-output-restructure" diff --git a/src/integrationTest/java/org/radarbase/output/RestructureS3IntegrationTest.kt b/src/integrationTest/java/org/radarbase/output/RestructureS3IntegrationTest.kt index 1453ff5..d366350 100644 --- a/src/integrationTest/java/org/radarbase/output/RestructureS3IntegrationTest.kt +++ b/src/integrationTest/java/org/radarbase/output/RestructureS3IntegrationTest.kt @@ -1,12 +1,28 @@ package org.radarbase.output -import io.minio.* +import io.minio.BucketExistsArgs +import io.minio.GetObjectArgs +import io.minio.ListObjectsArgs +import io.minio.MakeBucketArgs import io.minio.ObjectWriteArgs.MAX_PART_SIZE -import kotlinx.coroutines.* +import io.minio.PutObjectArgs +import io.minio.RemoveBucketArgs +import io.minio.RemoveObjectArgs +import kotlinx.coroutines.Dispatchers +import kotlinx.coroutines.coroutineScope +import kotlinx.coroutines.joinAll +import kotlinx.coroutines.launch import kotlinx.coroutines.test.runTest +import kotlinx.coroutines.withContext import org.junit.jupiter.api.Assertions.assertEquals import org.junit.jupiter.api.Test -import org.radarbase.output.config.* +import org.radarbase.output.config.PathConfig +import org.radarbase.output.config.PathFormatterConfig +import org.radarbase.output.config.ResourceConfig +import org.radarbase.output.config.RestructureConfig +import org.radarbase.output.config.S3Config +import org.radarbase.output.config.TopicConfig +import org.radarbase.output.config.WorkerConfig import org.radarbase.output.util.SuspendedCloseable.Companion.useSuspended import org.radarbase.output.util.Timer import org.radarbase.output.util.bucketBuild @@ -14,8 +30,40 @@ import org.radarbase.output.util.objectBuild import java.nio.charset.StandardCharsets.UTF_8 import java.nio.file.Paths -@OptIn(ExperimentalCoroutinesApi::class) class RestructureS3IntegrationTest { + @Test + fun configuration() = runTest { + Timer.isEnabled = true + val sourceConfig = S3Config( + endpoint = "http://localhost:9000", + accessToken = "minioadmin", + secretKey = "minioadmin", + bucket = "source", + ) + val targetConfig = sourceConfig.copy(bucket = "target") + val topicConfig = mapOf( + "application_server_status" to TopicConfig( + pathProperties = PathFormatterConfig( + format = "\${projectId}/\${userId}/\${topic}/\${value:serverStatus}/\${filename}", + ), + ), + ) + val config = RestructureConfig( + source = 
ResourceConfig("s3", s3 = sourceConfig), + target = ResourceConfig("s3", s3 = targetConfig), + paths = PathConfig( + inputs = listOf(Paths.get("in")), + // These properties were added to verify that they are present later in PathConfig.createFactory() + properties = mapOf("one" to "1", "two" to "2", "three" to "3"), + ), + worker = WorkerConfig(minimumFileAge = 0L), + topics = topicConfig, + ) + val application = Application(config) + + assertEquals(4, application.pathFactory.pathConfig.path.properties.count()) + } + @Test fun integration() = runTest { Timer.isEnabled = true @@ -28,22 +76,30 @@ class RestructureS3IntegrationTest { val targetConfig = sourceConfig.copy(bucket = "target") val topicConfig = mapOf( "application_server_status" to TopicConfig( - pathProperties = mapOf( - "format" to "\${projectId}/\${userId}/\${topic}/\${value:serverStatus}/\${filename}" - ) - ) + pathProperties = PathFormatterConfig( + format = "\${projectId}/\${userId}/\${topic}/\${value:serverStatus}/\${filename}", + ), + ), ) val config = RestructureConfig( source = ResourceConfig("s3", s3 = sourceConfig), target = ResourceConfig("s3", s3 = targetConfig), - paths = PathConfig(inputs = listOf(Paths.get("in"))), + paths = PathConfig( + inputs = listOf(Paths.get("in")), + // These properties were added to verify that they are present later in PathConfig.createFactory() + properties = mapOf("one" to "1", "two" to "2", "three" to "3"), + ), worker = WorkerConfig(minimumFileAge = 0L), topics = topicConfig, ) val application = Application(config) + + assertEquals(4, application.pathFactory.pathConfig.path.properties.count()) + val sourceClient = sourceConfig.createS3Client() - if (!sourceClient.bucketExists(BucketExistsArgs.builder().bucketBuild(sourceConfig.bucket))) { - sourceClient.makeBucket(MakeBucketArgs.builder().bucketBuild(sourceConfig.bucket)) + val sourceBucket = requireNotNull(sourceConfig.bucket) + if (!sourceClient.bucketExists(BucketExistsArgs.builder().bucketBuild(sourceBucket))) { + sourceClient.makeBucket(MakeBucketArgs.builder().bucketBuild(sourceBucket)) } val resourceFiles = listOf( @@ -58,9 +114,9 @@ class RestructureS3IntegrationTest { .useSuspended { statusFile -> sourceClient.putObject( PutObjectArgs.Builder() - .objectBuild(sourceConfig.bucket, targetFiles[i]) { + .objectBuild(sourceBucket, targetFiles[i]) { stream(statusFile, -1, MAX_PART_SIZE) - } + }, ) } } @@ -80,6 +136,8 @@ class RestructureS3IntegrationTest { val secondParticipantOutput = "output/radar-test-root/4ab9b985-6eec-4e51-9a29-f4c571c89f99/android_phone_acceleration" + val targetBucket = requireNotNull(targetConfig.bucket) + val files = coroutineScope { launch(Dispatchers.IO) { val csvContents = """ @@ -90,9 +148,9 @@ class RestructureS3IntegrationTest { """.trimIndent() val targetContent = targetClient.getObject( - GetObjectArgs.Builder().bucketBuild(targetConfig.bucket) { + GetObjectArgs.Builder().bucketBuild(targetBucket) { `object`("$firstParticipantOutput/20200128_1300.csv") - } + }, ).use { response -> response.readBytes() } @@ -102,11 +160,11 @@ class RestructureS3IntegrationTest { return@coroutineScope withContext(Dispatchers.IO) { targetClient.listObjects( - ListObjectsArgs.Builder().bucketBuild(targetConfig.bucket) { + ListObjectsArgs.Builder().bucketBuild(targetBucket) { prefix("output") recursive(true) useUrlEncodingType(false) - } + }, ) .mapTo(HashSet()) { it.get().objectName() } } @@ -129,14 +187,14 @@ class RestructureS3IntegrationTest { targetFiles.map { launch(Dispatchers.IO) { sourceClient.removeObject( - 
RemoveObjectArgs.Builder().objectBuild(sourceConfig.bucket, it) + RemoveObjectArgs.Builder().objectBuild(sourceBucket, it), ) } }.joinAll() launch(Dispatchers.IO) { sourceClient.removeBucket( - RemoveBucketArgs.Builder().bucketBuild(sourceConfig.bucket) + RemoveBucketArgs.Builder().bucketBuild(sourceBucket), ) } } @@ -146,15 +204,15 @@ class RestructureS3IntegrationTest { files.map { launch(Dispatchers.IO) { targetClient.removeObject( - RemoveObjectArgs.Builder().bucketBuild(targetConfig.bucket) { + RemoveObjectArgs.Builder().bucketBuild(targetBucket) { `object`(it) - } + }, ) } }.joinAll() launch(Dispatchers.IO) { targetClient.removeBucket( - RemoveBucketArgs.Builder().bucketBuild(targetConfig.bucket) + RemoveBucketArgs.Builder().bucketBuild(targetBucket), ) } } diff --git a/src/integrationTest/java/org/radarbase/output/accounting/OffsetRangeRedisTest.kt b/src/integrationTest/java/org/radarbase/output/accounting/OffsetRangeRedisTest.kt index 70615e9..5c26ded 100644 --- a/src/integrationTest/java/org/radarbase/output/accounting/OffsetRangeRedisTest.kt +++ b/src/integrationTest/java/org/radarbase/output/accounting/OffsetRangeRedisTest.kt @@ -1,9 +1,12 @@ package org.radarbase.output.accounting -import kotlinx.coroutines.ExperimentalCoroutinesApi import kotlinx.coroutines.test.runTest import org.junit.jupiter.api.AfterEach -import org.junit.jupiter.api.Assertions.* +import org.junit.jupiter.api.Assertions.assertEquals +import org.junit.jupiter.api.Assertions.assertFalse +import org.junit.jupiter.api.Assertions.assertNotNull +import org.junit.jupiter.api.Assertions.assertNull +import org.junit.jupiter.api.Assertions.assertTrue import org.junit.jupiter.api.BeforeEach import org.junit.jupiter.api.Test import org.radarbase.output.accounting.OffsetRedisPersistence.Companion.redisOffsetReader @@ -14,7 +17,6 @@ import java.nio.file.Path import java.nio.file.Paths import java.time.Instant -@OptIn(ExperimentalCoroutinesApi::class) class OffsetRangeRedisTest { private lateinit var testFile: Path private lateinit var redisHolder: RedisHolder @@ -83,7 +85,7 @@ class OffsetRangeRedisTest { redisHolder.execute { redis -> val range = redisOffsetReader.readValue( - redis.get(testFile.toString()) + redis.get(testFile.toString()), ) assertEquals( OffsetRedisPersistence.Companion.RedisOffsetRangeSet( @@ -98,7 +100,7 @@ class OffsetRangeRedisTest { ), ), ), - range + range, ) } diff --git a/src/integrationTest/java/org/radarbase/output/accounting/RedisRemoteLockManagerTest.kt b/src/integrationTest/java/org/radarbase/output/accounting/RedisRemoteLockManagerTest.kt index c820ae6..acd6cf7 100644 --- a/src/integrationTest/java/org/radarbase/output/accounting/RedisRemoteLockManagerTest.kt +++ b/src/integrationTest/java/org/radarbase/output/accounting/RedisRemoteLockManagerTest.kt @@ -1,6 +1,5 @@ package org.radarbase.output.accounting -import kotlinx.coroutines.ExperimentalCoroutinesApi import kotlinx.coroutines.test.runTest import org.hamcrest.MatcherAssert.assertThat import org.hamcrest.Matchers.not @@ -11,7 +10,6 @@ import org.junit.jupiter.api.Test import org.radarbase.output.util.SuspendedCloseable.Companion.useSuspended import redis.clients.jedis.JedisPool -@OptIn(ExperimentalCoroutinesApi::class) internal class RedisRemoteLockManagerTest { private lateinit var redisHolder: RedisHolder private lateinit var lockManager1: RemoteLockManager diff --git a/src/main/java/org/radarbase/output/Application.kt b/src/main/java/org/radarbase/output/Application.kt index 961e755..b6baddb 100644 --- 
a/src/main/java/org/radarbase/output/Application.kt
+++ b/src/main/java/org/radarbase/output/Application.kt
@@ -18,18 +18,26 @@ package org.radarbase.output
 
 import com.beust.jcommander.JCommander
 import com.beust.jcommander.ParameterException
-import kotlinx.coroutines.*
+import kotlinx.coroutines.launch
+import kotlinx.coroutines.runBlocking
 import kotlinx.coroutines.sync.Mutex
 import kotlinx.coroutines.sync.Semaphore
-import org.radarbase.output.accounting.*
+import org.radarbase.output.accounting.Accountant
+import org.radarbase.output.accounting.OffsetPersistenceFactory
+import org.radarbase.output.accounting.OffsetRedisPersistence
+import org.radarbase.output.accounting.RedisHolder
+import org.radarbase.output.accounting.RedisRemoteLockManager
+import org.radarbase.output.accounting.RemoteLockManager
 import org.radarbase.output.cleaner.SourceDataCleaner
 import org.radarbase.output.compression.Compression
 import org.radarbase.output.config.CommandLineArgs
 import org.radarbase.output.config.RestructureConfig
 import org.radarbase.output.format.RecordConverterFactory
 import org.radarbase.output.path.RecordPathFactory
+import org.radarbase.output.source.InMemoryStorageIndex
 import org.radarbase.output.source.SourceStorage
 import org.radarbase.output.source.SourceStorageFactory
+import org.radarbase.output.source.StorageIndexManager
 import org.radarbase.output.target.TargetStorage
 import org.radarbase.output.target.TargetStorageFactory
 import org.radarbase.output.util.Timer
@@ -39,7 +47,9 @@ import org.radarbase.output.worker.RadarKafkaRestructure
 import org.slf4j.LoggerFactory
 import redis.clients.jedis.JedisPool
 import java.io.IOException
+import java.nio.file.Path
 import java.text.NumberFormat
+import java.time.Duration
 import java.time.LocalDateTime
 import java.time.format.DateTimeFormatter
 import java.util.concurrent.atomic.LongAdder
@@ -54,12 +64,11 @@ class Application(
     override val config = config.apply { validate() }
     override val recordConverter: RecordConverterFactory = config.format.createConverter()
     override val compression: Compression = config.compression.createCompression()
-    override val pathFactory: RecordPathFactory = config.paths.createFactory().apply {
-        fileStoreFactory = this@Application
-        extension = recordConverter.extension + compression.extension
-        root = config.paths.output
-        addTopicConfiguration(config.topics)
-    }
+    override val pathFactory: RecordPathFactory = config.paths.createFactory(
+        config.target,
+        recordConverter.extension + compression.extension,
+        config.topics,
+    )
 
     private val sourceStorageFactory = SourceStorageFactory(config.source, config.paths.temp)
     override val sourceStorage: SourceStorage
@@ -79,9 +88,27 @@
     override val workerSemaphore = Semaphore(config.worker.numThreads * 2)
 
+    override val storageIndexManagers: Map<Path, StorageIndexManager>
+
     private val jobs: List<Job>
 
     init {
+        val indexConfig = config.source.index
+        val (fullScan, emptyScan) = if (indexConfig == null) {
+            listOf(3600L, 900L)
+        } else {
+            listOf(indexConfig.fullSyncInterval, indexConfig.emptyDirectorySyncInterval)
+        }.map { Duration.ofSeconds(it) }
+
+        storageIndexManagers = config.paths.inputs.associateWith { input ->
+            StorageIndexManager(
+                InMemoryStorageIndex(),
+                sourceStorage,
+                input,
+                fullScan,
+                emptyScan,
+            )
+        }
         val serviceMutex = Mutex()
         jobs = listOfNotNull(
             RadarKafkaRestructure.job(config, serviceMutex),
@@ -188,7 +215,7 @@
                     .apply {
                         addArgs(commandLineArgs)
                         validate()
-                    }
+                    },
             )
         } catch (ex: IllegalArgumentException) {
             logger.error("Illegal argument", ex)
diff --git a/src/main/java/org/radarbase/output/FileStoreFactory.kt b/src/main/java/org/radarbase/output/FileStoreFactory.kt
index d3d1601..9448ac3 100644
--- a/src/main/java/org/radarbase/output/FileStoreFactory.kt
+++ b/src/main/java/org/radarbase/output/FileStoreFactory.kt
@@ -26,9 +26,11 @@ import org.radarbase.output.config.RestructureConfig
 import org.radarbase.output.format.RecordConverterFactory
 import org.radarbase.output.path.RecordPathFactory
 import org.radarbase.output.source.SourceStorage
+import org.radarbase.output.source.StorageIndexManager
 import org.radarbase.output.target.TargetStorage
 import org.radarbase.output.worker.FileCacheStore
 import java.io.IOException
+import java.nio.file.Path
 
 /** Factory for all factory classes and settings. */
 interface FileStoreFactory {
@@ -42,6 +44,7 @@ interface FileStoreFactory {
     val redisHolder: RedisHolder
     val offsetPersistenceFactory: OffsetPersistenceFactory
     val workerSemaphore: Semaphore
+    val storageIndexManagers: Map<Path, StorageIndexManager>
 
     @Throws(IOException::class)
     fun newFileCacheStore(accountant: Accountant): FileCacheStore
diff --git a/src/main/java/org/radarbase/output/accounting/AccountantImpl.kt b/src/main/java/org/radarbase/output/accounting/AccountantImpl.kt
index b08d401..f59a951 100644
--- a/src/main/java/org/radarbase/output/accounting/AccountantImpl.kt
+++ b/src/main/java/org/radarbase/output/accounting/AccountantImpl.kt
@@ -47,7 +47,9 @@ open class AccountantImpl(
         return if (offsetsPath.exists()) {
             OffsetFilePersistence(targetStorage).read(offsetsPath)
                 .also { offsetsPath.deleteExisting() }
-        } else null
+        } else {
+            null
+        }
     }
 
     override suspend fun remove(range: TopicPartitionOffsetRange) =
diff --git a/src/main/java/org/radarbase/output/accounting/OffsetFilePersistence.kt b/src/main/java/org/radarbase/output/accounting/OffsetFilePersistence.kt
index 4efe738..f9d82d2 100644
--- a/src/main/java/org/radarbase/output/accounting/OffsetFilePersistence.kt
+++ b/src/main/java/org/radarbase/output/accounting/OffsetFilePersistence.kt
@@ -52,7 +52,9 @@ class OffsetFilePersistence(
                     }
                 }
             }
-        } else null
+        } else {
+            null
+        }
     } catch (ex: IOException) {
         logger.error("Error reading offsets file. Processing all offsets.", ex)
         null
@@ -76,7 +78,9 @@
         }
         val lastModified = if (cols.size >= 5) {
             Instant.parse(cols[4])
-        } else Instant.now()
+        } else {
+            Instant.now()
+        }
 
         return TopicPartitionOffsetRange(
             topic,
diff --git a/src/main/java/org/radarbase/output/accounting/OffsetRedisPersistence.kt b/src/main/java/org/radarbase/output/accounting/OffsetRedisPersistence.kt
index 51f2266..a07987b 100644
--- a/src/main/java/org/radarbase/output/accounting/OffsetRedisPersistence.kt
+++ b/src/main/java/org/radarbase/output/accounting/OffsetRedisPersistence.kt
@@ -79,7 +79,7 @@ class OffsetRedisPersistence(
                         topicPartition.partition,
                         offsetIntervals.toList(),
                     )
-                }
+                },
             )
 
             redisHolder.execute { redis ->
diff --git a/src/main/java/org/radarbase/output/accounting/TopicPartition.kt b/src/main/java/org/radarbase/output/accounting/TopicPartition.kt
index 0099abc..7b30966 100644
--- a/src/main/java/org/radarbase/output/accounting/TopicPartition.kt
+++ b/src/main/java/org/radarbase/output/accounting/TopicPartition.kt
@@ -28,6 +28,6 @@ data class TopicPartition(val topic: String, val partition: Int) : Comparable<TopicPartition>
diff --git a/src/main/java/org/radarbase/output/cleaner/SourceDataCleaner.kt b/src/main/java/org/radarbase/output/cleaner/SourceDataCleaner.kt
             if (deleteCount > 0) {
                 logger.info("Removed {} files in topic {}", deleteCount, p.fileName)
@@ -70,7 +72,7 @@
         }
     }
 
-    private suspend fun mapTopic(topicPath: Path): Long {
+    private suspend fun mapTopic(storageIndex: StorageIndex, topicPath: Path): Long {
         val topic = topicPath.fileName.toString()
         return try {
             lockManager.tryWithLock(topic) {
@@ -84,7 +86,7 @@
                         fileStoreFactory,
                     )
                 }
-                deleteOldFiles(accountant, extractionCheck, topic, topicPath).toLong()
+                deleteOldFiles(storageIndex, accountant, extractionCheck, topic, topicPath).toLong()
             }
         }
     }
@@ -95,6 +97,7 @@
     }
 
     private suspend fun deleteOldFiles(
+        storageIndex: StorageIndex,
        accountant: Accountant,
        extractionCheck: ExtractionCheck,
        topic: String,
@@ -102,7 +105,7 @@
     ): Int {
         val offsets = accountant.offsets.copyForTopic(topic)
 
-        val paths = sourceStorage.listTopicFiles(topic, topicPath, maxFilesPerTopic) { f ->
+        val paths = sourceStorage.listTopicFiles(storageIndex, topic, topicPath, maxFilesPerTopic) { f ->
             f.lastModified.isBefore(deleteThreshold) &&
                 // ensure that there is a file with a larger offset also
                 // processed, so the largest offset is never removed.
@@ -115,6 +118,7 @@
                 logger.info("Removing {}", file.path)
                 Timer.time("cleaner.delete") {
                     sourceStorage.delete(file.path)
+                    storageIndex.remove(StorageNode.StorageFile(file.path, Instant.MIN))
                 }
                 true
             } else {
@@ -127,8 +131,8 @@
         }
     }
 
-    private suspend fun topicPaths(path: Path): List<Path> =
-        sourceStorage.listTopics(path, excludeTopics)
+    private suspend fun topicPaths(storageIndex: StorageIndex, path: Path): List<Path> =
+        sourceStorage.listTopics(storageIndex, path, excludeTopics)
             // different services start on different topics to decrease lock contention
             .shuffled()
@@ -141,13 +145,16 @@
         fun job(config: RestructureConfig, serviceMutex: Mutex): Job? = if (config.cleaner.enable) {
            Job("cleaner", config.cleaner.interval, ::runCleaner, serviceMutex)
-        } else null
+        } else {
+            null
+        }
 
         private suspend fun runCleaner(factory: FileStoreFactory) {
             SourceDataCleaner(factory).useSuspended { cleaner ->
-                for (input in factory.config.paths.inputs) {
+                for ((input, indexManager) in factory.storageIndexManagers) {
+                    indexManager.update()
                     logger.info("Cleaning {}", input)
-                    cleaner.process(input.toString())
+                    cleaner.process(indexManager.storageIndex, input.toString())
                 }
                 logger.info("Cleaned up {} files", cleaner.deletedFileCount.format())
             }
diff --git a/src/main/java/org/radarbase/output/cleaner/TimestampExtractionCheck.kt b/src/main/java/org/radarbase/output/cleaner/TimestampExtractionCheck.kt
index a2df3bf..7e0c4d6 100644
--- a/src/main/java/org/radarbase/output/cleaner/TimestampExtractionCheck.kt
+++ b/src/main/java/org/radarbase/output/cleaner/TimestampExtractionCheck.kt
@@ -24,7 +24,7 @@ class TimestampExtractionCheck(
         val result = resourceContext {
             val input = createResource { reader.newInput(file) }
             // processing zero-length files may trigger a stall. See:
-            // https://github.com/RADAR-base/Restructure-HDFS-topic/issues/3
+            // https://github.com/RADAR-base/radar-output-restructure/issues/3
             if (input.length() == 0L) {
                 logger.warn("File {} has zero length, skipping.", file.path)
                 return@resourceContext false
@@ -64,7 +64,7 @@
         val path = pathFactory.getRecordPath(
             topicFile.topic,
             record,
-            suffix
+            suffix,
         )
 
         try {
diff --git a/src/main/java/org/radarbase/output/cleaner/TimestampFileCacheStore.kt b/src/main/java/org/radarbase/output/cleaner/TimestampFileCacheStore.kt
index 0e449ed..9674e01 100644
--- a/src/main/java/org/radarbase/output/cleaner/TimestampFileCacheStore.kt
+++ b/src/main/java/org/radarbase/output/cleaner/TimestampFileCacheStore.kt
@@ -95,6 +95,6 @@ class TimestampFileCacheStore(private val factory: FileStoreFactory) {
         FILE_NOT_FOUND,
         BAD_SCHEMA,
         NOT_FOUND,
-        FOUND
+        FOUND,
     }
 }
diff --git a/src/main/java/org/radarbase/output/config/AzureConfig.kt b/src/main/java/org/radarbase/output/config/AzureConfig.kt
index 1456e7a..5372de6 100644
--- a/src/main/java/org/radarbase/output/config/AzureConfig.kt
+++ b/src/main/java/org/radarbase/output/config/AzureConfig.kt
@@ -13,7 +13,7 @@ data class AzureConfig(
     /** URL to reach object store at. */
     val endpoint: String,
     /** Name of the Azure Blob Storage container. */
-    val container: String,
+    val container: String? = null,
     /** If no endOffset is in the filename, read it from object metadata. */
     val endOffsetFromMetadata: Boolean = false,
     /** Azure username. */
diff --git a/src/main/java/org/radarbase/output/config/BucketFormatterConfig.kt b/src/main/java/org/radarbase/output/config/BucketFormatterConfig.kt
new file mode 100644
index 0000000..dc65ec3
--- /dev/null
+++ b/src/main/java/org/radarbase/output/config/BucketFormatterConfig.kt
@@ -0,0 +1,21 @@
+package org.radarbase.output.config
+
+/** Configuration on how to format the target bucket name. */
+data class BucketFormatterConfig(
+    /** Format string. May include any variables computed by the configured plugins. */
+    val format: String = "radar-output-storage",
+    /**
+     * Space-separated list of plugins to use for formatting the format string. May include
+     * custom class names.
+     */
+    val plugins: String = "fixed time key value",
+    /** List of regexes to disable the formatted string for and use [defaultName] instead. */
+    val disabledFormats: List<String> = emptyList(),
+    /**
+     * Default name to use for the output storage if the output format is disabled via
+     * [disabledFormats].
+     */
+    val defaultName: String = "radar-output-storage",
+    /** Additional plugin properties. */
+    val properties: Map<String, String> = emptyMap(),
+)
diff --git a/src/main/java/org/radarbase/output/config/CommandLineArgs.kt b/src/main/java/org/radarbase/output/config/CommandLineArgs.kt
index 4269945..e56a76e 100644
--- a/src/main/java/org/radarbase/output/config/CommandLineArgs.kt
+++ b/src/main/java/org/radarbase/output/config/CommandLineArgs.kt
@@ -51,20 +51,13 @@
     )
     var compression: String? = null
 
-    // Default set to false because causes loss of records from Biovotion data. https://github.com/RADAR-base/Restructure-HDFS-topic/issues/16
+    // Default is set to false because deduplication causes loss of records from Biovotion data. https://github.com/RADAR-base/radar-output-restructure/issues/16
     @Parameter(
         names = ["-d", "--deduplicate"],
         description = "Boolean to define if to use deduplication or not.",
     )
     var deduplicate: Boolean? = null
 
-    @Parameter(
-        names = ["-n", "--nameservice"],
-        description = "The HDFS name services to connect to. Eg - '' for single configurations or for high availability web services.",
-        validateWith = [NonEmptyValidator::class],
-    )
-    var hdfsName: String? = null
-
     @Parameter(
         names = ["-o", "--output-directory"],
         description = "The output folder where the files are to be extracted.",
diff --git a/src/main/java/org/radarbase/output/config/Extensions.kt b/src/main/java/org/radarbase/output/config/Extensions.kt
index e75cd5b..e4ec008 100644
--- a/src/main/java/org/radarbase/output/config/Extensions.kt
+++ b/src/main/java/org/radarbase/output/config/Extensions.kt
@@ -2,10 +2,15 @@ package org.radarbase.output.config
 
 import org.radarbase.output.Plugin
 
-internal inline fun <reified T : Plugin> String.toPluginInstance(properties: Map<String, String>): T {
+internal inline fun <reified T : Plugin> String.toPluginInstance(
+    properties: Map<String, String>,
+): T = constructClass<T>().apply {
+    init(properties)
+}
+
+internal inline fun <reified T> String.constructClass(): T {
     return try {
         (Class.forName(this).getConstructor().newInstance() as T)
-            .also { it.init(properties) }
     } catch (ex: ReflectiveOperationException) {
         throw IllegalStateException("Cannot map class $this to ${T::class.java.name}")
     }
diff --git a/src/main/java/org/radarbase/output/config/FormatConfig.kt b/src/main/java/org/radarbase/output/config/FormatConfig.kt
index e73a3d7..6f049b4 100644
--- a/src/main/java/org/radarbase/output/config/FormatConfig.kt
+++ b/src/main/java/org/radarbase/output/config/FormatConfig.kt
@@ -14,6 +14,7 @@ data class FormatConfig(
         distinctFields = emptySet(),
         ignoreFields = emptySet(),
     ),
+    val excludeFields: Set<String> = emptySet(),
 ) : PluginConfig {
     fun createFactory(): FormatFactory = factory.toPluginInstance(properties)
     fun createConverter(): RecordConverterFactory = createFactory()[type]
diff --git a/src/main/java/org/radarbase/output/config/HdfsConfig.kt b/src/main/java/org/radarbase/output/config/HdfsConfig.kt
deleted file mode 100644
index d37f714..0000000
--- a/src/main/java/org/radarbase/output/config/HdfsConfig.kt
+++ /dev/null
@@ -1,12 +0,0 @@
-package org.radarbase.output.config
-
-data class HdfsConfig(
-    /** HDFS name nodes to use. */
-    val nameNodes: List<String> = emptyList(),
-    /** Additional HDFS configuration parameters. */
-    val properties: Map<String, String> = emptyMap(),
-) {
-    fun validate() {
-        check(nameNodes.isNotEmpty()) { "Cannot use HDFS without any name nodes." }
-    }
-}
diff --git a/src/main/java/org/radarbase/output/config/NonEmptyValidator.kt b/src/main/java/org/radarbase/output/config/NonEmptyValidator.kt
index 573f597..36b7796 100644
--- a/src/main/java/org/radarbase/output/config/NonEmptyValidator.kt
+++ b/src/main/java/org/radarbase/output/config/NonEmptyValidator.kt
@@ -24,7 +24,7 @@ class NonEmptyValidator : IParameterValidator {
         if (value.isNullOrEmpty()) {
             throw ParameterException(
                 "Parameter $name should be supplied. It cannot be empty or null. (found $value)." +
-                    " Please run with --help or -h for more information."
+                    " Please run with --help or -h for more information.",
             )
         }
     }
diff --git a/src/main/java/org/radarbase/output/config/PathConfig.kt b/src/main/java/org/radarbase/output/config/PathConfig.kt
index 04622ed..00078f3 100644
--- a/src/main/java/org/radarbase/output/config/PathConfig.kt
+++ b/src/main/java/org/radarbase/output/config/PathConfig.kt
@@ -16,8 +16,51 @@ data class PathConfig(
     val temp: Path = createTempDirectory("radar-output-restructure"),
     /** Output path on the target resource. */
     val output: Path = Paths.get("output"),
-    /** Output path on the target resource. */
-    val snapshots: Path = Paths.get("snapshots"),
+    /** Path formatting rules. */
+    val path: PathFormatterConfig = PathFormatterConfig(),
+    /**
+     * Bucket formatting rules for the target storage. If no configuration is provided, no
+     * bucket is formatted for local storage, and the configured target bucket (s3) or
+     * container (azure) is used as the default target bucket.
+     */
+    val bucket: BucketFormatterConfig? = null,
 ) : PluginConfig {
-    fun createFactory(): RecordPathFactory = factory.toPluginInstance(properties)
+    fun createFactory(
+        target: ResourceConfig,
+        extension: String,
+        topics: Map<String, TopicConfig>,
+    ): RecordPathFactory {
+        val pathFactory = factory.constructClass<RecordPathFactory>()
+
+        val bucketConfig = bucket
+            ?: when (target.sourceType) {
+                ResourceType.AZURE -> {
+                    val container = requireNotNull(target.azure?.container) { "Either target container or bucket formatter config needs to be configured." }
+                    BucketFormatterConfig(format = container, plugins = "", defaultName = container)
+                }
+                ResourceType.S3 -> {
+                    val bucket = requireNotNull(target.s3?.bucket) { "Either target bucket or bucket formatter config needs to be configured." }
+                    BucketFormatterConfig(format = bucket, plugins = "", defaultName = bucket)
+                }
+                else -> null
+            }
+
+        // Pass any properties from this PathConfig on to the PathFormatterConfig for the factory.
+        // Properties in PathConfig.properties take precedence over PathConfig.path.properties.
+        val pathProperties = buildMap {
+            putAll(path.properties)
+            putAll(properties)
+        }
+
+        val pathFormatterConfig = path.copy(properties = pathProperties)
+        val pathConfig = copy(bucket = bucketConfig, path = pathFormatterConfig)
+
+        pathFactory.init(
+            extension = extension,
+            config = pathConfig,
+            topics = topics,
+        )
+
+        return pathFactory
+    }
 }
diff --git a/src/main/java/org/radarbase/output/config/PathFormatterConfig.kt b/src/main/java/org/radarbase/output/config/PathFormatterConfig.kt
new file mode 100644
index 0000000..30d905d
--- /dev/null
+++ b/src/main/java/org/radarbase/output/config/PathFormatterConfig.kt
@@ -0,0 +1,34 @@
+package org.radarbase.output.config
+
+data class PathFormatterConfig(
+    /** Format string. May include any variables computed by the configured plugins. */
+    val format: String = DEFAULT_FORMAT,
+    /**
+     * Space-separated list of plugins to use for formatting the format string. May include custom
+     * class names.
diff --git a/src/main/java/org/radarbase/output/config/PathFormatterConfig.kt b/src/main/java/org/radarbase/output/config/PathFormatterConfig.kt new file mode 100644 index 0000000..30d905d --- /dev/null +++ b/src/main/java/org/radarbase/output/config/PathFormatterConfig.kt @@ -0,0 +1,34 @@ +package org.radarbase.output.config + +data class PathFormatterConfig( + /** Format string. May include any variables computed by the configured plugins. */ + val format: String = DEFAULT_FORMAT, + /** + * Space-separated list of plugins to use for formatting the format string. May include custom + * class names. + */ + val plugins: String = "fixed time key value", + /** Additional plugin properties. */ + val properties: Map<String, String> = mapOf(), +) { + /** + * Combine this config with given config. If no changes are made, just return the current + * object. + */ + fun copy(values: PathFormatterConfig): PathFormatterConfig { + val copy = PathFormatterConfig( + format = values.format, + plugins = values.plugins, + properties = buildMap(properties.size + values.properties.size) { + putAll(properties) + putAll(values.properties) + }, + ) + return if (this == copy) this else copy + } + + companion object { + /** Default path format. */ + const val DEFAULT_FORMAT = "\${projectId}/\${userId}/\${topic}/\${filename}" + } +} diff --git a/src/main/java/org/radarbase/output/config/ResourceConfig.kt b/src/main/java/org/radarbase/output/config/ResourceConfig.kt index 6d8c911..cb9781a 100644 --- a/src/main/java/org/radarbase/output/config/ResourceConfig.kt +++ b/src/main/java/org/radarbase/output/config/ResourceConfig.kt @@ -5,22 +5,21 @@ import org.radarbase.output.config.ResourceType.Companion.toResourceType import org.radarbase.output.config.RestructureConfig.Companion.copyOnChange data class ResourceConfig( - /** Resource type. One of s3, hdfs or local. */ + /** Resource type. One of s3, azure or local. */ val type: String, val s3: S3Config? = null, - val hdfs: HdfsConfig? = null, val local: LocalConfig? = null, val azure: AzureConfig? = null, + val index: StorageIndexConfig? = null, ) { @get:JsonIgnore val sourceType: ResourceType by lazy { - requireNotNull(type.toResourceType()) { "Unknown resource type $type, choose s3, hdfs or local" } + requireNotNull(type.toResourceType()) { "Unknown resource type $type, choose s3, azure or local" } } fun validate() { when (sourceType) { ResourceType.S3 -> checkNotNull(s3) { "No S3 configuration provided." } - ResourceType.HDFS -> checkNotNull(hdfs) { "No HDFS configuration provided." }.also { it.validate() } ResourceType.LOCAL -> checkNotNull(local) { "No local configuration provided." } ResourceType.AZURE -> checkNotNull(azure) { "No Azure configuration provided." } } @@ -28,7 +27,6 @@ data class ResourceConfig( fun withEnv(prefix: String): ResourceConfig = when (sourceType) { ResourceType.S3 -> copyOnChange(s3, { it?.withEnv(prefix) }) { copy(s3 = it) } - ResourceType.HDFS -> this ResourceType.LOCAL -> this ResourceType.AZURE -> copyOnChange(azure, { it?.withEnv(prefix) }) { copy(azure = it) } } diff --git a/src/main/java/org/radarbase/output/config/ResourceType.kt b/src/main/java/org/radarbase/output/config/ResourceType.kt index 185e4d6..209fa31 100644 --- a/src/main/java/org/radarbase/output/config/ResourceType.kt +++ b/src/main/java/org/radarbase/output/config/ResourceType.kt @@ -1,12 +1,11 @@ package org.radarbase.output.config enum class ResourceType { - S3, HDFS, LOCAL, AZURE; + S3, LOCAL, AZURE; companion object { fun String.toResourceType() = when (lowercase()) { "s3" -> S3 - "hdfs" -> HDFS "local" -> LOCAL "azure" -> AZURE else -> null diff --git a/src/main/java/org/radarbase/output/config/RestructureConfig.kt b/src/main/java/org/radarbase/output/config/RestructureConfig.kt index 9c8c0ff..122a42a 100644 --- a/src/main/java/org/radarbase/output/config/RestructureConfig.kt +++ b/src/main/java/org/radarbase/output/config/RestructureConfig.kt @@ -24,10 +24,7 @@ data class RestructureConfig( val compression: CompressionConfig = CompressionConfig(), /** File format to use for output files. 
*/ val format: FormatConfig = FormatConfig(), - /** Snapshot */ - val snapshot: SnapshotConfig = SnapshotConfig(), ) { - fun validate() { source.validate() target.validate() @@ -46,14 +43,6 @@ data class RestructureConfig( args.tmpDir?.let { copy(paths = paths.copy(temp = Paths.get(it))) } args.inputPaths?.let { inputs -> copy(paths = paths.copy(inputs = inputs.map { Paths.get(it) })) } args.outputDirectory?.let { copy(paths = paths.copy(output = Paths.get(it))) } - args.hdfsName?.let { - copy( - source = source.copy( - hdfs = source.hdfs?.copy(nameNodes = listOf(it)) - ?: HdfsConfig(nameNodes = listOf(it)) - ), - ) - } args.format?.let { copy(format = format.copy(type = it)) } args.deduplicate?.let { copy(format = format.copy(deduplication = format.deduplication.copy(enable = it))) @@ -96,7 +85,9 @@ data class RestructureConfig( val newValue = modification(original) return if (newValue != original) { doCopy(newValue) - } else this + } else { + this + } } } } diff --git a/src/main/java/org/radarbase/output/config/S3Config.kt b/src/main/java/org/radarbase/output/config/S3Config.kt index a827d8e..6e75982 100644 --- a/src/main/java/org/radarbase/output/config/S3Config.kt +++ b/src/main/java/org/radarbase/output/config/S3Config.kt @@ -14,7 +14,7 @@ data class S3Config( /** Secret key belonging to access token. */ val secretKey: String?, /** Bucket name. */ - val bucket: String, + val bucket: String? = null, /** If no endOffset is in the filename, read it from object tags. */ val endOffsetFromTags: Boolean = false, /** HTTP connect timeout. */ @@ -36,7 +36,7 @@ data class S3Config( connectTimeout.toMillisOrDefault(), writeTimeout.toMillisOrDefault(), readTimeout.toMillisOrDefault(), - ) + ), ) }.build() diff --git a/src/main/java/org/radarbase/output/config/SnapshotConfig.kt b/src/main/java/org/radarbase/output/config/SnapshotConfig.kt deleted file mode 100644 index f4b7356..0000000 --- a/src/main/java/org/radarbase/output/config/SnapshotConfig.kt +++ /dev/null @@ -1,11 +0,0 @@ -package org.radarbase.output.config - -import java.time.Duration - -data class SnapshotConfig( - val enable: Boolean = false, - val frequency: Duration = Duration.ofDays(31), - val numberOfSnapshots: Int = 12, - val sourceFormat: String = "\${projectId}", - val targetFormat: String = "\${projectId}", -) diff --git a/src/main/java/org/radarbase/output/config/StorageIndexConfig.kt b/src/main/java/org/radarbase/output/config/StorageIndexConfig.kt new file mode 100644 index 0000000..05fda27 --- /dev/null +++ b/src/main/java/org/radarbase/output/config/StorageIndexConfig.kt @@ -0,0 +1,12 @@ +package org.radarbase.output.config + +data class StorageIndexConfig( + /** How often to fully sync the storage index, in seconds. */ + val fullSyncInterval: Long = 3600L, + /** + * How often to sync empty directories with the storage index, in seconds. + * If this is very large, empty directories will only be scanned during + * full sync. + */ + val emptyDirectorySyncInterval: Long = 900L, +) diff --git a/src/main/java/org/radarbase/output/config/TopicConfig.kt b/src/main/java/org/radarbase/output/config/TopicConfig.kt index de8981f..4b2b1f1 100644 --- a/src/main/java/org/radarbase/output/config/TopicConfig.kt +++ b/src/main/java/org/radarbase/output/config/TopicConfig.kt @@ -14,7 +14,11 @@ data class TopicConfig( * Specify alternative path format, following * [org.radarbase.output.path.FormattedPathFactory] format. 
*/ - val pathProperties: Map<String, String> = emptyMap(), + val pathProperties: PathFormatterConfig, + /** + * Exclude given fields from output files. + */ + val excludeFields: Set<String>? = null, ) { fun deduplication(deduplicationDefault: DeduplicationConfig): DeduplicationConfig = deduplication diff --git a/src/main/java/org/radarbase/output/format/CsvAvroConverter.kt b/src/main/java/org/radarbase/output/format/CsvAvroConverter.kt index cc61c1f..f2f7dd4 100644 --- a/src/main/java/org/radarbase/output/format/CsvAvroConverter.kt +++ b/src/main/java/org/radarbase/output/format/CsvAvroConverter.kt @@ -33,22 +33,37 @@ class CsvAvroConverter( writeHeader: Boolean, reader: Reader, recordHeader: Array<String>, + excludeFields: Set<String>, ) : RecordConverter { private val csvWriter = CSVWriter(writer) private val converter: CsvAvroDataConverter init { - converter = if (writeHeader) { - csvWriter.writeNext(recordHeader, false) - CsvAvroDataConverter(recordHeader) - } else { - CsvAvroDataConverter( - CSVReader(reader).use { + val (header, excludedFromHeader) = when { + !writeHeader -> { + val readHeader = CSVReader(reader).use { requireNotNull(it.readNext()) { "No header found" } } - ) + Pair( + readHeader, + excludeFields - readHeader.toHashSet(), + ) + } + excludeFields.isEmpty() -> Pair(recordHeader, excludeFields) + else -> { + val excludedHeaderSet = recordHeader.toHashSet() + Pair( + recordHeader.filter { it !in excludeFields }.toTypedArray(), + excludeFields.filterTo(HashSet()) { it in excludedHeaderSet }, + ) + } } + if (writeHeader) { + csvWriter.writeNext(header, false) + } + + converter = CsvAvroDataConverter(header, excludedFromHeader) } /** @@ -60,8 +75,7 @@ class CsvAvroConverter( @Throws(IOException::class) override fun writeRecord(record: GenericRecord): Boolean { return try { - val retValues = converter.convertRecordValues(record) - csvWriter.writeNext(retValues.toTypedArray(), false) + csvWriter.writeNext(converter.convertRecordValues(record), false) true } catch (ex: IllegalArgumentException) { false diff --git a/src/main/java/org/radarbase/output/format/CsvAvroConverterFactory.kt b/src/main/java/org/radarbase/output/format/CsvAvroConverterFactory.kt index 112af74..4536b28 100644 --- a/src/main/java/org/radarbase/output/format/CsvAvroConverterFactory.kt +++ b/src/main/java/org/radarbase/output/format/CsvAvroConverterFactory.kt @@ -75,7 +75,9 @@ class CsvAvroConverterFactory : RecordConverterFactory { if (indexIndex < lineIndexes.size && lineIndexes[indexIndex] == i) { indexIndex += 1 true - } else false + } else { + false + } }, ) } @@ -127,7 +129,9 @@ class CsvAvroConverterFactory : RecordConverterFactory { Pair( header, - if (parsers.isEmpty()) emptyList() else { + if (parsers.isEmpty()) { + emptyList() + } else { lines.mapNotNull { line -> for ((index, parser) in parsers) { parser(line[index])?.let { @@ -150,8 +154,8 @@ class CsvAvroConverterFactory : RecordConverterFactory { ): Boolean = source.inputStream().use { input -> processLines(input, compression) { header, lines -> checkNotNull(header) { "Empty file found" } - val converter = CsvAvroDataConverter(header) - val recordValues = converter.convertRecordValues(record).toTypedArray() + val converter = CsvAvroDataConverter(header, emptySet()) + val recordValues = converter.convertRecordValues(record) val indexes = fieldIndexes(header, usingFields, ignoreFields) if (indexes == null) { @@ -168,7 +172,8 @@ class CsvAvroConverterFactory : RecordConverterFactory { record: GenericRecord, writeHeader: Boolean, reader: Reader, - ): CsvAvroConverter = CsvAvroConverter(writer, writeHeader, reader, headerFor(record)) + excludeFields: Set<String>, + ): CsvAvroConverter = CsvAvroConverter(writer, writeHeader, reader, headerFor(record), excludeFields) override val hasHeader: Boolean = true @@ -189,7 +194,9 @@ class CsvAvroConverterFactory : RecordConverterFactory { val header = csvReader.readNext() val lines = if (header != null) { generateSequence { csvReader.readNext() } - } else emptySequence() + } else { + emptySequence() + } process(header, lines) } @@ -216,7 +223,9 @@ class CsvAvroConverterFactory : RecordConverterFactory { @Throws(IndexOutOfBoundsException::class) inline fun <reified T> Array<T>.byIndex( indexes: IntArray?, - ): Array<T> = if (indexes == null) this else { + ): Array<T> = if (indexes == null) { + this + } else { Array(indexes.size) { i -> this[indexes[i]] } } }
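The header bookkeeping above works on flattened Avro field names, so exclusions are matched against dotted paths rather than top-level names. A small illustration with invented field names:

    // Mirrors the `else` branch above: drop excluded columns from the header,
    // and keep only exclusions that actually occur in the record header.
    val recordHeader = arrayOf("key.projectId", "key.userId", "value.time", "value.x")
    val excludeFields = setOf("value.x", "value.notPresent")
    val header = recordHeader.filter { it !in excludeFields }.toTypedArray()
    val excludedFromHeader = excludeFields.filter { it in recordHeader } // ["value.x"]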
diff --git a/src/main/java/org/radarbase/output/format/CsvAvroDataConverter.kt b/src/main/java/org/radarbase/output/format/CsvAvroDataConverter.kt index c9e87e8..1e51a66 100644 --- a/src/main/java/org/radarbase/output/format/CsvAvroDataConverter.kt +++ b/src/main/java/org/radarbase/output/format/CsvAvroDataConverter.kt @@ -9,93 +9,85 @@ import java.util.* internal class CsvAvroDataConverter( private val headers: Array<String>, + private val excludeFields: Set<String>, ) { - private val values: MutableList<String> = ArrayList(this.headers.size) - - fun convertRecord(record: GenericRecord): Map<String, String> { - values.clear() - val schema = record.schema - for (field in schema.fields) { - convertAvro(values, record.get(field.pos()), field.schema(), field.name()) + fun convertRecord(record: GenericRecord): Map<String, String> = buildMap { + convertRecordValues(record).forEachIndexed { i, value -> + put(headers[i], value) } - val map = LinkedHashMap<String, String>() - for (i in headers.indices) { - map[headers[i]] = values[i] - } - values.clear() - return map } - fun convertRecordValues(record: GenericRecord): List<String> { - values.clear() + fun convertRecordValues(record: GenericRecord): Array<String> { + val values = arrayOfNulls<String>(headers.size) val schema = record.schema - for (field in schema.fields) { - convertAvro(values, record.get(field.pos()), field.schema(), field.name()) + val endIndex = schema.fields.fold(0) { valueIndex, field -> + convertAvro(values, valueIndex, record.get(field.pos()), field.schema(), field.name()) } - require(values.size >= headers.size) { "Values and headers do not match" } - return values + require(endIndex >= headers.size) { "Values and headers do not match" } + @Suppress("UNCHECKED_CAST") + return values as Array<String> } private fun convertAvro( - values: MutableList<String>, + values: Array<String?>, + startIndex: Int, data: Any?, schema: Schema, prefix: String, - ) { - when (schema.type) { - Schema.Type.RECORD -> { - val record = data as GenericRecord - val subSchema = record.schema - for (field in subSchema.fields) { - val subData = record.get(field.pos()) - convertAvro( - values, - subData, - field.schema(), - prefix + '.'.toString() + field.name(), - ) - } - } - Schema.Type.MAP -> { - val valueType = schema.valueType - for ((key, value) in data as Map<*, *>) { - val name = "$prefix.$key" - convertAvro(values, value, valueType, name) - } - } - Schema.Type.ARRAY -> { - val itemType = schema.elementType - for ((i, orig) in (data as List<*>).withIndex()) { - convertAvro(values, orig, itemType, "$prefix.$i") - } - } - Schema.Type.UNION -> { - val type = GenericData().resolveUnion(schema, data) - convertAvro(values, data, schema.types[type], prefix) - } - Schema.Type.BYTES -> { - checkHeader(prefix, values.size) - values.add(BASE64_ENCODER.encodeToString((data as 
ByteBuffer).array())) - } - Schema.Type.FIXED -> { - checkHeader(prefix, values.size) - values.add(BASE64_ENCODER.encodeToString((data as GenericFixed).bytes())) + ): Int = when (schema.type) { + Schema.Type.RECORD -> { + val record = data as GenericRecord + val subSchema = record.schema + subSchema.fields.fold(startIndex) { index, field -> + val subData = record.get(field.pos()) + convertAvro( + values, + index, + subData, + field.schema(), + "$prefix.${field.name()}", + ) } - Schema.Type.STRING, Schema.Type.ENUM, Schema.Type.INT, Schema.Type.LONG, - Schema.Type.DOUBLE, Schema.Type.FLOAT, Schema.Type.BOOLEAN -> { - checkHeader(prefix, values.size) - values.add(data.toString()) + } + Schema.Type.MAP -> { + val valueType = schema.valueType + (data as Map<*, *>).entries.fold(startIndex) { index, (key, value) -> + convertAvro(values, index, value, valueType, "$prefix.$key") } - Schema.Type.NULL -> { - checkHeader(prefix, values.size) - values.add("") + } + Schema.Type.ARRAY -> { + val itemType = schema.elementType + (data as List<*>).foldIndexed(startIndex) { i, index, orig -> + convertAvro(values, index, orig, itemType, "$prefix.$i") } - else -> throw IllegalArgumentException("Cannot parse field type " + schema.type) } + Schema.Type.UNION -> { + val type = GenericData().resolveUnion(schema, data) + convertAvro(values, startIndex, data, schema.types[type], prefix) + } + Schema.Type.BYTES -> { + addValue(prefix, values, startIndex, BASE64_ENCODER.encodeToString((data as ByteBuffer).array())) + } + Schema.Type.FIXED -> { + addValue(prefix, values, startIndex, BASE64_ENCODER.encodeToString((data as GenericFixed).bytes())) + } + Schema.Type.STRING, Schema.Type.ENUM, Schema.Type.INT, Schema.Type.LONG, + Schema.Type.DOUBLE, Schema.Type.FLOAT, Schema.Type.BOOLEAN, + -> { + addValue(prefix, values, startIndex, data.toString()) + } + Schema.Type.NULL -> { + addValue(prefix, values, startIndex, "") + } + else -> throw IllegalArgumentException("Cannot parse field type " + schema.type) } - private fun checkHeader(prefix: String, size: Int) { - require(prefix == headers[size]) { "Header $prefix does not match ${headers[size]}" } + private fun addValue(prefix: String, values: Array<String?>, index: Int, value: String): Int { + if (prefix in excludeFields) return index + val header = headers[index] + require(prefix == header) { "Header $prefix does not match $header" } + values[index] = value + return index + 1 } companion object { diff --git a/src/main/java/org/radarbase/output/format/Format.kt b/src/main/java/org/radarbase/output/format/Format.kt index 0eaebfa..1a248ee 100644 --- a/src/main/java/org/radarbase/output/format/Format.kt +++ b/src/main/java/org/radarbase/output/format/Format.kt @@ -21,7 +21,6 @@ interface Format { val extension: String - open fun matchesFilename(name: String): Boolean { - return name.lowercase().endsWith(extension.lowercase()) - } + fun matchesFilename(name: String): Boolean = + name.endsWith(extension, ignoreCase = true) } diff --git a/src/main/java/org/radarbase/output/format/JsonAvroConverter.kt b/src/main/java/org/radarbase/output/format/JsonAvroConverter.kt index 830e0e9..efb724f 100644 --- a/src/main/java/org/radarbase/output/format/JsonAvroConverter.kt +++ b/src/main/java/org/radarbase/output/format/JsonAvroConverter.kt @@ -29,8 +29,9 @@ import java.io.Writer */ class JsonAvroConverter( writer: Writer, - private val converter: JsonAvroDataConverter, + excludeFields: Set<String>, ) : RecordConverter { + private val converter = JsonAvroDataConverter(excludeFields) private val generator: JsonGenerator = JSON_FACTORY.createGenerator(writer) .setPrettyPrinter(MinimalPrettyPrinter("\n"))
diff --git a/src/main/java/org/radarbase/output/format/JsonAvroConverterFactory.kt b/src/main/java/org/radarbase/output/format/JsonAvroConverterFactory.kt index 963a3c8..02226b9 100644 --- a/src/main/java/org/radarbase/output/format/JsonAvroConverterFactory.kt +++ b/src/main/java/org/radarbase/output/format/JsonAvroConverterFactory.kt @@ -6,7 +6,11 @@ import org.radarbase.output.format.JsonAvroConverter.Companion.JSON_READER import org.radarbase.output.format.JsonAvroConverter.Companion.JSON_WRITER import org.radarbase.output.util.ResourceContext.Companion.resourceContext import org.radarbase.output.util.TimeUtil.getDate -import java.io.* +import java.io.BufferedReader +import java.io.IOException +import java.io.InputStream +import java.io.Reader +import java.io.Writer import java.nio.file.Path import kotlin.io.path.inputStream @@ -15,7 +19,7 @@ class JsonAvroConverterFactory : RecordConverterFactory { override val formats: Collection<String> = setOf("json") - private val converter = JsonAvroDataConverter() + private val converter = JsonAvroDataConverter(setOf()) @Throws(IOException::class) override fun converterFor( @@ -23,7 +27,8 @@ class JsonAvroConverterFactory : RecordConverterFactory { record: GenericRecord, writeHeader: Boolean, reader: Reader, - ): RecordConverter = JsonAvroConverter(writer, converter) + excludeFields: Set<String>, + ): RecordConverter = JsonAvroConverter(writer, excludeFields) override suspend fun readTimeSeconds( source: InputStream, diff --git a/src/main/java/org/radarbase/output/format/JsonAvroDataConverter.kt b/src/main/java/org/radarbase/output/format/JsonAvroDataConverter.kt index 8f49268..23b0c32 100644 --- a/src/main/java/org/radarbase/output/format/JsonAvroDataConverter.kt +++ b/src/main/java/org/radarbase/output/format/JsonAvroDataConverter.kt @@ -5,46 +5,62 @@ import org.apache.avro.generic.GenericData import org.apache.avro.generic.GenericFixed import org.apache.avro.generic.GenericRecord import java.nio.ByteBuffer +import java.util.EnumSet -class JsonAvroDataConverter { - fun convertRecord(record: GenericRecord): Map<String, Any?> { - val map = HashMap<String, Any?>() +class JsonAvroDataConverter( + private val excludeFields: Set<String>, +) { + fun convertRecord(record: GenericRecord, prefix: String? = null): Map<String, Any?> { val schema = record.schema - for (field in schema.fields) { - map[field.name()] = convertAvro(record.get(field.pos()), field.schema()) + return buildMap { + for (field in schema.fields) { + val fieldPrefix = if (prefix == null) field.name() else "$prefix.${field.name()}" + convertAvro(record.get(field.pos()), field.schema(), fieldPrefix) + .ifNotExcluded { put(field.name(), it) } + } } - return map } - private fun convertAvro(data: Any?, schema: Schema): Any? { + private fun convertAvro(data: Any?, schema: Schema, prefix: String): Any? 
{ + if (schema.type !in compositeTypes && prefix in excludeFields) return EXCLUDE_FIELD + return when (schema.type) { + Schema.Type.RECORD -> convertRecord(data as GenericRecord) Schema.Type.MAP -> { - val value = HashMap<String, Any?>() val valueType = schema.valueType - for ((key, value1) in data as Map<*, *>) { - value[key.toString()] = convertAvro(value1, valueType) + buildMap { + for ((key, value1) in data as Map<*, *>) { + convertAvro(value1, valueType, "$prefix.$key") + .ifNotExcluded { put(key.toString(), it) } + } } - return value } Schema.Type.ARRAY -> { - val origList = data as List<*> val itemType = schema.elementType - val list = ArrayList<Any?>(origList.size) - for (orig in origList) { - list.add(convertAvro(orig, itemType)) + buildList { + (data as List<*>).forEachIndexed { i, orig -> + convertAvro(orig, itemType, "$prefix.$i") + .ifNotExcluded { add(it) } + } } - return list } Schema.Type.UNION -> { - val type = GenericData().resolveUnion(schema, data) - return convertAvro(data, schema.types[type]) + val typeIndex = GenericData().resolveUnion(schema, data) + convertAvro(data, schema.types[typeIndex], prefix) } - Schema.Type.BYTES -> return (data as ByteBuffer).array() - Schema.Type.FIXED -> return (data as GenericFixed).bytes() - Schema.Type.ENUM, Schema.Type.STRING -> return data.toString() - Schema.Type.INT, Schema.Type.LONG, Schema.Type.DOUBLE, Schema.Type.FLOAT, Schema.Type.BOOLEAN, Schema.Type.NULL -> return data + Schema.Type.BYTES -> (data as ByteBuffer).array() + Schema.Type.FIXED -> (data as GenericFixed).bytes() + Schema.Type.ENUM, Schema.Type.STRING -> data.toString() + Schema.Type.INT, Schema.Type.LONG, Schema.Type.DOUBLE, Schema.Type.FLOAT, Schema.Type.BOOLEAN, Schema.Type.NULL -> data else -> throw IllegalArgumentException("Cannot parse field type " + schema.type) } } + + companion object { + private val compositeTypes = EnumSet.of(Schema.Type.RECORD, Schema.Type.MAP, Schema.Type.ARRAY, Schema.Type.UNION) + private val EXCLUDE_FIELD = Any() + + private fun Any?.ifNotExcluded(apply: (Any?) -> Unit) { + if (this !== EXCLUDE_FIELD) apply(this) + } + } }
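A note on the EXCLUDE_FIELD sentinel introduced above: null cannot be used to signal an excluded field, because null is itself a legal Avro value that must still be emitted. A reduced sketch of the pattern:

    // A private sentinel distinguishes "field excluded" from "field is null".
    val EXCLUDE_FIELD = Any()
    fun Any?.ifNotExcluded(action: (Any?) -> Unit) {
        if (this !== EXCLUDE_FIELD) action(this)
    }
    null.ifNotExcluded { println("kept: $it") } // runs; null is a real value
    EXCLUDE_FIELD.ifNotExcluded { println(it) } // skipped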
diff --git a/src/main/java/org/radarbase/output/format/RecordConverterFactory.kt b/src/main/java/org/radarbase/output/format/RecordConverterFactory.kt index 53d2771..f0c70a9 100644 --- a/src/main/java/org/radarbase/output/format/RecordConverterFactory.kt +++ b/src/main/java/org/radarbase/output/format/RecordConverterFactory.kt @@ -21,7 +21,11 @@ import org.apache.avro.generic.GenericData import org.apache.avro.generic.GenericRecord import org.radarbase.output.compression.Compression import org.radarbase.output.util.ResourceContext.Companion.resourceContext -import java.io.* +import java.io.BufferedReader +import java.io.IOException +import java.io.InputStream +import java.io.Reader +import java.io.Writer import java.nio.file.Path import java.util.regex.Pattern import kotlin.collections.component1 @@ -45,6 +49,7 @@ interface RecordConverterFactory : Format { record: GenericRecord, writeHeader: Boolean, reader: Reader, + excludeFields: Set<String> = emptySet(), ): RecordConverter val hasHeader: Boolean @@ -151,7 +156,8 @@ interface RecordConverterFactory : Format { } Schema.Type.BYTES, Schema.Type.FIXED, Schema.Type.ENUM, Schema.Type.STRING, Schema.Type.INT, Schema.Type.LONG, Schema.Type.DOUBLE, Schema.Type.FLOAT, - Schema.Type.BOOLEAN, Schema.Type.NULL -> + Schema.Type.BOOLEAN, Schema.Type.NULL, + -> headers.add(prefix) else -> throw IllegalArgumentException("Cannot parse field type " + schema.type) } @@ -166,7 +172,9 @@ interface RecordConverterFactory : Format { fun readFile(reader: BufferedReader, withHeader: Boolean): Pair<String?, Set<String>> { val header = if (withHeader) { reader.readLine() ?: return Pair(null, emptySet()) - } else null + } else { + null + } return Pair(header, reader.lineSequence().toCollection(LinkedHashSet())) } diff --git a/src/main/java/org/radarbase/output/path/FixedPathFormatterPlugin.kt b/src/main/java/org/radarbase/output/path/FixedPathFormatterPlugin.kt index ad617fe..d8db7a9 100644 --- a/src/main/java/org/radarbase/output/path/FixedPathFormatterPlugin.kt +++ b/src/main/java/org/radarbase/output/path/FixedPathFormatterPlugin.kt @@ -7,38 +7,17 @@ import java.time.format.DateTimeFormatter class FixedPathFormatterPlugin : PathFormatterPlugin.Factory { override fun create( - properties: Map<String, String> + properties: Map<String, String>, ): PathFormatterPlugin = Plugin(properties) internal class Plugin(properties: Map<String, String>) : PathFormatterPlugin() { - private val timeBinFormat: DateTimeFormatter + private val extension: String = properties["extension"] ?: "" + private val timeBinFormat: DateTimeFormatter = createTimeBinFormatter(properties["timeBinFormat"]) override val name: String = "fixed" override val allowedFormats: String = allowedParamNames.joinToString(separator = ", ") - init { - timeBinFormat = createTimeBinFormatter(properties["timeBinFormat"]) - } - - private fun createTimeBinFormatter(pattern: String?): DateTimeFormatter { - pattern ?: return HOURLY_TIME_BIN_FORMAT - - return try { - DateTimeFormatter - .ofPattern(pattern) - .withZone(ZoneOffset.UTC) - } catch (ex: IllegalArgumentException) { - logger.error( - "Cannot use time bin format {}, using {} instead", - pattern, - HOURLY_TIME_BIN_FORMAT, - ex, - ) - HOURLY_TIME_BIN_FORMAT - } - } - - override fun lookup(parameterContents: String): PathFormatParameters.() -> String = + override fun lookup(parameterContents: String): suspend PathFormatParameters.() -> String = when (parameterContents) { "projectId" -> ({ sanitizeId(key.get("projectId"), "unknown-project") }) "userId" -> ({ 
sanitizeId(key.get("userId"), "unknown-user") }) @@ -48,7 +27,9 @@ class FixedPathFormatterPlugin : PathFormatterPlugin.Factory { { val timeBin = if (time != null) { timeBinFormat.format(time) - } else "unknown-time" + } else { + "unknown-time" + } timeBin + attempt.toAttemptSuffix() + extension } } @@ -71,10 +52,28 @@ class FixedPathFormatterPlugin : PathFormatterPlugin.Factory { "extension", ) - val HOURLY_TIME_BIN_FORMAT: DateTimeFormatter = DateTimeFormatter + private val HOURLY_TIME_BIN_FORMAT: DateTimeFormatter = DateTimeFormatter .ofPattern("yyyyMMdd_HH'00'") .withZone(ZoneOffset.UTC) + private fun createTimeBinFormatter(pattern: String?): DateTimeFormatter { + pattern ?: return HOURLY_TIME_BIN_FORMAT + + return try { + DateTimeFormatter + .ofPattern(pattern) + .withZone(ZoneOffset.UTC) + } catch (ex: IllegalArgumentException) { + logger.error( + "Cannot use time bin format {}, using {} instead", + pattern, + HOURLY_TIME_BIN_FORMAT, + ex, + ) + HOURLY_TIME_BIN_FORMAT + } + } + private fun Int.toAttemptSuffix() = if (this == 0) "" else "_$this" private val logger = LoggerFactory.getLogger(Plugin::class.java) diff --git a/src/main/java/org/radarbase/output/path/FormattedPathFactory.kt b/src/main/java/org/radarbase/output/path/FormattedPathFactory.kt index 1b3d092..9fc7fbe 100644 --- a/src/main/java/org/radarbase/output/path/FormattedPathFactory.kt +++ b/src/main/java/org/radarbase/output/path/FormattedPathFactory.kt @@ -16,107 +16,79 @@ package org.radarbase.output.path -import org.apache.avro.generic.GenericRecord +import org.radarbase.output.config.BucketFormatterConfig +import org.radarbase.output.config.PathConfig +import org.radarbase.output.config.PathFormatterConfig import org.radarbase.output.config.TopicConfig import org.slf4j.LoggerFactory -import java.nio.file.Path -import java.time.Instant -import kotlin.reflect.jvm.jvmName open class FormattedPathFactory : RecordPathFactory() { - private lateinit var formatter: PathFormatter - private lateinit var config: PathFormatterConfig + private lateinit var pathFormatter: PathFormatter private var topicFormatters: Map = emptyMap() + private var bucketFormatter: PathFormatter? = null + private lateinit var disabledBucketRegexes: List + private lateinit var defaultBucketName: String - override fun init(properties: Map) { - super.init(properties) + override fun init( + extension: String, + config: PathConfig, + topics: Map, + ) { + super.init(extension, config, topics) + pathFormatter = pathConfig.path.toPathFormatter() + bucketFormatter = pathConfig.bucket?.toBucketFormatter() + disabledBucketRegexes = pathConfig.bucket + ?.disabledFormats + ?.map { it.toRegex(RegexOption.IGNORE_CASE) } + ?: emptyList() + defaultBucketName = pathConfig.bucket + ?.defaultName + ?: "radar-output-storage" - this.config = DEFAULTS.withValues(properties) - formatter = config.toPathFormatter() - logger.info("Formatting path with {}", formatter) + logger.info("Formatting path with {}", pathFormatter) + } + + override suspend fun bucket(pathParameters: PathFormatParameters?): String? 
{ + val formatter = bucketFormatter ?: return null + pathParameters ?: return pathConfig.bucket?.defaultName + val format = formatter.format(pathParameters) + return if (disabledBucketRegexes.any { it.matches(format) }) { + defaultBucketName + } else { + format + } } override fun addTopicConfiguration(topicConfig: Map<String, TopicConfig>) { - topicFormatters = topicConfig - .filter { (_, config) -> config.pathProperties.isNotEmpty() } - .mapValues { (_, config) -> - this.config.withValues(config.pathProperties) - .toPathFormatter() - } - .onEach { (topic, formatter) -> - logger.info("Formatting path of topic {} with {}", topic, formatter) + topicFormatters = buildMap { + topicConfig.forEach { (topic, config) -> + val topicFormatConfig = pathConfig.path.copy(config.pathProperties) + if (topicFormatConfig != pathConfig.path) { + val formatter = topicFormatConfig.toPathFormatter() + logger.info("Formatting path of topic {} with {}", topic, formatter) + put(topic, formatter) + } } + } } - override fun getRelativePath( - topic: String, - key: GenericRecord, - value: GenericRecord, - time: Instant?, - attempt: Int, - ): Path = (topicFormatters[topic] ?: formatter) - .format(PathFormatParameters(topic, key, value, time, attempt, extension)) + override suspend fun relativePath( + pathParameters: PathFormatParameters, + ): String = (topicFormatters[pathParameters.topic] ?: pathFormatter) + .format(pathParameters) companion object { - private fun PathFormatterConfig.toPathFormatter(): PathFormatter { - return PathFormatter(format, createPlugins()) - } - - internal val DEFAULTS = PathFormatterConfig( - format = "\${projectId}/\${userId}/\${topic}/\${filename}", - pluginNames = "fixed time key value", + private fun PathFormatterConfig.toPathFormatter(): PathFormatter = PathFormatter( + format, + plugins.toPathFormatterPlugins(properties), ) - private val logger = LoggerFactory.getLogger(FormattedPathFactory::class.java) - internal fun String.toPathFormatterPlugin( - properties: Map<String, String>, - ): PathFormatterPlugin? = when (this) { - "fixed" -> FixedPathFormatterPlugin().create(properties) - "time" -> TimePathFormatterPlugin() - "key" -> KeyPathFormatterPlugin() - "value" -> ValuePathFormatterPlugin() - else -> { - try { - val clazz = Class.forName(this) - when (val plugin = clazz.getConstructor().newInstance()) { - is PathFormatterPlugin -> plugin - is PathFormatterPlugin.Factory -> plugin.create(properties) - else -> { - logger.error( - "Failed to instantiate plugin {}, it does not extend {} or {}", - this, - PathFormatterPlugin::class.jvmName, - PathFormatterPlugin.Factory::class.jvmName - ) - null - } - } - } catch (ex: ReflectiveOperationException) { - logger.error("Failed to instantiate plugin {}", this) - null - } - } - } - - data class PathFormatterConfig( - val format: String, - val pluginNames: String, - val properties: Map<String, String> = mapOf(), - ) { - fun createPlugins(): List<PathFormatterPlugin> = pluginNames - .trim() - .split("\\s+".toRegex()) - .mapNotNull { it.toPathFormatterPlugin(properties) } - - fun withValues(values: Map<String, String>): PathFormatterConfig { - val newProperties = HashMap(properties).apply { - putAll(values) - } - val format = newProperties.remove("format") ?: this.format - val pluginNames = newProperties.remove("plugins") ?: this.pluginNames + private fun BucketFormatterConfig.toBucketFormatter(): PathFormatter = PathFormatter( + format, + plugins.toPathFormatterPlugins(properties), + checkMinimalDistinction = false, + ) - return PathFormatterConfig(format, pluginNames, newProperties) - } - } + private val logger = LoggerFactory.getLogger(FormattedPathFactory::class.java) } }
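With the factory in place, a deployment-specific layout is now expressed through PathFormatterConfig rather than raw property maps. The format and the trailing custom plugin class below are hypothetical:

    val path = PathFormatterConfig(
        format = "\${projectId}/\${userId}/\${topic}/\${time:yyyyMM}/\${filename}",
        // Built-in plugins plus a user-supplied class name, space separated.
        plugins = "fixed time key value com.example.MyFormatterPlugin",
    )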
diff --git a/src/main/java/org/radarbase/output/path/KeyPathFormatterPlugin.kt b/src/main/java/org/radarbase/output/path/KeyPathFormatterPlugin.kt index f90c624..71f7b7f 100644 --- a/src/main/java/org/radarbase/output/path/KeyPathFormatterPlugin.kt +++ b/src/main/java/org/radarbase/output/path/KeyPathFormatterPlugin.kt @@ -8,7 +8,7 @@ class KeyPathFormatterPlugin : PathFormatterPlugin() { override val allowedFormats: String = "key:my.key.index" - override fun lookup(parameterContents: String): PathFormatParameters.() -> String { + override fun lookup(parameterContents: String): suspend PathFormatParameters.() -> String { val index = parameterContents.split('.') require(index.none { it.isBlank() }) { "Cannot format key record with index $parameterContents" } return { diff --git a/src/main/java/org/radarbase/output/path/MPPathFormatterPlugin.kt b/src/main/java/org/radarbase/output/path/MPPathFormatterPlugin.kt new file mode 100644 index 0000000..883bc80 --- /dev/null +++ b/src/main/java/org/radarbase/output/path/MPPathFormatterPlugin.kt @@ -0,0 +1,157 @@ +package org.radarbase.output.path + +import kotlinx.coroutines.CoroutineScope +import kotlinx.coroutines.Dispatchers +import kotlinx.coroutines.SupervisorJob +import kotlinx.coroutines.delay +import kotlinx.coroutines.isActive +import kotlinx.coroutines.launch +import org.radarbase.kotlin.coroutines.CacheConfig +import org.radarbase.kotlin.coroutines.CachedMap +import org.radarbase.ktor.auth.ClientCredentialsConfig +import org.radarbase.ktor.auth.clientCredentials +import org.radarbase.management.client.MPClient +import org.radarbase.management.client.MPProject +import org.radarbase.management.client.MPSubject +import org.radarbase.management.client.mpClient +import org.radarbase.output.path.RecordPathFactory.Companion.getOrNull +import org.radarbase.output.path.RecordPathFactory.Companion.sanitizeId +import java.util.concurrent.ConcurrentHashMap +import 
java.util.concurrent.ConcurrentMap +import kotlin.time.Duration.Companion.minutes +import kotlin.time.Duration.Companion.seconds + +/** + * Plugin to read values from ManagementPortal. It requires the plugin properties + * managementPortalUrl, managementPortalClientId and managementPortalClientSecret to be set, + * or managementPortalUrl in combination with the environment variables MANAGEMENT_PORTAL_CLIENT_ID + * and MANAGEMENT_PORTAL_CLIENT_SECRET. + */ +class MPPathFormatterPlugin : PathFormatterPlugin.Factory { + private val supervisorJob = SupervisorJob() + private val pluginScope = CoroutineScope(Dispatchers.Default + supervisorJob) + + override fun create( + properties: Map<String, String>, + ): PathFormatterPlugin = Plugin(properties, pluginScope) + + internal class Plugin( + properties: Map<String, String>, + pluginScope: CoroutineScope, + ) : PathFormatterPlugin() { + override val name: String = "mp" + + override val allowedFormats: String = setOf( + "organization", + "project", + "user", + "externalId", + "group", + "<attribute>", + "project:<attribute>", + ).joinToString(separator = ", ") + + private val mpClient: MPClient + + private val cacheConfig = CacheConfig( + refreshDuration = 10.minutes, + retryDuration = 10.seconds, + maxSimultaneousCompute = 2, + ) + private val projectCache: CachedMap<String, MPProject> + private val subjectCache: ConcurrentMap<String, CachedMap<String, MPSubject>> = ConcurrentHashMap() + + init { + val mpUrl = requireNotNull(properties["managementPortalUrl"]) { "Missing managementPortalUrl configuration" } + .trimEnd('/') + + mpClient = mpClient { + url = "$mpUrl/" + auth { + clientCredentials( + ClientCredentialsConfig( + tokenUrl = "$mpUrl/oauth/token", + clientId = properties["managementPortalClientId"], + clientSecret = properties["managementPortalClientSecret"], + ).copyWithEnv(), + ) + } + } + + projectCache = CachedMap(cacheConfig) { + mpClient.requestProjects().associateBy { it.id } + } + + pluginScope.launch { + while (isActive) { + delay(30.minutes) + subjectCache + .filter { it.value.isStale(20.minutes) } + .forEach { (key, value) -> + subjectCache.remove(key, value) + } + if (projectCache.isStale()) { + projectCache.clear() + } + } + } + } + + override fun lookup(parameterContents: String): suspend PathFormatParameters.() -> String = + when (parameterContents) { + "organization" -> projectProperty("unknown-organization") { + organization?.id + } + "project" -> projectProperty("unknown-project") { id } + "group" -> subjectProperty("default") { group } + "externalId" -> subjectProperty("unknown-user") { externalId ?: id } + "userId", "login", "id" -> subjectProperty("unknown-user") { id } + else -> if (parameterContents.startsWith("project:")) { + projectProperty("unknown-$parameterContents") { + attributes[parameterContents.removePrefix("project:")] + } + } else { + subjectProperty("unknown-$parameterContents") { + attributes[parameterContents] + } + } + } + + private inline fun subjectProperty( + defaultValue: String, + crossinline compute: MPSubject.() -> String?, + ): suspend PathFormatParameters.() -> String = { + sanitizeId(lookupSubject()?.compute(), defaultValue) + } + + private suspend fun PathFormatParameters.lookupSubject(): MPSubject? 
{ + val projectId = key.getOrNull("projectId") ?: return null + val userId = key.getOrNull("userId") ?: return null + + val cache = subjectCache.computeIfAbsent(projectId.toString()) { projectIdString -> + CachedMap(cacheConfig) { + val subjects = mpClient.requestSubjects(projectIdString) + buildMap(subjects.size) { + subjects.forEach { subject -> + val subjectId = subject.id ?: return@forEach + put(subjectId, subject) + } + } + } + } + return cache.get(userId.toString()) + } + + private inline fun projectProperty( + defaultValue: String, + crossinline compute: MPProject.() -> String?, + ): suspend PathFormatParameters.() -> String = { + sanitizeId(lookupProject()?.compute(), defaultValue) + } + + private suspend fun PathFormatParameters.lookupProject(): MPProject? { + val projectId = key.getOrNull("projectId") ?: return null + return projectCache.get(projectId.toString()) + } + } +}
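A configuration sketch for the ManagementPortal plugin above; the URL and client ID are placeholders, and the client secret is left to the MANAGEMENT_PORTAL_CLIENT_SECRET environment variable as described in the class comment:

    val mpPath = PathFormatterConfig(
        format = "\${organization}/\${project}/\${externalId}/\${topic}/\${filename}",
        plugins = "fixed time key value mp",
        properties = mapOf(
            "managementPortalUrl" to "https://radar.example.org/managementportal",
            "managementPortalClientId" to "radar_restructure",
        ),
    )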
diff --git a/src/main/java/org/radarbase/output/path/PathFormatParameters.kt b/src/main/java/org/radarbase/output/path/PathFormatParameters.kt index b6f6bff..3fe552c 100644 --- a/src/main/java/org/radarbase/output/path/PathFormatParameters.kt +++ b/src/main/java/org/radarbase/output/path/PathFormatParameters.kt @@ -9,5 +9,4 @@ data class PathFormatParameters( val value: GenericRecord, val time: Instant?, val attempt: Int, - val extension: String, ) diff --git a/src/main/java/org/radarbase/output/path/PathFormatter.kt b/src/main/java/org/radarbase/output/path/PathFormatter.kt index 5ffcece..10349ca 100644 --- a/src/main/java/org/radarbase/output/path/PathFormatter.kt +++ b/src/main/java/org/radarbase/output/path/PathFormatter.kt @@ -17,16 +17,16 @@ package org.radarbase.output.path import org.slf4j.LoggerFactory -import java.nio.file.Path -import java.nio.file.Paths class PathFormatter( private val format: String, - private val plugins: List<PathFormatterPlugin> + private val plugins: List<PathFormatterPlugin>, + checkMinimalDistinction: Boolean = true, ) { - private val parameterLookups: Map<String, PathFormatParameters.() -> String> + private val parameterLookups: Map<String, suspend PathFormatParameters.() -> String> init { + require(format.isNotBlank()) { "Path format may not be an empty string" } val foundParameters = "\\$\\{([^}]*)}".toRegex() .findAll(format) .mapTo(HashSet()) { it.groupValues[1] } @@ -39,7 +39,7 @@ class PathFormatter( } catch (ex: IllegalArgumentException) { logger.error("Cannot parse path format {}, illegal format parameter found by plugin {}", format, plugin.name, ex) throw ex - } + }, ) } } @@ -49,25 +49,23 @@ class PathFormatter( "Cannot use path format $format: unknown parameters $unsupportedParameters." + " Legal parameter names are parameters $allowedFormats" } - require("topic" in parameterLookups) { "Path must include topic parameter." } - require( - "filename" in parameterLookups || - ("extension" in parameterLookups && "attempt" in parameterLookups) - ) { - "Path must include filename parameter or extension and attempt parameters." + if (checkMinimalDistinction) { + require("topic" in parameterLookups) { "Path must include topic parameter." } + require( + "filename" in parameterLookups || + ("extension" in parameterLookups && "attempt" in parameterLookups), + ) { + "Path must include filename parameter or extension and attempt parameters." + } } } - fun format( + suspend fun format( parameters: PathFormatParameters, - ): Path { - val path = parameterLookups.asSequence() - .fold(format) { p, (name, lookup) -> - p.replace("\${$name}", parameters.lookup()) - } - - return Paths.get(path) - } + ): String = parameterLookups.asSequence() + .fold(format) { p, (name, lookup) -> + p.replace("\${$name}", parameters.lookup()) + } override fun toString(): String = "PathFormatter{" + "format=$format," + diff --git a/src/main/java/org/radarbase/output/path/PathFormatterPlugin.kt b/src/main/java/org/radarbase/output/path/PathFormatterPlugin.kt index 122f2c5..25e37b2 100644 --- a/src/main/java/org/radarbase/output/path/PathFormatterPlugin.kt +++ b/src/main/java/org/radarbase/output/path/PathFormatterPlugin.kt @@ -1,5 +1,49 @@ package org.radarbase.output.path +import org.slf4j.LoggerFactory +import kotlin.reflect.jvm.jvmName + +private val logger = LoggerFactory.getLogger(PathFormatterPlugin::class.java) + +internal fun String.toPathFormatterPlugins( + properties: Map<String, String>, +): List<PathFormatterPlugin> = + splitToSequence("\\s+".toRegex()) + .filter { it.isNotEmpty() } + .mapNotNull { it.toPathFormatterPlugin(properties) } + .toList() + +internal fun String.toPathFormatterPlugin( + properties: Map<String, String>, +): PathFormatterPlugin? = when (this) { + "fixed" -> FixedPathFormatterPlugin().create(properties) + "time" -> TimePathFormatterPlugin() + "key" -> KeyPathFormatterPlugin() + "value" -> ValuePathFormatterPlugin() + "mp" -> MPPathFormatterPlugin().create(properties) + else -> { + try { + val clazz = Class.forName(this) + when (val plugin = clazz.getConstructor().newInstance()) { + is PathFormatterPlugin -> plugin + is PathFormatterPlugin.Factory -> plugin.create(properties) + else -> { + logger.error( + "Failed to instantiate plugin {}, it does not extend {} or {}", + this, + PathFormatterPlugin::class.jvmName, + PathFormatterPlugin.Factory::class.jvmName, + ) + null + } + } + } catch (ex: ReflectiveOperationException) { + logger.error("Failed to instantiate plugin {}", this) + null + } + } +} + abstract class PathFormatterPlugin { /** * Short name to reference this plugin by. @@ -22,8 +66,8 @@ abstract class PathFormatterPlugin { * @throws IllegalArgumentException if any of the parameter contents are invalid. */ fun createLookupTable( - parameterNames: Collection<String> - ): Map<String, PathFormatParameters.() -> String> = buildMap { + parameterNames: Collection<String>, + ): Map<String, suspend PathFormatParameters.() -> String> = buildMap { parameterNames.forEach { paramName -> val paramContents = extractParamContents(paramName) if (paramContents != null) { @@ -49,7 +93,7 @@ abstract class PathFormatterPlugin { * Create a lookup function from a record to formatted value, based on parameter contents. 
*/ - protected abstract fun lookup(parameterContents: String): PathFormatParameters.() -> String + protected abstract fun lookup(parameterContents: String): suspend PathFormatParameters.() -> String interface Factory { /** diff --git a/src/main/java/org/radarbase/output/path/RecordPathFactory.kt b/src/main/java/org/radarbase/output/path/RecordPathFactory.kt index 68f5156..8b6c2ed 100644 --- a/src/main/java/org/radarbase/output/path/RecordPathFactory.kt +++ b/src/main/java/org/radarbase/output/path/RecordPathFactory.kt @@ -16,22 +16,44 @@ package org.radarbase.output.path +import kotlinx.coroutines.async +import kotlinx.coroutines.coroutineScope import org.apache.avro.Schema import org.apache.avro.generic.GenericRecord import org.apache.avro.generic.GenericRecordBuilder -import org.radarbase.output.FileStoreFactory -import org.radarbase.output.Plugin +import org.radarbase.output.config.PathConfig import org.radarbase.output.config.TopicConfig import org.radarbase.output.util.TimeUtil import org.slf4j.LoggerFactory import java.nio.file.Path -import java.time.Instant +import java.nio.file.Paths import java.util.regex.Pattern -abstract class RecordPathFactory : Plugin { - lateinit var root: Path - lateinit var extension: String - lateinit var fileStoreFactory: FileStoreFactory +abstract class RecordPathFactory { + lateinit var pathConfig: PathConfig + private set + + open fun init( + extension: String, + config: PathConfig, + topics: Map<String, TopicConfig> = emptyMap(), + ) { + this.pathConfig = config.copy( + output = if (config.output.isAbsolute) { + rootPath.relativize(config.output) + } else { + config.output + }, + path = config.path.copy( + properties = buildMap { + putAll(config.path.properties) + putIfAbsent("extension", extension) + }, + ), + ) + + this.addTopicConfiguration(topics) + } /** * Get the organization of given record in given topic. @@ -41,18 +63,14 @@ abstract class RecordPathFactory : Plugin { * paths already existed and are incompatible. * @return organization of given record */ - open fun getRecordPath( + open suspend fun getRecordPath( topic: String, record: GenericRecord, attempt: Int, ): Path { - val keyField = record.get("key") - val valueField = record.get("value") as? GenericRecord - - if (keyField == null || valueField == null) { - logger.error("Failed to process {}", record) - throw IllegalArgumentException("Failed to process $record; no key or value") - } + val keyField = requireNotNull(record.get("key")) { "Failed to process $record; no key present" } + val valueField = + requireNotNull(record.get("value") as? GenericRecord) { "Failed to process $record; no value present" } val keyRecord: GenericRecord = if (keyField is GenericRecord) { keyField @@ -64,33 +82,42 @@ abstract class RecordPathFactory : Plugin { }.build() } - val time = TimeUtil.getDate(keyRecord, valueField) + val params = PathFormatParameters( + topic = topic, + key = keyRecord, + value = valueField, + time = TimeUtil.getDate(keyRecord, valueField), + attempt = attempt, + ) + + return coroutineScope { + val bucketJob = async { bucket(params) } + val pathJob = async { relativePath(params) } - val relativePath = getRelativePath(topic, keyRecord, valueField, time, attempt) - return root.resolve(relativePath) + val path = pathConfig.output.resolve(pathJob.await()) + val bucket = bucketJob.await() + if (bucket != null) { + Paths.get(bucket).resolve(path) + } else { + path + } + } } + abstract suspend fun bucket(pathParameters: PathFormatParameters?): String? 
+ /** * Get the relative path corresponding to given record on given topic. - * @param topic Kafka topic name - * @param key record key - * @param value record value - * @param time time contained in the record - * @param attempt number of previous attempts to write given record. This increases if previous - * paths already existed and are incompatible. + * @param pathParameters Parameters needed to determine the path * @return relative path corresponding to given parameters. */ - abstract fun getRelativePath( - topic: String, - key: GenericRecord, - value: GenericRecord, - time: Instant?, - attempt: Int, - ): Path + abstract suspend fun relativePath( + pathParameters: PathFormatParameters, + ): String companion object { - private val logger = LoggerFactory.getLogger(RecordPathFactory::class.java) private val ILLEGAL_CHARACTER_PATTERN = Pattern.compile("[^a-zA-Z0-9_-]+") + private val rootPath = Paths.get("/") fun sanitizeId(id: Any?, defaultValue: String): String = id ?.let { ILLEGAL_CHARACTER_PATTERN.matcher(it.toString()).replaceAll("") } @@ -110,7 +137,7 @@ abstract class RecordPathFactory : Plugin { {"name": "sourceId", "type": "string", "doc": "Unique identifier associated with the source."} ] } - """.trimIndent() + """.trimIndent(), ) fun GenericRecord.getFieldOrNull(fieldName: String): Schema.Field? { @@ -122,5 +149,7 @@ abstract class RecordPathFactory : Plugin { ?.let { get(it.pos()) } } - open fun addTopicConfiguration(topicConfig: Map) = Unit + private val logger = LoggerFactory.getLogger(RecordPathFactory::class.java) + + protected open fun addTopicConfiguration(topicConfig: Map) = Unit } diff --git a/src/main/java/org/radarbase/output/path/TimePathFormatterPlugin.kt b/src/main/java/org/radarbase/output/path/TimePathFormatterPlugin.kt index 9c7ad43..8ce6817 100644 --- a/src/main/java/org/radarbase/output/path/TimePathFormatterPlugin.kt +++ b/src/main/java/org/radarbase/output/path/TimePathFormatterPlugin.kt @@ -9,7 +9,7 @@ class TimePathFormatterPlugin : PathFormatterPlugin() { override val allowedFormats: String = "time:YYYY-mm-dd" - override fun lookup(parameterContents: String): PathFormatParameters.() -> String { + override fun lookup(parameterContents: String): suspend PathFormatParameters.() -> String { val dateFormatter = DateTimeFormatter .ofPattern(parameterContents) .withZone(ZoneOffset.UTC) diff --git a/src/main/java/org/radarbase/output/path/ValuePathFormatterPlugin.kt b/src/main/java/org/radarbase/output/path/ValuePathFormatterPlugin.kt index c8e794f..65f2668 100644 --- a/src/main/java/org/radarbase/output/path/ValuePathFormatterPlugin.kt +++ b/src/main/java/org/radarbase/output/path/ValuePathFormatterPlugin.kt @@ -10,7 +10,7 @@ class ValuePathFormatterPlugin : PathFormatterPlugin() { override val allowedFormats: String = "value:my.value.index" - override fun lookup(parameterContents: String): PathFormatParameters.() -> String { + override fun lookup(parameterContents: String): suspend PathFormatParameters.() -> String { val index = parameterContents.split('.') require(index.none { it.isBlank() }) { "Cannot format value record with index $parameterContents" } return { diff --git a/src/main/java/org/radarbase/output/source/AzureSourceStorage.kt b/src/main/java/org/radarbase/output/source/AzureSourceStorage.kt index ae86d0e..2ca026d 100644 --- a/src/main/java/org/radarbase/output/source/AzureSourceStorage.kt +++ b/src/main/java/org/radarbase/output/source/AzureSourceStorage.kt @@ -8,9 +8,10 @@ import org.apache.avro.file.SeekableFileInput import 
diff --git a/src/main/java/org/radarbase/output/source/AzureSourceStorage.kt b/src/main/java/org/radarbase/output/source/AzureSourceStorage.kt index ae86d0e..2ca026d 100644 --- a/src/main/java/org/radarbase/output/source/AzureSourceStorage.kt +++ b/src/main/java/org/radarbase/output/source/AzureSourceStorage.kt @@ -8,9 +8,10 @@ import org.apache.avro.file.SeekableFileInput import org.apache.avro.file.SeekableInput import org.radarbase.output.config.AzureConfig import org.radarbase.output.util.TemporaryDirectory -import org.radarbase.output.util.toKey +import org.radarbase.output.util.withoutFirstSegment import java.nio.file.Path import java.nio.file.Paths +import java.time.Instant import kotlin.io.path.createTempFile import kotlin.io.path.deleteIfExists @@ -22,24 +23,30 @@ class AzureSourceStorage( private val blobContainerClient = client.getBlobContainerClient(config.container) private val readOffsetFromMetadata = config.endOffsetFromMetadata - private fun blobClient(path: Path) = blobContainerClient.getBlobClient(path.toKey()) + private fun blobClient(path: Path) = blobContainerClient.getBlobClient(path.withoutFirstSegment()) - override suspend fun list(path: Path, maxKeys: Int?): List<SimpleFileStatus> = + override suspend fun list(path: Path, startAfter: Path?, maxKeys: Int?): List<StorageNode> = withContext(Dispatchers.IO) { var iterable: Iterable<BlobItem> = blobContainerClient.listBlobsByHierarchy("$path/") + if (startAfter != null) { + iterable = iterable.filter { Paths.get(it.name) > startAfter } + } if (maxKeys != null) { iterable = iterable.take(maxKeys) } iterable.map { - SimpleFileStatus( - Paths.get(it.name), - it.isPrefix ?: false, - it.properties?.lastModified?.toInstant(), - ) + if (it.isPrefix == true) { + StorageNode.StorageDirectory(Paths.get(it.name)) + } else { + StorageNode.StorageFile( + Paths.get(it.name), + it.properties?.lastModified?.toInstant() ?: Instant.now(), + ) + } } } - override suspend fun createTopicFile(topic: String, status: SimpleFileStatus): TopicFile { + override suspend fun createTopicFile(topic: String, status: StorageNode): TopicFile { var topicFile = super.createTopicFile(topic, status) if (readOffsetFromMetadata && topicFile.range.range.to == null) { diff --git a/src/main/java/org/radarbase/output/source/DelegatingStorageIndex.kt b/src/main/java/org/radarbase/output/source/DelegatingStorageIndex.kt new file mode 100644 index 0000000..a9754df --- /dev/null +++ b/src/main/java/org/radarbase/output/source/DelegatingStorageIndex.kt @@ -0,0 +1,14 @@ +package org.radarbase.output.source + +/** + * Delegate all calls directly to the underlying storage. This effectively means that no caching + * takes place. + */ +class DelegatingStorageIndex( + private val sourceStorage: SourceStorage, +) : StorageIndex { + override suspend fun list(dir: StorageNode.StorageDirectory, maxKeys: Int?): List<StorageNode> = + sourceStorage.list(dir.path, maxKeys = maxKeys) + + override suspend fun remove(file: StorageNode.StorageFile) = Unit +} diff --git a/src/main/java/org/radarbase/output/source/InMemoryStorageIndex.kt b/src/main/java/org/radarbase/output/source/InMemoryStorageIndex.kt new file mode 100644 index 0000000..e167624 --- /dev/null +++ b/src/main/java/org/radarbase/output/source/InMemoryStorageIndex.kt @@ -0,0 +1,121 @@ +package org.radarbase.output.source + +import org.radarbase.output.source.StorageIndex.Companion.ROOT +import java.nio.file.Path +import java.util.concurrent.ConcurrentHashMap +import java.util.concurrent.ConcurrentMap + +/** + * Storage index that keeps the given file tree in memory. + * For very large file systems, this may + * cause a memory issue. 
+ */ +class InMemoryStorageIndex : MutableStorageIndex { + private val fileIndex: ConcurrentMap<StorageNode.StorageDirectory, Map<Path, StorageNode>> = ConcurrentHashMap() + private val rootSet = ConcurrentHashMap<Path, StorageNode>() + + init { + fileIndex[ROOT] = rootSet + } + + override suspend fun list(dir: StorageNode.StorageDirectory, maxKeys: Int?): List<StorageNode> { + val listing = if (dir === ROOT) { + rootSet + } else { + fileIndex[dir] ?: return listOf() + } + + return if (maxKeys != null) { + listing.values.take(maxKeys) + } else { + listing.values.toList() + } + } + + /** Adds a node and all its parents to the file hierarchy. */ + private fun add(dir: StorageNode.StorageDirectory) { + var currentDir = dir + var parentDir = currentDir.parent() + while (parentDir != null) { + fileIndex.compute(parentDir) { _, map -> + if (map == null) { + mapOf(currentDir.path to currentDir) + } else { + buildMap(map.size + 1) { + putAll(map) + put(currentDir.path, currentDir) + } + } + } + currentDir = parentDir + parentDir = currentDir.parent() + } + rootSet[currentDir.path] = currentDir + } + + override suspend fun addAll(parent: StorageNode.StorageDirectory, nodes: List<StorageNode>): Collection<StorageNode> { + add(parent) + + if (nodes.isEmpty()) { + return fileIndex[parent]?.values + ?: listOf() + } + + nodes.asSequence() + .filterIsInstance<StorageNode.StorageDirectory>() + .forEach { node -> + fileIndex.computeIfAbsent(node) { + mapOf() + } + } + val newMap = fileIndex.compute(parent) { _, map -> + if (map == null) { + buildMap(nodes.size) { + nodes.forEach { put(it.path, it) } + } + } else { + buildMap(nodes.size + map.size) { + putAll(map) + nodes.forEach { put(it.path, it) } + } + } + } ?: mapOf() + + return newMap.values + } + + override suspend fun sync(parent: StorageNode.StorageDirectory, nodes: List<StorageNode>) { + add(parent) + val newMap = buildMap(nodes.size) { + nodes.forEach { put(it.path, it) } + } + + fileIndex[parent] = newMap + + nodes.asSequence() + .filterIsInstance<StorageNode.StorageDirectory>() + .filter { it.path !in newMap } + .forEach { removeRecursive(it) } + } + + override suspend fun remove(file: StorageNode.StorageFile) { + val parent = file.parent() + + if (parent != null) { + fileIndex.computeIfPresent(parent) { _, map -> + (map - file.path).takeIf { it.isNotEmpty() } + } + } else { + rootSet.remove(file.path) + } + } + + private fun removeRecursive(node: StorageNode.StorageDirectory) { + val directoriesToRemove = ArrayDeque<StorageNode.StorageDirectory>() + fileIndex.remove(node)?.values?.filterIsInstanceTo(directoriesToRemove) + while (directoriesToRemove.isNotEmpty()) { + val first = directoriesToRemove.removeFirst() + fileIndex.remove(first)?.values?.filterIsInstanceTo(directoriesToRemove) + } + } +}
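How an index is driven in practice (illustrative wiring only; the real manager schedules this according to StorageIndexConfig's fullSyncInterval and emptyDirectorySyncInterval):

    suspend fun fullSync(index: MutableStorageIndex, storage: SourceStorage) {
        // Replace the cached root listing with a fresh one from source storage;
        // subsequent list() calls are then served from memory.
        val nodes = storage.list(StorageIndex.ROOT.path, startAfter = null, maxKeys = null)
        index.sync(StorageIndex.ROOT, nodes)
    }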
+ * All values in [nodes] should have [parent] as parent node. All nodes in the index + * corresponding to [parent] will be removed from that directory and replaced by the given list. + */ + suspend fun sync(parent: StorageNode.StorageDirectory, nodes: List) +} diff --git a/src/main/java/org/radarbase/output/source/S3SourceStorage.kt b/src/main/java/org/radarbase/output/source/S3SourceStorage.kt index c36dda6..8a33444 100644 --- a/src/main/java/org/radarbase/output/source/S3SourceStorage.kt +++ b/src/main/java/org/radarbase/output/source/S3SourceStorage.kt @@ -1,6 +1,9 @@ package org.radarbase.output.source -import io.minio.* +import io.minio.GetObjectTagsArgs +import io.minio.ListObjectsArgs +import io.minio.MinioClient +import io.minio.RemoveObjectArgs import io.minio.errors.ErrorResponseException import io.minio.messages.Tags import kotlinx.coroutines.Dispatchers @@ -9,23 +12,14 @@ import kotlinx.coroutines.flow.first import kotlinx.coroutines.flow.flow import kotlinx.coroutines.flow.flowOn import kotlinx.coroutines.flow.retryWhen -import kotlinx.coroutines.withContext -import org.apache.avro.file.SeekableFileInput -import org.apache.avro.file.SeekableInput import org.radarbase.output.config.S3Config -import org.radarbase.output.util.TemporaryDirectory import org.radarbase.output.util.bucketBuild import org.radarbase.output.util.objectBuild import org.slf4j.LoggerFactory import java.io.FileNotFoundException -import java.io.IOException import java.nio.file.Path import java.nio.file.Paths -import java.nio.file.StandardOpenOption -import kotlin.io.path.createTempFile -import kotlin.io.path.deleteExisting -import kotlin.io.path.deleteIfExists -import kotlin.io.path.outputStream +import kotlin.io.path.pathString import kotlin.time.Duration.Companion.seconds class S3SourceStorage( @@ -33,19 +27,23 @@ class S3SourceStorage( config: S3Config, private val tempPath: Path, ) : SourceStorage { - private val bucket = config.bucket + private val bucket = requireNotNull(config.bucket) { "Source storage requires a bucket name" } private val readEndOffset = config.endOffsetFromTags override suspend fun list( path: Path, + startAfter: Path?, maxKeys: Int?, - ): List { + ): List { val listRequest = ListObjectsArgs.Builder().bucketBuild(bucket) { if (maxKeys != null) { maxKeys(maxKeys.coerceAtMost(1000)) } prefix("$path/") recursive(false) + if (startAfter != null) { + startAfter(startAfter.pathString) + } useUrlEncodingType(false) } var iterable = faultTolerant { s3Client.listObjects(listRequest) } @@ -55,15 +53,16 @@ class S3SourceStorage( return iterable .map { val item = it.get() - SimpleFileStatus( - Paths.get(item.objectName()), - item.isDir, - if (item.isDir) null else item.lastModified().toInstant() - ) + val itemPath = Paths.get(item.objectName()) + if (item.isDir) { + StorageNode.StorageDirectory(itemPath) + } else { + StorageNode.StorageFile(itemPath, item.lastModified().toInstant()) + } } } - override suspend fun createTopicFile(topic: String, status: SimpleFileStatus): TopicFile { + override suspend fun createTopicFile(topic: String, status: StorageNode): TopicFile { var topicFile = super.createTopicFile(topic, status) if (readEndOffset && topicFile.range.range.to == null) { @@ -74,7 +73,7 @@ class S3SourceStorage( topicFile = topicFile.copy( range = topicFile.range.mapRange { it.copy(to = endOffset) - } + }, ) } } catch (ex: Exception) { @@ -95,49 +94,7 @@ class S3SourceStorage( faultTolerant { s3Client.removeObject(removeRequest) } } - override fun createReader(): 
SourceStorage.SourceStorageReader = S3SourceStorageReader() - - private inner class S3SourceStorageReader : SourceStorage.SourceStorageReader { - private val tempDir = TemporaryDirectory(tempPath, "worker-") - - override suspend fun newInput(file: TopicFile): SeekableInput = - withContext(Dispatchers.IO) { - val tempFile = createTempFile( - directory = tempDir.path, - prefix = "${file.topic}-${file.path.fileName}", - suffix = ".avro", - ) - try { - faultTolerant { - tempFile.outputStream(StandardOpenOption.TRUNCATE_EXISTING).use { out -> - s3Client.getObject( - GetObjectArgs.Builder() - .objectBuild(bucket, file.path) - ).use { input -> - input.copyTo(out) - } - } - } - } catch (ex: Exception) { - try { - tempFile.deleteExisting() - } catch (ex: IOException) { - logger.warn("Failed to delete temporary file {}", tempFile) - } - throw ex - } - object : SeekableFileInput(tempFile.toFile()) { - override fun close() { - super.close() - tempFile.deleteIfExists() - } - } - } - - override suspend fun closeAndJoin() = withContext(Dispatchers.IO) { - tempDir.close() - } - } + override fun createReader(): SourceStorage.SourceStorageReader = S3SourceStorageReader(tempPath, s3Client, bucket) companion object { private val logger = LoggerFactory.getLogger(S3SourceStorage::class.java) diff --git a/src/main/java/org/radarbase/output/source/S3SourceStorageReader.kt b/src/main/java/org/radarbase/output/source/S3SourceStorageReader.kt new file mode 100644 index 0000000..3a4a067 --- /dev/null +++ b/src/main/java/org/radarbase/output/source/S3SourceStorageReader.kt @@ -0,0 +1,67 @@ +package org.radarbase.output.source + +import io.minio.GetObjectArgs +import io.minio.MinioClient +import kotlinx.coroutines.Dispatchers +import kotlinx.coroutines.withContext +import org.apache.avro.file.SeekableFileInput +import org.apache.avro.file.SeekableInput +import org.radarbase.output.source.S3SourceStorage.Companion.faultTolerant +import org.radarbase.output.util.TemporaryDirectory +import org.radarbase.output.util.objectBuild +import org.slf4j.LoggerFactory +import java.io.IOException +import java.nio.file.Path +import java.nio.file.StandardOpenOption +import kotlin.io.path.deleteExisting +import kotlin.io.path.deleteIfExists +import kotlin.io.path.outputStream + +internal class S3SourceStorageReader( + tempPath: Path, + private val s3Client: MinioClient, + private val bucket: String, +) : SourceStorage.SourceStorageReader { + private val tempDir = TemporaryDirectory(tempPath, "worker-") + + override suspend fun newInput(file: TopicFile): SeekableInput = withContext(Dispatchers.IO) { + val tempFile = kotlin.io.path.createTempFile( + directory = tempDir.path, + prefix = "${file.topic}-${file.path.fileName}", + suffix = ".avro", + ) + try { + faultTolerant { + tempFile.outputStream(StandardOpenOption.TRUNCATE_EXISTING).use { out -> + s3Client.getObject( + GetObjectArgs.Builder() + .objectBuild(bucket, file.path), + ).use { input -> + input.copyTo(out) + } + } + } + } catch (ex: Exception) { + try { + tempFile.deleteExisting() + } catch (ex: IOException) { + logger.warn("Failed to delete temporary file {}", tempFile) + } + throw ex + } + object : SeekableFileInput(tempFile.toFile()) { + override fun close() { + super.close() + tempFile.deleteIfExists() + } + } + } + + override suspend fun closeAndJoin() = withContext(Dispatchers.IO) { + tempDir.close() + } + + companion object { + private val logger = LoggerFactory.getLogger(S3SourceStorageReader::class.java) + } +} diff --git 
a/src/main/java/org/radarbase/output/source/SourceStorage.kt b/src/main/java/org/radarbase/output/source/SourceStorage.kt
index 45dda78..d07cf23 100644
--- a/src/main/java/org/radarbase/output/source/SourceStorage.kt
+++ b/src/main/java/org/radarbase/output/source/SourceStorage.kt
@@ -16,15 +16,16 @@ interface SourceStorage {
     /** List all files in the given directory. */
     suspend fun list(
         path: Path,
+        startAfter: Path? = null,
         maxKeys: Int? = null,
-    ): List<SimpleFileStatus>
+    ): List<StorageNode>

     /** Delete given file. Will not delete any directories. */
     suspend fun delete(path: Path)

-    suspend fun createTopicFile(topic: String, status: SimpleFileStatus): TopicFile = TopicFile(
+    suspend fun createTopicFile(topic: String, status: StorageNode): TopicFile = TopicFile(
         topic = topic,
         path = status.path,
-        lastModified = status.lastModified ?: Instant.now(),
+        lastModified = if (status is StorageNode.StorageFile) status.lastModified else Instant.now(),
     )

     /**
@@ -32,11 +33,12 @@
      * The path must only contain records of a single topic, this is not verified.
      */
     suspend fun listTopicFiles(
+        storageIndex: StorageIndex,
         topic: String,
         topicPath: Path,
         limit: Int,
         predicate: (TopicFile) -> Boolean,
-    ): List<TopicFile> = avroFileTreeLister()
+    ): List<TopicFile> = storageIndex.avroFileTreeLister(this)
         .list(TopicPath(topic, topicPath), limit, predicate)

     /**
@@ -44,9 +46,10 @@
      * Exclude paths belonging to the set of given excluded topics.
      */
     suspend fun listTopics(
+        storageIndex: StorageIndex,
         root: Path,
         exclude: Set<String>,
-    ): List<Path> = avroTopicTreeLister()
+    ): List<Path> = storageIndex.avroTopicTreeLister()
         .listTo(LinkedHashSet(), root)
         .filter { it.fileName.toString() !in exclude }
diff --git a/src/main/java/org/radarbase/output/source/SourceStorageFactory.kt b/src/main/java/org/radarbase/output/source/SourceStorageFactory.kt
index 4d26f35..6396a49 100644
--- a/src/main/java/org/radarbase/output/source/SourceStorageFactory.kt
+++ b/src/main/java/org/radarbase/output/source/SourceStorageFactory.kt
@@ -13,13 +13,17 @@ class SourceStorageFactory(
     private val s3SourceClient: MinioClient? = if (resourceConfig.sourceType == ResourceType.S3) {
         requireNotNull(resourceConfig.s3) { "Missing S3 configuration" }
             .createS3Client()
-    } else null
+    } else {
+        null
+    }

     private val azureSourceClient: BlobServiceClient?
= if (resourceConfig.sourceType == ResourceType.AZURE) { requireNotNull(resourceConfig.azure) { "Missing Azure configuration" } .createAzureClient() - } else null + } else { + null + } fun createSourceStorage() = when (resourceConfig.sourceType) { ResourceType.S3 -> { @@ -27,14 +31,6 @@ class SourceStorageFactory( val minioClient = requireNotNull(s3SourceClient) { "Missing S3 client configuration for source storage" } S3SourceStorage(minioClient, s3Config, tempPath) } - ResourceType.HDFS -> { - val storage = Class.forName("org.radarbase.output.source.HdfsSourceStorageFactory") - val constructor = - storage.getDeclaredConstructor(ResourceConfig::class.java, Path::class.java) - val factory = constructor.newInstance(resourceConfig, tempPath) - val createSourceStorage = storage.getDeclaredMethod("createSourceStorage") - createSourceStorage.invoke(factory) as SourceStorage - } ResourceType.AZURE -> { val azureClient = requireNotNull(azureSourceClient) { "Missing Azure client configuration for source storage" } val azureConfig = requireNotNull(resourceConfig.azure) { "Missing Azure configuration for source storage" } diff --git a/src/main/java/org/radarbase/output/source/StorageIndex.kt b/src/main/java/org/radarbase/output/source/StorageIndex.kt new file mode 100644 index 0000000..51dd258 --- /dev/null +++ b/src/main/java/org/radarbase/output/source/StorageIndex.kt @@ -0,0 +1,29 @@ +package org.radarbase.output.source + +import java.nio.file.Paths + +/** + * Index of files in a source storage. + * This index does not modify itself so it needs to be synced by a [StorageIndexManager]. + */ +interface StorageIndex { + /** + * List given directory. + * If [maxKeys] is given, no more than that many entries will be returned. + */ + suspend fun list(dir: StorageNode.StorageDirectory, maxKeys: Int? = null): List + + /** + * Remove a file from the index. + * This will typically be called if the file was removed by the current process. + */ + suspend fun remove(file: StorageNode.StorageFile) + + companion object { + /** + * Root directory. All files that are in the index can be found by traversing the index + * starting at this root. + */ + val ROOT = StorageNode.StorageDirectory(Paths.get("/")) + } +} diff --git a/src/main/java/org/radarbase/output/source/StorageIndexManager.kt b/src/main/java/org/radarbase/output/source/StorageIndexManager.kt new file mode 100644 index 0000000..c9ed4ed --- /dev/null +++ b/src/main/java/org/radarbase/output/source/StorageIndexManager.kt @@ -0,0 +1,100 @@ +package org.radarbase.output.source + +import org.radarbase.kotlin.coroutines.forkJoin +import org.slf4j.LoggerFactory +import java.nio.file.Path +import java.time.Duration +import java.time.Instant + +/** Manager to manage a storage index. */ +class StorageIndexManager( + /** Storage index to manage. */ + val storageIndex: StorageIndex, + /** Source storage to index. */ + private val sourceStorage: SourceStorage, + /** Root directory in source storage to start scanning. */ + root: Path, + /** How often to rescan the full directory structure. */ + private val rescanDirectoryDuration: Duration, + /** How often to rescan empty directories. */ + private val rescanEmptyDuration: Duration, +) { + private val root = StorageNode.StorageDirectory(root) + + private var nextSync = Instant.MIN + + private var nextEmptySync = Instant.MIN + + /** Update the storage index, taking into account caching times. 
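+     *
+     * A minimal driving sketch (hypothetical setup; in the application these
+     * values come from the service configuration, and this method is invoked
+     * from the restructure service loop within a coroutine):
+     * ```
+     * val manager = StorageIndexManager(
+     *     storageIndex = InMemoryStorageIndex(),
+     *     sourceStorage = sourceStorage,
+     *     root = Paths.get("in"),
+     *     rescanDirectoryDuration = Duration.ofHours(6),
+     *     rescanEmptyDuration = Duration.ofHours(1),
+     * )
+     * manager.update() // incremental scan, or a full sync() once the rescan is due
+     * ```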
+     */
+    suspend fun update() {
+        if (storageIndex !is MutableStorageIndex) return
+        if (nextSync < Instant.now()) {
+            sync()
+        } else {
+            val rescanEmpty = nextEmptySync < Instant.now()
+            if (rescanEmpty) {
+                logger.info("Updating source {} index (including empty directories)...", root)
+                nextEmptySync = Instant.now() + rescanEmptyDuration
+            } else {
+                logger.info("Updating source {} index (excluding empty directories)...", root)
+            }
+            val listOperations = storageIndex.updateLevel(root, rescanEmpty)
+            logger.debug("Updated source {} with {} list operations...", root, listOperations)
+        }
+    }
+
+    private suspend fun MutableStorageIndex.updateLevel(node: StorageNode.StorageDirectory, rescanEmpty: Boolean): Long {
+        val list = list(node)
+        if (list.isEmpty()) {
+            return if (rescanEmpty) {
+                syncLevel(node)
+            } else {
+                0L
+            }
+        }
+        val lastFile = list.asSequence()
+            .filterIsInstance<StorageNode.StorageFile>()
+            .maxByOrNull { it.path }
+
+        val currentOperations = if (lastFile != null) {
+            addAll(node, sourceStorage.list(node.path, startAfter = lastFile.path))
+            1L
+        } else {
+            0L
+        }
+
+        val listOperations = list(node)
+            .filterIsInstance<StorageNode.StorageDirectory>()
+            .filterNot { it.path.fileName.toString() == "+tmp" }
+            .forkJoin { updateLevel(it, rescanEmpty) }
+            .sum()
+
+        return currentOperations + listOperations
+    }
+
+    /** Fully synchronize the storage index with the source storage. */
+    suspend fun sync() {
+        if (storageIndex !is MutableStorageIndex) return
+        logger.info("Syncing source {} index...", root)
+        val listOperations = storageIndex.syncLevel(root)
+        logger.debug("Synced source {} index with {} list operations...", root, listOperations)
+        nextSync = Instant.now() + rescanDirectoryDuration
+        nextEmptySync = Instant.now() + rescanEmptyDuration
+    }
+
+    private suspend fun MutableStorageIndex.syncLevel(node: StorageNode.StorageDirectory): Long {
+        sync(node, sourceStorage.list(node.path))
+
+        val listOperations = list(node)
+            .filterIsInstance<StorageNode.StorageDirectory>()
+            .filterNot { it.path.fileName.toString() == "+tmp" }
+            .forkJoin { syncLevel(it) }
+            .sum()
+
+        return 1L + listOperations
+    }
+
+    companion object {
+        private val logger = LoggerFactory.getLogger(StorageIndexManager::class.java)
+    }
+}
diff --git a/src/main/java/org/radarbase/output/source/StorageNode.kt b/src/main/java/org/radarbase/output/source/StorageNode.kt
new file mode 100644
index 0000000..14fd1d1
--- /dev/null
+++ b/src/main/java/org/radarbase/output/source/StorageNode.kt
@@ -0,0 +1,37 @@
+package org.radarbase.output.source
+
+import java.nio.file.Path
+import java.time.Instant
+
+/**
+ * A node in a file tree of the source or target storage.
+ */
+sealed interface StorageNode {
+    /** Path that the node represents. */
+    val path: Path
+
+    /**
+     * Parent of the current node, or `null` if the current node is the storage root or topmost
+     * level of a relative path.
+     */
+    fun parent(): StorageDirectory? {
+        val parentPath = path.parent
+        return if (parentPath != null) {
+            StorageDirectory(parentPath)
+        } else {
+            null
+        }
+    }
+
+    /** Storage node that represents a directory. */
+    data class StorageDirectory(override val path: Path) : StorageNode
+
+    /**
+     * Storage node that represents a file.
+     */
+    data class StorageFile(
+        override val path: Path,
+        /** Time that the file was last modified.
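+         * Used, among other things, to decide whether a file is old enough to
+         * be restructured (the minimum file age check in RadarKafkaRestructure).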
+         */
+        val lastModified: Instant,
+    ) : StorageNode
+}
diff --git a/src/main/java/org/radarbase/output/source/TopicFileList.kt b/src/main/java/org/radarbase/output/source/TopicFileList.kt
index 8d8f89b..d8a9027 100644
--- a/src/main/java/org/radarbase/output/source/TopicFileList.kt
+++ b/src/main/java/org/radarbase/output/source/TopicFileList.kt
@@ -21,10 +21,8 @@ data class TopicFile(
         topic,
         path,
         lastModified,
-        range = TopicPartitionOffsetRange.parseFilename(path.fileName.toString(), lastModified)
+        range = TopicPartitionOffsetRange.parseFilename(path.fileName.toString(), lastModified),
     )

     val size: Long? = range.range.size
 }
-
-data class SimpleFileStatus(val path: Path, val isDirectory: Boolean, val lastModified: Instant?)
diff --git a/src/main/java/org/radarbase/output/target/AzureTargetStorage.kt b/src/main/java/org/radarbase/output/target/AzureTargetStorage.kt
index b7703c1..dff4eb1 100644
--- a/src/main/java/org/radarbase/output/target/AzureTargetStorage.kt
+++ b/src/main/java/org/radarbase/output/target/AzureTargetStorage.kt
@@ -18,55 +18,76 @@ package org.radarbase.output.target

 import com.azure.storage.blob.BlobClient
 import com.azure.storage.blob.BlobContainerClient
+import com.azure.storage.blob.BlobServiceClient
 import com.azure.storage.blob.models.ListBlobContainersOptions
 import kotlinx.coroutines.Dispatchers
 import kotlinx.coroutines.withContext
+import org.radarbase.kotlin.coroutines.CacheConfig
+import org.radarbase.kotlin.coroutines.CachedValue
 import org.radarbase.output.config.AzureConfig
-import org.radarbase.output.util.toKey
+import org.radarbase.output.util.firstSegment
+import org.radarbase.output.util.splitFirstSegment
 import org.slf4j.LoggerFactory
 import java.io.IOException
 import java.io.InputStream
 import java.nio.file.Path
+import java.util.concurrent.ConcurrentHashMap
+import java.util.concurrent.ConcurrentMap
 import kotlin.io.path.deleteExisting
+import kotlin.time.Duration.Companion.days
+import kotlin.time.Duration.Companion.hours
+import kotlin.time.Duration.Companion.minutes

 class AzureTargetStorage(private val config: AzureConfig) : TargetStorage {
-    private val container: String = config.container
-    private lateinit var containerClient: BlobContainerClient
+    private lateinit var serviceClient: BlobServiceClient
+    private val containerClient: ConcurrentMap<String, CachedValue<BlobContainerClient>> = ConcurrentHashMap()
+    private val cacheConfig = CacheConfig(
+        refreshDuration = 1.days,
+        retryDuration = 1.hours,
+        exceptionCacheDuration = 1.minutes,
+    )

     init {
         logger.info(
-            "Azure Blob storage configured with endpoint {} in container {}",
+            "Azure Blob storage configured with endpoint {}",
             config.endpoint,
-            config.container,
         )
     }

     override suspend fun initialize() {
-        val serviceClient = try {
+        serviceClient = try {
             config.createAzureClient()
         } catch (ex: IllegalArgumentException) {
             logger.warn("Invalid Azure configuration", ex)
             throw ex
         }
+    }

-        // Check if the bucket already exists.
- val listContainer = ListBlobContainersOptions().apply { prefix = container } - - val isExist: Boolean = withContext(Dispatchers.IO) { - serviceClient.listBlobContainers(listContainer, null) - }.any { it.name == container } - - if (isExist) { - logger.info("Container $container already exists.") - } else { - withContext(Dispatchers.IO) { - serviceClient.createBlobContainer(container) + private suspend fun client(path: Path) = client(path.firstSegment()) + + private suspend fun client(container: String) = + containerClient.computeIfAbsent(container) { + CachedValue( + cacheConfig, + ) { + withContext(Dispatchers.IO) { + // Check if the bucket already exists. + val listContainer = ListBlobContainersOptions().apply { prefix = container } + + val isExist: Boolean = serviceClient.listBlobContainers(listContainer, null) + .any { it.name == container } + + if (isExist) { + logger.info("Container {} already exists.", container) + } else { + serviceClient.createBlobContainer(container) + logger.info("Container {} was created.", container) + } + + serviceClient.getBlobContainerClient(container) + } } - logger.info("Container $container was created.") - } - - containerClient = serviceClient.getBlobContainerClient(container) - } + }.get() override suspend fun status(path: Path): TargetStorage.PathStatus? = withContext(Dispatchers.IO) { @@ -89,7 +110,7 @@ class AzureTargetStorage(private val config: AzureConfig) : TargetStorage { @Throws(IOException::class) override suspend fun move(oldPath: Path, newPath: Path) = withContext(Dispatchers.IO) { - blob(newPath).copyFromUrl("${config.endpoint}/${config.container}/${oldPath.toKey()}") + blob(newPath).copyFromUrl("${config.endpoint}/$oldPath") doDelete(oldPath) } @@ -104,15 +125,19 @@ class AzureTargetStorage(private val config: AzureConfig) : TargetStorage { doDelete(path) } - private fun doDelete(path: Path) { + private suspend fun doDelete(path: Path) { blob(path).delete() } - override fun createDirectories(directory: Path) { - // noop + override suspend fun createDirectories(directory: Path) { + // ensure bucket exists + client(directory) } - private fun blob(path: Path): BlobClient = containerClient.getBlobClient(path.toKey()) + private suspend fun blob(path: Path): BlobClient { + val (container, key) = path.splitFirstSegment() + return client(container).getBlobClient(key) + } companion object { private val logger = LoggerFactory.getLogger(AzureTargetStorage::class.java) diff --git a/src/main/java/org/radarbase/output/target/LocalTargetStorage.kt b/src/main/java/org/radarbase/output/target/LocalTargetStorage.kt index f6c2b7e..be1aafb 100644 --- a/src/main/java/org/radarbase/output/target/LocalTargetStorage.kt +++ b/src/main/java/org/radarbase/output/target/LocalTargetStorage.kt @@ -27,7 +27,14 @@ import java.nio.file.Path import java.nio.file.StandardCopyOption.ATOMIC_MOVE import java.nio.file.StandardCopyOption.REPLACE_EXISTING import java.nio.file.attribute.PosixFilePermissions -import kotlin.io.path.* +import kotlin.io.path.createDirectories +import kotlin.io.path.deleteExisting +import kotlin.io.path.exists +import kotlin.io.path.fileSize +import kotlin.io.path.inputStream +import kotlin.io.path.moveTo +import kotlin.io.path.setAttribute +import kotlin.io.path.setPosixFilePermissions class LocalTargetStorage(private val config: LocalConfig) : TargetStorage { init { @@ -75,7 +82,7 @@ class LocalTargetStorage(private val config: LocalConfig) : TargetStorage { doMove(localPath, newPath) } - override fun createDirectories(directory: Path) { + override 
suspend fun createDirectories(directory: Path) = withContext(Dispatchers.IO) { directory.createDirectories( PosixFilePermissions.asFileAttribute( PosixFilePermissions.fromString("rwxr-xr-x"), @@ -94,7 +101,9 @@ class LocalTargetStorage(private val config: LocalConfig) : TargetStorage { } @Throws(IOException::class) - override suspend fun delete(path: Path) = path.deleteExisting() + override suspend fun delete(path: Path) = withContext(Dispatchers.IO) { + path.deleteExisting() + } companion object { private val logger = LoggerFactory.getLogger(LocalTargetStorage::class.java) diff --git a/src/main/java/org/radarbase/output/target/S3TargetStorage.kt b/src/main/java/org/radarbase/output/target/S3TargetStorage.kt index 80e58ac..6eba54a 100644 --- a/src/main/java/org/radarbase/output/target/S3TargetStorage.kt +++ b/src/main/java/org/radarbase/output/target/S3TargetStorage.kt @@ -16,20 +16,37 @@ package org.radarbase.output.target -import io.minio.* +import io.minio.BucketArgs +import io.minio.BucketExistsArgs +import io.minio.CopyObjectArgs +import io.minio.CopySource +import io.minio.GetObjectArgs +import io.minio.MakeBucketArgs +import io.minio.MinioClient +import io.minio.RemoveObjectArgs +import io.minio.StatObjectArgs +import io.minio.UploadObjectArgs +import org.radarbase.kotlin.coroutines.CacheConfig +import org.radarbase.kotlin.coroutines.CachedValue import org.radarbase.output.config.S3Config import org.radarbase.output.source.S3SourceStorage.Companion.faultTolerant import org.radarbase.output.util.bucketBuild +import org.radarbase.output.util.firstSegment import org.radarbase.output.util.objectBuild import org.slf4j.LoggerFactory import java.io.FileNotFoundException import java.io.IOException import java.io.InputStream import java.nio.file.Path +import java.util.concurrent.ConcurrentHashMap import kotlin.io.path.deleteExisting +import kotlin.time.Duration.Companion.days +import kotlin.time.Duration.Companion.hours +import kotlin.time.Duration.Companion.minutes -class S3TargetStorage(config: S3Config) : TargetStorage { - private val bucket: String = config.bucket +class S3TargetStorage( + config: S3Config, +) : TargetStorage { private val s3Client: MinioClient = try { config.createS3Client() } catch (ex: IllegalArgumentException) { @@ -37,29 +54,25 @@ class S3TargetStorage(config: S3Config) : TargetStorage { throw ex } + private val buckets = ConcurrentHashMap>() + private val cacheConfig = CacheConfig( + refreshDuration = 1.days, + retryDuration = 1.hours, + exceptionCacheDuration = 1.minutes, + ) + init { logger.info( - "Object storage configured with endpoint {} in bucket {}", + "Object storage configured with endpoint {}", config.endpoint, - config.bucket, ) } - override suspend fun initialize() { - // Check if the bucket already exists. - val bucketExistsRequest = BucketExistsArgs.Builder().bucketBuild(bucket) - val isExist: Boolean = faultTolerant { s3Client.bucketExists(bucketExistsRequest) } - if (isExist) { - logger.info("Bucket $bucket already exists.") - } else { - val makeBucketRequest = MakeBucketArgs.Builder().bucketBuild(bucket) - faultTolerant { s3Client.makeBucket(makeBucketRequest) } - logger.info("Bucket $bucket was created.") - } - } + override suspend fun initialize() {} override suspend fun status(path: Path): TargetStorage.PathStatus? 
{ - val statRequest = StatObjectArgs.Builder().objectBuild(bucket, path) + val statRequest = StatObjectArgs.builder().objectBuild(path) + .also { it.ensureBucket() } return try { faultTolerant { s3Client.statObject(statRequest) @@ -70,16 +83,44 @@ class S3TargetStorage(config: S3Config) : TargetStorage { } } + private suspend fun BucketArgs.ensureBucket() = ensureBucket(bucket()) + + private suspend fun ensureBucket(bucket: String) { + try { + buckets.computeIfAbsent(bucket) { + CachedValue(cacheConfig) { + val bucketExistsRequest = BucketExistsArgs.builder().bucketBuild(bucket) + val isExist: Boolean = faultTolerant { s3Client.bucketExists(bucketExistsRequest) } + if (isExist) { + logger.info("Bucket $bucket already exists.") + } else { + val makeBucketRequest = MakeBucketArgs.builder().bucketBuild(bucket) + faultTolerant { s3Client.makeBucket(makeBucketRequest) } + logger.info("Bucket $bucket was created.") + } + } + }.get() + } catch (ex: Exception) { + logger.error( + "Failed to create bucket {}: {}", + bucket, + ex.message, + ) + throw ex + } + } + @Throws(IOException::class) override suspend fun newInputStream(path: Path): InputStream { - val getRequest = GetObjectArgs.Builder().objectBuild(bucket, path) + val getRequest = GetObjectArgs.builder().objectBuild(path) + .also { it.ensureBucket() } return faultTolerant { s3Client.getObject(getRequest) } } @Throws(IOException::class) override suspend fun move(oldPath: Path, newPath: Path) { - val copyRequest = CopyObjectArgs.Builder().objectBuild(bucket, newPath) { - source(CopySource.Builder().objectBuild(bucket, oldPath)) + val copyRequest = CopyObjectArgs.builder().objectBuild(newPath) { + source(CopySource.Builder().objectBuild(oldPath)) } faultTolerant { s3Client.copyObject(copyRequest) } delete(oldPath) @@ -87,21 +128,24 @@ class S3TargetStorage(config: S3Config) : TargetStorage { @Throws(IOException::class) override suspend fun store(localPath: Path, newPath: Path) { - val uploadRequest = UploadObjectArgs.Builder().objectBuild(bucket, newPath) { + val uploadRequest = UploadObjectArgs.builder().objectBuild(newPath) { filename(localPath.toAbsolutePath().toString()) } + .also { it.ensureBucket() } + faultTolerant { s3Client.uploadObject(uploadRequest) } localPath.deleteExisting() } @Throws(IOException::class) override suspend fun delete(path: Path) { - val removeRequest = RemoveObjectArgs.Builder().objectBuild(bucket, path) + val removeRequest = RemoveObjectArgs.builder().objectBuild(path) + .also { it.ensureBucket() } faultTolerant { s3Client.removeObject(removeRequest) } } - override fun createDirectories(directory: Path) { - // noop + override suspend fun createDirectories(directory: Path) { + ensureBucket(directory.firstSegment()) } companion object { diff --git a/src/main/java/org/radarbase/output/target/TargetStorage.kt b/src/main/java/org/radarbase/output/target/TargetStorage.kt index e4ffed0..d1bf212 100644 --- a/src/main/java/org/radarbase/output/target/TargetStorage.kt +++ b/src/main/java/org/radarbase/output/target/TargetStorage.kt @@ -59,7 +59,7 @@ interface TargetStorage { /** Create given directory, by recursively creating all parent directories. 
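      *
      * On object stores there is no real directory tree: the S3 and Azure
      * implementations above instead make sure that the bucket or container
      * named by the first path segment exists, while the local implementation
      * creates the directories on disk.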
*/ @Throws(IOException::class) - fun createDirectories(directory: Path) + suspend fun createDirectories(directory: Path) data class PathStatus( /** Size in bytes */ diff --git a/src/main/java/org/radarbase/output/target/TargetStorageFactory.kt b/src/main/java/org/radarbase/output/target/TargetStorageFactory.kt index fbdf37f..aa9b50b 100644 --- a/src/main/java/org/radarbase/output/target/TargetStorageFactory.kt +++ b/src/main/java/org/radarbase/output/target/TargetStorageFactory.kt @@ -8,6 +8,5 @@ class TargetStorageFactory(private val config: ResourceConfig) { ResourceType.S3 -> S3TargetStorage(config.s3!!) ResourceType.LOCAL -> LocalTargetStorage(config.local!!) ResourceType.AZURE -> AzureTargetStorage(config.azure!!) - else -> throw IllegalStateException("Cannot create storage driver for ${config.sourceType}") } } diff --git a/src/main/java/org/radarbase/output/util/AvroFileLister.kt b/src/main/java/org/radarbase/output/util/AvroFileLister.kt index ab07983..a4267aa 100644 --- a/src/main/java/org/radarbase/output/util/AvroFileLister.kt +++ b/src/main/java/org/radarbase/output/util/AvroFileLister.kt @@ -1,10 +1,14 @@ package org.radarbase.output.util +import kotlinx.coroutines.flow.toList import org.radarbase.output.source.SourceStorage +import org.radarbase.output.source.StorageIndex +import org.radarbase.output.source.StorageNode import org.radarbase.output.source.TopicFile class AvroFileLister( private val storage: SourceStorage, + private val storageIndex: StorageIndex, ) : TreeLister.LevelLister { override suspend fun listLevel( @@ -12,17 +16,19 @@ class AvroFileLister( descend: suspend (TopicPath) -> Unit, emit: suspend (TopicFile) -> Unit, ) { - storage.list(context.path).forEach { status -> - val filename = status.path.fileName.toString() - when { - status.isDirectory && filename != "+tmp" -> descend(context.copy(path = status.path)) - filename.endsWith(".avro") -> emit(storage.createTopicFile(context.topic, status)) - else -> {} + storageIndex.list(StorageNode.StorageDirectory(context.path)) + .toList() + .forEach { status -> + val filename = status.path.fileName.toString() + when { + status is StorageNode.StorageDirectory && filename != "+tmp" -> descend(context.copy(path = status.path)) + status is StorageNode.StorageFile && filename.endsWith(".avro") -> emit(storage.createTopicFile(context.topic, status)) + else -> {} + } } - } } companion object { - fun SourceStorage.avroFileTreeLister() = TreeLister(AvroFileLister(this)) + fun StorageIndex.avroFileTreeLister(sourceStorage: SourceStorage) = TreeLister(AvroFileLister(sourceStorage, this)) } } diff --git a/src/main/java/org/radarbase/output/util/AvroTopicLister.kt b/src/main/java/org/radarbase/output/util/AvroTopicLister.kt index 564a8b6..3e052a1 100644 --- a/src/main/java/org/radarbase/output/util/AvroTopicLister.kt +++ b/src/main/java/org/radarbase/output/util/AvroTopicLister.kt @@ -1,20 +1,23 @@ package org.radarbase.output.util -import org.radarbase.output.source.SourceStorage +import kotlinx.coroutines.flow.toList +import org.radarbase.output.source.StorageIndex +import org.radarbase.output.source.StorageNode import java.nio.file.Path class AvroTopicLister( - private val storage: SourceStorage, + private val storage: StorageIndex, ) : TreeLister.LevelLister { override suspend fun listLevel( context: Path, descend: suspend (Path) -> Unit, emit: suspend (Path) -> Unit, ) { - val fileStatuses = storage.list(context, maxKeys = 256) + val fileStatuses = storage.list(StorageNode.StorageDirectory(context), maxKeys = 256) + 
.toList()
         val avroFile = fileStatuses.find { file ->
-            !file.isDirectory &&
+            file is StorageNode.StorageFile &&
                 file.path.fileName.toString().endsWith(".avro", true)
         }

@@ -22,12 +25,12 @@
         if (avroFile != null) {
             emit(avroFile.path.parent.parent)
         } else {
             fileStatuses
-                .filter { file -> file.isDirectory && file.path.fileName.toString() != "+tmp" }
+                .filter { file -> file is StorageNode.StorageDirectory && file.path.fileName.toString() != "+tmp" }
                 .forEach { file -> descend(file.path) }
         }
     }

     companion object {
-        fun SourceStorage.avroTopicTreeLister() = TreeLister(AvroTopicLister(this))
+        fun StorageIndex.avroTopicTreeLister() = TreeLister(AvroTopicLister(this))
     }
 }
diff --git a/src/main/java/org/radarbase/output/util/Path.kt b/src/main/java/org/radarbase/output/util/Path.kt
index 0dbb239..7fdd8fe 100644
--- a/src/main/java/org/radarbase/output/util/Path.kt
+++ b/src/main/java/org/radarbase/output/util/Path.kt
@@ -3,15 +3,23 @@ package org.radarbase.output.util
 import io.minio.BucketArgs
 import io.minio.ObjectArgs
 import java.nio.file.Path
-import java.nio.file.Paths

-private val rootPath = Paths.get("/")
+fun Path.withoutFirstSegment(): String {
+    // remove bucket prefix
+    return first().relativize(this).toString()
+}
+
+fun Path.splitFirstSegment(): Pair<String, String> {
+    val bucketPath = first()
+    return Pair(
+        bucketPath.toString(),
+        bucketPath.relativize(this).toString(),
+    )
+}

-fun Path.toKey() = if (startsWith(rootPath)) {
-    rootPath.relativize(this).toString()
-} else toString()
+fun Path.firstSegment(): String = first().toString()

-inline fun <S : BucketArgs, T : BucketArgs.Builder<T, S>> T.bucketBuild(
+inline fun <T : BucketArgs.Builder<T, S>, S : BucketArgs> T.bucketBuild(
     bucket: String,
     configure: T.() -> T = { this },
 ): S {
@@ -20,13 +28,24 @@ inline fun <T : BucketArgs.Builder<T, S>, S : BucketArgs> T.bucketBuild(
     return build()
 }

-inline fun <S : ObjectArgs, T : ObjectArgs.Builder<T, S>> T.objectBuild(
-    bucket: String,
+inline fun <T : ObjectArgs.Builder<T, S>, S : ObjectArgs> T.objectBuild(
     path: Path,
     configure: T.() -> T = { this },
+): S {
+    val (bucket, key) = path.splitFirstSegment()
+    return bucketBuild(bucket) {
+        `object`(key)
+        configure()
+    }
+}
+
+inline fun <T : ObjectArgs.Builder<T, S>, S : ObjectArgs> T.objectBuild(
+    bucket: String,
+    key: Path,
+    configure: T.() -> T = { this },
 ): S {
     return bucketBuild(bucket) {
-        `object`(path.toKey())
+        `object`(key.toString())
         configure()
     }
 }
diff --git a/src/main/java/org/radarbase/output/util/PostponedWriter.kt b/src/main/java/org/radarbase/output/util/PostponedWriter.kt
index 8ebccbd..d1da23d 100644
--- a/src/main/java/org/radarbase/output/util/PostponedWriter.kt
+++ b/src/main/java/org/radarbase/output/util/PostponedWriter.kt
@@ -16,7 +16,11 @@

 package org.radarbase.output.util

-import kotlinx.coroutines.*
+import kotlinx.coroutines.CoroutineScope
+import kotlinx.coroutines.Job
+import kotlinx.coroutines.coroutineScope
+import kotlinx.coroutines.delay
+import kotlinx.coroutines.launch
 import kotlinx.coroutines.sync.Mutex
 import kotlinx.coroutines.sync.withLock
 import java.io.IOException
diff --git a/src/main/java/org/radarbase/output/util/ProgressBar.kt b/src/main/java/org/radarbase/output/util/ProgressBar.kt
index 4d65e80..cbf1a13 100644
--- a/src/main/java/org/radarbase/output/util/ProgressBar.kt
+++ b/src/main/java/org/radarbase/output/util/ProgressBar.kt
@@ -59,8 +59,11 @@
         if (progress == total) {
             // go through only once
-            if (isDone) return
-            else isDone = true
+            if (isDone) {
+                return
+            } else {
+                isDone = true
+            }
         }

         val builder = StringBuilder(numStripes + 30 + label.length)
@@ -97,8 +100,9 @@
     private fun percentage(builder: StringBuilder, progressPercent: Float) {
         val percent = progressPercent.toInt()
-        if (percent < 10) builder.append(" ")
-        else if
(percent < 100) builder.append(' ') + if (percent < 10) { + builder.append(" ") + } else if (percent < 100) builder.append(' ') builder.append(percent).append('%') } diff --git a/src/main/java/org/radarbase/output/util/TimeUtil.kt b/src/main/java/org/radarbase/output/util/TimeUtil.kt index cd1252a..aef9dc5 100644 --- a/src/main/java/org/radarbase/output/util/TimeUtil.kt +++ b/src/main/java/org/radarbase/output/util/TimeUtil.kt @@ -5,7 +5,11 @@ import org.apache.avro.Schema import org.apache.avro.generic.GenericRecord import org.radarbase.output.path.RecordPathFactory.Companion.getFieldOrNull import java.math.RoundingMode -import java.time.* +import java.time.Duration +import java.time.Instant +import java.time.LocalDate +import java.time.LocalDateTime +import java.time.ZoneOffset.UTC import java.time.format.DateTimeParseException import java.time.temporal.Temporal @@ -131,7 +135,7 @@ object TimeUtil { fun String.parseDate(): Instant? = try { LocalDate.parse(this) - .atStartOfDay(ZoneOffset.UTC) + .atStartOfDay(UTC) .toInstant() } catch (ex: DateTimeParseException) { null @@ -141,7 +145,7 @@ object TimeUtil { if (this[lastIndex] == 'Z') { Instant.parse(this) } else { - LocalDateTime.parse(this).toInstant(ZoneOffset.UTC) + LocalDateTime.parse(this).toInstant(UTC) } } catch (ex: DateTimeParseException) { null diff --git a/src/main/java/org/radarbase/output/util/TreeLister.kt b/src/main/java/org/radarbase/output/util/TreeLister.kt index eca1d5f..dfa7b29 100644 --- a/src/main/java/org/radarbase/output/util/TreeLister.kt +++ b/src/main/java/org/radarbase/output/util/TreeLister.kt @@ -21,16 +21,12 @@ class TreeLister( predicate: ((T) -> Boolean)? = null, ): S = coroutineScope { val channel = Channel(capacity = limit) + val producer = launch { - coroutineScope { - descend( - context, - if (predicate == null) channel::send else ( - { value -> - if (predicate(value)) channel.send(value) - } - ), - ) + descend(context) { value -> + if (predicate == null || predicate(value)) { + channel.send(value) + } } channel.close() } @@ -47,8 +43,15 @@ class TreeLister( collection } - private suspend fun descend(context: C, emit: suspend (T) -> Unit) { - levelLister.listLevel(context, { descend(it, emit) }, emit) + private suspend fun descend( + context: C, + emit: suspend (T) -> Unit, + ) { + levelLister.listLevel( + context = context, + descend = { descend(it, emit) }, + emit = emit, + ) } interface LevelLister { diff --git a/src/main/java/org/radarbase/output/worker/FileCache.kt b/src/main/java/org/radarbase/output/worker/FileCache.kt index 6690a65..a7602e6 100644 --- a/src/main/java/org/radarbase/output/worker/FileCache.kt +++ b/src/main/java/org/radarbase/output/worker/FileCache.kt @@ -17,8 +17,6 @@ package org.radarbase.output.worker import kotlinx.coroutines.Dispatchers -import kotlinx.coroutines.coroutineScope -import kotlinx.coroutines.launch import kotlinx.coroutines.withContext import org.apache.avro.generic.GenericRecord import org.radarbase.output.FileStoreFactory @@ -32,7 +30,12 @@ import org.radarbase.output.util.SuspendedCloseable import org.radarbase.output.util.SuspendedCloseable.Companion.useSuspended import org.radarbase.output.util.Timer.time import org.slf4j.LoggerFactory -import java.io.* +import java.io.ByteArrayInputStream +import java.io.FileNotFoundException +import java.io.IOException +import java.io.InputStream +import java.io.OutputStream +import java.io.Writer import java.nio.file.AtomicMoveNotSupportedException import java.nio.file.Path import java.nio.file.StandardCopyOption @@ 
-63,12 +66,16 @@ class FileCache( private var lastUse: Long = 0 private val hasError: AtomicBoolean = AtomicBoolean(false) private val deduplicate: DeduplicationConfig + private val excludeFields: Set init { val topicConfig = factory.config.topics[topic] val defaultDeduplicate = factory.config.format.deduplication deduplicate = topicConfig?.deduplication(defaultDeduplicate) ?: defaultDeduplicate + val defaultExclude = factory.config.format.excludeFields + excludeFields = topicConfig?.excludeFields ?: defaultExclude + this.tmpPath = createTempFile(tmpDir, fileName, ".tmp" + compression.extension) } @@ -102,16 +109,14 @@ class FileCache( this.recordConverter = try { inputStream.reader().useSuspended { reader -> - converterFactory.converterFor(writer, record, fileIsNew, reader) + converterFactory.converterFor(writer, record, fileIsNew, reader, excludeFields) } } catch (ex: IOException) { - coroutineScope { - launch(Dispatchers.IO) { - try { - writer.close() - } catch (exClose: IOException) { - logger.error("Failed to close writer for {}", path, ex) - } + withContext(Dispatchers.IO) { + try { + writer.close() + } catch (exClose: IOException) { + logger.error("Failed to close writer for {}", path, ex) } } diff --git a/src/main/java/org/radarbase/output/worker/FileCacheStore.kt b/src/main/java/org/radarbase/output/worker/FileCacheStore.kt index 4486b22..de6d41f 100644 --- a/src/main/java/org/radarbase/output/worker/FileCacheStore.kt +++ b/src/main/java/org/radarbase/output/worker/FileCacheStore.kt @@ -205,7 +205,8 @@ class FileCacheStore( NO_CACHE_AND_WRITE(true), /** Cache miss and write was unsuccessful because of a mismatch in number of columns. */ - NO_CACHE_AND_NO_WRITE(false); + NO_CACHE_AND_NO_WRITE(false), + ; companion object { fun valueOf(isCacheHit: Boolean, isSuccessful: Boolean) = when { diff --git a/src/main/java/org/radarbase/output/worker/RadarKafkaRestructure.kt b/src/main/java/org/radarbase/output/worker/RadarKafkaRestructure.kt index de526de..4fec805 100644 --- a/src/main/java/org/radarbase/output/worker/RadarKafkaRestructure.kt +++ b/src/main/java/org/radarbase/output/worker/RadarKafkaRestructure.kt @@ -28,6 +28,7 @@ import org.radarbase.output.accounting.Accountant import org.radarbase.output.accounting.AccountantImpl import org.radarbase.output.accounting.OffsetRangeSet import org.radarbase.output.config.RestructureConfig +import org.radarbase.output.source.StorageIndex import org.radarbase.output.source.TopicFileList import org.radarbase.output.util.SuspendedCloseable.Companion.useSuspended import org.radarbase.output.util.TimeUtil.durationSince @@ -77,13 +78,13 @@ class RadarKafkaRestructure( val processedRecordsCount = LongAdder() @Throws(IOException::class, InterruptedException::class) - suspend fun process(directoryName: String) { + suspend fun process(directoryName: String, storageIndex: StorageIndex) { // Get files and directories val absolutePath = Paths.get(directoryName) logger.info("Scanning topics...") - val paths = topicPaths(absolutePath) + val paths = topicPaths(storageIndex, absolutePath) logger.info("{} topics found", paths.size) @@ -92,7 +93,7 @@ class RadarKafkaRestructure( launch { try { val (fileCount, recordCount) = fileStoreFactory.workerSemaphore.withPermit { - mapTopic(p) + mapTopic(storageIndex, p) } processedFileCount.add(fileCount) processedRecordsCount.add(recordCount) @@ -104,7 +105,7 @@ class RadarKafkaRestructure( } } - private suspend fun mapTopic(topicPath: Path): ProcessingStatistics { + private suspend fun mapTopic(storageIndex: 
StorageIndex, topicPath: Path): ProcessingStatistics { val topic = topicPath.fileName.toString() return try { @@ -112,7 +113,7 @@ class RadarKafkaRestructure( coroutineScope { AccountantImpl(fileStoreFactory, topic).useSuspended { accountant -> accountant.initialize(this) - startWorker(topic, topicPath, accountant, accountant.offsets) + startWorker(storageIndex, topic, topicPath, accountant, accountant.offsets) } } } @@ -127,6 +128,7 @@ class RadarKafkaRestructure( } private suspend fun startWorker( + storageIndex: StorageIndex, topic: String, topicPath: Path, accountant: Accountant, @@ -135,12 +137,12 @@ class RadarKafkaRestructure( return RestructureWorker( sourceStorage, accountant, - fileStoreFactory + fileStoreFactory, ).useSuspended { worker -> try { val topicPaths = TopicFileList( topic, - sourceStorage.listTopicFiles(topic, topicPath, maxFilesPerTopic) { f -> + sourceStorage.listTopicFiles(storageIndex, topic, topicPath, maxFilesPerTopic) { f -> !seenFiles.contains(f.range) && f.lastModified.durationSince() >= minimumFileAge }, @@ -161,8 +163,8 @@ class RadarKafkaRestructure( supervisor.cancel() } - private suspend fun topicPaths(root: Path): List = - sourceStorage.listTopics(root, excludeTopics) + private suspend fun topicPaths(storageIndex: StorageIndex, root: Path): List = + sourceStorage.listTopics(storageIndex, root, excludeTopics) // different services start on different topics to decrease lock contention .shuffled() @@ -176,14 +178,22 @@ class RadarKafkaRestructure( fun job(config: RestructureConfig, serviceMutex: Mutex): Job? = if (config.worker.enable) { Job("restructure", config.service.interval, ::runRestructure, serviceMutex) - } else null + } else { + null + } private suspend fun runRestructure(factory: FileStoreFactory) { RadarKafkaRestructure(factory).useSuspended { restructure -> - for (input in factory.config.paths.inputs) { + for ((input, index) in factory.storageIndexManagers) { + index.update() logger.info("In: {}", input) - logger.info("Out: {}", factory.pathFactory.root) - restructure.process(input.toString()) + logger.info( + "Out: bucket {} (default {}) - path {}", + factory.pathFactory.pathConfig.bucket?.format, + factory.pathFactory.pathConfig.bucket?.defaultName, + factory.pathFactory.pathConfig.path.format, + ) + restructure.process(input.toString(), index.storageIndex) } logger.info( diff --git a/src/main/java/org/radarbase/output/worker/RestructureWorker.kt b/src/main/java/org/radarbase/output/worker/RestructureWorker.kt index 1a4cef7..f73027e 100644 --- a/src/main/java/org/radarbase/output/worker/RestructureWorker.kt +++ b/src/main/java/org/radarbase/output/worker/RestructureWorker.kt @@ -50,14 +50,14 @@ internal class RestructureWorker( logger.info( "Processing topic {}: converting {} files", topic, - numberFormat.format(numFiles) + numberFormat.format(numFiles), ) } else { logger.info( "Processing topic {}: converting {} files with {} records", topic, numberFormat.format(numFiles), - numberFormat.format(numOffsets) + numberFormat.format(numOffsets), ) } @@ -125,7 +125,7 @@ internal class RestructureWorker( return reader.newInput(file).use { input -> // processing zero-length files may trigger a stall. 
See: - // https://github.com/RADAR-base/Restructure-HDFS-topic/issues/3 + // https://github.com/RADAR-base/radar-output-restructure/issues/3 if (input.length() == 0L) { logger.warn("File {} has zero length, skipping.", file.path) return 0L @@ -170,7 +170,7 @@ internal class RestructureWorker( val path = pathFactory.getRecordPath( transaction.topicPartition.topic, record, - attempt = currentSuffix + attempt = currentSuffix, ) // Write data diff --git a/src/main/resources/log4j2.properties b/src/main/resources/log4j2.properties deleted file mode 100644 index 7ba2364..0000000 --- a/src/main/resources/log4j2.properties +++ /dev/null @@ -1,9 +0,0 @@ -status=error -dest=err -name=PropertiesConfig -appender.console.type=Console -appender.console.name=STDOUT -appender.console.layout.type=PatternLayout -appender.console.layout.pattern=%d{yyyy-MM-dd HH:mm:ss} %-5p - %m (%c{1}:%L)%n -rootLogger.level=info -rootLogger.appenderRef.stdout.ref=STDOUT diff --git a/src/main/resources/log4j2.xml b/src/main/resources/log4j2.xml new file mode 100644 index 0000000..725abb3 --- /dev/null +++ b/src/main/resources/log4j2.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + diff --git a/src/test/java/org/radarbase/output/OffsetRangeFileTest.kt b/src/test/java/org/radarbase/output/OffsetRangeFileTest.kt index 1eebeaf..bc51430 100644 --- a/src/test/java/org/radarbase/output/OffsetRangeFileTest.kt +++ b/src/test/java/org/radarbase/output/OffsetRangeFileTest.kt @@ -16,9 +16,12 @@ package org.radarbase.output -import kotlinx.coroutines.ExperimentalCoroutinesApi import kotlinx.coroutines.test.runTest -import org.junit.jupiter.api.Assertions.* +import org.junit.jupiter.api.Assertions.assertEquals +import org.junit.jupiter.api.Assertions.assertFalse +import org.junit.jupiter.api.Assertions.assertNotNull +import org.junit.jupiter.api.Assertions.assertNull +import org.junit.jupiter.api.Assertions.assertTrue import org.junit.jupiter.api.BeforeEach import org.junit.jupiter.api.Test import org.junit.jupiter.api.io.TempDir @@ -35,7 +38,6 @@ import java.nio.file.Path import java.time.Instant import kotlin.io.path.createFile -@OptIn(ExperimentalCoroutinesApi::class) class OffsetRangeFileTest { private lateinit var testFile: Path private lateinit var targetStorage: TargetStorage diff --git a/src/test/java/org/radarbase/output/OffsetRangeSetTest.kt b/src/test/java/org/radarbase/output/OffsetRangeSetTest.kt index f291f87..bc8bc4d 100644 --- a/src/test/java/org/radarbase/output/OffsetRangeSetTest.kt +++ b/src/test/java/org/radarbase/output/OffsetRangeSetTest.kt @@ -1,6 +1,8 @@ package org.radarbase.output -import org.junit.jupiter.api.Assertions.* +import org.junit.jupiter.api.Assertions.assertEquals +import org.junit.jupiter.api.Assertions.assertFalse +import org.junit.jupiter.api.Assertions.assertTrue import org.junit.jupiter.api.Test import org.radarbase.output.accounting.OffsetRangeSet import org.radarbase.output.accounting.TopicPartition diff --git a/src/test/java/org/radarbase/output/accounting/OffsetIntervalsTest.kt b/src/test/java/org/radarbase/output/accounting/OffsetIntervalsTest.kt index aa6a19f..349d343 100644 --- a/src/test/java/org/radarbase/output/accounting/OffsetIntervalsTest.kt +++ b/src/test/java/org/radarbase/output/accounting/OffsetIntervalsTest.kt @@ -1,6 +1,8 @@ package org.radarbase.output.accounting -import org.junit.jupiter.api.Assertions.* +import org.junit.jupiter.api.Assertions.assertEquals +import org.junit.jupiter.api.Assertions.assertFalse +import org.junit.jupiter.api.Assertions.assertTrue import 
org.junit.jupiter.api.Test import java.time.Instant @@ -76,7 +78,7 @@ internal class OffsetIntervalsTest { assertEquals( listOf(OffsetRangeSet.Range(0, 0, lastModified)), - toList() + toList(), ) } } diff --git a/src/test/java/org/radarbase/output/cleaner/TimestampFileCacheTest.kt b/src/test/java/org/radarbase/output/cleaner/TimestampFileCacheTest.kt index ac3f63e..66e4794 100644 --- a/src/test/java/org/radarbase/output/cleaner/TimestampFileCacheTest.kt +++ b/src/test/java/org/radarbase/output/cleaner/TimestampFileCacheTest.kt @@ -1,6 +1,5 @@ package org.radarbase.output.cleaner -import kotlinx.coroutines.ExperimentalCoroutinesApi import kotlinx.coroutines.test.runTest import org.apache.avro.Schema import org.apache.avro.generic.GenericData @@ -26,7 +25,6 @@ import java.io.FileNotFoundException import java.nio.file.Path import kotlin.io.path.bufferedWriter -@OptIn(ExperimentalCoroutinesApi::class) internal class TimestampFileCacheTest { private lateinit var record: GenericData.Record private var now: Double = 0.0 diff --git a/src/test/java/org/radarbase/output/data/CompressionFactoryTest.kt b/src/test/java/org/radarbase/output/data/CompressionFactoryTest.kt index f95fff9..98c9933 100644 --- a/src/test/java/org/radarbase/output/data/CompressionFactoryTest.kt +++ b/src/test/java/org/radarbase/output/data/CompressionFactoryTest.kt @@ -4,7 +4,11 @@ import org.junit.jupiter.api.Assertions.assertEquals import org.junit.jupiter.api.Assertions.assertSame import org.junit.jupiter.api.BeforeEach import org.junit.jupiter.api.Test -import org.radarbase.output.compression.* +import org.radarbase.output.compression.Compression +import org.radarbase.output.compression.CompressionFactory +import org.radarbase.output.compression.GzipCompression +import org.radarbase.output.compression.IdentityCompression +import org.radarbase.output.compression.ZipCompression import java.io.ByteArrayInputStream import java.io.ByteArrayOutputStream import java.io.IOException diff --git a/src/test/java/org/radarbase/output/data/CsvAvroConverterTest.kt b/src/test/java/org/radarbase/output/data/CsvAvroConverterTest.kt index ccb05b2..429b8f6 100644 --- a/src/test/java/org/radarbase/output/data/CsvAvroConverterTest.kt +++ b/src/test/java/org/radarbase/output/data/CsvAvroConverterTest.kt @@ -17,7 +17,6 @@ package org.radarbase.output.data import kotlinx.coroutines.Dispatchers -import kotlinx.coroutines.ExperimentalCoroutinesApi import kotlinx.coroutines.test.runTest import kotlinx.coroutines.withContext import org.apache.avro.Schema.Parser @@ -27,7 +26,9 @@ import org.apache.avro.generic.GenericDatumReader import org.apache.avro.generic.GenericRecord import org.apache.avro.generic.GenericRecordBuilder import org.apache.avro.io.DecoderFactory -import org.junit.jupiter.api.Assertions.* +import org.junit.jupiter.api.Assertions.assertEquals +import org.junit.jupiter.api.Assertions.assertFalse +import org.junit.jupiter.api.Assertions.assertTrue import org.junit.jupiter.api.Test import org.junit.jupiter.api.io.TempDir import org.radarbase.output.compression.GzipCompression @@ -49,7 +50,6 @@ import kotlin.io.path.bufferedWriter import kotlin.io.path.inputStream import kotlin.io.path.outputStream -@OptIn(ExperimentalCoroutinesApi::class) class CsvAvroConverterTest { @Test @Throws(IOException::class) @@ -63,11 +63,10 @@ class CsvAvroConverterTest { val writer = StringWriter() val factory = CsvAvroConverter.factory - val converter = factory.converterFor(writer, record, true, StringReader("test")) + val converter = 
factory.converterFor(writer, record, true, StringReader("test"), setOf("a", "i.other")) val map = converter.convertRecord(record) val keys = listOf( - "a", "b", "c", "d", @@ -76,7 +75,6 @@ class CsvAvroConverterTest { "g", "h", "i.some", - "i.other", "j.0", "j.1", "k", @@ -87,7 +85,6 @@ class CsvAvroConverterTest { val actualIterator = map.values.iterator() val expectedIterator = listOf( - "a", byteArrayOf(255.toByte()), byteArrayOf(255.toByte()), "1000000000000000000", @@ -96,7 +93,6 @@ class CsvAvroConverterTest { "132101", "", "1", - "-1", "", "some", "Y", @@ -250,7 +246,7 @@ class CsvAvroConverterTest { source = path, target = toPath, compression = IdentityCompression(), - distinctFields = setOf("a") + distinctFields = setOf("a"), ) assertEquals(listOf("a,b", "1,2", "a,a", "3,3"), toPath.readAllLines()) } diff --git a/src/test/java/org/radarbase/output/data/FileCacheStoreTest.kt b/src/test/java/org/radarbase/output/data/FileCacheStoreTest.kt index 5b1ebad..6a16eab 100644 --- a/src/test/java/org/radarbase/output/data/FileCacheStoreTest.kt +++ b/src/test/java/org/radarbase/output/data/FileCacheStoreTest.kt @@ -17,7 +17,6 @@ package org.radarbase.output.data import kotlinx.coroutines.Dispatchers -import kotlinx.coroutines.ExperimentalCoroutinesApi import kotlinx.coroutines.launch import kotlinx.coroutines.test.runTest import org.apache.avro.SchemaBuilder @@ -36,7 +35,11 @@ import org.radarbase.output.accounting.Accountant import org.radarbase.output.accounting.OffsetRangeSet import org.radarbase.output.accounting.TopicPartition import org.radarbase.output.accounting.TopicPartitionOffsetRange -import org.radarbase.output.config.* +import org.radarbase.output.config.PathConfig +import org.radarbase.output.config.ResourceConfig +import org.radarbase.output.config.RestructureConfig +import org.radarbase.output.config.S3Config +import org.radarbase.output.config.WorkerConfig import org.radarbase.output.util.SuspendedCloseable.Companion.useSuspended import org.radarbase.output.worker.FileCacheStore import java.io.IOException @@ -45,7 +48,6 @@ import java.nio.file.Path import java.time.Instant import kotlin.io.path.createDirectories -@OptIn(ExperimentalCoroutinesApi::class) class FileCacheStoreTest { private val lastModified = Instant.now() @@ -83,10 +85,10 @@ class FileCacheStoreTest { RestructureConfig( paths = PathConfig( output = root, - temp = tmpDir + temp = tmpDir, ), worker = WorkerConfig(cacheSize = 2), - source = ResourceConfig("hdfs", hdfs = HdfsConfig(listOf("test"))), + source = ResourceConfig(type = "s3", s3 = S3Config("endpoint", null, null)), ), ) @@ -179,7 +181,7 @@ class FileCacheStoreTest { ).process( check { offsets.addAll(it.offsets) - } + }, ) assertTrue(offsets.contains(offsetRange0)) diff --git a/src/test/java/org/radarbase/output/data/FileCacheTest.kt b/src/test/java/org/radarbase/output/data/FileCacheTest.kt index 4f0872e..111fe20 100644 --- a/src/test/java/org/radarbase/output/data/FileCacheTest.kt +++ b/src/test/java/org/radarbase/output/data/FileCacheTest.kt @@ -17,7 +17,6 @@ package org.radarbase.output.data import kotlinx.coroutines.Dispatchers -import kotlinx.coroutines.ExperimentalCoroutinesApi import kotlinx.coroutines.joinAll import kotlinx.coroutines.launch import kotlinx.coroutines.test.runTest @@ -33,10 +32,10 @@ import org.mockito.kotlin.mock import org.radarbase.output.Application import org.radarbase.output.accounting.Accountant import org.radarbase.output.accounting.TopicPartition -import org.radarbase.output.config.HdfsConfig import 
diff --git a/src/test/java/org/radarbase/output/data/FileCacheStoreTest.kt b/src/test/java/org/radarbase/output/data/FileCacheStoreTest.kt
index 5b1ebad..6a16eab 100644
--- a/src/test/java/org/radarbase/output/data/FileCacheStoreTest.kt
+++ b/src/test/java/org/radarbase/output/data/FileCacheStoreTest.kt
@@ -17,7 +17,6 @@
 package org.radarbase.output.data
 
 import kotlinx.coroutines.Dispatchers
-import kotlinx.coroutines.ExperimentalCoroutinesApi
 import kotlinx.coroutines.launch
 import kotlinx.coroutines.test.runTest
 import org.apache.avro.SchemaBuilder
@@ -36,7 +35,11 @@ import org.radarbase.output.accounting.Accountant
 import org.radarbase.output.accounting.OffsetRangeSet
 import org.radarbase.output.accounting.TopicPartition
 import org.radarbase.output.accounting.TopicPartitionOffsetRange
-import org.radarbase.output.config.*
+import org.radarbase.output.config.PathConfig
+import org.radarbase.output.config.ResourceConfig
+import org.radarbase.output.config.RestructureConfig
+import org.radarbase.output.config.S3Config
+import org.radarbase.output.config.WorkerConfig
 import org.radarbase.output.util.SuspendedCloseable.Companion.useSuspended
 import org.radarbase.output.worker.FileCacheStore
 import java.io.IOException
@@ -45,7 +48,6 @@
 import java.nio.file.Path
 import java.time.Instant
 import kotlin.io.path.createDirectories
 
-@OptIn(ExperimentalCoroutinesApi::class)
 class FileCacheStoreTest {
     private val lastModified = Instant.now()
@@ -83,10 +85,10 @@ class FileCacheStoreTest {
 
             RestructureConfig(
                 paths = PathConfig(
                     output = root,
-                    temp = tmpDir
+                    temp = tmpDir,
                 ),
                 worker = WorkerConfig(cacheSize = 2),
-                source = ResourceConfig("hdfs", hdfs = HdfsConfig(listOf("test"))),
+                source = ResourceConfig(type = "s3", s3 = S3Config("endpoint", null, null)),
             ),
         )
@@ -179,7 +181,7 @@ class FileCacheStoreTest {
         ).process(
             check {
                 offsets.addAll(it.offsets)
-            }
+            },
         )
 
         assertTrue(offsets.contains(offsetRange0))
diff --git a/src/test/java/org/radarbase/output/data/FileCacheTest.kt b/src/test/java/org/radarbase/output/data/FileCacheTest.kt
index 4f0872e..111fe20 100644
--- a/src/test/java/org/radarbase/output/data/FileCacheTest.kt
+++ b/src/test/java/org/radarbase/output/data/FileCacheTest.kt
@@ -17,7 +17,6 @@
 package org.radarbase.output.data
 
 import kotlinx.coroutines.Dispatchers
-import kotlinx.coroutines.ExperimentalCoroutinesApi
 import kotlinx.coroutines.joinAll
 import kotlinx.coroutines.launch
 import kotlinx.coroutines.test.runTest
@@ -33,10 +32,10 @@ import org.mockito.kotlin.mock
 import org.radarbase.output.Application
 import org.radarbase.output.accounting.Accountant
 import org.radarbase.output.accounting.TopicPartition
-import org.radarbase.output.config.HdfsConfig
 import org.radarbase.output.config.PathConfig
 import org.radarbase.output.config.ResourceConfig
 import org.radarbase.output.config.RestructureConfig
+import org.radarbase.output.config.S3Config
 import org.radarbase.output.util.ResourceContext.Companion.resourceContext
 import org.radarbase.output.util.SuspendedCloseable.Companion.useSuspended
 import org.radarbase.output.worker.FileCache
@@ -51,7 +50,6 @@ import kotlin.io.path.inputStream
 /**
  * Created by joris on 03/07/2017.
  */
-@OptIn(ExperimentalCoroutinesApi::class)
 class FileCacheTest {
     private lateinit var path: Path
     private lateinit var exampleRecord: Record
@@ -78,9 +76,9 @@ class FileCacheTest {
         config = RestructureConfig(
             paths = PathConfig(
                 output = path.parent,
-                temp = tmpPath
+                temp = tmpPath,
             ),
-            source = ResourceConfig("hdfs", hdfs = HdfsConfig(listOf("test"))),
+            source = ResourceConfig("s3", S3Config("endpoint", null, null)),
         )
 
         setUp(config)
diff --git a/src/test/java/org/radarbase/output/data/JsonAvroConverterTest.kt b/src/test/java/org/radarbase/output/data/JsonAvroConverterTest.kt
index 3860346..cbe666c 100644
--- a/src/test/java/org/radarbase/output/data/JsonAvroConverterTest.kt
+++ b/src/test/java/org/radarbase/output/data/JsonAvroConverterTest.kt
@@ -18,7 +18,6 @@ package org.radarbase.output.data
 
 import com.fasterxml.jackson.databind.ObjectMapper
 import com.fasterxml.jackson.databind.SerializationFeature
-import kotlinx.coroutines.ExperimentalCoroutinesApi
 import kotlinx.coroutines.test.runTest
 import org.apache.avro.Schema.Parser
 import org.apache.avro.generic.GenericDatumReader
@@ -38,7 +37,6 @@ import java.io.StringWriter
 import java.nio.file.Path
 import kotlin.io.path.bufferedWriter
 
-@OptIn(ExperimentalCoroutinesApi::class)
 class JsonAvroConverterTest {
     @Test
     @Throws(IOException::class)
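Both file-cache tests now build their source against S3 instead of HDFS. A minimal sketch of the shared configuration shape, with the placeholder endpoint and anonymous (null) credentials taken from the diffs above; the helper function name is hypothetical:

    import org.radarbase.output.config.PathConfig
    import org.radarbase.output.config.ResourceConfig
    import org.radarbase.output.config.RestructureConfig
    import org.radarbase.output.config.S3Config
    import java.nio.file.Path

    // Hypothetical helper mirroring the test setup; assumes RestructureConfig
    // supplies defaults for the parameters the tests do not set.
    fun s3TestConfig(root: Path, tmpDir: Path) = RestructureConfig(
        paths = PathConfig(output = root, temp = tmpDir),
        source = ResourceConfig(type = "s3", s3 = S3Config("endpoint", null, null)),
    )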
diff --git a/src/test/java/org/radarbase/output/path/FormattedPathFactoryTest.kt b/src/test/java/org/radarbase/output/path/FormattedPathFactoryTest.kt
index 903438e..2a67819 100644
--- a/src/test/java/org/radarbase/output/path/FormattedPathFactoryTest.kt
+++ b/src/test/java/org/radarbase/output/path/FormattedPathFactoryTest.kt
@@ -1,69 +1,76 @@
 package org.radarbase.output.path
 
+import kotlinx.coroutines.runBlocking
 import org.hamcrest.MatcherAssert.assertThat
 import org.hamcrest.Matchers.instanceOf
 import org.hamcrest.Matchers.nullValue
 import org.junit.jupiter.api.Assertions.assertEquals
 import org.junit.jupiter.api.Test
 import org.junit.jupiter.api.assertThrows
-import org.radarbase.output.path.FormattedPathFactory.Companion.toPathFormatterPlugin
+import org.radarbase.output.config.PathConfig
+import org.radarbase.output.config.PathFormatterConfig
 import org.radarcns.kafka.ObservationKey
 import org.radarcns.passive.phone.PhoneLight
-import java.nio.file.Paths
 import java.time.Instant
 import kotlin.reflect.jvm.jvmName
 
 internal class FormattedPathFactoryTest {
     @Test
-    fun testFormat() {
+    fun testFormat() = runBlocking {
         val factory = createFactory(
-            format = "\${topic}/\${projectId}/\${userId}/\${sourceId}/\${time:yyyyMM}/\${time:dd}/\${filename}"
+            format = "\${topic}/\${projectId}/\${userId}/\${sourceId}/\${time:yyyyMM}/\${time:dd}/\${filename}",
         )
         val t = Instant.parse("2021-01-02T10:05:00Z")
 
-        val path = factory.getRelativePath(
-            "t",
-            ObservationKey(
-                "p",
-                "u",
-                "s",
+        val path = factory.relativePath(
+            PathFormatParameters(
+                topic = "t",
+                key = ObservationKey(
+                    "p",
+                    "u",
+                    "s",
+                ),
+                value = PhoneLight(
+                    t.epochSecond.toDouble(),
+                    t.epochSecond.toDouble(),
+                    1.0f,
+                ),
+                time = t,
+                attempt = 0,
             ),
-            PhoneLight(
-                t.epochSecond.toDouble(),
-                t.epochSecond.toDouble(),
-                1.0f,
-            ),
-            t,
-            0,
         )
-        assertEquals(Paths.get("t/p/u/s/202101/02/20210102_1000.csv.gz"), path)
+        assertEquals("t/p/u/s/202101/02/20210102_1000.csv.gz", path)
     }
 
     @Test
-    fun unparameterized() {
+    fun unparameterized() = runBlocking {
         val factory = FormattedPathFactory().apply {
-            init(emptyMap())
-            extension = ".csv.gz"
+            init(
+                extension = ".csv.gz",
+                config = PathConfig(),
+            )
         }
         val t = Instant.parse("2021-01-02T10:05:00Z")
-        val path = factory.getRelativePath(
-            "t",
-            ObservationKey(
-                "p",
-                "u",
-                "s",
-            ),
-            PhoneLight(
-                t.epochSecond.toDouble(),
-                t.epochSecond.toDouble(),
-                1.0f,
+        val path = factory.relativePath(
+            PathFormatParameters(
+                topic = "t",
+                key = ObservationKey(
+                    "p",
+                    "u",
+                    "s",
+                ),
+                value = PhoneLight(
+                    t.epochSecond.toDouble(),
+                    t.epochSecond.toDouble(),
+                    1.0f,
+                ),
+                time = t,
+                attempt = 0,
             ),
-            t,
-            0,
         )
-        assertEquals(Paths.get("p/u/t/20210102_1000.csv.gz"), path)
+        assertEquals("p/u/t/20210102_1000.csv.gz", path)
     }
 
     @Test
@@ -100,9 +107,13 @@ internal class FormattedPathFactoryTest {
     private fun createFactory(format: String): FormattedPathFactory = FormattedPathFactory().apply {
         init(
-            mapOf("format" to format),
+            extension = ".csv.gz",
+            config = PathConfig(
+                path = PathFormatterConfig(
+                    format = format,
+                ),
+            ),
         )
-        extension = ".csv.gz"
     }
 
     @Test
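The reworked factory API above bundles its arguments into a single PathFormatParameters value, suspends, and yields a plain String rather than a java.nio.file.Path, which is why the tests now run under runBlocking. A condensed sketch mirroring testFormat (factory and t come from the surrounding test):

    val path: String = runBlocking {
        factory.relativePath(
            PathFormatParameters(
                topic = "t",
                key = ObservationKey("p", "u", "s"),
                value = PhoneLight(t.epochSecond.toDouble(), t.epochSecond.toDouble(), 1.0f),
                time = t,
                attempt = 0,
            ),
        )
    }
    // e.g. "t/p/u/s/202101/02/20210102_1000.csv.gz" under the parameterized format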
diff --git a/src/test/java/org/radarbase/output/path/PathFormatterTest.kt b/src/test/java/org/radarbase/output/path/PathFormatterTest.kt
index 5b1f511..dbd81b7 100644
--- a/src/test/java/org/radarbase/output/path/PathFormatterTest.kt
+++ b/src/test/java/org/radarbase/output/path/PathFormatterTest.kt
@@ -1,18 +1,20 @@
 package org.radarbase.output.path
 
+import kotlinx.coroutines.runBlocking
 import org.hamcrest.MatcherAssert.assertThat
 import org.hamcrest.Matchers.equalTo
 import org.junit.jupiter.api.Assertions.assertThrows
 import org.junit.jupiter.api.BeforeEach
 import org.junit.jupiter.api.Test
+import org.radarbase.output.config.PathFormatterConfig.Companion.DEFAULT_FORMAT
 import org.radarcns.kafka.ObservationKey
 import org.radarcns.monitor.application.ApplicationServerStatus
 import org.radarcns.monitor.application.ServerStatus
-import java.nio.file.Paths
 import java.time.Instant
 
 internal class PathFormatterTest {
-    lateinit var params: PathFormatParameters
+    private lateinit var fixedPlugin: PathFormatterPlugin
+    private lateinit var params: PathFormatParameters
 
     @BeforeEach
     fun setupRecord() {
@@ -30,121 +32,121 @@ internal class PathFormatterTest {
             ),
             time = Instant.ofEpochMilli(1000),
             attempt = 0,
-            extension = ".csv",
         )
+        fixedPlugin = FixedPathFormatterPlugin().create(mapOf("extension" to ".csv"))
     }
 
     @Test
-    fun testDefaultPath() {
+    fun testDefaultPath() = runBlocking {
         val formatter = PathFormatter(
-            format = FormattedPathFactory.Companion.DEFAULTS.format,
+            format = DEFAULT_FORMAT,
             plugins = listOf(
-                FixedPathFormatterPlugin().create(),
+                fixedPlugin,
                 TimePathFormatterPlugin(),
                 KeyPathFormatterPlugin(),
                 ValuePathFormatterPlugin(),
-            )
+            ),
         )
 
-        assertThat(formatter.format(params), equalTo(Paths.get("p/u/my_topic/19700101_0000.csv")))
+        assertThat(formatter.format(params), equalTo("p/u/my_topic/19700101_0000.csv"))
     }
 
     @Test
-    fun testDefaultPathFewerPlugins() {
+    fun testDefaultPathFewerPlugins() = runBlocking {
         val formatter = PathFormatter(
-            format = FormattedPathFactory.Companion.DEFAULTS.format,
+            format = DEFAULT_FORMAT,
             plugins = listOf(
-                FixedPathFormatterPlugin().create(),
-            )
+                fixedPlugin,
+            ),
         )
 
-        assertThat(formatter.format(params), equalTo(Paths.get("p/u/my_topic/19700101_0000.csv")))
+        assertThat(formatter.format(params), equalTo("p/u/my_topic/19700101_0000.csv"))
     }
 
     @Test
-    fun testDefaultPathNoTime() {
+    fun testDefaultPathNoTime() = runBlocking {
         val formatter = PathFormatter(
-            format = FormattedPathFactory.Companion.DEFAULTS.format,
+            format = DEFAULT_FORMAT,
             plugins = listOf(
-                FixedPathFormatterPlugin().create(),
-            )
+                fixedPlugin,
+            ),
        )
 
-        assertThat(formatter.format(params.copy(time = null)), equalTo(Paths.get("p/u/my_topic/unknown-time.csv")))
+        assertThat(formatter.format(params.copy(time = null)), equalTo("p/u/my_topic/unknown-time.csv"))
     }
 
     @Test
     fun testDefaultPathWrongPlugins() {
         assertThrows(IllegalArgumentException::class.java) {
             PathFormatter(
-                format = FormattedPathFactory.Companion.DEFAULTS.format,
+                format = DEFAULT_FORMAT,
                 plugins = listOf(
                     TimePathFormatterPlugin(),
                     KeyPathFormatterPlugin(),
                     ValuePathFormatterPlugin(),
-                )
+                ),
             )
         }
     }
 
     @Test
-    fun testCorrectTimeFormatPlugins() {
+    fun testCorrectTimeFormatPlugins() = runBlocking {
         val formatter = PathFormatter(
             format = "\${topic}/\${time:YYYY-MM-dd_HH:mm:ss}\${attempt}\${extension}",
             plugins = listOf(
-                FixedPathFormatterPlugin().create(),
+                fixedPlugin,
                 TimePathFormatterPlugin(),
             ),
         )
 
-        assertThat(formatter.format(params), equalTo(Paths.get("my_topic/1970-01-01_000001.csv")))
+        assertThat(formatter.format(params), equalTo("my_topic/1970-01-01_000001.csv"))
     }
 
     @Test
-    fun testBadTimeFormatPlugins() {
+    fun testBadTimeFormatPlugins(): Unit = runBlocking {
         assertThrows(IllegalArgumentException::class.java) {
             PathFormatter(
                 format = "\${topic}/\${time:VVV}\${attempt}\${extension}",
                 plugins = listOf(
-                    FixedPathFormatterPlugin().create(),
+                    fixedPlugin,
                     TimePathFormatterPlugin(),
-                )
+                ),
             )
         }
     }
 
     @Test
-    fun testCorrectKeyFormat() {
+    fun testCorrectKeyFormat() = runBlocking {
         val formatter = PathFormatter(
             format = "\${topic}/\${key:projectId}\${attempt}\${extension}",
             plugins = listOf(
-                FixedPathFormatterPlugin().create(),
+                fixedPlugin,
                 KeyPathFormatterPlugin(),
-            )
+            ),
         )
 
-        assertThat(formatter.format(params), equalTo(Paths.get("my_topic/p.csv")))
+        assertThat(formatter.format(params), equalTo("my_topic/p.csv"))
     }
 
     @Test
-    fun testUnknownKeyFormat() {
+    fun testUnknownKeyFormat() = runBlocking {
         val formatter = PathFormatter(
             format = "\${topic}/\${key:doesNotExist}\${attempt}\${extension}",
             plugins = listOf(
-                FixedPathFormatterPlugin().create(),
+                fixedPlugin,
                 KeyPathFormatterPlugin(),
-            )
+            ),
         )
 
-        assertThat(formatter.format(params), equalTo(Paths.get("my_topic/unknown-key.csv")))
+        assertThat(formatter.format(params), equalTo("my_topic/unknown-key.csv"))
     }
 
     @Test
-    fun testCorrectValueFormat() {
+    fun testCorrectValueFormat() = runBlocking {
         val formatter = PathFormatter(
             format = "\${topic}/\${value:serverStatus}\${attempt}\${extension}",
             plugins = listOf(
-                FixedPathFormatterPlugin().create(),
+                fixedPlugin,
                 ValuePathFormatterPlugin(),
-            )
+            ),
         )
 
-        assertThat(formatter.format(params), equalTo(Paths.get("my_topic/CONNECTED.csv")))
+        assertThat(formatter.format(params), equalTo("my_topic/CONNECTED.csv"))
     }
 }
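The extension likewise moves out of PathFormatParameters and into the fixed plugin's configuration, so a single plugin instance can be built in setupRecord and shared across formatters. A sketch of the pattern, using the names from the test above:

    // ".csv" is now plugin configuration rather than a per-record parameter.
    val fixedPlugin = FixedPathFormatterPlugin().create(mapOf("extension" to ".csv"))
    val formatter = PathFormatter(
        format = DEFAULT_FORMAT,
        plugins = listOf(fixedPlugin, TimePathFormatterPlugin()),
    )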
diff --git a/src/test/java/org/radarbase/output/path/RecordPathFactoryTest.kt b/src/test/java/org/radarbase/output/path/RecordPathFactoryTest.kt
new file mode 100644
index 0000000..db8a332
--- /dev/null
+++ b/src/test/java/org/radarbase/output/path/RecordPathFactoryTest.kt
@@ -0,0 +1,46 @@
+package org.radarbase.output.path
+
+import org.junit.jupiter.api.Assertions.assertEquals
+import org.junit.jupiter.api.Test
+import org.radarbase.output.config.PathConfig
+import org.radarbase.output.config.PathFormatterConfig
+import org.radarbase.output.config.ResourceConfig
+import org.radarbase.output.config.S3Config
+
+internal class RecordPathFactoryTest {
+
+    @Test
+    fun testInit() {
+        var properties = mapOf("key1" to "value1", "key2" to "value2")
+
+        val pathConfig = PathConfig(
+            factory = "org.radarbase.output.path.FormattedPathFactory",
+            properties = properties,
+            path = PathFormatterConfig(
+                format = "\${topic}/\${projectId}/\${userId}/\${sourceId}/\${filename}",
+                plugins = "fixed",
+            ),
+        )
+
+        val targetConfig = S3Config(
+            endpoint = "http://localhost:9000",
+            accessToken = "minioadmin",
+            secretKey = "minioadmin",
+            bucket = "target",
+        )
+
+        val factory = pathConfig.createFactory(
+            ResourceConfig("s3", s3 = targetConfig),
+            "test-extension",
+            topics = mapOf(),
+        )
+
+        properties = buildMap {
+            putAll(properties)
+            putIfAbsent("extension", "test-extension")
+        }
+
+        assertEquals(properties, factory.pathConfig.path.properties)
+        assertEquals(properties, factory.pathConfig.path.properties)
+    }
+}
diff --git a/src/test/java/org/radarbase/output/source/S3SourceStorageTest.kt b/src/test/java/org/radarbase/output/source/S3SourceStorageTest.kt
index a84e054..90e6b00 100644
--- a/src/test/java/org/radarbase/output/source/S3SourceStorageTest.kt
+++ b/src/test/java/org/radarbase/output/source/S3SourceStorageTest.kt
@@ -16,6 +16,7 @@ class S3SourceStorageTest {
             }
         }
     }
+
     @Test
     fun testFaultTolerant() {
         assertThrows {