RADAR-base
diff --git a/‎.editorconfig‎
Lines changed: 1 addition & 0 deletions b/‎.editorconfig‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.github/workflows/codeql.yml‎
Lines changed: 89 additions & 0 deletions b/‎.github/workflows/codeql.yml‎
Lines changed: 89 additions & 0 deletions
diff --git a/‎.github/workflows/main.yml‎
Lines changed: 2 additions & 2 deletions b/‎.github/workflows/main.yml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎.github/workflows/publish_snapshots.yml‎
Lines changed: 5 additions & 4 deletions b/‎.github/workflows/publish_snapshots.yml‎
Lines changed: 5 additions & 4 deletions
diff --git a/‎.github/workflows/release.yml‎
Lines changed: 2 additions & 2 deletions b/‎.github/workflows/release.yml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎.github/workflows/snyk.yaml‎
Lines changed: 4 additions & 0 deletions b/‎.github/workflows/snyk.yaml‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎Dockerfile‎
Lines changed: 5 additions & 4 deletions b/‎Dockerfile‎
Lines changed: 5 additions & 4 deletions
diff --git a/‎README.md‎
Lines changed: 10 additions & 2 deletions b/‎README.md‎
Lines changed: 10 additions & 2 deletions
@@ -0,0 +1 @@
+root = true
@@ -0,0 +1,89 @@
+# CodeQL code-scanning workflow.
+#
+# Runs CodeQL analysis on pushes and pull requests targeting `main` and `dev`,
+# and additionally on a weekly schedule (see the `cron` entry below).
+#
+# You may wish to alter this file to override the set of languages analyzed,
+# or to provide custom queries or build logic.
+#
+# The `language` matrix defined below lists the CodeQL languages to analyze;
+# it currently contains only 'java-kotlin'.
+#
+name: "CodeQL"
+
+on:
+  push:
+    branches: [ "main", "dev" ]
+  pull_request:
+    branches: [ "main", "dev" ]
+  schedule:
+    - cron: '24 21 * * 0'
+
+jobs:
+  analyze:
+    name: Analyze
+    # Runner size impacts CodeQL analysis time. To learn more, please see:
+    #   - https://gh.io/recommended-hardware-resources-for-running-codeql
+    #   - https://gh.io/supported-runners-and-hardware-resources
+    #   - https://gh.io/using-larger-runners
+    # Consider using larger runners for possible analysis time improvements.
+    runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }}
+    timeout-minutes: ${{ (matrix.language == 'swift' && 120) || 360 }}
+    permissions:
+      # required for all workflows
+      security-events: write
+
+      # only required for workflows in private repositories
+      actions: read
+      contents: read
+
+    strategy:
+      fail-fast: false
+      matrix:
+        language: [ 'java-kotlin' ]
+        # CodeQL supports [ 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift' ]
+        # Use only 'java-kotlin' to analyze code written in Java, Kotlin or both
+        # Use only 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both
+        # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - uses: actions/setup-java@v4
+        with:
+          distribution: 'temurin' # See 'Supported distributions' for available options
+          java-version: '17'
+
+      # Initializes the CodeQL tools for scanning.
+      - name: Initialize CodeQL
+        uses: github/codeql-action/init@v3
+        with:
+          languages: ${{ matrix.language }}
+          # If you wish to specify custom queries, you can do so here or in a config file.
+          # By default, queries listed here will override any specified in a config file.
+          # Prefix the list here with "+" to use these queries and those in the config file.
+
+          # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
+          # queries: security-extended,security-and-quality
+
+
+      # Autobuild attempts to build any compiled languages (C/C++, C#, Go, Java, or Swift).
+      # If this step fails, then you should remove it and run the build manually (see below)
+      - name: Autobuild
+        uses: github/codeql-action/autobuild@v3
+
+      # ℹ️ Command-line programs to run using the OS shell.
+      # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
+
+      #   If the Autobuild step above fails, remove it and uncomment the three lines below,
+      #   then modify them (or add more) to build your project; they are an EXAMPLE for guidance.
+
+      # - run: |
+      #     echo "Run, Build Application using script"
+      #     ./location_of_script_within_repo/buildscript.sh
+
+      - name: Perform CodeQL Analysis
+        uses: github/codeql-action/analyze@v3
+        with:
+          category: "/language:${{matrix.language}}"
@@ -95,8 +95,8 @@ jobs:
           # Use runtime labels from docker_meta as well as fixed labels
           labels: |
             ${{ steps.docker_meta.outputs.labels }}
-            maintainer=Joris Borgdorff <joris@thehyve.nl>
-            org.opencontainers.image.authors=Joris Borgdorff <joris@thehyve.nl>
+            maintainer=Bastiaan de Graaf <bastiaan@thehyve.nl>
+            org.opencontainers.image.authors=Bastiaan de Graaf <bastiaan@thehyve.nl>
             org.opencontainers.image.vendor=RADAR-base
             org.opencontainers.image.licenses=Apache-2.0
 
 
@@ -17,10 +17,6 @@ jobs:
       # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
       - uses: actions/checkout@v3
 
-      - name: Has SNAPSHOT version
-        id: is-snapshot
-        run: grep 'version = ".*-SNAPSHOT"' build.gradle.kts
-
       - uses: actions/setup-java@v3
         with:
           distribution: temurin
@@ -29,6 +25,11 @@ jobs:
       - name: Setup Gradle
         uses: gradle/gradle-build-action@v2
 
+      - name: Has SNAPSHOT version
+        id: is-snapshot  # step fails unless Gradle reports a `version:` property containing -SNAPSHOT
+        run: |
+          ./gradlew properties | grep '^version: .*-SNAPSHOT'
+
       - name: Install gpg secret key
         run: |
           cat <(echo -e "${{ secrets.OSSRH_GPG_SECRET_KEY }}") | gpg --batch --import
 
@@ -91,8 +91,8 @@ jobs:
           # Use runtime labels from docker_meta as well as fixed labels
           labels: |
             ${{ steps.docker_meta.outputs.labels }}
-            maintainer=Joris Borgdorff <joris@thehyve.nl>
-            org.opencontainers.image.authors=Joris Borgdorff <joris@thehyve.nl>
+            maintainer=Bastiaan de Graaf <bastiaan@thehyve.nl>
+            org.opencontainers.image.authors=Bastiaan de Graaf <bastiaan@thehyve.nl>
             org.opencontainers.image.vendor=RADAR-base
             org.opencontainers.image.licenses=Apache-2.0
 
 
@@ -3,6 +3,7 @@ on:
   pull_request:
     branches:
       - main
+      - dev
 
 jobs:
   security:
@@ -29,3 +30,6 @@ jobs:
           --configuration-matching='^runtimeClasspath$'
           --org=radar-base
           --policy-path=$PWD/.snyk
+          --all-projects
+          --severity-threshold=high
+          --fail-on=upgradable
@@ -10,20 +10,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-FROM --platform=$BUILDPLATFORM gradle:7.5-jdk17 AS builder
+FROM --platform=$BUILDPLATFORM gradle:8.4-jdk17 AS builder
 
 RUN mkdir /code
 WORKDIR /code
 ENV GRADLE_USER_HOME=/code/.gradlecache \
-   GRADLE_OPTS=-Djdk.lang.Process.launchMechanism=vfork
+   GRADLE_OPTS="-Djdk.lang.Process.launchMechanism=vfork -Dorg.gradle.vfs.watch=false"
 
 COPY ./build.gradle.kts ./gradle.properties ./settings.gradle.kts /code/
+COPY ./buildSrc /code/buildSrc
 
-RUN gradle downloadDependencies copyDependencies startScripts --no-watch-fs
+RUN gradle downloadDependencies copyDependencies startScripts
 
 COPY ./src /code/src
 
-RUN gradle jar --no-watch-fs
+RUN gradle jar
 
 FROM eclipse-temurin:17-jre
 
 
@@ -1,7 +1,7 @@
 # Restructure Kafka connector output files
 
 Data streamed by a Kafka Connector will be converted to a RADAR-base oriented output directory, by organizing it by project, user and collection date.
-It supports data written by [RADAR S3 sink connector](https://github.com/RADAR-base/RADAR-S3-Connector) is streamed to files based on topic name only. This package transforms that output to a local directory structure as follows: `projectId/userId/topic/date_hour.csv`. The date and hour are extracted from the `time` field of each record, and is formatted in UTC time. This package is included in the [RADAR-Docker](https://github.com/RADAR-base/RADAR-Docker) repository, in the `dcompose/radar-cp-hadoop-stack/bin/hdfs-restructure` script.
+It supports data written by the [RADAR S3 sink connector](https://github.com/RADAR-base/RADAR-S3-Connector), which streams records to files based on topic name only. This package transforms that output to a local directory structure as follows: `projectId/userId/topic/date_hour.csv`. The date and hour are extracted from the `time` field of each record, and are formatted in UTC time.
 
 ## Upgrade instructions
 
@@ -90,7 +90,7 @@ By default, this will output the data in CSV format. If JSON format is preferred
 radar-output-restructure --format json --output-directory <output_folder>  <input_path_1> [<input_path_2> ...]
 ```
 
-By default, files records are not deduplicated after writing. To enable this behaviour, specify the option `--deduplicate` or `-d`. This set to false by default because of an issue with Biovotion data. Please see - [issue #16](https://github.com/RADAR-base/Restructure-HDFS-topic/issues/16) before enabling it. Deduplication can also be enabled or disabled per topic using the config file. If lines should be deduplicated using a subset of fields, e.g. only `sourceId` and `time` define a unique record and only the last record with duplicate values should be kept, then specify `topics: <topicName>: deduplication: distinctFields: [key.sourceId, value.time]`.
+By default, file records are not deduplicated after writing. To enable this behaviour, specify the option `--deduplicate` or `-d`. This is set to false by default because of an issue with Biovotion data. Please see [issue #16](https://github.com/RADAR-base/radar-output-restructure/issues/16) before enabling it. Deduplication can also be enabled or disabled per topic using the config file. If lines should be deduplicated using a subset of fields, e.g. only `sourceId` and `time` define a unique record and only the last record with duplicate values should be kept, then specify `topics: <topicName>: deduplication: distinctFields: [key.sourceId, value.time]`.
 
 ### Compression
 
@@ -118,8 +118,16 @@ source:
   # only actually needed if source type is hdfs
   azure:
     # azure options
+  index:
+    # Interval to fully synchronize the index with the source storage
+    fullSyncInterval: 3600
+    # Interval at which empty directories are synchronized with the source.
+    # They are also synced during a full sync.
+    emptyDirectorySyncInterval: 900
 ```
 
+The index makes a scan of the source before any operations. Further list operations are done on the index only. This is especially relevant for S3 storage where list operations are priced.
+
 The target is similar, and in addition supports the local file system (`local`).
 
 ```yaml