Commit fb9be37

Merge pull request #395 from marklogic/release/1.3.0

Merge release/1.3.0 into main

2 parents 134cce5 + 140d641

138 files changed: +2423 additions, -48050 deletions


.gitignore

Lines changed: 1 addition & 0 deletions

@@ -12,3 +12,4 @@ flux/conf
 flux-cli/src/dist/ext/*.jar
 flux-version.properties
 docker/sonarqube
+optionsExperiments

CODEOWNERS

Lines changed: 1 addition & 1 deletion

@@ -2,4 +2,4 @@
 # Each line is a file pattern followed by one or more owners.

 # These owners will be the default owners for everything in the repo.
-* @anu3990 @billfarber @rjrudin
+* @anu3990 @billfarber @rjrudin @stevebio

CONTRIBUTING.md

Lines changed: 10 additions & 41 deletions

@@ -10,7 +10,7 @@ application installed:
 Next, run the following to pull a small model for the test instance of Ollama to use; this will be used by one or more
 embedder tests:

-    docker exec -it flux-ollama-1 ollama pull all-minilm
+    docker exec -it docker-tests-flux-ollama-1 ollama pull all-minilm

 Some of the tests depend on the Postgres instance deployed via Docker. Follow these steps to load a sample dataset
 into it:
@@ -24,11 +24,11 @@ downloading the `dvdrental.zip` and extracting it to produce a file named `dvdre
 Once you have the `dvdrental.tar` file in place, run these commands to load it into Postgres:

 ```
-docker exec -it flux-postgres-1 psql -U postgres -c "CREATE DATABASE dvdrental"
-docker exec -it flux-postgres-1 pg_restore -U postgres -d dvdrental /opt/dvdrental.tar
+docker exec -it docker-tests-flux-postgres-1 psql -U postgres -c "CREATE DATABASE dvdrental"
+docker exec -it docker-tests-flux-postgres-1 pg_restore -U postgres -d dvdrental /opt/dvdrental.tar
 ```

-The Docker file includes a pgadmin instance which can be accessed at <http://localhost:15432/>.
+The Docker file includes a pgadmin instance which can be accessed at <http://localhost:5480/>.
 If you wish to login to this, do so with "postgres@pgadmin.com" and
 a password of "postgres". For logging into Postgres itself, use "postgres" as the username and password. You can then
 register a server that connects to the "postgres" server.
@@ -104,44 +104,13 @@ tests. You do not need to do this if you have Intellij configured to use Gradle

 ## Generating code quality reports with SonarQube

-In order to use SonarQube, you must have used Docker to run this project's `docker-compose.yml` file, and you must
-have the services in that file running. You must also use Java 17 to run the `sonar` Gradle task.
+Please see our internal Wiki page - search for "Developer Experience SonarQube" -
+for information on setting up SonarQube and using it with this repository.

-To configure the SonarQube service, perform the following steps:
-
-1. Go to http://localhost:9000 .
-2. Login as admin/admin. SonarQube will ask you to change this password; you can choose whatever you want ("password" works).
-3. Click on "Create project manually".
-4. Enter "flux" for the Project Name; use that as the Project Key too.
-5. Enter "main" as the main branch name.
-6. Click on "Next".
-7. Click on "Use the global setting" and then "Create project".
-8. On the "Analysis Method" page, click on "Locally".
-9. In the "Provide a token" panel, click on "Generate". Copy the token.
-10. Add `systemProp.sonar.login=your token pasted here` to `gradle-local.properties` in the root of your project, creating
-that file if it does not exist yet.
-
-To run SonarQube, run the following Gradle tasks with Java 17 or higher, which will run all the tests with code
-coverage and then generate a quality report with SonarQube:
-
-    ./gradlew test sonar
-
-If you do not add `systemProp.sonar.login` to your `gradle-local.properties` file, you can specify the token via the
-following:
-
-    ./gradlew test sonar -Dsonar.login=paste your token here
-
-When that completes, you will see a line like this near the end of the logging:
-
-    ANALYSIS SUCCESSFUL, you can find the results at: http://localhost:9000/dashboard?id=flux
-
-Click on that link. If it's the first time you've run the report, you'll see all issues. If you've run the report
-before, then SonarQube will show "New Code" by default. That's handy, as you can use that to quickly see any issues
-you've introduced on the feature branch you're working on. You can then click on "Overall Code" to see all issues.
-
-Note that if you only need results on code smells and vulnerabilities, you can repeatedly run `./gradlew sonar`
-without having to re-run the tests. If you get an error from Sonar about Java sources, you just need to compile the
-Java code, so run `./gradlew compileTestJava sonar`.
+You can run `./gradlew clean testCodeCoverageReport` to run the tests and generate code coverage data. The output will
+be written to `code-coverage-report/build`. Unfortunately though, Sonarqube does not appear to consume this data
+correctly. For example, as of 2025-04-23, the Jacoco test report will show 84% coverage but Sonarqube will only report
+76% coverage.

 ## Testing the documentation locally
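The container renames above (`flux-postgres-1` to `docker-tests-flux-postgres-1`) follow Docker Compose v2's `<project>-<service>-<index>` container naming convention. A minimal sketch of that naming rule; the split into a `docker-tests` project and `flux-postgres`/`flux-ollama` services is an assumption for illustration, as only the full container names appear in the diff:

```shell
#!/bin/sh
# Docker Compose v2 names containers "<project>-<service>-<index>".
# The project/service split below is assumed, not confirmed by the diff.
compose_container_name() {
    project="$1"; service="$2"; index="${3:-1}"
    printf '%s-%s-%s\n' "$project" "$service" "$index"
}

compose_container_name docker-tests flux-postgres   # prints docker-tests-flux-postgres-1
compose_container_name docker-tests flux-ollama     # prints docker-tests-flux-ollama-1
```

This is why every `docker exec` target in the docs and Jenkinsfile changed when the compose project moved.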

Jenkinsfile

Lines changed: 31 additions & 3 deletions

@@ -28,13 +28,14 @@ def runtests(){
     ./gradlew -i mlDeploy;
     wget https://www.postgresqltutorial.com/wp-content/uploads/2019/05/dvdrental.zip;
     unzip dvdrental.zip -d docker/postgres/ ;
-    docker exec -i flux-postgres-1 psql -U postgres -c "CREATE DATABASE dvdrental";
-    docker exec -i flux-postgres-1 pg_restore -U postgres -d dvdrental /opt/dvdrental.tar;
+    docker exec -i docker-tests-flux-postgres-1 psql -U postgres -c "CREATE DATABASE dvdrental";
+    docker exec -i docker-tests-flux-postgres-1 pg_restore -U postgres -d dvdrental /opt/dvdrental.tar;
     cd $WORKSPACE/flux/;
-    ./gradlew --refresh-dependencies clean test || true;
+    ./gradlew --refresh-dependencies clean testCodeCoverageReport || true;
     '''
     junit '**/*.xml'
 }
+
 def postCleanup(){
     sh label:'mlcleanup', script: '''#!/bin/bash
     cd $WORKSPACE/flux;
@@ -45,6 +46,7 @@ def postCleanup(){
     echo "y" | docker volume prune --filter all=1 || true;
     '''
 }
+
 def runSonarScan(String javaVersion){
     sh label:'test', script: '''#!/bin/bash
     export JAVA_HOME=$'''+javaVersion+'''
@@ -54,20 +56,25 @@ def runSonarScan(String javaVersion){
     ./gradlew sonar -Dsonar.projectKey='ML-DevExp-marklogic-flux' -Dsonar.projectName='ML-DevExp-marklogic-flux' || true
     '''
 }
+
 pipeline{
     agent none
+
     options {
         checkoutToSubdirectory 'flux'
         buildDiscarder logRotator(artifactDaysToKeepStr: '7', artifactNumToKeepStr: '', daysToKeepStr: '30', numToKeepStr: '')
     }
+
     environment{
         JAVA_HOME_DIR="/home/builder/java/jdk-11.0.2"
         JAVA17_HOME_DIR="/home/builder/java/jdk-17.0.2"
         GRADLE_DIR =".gradle"
         DMC_USER = credentials('MLBUILD_USER')
         DMC_PASSWORD = credentials('MLBUILD_PASSWORD')
     }
+
     stages{
+
         stage('tests'){
             environment{
                 scannerHome = tool 'SONAR_Progress'
@@ -85,6 +92,25 @@ pipeline{
                 }
             }
         }
+
+        stage('publishApi'){
+            agent {label 'devExpLinuxPool'}
+            when {
+                branch 'develop'
+            }
+            steps{
+                sh label:'publishApi', script: '''#!/bin/bash
+                export JAVA_HOME=`eval echo "$JAVA_HOME_DIR"`;
+                export GRADLE_USER_HOME=$WORKSPACE/$GRADLE_DIR
+                export PATH=$JAVA_HOME/bin:$GRADLE_USER_HOME:$PATH;
+                ./gradlew clean;
+                cp ~/.gradle/gradle.properties $GRADLE_USER_HOME/gradle.properties;
+                cd $WORKSPACE/flux;
+                ./gradlew publish
+                '''
+            }
+        }
+
         stage('publish'){
             agent{ label 'devExpLinuxPool'}
             when {
@@ -116,6 +142,7 @@ pipeline{
                 }
             }
         }
+
         stage('regressions'){
             when{
                 allOf{
@@ -136,5 +163,6 @@ pipeline{
             }
         }
     }
+
 }

NOTICE.txt

Lines changed: 16 additions & 7 deletions

@@ -11,16 +11,18 @@ hadoop-aws 3.3.4 (Apache-2.0)
 hadoop-client 3.3.4 (Apache-2.0)
 marklogic-spark-connector 2.5.1 (Apache-2.0)
 picocli 4.7.6 (Apache-2.0)
-spark-avro_2.12 3.5.3 (Apache-2.0)
-spark-sql_2.12 3.5.3 (Apache-2.0)
+spark-avro_2.12 3.5.5 (Apache-2.0)
+spark-sql_2.12 3.5.5 (Apache-2.0)
+tika-parser-microsoft-module 3.1.0 (Apache-2.0)
+tika-parser-pdf-module 3.1.0 (Apache-2.0)

 Common Licenses

 Apache License 2.0 (Apache-2.0)

 Third-Party Components

-The following is a list of the third-party components used by MarkLogic® Flux™ 1.2.1 (last updated January 7, 2025):
+The following is a list of the third-party components used by MarkLogic® Flux™ 1.3.0 (last updated May 1, 2025):

 aws-java-sdk-s3 1.12.262 (Apache-2.0)
 https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-s3
@@ -34,26 +36,33 @@ hadoop-client 3.3.4 (Apache-2.0)
 https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-client
 For the full text of the Apache-2.0 license, see Apache License 2.0 (Apache-2.0)

-marklogic-spark-connector 2.5.1 (Apache-2.0)
+marklogic-spark-connector 2.6.0 (Apache-2.0)
 https://repo1.maven.org/maven2/com/marklogic/marklogic-spark-connector
 For the full text of the Apache-2.0 license, see Apache License 2.0 (Apache-2.0)

 picocli 4.7.6 (Apache-2.0)
 https://repo1.maven.org/maven2/info/picocli/picocli
 For the full text of the Apache-2.0 license, see Apache License 2.0 (Apache-2.0)

-spark-avro_2.12 3.5.3 (Apache-2.0)
+spark-avro_2.12 3.5.5 (Apache-2.0)
 https://repo1.maven.org/maven2/org/apache/spark/spark-avro_2.12
 For the full text of the Apache-2.0 license, see Apache License 2.0 (Apache-2.0)

-spark-sql_2.12 3.5.3 (Apache-2.0)
+spark-sql_2.12 3.5.5 (Apache-2.0)
 https://repo1.maven.org/maven2/org/apache/spark/spark-sql_2.12
 For the full text of the Apache-2.0 license, see Apache License 2.0 (Apache-2.0)

+tika-parser-microsoft-module 3.1.0 (Apache-2.0)
+https://repo1.maven.org/maven2/org/apache/tika/tika-parser-microsoft-module/
+For the full text of the Apache-2.0 license, see Apache License 2.0 (Apache-2.0)
+
+tika-parser-pdf-module 3.1.0 (Apache-2.0)
+https://repo1.maven.org/maven2/org/apache/tika/tika-parser-pdf-module/
+For the full text of the Apache-2.0 license, see Apache License 2.0 (Apache-2.0)

 Common Licenses

-This section shows the text of common third-party licenses used by MarkLogic® Flux™ 1.2.1 (last updated January 7, 2025):
+This section shows the text of common third-party licenses used by MarkLogic® Flux™ 1.3.0 (last updated January 7, 2025):

 Apache License 2.0 (Apache-2.0)
 https://spdx.org/licenses/Apache-2.0.html

README.md

Lines changed: 1 addition & 0 deletions

@@ -6,6 +6,7 @@ With Flux, you can automate common data movement use cases including:

 - Importing rows from an RDBMS.
 - Importing JSON, XML, CSV, Parquet and other file types from a local filesystem or S3.
+- Extract text from binary documents and classify it using [Progress Semaphore](https://www.progress.com/semaphore).
 - Implementing a data pipeline for a [RAG solution with MarkLogic](https://www.progress.com/marklogic/solutions/generative-ai).
 - Copying data from one MarkLogic database to another database.
 - Reprocessing data in MarkLogic via custom code.

build.gradle

Lines changed: 79 additions & 2 deletions

@@ -1,13 +1,40 @@
+plugins {
+    id "org.sonarqube" version "6.1.0.5360"
+}
+
+sonar {
+    properties {
+        property "sonar.projectKey", "flux"
+        property "sonar.host.url", "http://localhost:9000"
+        property "sonar.coverage.jacoco.xmlReportPaths", "code-coverage-report/build/reports/jacoco/testCodeCoverageReport/testCodeCoverageReport.xml"
+        // Avoids a warning from Gradle.
+        property "sonar.gradle.skipCompile", "true"
+    }
+}
+
 subprojects {
     apply plugin: "java-library"

     group = "com.marklogic"

     java {
-        sourceCompatibility = 11
-        targetCompatibility = 11
+        // Flux requires Java 11 for all operations besides splitting and embedding, which require Java 17 due to
+        // the requirements of the langchain4j dependency.
+        toolchain {
+            languageVersion = JavaLanguageVersion.of(11)
+        }
     }

+    // Allows for quickly identifying compiler warnings.
+    tasks.withType(JavaCompile) {
+        options.compilerArgs << '-Xlint:unchecked'
+        options.deprecation = true
+    }
+
+    javadoc.failOnError = false
+    // Ignores warnings on params that don't have descriptions, which is a little too noisy
+    javadoc.options.addStringOption('Xdoclint:none', '-quiet')
+
     repositories {
         mavenCentral()
         mavenLocal()
@@ -22,7 +49,42 @@ subprojects {
             details.useVersion '2.15.2'
             details.because 'Need to match the version used by Spark.'
         }
+        if (details.requested.group.equals("org.slf4j")) {
+            details.useVersion "2.0.16"
+            details.because "Ensures that slf4j-api 1.x does not appear on the Flux classpath in particular, which can " +
+                "lead to this issue - https://www.slf4j.org/codes.html#StaticLoggerBinder."
+        }
+        if (details.requested.group.equals("org.apache.logging.log4j")) {
+            details.useVersion "2.24.3"
+            details.because "Need to match the version used by Apache Tika. Spark uses 2.20.0 but automated tests confirm " +
+                "that Spark seems fine with 2.24.3."
+        }
     }
+
+    resolutionStrategy {
+        // By default, Spark 3.5.x does not include the log4j 1.x dependency via its zookeeper dependency. But somehow, by
+        // adding hadoop-client 3.3.4 to the mix, the log4j 1.x dependency comes via the zookeeper 3.6.3 dependency. Per
+        // the release notes at https://zookeeper.apache.org/doc/r3.6.4/releasenotes.html, using zookeeper 3.6.4 - which
+        // removes log4j 1.x, thus avoiding the major CVE associated with log4j 1.x - appears safe, which is confirmed by
+        // tests as well.
+        force "org.apache.zookeeper:zookeeper:3.6.4"
+
+        // Avoids a classpath conflict between Spark and tika-parser-microsoft-module. Forces Spark to use the
+        // version that tika-parser-microsoft-module wants.
+        // Avoids another classpath conflict between Spark and tika-parser-microsoft-module.
+        force "org.apache.commons:commons-compress:1.27.1"
+    }
+
+    // Without this exclusion, we have multiple slf4j providers, leading to an ugly warning at the start
+    // of each Flux execution.
+    exclude group: "org.slf4j", module: "slf4j-reload4j"
+
+    // The rocksdbjni dependency weighs in at 50mb and so far does not appear necessary for our use of Spark.
+    exclude module: "rocksdbjni"
+}
+
+task allDeps(type: DependencyReportTask) {
+    description = "Allows for generating dependency reports for every subproject in a single task."
 }

 test {
@@ -31,6 +93,20 @@ subprojects {
             events 'started', 'passed', 'skipped', 'failed'
             exceptionFormat 'full'
         }
+        jvmArgs = [
+            // Needed for all Java 17 testing.
+            "--add-opens", "java.base/sun.nio.ch=ALL-UNNAMED",
+
+            // For Spark's SerializationDebugger when using Java 17. See ReprocessTest for one example of why this is needed.
+            "--add-opens", "java.base/sun.security.action=ALL-UNNAMED",
+
+            // Needed by the JDBC tests.
+            "--add-opens", "java.base/sun.util.calendar=ALL-UNNAMED",
+
+            // Needed by CustomImportTest
+            "--add-opens", "java.base/java.io=ALL-UNNAMED",
+            "--add-opens", "java.base/sun.nio.cs=ALL-UNNAMED"
+        ]
     }
 }

@@ -39,6 +115,7 @@ task gettingStartedZip(type: Zip) {
         "on the GitHub release page."
     from "examples/getting-started"
     exclude "build", ".gradle", "gradle-*.properties", "flux", ".gitignore", "marklogic-flux"
+    exclude "src/main/ml-schemas/tde/chunks.json"
     into "marklogic-flux-getting-started-${version}"
     archiveFileName = "marklogic-flux-getting-started-${version}.zip"
     destinationDirectory = file("build")
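The `jvmArgs` added to the `test` block are the usual `--add-opens` flags Spark needs on Java 17. A sketch assembling those same flags for a hypothetical direct `java` launch; `flux-cli.jar` is an illustrative name, not a confirmed artifact:

```shell
#!/bin/sh
# The same module opens the test block configures, collected into one variable
# for a hypothetical `java` invocation on Java 17.
JVM_OPENS="--add-opens java.base/sun.nio.ch=ALL-UNNAMED \
--add-opens java.base/sun.security.action=ALL-UNNAMED \
--add-opens java.base/sun.util.calendar=ALL-UNNAMED \
--add-opens java.base/java.io=ALL-UNNAMED \
--add-opens java.base/sun.nio.cs=ALL-UNNAMED"

# Print the command rather than run it, since no JDK or jar is assumed here.
echo "java $JVM_OPENS -jar flux-cli.jar"
```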

code-coverage-report/build.gradle

Lines changed: 23 additions & 0 deletions

@@ -0,0 +1,23 @@
+// See https://docs.gradle.org/current/samples/sample_jvm_multi_project_with_code_coverage_standalone.html
+// for more information on how this file was created.
+
+plugins {
+    id 'jacoco-report-aggregation'
+}
+
+dependencies {
+    jacocoAggregation project(':flux-embedding-model-azure-open-ai')
+    jacocoAggregation project(':flux-embedding-model-minilm')
+    jacocoAggregation project(':flux-embedding-model-ollama')
+    jacocoAggregation project(':flux-tests-api')
+    jacocoAggregation project(':flux-cli')
+    jacocoAggregation project(':flux-java17-tests')
+}
+
+reporting {
+    reports {
+        testCodeCoverageReport(JacocoCoverageReport) {
+            testType = TestSuiteType.UNIT_TEST
+        }
+    }
+}
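The `sonar.coverage.jacoco.xmlReportPaths` property added to the root `build.gradle` must match where this aggregation task writes its XML. A small sketch that checks for the report at that path after running `./gradlew clean testCodeCoverageReport` (run from the repository root; the path comes from the sonar configuration above):

```shell
#!/bin/sh
# Verify the aggregated Jacoco XML exists where the sonar config expects it.
REPORT="code-coverage-report/build/reports/jacoco/testCodeCoverageReport/testCodeCoverageReport.xml"

if [ -f "$REPORT" ]; then
    echo "coverage report found: $REPORT"
else
    echo "no report yet - run ./gradlew clean testCodeCoverageReport first"
fi
```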
