From 90822e40ec07c6783a385c4678cdabc228a677a3 Mon Sep 17 00:00:00 2001
From: Rob Rudin
Date: Thu, 25 Sep 2025 14:56:13 -0400
Subject: [PATCH] MLE-24402 Trying out Spark 4.1.0 preview1

This is tempting to start using now because it requires fewer dependency
alterations to minimize CVEs. Tests all appear to be fine. And 4.1.0 is using
Jackson 2.19.0, which avoids a lot of messing around.
---
 build.gradle                           | 30 ++----------
 gradle.properties                      |  2 +-
 marklogic-spark-connector/build.gradle | 68 +++++++++++---------------
 3 files changed, 32 insertions(+), 68 deletions(-)

diff --git a/build.gradle b/build.gradle
index 76367726..eb3c80d1 100644
--- a/build.gradle
+++ b/build.gradle
@@ -42,23 +42,11 @@ subprojects {

     configurations.all {
         resolutionStrategy.eachDependency { DependencyResolveDetails details ->
+            // These all impact Spark and its dependencies, but not the published connector as the connector does not contain
+            // any Spark libraries.
             if (details.requested.group.equals("org.apache.hadoop") and details.requested.version.equals("3.4.1")) {
                 details.useVersion "3.4.2"
-                details.because "Using 3.4.2 to minimize CVEs and because Flux is doing the same thing. This only affects the connector tests."
-            }
-            if (details.requested.group.startsWith('com.fasterxml.jackson')) {
-                details.useVersion '2.18.2'
-                details.because 'Need to match the version used by Spark 4.0.1.'
-            }
-            if (details.requested.group.equals("org.slf4j")) {
-                details.useVersion "2.0.17"
-                details.because "Ensures that slf4j-api 1.x does not appear on the Flux classpath in particular, which can " +
-                    "lead to this issue - https://www.slf4j.org/codes.html#StaticLoggerBinder."
-            }
-            if (details.requested.group.equals("org.apache.logging.log4j")) {
-                details.useVersion "2.24.3"
-                details.because "Need to match the version used by Apache Tika. Spark uses 2.20.0 but automated tests confirm " +
-                    "that Spark seems fine with 2.24.3."
+                details.because "Using 3.4.2 to minimize CVEs and because Flux is doing the same thing."
             }
             if (details.requested.group.equals("org.codehaus.janino")) {
                 details.useVersion "3.1.12"
@@ -69,18 +57,6 @@ subprojects {
                 details.because "Bumping from 4.1.118 (what Spark SQL 4.0.1 depends on) to 4.1.127 to minimize CVEs."
             }
         }
-
-        resolutionStrategy {
-            // Avoids a classpath conflict between Spark and the tika-parser-microsoft-module. Tika needs a
-            // more recent version and Spark (and Jena as well) both seems fine with this (as they should be per semver).
-            force "org.apache.commons:commons-compress:1.27.1"
-
-            // Avoids CVEs in earlier minor versions.
-            force "org.apache.commons:commons-lang3:3.18.0"
-        }
-
-        // Excluded from Flux for size reasons, so excluded here as well to ensure we don't need it when running tests.
-        exclude module: "rocksdbjni"
     }

     test {
diff --git a/gradle.properties b/gradle.properties
index 151d37de..9f841b11 100644
--- a/gradle.properties
+++ b/gradle.properties
@@ -1,5 +1,5 @@
 version=3.0-SNAPSHOT
-sparkVersion=4.0.1
+sparkVersion=4.1.0-preview1
 tikaVersion=3.2.3
 semaphoreVersion=5.10.0
 langchain4jVersion=1.5.0
diff --git a/marklogic-spark-connector/build.gradle b/marklogic-spark-connector/build.gradle
index f4d962c1..d0ad6558 100644
--- a/marklogic-spark-connector/build.gradle
+++ b/marklogic-spark-connector/build.gradle
@@ -1,72 +1,61 @@
 plugins {
-    id 'com.gradleup.shadow' version '8.3.3'
+    id 'com.gradleup.shadow' version '9.2.1'
     id 'maven-publish'
 }

-configurations {
-    // Defines all the implementation dependencies, but in such a way that they are not included as dependencies in the
-    // library's pom.xml file. This is due to the shadow jar being published instead of a jar only containing this
-    // project's classes. The shadow jar is published due to the need to relocate several packages to avoid conflicts
-    // with Spark.
-    shadowDependencies
-
-    // This approach allows for all of the dependencies to be available for compilation and for running tests.
-    compileOnly.extendsFrom(shadowDependencies)
-    testImplementation.extendsFrom(compileOnly)
-}
-
 dependencies {
     // Need to compile against Spark, but its libraries are not part of the connector jar.
-    compileOnly "org.apache.spark:spark-sql_2.13:${sparkVersion}"
+    compileOnly ("org.apache.spark:spark-sql_2.13:${sparkVersion}") {
+        // Excluded from Flux for size reasons, so excluded here as well to ensure we don't need it when running tests.
+        exclude module: "rocksdbjni"
+    }

     // This is compileOnly as Spark will provide its own copy at runtime.
     compileOnly "com.fasterxml.jackson.core:jackson-databind:2.18.2"

-    shadowDependencies("com.marklogic:marklogic-client-api:7.2.0") {
+    implementation("com.marklogic:marklogic-client-api:7.2.0") {
         // Need to use the versions of Jackson preferred by Spark.
         exclude group: "com.fasterxml.jackson.core"
         exclude group: "com.fasterxml.jackson.dataformat"
     }

     // For logging.
-    shadowDependencies "org.slf4j:jcl-over-slf4j:2.0.17"
+    implementation "org.slf4j:jcl-over-slf4j:2.0.17"

     // Needed for splitting XML documents via XPath.
-    shadowDependencies "jaxen:jaxen:2.0.0"
+    implementation "jaxen:jaxen:2.0.0"

     // Needed for classifying documents via Semaphore.
-    shadowDependencies("com.smartlogic.csclient:Semaphore-CS-Client:${semaphoreVersion}") {
+    implementation("com.smartlogic.csclient:Semaphore-CS-Client:${semaphoreVersion}") {
         exclude group: "com.fasterxml.jackson.core"
     }
-    shadowDependencies("com.smartlogic.cloud:Semaphore-Cloud-Client:${semaphoreVersion}") {
+    implementation("com.smartlogic.cloud:Semaphore-Cloud-Client:${semaphoreVersion}") {
         exclude group: "com.fasterxml.jackson.core"
     }

     // Adding this in 2.6.0. tika-core is very small and only brings in commons-io and and slf4j-api. Flux can then
     // include the necessary parsers.
-    shadowDependencies "org.apache.tika:tika-core:${tikaVersion}"
+    implementation "org.apache.tika:tika-core:${tikaVersion}"

     // Needed for using XmlMapper.
-    shadowDependencies("com.fasterxml.jackson.dataformat:jackson-dataformat-xml:2.18.2") {
+    implementation("com.fasterxml.jackson.dataformat:jackson-dataformat-xml:2.19.0") {
         // Not needed, as the modules in this group that this dependency depends on are all provided by Spark.
         exclude group: "com.fasterxml.jackson.core"
     }

     // Supports reading and writing RDF data. Including this here so it's available to the tests as well.
     // Bumped to 5.x, which requires Java 17, while upgrading Spark to 4.x.
-    shadowDependencies("org.apache.jena:jena-arq:5.5.0") {
+    implementation("org.apache.jena:jena-arq:5.5.0") {
         exclude group: "com.fasterxml.jackson.core"
         exclude group: "com.fasterxml.jackson.dataformat"
     }

     // Needed for some XML operations that are far easier with JDOM2 than with DOM.
-    shadowDependencies "org.jdom:jdom2:2.0.6.1"
+    implementation "org.jdom:jdom2:2.0.6.1"

-    shadowDependencies "dev.langchain4j:langchain4j:${langchain4jVersion}"
-
-    // Ensuring the desired version of commons-compress is included in the connector jar. Some tests have failed in Flux
-    // because an older version - likely the one depended on by Jena - is included instead.
-    shadowDependencies "org.apache.commons:commons-compress:1.27.1"
+    implementation ("dev.langchain4j:langchain4j:${langchain4jVersion}") {
+        exclude group: "com.fasterxml.jackson.core"
+    }

     // Need this so that an OkHttpClientConfigurator can be created.
     // Only needs compileOnly, as the Java Client brings this as an implementation dependency.
@@ -78,10 +67,14 @@ dependencies {
     // org.junit.platform.commons.JUnitException: TestEngine with ID 'junit-jupiter' failed to discover tests
     testRuntimeOnly "org.junit.platform:junit-platform-launcher:1.13.4"

-    testImplementation "org.apache.spark:spark-sql_2.13:${sparkVersion}"
+    testImplementation ("org.apache.spark:spark-sql_2.13:${sparkVersion}") {
+        exclude module: "rocksdbjni"
+    }

     // Supports testing the embedder feature.
-    testImplementation "dev.langchain4j:langchain4j-embeddings-all-minilm-l6-v2:1.5.0-beta11"
+    testImplementation ("dev.langchain4j:langchain4j-embeddings-all-minilm-l6-v2:1.5.0-beta11") {
+        exclude group: "com.fasterxml.jackson.core"
+    }

     testImplementation('com.marklogic:ml-app-deployer:6.0.1') {
         exclude group: "com.fasterxml.jackson.core"
@@ -106,7 +99,6 @@ dependencies {
     testImplementation "org.springframework:spring-test:6.2.11"

     testImplementation "ch.qos.logback:logback-classic:1.5.18"
-    testImplementation "org.slf4j:jcl-over-slf4j:2.0.17"
     testImplementation "org.skyscreamer:jsonassert:1.5.3"
     testImplementation "org.apache.tika:tika-parser-microsoft-module:${tikaVersion}"

@@ -126,15 +118,6 @@ test {
     ]
 }

-shadowJar {
-    configurations = [project.configurations.shadowDependencies]
-
-    // "all" is the default; no need for that in the connector filename. This also results in this becoming the library
-    // artifact that is published as a dependency. That is desirable as it includes the relocated packages listed below,
-    // which a dependent would otherwise have to manage themselves.
-    archiveClassifier.set("")
-}
-
 // Publishing setup - see https://docs.gradle.org/current/userguide/publishing_setup.html .
 java {
     withJavadocJar()
@@ -152,6 +135,11 @@ javadoc.failOnError = false
 // Ignores warnings on params that don't have descriptions, which is a little too noisy
 javadoc.options.addStringOption('Xdoclint:none', '-quiet')

+// We don't want the shadow jar to be published to a Maven repository.
+shadow {
+    addShadowVariantIntoJavaComponent = false
+}
+
 publishing {
     publications {
         mainJava(MavenPublication) {