Skip to content

Commit f02f1f2

Browse files
committed
MLE-24402 Trying out Spark 4.1.0 preview1
This is tempting to start using now because it requires fewer dependency overrides to minimize CVEs. All tests appear to pass. Spark 4.1.0 also uses Jackson 2.19.0, which removes the need to force a Jackson version across all dependencies.
1 parent ca98049 commit f02f1f2

File tree

3 files changed

+32
-68
lines changed

3 files changed

+32
-68
lines changed

build.gradle

Lines changed: 3 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -42,23 +42,11 @@ subprojects {
4242

4343
configurations.all {
4444
resolutionStrategy.eachDependency { DependencyResolveDetails details ->
45+
// These all impact Spark and its dependencies, but not the published connector as the connector does not contain
46+
// any Spark libraries.
4547
if (details.requested.group.equals("org.apache.hadoop") and details.requested.version.equals("3.4.1")) {
4648
details.useVersion "3.4.2"
47-
details.because "Using 3.4.2 to minimize CVEs and because Flux is doing the same thing. This only affects the connector tests."
48-
}
49-
if (details.requested.group.startsWith('com.fasterxml.jackson')) {
50-
details.useVersion '2.18.2'
51-
details.because 'Need to match the version used by Spark 4.0.1.'
52-
}
53-
if (details.requested.group.equals("org.slf4j")) {
54-
details.useVersion "2.0.17"
55-
details.because "Ensures that slf4j-api 1.x does not appear on the Flux classpath in particular, which can " +
56-
"lead to this issue - https://www.slf4j.org/codes.html#StaticLoggerBinder."
57-
}
58-
if (details.requested.group.equals("org.apache.logging.log4j")) {
59-
details.useVersion "2.24.3"
60-
details.because "Need to match the version used by Apache Tika. Spark uses 2.20.0 but automated tests confirm " +
61-
"that Spark seems fine with 2.24.3."
49+
details.because "Using 3.4.2 to minimize CVEs and because Flux is doing the same thing."
6250
}
6351
if (details.requested.group.equals("org.codehaus.janino")) {
6452
details.useVersion "3.1.12"
@@ -69,18 +57,6 @@ subprojects {
6957
details.because "Bumping from 4.1.118 (what Spark SQL 4.0.1 depends on) to 4.1.127 to minimize CVEs."
7058
}
7159
}
72-
73-
resolutionStrategy {
74-
// Avoids a classpath conflict between Spark and the tika-parser-microsoft-module. Tika needs a
75-
// more recent version and Spark (and Jena as well) both seem fine with this (as they should be per semver).
76-
force "org.apache.commons:commons-compress:1.27.1"
77-
78-
// Avoids CVEs in earlier minor versions.
79-
force "org.apache.commons:commons-lang3:3.18.0"
80-
}
81-
82-
// Excluded from Flux for size reasons, so excluded here as well to ensure we don't need it when running tests.
83-
exclude module: "rocksdbjni"
8460
}
8561

8662
test {

gradle.properties

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
version=3.0-SNAPSHOT
2-
sparkVersion=4.0.1
2+
sparkVersion=4.1.0-preview1
33
tikaVersion=3.2.3
44
semaphoreVersion=5.10.0
55
langchain4jVersion=1.5.0

marklogic-spark-connector/build.gradle

Lines changed: 28 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,72 +1,61 @@
11
plugins {
2-
id 'com.gradleup.shadow' version '8.3.3'
2+
id 'com.gradleup.shadow' version '9.2.1'
33
id 'maven-publish'
44
}
55

6-
configurations {
7-
// Defines all the implementation dependencies, but in such a way that they are not included as dependencies in the
8-
// library's pom.xml file. This is due to the shadow jar being published instead of a jar only containing this
9-
// project's classes. The shadow jar is published due to the need to relocate several packages to avoid conflicts
10-
// with Spark.
11-
shadowDependencies
12-
13-
// This approach allows for all of the dependencies to be available for compilation and for running tests.
14-
compileOnly.extendsFrom(shadowDependencies)
15-
testImplementation.extendsFrom(compileOnly)
16-
}
17-
186
dependencies {
197
// Need to compile against Spark, but its libraries are not part of the connector jar.
20-
compileOnly "org.apache.spark:spark-sql_2.13:${sparkVersion}"
8+
compileOnly ("org.apache.spark:spark-sql_2.13:${sparkVersion}") {
9+
// Excluded from Flux for size reasons, so excluded here as well to ensure we don't need it when running tests.
10+
exclude module: "rocksdbjni"
11+
}
2112

2213
// This is compileOnly as Spark will provide its own copy at runtime.
2314
compileOnly "com.fasterxml.jackson.core:jackson-databind:2.18.2"
2415

25-
shadowDependencies("com.marklogic:marklogic-client-api:7.2.0") {
16+
implementation("com.marklogic:marklogic-client-api:7.2.0") {
2617
// Need to use the versions of Jackson preferred by Spark.
2718
exclude group: "com.fasterxml.jackson.core"
2819
exclude group: "com.fasterxml.jackson.dataformat"
2920
}
3021

3122
// For logging.
32-
shadowDependencies "org.slf4j:jcl-over-slf4j:2.0.17"
23+
implementation "org.slf4j:jcl-over-slf4j:2.0.17"
3324

3425
// Needed for splitting XML documents via XPath.
35-
shadowDependencies "jaxen:jaxen:2.0.0"
26+
implementation "jaxen:jaxen:2.0.0"
3627

3728
// Needed for classifying documents via Semaphore.
38-
shadowDependencies("com.smartlogic.csclient:Semaphore-CS-Client:${semaphoreVersion}") {
29+
implementation("com.smartlogic.csclient:Semaphore-CS-Client:${semaphoreVersion}") {
3930
exclude group: "com.fasterxml.jackson.core"
4031
}
41-
shadowDependencies("com.smartlogic.cloud:Semaphore-Cloud-Client:${semaphoreVersion}") {
32+
implementation("com.smartlogic.cloud:Semaphore-Cloud-Client:${semaphoreVersion}") {
4233
exclude group: "com.fasterxml.jackson.core"
4334
}
4435

4536
// Adding this in 2.6.0. tika-core is very small and only brings in commons-io and slf4j-api. Flux can then
4637
// include the necessary parsers.
47-
shadowDependencies "org.apache.tika:tika-core:${tikaVersion}"
38+
implementation "org.apache.tika:tika-core:${tikaVersion}"
4839

4940
// Needed for using XmlMapper.
50-
shadowDependencies("com.fasterxml.jackson.dataformat:jackson-dataformat-xml:2.18.2") {
41+
implementation("com.fasterxml.jackson.dataformat:jackson-dataformat-xml:2.19.0") {
5142
// Not needed, as the modules in this group that this dependency depends on are all provided by Spark.
5243
exclude group: "com.fasterxml.jackson.core"
5344
}
5445

5546
// Supports reading and writing RDF data. Including this here so it's available to the tests as well.
5647
// Bumped to 5.x, which requires Java 17, while upgrading Spark to 4.x.
57-
shadowDependencies("org.apache.jena:jena-arq:5.5.0") {
48+
implementation("org.apache.jena:jena-arq:5.5.0") {
5849
exclude group: "com.fasterxml.jackson.core"
5950
exclude group: "com.fasterxml.jackson.dataformat"
6051
}
6152

6253
// Needed for some XML operations that are far easier with JDOM2 than with DOM.
63-
shadowDependencies "org.jdom:jdom2:2.0.6.1"
54+
implementation "org.jdom:jdom2:2.0.6.1"
6455

65-
shadowDependencies "dev.langchain4j:langchain4j:${langchain4jVersion}"
66-
67-
// Ensuring the desired version of commons-compress is included in the connector jar. Some tests have failed in Flux
68-
// because an older version - likely the one depended on by Jena - is included instead.
69-
shadowDependencies "org.apache.commons:commons-compress:1.27.1"
56+
implementation ("dev.langchain4j:langchain4j:${langchain4jVersion}") {
57+
exclude group: "com.fasterxml.jackson.core"
58+
}
7059

7160
// Need this so that an OkHttpClientConfigurator can be created.
7261
// Only needs compileOnly, as the Java Client brings this as an implementation dependency.
@@ -78,10 +67,14 @@ dependencies {
7867
// org.junit.platform.commons.JUnitException: TestEngine with ID 'junit-jupiter' failed to discover tests
7968
testRuntimeOnly "org.junit.platform:junit-platform-launcher:1.13.4"
8069

81-
testImplementation "org.apache.spark:spark-sql_2.13:${sparkVersion}"
70+
testImplementation ("org.apache.spark:spark-sql_2.13:${sparkVersion}") {
71+
exclude module: "rocksdbjni"
72+
}
8273

8374
// Supports testing the embedder feature.
84-
testImplementation "dev.langchain4j:langchain4j-embeddings-all-minilm-l6-v2:1.5.0-beta11"
75+
testImplementation ("dev.langchain4j:langchain4j-embeddings-all-minilm-l6-v2:1.5.0-beta11") {
76+
exclude group: "com.fasterxml.jackson.core"
77+
}
8578

8679
testImplementation('com.marklogic:ml-app-deployer:6.0.1') {
8780
exclude group: "com.fasterxml.jackson.core"
@@ -106,7 +99,6 @@ dependencies {
10699
testImplementation "org.springframework:spring-test:6.2.11"
107100

108101
testImplementation "ch.qos.logback:logback-classic:1.5.18"
109-
testImplementation "org.slf4j:jcl-over-slf4j:2.0.17"
110102
testImplementation "org.skyscreamer:jsonassert:1.5.3"
111103

112104
testImplementation "org.apache.tika:tika-parser-microsoft-module:${tikaVersion}"
@@ -126,15 +118,6 @@ test {
126118
]
127119
}
128120

129-
shadowJar {
130-
configurations = [project.configurations.shadowDependencies]
131-
132-
// "all" is the default; no need for that in the connector filename. This also results in this becoming the library
133-
// artifact that is published as a dependency. That is desirable as it includes the relocated packages listed below,
134-
// which a dependent would otherwise have to manage themselves.
135-
archiveClassifier.set("")
136-
}
137-
138121
// Publishing setup - see https://docs.gradle.org/current/userguide/publishing_setup.html .
139122
java {
140123
withJavadocJar()
@@ -152,6 +135,11 @@ javadoc.failOnError = false
152135
// Ignores warnings on params that don't have descriptions, which is a little too noisy
153136
javadoc.options.addStringOption('Xdoclint:none', '-quiet')
154137

138+
// We don't want the shadow jar to be published to a Maven repository.
139+
shadow {
140+
addShadowVariantIntoJavaComponent = false
141+
}
142+
155143
publishing {
156144
publications {
157145
mainJava(MavenPublication) {

0 commit comments

Comments
 (0)