Skip to content

Commit 5bfe11c

Browse files
committed
MLE-24402 Trying out Spark 4.1.0 preview1
This is tempting to start using now because it requires fewer dependency alterations to minimize CVEs. Tests all appear to be fine. And 4.1.0 is using Jackson 2.19.0, which avoids a lot of messing around.
1 parent ca98049 commit 5bfe11c

File tree

3 files changed

+33
-68
lines changed

3 files changed

+33
-68
lines changed

build.gradle

Lines changed: 3 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -42,23 +42,11 @@ subprojects {
4242

4343
configurations.all {
4444
resolutionStrategy.eachDependency { DependencyResolveDetails details ->
45+
// These all impact Spark and its dependencies, but not the published connector as the connector does not contain
46+
// any Spark libraries.
4547
if (details.requested.group.equals("org.apache.hadoop") and details.requested.version.equals("3.4.1")) {
4648
details.useVersion "3.4.2"
47-
details.because "Using 3.4.2 to minimize CVEs and because Flux is doing the same thing. This only affects the connector tests."
48-
}
49-
if (details.requested.group.startsWith('com.fasterxml.jackson')) {
50-
details.useVersion '2.18.2'
51-
details.because 'Need to match the version used by Spark 4.0.1.'
52-
}
53-
if (details.requested.group.equals("org.slf4j")) {
54-
details.useVersion "2.0.17"
55-
details.because "Ensures that slf4j-api 1.x does not appear on the Flux classpath in particular, which can " +
56-
"lead to this issue - https://www.slf4j.org/codes.html#StaticLoggerBinder."
57-
}
58-
if (details.requested.group.equals("org.apache.logging.log4j")) {
59-
details.useVersion "2.24.3"
60-
details.because "Need to match the version used by Apache Tika. Spark uses 2.20.0 but automated tests confirm " +
61-
"that Spark seems fine with 2.24.3."
49+
details.because "Using 3.4.2 to minimize CVEs and because Flux is doing the same thing."
6250
}
6351
if (details.requested.group.equals("org.codehaus.janino")) {
6452
details.useVersion "3.1.12"
@@ -69,18 +57,6 @@ subprojects {
6957
details.because "Bumping from 4.1.118 (what Spark SQL 4.0.1 depends on) to 4.1.127 to minimize CVEs."
7058
}
7159
}
72-
73-
resolutionStrategy {
74-
// Avoids a classpath conflict between Spark and the tika-parser-microsoft-module. Tika needs a
75-
// more recent version and Spark (and Jena as well) both seem fine with this (as they should be per semver).
76-
force "org.apache.commons:commons-compress:1.27.1"
77-
78-
// Avoids CVEs in earlier minor versions.
79-
force "org.apache.commons:commons-lang3:3.18.0"
80-
}
81-
82-
// Excluded from Flux for size reasons, so excluded here as well to ensure we don't need it when running tests.
83-
exclude module: "rocksdbjni"
8460
}
8561

8662
test {

gradle.properties

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
version=3.0-SNAPSHOT
2-
sparkVersion=4.0.1
2+
sparkVersion=4.1.0-preview1
33
tikaVersion=3.2.3
44
semaphoreVersion=5.10.0
55
langchain4jVersion=1.5.0

marklogic-spark-connector/build.gradle

Lines changed: 29 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,72 +1,62 @@
11
plugins {
2-
id 'com.gradleup.shadow' version '8.3.3'
2+
id 'com.gradleup.shadow' version '9.2.1'
33
id 'maven-publish'
44
}
55

6-
configurations {
7-
// Defines all the implementation dependencies, but in such a way that they are not included as dependencies in the
8-
// library's pom.xml file. This is due to the shadow jar being published instead of a jar only containing this
9-
// project's classes. The shadow jar is published due to the need to relocate several packages to avoid conflicts
10-
// with Spark.
11-
shadowDependencies
12-
13-
// This approach allows for all of the dependencies to be available for compilation and for running tests.
14-
compileOnly.extendsFrom(shadowDependencies)
15-
testImplementation.extendsFrom(compileOnly)
16-
}
17-
186
dependencies {
197
// Need to compile against Spark, but its libraries are not part of the connector jar.
20-
compileOnly "org.apache.spark:spark-sql_2.13:${sparkVersion}"
8+
compileOnly ("org.apache.spark:spark-sql_2.13:${sparkVersion}") {
9+
// Excluded from Flux for size reasons, so excluded here as well to ensure we don't need it when running tests.
10+
exclude module: "rocksdbjni"
11+
}
2112

2213
// This is compileOnly as Spark will provide its own copy at runtime.
2314
compileOnly "com.fasterxml.jackson.core:jackson-databind:2.18.2"
2415

25-
shadowDependencies("com.marklogic:marklogic-client-api:7.2.0") {
16+
// Using 'api' so that Flux does not need to declare this as well.
17+
api("com.marklogic:marklogic-client-api:7.2.0") {
2618
// Need to use the versions of Jackson preferred by Spark.
2719
exclude group: "com.fasterxml.jackson.core"
2820
exclude group: "com.fasterxml.jackson.dataformat"
2921
}
3022

3123
// For logging.
32-
shadowDependencies "org.slf4j:jcl-over-slf4j:2.0.17"
24+
implementation "org.slf4j:jcl-over-slf4j:2.0.17"
3325

3426
// Needed for splitting XML documents via XPath.
35-
shadowDependencies "jaxen:jaxen:2.0.0"
27+
implementation "jaxen:jaxen:2.0.0"
3628

3729
// Needed for classifying documents via Semaphore.
38-
shadowDependencies("com.smartlogic.csclient:Semaphore-CS-Client:${semaphoreVersion}") {
30+
implementation("com.smartlogic.csclient:Semaphore-CS-Client:${semaphoreVersion}") {
3931
exclude group: "com.fasterxml.jackson.core"
4032
}
41-
shadowDependencies("com.smartlogic.cloud:Semaphore-Cloud-Client:${semaphoreVersion}") {
33+
implementation("com.smartlogic.cloud:Semaphore-Cloud-Client:${semaphoreVersion}") {
4234
exclude group: "com.fasterxml.jackson.core"
4335
}
4436

4537
// Adding this in 2.6.0. tika-core is very small and only brings in commons-io and slf4j-api. Flux can then
4638
// include the necessary parsers.
47-
shadowDependencies "org.apache.tika:tika-core:${tikaVersion}"
39+
implementation "org.apache.tika:tika-core:${tikaVersion}"
4840

4941
// Needed for using XmlMapper.
50-
shadowDependencies("com.fasterxml.jackson.dataformat:jackson-dataformat-xml:2.18.2") {
42+
implementation("com.fasterxml.jackson.dataformat:jackson-dataformat-xml:2.19.0") {
5143
// Not needed, as the modules in this group that this dependency depends on are all provided by Spark.
5244
exclude group: "com.fasterxml.jackson.core"
5345
}
5446

5547
// Supports reading and writing RDF data. Including this here so it's available to the tests as well.
5648
// Bumped to 5.x, which requires Java 17, while upgrading Spark to 4.x.
57-
shadowDependencies("org.apache.jena:jena-arq:5.5.0") {
49+
implementation("org.apache.jena:jena-arq:5.5.0") {
5850
exclude group: "com.fasterxml.jackson.core"
5951
exclude group: "com.fasterxml.jackson.dataformat"
6052
}
6153

6254
// Needed for some XML operations that are far easier with JDOM2 than with DOM.
63-
shadowDependencies "org.jdom:jdom2:2.0.6.1"
55+
implementation "org.jdom:jdom2:2.0.6.1"
6456

65-
shadowDependencies "dev.langchain4j:langchain4j:${langchain4jVersion}"
66-
67-
// Ensuring the desired version of commons-compress is included in the connector jar. Some tests have failed in Flux
68-
// because an older version - likely the one depended on by Jena - is included instead.
69-
shadowDependencies "org.apache.commons:commons-compress:1.27.1"
57+
implementation ("dev.langchain4j:langchain4j:${langchain4jVersion}") {
58+
exclude group: "com.fasterxml.jackson.core"
59+
}
7060

7161
// Need this so that an OkHttpClientConfigurator can be created.
7262
// Only needs compileOnly, as the Java Client brings this as an implementation dependency.
@@ -78,10 +68,14 @@ dependencies {
7868
// org.junit.platform.commons.JUnitException: TestEngine with ID 'junit-jupiter' failed to discover tests
7969
testRuntimeOnly "org.junit.platform:junit-platform-launcher:1.13.4"
8070

81-
testImplementation "org.apache.spark:spark-sql_2.13:${sparkVersion}"
71+
testImplementation ("org.apache.spark:spark-sql_2.13:${sparkVersion}") {
72+
exclude module: "rocksdbjni"
73+
}
8274

8375
// Supports testing the embedder feature.
84-
testImplementation "dev.langchain4j:langchain4j-embeddings-all-minilm-l6-v2:1.5.0-beta11"
76+
testImplementation ("dev.langchain4j:langchain4j-embeddings-all-minilm-l6-v2:1.5.0-beta11") {
77+
exclude group: "com.fasterxml.jackson.core"
78+
}
8579

8680
testImplementation('com.marklogic:ml-app-deployer:6.0.1') {
8781
exclude group: "com.fasterxml.jackson.core"
@@ -106,7 +100,6 @@ dependencies {
106100
testImplementation "org.springframework:spring-test:6.2.11"
107101

108102
testImplementation "ch.qos.logback:logback-classic:1.5.18"
109-
testImplementation "org.slf4j:jcl-over-slf4j:2.0.17"
110103
testImplementation "org.skyscreamer:jsonassert:1.5.3"
111104

112105
testImplementation "org.apache.tika:tika-parser-microsoft-module:${tikaVersion}"
@@ -126,15 +119,6 @@ test {
126119
]
127120
}
128121

129-
shadowJar {
130-
configurations = [project.configurations.shadowDependencies]
131-
132-
// "all" is the default; no need for that in the connector filename. This also results in this becoming the library
133-
// artifact that is published as a dependency. That is desirable as it includes the relocated packages listed below,
134-
// which a dependent would otherwise have to manage themselves.
135-
archiveClassifier.set("")
136-
}
137-
138122
// Publishing setup - see https://docs.gradle.org/current/userguide/publishing_setup.html .
139123
java {
140124
withJavadocJar()
@@ -152,6 +136,11 @@ javadoc.failOnError = false
152136
// Ignores warnings on params that don't have descriptions, which is a little too noisy
153137
javadoc.options.addStringOption('Xdoclint:none', '-quiet')
154138

139+
// We don't want the shadow jar to be published to a Maven repository.
140+
shadow {
141+
addShadowVariantIntoJavaComponent = false
142+
}
143+
155144
publishing {
156145
publications {
157146
mainJava(MavenPublication) {

0 commit comments

Comments
 (0)