-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbuild.sbt
More file actions
145 lines (133 loc) · 7.51 KB
/
build.sbt
File metadata and controls
145 lines (133 loc) · 7.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
// Learn more in: https://www.scala-sbt.org/1.x/docs/sbt-by-example.html
// Brings sbt-assembly keys (assembly, assemblyShadeRules, ...) into scope.
import sbtassembly.AssemblyPlugin.autoImport._
ThisBuild / scalaVersion := "2.13.16"
ThisBuild / version := "0.1.0-SNAPSHOT"
/* kind-projector is used to handle GeoTrellis source code
 * (`[K: * => SpatialKey, V: * => Tile]`) in ExtendedIterativeCostDistance.scala */
addCompilerPlugin(
  // `cross CrossVersion.full` pins the plugin artifact to the exact Scala
  // patch version (compiler plugins are compiled against a full version).
  "org.typelevel" % "kind-projector" % "0.13.3" cross CrossVersion.full
)
// See: https://www.scalatest.org/user_guide/using_scalatest_with_sbt
Test / logBuffered := false
// Parallel testing is off because Spark operates under the assumption of one SparkSession per JVM
Test / parallelExecution := false
// Fork tests so module-opening JVM flags apply to the actual Spark test process.
Test / fork := true
// JPMS flags opening/exporting JDK internals to unnamed-module code.
// NOTE(review): the original comment here was garbled ("Include old libraries in
// import resolution to class not found errors"); these flags presumably exist to
// let Spark's reflective access work on JDK 17+ — confirm against the JDK in use.
Test / javaOptions ++= Seq(
  "--add-opens=java.base/java.lang.invoke=ALL-UNNAMED",
  "--add-opens=java.base/java.nio=ALL-UNNAMED",
  "--add-exports=java.base/sun.security.action=ALL-UNNAMED"
)
// Variable `logFileSuffix` is used to have distinct log-file name for test runs
// See: `./src/main/resources/log4j2.properties`
Test / javaOptions += "-DlogFileSuffix=test"
// scala3-migration warnings-as-errors hindered the debug-console experience, so they are disabled
// ThisBuild / scalacOptions ++= Seq(
//   "-Wconf:cat=scala3-migration:s"
// )
/* NOTE: Why we shade Spire (and algebra):
 * I got hit with `java.lang.NoSuchMethodError: 'spire.math.IntIsIntegral spire.math.Integral$.IntIsIntegral()'`
 * when submitting a geotrellis-spark dependent program to my standalone Spark cluster (Ex07RddProgram)
 * After numerous debugging sessions, and after analyzing the dependency graph (using `sbt dependencyDot`),
 * I (and ChatGPT) figured out that the problem was in a "dependency-race"
 * between Spark's MLlib bundled Spire library (0.18.0) and GeoTrellis `sbt-assembly` bundled Spire library (0.17.0).
 * Because Spark is `% provided` this conflict didn't surface in `sbt evicted`, but only at runtime.
 * At runtime, Spark would load MLlib's Spire (0.18.0) BEFORE the driver had a chance to
 * load geotrellis-spark and its Spire (0.17.0) library from the fat-jar.
 *
 * One of the ways ChatGPT diagnosed this is by looking at the JARs bundled in the Spark distribution, using this command:
 * ```
 * ls -1 $SPARK_HOME/jars | grep -i spire || true
 * sbt evicted | grep -i spire -n
 * ```
 * To get better context into this hellish bug, view the dependency graph dot file interactively
 * using `sbt dependencyDot` and the installed `Graphviz Interactive Preview` VSCode extension */
lazy val shadePrefix = "gtshade" // geotrellis shaded package
// Shading rewrites the bundled classes' package names inside the fat jar, so the
// jar's Spire/algebra can never collide with the versions Spark ships at runtime.
ThisBuild / assemblyShadeRules := Seq(
  // relocate Spire: spire.** -> gtshade.spire.** (`@1` keeps the remainder of the path)
  ShadeRule.rename("spire.**" -> s"$shadePrefix.spire.@1").inAll,
  // Spire depends on algebra; relocating it avoids similar collisions
  ShadeRule.rename("algebra.**" -> s"$shadePrefix.algebra.@1").inAll
)
/* WARNING: Technically we got another problem like this between
 * `scala-collection-compat` 2.2.0 (expected by GeoTrellis, packaged in the fat jar)
 * and Spark provided jar `scala-collection-compat` 2.7.0
 * Run `ls -1 $SPARK_HOME/jars | grep -i 'scala-collection-compat' || true` to see the conflicting jar file.
 * WARNING: Uncomment if you see any sign of trouble!
 * dependencyOverrides += "org.scala-lang.modules" %% "scala-collection-compat" % "2.7.0" */
// Shared dependency list, reused by the `root` and `worksheets` projects.
lazy val deps = Seq(
  // For Ex02 and beyond
  // For logging, use the same log4j version as Apache Spark
  // TODO: validate log4j version — presumably chosen to match Spark 3.5.x's
  // bundled log4j; confirm with `ls $SPARK_HOME/jars | grep -i log4j`.
  "org.apache.logging.log4j" % "log4j-api" % "2.20.0",
  "org.apache.logging.log4j" % "log4j-core" % "2.20.0",
  "com.github.tototoshi" %% "scala-csv" % "2.0.0",
  "com.lihaoyi" %% "pprint" % "0.9.6",
  // For Ex05 and Ex07; See: https://github.com/locationtech/geotrellis#getting-started
  "org.locationtech.geotrellis" %% "geotrellis-raster" % "3.8.0",
  // For Ex05_5 and beyond
  // NOTE: "provided" is used to avoid packaging Spark with the application JAR
  // as it is expected to be available in the Spark runtime environment.
  // See: https://www.scala-sbt.org/1.x/docs/Scopes.html
  "org.apache.spark" %% "spark-core" % "3.5.5" % "provided",
  "org.apache.spark" %% "spark-sql" % "3.5.5" % "provided",
  // Used to access OS-Lib (os.<*>) and launch sbt processes in Spark exercises
  "org.scala-lang" %% "toolkit" % "0.7.0",
  // For Ex07 basics — keep all geotrellis modules on the same version.
  "org.locationtech.geotrellis" %% "geotrellis-layer" % "3.8.0",
  "org.locationtech.geotrellis" %% "geotrellis-spark" % "3.8.0",
  "org.locationtech.geotrellis" %% "geotrellis-vector" % "3.8.0",
  // For Ex07 complex cost analysis
  // `sedona-spark-shaded-3.5` results in a 10MB bigger assembly artifact (173MB->183MB)
  // compared to `sedona-spark-3.5` but results in a more stable fat-jar.
  "org.apache.sedona" %% "sedona-spark-shaded-3.5" % "1.8.1",
  "org.datasyslab" % "geotools-wrapper" % "1.8.1-33.1",
  // For testing Ex07 custom algorithms
  "org.scalatest" %% "scalatest" % "3.2.19" % "test"
)
lazy val root = (project in file("."))
  .settings(
    name := "learning-scala",
    // GeoTrellis 3.8.0 pulls in a newer scala-library than some other pinned
    // deps expect; allow the scala-library bump so the eviction check passes.
    // NOTE(review): the previous comment claimed 2.13.8 was in use, which
    // contradicts `ThisBuild / scalaVersion := "2.13.16"` above — confirm
    // which Scala patch version is actually intended for Spark/Almond.
    allowUnsafeScalaLibUpgrade := true,
    Compile / mainClass := Some("learningscala.Main"),
    libraryDependencies ++= deps,
    // --- sbt-assembly configuration ---
    // Name the resulting fat jar predictably: learning-scala-assembly-<version>.jar
    assembly / assemblyJarName := s"${name.value}-assembly-${version.value}.jar",
    // Ensure the assembled jar has the right entrypoint when needed
    assembly / mainClass := Some("learningscala.Main"),
    // Caching (2x performance on cache hits); see: https://github.com/sbt/sbt-assembly?tab=readme-ov-file#jar-assembly-performance
    ThisBuild / assemblyRepeatableBuild := true,
    // Merge strategy to avoid weird conflicts (META-INF, services, HOCON, module-info)
    // See https://github.com/sbt/sbt-assembly?tab=readme-ov-file#merge-strategy
    // FIX: earlier revisions listed extra META-INF cases (native-image/, maven/,
    // licenses/, NOTICE, LICENSE) *after* the catch-all `PathList("META-INF", ...)`
    // case below, making them unreachable dead code. They all discarded — exactly
    // what the inner `case _` already does — so removing them preserves behavior.
    assembly / assemblyMergeStrategy := {
      case PathList("META-INF", xs @ _*) =>
        xs match {
          case Seq("MANIFEST.MF") | Seq("INDEX.LIST") | Seq("DEPENDENCIES") =>
            MergeStrategy.discard
          // multi-release jar class directories: keep the first copy
          case Seq("versions", _ @_*) => MergeStrategy.first
          case Seq("plexus", _ @_*) => MergeStrategy.discard
          // ServiceLoader registrations must be concatenated, not picked first
          case Seq("services", _ @_*) => MergeStrategy.concat
          // Everything else under META-INF (maven/, licenses/, native-image/,
          // NOTICE*, LICENSE*, signature files, ...) is metadata we can drop.
          case _ => MergeStrategy.discard
        }
      case "module-info.class" => MergeStrategy.discard
      // HOCON configs must be concatenated or libraries lose their settings
      case "reference.conf" => MergeStrategy.concat
      case "application.conf" => MergeStrategy.concat
      // Prefer the first occurrence for everything else
      case _ => MergeStrategy.first
    }
  )
// Scratch-pad subproject for `.sc` worksheet files; shares `deps` with `root`.
lazy val worksheets = (project in file("worksheets"))
  .settings(
    // Never build/package a fat jar for worksheets.
    assembly / skip := true,
    Compile / scalaSource := baseDirectory.value, // Place `worksheet.sc` files under ./worksheets/
    libraryDependencies ++= deps
  )