Commit ddbc5c5

Merge pull request #1086 from alexarchambault/package-spark-job
Add --spark option to package sub-command
2 parents 34b3cf8 + 5000acc commit ddbc5c5

File tree

10 files changed: +226 -3 lines changed


modules/cli-options/src/main/scala/scala/cli/commands/PackageOptions.scala

Lines changed: 4 additions & 0 deletions
@@ -42,6 +42,10 @@ final case class PackageOptions(
   @HelpMessage("For assembly JAR, whether to add a bash / bat preamble")
   preamble: Boolean = true,
   @Group("Package")
+  @Hidden
+  @HelpMessage("Generate an assembly JAR for Spark (assembly that doesn't contain Spark, nor any of its dependencies)")
+  spark: Boolean = false,
+  @Group("Package")
   @HelpMessage("Package standalone JARs")
   standalone: Option[Boolean] = None,
   @Recurse

modules/cli/src/main/scala/scala/cli/commands/Package.scala

Lines changed: 23 additions & 1 deletion
@@ -30,6 +30,7 @@ import scala.build.internal.{Runner, ScalaJsLinkerConfig}
 import scala.build.options.{PackageType, Platform}
 import scala.cli.CurrentParams
 import scala.cli.commands.OptionsHelper._
+import scala.cli.commands.packaging.Spark
 import scala.cli.commands.util.MainClassOptionsUtil._
 import scala.cli.commands.util.PackageOptionsUtil._
 import scala.cli.commands.util.SharedOptionsUtil._
@@ -197,6 +198,7 @@ object Package extends ScalaCommand[PackageOptions] {
       case PackageType.SourceJar => ".jar"
       case PackageType.DocJar => ".jar"
       case _: PackageType.Assembly => ".jar"
+      case PackageType.Spark => ".jar"
       case PackageType.Js => ".js"
       case PackageType.Debian => ".deb"
       case PackageType.Dmg => ".dmg"
@@ -214,6 +216,7 @@ object Package extends ScalaCommand[PackageOptions] {
       case PackageType.SourceJar => "source.jar"
       case PackageType.DocJar => "scaladoc.jar"
       case _: PackageType.Assembly => "app.jar"
+      case PackageType.Spark => "job.jar"
      case PackageType.Js => "app.js"
       case PackageType.Debian => "app.deb"
       case PackageType.Dmg => "app.dmg"
@@ -294,12 +297,30 @@ object Package extends ScalaCommand[PackageOptions] {
             build,
             destPath,
             value(mainClass),
+            Nil,
             withPreamble = a.addPreamble,
             () => alreadyExistsCheck(),
             logger
           )
         }
         destPath
+      case PackageType.Spark =>
+        value {
+          assembly(
+            build,
+            destPath,
+            value(mainClass),
+            // The Spark modules are assumed to be already on the class path,
+            // along with all their transitive dependencies (originating from
+            // the Spark distribution), so we don't include any of them in the
+            // assembly.
+            Spark.sparkModules,
+            withPreamble = false,
+            () => alreadyExistsCheck(),
+            logger
+          )
+        }
+        destPath
 
       case PackageType.Js =>
         value(buildJs(build, destPath, value(mainClass), logger))
@@ -696,6 +717,7 @@ object Package extends ScalaCommand[PackageOptions] {
     build: Build.Successful,
     destPath: os.Path,
     mainClass: String,
+    extraProvided: Seq[dependency.AnyModule],
     withPreamble: Boolean,
     alreadyExistsCheck: () => Unit,
     logger: Logger
@@ -712,7 +734,7 @@ object Package extends ScalaCommand[PackageOptions] {
         (ent, content)
       }
 
-    val provided = build.options.notForBloopOptions.packageOptions.provided
+    val provided = build.options.notForBloopOptions.packageOptions.provided ++ extraProvided
     val allFiles = build.artifacts.artifacts.map(_._2)
     val files =
       if (provided.isEmpty) allFiles
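The comment in the new `PackageType.Spark` case and the last hunk above capture the key idea: Spark and its transitive dependencies come from the Spark distribution at run time, so the assembly step subtracts them from the build's class path. A minimal sketch of that subtraction, with hypothetical names (`classPath`, `providedFiles`) standing in for the values the real `assembly` method derives from the kept coursier resolution:

```scala
object AssemblyClassPathSketch {
  // Sketch only (not code from this commit): `providedFiles` stands in for the
  // set of JARs pulled in by the provided modules (Spark and its transitive
  // dependencies), which the real code derives from the kept resolution.
  def assemblyClassPath(
    classPath: Seq[os.Path],     // every resolved artifact of the build
    providedFiles: Set[os.Path]  // artifacts spark-submit already provides at run time
  ): Seq[os.Path] =
    if (providedFiles.isEmpty) classPath
    else classPath.filterNot(providedFiles.contains)
}
```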
modules/cli/src/main/scala/scala/cli/commands/packaging/Spark.scala

Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
+package scala.cli.commands.packaging
+
+import dependency._
+
+object Spark {
+
+  private def names = Seq(
+    // FIXME Add more?
+    // (see "cs complete-dependency org.apache.spark: | grep '_2\.12$'"
+    // or `ls "$(cs get https://archive.apache.org/dist/spark/spark-2.4.2/spark-2.4.2-bin-hadoop2.7.tgz --archive)"/*/jars | grep '^spark-'`)
+    "core",
+    "graphx",
+    "hive",
+    "hive-thriftserver",
+    "kubernetes",
+    "mesos",
+    "mllib",
+    "repl",
+    "sql",
+    "streaming",
+    "yarn"
+  )
+
+  def sparkModules: Seq[AnyModule] =
+    names.map(name => mod"org.apache.spark::spark-$name")
+}
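For reference, a small usage sketch of the new object (the printed form of each module depends on the `dependency` library's `toString`, so the output mentioned in the comment is an assumption): the `::` separator marks each module as cross-versioned, so the Scala binary suffix (for example `_2.12`) is appended when these modules are matched against the build's resolution.

```scala
import scala.cli.commands.packaging.Spark

object SparkModulesDemo {
  def main(args: Array[String]): Unit =
    // One line per Spark module, e.g. something like "org.apache.spark::spark-core"
    // (the exact rendering depends on the dependency library).
    Spark.sparkModules.foreach(println)
}
```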

modules/cli/src/main/scala/scala/cli/commands/util/PackageOptionsUtil.scala

Lines changed: 4 additions & 2 deletions
@@ -20,6 +20,7 @@ object PackageOptionsUtil {
     if (v.library) Some(PackageType.LibraryJar)
     else if (source) Some(PackageType.SourceJar)
     else if (assembly) Some(PackageType.Assembly(addPreamble = preamble))
+    else if (spark) Some(PackageType.Spark)
     else if (deb) Some(PackageType.Debian)
     else if (dmg) Some(PackageType.Dmg)
     else if (pkg) Some(PackageType.Pkg)
@@ -90,8 +91,9 @@
       )
     ),
     internal = baseOptions.internal.copy(
-      // computing the provided modules sub-graph needs the final Resolution instance
-      keepResolution = provided.nonEmpty
+      // computing the provided modules sub-graph needs the final Resolution instance
+      // Spark packaging adds provided modules, so it needs it too
+      keepResolution = provided.nonEmpty || packageTypeOpt.contains(PackageType.Spark)
     ),
     internalDependencies = baseOptions.internalDependencies.copy(
       addRunnerDependencyOpt = Some(false)
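The updated comment explains why `keepResolution` must now also be set for Spark packaging: the provided-modules sub-graph is computed from the final Resolution. A self-contained sketch of that idea, assuming the resolution is available as a plain adjacency map rather than coursier's actual `Resolution` type:

```scala
import scala.annotation.tailrec

object ProvidedSubGraphSketch {
  // Transitive closure of the provided roots over a resolved dependency graph.
  // `graph` maps a module to its direct dependencies, a stand-in for the data
  // the kept Resolution gives access to; everything reachable from the provided
  // roots is what the assembly step leaves out.
  @tailrec
  def closure(
    graph: Map[String, Seq[String]],
    toVisit: List[String],
    acc: Set[String] = Set.empty
  ): Set[String] =
    toVisit match {
      case Nil                 => acc
      case m :: rest if acc(m) => closure(graph, rest, acc)
      case m :: rest           => closure(graph, graph.getOrElse(m, Nil).toList ::: rest, acc + m)
    }
}
```

Everything `closure` returns would then be dropped from the assembly, which is what the `extraProvided` parameter added to `assembly` above enables for the Spark modules.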
Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+package scala.cli.integration
+
+class SparkTestDefinitions extends munit.FunSuite {
+
+  protected lazy val extraOptions: Seq[String] = TestUtil.extraOptions
+
+}
Lines changed: 98 additions & 0 deletions
@@ -0,0 +1,98 @@
+package scala.cli.integration
+
+import com.eed3si9n.expecty.Expecty.expect
+
+import java.io.File
+
+import scala.util.Properties
+
+object SparkTests212 {
+
+  private final case class Spark(sparkVersion: String, scalaVersion: String) {
+    def sparkHome(): os.Path = {
+      val url =
+        s"https://archive.apache.org/dist/spark/spark-$sparkVersion/spark-$sparkVersion-bin-hadoop2.7.tgz"
+      val dirName = url.drop(url.lastIndexOf('/') + 1).stripSuffix(".tgz")
+      val baseDir =
+        os.Path(os.proc(TestUtil.cs, "get", "--archive", url).call().out.text().trim, os.pwd)
+      baseDir / dirName
+    }
+  }
+
+  private val spark30 = Spark(
+    "3.0.3",
+    // The spark distribution actually ships with Scala 2.12.10, but we run into #1092 if we use it here
+    "2.12.15"
+  )
+
+  private val spark24 = Spark(
+    "2.4.2",
+    // The spark distribution actually ships with Scala 2.12.8, but we run into #1092 if we use it here
+    "2.12.15"
+  )
+
+}
+
+class SparkTests212 extends SparkTestDefinitions {
+
+  import SparkTests212._
+
+  def simplePackageSparkJobTest(spark: Spark): Unit = {
+    val master = "local[4]"
+    val inputs = TestInputs(
+      Seq(
+        os.rel / "SparkJob.scala" ->
+          s"""//> using lib "org.apache.spark::spark-sql:${spark.sparkVersion}"
+             |//> using scala "${spark.scalaVersion}"
+             |
+             |import org.apache.spark._
+             |import org.apache.spark.sql._
+             |
+             |object SparkJob {
+             |  def main(args: Array[String]): Unit = {
+             |    val spark = SparkSession.builder()
+             |      .appName("Test job")
+             |      .getOrCreate()
+             |    import spark.implicits._
+             |    def sc = spark.sparkContext
+             |    val accum = sc.longAccumulator
+             |    sc.parallelize(1 to 10).foreach(x => accum.add(x))
+             |    println("Result: " + accum.value)
+             |  }
+             |}
+             |""".stripMargin
+      )
+    )
+    inputs.fromRoot { root =>
+      val dest = os.rel / "SparkJob.jar"
+      os.proc(TestUtil.cli, "package", extraOptions, "--spark", "--jvm", "8", ".", "-o", dest)
+        .call(cwd = root)
+
+      val java8Home =
+        os.Path(os.proc(TestUtil.cs, "java-home", "--jvm", "8").call().out.trim(), os.pwd)
+
+      val ext = if (Properties.isWin) ".cmd" else ""
+      val res =
+        os.proc(spark.sparkHome() / "bin" / s"spark-submit$ext", "--master", master, dest).call(
+          cwd = root,
+          env = Map(
+            "JAVA_HOME" -> java8Home.toString,
+            "PATH" -> ((java8Home / "bin").toString + File.pathSeparator + System.getenv("PATH"))
+          )
+        )
+
+      val expectedOutput = "Result: 55"
+
+      expect(res.out.trim() == expectedOutput)
+    }
+  }
+
+  test("spark 2.4") {
+    simplePackageSparkJobTest(spark24)
+  }
+
+  test("spark 3.0") {
+    simplePackageSparkJobTest(spark30)
+  }
+
+}
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+package scala.cli.integration
+
+class SparkTests213 extends SparkTestDefinitions

modules/options/src/main/scala/scala/build/options/PackageType.scala

Lines changed: 4 additions & 0 deletions
@@ -19,6 +19,9 @@ object PackageType {
   final case class Assembly(addPreamble: Boolean) extends PackageType {
     override def runnable = Some(addPreamble)
   }
+  case object Spark extends PackageType {
+    override def runnable = Some(false)
+  }
   case object Js extends PackageType
   case object Native extends PackageType {
     override def runnable = Some(true)
@@ -42,6 +45,7 @@
     "library" -> LibraryJar,
     "source" -> SourceJar,
     "doc" -> DocJar,
+    "spark" -> Spark,
     "js" -> Js,
     "native" -> Native,
     "docker" -> Docker,

website/docs/reference/cli-options.md

Lines changed: 4 additions & 0 deletions
@@ -866,6 +866,10 @@ Generate an assembly JAR
 
 For assembly JAR, whether to add a bash / bat preamble
 
+#### `--spark`
+
+Generate an assembly JAR for Spark (assembly that doesn't contain Spark, nor any of its dependencies)
+
 #### `--standalone`
 
 Package standalone JARs

website/src/pages/spark.md

Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
+# Experimental Spark features
+
+import {ChainedSnippets, GiflikeVideo} from "../../src/components/MarkdownComponents.js";
+
+## Packaging
+
+The `package` sub-command offers to package Scala CLI projects as JARs ready to be passed
+to `spark-submit`, and optimized for it.
+
+<ChainedSnippets>
+
+```scala title=SparkJob.scala
+//> using lib "org.apache.spark::spark-sql:3.0.3"
+//> using scala "2.12.15"
+
+import org.apache.spark._
+import org.apache.spark.sql._
+
+object SparkJob {
+  def main(args: Array[String]): Unit = {
+    val spark = SparkSession.builder()
+      .appName("Test job")
+      .getOrCreate()
+    import spark.implicits._
+    def sc = spark.sparkContext
+    val accum = sc.longAccumulator
+    sc.parallelize(1 to 10).foreach(x => accum.add(x))
+    println("Result: " + accum.value)
+  }
+}
+```
+
+```bash
+scala-cli package --spark SparkJob.scala -o spark-job.jar
+```
+
+```text
+Compiling project (Scala 2.12.15, JVM)
+Compiled project (Scala 2.12.15, JVM)
+Wrote spark-job.jar
+```
+
+```bash
+spark-submit spark-job.jar
+```
+
+```text
+
+Result: 55
+
+```
+
+</ChainedSnippets>
