Allow setting a custom CondaEnvironment (manually or automatically) without having to run PythonRunner (apache-spark-on-k8s#260)

dansanduleac · robert3005 · commit 72d0f1e39dca · 2017-09-13T10:04:59.000+01:00
diff --git a/core/src/main/scala/org/apache/spark/deploy/CondaRunner.scala b/core/src/main/scala/org/apache/spark/deploy/CondaRunner.scala
@@ -20,6 +20,7 @@ package org.apache.spark.deploy
 import java.util.concurrent.atomic.AtomicReference
 
 import org.apache.spark.SparkConf
+import org.apache.spark.SparkContext
 import org.apache.spark.api.conda.CondaEnvironment
 import org.apache.spark.api.conda.CondaEnvironmentManager
 import org.apache.spark.internal.Logging
@@ -33,31 +34,46 @@ import org.apache.spark.util.Utils
 abstract class CondaRunner extends Logging {
   final def main(args: Array[String]): Unit = {
     val sparkConf = new SparkConf()
+    run(args, CondaRunner.setupCondaEnvironmentAutomatically(sparkConf))
+  }
 
+  def run(args: Array[String], maybeConda: Option[CondaEnvironment]): Unit
+}
+
+object CondaRunner {
+  private[spark] val condaEnvironment: AtomicReference[Option[CondaEnvironment]] =
+      new AtomicReference(None)
+
+  /**
+   * Sets up a conda environment if [[CondaEnvironmentManager.isConfigured]] returns true.
+   * Once an environment has been set up, calling this method again (or the [[main]] method)
+   * will throw a [[RuntimeException]].
+   */
+  def setupCondaEnvironmentAutomatically(sparkConf: SparkConf): Option[CondaEnvironment] = {
     if (CondaEnvironmentManager.isConfigured(sparkConf)) {
       val condaBootstrapDeps = sparkConf.get(CONDA_BOOTSTRAP_PACKAGES)
       val condaChannelUrls = sparkConf.get(CONDA_CHANNEL_URLS)
       val condaBaseDir = Utils.createTempDir(Utils.getLocalDir(sparkConf), "conda").getAbsolutePath
       val condaEnvironmentManager = CondaEnvironmentManager.fromConf(sparkConf)
       val environment = condaEnvironmentManager
-        .create(condaBaseDir, condaBootstrapDeps, condaChannelUrls)
-
-      // Save this as a global in order for SparkContext to be able to access it later, in case we
-      // are shelling out, but providing a bridge back into this JVM.
-      require(CondaRunner.condaEnvironment.compareAndSet(None, Some(environment)),
-        "Couldn't set condaEnvironment to the newly created environment, it was already set to: "
-          + CondaRunner.condaEnvironment.get())
-
-      run(args, Some(environment))
+                        .create(condaBaseDir, condaBootstrapDeps, condaChannelUrls)
+      setCondaEnvironment(environment)
+      Some(environment)
     } else {
-      run(args, None)
+      None
     }
   }
 
-  def run(args: Array[String], maybeConda: Option[CondaEnvironment]): Unit
-}
-
-object CondaRunner {
-  private[spark] val condaEnvironment: AtomicReference[Option[CondaEnvironment]] =
-      new AtomicReference(None)
+  /**
+   * Sets the given environment as the global environment, which will be accessible by calling
+   * [[SparkContext.condaEnvironment]]. This method can only be called once! If an environment
+   * has already been set, calling this method again will throw a [[RuntimeException]].
+   */
+  def setCondaEnvironment(environment: CondaEnvironment): Unit = {
+    // Save this as a global in order for SparkContext to be able to access it later, in case we
+    // are shelling out, but providing a bridge back into this JVM.
+    require(CondaRunner.condaEnvironment.compareAndSet(None, Some(environment)),
+      "Couldn't set condaEnvironment to the newly created environment, it was already set to: "
+        + CondaRunner.condaEnvironment.get())
+  }
 }