
Commit 6975726

Merge pull request apache-spark-on-k8s#425 from palantir/yh/logging-v2
add safelogging
2 parents 9f68fed + 1369f8d

5 files changed: +127, -30 lines

core/pom.xml

Lines changed: 6 additions & 0 deletions
@@ -416,6 +416,12 @@
       <scope>provided</scope>
     </dependency>

+    <dependency>
+      <groupId>com.palantir.safe-logging</groupId>
+      <artifactId>safe-logging</artifactId>
+      <version>1.5.1</version>
+    </dependency>
+
   </dependencies>
   <build>
     <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
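
If this dependency is consumed from an sbt build instead of Maven, the equivalent declaration would be the single line below (illustrative only; the coordinates are the ones added in the hunk above, while Spark itself builds with Maven):

  libraryDependencies += "com.palantir.safe-logging" % "safe-logging" % "1.5.1"
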
core/src/main/scala/org/apache/spark/internal/SafeLogging.scala

Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.internal
+
+import com.palantir.logsafe.Arg
+import org.slf4j.LoggerFactory
+
+trait SafeLogging {
+  private[this] val log_ = LoggerFactory.getLogger(this.getClass.getName)
+
+  def safeLogInfo(message: String, args: Arg[_]*): Unit = {
+    if (log_.isInfoEnabled) log_.info(message, args: _*)
+  }
+
+  def safeLogInfo(message: String, error: Throwable, args: Arg[_]*): Unit = {
+    if (log_.isInfoEnabled) log_.info(message, args :+ error: _*)
+  }
+
+  def safeLogDebug(message: String, args: Arg[_]*): Unit = {
+    if (log_.isDebugEnabled) log_.debug(message, args: _*)
+  }
+
+  def safeLogDebug(message: String, error: Throwable, args: Arg[_]*): Unit = {
+    if (log_.isDebugEnabled) log_.debug(message, args :+ error: _*)
+  }
+
+  def safeLogTrace(message: String, args: Arg[_]*): Unit = {
+    if (log_.isTraceEnabled) log_.trace(message, args: _*)
+  }
+
+  def safeLogTrace(message: String, error: Throwable, args: Arg[_]*): Unit = {
+    if (log_.isTraceEnabled) log_.trace(message, args :+ error: _*)
+  }
+
+  def safeLogWarning(message: String, args: Arg[_]*): Unit = {
+    if (log_.isWarnEnabled) log_.warn(message, args: _*)
+  }
+
+  def safeLogWarning(message: String, error: Throwable, args: Arg[_]*): Unit = {
+    if (log_.isWarnEnabled) log_.warn(message, args :+ error: _*)
+  }
+
+  def safeLogError(message: String, args: Arg[_]*): Unit = {
+    if (log_.isErrorEnabled) log_.error(message, args: _*)
+  }
+
+  def safeLogError(message: String, error: Throwable, args: Arg[_]*): Unit = {
+    if (log_.isErrorEnabled) log_.error(message, args :+ error: _*)
+  }
+}
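
The new trait mirrors Spark's existing Logging helpers, but each call takes a constant message plus named com.palantir.logsafe.Arg parameters instead of an interpolated string. A minimal usage sketch (the ExampleCaller class and its values are hypothetical, not part of this commit):

import com.palantir.logsafe.{SafeArg, UnsafeArg}
import org.apache.spark.internal.SafeLogging

class ExampleCaller extends SafeLogging {
  def announce(execId: Long, podName: String): Unit = {
    // SafeArg marks values known to be safe to collect and index;
    // UnsafeArg marks values a log pipeline may need to redact.
    safeLogInfo("Created executor pod.",
      SafeArg.of("executorId", execId),
      UnsafeArg.of("podName", podName))
  }

  def fail(execId: Long, cause: Throwable): Unit = {
    // The Throwable overloads append the error after the args; SLF4J treats
    // a trailing Throwable in the parameter array as the exception to log.
    safeLogError("Executor pod failed.", cause, SafeArg.of("executorId", execId))
  }
}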

dev/deps/spark-deps-hadoop-palantir

Lines changed: 1 addition & 0 deletions
@@ -176,6 +176,7 @@ parquet-jackson-1.10.1-palantir.3.jar
 protobuf-java-2.5.0.jar
 py4j-0.10.7.jar
 pyrolite-4.13.jar
+safe-logging-1.5.1.jar
 scala-compiler-2.11.12.jar
 scala-library-2.11.12.jar
 scala-parser-combinators_2.11-1.1.0.jar

resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala

Lines changed: 31 additions & 18 deletions
@@ -18,6 +18,7 @@ package org.apache.spark.scheduler.cluster.k8s

 import java.util.concurrent.atomic.{AtomicInteger, AtomicLong}

+import com.palantir.logsafe.SafeArg
 import io.fabric8.kubernetes.api.model.PodBuilder
 import io.fabric8.kubernetes.client.KubernetesClient
 import scala.collection.mutable
@@ -26,15 +27,15 @@ import org.apache.spark.{SparkConf, SparkException}
 import org.apache.spark.deploy.k8s.Config._
 import org.apache.spark.deploy.k8s.Constants._
 import org.apache.spark.deploy.k8s.KubernetesConf
-import org.apache.spark.internal.Logging
+import org.apache.spark.internal.SafeLogging
 import org.apache.spark.util.{Clock, Utils}

 private[spark] class ExecutorPodsAllocator(
     conf: SparkConf,
     executorBuilder: KubernetesExecutorBuilder,
     kubernetesClient: KubernetesClient,
     snapshotsStore: ExecutorPodsSnapshotsStore,
-    clock: Clock) extends Logging {
+    clock: Clock) extends SafeLogging {

   private val EXECUTOR_ID_COUNTER = new AtomicLong(0L)

@@ -82,10 +83,12 @@ private[spark] class ExecutorPodsAllocator(
     newlyCreatedExecutors.foreach { case (execId, timeCreated) =>
       val currentTime = clock.getTimeMillis()
       if (currentTime - timeCreated > podCreationTimeout) {
-        logWarning(s"Executor with id $execId was not detected in the Kubernetes" +
-          s" cluster after $podCreationTimeout milliseconds despite the fact that a" +
+        safeLogWarning("Executor was not detected in the Kubernetes" +
+          " cluster after timeout despite the fact that a" +
           " previous allocation attempt tried to create it. The executor may have been" +
-          " deleted but the application missed the deletion event.")
+          " deleted but the application missed the deletion event.",
+          SafeArg.of("executorId", execId),
+          SafeArg.of("podCreationTimeoutMs", podCreationTimeout))
         Utils.tryLogNonFatalError {
           kubernetesClient
             .pods()
@@ -94,8 +97,10 @@
         }
         newlyCreatedExecutors -= execId
       } else {
-        logDebug(s"Executor with id $execId was not found in the Kubernetes cluster since it" +
-          s" was created ${currentTime - timeCreated} milliseconds ago.")
+        safeLogDebug("Executor was not found in the Kubernetes cluster since it" +
+          " was created some time ago.",
+          SafeArg.of("executorId", execId),
+          SafeArg.of("timeSinceCreationMs", currentTime - timeCreated))
       }
     }

@@ -112,15 +117,19 @@
       case _ => false
     }
     val currentTotalExpectedExecutors = totalExpectedExecutors.get
-    logDebug(s"Currently have $currentRunningExecutors running executors and" +
-      s" $currentPendingExecutors pending executors. $newlyCreatedExecutors executors" +
-      s" have been requested but are pending appearance in the cluster.")
+    safeLogDebug("Currently have running executors and" +
+      " pending executors. Newly created executors" +
+      " have been requested but are pending appearance in the cluster.",
+      SafeArg.of("numCurrentRunningExecutors", currentRunningExecutors),
+      SafeArg.of("numCurrentPendingExecutors", currentPendingExecutors),
+      SafeArg.of("newlyCreatedExecutors", newlyCreatedExecutors))
     if (newlyCreatedExecutors.isEmpty
       && currentPendingExecutors == 0
       && currentRunningExecutors < currentTotalExpectedExecutors) {
       val numExecutorsToAllocate = math.min(
         currentTotalExpectedExecutors - currentRunningExecutors, podAllocationSize)
-      logInfo(s"Going to request $numExecutorsToAllocate executors from Kubernetes.")
+      safeLogInfo("Going to request executors from Kubernetes.",
+        SafeArg.of("numExecutorsToAllocate", numExecutorsToAllocate))
       for ( _ <- 0 until numExecutorsToAllocate) {
         val newExecutorId = EXECUTOR_ID_COUNTER.incrementAndGet()
         val executorConf = KubernetesConf.createExecutorConf(
@@ -136,18 +145,22 @@
           .build()
         kubernetesClient.pods().create(podWithAttachedContainer)
         newlyCreatedExecutors(newExecutorId) = clock.getTimeMillis()
-        logDebug(s"Requested executor with id $newExecutorId from Kubernetes.")
+        safeLogDebug("Requested executor from Kubernetes.",
+          SafeArg.of("newExecutorId", newExecutorId))
       }
     } else if (currentRunningExecutors >= currentTotalExpectedExecutors) {
       // TODO handle edge cases if we end up with more running executors than expected.
-      logDebug("Current number of running executors is equal to the number of requested" +
+      safeLogDebug("Current number of running executors is equal to the number of requested" +
         " executors. Not scaling up further.")
     } else if (newlyCreatedExecutors.nonEmpty || currentPendingExecutors != 0) {
-      logDebug(s"Still waiting for ${newlyCreatedExecutors.size + currentPendingExecutors}" +
-        s" executors to begin running before requesting for more executors. # of executors in" +
-        s" pending status in the cluster: $currentPendingExecutors. # of executors that we have" +
-        s" created but we have not observed as being present in the cluster yet:" +
-        s" ${newlyCreatedExecutors.size}.")
+      safeLogDebug("Still waiting for" +
+        " executors to begin running before requesting for more executors, including executors" +
+        " in pending status in the cluster, and executors that we have" +
+        " created but we have not observed as being present in the cluster yet.",
+        SafeArg.of("numTotalCurrentWaitingExecutors",
+          newlyCreatedExecutors.size + currentPendingExecutors),
+        SafeArg.of("numCurrentPendingExecutors", currentPendingExecutors),
+        SafeArg.of("numNewlyCreatedExecutors", newlyCreatedExecutors.size))
     }
   }
 }
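
Every log change in this file follows the same mechanical pattern: an interpolated s-string becomes a fixed message template plus named SafeArg parameters, so all occurrences of a given event share one constant message string that log tooling can group and index on. A condensed before/after sketch (hypothetical object, mirroring the logInfo call converted above):

import com.palantir.logsafe.SafeArg
import org.apache.spark.internal.SafeLogging

object MigrationSketch extends SafeLogging {
  def report(numExecutorsToAllocate: Int): Unit = {
    // Before, with the Logging trait (value baked into the message):
    //   logInfo(s"Going to request $numExecutorsToAllocate executors from Kubernetes.")
    // After: one constant template, one named structured parameter.
    safeLogInfo("Going to request executors from Kubernetes.",
      SafeArg.of("numExecutorsToAllocate", numExecutorsToAllocate))
  }
}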

resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsLifecycleManager.scala

Lines changed: 23 additions & 12 deletions
@@ -17,6 +17,7 @@
 package org.apache.spark.scheduler.cluster.k8s

 import com.google.common.cache.Cache
+import com.palantir.logsafe.SafeArg
 import io.fabric8.kubernetes.api.model.Pod
 import io.fabric8.kubernetes.client.KubernetesClient
 import scala.collection.JavaConverters._
@@ -25,7 +26,7 @@ import scala.collection.mutable
 import org.apache.spark.SparkConf
 import org.apache.spark.deploy.k8s.Config._
 import org.apache.spark.deploy.k8s.KubernetesUtils._
-import org.apache.spark.internal.Logging
+import org.apache.spark.internal.SafeLogging
 import org.apache.spark.scheduler.ExecutorExited
 import org.apache.spark.util.Utils

@@ -37,7 +38,7 @@ private[spark] class ExecutorPodsLifecycleManager(
     // job-breaking if we remove executors more than once but it's ideal if we make an attempt
     // to avoid doing so. Expire cache entries so that this data structure doesn't grow beyond
     // bounds.
-    removedExecutorsCache: Cache[java.lang.Long, java.lang.Long]) extends Logging {
+    removedExecutorsCache: Cache[java.lang.Long, java.lang.Long]) extends SafeLogging {

   import ExecutorPodsLifecycleManager._

@@ -57,18 +58,24 @@
     snapshot.executorPods.foreach { case (execId, state) =>
       state match {
         case deleted@PodDeleted(_) =>
-          logDebug(s"Snapshot reported deleted executor with id $execId," +
-            s" pod name ${state.pod.getMetadata.getName}")
+          safeLogDebug(
+            "Snapshot reported deleted executor",
+            SafeArg.of("executorId", execId),
+            SafeArg.of("podName", state.pod.getMetadata.getName))
           removeExecutorFromSpark(schedulerBackend, deleted, execId)
           execIdsRemovedInThisRound += execId
         case failed@PodFailed(_) =>
-          logDebug(s"Snapshot reported failed executor with id $execId," +
-            s" pod name ${state.pod.getMetadata.getName}")
+          safeLogDebug(
+            "Snapshot reported failed executor",
+            SafeArg.of("executorId", execId),
+            SafeArg.of("podName", state.pod.getMetadata.getName))
           onFinalNonDeletedState(failed, execId, schedulerBackend, execIdsRemovedInThisRound)
         case succeeded@PodSucceeded(_) =>
-          logDebug(s"Snapshot reported succeeded executor with id $execId," +
-            s" pod name ${state.pod.getMetadata.getName}. Note that succeeded executors are" +
-            s" unusual unless Spark specifically informed the executor to exit.")
+          safeLogDebug(
+            "Snapshot reported succeeded executor." +
+            " Note that unusual unless Spark specifically informed the executor to exit.",
+            SafeArg.of("executorId", execId),
+            SafeArg.of("podName", state.pod.getMetadata.getName))
           onFinalNonDeletedState(succeeded, execId, schedulerBackend, execIdsRemovedInThisRound)
         case _ =>
       }
@@ -89,7 +96,10 @@
     val exitReasonMessage = s"The executor with ID $missingExecutorId was not found in the" +
       s" cluster but we didn't get a reason why. Marking the executor as failed. The" +
       s" executor may have been deleted but the driver missed the deletion event."
-    logDebug(exitReasonMessage)
+    safeLogDebug("The executor was not found in the" +
+      " cluster but we didn't get a reason why. Marking the executor as failed. The" +
+      " executor may have been deleted but the driver missed the deletion event.",
+      SafeArg.of("missingExecutorId", missingExecutorId))
     val exitReason = ExecutorExited(
       UNKNOWN_EXIT_CODE,
       exitCausedByApp = false,
@@ -101,8 +111,9 @@
     }

     if (execIdsRemovedInThisRound.nonEmpty) {
-      logDebug(s"Removed executors with ids ${execIdsRemovedInThisRound.mkString(",")}" +
-        s" from Spark that were either found to be deleted or non-existent in the cluster.")
+      safeLogDebug("Removed executors" +
+        " from Spark that were either found to be deleted or non-existent in the cluster.",
+        SafeArg.of("executorIdsRemovedInThisRound", execIdsRemovedInThisRound.mkString(",")))
     }
   }
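
One behavioral difference worth noting: Spark's Logging trait takes its message by name (msg: => String), so the old interpolations only ran when the level was enabled, whereas SafeLogging's Arg varargs are evaluated strictly at the call site; the mkString above runs even with debug logging off. A small sketch of the implication (hypothetical object and values):

import com.palantir.logsafe.SafeArg
import org.apache.spark.internal.SafeLogging

object EvaluationSketch extends SafeLogging {
  def demo(removedIds: Seq[Long]): Unit = {
    // removedIds.mkString(",") is computed before safeLogDebug is entered,
    // whether or not debug is enabled; hoist expensive argument expressions
    // behind an explicit level check if this matters on a hot path.
    safeLogDebug("Removed executors.",
      SafeArg.of("executorIds", removedIds.mkString(",")))
  }
}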
