@@ -66,16 +66,10 @@ private[spark] class KubernetesClusterSchedulerBackend(
   private implicit val requestExecutorContext = ExecutionContext.fromExecutorService(
     requestExecutorsService)
 
-  private val driverPod = try {
-    kubernetesClient.pods()
-      .inNamespace(kubernetesNamespace)
-      .withName(kubernetesDriverPodName)
-      .get()
-  } catch {
-    case throwable: Throwable =>
-      logError(s"Executor cannot find driver pod.", throwable)
-      throw new SparkException(s"Executor cannot find driver pod", throwable)
-  }
+  private val driverPod = kubernetesClient.pods()
+    .inNamespace(kubernetesNamespace)
+    .withName(kubernetesDriverPodName)
+    .get()
 
   override val minRegisteredRatio =
     if (conf.getOption("spark.scheduler.minRegisteredResourcesRatio").isEmpty) {
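For reference, a minimal standalone sketch of the simplified driver-pod lookup in the hunk above, assuming the older fabric8 Kubernetes client API this backend builds on; the client construction, namespace "default", and pod name "spark-driver-example" are illustrative assumptions, not values from this change. With the try/catch removed, any KubernetesClientException now propagates to the caller instead of being wrapped in a SparkException.

    import io.fabric8.kubernetes.api.model.Pod
    import io.fabric8.kubernetes.client.DefaultKubernetesClient

    object DriverPodLookupSketch {
      def main(args: Array[String]): Unit = {
        // Assumed client setup: picks up in-cluster config or the local kubeconfig.
        val client = new DefaultKubernetesClient()

        // Same fluent lookup as the new code; .get() returns null if the pod is absent,
        // and client errors surface directly as exceptions.
        val driverPod: Pod = client.pods()
          .inNamespace("default")            // assumed namespace
          .withName("spark-driver-example")  // assumed driver pod name
          .get()

        println(Option(driverPod).map(_.getMetadata.getName).getOrElse("driver pod not found"))
        client.close()
      }
    }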
@@ -142,13 +136,16 @@ private[spark] class KubernetesClusterSchedulerBackend(
         knownExitReason.fold {
           removeExecutorOrIncrementLossReasonCheckCount(executorId)
         } { executorExited =>
-          logDebug(s"Removing executor $executorId with loss reason " + executorExited.message)
+          logWarning(s"Removing executor $executorId with loss reason " + executorExited.message)
           removeExecutor(executorId, executorExited)
           // We keep around executors that have exit conditions caused by the application. This
           // allows them to be debugged later on. Otherwise, mark them as to be deleted from the
           // the API server.
           if (!executorExited.exitCausedByApp) {
+            logInfo(s"Executor $executorId failed because of a framework error.")
             deleteExecutorFromClusterAndDataStructures(executorId)
+          } else {
+            logInfo(s"Executor $executorId exited because of the application.")
           }
         }
       }
@@ -192,8 +189,6 @@ private[spark] class KubernetesClusterSchedulerBackend(
192
189
193
190
}
194
191
195
- override def applicationId (): String = conf.get(" spark.app.id" , super .applicationId())
196
-
197
192
override def sufficientResourcesRegistered (): Boolean = {
198
193
totalRegisteredExecutors.get() >= initialExecutors * minRegisteredRatio
199
194
}
@@ -331,10 +326,10 @@ private[spark] class KubernetesClusterSchedulerBackend(
331
326
332
327
override def eventReceived (action : Action , pod : Pod ): Unit = {
333
328
if (action == Action .MODIFIED && pod.getStatus.getPhase == " Running"
334
- && pod.getMetadata.getDeletionTimestamp == null ) {
329
+ && pod.getMetadata.getDeletionTimestamp == null ) {
335
330
val podIP = pod.getStatus.getPodIP
336
331
val clusterNodeName = pod.getSpec.getNodeName
337
- logDebug (s " Executor pod $pod ready, launched at $clusterNodeName as IP $podIP. " )
332
+ logInfo (s " Executor pod $pod ready, launched at $clusterNodeName as IP $podIP. " )
338
333
executorPodsByIPs.put(podIP, pod)
339
334
} else if ((action == Action .MODIFIED && pod.getMetadata.getDeletionTimestamp != null ) ||
340
335
action == Action .DELETED || action == Action .ERROR ) {
@@ -345,10 +340,10 @@ private[spark] class KubernetesClusterSchedulerBackend(
           executorPodsByIPs.remove(podIP)
         }
         if (action == Action.ERROR) {
-          logInfo(s"Received pod $podName exited event. Reason: " + pod.getStatus.getReason)
+          logWarning(s"Received pod $podName exited event. Reason: " + pod.getStatus.getReason)
           handleErroredPod(pod)
         } else if (action == Action.DELETED) {
-          logInfo(s"Received delete pod $podName event. Reason: " + pod.getStatus.getReason)
+          logWarning(s"Received delete pod $podName event. Reason: " + pod.getStatus.getReason)
           handleDeletedPod(pod)
         }
       }
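The eventReceived handler touched by the two hunks above is part of a fabric8 Watcher[Pod] implementation. As context only, here is a hedged, self-contained sketch of watching executor pods with that interface, assuming the older fabric8 client API in use at the time of this change; the client setup, namespace "default", and the "spark-role=executor" label selector are assumptions for illustration and are not taken from this diff.

    import io.fabric8.kubernetes.api.model.Pod
    import io.fabric8.kubernetes.client.{DefaultKubernetesClient, KubernetesClientException, Watcher}
    import io.fabric8.kubernetes.client.Watcher.Action

    object PodWatchSketch {
      def main(args: Array[String]): Unit = {
        val client = new DefaultKubernetesClient()  // assumed client setup

        // Register a watcher on pods matching an assumed label selector.
        val watch = client.pods()
          .inNamespace("default")               // assumed namespace
          .withLabel("spark-role", "executor")  // assumed label selector
          .watch(new Watcher[Pod] {
            override def eventReceived(action: Action, pod: Pod): Unit = {
              val name = pod.getMetadata.getName
              action match {
                // A running, non-terminating pod: record where it was launched.
                case Action.MODIFIED if pod.getStatus.getPhase == "Running" &&
                    pod.getMetadata.getDeletionTimestamp == null =>
                  println(s"Pod $name running at IP ${pod.getStatus.getPodIP}")
                // Errored or deleted pods: surface the reason reported by the API server.
                case Action.ERROR | Action.DELETED =>
                  println(s"Pod $name errored or deleted. Reason: ${pod.getStatus.getReason}")
                case _ => // ignore other transitions
              }
            }

            override def onClose(cause: KubernetesClientException): Unit = {
              if (cause != null) println(s"Watch closed with error: ${cause.getMessage}")
            }
          })

        // ... later, during shutdown:
        watch.close()
        client.close()
      }
    }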
@@ -386,8 +381,8 @@ private[spark] class KubernetesClusterSchedulerBackend(
       // container was probably actively killed by the driver.
       val exitReason = if (isPodAlreadyReleased(pod)) {
         ExecutorExited(containerExitStatus, exitCausedByApp = false,
-          s"Container in pod " + pod.getMetadata.getName +
-            " exited from explicit termination request.")
+          s"Container in pod ${pod.getMetadata.getName} exited from explicit termination " +
+            "request.")
       } else {
         val containerExitReason = s"Pod ${pod.getMetadata.getName}'s executor container " +
           s"exited with exit status code $containerExitStatus."