Commit 75c14a9 (parent: 02075dc)

Fix the Spark batch job log fetching issue for bad network

Signed-off-by: Wei Zhang <[email protected]>

3 files changed: 87 additions, 71 deletions

PluginsAndFeatures/azure-toolkit-for-intellij/src/com/microsoft/azure/hdinsight/spark/run/SparkBatchJobRemoteProcess.java

Lines changed: 50 additions & 47 deletions
@@ -68,9 +68,6 @@ public class SparkBatchJobRemoteProcess extends RemoteProcess {
 
     private boolean isDisconnected;
 
-    @Nullable
-    private Subscription jobLogSubscription;
-
     public SparkBatchJobRemoteProcess(@NotNull Project project, @NotNull SparkSubmitModel sparkSubmitModel,
                                       @NotNull PublishSubject<SimpleImmutableEntry<MessageInfoType, String>> ctrlSubject)
             throws ExecutionException {
@@ -162,50 +159,17 @@ public void start() {
                         submitModel.getSubmissionParameter().getClusterName(),
                         ctrlSubject)
                 .subscribeOn(IdeaSchedulers.processBarVisibleAsync(project, "Deploy the jar file into cluster")))
-                .flatMap(clusterArtifactUriPair -> {
-                    IClusterDetail cluster = clusterArtifactUriPair.getKey();
-                    submitModel.getSubmissionParameter().setFilePath(clusterArtifactUriPair.getValue());
-                    return JobUtils.submit(cluster, submitModel.getSubmissionParameter())
-                            .subscribeOn(IdeaSchedulers.processBarVisibleAsync(project, "Submit the Spark batch job"));
-                })
-                .doOnSuccess(job -> {
-                    getEventSubject().onNext(new SparkBatchJobSubmissionEvent(
-                            SparkBatchJobSubmissionEvent.Type.SUBMITTED, job));
-
-                    jobLogSubscription = job.getSubmissionLog()
-                            .subscribeOn(Schedulers.io())
-                            .subscribe(ctrlSubject::onNext, ctrlSubject::onError);
-                })
                 .toObservable()
-                .flatMap(job -> Observable
-                        .create((Subscriber<? super SparkBatchJob> ob) -> {
-                            try {
-                                jobStderrLogInputSteam.attachJob(job);
-                                jobStdoutLogInputSteam.attachJob(job);
-
-                                sparkJob = job;
-
-                                ob.onNext(job);
-                                ob.onCompleted();
-                            } catch (IOException e) {
-                                ob.onError(e);
-                            }
-                        })
-                        .retryWhen(attempts -> attempts.flatMap(err -> {
-                            try {
-                                final String state = job.getState();
-
-                                if (state.equals("starting") || state.equals("not_started") || state.equals("running")) {
-                                    logInfo("Job is waiting for start due to cluster busy, please wait or disconnect (The job will run when the cluster is free).");
-
-                                    return Observable.timer(5, TimeUnit.SECONDS);
-                                }
-                            } catch (IOException ignored) {
-                            }
-
-                            return Observable.error(new SparkJobException("Spark Job Service not available, please check HDInsight cluster status.", err));
-                        })))
-                .flatMap(runningJob -> runningJob.getJobDoneObservable().subscribeOn(Schedulers.io()))
+                .flatMap(this::submitJob)
+                .flatMap(job -> Observable.zip(
+                        attachJobInputStream(jobStderrLogInputSteam, job),
+                        attachJobInputStream(jobStdoutLogInputSteam, job),
+                        (job1, job2) -> {
+                            sparkJob = job;
+                            return job;
+                        }))
+                .flatMap(runningJob -> runningJob.getJobDoneObservable()
+                        .subscribeOn(IdeaSchedulers.processBarVisibleAsync(project, "Spark batch job is running")))
                 .subscribe(sdPair -> {
                     if (sdPair.getKey() == SparkBatchJobState.SUCCESS) {
                         logInfo("Job run successfully.");
@@ -221,9 +185,28 @@ public void start() {
                 });
     }
 
+    private Observable<SparkBatchJob> attachJobInputStream(SparkJobLogInputStream inputStream, SparkBatchJob job) {
+        return Observable.just(inputStream)
+                .map(stream -> stream.attachJob(job))
+                .subscribeOn(IdeaSchedulers.processBarVisibleAsync(project, "Attach Spark batch job outputs " + inputStream.getLogType()))
+                .retryWhen(attempts -> attempts.flatMap(err -> {
+                    try {
+                        final String state = job.getState();
+
+                        if (state.equals("starting") || state.equals("not_started") || state.equals("running")) {
+                            logInfo("Job is waiting for start due to cluster busy, please wait or disconnect (The job will run when the cluster is free).");
+
+                            return Observable.timer(5, TimeUnit.SECONDS);
+                        }
+                    } catch (IOException ignored) {
+                    }
+
+                    return Observable.error(new SparkJobException("Spark Job Service not available, please check HDInsight cluster status.", err));
+                }));
+    }
+
     public void disconnect() {
         this.isDisconnected = true;
-        Optional.ofNullable(this.jobLogSubscription).ifPresent(Subscription::unsubscribe);
 
         this.ctrlSubject.onCompleted();
         this.eventSubject.onCompleted();
@@ -239,4 +222,24 @@ private void logInfo(String message) {
     public PublishSubject<SparkBatchJobSubmissionEvent> getEventSubject() {
         return eventSubject;
     }
+
+    private Observable<SparkBatchJob> startJobSubmissionLogReceiver(SparkBatchJob job) {
+        getEventSubject().onNext(new SparkBatchJobSubmissionEvent(SparkBatchJobSubmissionEvent.Type.SUBMITTED, job));
+
+        return job.getSubmissionLog()
+                .doOnNext(ctrlSubject::onNext)
+                .doOnError(ctrlSubject::onError)
+                .last()
+                .map(messageTypeText -> job);
+
+    }
+
+    private Observable<SparkBatchJob> submitJob(SimpleImmutableEntry<IClusterDetail, String> clusterArtifactUriPair) {
+        IClusterDetail cluster = clusterArtifactUriPair.getKey();
+        submitModel.getSubmissionParameter().setFilePath(clusterArtifactUriPair.getValue());
+        return JobUtils.submit(cluster, submitModel.getSubmissionParameter())
+                .subscribeOn(IdeaSchedulers.processBarVisibleAsync(project, "Submit the Spark batch job"))
+                .toObservable()
+                .flatMap(this::startJobSubmissionLogReceiver); // To receive the Livy submission log
+    }
 }
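Note: the attachJobInputStream() retry above is the heart of the bad-network fix: instead of failing on the first exception while the cluster is still busy, the attach is retried every few seconds for as long as the job reports a not_started/starting/running state, and only fails otherwise. Below is a minimal standalone sketch of that RxJava 1.x retryWhen pattern; the attach()/getState() helpers and their return values are invented for illustration and are not the toolkit's real classes.

import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;

import rx.Observable;

public class AttachRetrySketch {
    private static final AtomicInteger attachTries = new AtomicInteger();

    // Hypothetical stand-in for SparkJobLogInputStream.attachJob(): fails until the third try.
    private static String attach() {
        if (attachTries.incrementAndGet() < 3) {
            throw new IllegalStateException("driver log not available yet");
        }
        return "attached";
    }

    // Hypothetical stand-in for SparkBatchJob.getState().
    private static String getState() {
        return attachTries.get() < 3 ? "starting" : "running";
    }

    public static void main(String[] args) {
        String result = Observable.just("stderr log stream")
                .map(name -> attach())
                .retryWhen(attempts -> attempts.flatMap(err -> {
                    final String state = getState();

                    // While the job has not really started yet, wait and resubscribe
                    // (the commit waits 5 seconds; 1 second keeps this sketch quick).
                    if (state.equals("starting") || state.equals("not_started") || state.equals("running")) {
                        return Observable.timer(1, TimeUnit.SECONDS);
                    }

                    // Any other state: give up and propagate the original error.
                    return Observable.error(err);
                }))
                .toBlocking()
                .single();

        System.out.println(result);   // prints "attached" after two retries
    }
}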

PluginsAndFeatures/azure-toolkit-for-intellij/src/com/microsoft/azure/hdinsight/spark/run/SparkJobLogInputStream.java

Lines changed: 13 additions & 20 deletions
@@ -30,7 +30,6 @@
 
 import java.io.IOException;
 import java.io.InputStream;
-import java.io.UncheckedIOException;
 import java.util.AbstractMap;
 import java.util.Optional;
 
@@ -51,9 +50,11 @@ public SparkJobLogInputStream(@NotNull String logType) {
         this.logType = logType;
     }
 
-    public void attachJob(@NotNull SparkBatchJob sparkJob) throws IOException {
+    public SparkBatchJob attachJob(@NotNull SparkBatchJob sparkJob) {
+        refreshLogUrl(sparkJob);
         this.sparkBatchJob = sparkJob;
-        refreshLogUrl();
+
+        return sparkJob;
     }
 
     private synchronized Optional<String> fetchLog(long logOffset, int fetchSize) {
@@ -96,7 +97,7 @@ public int available() throws IOException {
 
                 return buffer.length;
             }).orElseGet(() -> {
-                refreshLogUrl();
+                getAttachedJob().ifPresent(this::refreshLogUrl);
 
                 return 0;
             });
@@ -105,22 +106,14 @@ public int available() throws IOException {
         }
     }
 
-    private void refreshLogUrl() {
-        getAttachedJob()
-                .ifPresent(
-                        sparkJob -> {
-                            try {
-                                String currentLogUrl = sparkJob.getSparkJobDriverLogUrl(sparkJob.getConnectUri(), sparkJob.getBatchId());
-
-                                if (!StringUtils.equals(currentLogUrl, this.logUrl)) {
-                                    // The driver log url's changed due to the job was rerun, read it from beginning
-                                    this.logUrl = currentLogUrl;
-                                    offset = 0;
-                                }
-                            } catch (IOException ex) {
-                                throw new UncheckedIOException(ex);
-                            }
-                        });
+    private void refreshLogUrl(SparkBatchJob sparkJob) {
+        String currentLogUrl = sparkJob.getSparkJobDriverLogUrlObservable().toBlocking().single();
+
+        if (!StringUtils.equals(currentLogUrl, this.logUrl)) {
+            // The driver log url's changed due to the job was rerun, read it from beginning
+            this.logUrl = currentLogUrl;
+            offset = 0;
+        }
     }
 
     public Optional<String> getLogUrl() {
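Note: refreshLogUrl(SparkBatchJob) now blocks on the new observable API. A small sketch, independent of the toolkit code, of what toBlocking().single() does in RxJava 1.x: it synchronously returns the single emitted value and rethrows any error, which is why the checked-IOException / UncheckedIOException wrapping removed above is no longer needed. The URL string below is made up.

import rx.Observable;

public class BlockingSingleSketch {
    public static void main(String[] args) {
        // A one-item observable: single() returns its value synchronously.
        String logUrl = Observable.just("https://example-cluster.azurehdinsight.net/yarnui/10.0.0.18/node")
                .toBlocking()
                .single();
        System.out.println(logUrl);

        // An error inside the observable is rethrown from single(), so the caller
        // sees it directly instead of a wrapped exception.
        try {
            Observable.<String>error(new RuntimeException("driver log URL not available"))
                    .toBlocking()
                    .single();
        } catch (RuntimeException ex) {
            System.out.println("caught: " + ex.getMessage());
        }
    }
}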

Utils/hdinsight-node-common/src/com/microsoft/azure/hdinsight/spark/common/SparkBatchJob.java

Lines changed: 24 additions & 4 deletions
@@ -40,6 +40,7 @@
 import com.microsoft.azure.hdinsight.sdk.rest.yarn.rm.AppResponse;
 import com.microsoft.azure.hdinsight.spark.jobs.JobUtils;
 import com.microsoft.azuretools.azurecommons.helpers.NotNull;
+import com.microsoft.azuretools.azurecommons.helpers.Nullable;
 import rx.Observable;
 import rx.Subscriber;
 
@@ -89,7 +90,7 @@ public class SparkBatchJob implements ISparkBatchJob, ILogger {
     /**
      * The setting of delay seconds between tries in RestAPI calling
      */
-    private int delaySeconds = 10;
+    private int delaySeconds = 2;
 
     /**
      * The global cache for fetched Yarn UI page by browser
@@ -548,11 +549,17 @@ private Observable<HtmlPage> loadPageByBrowserObservable(String url) {
     /**
      * Get Spark Job driver log URL with retries
      *
+     * @deprecated
+     * The Livy Rest API driver log Url field only get the running job.
+     * Use getSparkJobDriverLogUrlObservable() please, with RxJava supported.
+     *
      * @param batchBaseUri the connection URI
      * @param batchId the Livy batch job ID
      * @return the Spark Job driver log URL
      * @throws IOException exceptions in transaction
      */
+    @Nullable
+    @Deprecated
     public String getSparkJobDriverLogUrl(URI batchBaseUri, int batchId) throws IOException {
         int retries = 0;
 
@@ -628,9 +635,9 @@ public Observable<SimpleImmutableEntry<MessageInfoType, String>> getSubmissionLog
             int start = 0;
             final int maxLinesPerGet = 128;
             int linesGot = 0;
-            boolean isJobActive = true;
+            boolean isSubmitting = true;
 
-            while (isJobActive) {
+            while (isSubmitting) {
                 String logUrl = String.format("%s/%d/log?from=%d&size=%d",
                         this.getConnectUri().toString(), batchId, start, maxLinesPerGet);
 
@@ -650,7 +657,7 @@ public Observable<SimpleImmutableEntry<MessageInfoType, String>> getSubmissionLog
                     // Retry interval
                     if (linesGot == 0) {
                         sleep(TimeUnit.SECONDS.toMillis(this.getDelaySeconds()));
-                        isJobActive = this.isActive();
+                        isSubmitting = this.getState().equals("starting");
                     }
                 }
             } catch (IOException ex) {
@@ -751,4 +758,17 @@ public Observable<SimpleImmutableEntry<SparkBatchJobState, String>> getJobDoneObservable
             }
         });
     }
+
+    /**
+     * New RxAPI: Get Job Driver Log URL from the container
+     *
+     * @return Job Driver log URL observable
+     */
+    public Observable<String> getSparkJobDriverLogUrlObservable() {
+        return getSparkJobYarnCurrentAppAttempt()
+                .map(AppAttempt::getLogsLink)
+                .map(URI::create)
+                .map(logUriWithIP -> getConnectUri().resolve(
+                        String.format("/yarnui/%s%s", logUriWithIP.getHost(), logUriWithIP.getPath())).toString());
+    }
 }
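Note: the new getSparkJobDriverLogUrlObservable() rewrites the YARN app attempt's logsLink, which points at an internal node address, so that it is served through the cluster's /yarnui path on the connect URI. A quick standalone sketch of that URI arithmetic; the host names and container path below are hypothetical, not taken from the commit.

import java.net.URI;

public class YarnLogUriSketch {
    public static void main(String[] args) {
        // Hypothetical cluster connect URI and YARN app attempt logsLink.
        URI connectUri = URI.create("https://mycluster.azurehdinsight.net/livy/batches");
        URI logUriWithIP = URI.create("http://10.0.0.18:30060/node/containerlogs/container_01/livy");

        // Re-root the internal log link under the public /yarnui/ path, as the new method does.
        String driverLogUrl = connectUri.resolve(
                String.format("/yarnui/%s%s", logUriWithIP.getHost(), logUriWithIP.getPath())).toString();

        // -> https://mycluster.azurehdinsight.net/yarnui/10.0.0.18/node/containerlogs/container_01/livy
        System.out.println(driverLogUrl);
    }
}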
