Commit 75c14a9 (parent: 02075dc)

Fix the Spark batch job log fetching issue for bad network

Signed-off-by: Wei Zhang <[email protected]>

3 files changed: 87 additions, 71 deletions

PluginsAndFeatures/azure-toolkit-for-intellij/src/com/microsoft/azure/hdinsight/spark/run/SparkBatchJobRemoteProcess.java

Lines changed: 50 additions & 47 deletions
@@ -68,9 +68,6 @@ public class SparkBatchJobRemoteProcess extends RemoteProcess {
 
     private boolean isDisconnected;
 
-    @Nullable
-    private Subscription jobLogSubscription;
-
     public SparkBatchJobRemoteProcess(@NotNull Project project, @NotNull SparkSubmitModel sparkSubmitModel,
                                       @NotNull PublishSubject<SimpleImmutableEntry<MessageInfoType, String>> ctrlSubject)
             throws ExecutionException {
@@ -162,50 +159,17 @@ public void start() {
                         submitModel.getSubmissionParameter().getClusterName(),
                         ctrlSubject)
                 .subscribeOn(IdeaSchedulers.processBarVisibleAsync(project, "Deploy the jar file into cluster")))
-                .flatMap(clusterArtifactUriPair -> {
-                    IClusterDetail cluster = clusterArtifactUriPair.getKey();
-                    submitModel.getSubmissionParameter().setFilePath(clusterArtifactUriPair.getValue());
-                    return JobUtils.submit(cluster, submitModel.getSubmissionParameter())
-                            .subscribeOn(IdeaSchedulers.processBarVisibleAsync(project, "Submit the Spark batch job"));
-                })
-                .doOnSuccess(job -> {
-                    getEventSubject().onNext(new SparkBatchJobSubmissionEvent(
-                            SparkBatchJobSubmissionEvent.Type.SUBMITTED, job));
-
-                    jobLogSubscription = job.getSubmissionLog()
-                            .subscribeOn(Schedulers.io())
-                            .subscribe(ctrlSubject::onNext, ctrlSubject::onError);
-                })
                 .toObservable()
-                .flatMap(job -> Observable
-                        .create((Subscriber<? super SparkBatchJob> ob) -> {
-                            try {
-                                jobStderrLogInputSteam.attachJob(job);
-                                jobStdoutLogInputSteam.attachJob(job);
-
-                                sparkJob = job;
-
-                                ob.onNext(job);
-                                ob.onCompleted();
-                            } catch (IOException e) {
-                                ob.onError(e);
-                            }
-                        })
-                        .retryWhen(attempts -> attempts.flatMap(err -> {
-                            try {
-                                final String state = job.getState();
-
-                                if (state.equals("starting") || state.equals("not_started") || state.equals("running")) {
-                                    logInfo("Job is waiting for start due to cluster busy, please wait or disconnect (The job will run when the cluster is free).");
-
-                                    return Observable.timer(5, TimeUnit.SECONDS);
-                                }
-                            } catch (IOException ignored) {
-                            }
-
-                            return Observable.error(new SparkJobException("Spark Job Service not available, please check HDInsight cluster status.", err));
-                        })))
-                .flatMap(runningJob -> runningJob.getJobDoneObservable().subscribeOn(Schedulers.io()))
+                .flatMap(this::submitJob)
+                .flatMap(job -> Observable.zip(
+                        attachJobInputStream(jobStderrLogInputSteam, job),
+                        attachJobInputStream(jobStdoutLogInputSteam, job),
+                        (job1, job2) -> {
+                            sparkJob = job;
+                            return job;
+                        }))
+                .flatMap(runningJob -> runningJob.getJobDoneObservable()
+                        .subscribeOn(IdeaSchedulers.processBarVisibleAsync(project, "Spark batch job is running")))
                 .subscribe(sdPair -> {
                     if (sdPair.getKey() == SparkBatchJobState.SUCCESS) {
                         logInfo("Job run successfully.");
@@ -221,9 +185,28 @@ public void start() {
                 });
     }
 
+    private Observable<SparkBatchJob> attachJobInputStream(SparkJobLogInputStream inputStream, SparkBatchJob job) {
+        return Observable.just(inputStream)
+                .map(stream -> stream.attachJob(job))
+                .subscribeOn(IdeaSchedulers.processBarVisibleAsync(project, "Attach Spark batch job outputs " + inputStream.getLogType()))
+                .retryWhen(attempts -> attempts.flatMap(err -> {
+                    try {
+                        final String state = job.getState();
+
+                        if (state.equals("starting") || state.equals("not_started") || state.equals("running")) {
+                            logInfo("Job is waiting for start due to cluster busy, please wait or disconnect (The job will run when the cluster is free).");
+
+                            return Observable.timer(5, TimeUnit.SECONDS);
+                        }
+                    } catch (IOException ignored) {
+                    }
+
+                    return Observable.error(new SparkJobException("Spark Job Service not available, please check HDInsight cluster status.", err));
+                }));
+    }
+
     public void disconnect() {
         this.isDisconnected = true;
-        Optional.ofNullable(this.jobLogSubscription).ifPresent(Subscription::unsubscribe);
 
         this.ctrlSubject.onCompleted();
         this.eventSubject.onCompleted();
@@ -239,4 +222,24 @@ private void logInfo(String message) {
     public PublishSubject<SparkBatchJobSubmissionEvent> getEventSubject() {
         return eventSubject;
     }
+
+    private Observable<SparkBatchJob> startJobSubmissionLogReceiver(SparkBatchJob job) {
+        getEventSubject().onNext(new SparkBatchJobSubmissionEvent(SparkBatchJobSubmissionEvent.Type.SUBMITTED, job));
+
+        return job.getSubmissionLog()
+                .doOnNext(ctrlSubject::onNext)
+                .doOnError(ctrlSubject::onError)
+                .last()
+                .map(messageTypeText -> job);
+
+    }
+
+    private Observable<SparkBatchJob> submitJob(SimpleImmutableEntry<IClusterDetail, String> clusterArtifactUriPair) {
+        IClusterDetail cluster = clusterArtifactUriPair.getKey();
+        submitModel.getSubmissionParameter().setFilePath(clusterArtifactUriPair.getValue());
+        return JobUtils.submit(cluster, submitModel.getSubmissionParameter())
+                .subscribeOn(IdeaSchedulers.processBarVisibleAsync(project, "Submit the Spark batch job"))
+                .toObservable()
+                .flatMap(this::startJobSubmissionLogReceiver); // To receive the Livy submission log
+    }
 }
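Note: the attachJobInputStream() retry above is the heart of the bad-network fix: instead of failing on the first exception while the cluster is still busy, the attach is retried every few seconds for as long as the job reports a not_started/starting/running state, and only fails otherwise. Below is a minimal standalone sketch of that RxJava 1.x retryWhen pattern; the attach()/getState() helpers and their return values are invented for illustration and are not the toolkit's real classes.

import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;

import rx.Observable;

public class AttachRetrySketch {
    private static final AtomicInteger attachTries = new AtomicInteger();

    // Hypothetical stand-in for SparkJobLogInputStream.attachJob(): fails until the third try.
    private static String attach() {
        if (attachTries.incrementAndGet() < 3) {
            throw new IllegalStateException("driver log not available yet");
        }
        return "attached";
    }

    // Hypothetical stand-in for SparkBatchJob.getState().
    private static String getState() {
        return attachTries.get() < 3 ? "starting" : "running";
    }

    public static void main(String[] args) {
        String result = Observable.just("stderr log stream")
                .map(name -> attach())
                .retryWhen(attempts -> attempts.flatMap(err -> {
                    final String state = getState();

                    // While the job has not really started yet, wait and resubscribe
                    // (the commit waits 5 seconds; 1 second keeps this sketch quick).
                    if (state.equals("starting") || state.equals("not_started") || state.equals("running")) {
                        return Observable.timer(1, TimeUnit.SECONDS);
                    }

                    // Any other state: give up and propagate the original error.
                    return Observable.error(err);
                }))
                .toBlocking()
                .single();

        System.out.println(result);   // prints "attached" after two retries
    }
}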

PluginsAndFeatures/azure-toolkit-for-intellij/src/com/microsoft/azure/hdinsight/spark/run/SparkJobLogInputStream.java

Lines changed: 13 additions & 20 deletions
@@ -30,7 +30,6 @@
 
 import java.io.IOException;
 import java.io.InputStream;
-import java.io.UncheckedIOException;
 import java.util.AbstractMap;
 import java.util.Optional;
 
@@ -51,9 +50,11 @@ public SparkJobLogInputStream(@NotNull String logType) {
         this.logType = logType;
     }
 
-    public void attachJob(@NotNull SparkBatchJob sparkJob) throws IOException {
+    public SparkBatchJob attachJob(@NotNull SparkBatchJob sparkJob) {
+        refreshLogUrl(sparkJob);
         this.sparkBatchJob = sparkJob;
-        refreshLogUrl();
+
+        return sparkJob;
     }
 
     private synchronized Optional<String> fetchLog(long logOffset, int fetchSize) {
@@ -96,7 +97,7 @@ public int available() throws IOException {
 
                 return buffer.length;
             }).orElseGet(() -> {
-                refreshLogUrl();
+                getAttachedJob().ifPresent(this::refreshLogUrl);
 
                 return 0;
             });
@@ -105,22 +106,14 @@ public int available() throws IOException {
         }
     }
 
-    private void refreshLogUrl() {
-        getAttachedJob()
-                .ifPresent(
-                        sparkJob -> {
-                            try {
-                                String currentLogUrl = sparkJob.getSparkJobDriverLogUrl(sparkJob.getConnectUri(), sparkJob.getBatchId());
-
-                                if (!StringUtils.equals(currentLogUrl, this.logUrl)) {
-                                    // The driver log url's changed due to the job was rerun, read it from beginning
-                                    this.logUrl = currentLogUrl;
-                                    offset = 0;
-                                }
-                            } catch (IOException ex) {
-                                throw new UncheckedIOException(ex);
-                            }
-                        });
+    private void refreshLogUrl(SparkBatchJob sparkJob) {
+        String currentLogUrl = sparkJob.getSparkJobDriverLogUrlObservable().toBlocking().single();
+
+        if (!StringUtils.equals(currentLogUrl, this.logUrl)) {
+            // The driver log url's changed due to the job was rerun, read it from beginning
+            this.logUrl = currentLogUrl;
+            offset = 0;
+        }
     }
 
     public Optional<String> getLogUrl() {
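Note: refreshLogUrl(SparkBatchJob) now blocks on the new observable API. A small sketch, independent of the toolkit code, of what toBlocking().single() does in RxJava 1.x: it synchronously returns the single emitted value and rethrows any error, which is why the checked-IOException / UncheckedIOException wrapping removed above is no longer needed. The URL string below is made up.

import rx.Observable;

public class BlockingSingleSketch {
    public static void main(String[] args) {
        // A one-item observable: single() returns its value synchronously.
        String logUrl = Observable.just("https://example-cluster.azurehdinsight.net/yarnui/10.0.0.18/node")
                .toBlocking()
                .single();
        System.out.println(logUrl);

        // An error inside the observable is rethrown from single(), so the caller
        // sees it directly instead of a wrapped exception.
        try {
            Observable.<String>error(new RuntimeException("driver log URL not available"))
                    .toBlocking()
                    .single();
        } catch (RuntimeException ex) {
            System.out.println("caught: " + ex.getMessage());
        }
    }
}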

Utils/hdinsight-node-common/src/com/microsoft/azure/hdinsight/spark/common/SparkBatchJob.java

Lines changed: 24 additions & 4 deletions
@@ -40,6 +40,7 @@
 import com.microsoft.azure.hdinsight.sdk.rest.yarn.rm.AppResponse;
 import com.microsoft.azure.hdinsight.spark.jobs.JobUtils;
 import com.microsoft.azuretools.azurecommons.helpers.NotNull;
+import com.microsoft.azuretools.azurecommons.helpers.Nullable;
 import rx.Observable;
 import rx.Subscriber;
 
@@ -89,7 +90,7 @@ public class SparkBatchJob implements ISparkBatchJob, ILogger {
     /**
      * The setting of delay seconds between tries in RestAPI calling
      */
-    private int delaySeconds = 10;
+    private int delaySeconds = 2;
 
     /**
      * The global cache for fetched Yarn UI page by browser
@@ -548,11 +549,17 @@ private Observable<HtmlPage> loadPageByBrowserObservable(String url) {
     /**
      * Get Spark Job driver log URL with retries
      *
+     * @deprecated
+     * The Livy Rest API driver log Url field only get the running job.
+     * Use getSparkJobDriverLogUrlObservable() please, with RxJava supported.
+     *
      * @param batchBaseUri the connection URI
      * @param batchId the Livy batch job ID
      * @return the Spark Job driver log URL
      * @throws IOException exceptions in transaction
      */
+    @Nullable
+    @Deprecated
     public String getSparkJobDriverLogUrl(URI batchBaseUri, int batchId) throws IOException {
         int retries = 0;
 
@@ -628,9 +635,9 @@ public Observable<SimpleImmutableEntry<MessageInfoType, String>> getSubmissionLog
             int start = 0;
             final int maxLinesPerGet = 128;
             int linesGot = 0;
-            boolean isJobActive = true;
+            boolean isSubmitting = true;
 
-            while (isJobActive) {
+            while (isSubmitting) {
                 String logUrl = String.format("%s/%d/log?from=%d&size=%d",
                         this.getConnectUri().toString(), batchId, start, maxLinesPerGet);
 
@@ -650,7 +657,7 @@ public Observable<SimpleImmutableEntry<MessageInfoType, String>> getSubmissionLog
                     // Retry interval
                     if (linesGot == 0) {
                         sleep(TimeUnit.SECONDS.toMillis(this.getDelaySeconds()));
-                        isJobActive = this.isActive();
+                        isSubmitting = this.getState().equals("starting");
                     }
                 }
             } catch (IOException ex) {
@@ -751,4 +758,17 @@ public Observable<SimpleImmutableEntry<SparkBatchJobState, String>> getJobDoneObservable
             }
         });
     }
+
+    /**
+     * New RxAPI: Get Job Driver Log URL from the container
+     *
+     * @return Job Driver log URL observable
+     */
+    public Observable<String> getSparkJobDriverLogUrlObservable() {
+        return getSparkJobYarnCurrentAppAttempt()
+                .map(AppAttempt::getLogsLink)
+                .map(URI::create)
+                .map(logUriWithIP -> getConnectUri().resolve(
+                        String.format("/yarnui/%s%s", logUriWithIP.getHost(), logUriWithIP.getPath())).toString());
+    }
 }
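Note: the new getSparkJobDriverLogUrlObservable() rewrites the YARN app attempt's logsLink, which points at an internal node address, so that it is served through the cluster's /yarnui path on the connect URI. A quick standalone sketch of that URI arithmetic; the host names and container path below are hypothetical, not taken from the commit.

import java.net.URI;

public class YarnLogUriSketch {
    public static void main(String[] args) {
        // Hypothetical cluster connect URI and YARN app attempt logsLink.
        URI connectUri = URI.create("https://mycluster.azurehdinsight.net/livy/batches");
        URI logUriWithIP = URI.create("http://10.0.0.18:30060/node/containerlogs/container_01/livy");

        // Re-root the internal log link under the public /yarnui/ path, as the new method does.
        String driverLogUrl = connectUri.resolve(
                String.format("/yarnui/%s%s", logUriWithIP.getHost(), logUriWithIP.getPath())).toString();

        // -> https://mycluster.azurehdinsight.net/yarnui/10.0.0.18/node/containerlogs/container_01/livy
        System.out.println(driverLogUrl);
    }
}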
