|
29 | 29 | import com.microsoft.azure.hdinsight.sdk.rest.yarn.rm.App; |
30 | 30 | import com.microsoft.azure.hdinsight.sdk.rest.yarn.rm.AppResponse; |
31 | 31 | import rx.Observable; |
| 32 | +import rx.Subscriber; |
32 | 33 |
|
33 | 34 | import java.io.IOException; |
34 | 35 | import java.net.URI; |
|
42 | 43 | import static com.microsoft.azure.hdinsight.common.MessageInfoType.Error; |
43 | 44 | import static com.microsoft.azure.hdinsight.common.MessageInfoType.Log; |
44 | 45 | import static java.lang.Thread.sleep; |
45 | | -import static rx.exceptions.Exceptions.propagate; |
46 | 46 |
|
47 | 47 | public class SparkBatchJob implements ISparkBatchJob, ILogger { |
48 | 48 | /** |
@@ -270,7 +270,7 @@ public String getState() throws IOException { |
270 | 270 | } |
271 | 271 | } while (++retries < this.getRetriesMax()); |
272 | 272 |
|
273 | | - throw new UnknownServiceException("Unknown service error after " + --retries + " retries"); |
| 273 | + throw new UnknownServiceException("Failed to get job state: Unknown service error after " + --retries + " retries"); |
274 | 274 | } |
275 | 275 |
|
276 | 276 | /** |
@@ -312,7 +312,7 @@ protected String getSparkJobApplicationId(URI batchBaseUri, int batchId) throws |
312 | 312 | } |
313 | 313 | } while (++retries < this.getRetriesMax()); |
314 | 314 |
|
315 | | - throw new UnknownServiceException("Unknown service error after " + --retries + " retries"); |
| 315 | + throw new UnknownServiceException("Failed to get job Application ID: Unknown service error after " + --retries + " retries"); |
316 | 316 | } |
317 | 317 |
|
318 | 318 | /** |
@@ -356,7 +356,7 @@ protected App getSparkJobYarnApplication(URI batchBaseUri, String applicationID) |
356 | 356 | } |
357 | 357 | } while (++retries < this.getRetriesMax()); |
358 | 358 |
|
359 | | - throw new UnknownServiceException("Unknown service error after " + --retries + " retries"); |
| 359 | + throw new UnknownServiceException("Failed to get job Yarn application: Unknown service error after " + --retries + " retries"); |
360 | 360 | } |
361 | 361 |
|
362 | 362 | /** |
@@ -399,7 +399,7 @@ public String getSparkJobDriverLogUrl(URI batchBaseUri, int batchId) throws IOEx |
399 | 399 | } |
400 | 400 | } while (++retries < this.getRetriesMax()); |
401 | 401 |
|
402 | | - throw new UnknownServiceException("Unknown service error after " + --retries + " retries"); |
| 402 | + throw new UnknownServiceException("Failed to get job driver log URL: Unknown service error after " + --retries + " retries"); |
403 | 403 | } |
404 | 404 |
|
405 | 405 | /** |
@@ -505,45 +505,69 @@ public boolean isActive() throws IOException { |
505 | 505 | } |
506 | 506 | } while (++retries < this.getRetriesMax()); |
507 | 507 |
|
508 | | - throw new UnknownServiceException("Unknown service error after " + --retries + " retries"); |
| 508 | + throw new UnknownServiceException("Failed to detect job activity: Unknown service error after " + --retries + " retries"); |
509 | 509 | } |
510 | 510 |
|
511 | | - public boolean isLogAggregated() throws IOException { |
512 | | - String applicationId = this.getSparkJobApplicationId(this.getConnectUri(), this.getBatchId()); |
513 | | - App yarnApp = this.getSparkJobYarnApplication(this.getConnectUri(), applicationId); |
514 | | - |
515 | | - switch (yarnApp.getLogAggregationStatus().toUpperCase()) { |
516 | | - case "SUCCEEDED": |
517 | | - return true; |
518 | | - case "DISABLED": |
519 | | - case "NOT_START": |
520 | | - case "RUNNING": |
521 | | - case "RUNNING_WITH_FAILURE": |
522 | | - case "FAILED": |
523 | | - case "TIME_OUT": |
524 | | - default: |
525 | | - return false; |
526 | | - } |
527 | | - } |
| 511 | + public Observable<SimpleImmutableEntry<SparkBatchJobState, String>> getJobDoneObservable() { |
| 512 | + return Observable.create((Subscriber<? super SimpleImmutableEntry<SparkBatchJobState, String>> ob) -> { |
| 513 | + try { |
| 514 | + boolean isJobActive = true; |
| 515 | + boolean isLogAggregateDone = false; |
| 516 | + SparkBatchJobState state = SparkBatchJobState.NOT_STARTED; |
| 517 | + String applicationId = null; |
| 518 | + String diagnostics = ""; |
| 519 | + |
| 520 | + while (true) { |
| 521 | + if (isJobActive) { |
| 522 | + HttpResponse httpResponse = this.getSubmission().getBatchSparkJobStatus( |
| 523 | + this.getConnectUri().toString(), batchId); |
| 524 | + |
| 525 | + if (httpResponse.getCode() >= 200 && httpResponse.getCode() < 300) { |
| 526 | + SparkSubmitResponse jobResp = ObjectConvertUtils.convertJsonToObject( |
| 527 | + httpResponse.getMessage(), SparkSubmitResponse.class) |
| 528 | + .orElseThrow(() -> new UnknownServiceException( |
| 529 | + "Bad spark job response: " + httpResponse.getMessage())); |
| 530 | + |
| 531 | + state = SparkBatchJobState.valueOf(jobResp.getState().toUpperCase()); |
| 532 | + |
| 533 | + isJobActive = !state.isJobDone(); |
| 534 | + applicationId = jobResp.getAppId(); |
| 535 | + } |
| 536 | + } |
528 | 537 |
|
529 | | - public Observable<SparkBatchJobState> getJobDoneObservable() { |
530 | | - return Observable.interval(200, TimeUnit.MILLISECONDS) |
531 | | - .map((times) -> { |
532 | | - try { |
533 | | - return getState(); |
534 | | - } catch (IOException e) { |
535 | | - throw propagate(e); |
| 538 | + if (!isLogAggregateDone && applicationId != null) { |
| 539 | + App yarnApp = this.getSparkJobYarnApplication(this.getConnectUri(), applicationId); |
| 540 | + diagnostics = yarnApp.getDiagnostics(); |
| 541 | + |
| 542 | + switch (yarnApp.getLogAggregationStatus().toUpperCase()) { |
| 543 | + case "SUCCEEDED": |
| 544 | + case "FAILED": |
| 545 | + isLogAggregateDone = true; |
| 546 | + break; |
| 547 | + case "DISABLED": |
| 548 | + case "NOT_START": |
| 549 | + case "RUNNING": |
| 550 | + case "RUNNING_WITH_FAILURE": |
| 551 | + case "TIME_OUT": |
| 552 | + default: |
| 553 | + isLogAggregateDone = false; |
| 554 | + } |
536 | 555 | } |
537 | | - }) |
538 | | - .map(s -> SparkBatchJobState.valueOf(s.toUpperCase())) |
539 | | - .filter(SparkBatchJobState::isJobDone) |
540 | | - .filter((state) -> { |
541 | | - try { |
542 | | - return isLogAggregated(); |
543 | | - } catch (IOException e) { |
544 | | - throw propagate(e); |
| 556 | + |
| 557 | + // Retry interval |
| 558 | + if (!isJobActive && isLogAggregateDone) { |
| 559 | + ob.onNext(new SimpleImmutableEntry<>(state, diagnostics)); |
| 560 | + break; |
| 561 | + } else { |
| 562 | + sleep(1000); |
545 | 563 | } |
546 | | - }) |
547 | | - .delay(3, TimeUnit.SECONDS); |
| 564 | + } |
| 565 | + } catch (IOException ex) { |
| 566 | + ob.onError(ex); |
| 567 | + } catch (InterruptedException ignored) { |
| 568 | + } finally { |
| 569 | + ob.onCompleted(); |
| 570 | + } |
| 571 | + }); |
548 | 572 | } |
549 | 573 | } |
0 commit comments