Skip to content

Commit 0907d4a

Browse files
authored
Make sure we return after gracefully shutdown (#170)
* Make sure we return after gracefully shutdown # Motivation After calling `shutdownGracefully` in our `ServiceGroup` we know that one of two things happened: 1. We got an error so something went wrong 2. We got no error so all services finished In the 2. case, there might still be a few child tasks in the task group e.g. the graceful shutdown timeout task. We are currently still iterating the remaining child task results which might lead into unexpected fatal errors. # Modification This PR makes sure that after we have successfully shutdown we are stopping iterating the child task results. # Result No more unexpected fatal errors. * Fix flaky test
1 parent 02a26fd commit 0907d4a

File tree

2 files changed

+123
-96
lines changed

2 files changed

+123
-96
lines changed

Sources/ServiceLifecycle/ServiceGroup.swift

Lines changed: 95 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -332,6 +332,7 @@ public actor ServiceGroup: Sendable {
332332
group: &group,
333333
gracefulShutdownManagers: gracefulShutdownManagers
334334
)
335+
return .success(())
335336
} catch {
336337
return .failure(error)
337338
}
@@ -425,6 +426,7 @@ public actor ServiceGroup: Sendable {
425426
group: &group,
426427
gracefulShutdownManagers: gracefulShutdownManagers
427428
)
429+
return .success(())
428430
} catch {
429431
return .failure(error)
430432
}
@@ -451,6 +453,7 @@ public actor ServiceGroup: Sendable {
451453
group: &group,
452454
gracefulShutdownManagers: gracefulShutdownManagers
453455
)
456+
return .success(())
454457
} catch {
455458
return .failure(error)
456459
}
@@ -511,7 +514,7 @@ public actor ServiceGroup: Sendable {
511514
// We have to shutdown the services in reverse. To do this
512515
// we are going to signal each child task the graceful shutdown and then wait for
513516
// its exit.
514-
for (gracefulShutdownIndex, gracefulShutdownManager) in gracefulShutdownManagers.lazy.enumerated().reversed() {
517+
gracefulShutdownLoop: for (gracefulShutdownIndex, gracefulShutdownManager) in gracefulShutdownManagers.lazy.enumerated().reversed() {
515518
guard let service = services[gracefulShutdownIndex] else {
516519
self.logger.debug(
517520
"Service already finished. Skipping shutdown"
@@ -527,112 +530,114 @@ public actor ServiceGroup: Sendable {
527530

528531
gracefulShutdownManager.shutdownGracefully()
529532

530-
let result = try await group.next()
533+
while let result = try await group.next() {
534+
switch result {
535+
case .serviceFinished(let service, let index):
536+
if group.isCancelled {
537+
// The group is cancelled and we expect all services to finish
538+
continue gracefulShutdownLoop
539+
}
531540

532-
switch result {
533-
case .serviceFinished(let service, let index):
534-
if group.isCancelled {
535-
// The group is cancelled and we expect all services to finish
536-
continue
537-
}
541+
if index == gracefulShutdownIndex {
542+
// The service that we signalled graceful shutdown did exit/
543+
// We can continue to the next one.
544+
self.logger.debug(
545+
"Service finished",
546+
metadata: [
547+
self.loggingConfiguration.keys.serviceKey: "\(service.service)",
548+
]
549+
)
550+
continue gracefulShutdownLoop
551+
} else {
552+
// Another service exited unexpectedly
553+
self.logger.debug(
554+
"Service finished unexpectedly during graceful shutdown. Cancelling all other services now",
555+
metadata: [
556+
self.loggingConfiguration.keys.serviceKey: "\(service.service)",
557+
]
558+
)
538559

539-
if index == gracefulShutdownIndex {
540-
// The service that we signalled graceful shutdown did exit/
541-
// We can continue to the next one.
542-
self.logger.debug(
543-
"Service finished",
544-
metadata: [
545-
self.loggingConfiguration.keys.serviceKey: "\(service.service)",
546-
]
547-
)
548-
continue
549-
} else {
550-
// Another service exited unexpectedly
551-
self.logger.debug(
552-
"Service finished unexpectedly during graceful shutdown. Cancelling all other services now",
553-
metadata: [
554-
self.loggingConfiguration.keys.serviceKey: "\(service.service)",
555-
]
556-
)
560+
self.cancelGroupAndSpawnTimeoutIfNeeded(group: &group, cancellationTimeoutTask: &cancellationTimeoutTask)
561+
throw ServiceGroupError.serviceFinishedUnexpectedly()
562+
}
557563

558-
self.cancelGroupAndSpawnTimeoutIfNeeded(group: &group, cancellationTimeoutTask: &cancellationTimeoutTask)
559-
throw ServiceGroupError.serviceFinishedUnexpectedly()
560-
}
564+
case .serviceThrew(let service, _, let serviceError):
565+
switch service.failureTerminationBehavior.behavior {
566+
case .cancelGroup:
567+
self.logger.debug(
568+
"Service threw error during graceful shutdown. Cancelling group.",
569+
metadata: [
570+
self.loggingConfiguration.keys.serviceKey: "\(service.service)",
571+
self.loggingConfiguration.keys.errorKey: "\(serviceError)",
572+
]
573+
)
574+
group.cancelAll()
575+
throw serviceError
561576

562-
case .serviceThrew(let service, _, let serviceError):
563-
switch service.failureTerminationBehavior.behavior {
564-
case .cancelGroup:
565-
self.logger.debug(
566-
"Service threw error during graceful shutdown. Cancelling group.",
567-
metadata: [
568-
self.loggingConfiguration.keys.serviceKey: "\(service.service)",
569-
self.loggingConfiguration.keys.errorKey: "\(serviceError)",
570-
]
571-
)
572-
group.cancelAll()
573-
throw serviceError
577+
case .gracefullyShutdownGroup:
578+
self.logger.debug(
579+
"Service threw error during graceful shutdown.",
580+
metadata: [
581+
self.loggingConfiguration.keys.serviceKey: "\(service.service)",
582+
self.loggingConfiguration.keys.errorKey: "\(serviceError)",
583+
]
584+
)
574585

575-
case .gracefullyShutdownGroup:
576-
self.logger.debug(
577-
"Service threw error during graceful shutdown.",
578-
metadata: [
579-
self.loggingConfiguration.keys.serviceKey: "\(service.service)",
580-
self.loggingConfiguration.keys.errorKey: "\(serviceError)",
581-
]
582-
)
586+
if error == nil {
587+
error = serviceError
588+
}
583589

584-
if error == nil {
585-
error = serviceError
590+
// We can continue shutting down the next service now
591+
continue gracefulShutdownLoop
592+
593+
case .ignore:
594+
self.logger.debug(
595+
"Service threw error during graceful shutdown.",
596+
metadata: [
597+
self.loggingConfiguration.keys.serviceKey: "\(service.service)",
598+
self.loggingConfiguration.keys.errorKey: "\(serviceError)",
599+
]
600+
)
601+
602+
// We can continue shutting down the next service now
603+
continue gracefulShutdownLoop
586604
}
587605

588-
case .ignore:
589-
self.logger.debug(
590-
"Service threw error during graceful shutdown.",
591-
metadata: [
592-
self.loggingConfiguration.keys.serviceKey: "\(service.service)",
593-
self.loggingConfiguration.keys.errorKey: "\(serviceError)",
594-
]
595-
)
606+
case .signalCaught(let signal):
607+
if self.cancellationSignals.contains(signal) {
608+
// We got signalled cancellation after graceful shutdown
609+
self.logger.debug(
610+
"Signal caught. Cancelling the group.",
611+
metadata: [
612+
self.loggingConfiguration.keys.signalKey: "\(signal)",
613+
]
614+
)
596615

597-
continue
598-
}
616+
self.cancelGroupAndSpawnTimeoutIfNeeded(group: &group, cancellationTimeoutTask: &cancellationTimeoutTask)
617+
}
599618

600-
case .signalCaught(let signal):
601-
if self.cancellationSignals.contains(signal) {
602-
// We got signalled cancellation after graceful shutdown
619+
case .gracefulShutdownTimedOut:
620+
// Gracefully shutting down took longer than the user configured
621+
// so we have to escalate it now.
603622
self.logger.debug(
604-
"Signal caught. Cancelling the group.",
623+
"Graceful shutdown took longer than allowed by the configuration. Cancelling the group now.",
605624
metadata: [
606-
self.loggingConfiguration.keys.signalKey: "\(signal)",
625+
self.loggingConfiguration.keys.serviceKey: "\(service.service)",
607626
]
608627
)
609-
610628
self.cancelGroupAndSpawnTimeoutIfNeeded(group: &group, cancellationTimeoutTask: &cancellationTimeoutTask)
611-
}
612629

613-
case .gracefulShutdownTimedOut:
614-
// Gracefully shutting down took longer than the user configured
615-
// so we have to escalate it now.
616-
self.logger.debug(
617-
"Graceful shutdown took longer than allowed by the configuration. Cancelling the group now.",
618-
metadata: [
619-
self.loggingConfiguration.keys.serviceKey: "\(service.service)",
620-
]
621-
)
622-
self.cancelGroupAndSpawnTimeoutIfNeeded(group: &group, cancellationTimeoutTask: &cancellationTimeoutTask)
623-
624-
case .cancellationCaught:
625-
// We caught cancellation in our child task so we have to spawn
626-
// our cancellation timeout task if needed
627-
self.logger.debug("Caught cancellation.")
628-
self.cancelGroupAndSpawnTimeoutIfNeeded(group: &group, cancellationTimeoutTask: &cancellationTimeoutTask)
629-
630-
case .signalSequenceFinished, .gracefulShutdownCaught, .gracefulShutdownFinished:
631-
// We just have to tolerate this since signals and parent graceful shutdowns downs can race.
632-
continue
630+
case .cancellationCaught:
631+
// We caught cancellation in our child task so we have to spawn
632+
// our cancellation timeout task if needed
633+
self.logger.debug("Caught cancellation.")
634+
self.cancelGroupAndSpawnTimeoutIfNeeded(group: &group, cancellationTimeoutTask: &cancellationTimeoutTask)
633635

634-
case nil:
635-
fatalError("Invalid result from group.next().")
636+
case .signalSequenceFinished, .gracefulShutdownCaught, .gracefulShutdownFinished:
637+
// We just have to tolerate this since signals and parent graceful shutdowns downs can race.
638+
// We are going to continue the
639+
break
640+
}
636641
}
637642
}
638643

Tests/ServiceLifecycleTests/ServiceGroupTests.swift

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -320,12 +320,10 @@ final class ServiceGroupTests: XCTestCase {
320320
await service2.resumeRunContinuation(with: .success(()))
321321

322322
// Waiting to see that the remaining is still running
323+
await XCTAsyncAssertEqual(await eventIterator1.next(), .shutdownGracefully)
323324
service1.sendPing()
324325
await XCTAsyncAssertEqual(await eventIterator1.next(), .runPing)
325326

326-
// The first service should now receive the signal
327-
await XCTAsyncAssertEqual(await eventIterator1.next(), .shutdownGracefully)
328-
329327
// Waiting to see that the one remaining are still running
330328
service1.sendPing()
331329
await XCTAsyncAssertEqual(await eventIterator1.next(), .runPing)
@@ -523,12 +521,10 @@ final class ServiceGroupTests: XCTestCase {
523521
await service3.resumeRunContinuation(with: .success(()))
524522

525523
// Waiting to see that the remaining is still running
524+
await XCTAsyncAssertEqual(await eventIterator1.next(), .shutdownGracefully)
526525
service1.sendPing()
527526
await XCTAsyncAssertEqual(await eventIterator1.next(), .runPing)
528527

529-
// The first service should now receive the signal
530-
await XCTAsyncAssertEqual(await eventIterator1.next(), .shutdownGracefully)
531-
532528
// Waiting to see that the one remaining are still running
533529
service1.sendPing()
534530
await XCTAsyncAssertEqual(await eventIterator1.next(), .runPing)
@@ -1112,6 +1108,32 @@ final class ServiceGroupTests: XCTestCase {
11121108
}
11131109
}
11141110

1111+
func testGracefulShutdownWithMaximumDuration() async throws {
1112+
let mockService = MockService(description: "Service1")
1113+
let serviceGroup = self.makeServiceGroup(
1114+
services: [.init(service: mockService)],
1115+
gracefulShutdownSignals: [.sigalrm],
1116+
maximumGracefulShutdownDuration: .seconds(0.1)
1117+
)
1118+
1119+
try await withThrowingTaskGroup(of: Void.self) { group in
1120+
group.addTask {
1121+
try await serviceGroup.run()
1122+
}
1123+
1124+
var eventIterator = mockService.events.makeAsyncIterator()
1125+
await XCTAsyncAssertEqual(await eventIterator.next(), .run)
1126+
1127+
await serviceGroup.triggerGracefulShutdown()
1128+
1129+
await XCTAsyncAssertEqual(await eventIterator.next(), .shutdownGracefully)
1130+
1131+
await mockService.resumeRunContinuation(with: .success(()))
1132+
1133+
try await XCTAsyncAssertNoThrow(await group.next())
1134+
}
1135+
}
1136+
11151137
func testGracefulShutdownEscalation_whenNoCancellationEscalation() async throws {
11161138
let mockService = MockService(description: "Service1")
11171139
let serviceGroup = self.makeServiceGroup(

0 commit comments

Comments
 (0)