Skip to content

Commit b07bc77

Browse files
authored
Clean shutdowns (#331)
* add an async shutdown method to sinks
* add a method to shut down health checks
* add connection close on ingest actor
1 parent ed51a84 commit b07bc77

31 files changed

+518
-41
lines changed

config/config.conf

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ clusterHostSuffix=".cluster"
5252
reaggregationDimensions=["host"]
5353
#reaggregationInjectClusterAsHost=True
5454
#reaggregationTimeout="PT1M"
55+
healthcheckShutdownDelay="PT5S"
5556

5657
# Pekko
5758
# ~~~~
@@ -77,6 +78,7 @@ pekkoConfiguration {
7778
unhandled="on"
7879
}
7980
}
81+
coordinated-shutdown.run-by-jvm-shutdown-hook = off
8082
cluster {
8183
seed-nodes=["pekko://Metrics@127.0.0.1:2551"]
8284
downing-provider-class = "org.apache.pekko.cluster.sbr.SplitBrainResolverProvider"

lib/awaitility-4.0.2.jar

-87.8 KB
Binary file not shown.

lib/hamcrest-2.1.jar

-120 KB
Binary file not shown.

pom.xml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@
8585
<name>Ville Koskela</name>
8686
<email>[email protected]</email>
8787
<organization>Inscope Metrics</organization>
88-
<organizationUrl>http://www.inscopemetrics.com</organizationUrl>
88+
<organizationUrl>https://www.inscopemetrics.com</organizationUrl>
8989
<roles>
9090
<role>developer</role>
9191
</roles>
@@ -191,6 +191,12 @@
191191
<directory>src/main/resources</directory>
192192
<filtering>true</filtering>
193193
</resource>
194+
<resource>
195+
<directory>config</directory>
196+
<includes>
197+
<include>**/*.conf</include>
198+
</includes>
199+
</resource>
194200
</resources>
195201
<plugins>
196202
<!-- Enable Inherited Plugins -->

src/main/java/com/arpnetworking/clusteraggregator/Emitter.java

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,11 @@
2626
import com.google.common.collect.ImmutableMap;
2727
import org.apache.pekko.actor.AbstractActor;
2828
import org.apache.pekko.actor.Props;
29+
import org.apache.pekko.pattern.Patterns;
2930

3031
import java.time.Duration;
3132
import java.time.ZonedDateTime;
33+
import java.util.concurrent.CompletionStage;
3234

3335
/**
3436
* Holds the sinks and emits to them.
@@ -43,7 +45,7 @@ public class Emitter extends AbstractActor {
4345
* @return A new {@link Props}.
4446
*/
4547
public static Props props(final EmitterConfiguration config) {
46-
return Props.create(Emitter.class, config);
48+
return Props.create(Emitter.class, () -> new Emitter(config));
4749
}
4850

4951
/**
@@ -62,6 +64,11 @@ public Emitter(final EmitterConfiguration config) {
6264
.log();
6365
}
6466

67+
/**
 * Start hook for this actor. It adds no behavior of its own: the only
 * statement is the call to the superclass {@code preStart()}.
 */
@Override
68+
// NOTE(review): the throws clause lists Exception twice; the second entry is redundant.
public void preStart() throws Exception, Exception {
69+
super.preStart();
70+
}
71+
6572
@SuppressWarnings("deprecation")
6673
@Override
6774
public Receive createReceive() {
@@ -90,6 +97,22 @@ public Receive createReceive() {
9097
.log();
9198
_sink.recordAggregateData(periodicData);
9299
})
100+
.match(Shutdown.class, ignored -> {
101+
LOGGER.info()
102+
.setMessage("Shutting down emitter")
103+
.log();
104+
105+
final CompletionStage<Object> shutdownFuture = _sink.shutdownGracefully()
106+
.thenApply(ignore -> ShutdownComplete.getInstance());
107+
Patterns.pipe(shutdownFuture, context().dispatcher()).to(self(), sender());
108+
})
109+
.match(ShutdownComplete.class, ignored -> {
110+
LOGGER.info()
111+
.setMessage("Emitter shutdown complete")
112+
.log();
113+
sender().tell("OK", self());
114+
context().stop(self());
115+
})
93116
.build();
94117
}
95118

@@ -101,4 +124,33 @@ public void postStop() throws Exception {
101124

102125
private final Sink _sink;
103126
private static final Logger LOGGER = LoggerFactory.getLogger(Emitter.class);
127+
/**
128+
* Message to initiate a graceful shutdown.
* <p>
* On receipt the emitter calls {@code shutdownGracefully()} on its sink and pipes a
* private completion message back to itself, keeping the original sender. When that
* completion message is handled the emitter replies {@code "OK"} to the sender and
* stops itself.
129+
*/
130+
public static final class Shutdown {
131+
// Private so that the only instance is the shared INSTANCE declared below.
private Shutdown() {}
132+
133+
/**
134+
* Get the singleton instance.
135+
*
136+
* @return the singleton instance
137+
*/
138+
public static Shutdown getInstance() {
139+
return INSTANCE;
140+
}
141+
// The single shared instance returned by getInstance().
private static final Shutdown INSTANCE = new Shutdown();
142+
}
143+
/**
 * Internal message the emitter pipes to itself once the stage returned by the
 * sink's {@code shutdownGracefully()} completes. Handling it logs completion,
 * replies {@code "OK"} to the sender and stops the actor.
 */
private static final class ShutdownComplete {
144+
// Private so that the only instance is the shared INSTANCE declared below.
private ShutdownComplete() {}
145+
146+
/**
147+
* Get the singleton instance.
148+
*
149+
* @return the singleton instance
150+
*/
151+
public static ShutdownComplete getInstance() {
152+
return INSTANCE;
153+
}
154+
// The single shared instance returned by getInstance().
private static final ShutdownComplete INSTANCE = new ShutdownComplete();
155+
}
104156
}

src/main/java/com/arpnetworking/clusteraggregator/GracefulShutdownActor.java

Lines changed: 113 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
*/
1616
package com.arpnetworking.clusteraggregator;
1717

18+
import com.arpnetworking.clusteraggregator.client.HttpSourceActor;
19+
import com.arpnetworking.clusteraggregator.http.Routes;
1820
import com.arpnetworking.steno.Logger;
1921
import com.arpnetworking.steno.LoggerFactory;
2022
import com.google.inject.Inject;
@@ -23,9 +25,13 @@
2325
import org.apache.pekko.actor.AbstractActor;
2426
import org.apache.pekko.actor.ActorRef;
2527
import org.apache.pekko.actor.ActorSystem;
26-
import org.apache.pekko.actor.Terminated;
2728
import org.apache.pekko.cluster.Cluster;
2829
import org.apache.pekko.cluster.sharding.ShardRegion;
30+
import org.apache.pekko.pattern.Patterns;
31+
32+
import java.time.Duration;
33+
import java.util.concurrent.CompletableFuture;
34+
import java.util.concurrent.CompletionStage;
2935

3036
/**
3137
* Shuts down the Pekko cluster gracefully.
@@ -37,11 +43,27 @@ public class GracefulShutdownActor extends AbstractActor {
3743
* Public constructor.
3844
*
3945
* @param shardRegion aggregator shard region
46+
* @param hostEmitter host emitter
47+
* @param clusterEmitter cluster emitter
48+
* @param routes routes
49+
* @param healthcheckShutdownDelay delay after shutting down healthcheck before shutting down emitters
50+
* @param ingestActor ingest actor
4051
*/
4152
@Inject
4253
@SuppressFBWarnings(value = "MC_OVERRIDABLE_METHOD_CALL_IN_CONSTRUCTOR", justification = "Context is safe to use in constructor.")
43-
public GracefulShutdownActor(@Named("aggregator-shard-region") final ActorRef shardRegion) {
54+
public GracefulShutdownActor(
55+
@Named("aggregator-shard-region") final ActorRef shardRegion,
56+
@Named("host-emitter") final ActorRef hostEmitter,
57+
@Named("cluster-emitter") final ActorRef clusterEmitter,
58+
final Routes routes,
59+
@Named("healthcheck-shutdown-delay") final Duration healthcheckShutdownDelay,
60+
@Named("http-ingest-v1") final ActorRef ingestActor) {
4461
_shardRegion = shardRegion;
62+
_hostEmitter = hostEmitter;
63+
_clusterEmitter = clusterEmitter;
64+
_routes = routes;
65+
_healthcheckShutdownDelay = healthcheckShutdownDelay;
66+
_ingestActor = ingestActor;
4567
}
4668

4769
@Override
@@ -52,13 +74,65 @@ public Receive createReceive() {
5274
.setMessage("Initiating graceful shutdown")
5375
.addData("actor", self())
5476
.log();
55-
context().watch(_shardRegion);
77+
self().tell(ShutdownHealthcheck.getInstance(), sender());
78+
})
79+
.match(ShutdownHealthcheck.class, message -> {
80+
LOGGER.info()
81+
.setMessage("Shutting down healthcheck")
82+
.addData("actor", self())
83+
.log();
84+
_routes.shutdownHealthcheck();
85+
_ingestActor.tell(HttpSourceActor.Shutdown.getInstance(), self());
86+
LOGGER.info()
87+
.setMessage("Waiting before proceeding with shutdown of emitters")
88+
.addData("delay", _healthcheckShutdownDelay)
89+
.addData("actor", self())
90+
.log();
91+
context().system().scheduler().scheduleOnce(
92+
_healthcheckShutdownDelay,
93+
self(),
94+
ShutdownEmitter.getInstance(),
95+
context().dispatcher(),
96+
sender());
97+
})
98+
.match(ShutdownEmitter.class, message -> {
99+
LOGGER.info()
100+
.setMessage("Shutting down emitters")
101+
.addData("actor", self())
102+
.log();
103+
final CompletionStage<Object> host = Patterns.ask(_hostEmitter,
104+
Emitter.Shutdown.getInstance(),
105+
Duration.ofSeconds(30));
106+
final CompletionStage<Object> cluster = Patterns.ask(_clusterEmitter,
107+
Emitter.Shutdown.getInstance(),
108+
Duration.ofSeconds(30));
109+
final CompletableFuture<ShutdownShardRegion> allShutdown = CompletableFuture.allOf(
110+
host.toCompletableFuture(),
111+
cluster.toCompletableFuture())
112+
.thenApply(result -> ShutdownShardRegion.getInstance());
113+
Patterns.pipe(allShutdown, context().dispatcher()).to(self(), sender());
114+
115+
})
116+
.match(ShutdownShardRegion.class, message -> {
117+
LOGGER.info()
118+
.setMessage("Shutting down shard region")
119+
.addData("actor", self())
120+
.log();
121+
context().watchWith(_shardRegion, new ShutdownShardRegionComplete(sender()));
56122
_shardRegion.tell(ShardRegion.gracefulShutdownInstance(), self());
57123
})
58-
.match(Terminated.class, terminated -> {
124+
.match(ShutdownShardRegionComplete.class, terminated -> {
125+
terminated._replyTo.tell("OK", self());
59126
_cluster.registerOnMemberRemoved(_system::terminate);
60127
_cluster.leave(_cluster.selfAddress());
61128
})
129+
.matchAny(unhandled -> {
130+
LOGGER.warn()
131+
.setMessage("Received unhandled message")
132+
.addData("message", unhandled)
133+
.addData("actor", self())
134+
.log();
135+
})
62136
.build();
63137
}
64138

@@ -69,7 +143,12 @@ public void preStart() throws Exception {
69143
_system = context().system();
70144
}
71145

72-
private ActorRef _shardRegion;
146+
private final ActorRef _shardRegion;
147+
private final ActorRef _hostEmitter;
148+
private final ActorRef _clusterEmitter;
149+
private final Routes _routes;
150+
private final Duration _healthcheckShutdownDelay;
151+
private final ActorRef _ingestActor;
73152
private Cluster _cluster;
74153
private ActorSystem _system;
75154
private static final Logger LOGGER = LoggerFactory.getLogger(GracefulShutdownActor.class);
@@ -83,10 +162,38 @@ private Shutdown() {}
83162
*
84163
* @return a singleton instance
85164
*/
86-
public static Shutdown instance() {
165+
public static Shutdown getInstance() {
87166
return SHUTDOWN;
88167
}
89168

90169
private static final Shutdown SHUTDOWN = new Shutdown();
91170
}
171+
/**
 * Internal first step of the shutdown sequence. The actor sends it to itself
 * right after logging "Initiating graceful shutdown". Handling it calls
 * {@code shutdownHealthcheck()} on the routes, sends
 * {@code HttpSourceActor.Shutdown} to the ingest actor, and schedules
 * {@code ShutdownEmitter} to itself after the configured healthcheck shutdown delay.
 */
private static final class ShutdownHealthcheck {
172+
// Private so that the only instance is the shared INSTANCE declared below.
private ShutdownHealthcheck() {}
173+
/**
 * Get the singleton instance.
 *
 * @return the singleton instance
 */
public static ShutdownHealthcheck getInstance() {
174+
return INSTANCE;
175+
}
176+
private static final ShutdownHealthcheck INSTANCE = new ShutdownHealthcheck();
177+
}
178+
/**
 * Internal step that follows emitter shutdown. It is piped to the actor itself
 * once the shutdown asks to both the host and the cluster emitter have completed.
 * Handling it registers {@code ShutdownShardRegionComplete} via {@code watchWith}
 * on the shard region and sends the region its graceful-shutdown message.
 * <p>
 * NOTE(review): this message is only produced on successful completion of both
 * asks. If either fails or times out, the piped failure presumably lands in the
 * {@code matchAny} handler and the sequence stops there - confirm.
 */
private static final class ShutdownShardRegion {
179+
// Private so that the only instance is the shared INSTANCE declared below.
private ShutdownShardRegion() {}
180+
/**
 * Get the singleton instance.
 *
 * @return the singleton instance
 */
public static ShutdownShardRegion getInstance() {
181+
return INSTANCE;
182+
}
183+
private static final ShutdownShardRegion INSTANCE = new ShutdownShardRegion();
184+
}
185+
/**
 * Internal step scheduled to the actor itself after the healthcheck shutdown
 * delay. Handling it asks both the host emitter and the cluster emitter to shut
 * down ({@code Emitter.Shutdown}, 30 second ask timeout each) and pipes
 * {@code ShutdownShardRegion} back to the actor when both asks have completed.
 */
private static final class ShutdownEmitter {
186+
// Private so that the only instance is the shared INSTANCE declared below.
private ShutdownEmitter() {}
187+
/**
 * Get the singleton instance.
 *
 * @return the singleton instance
 */
public static ShutdownEmitter getInstance() {
188+
return INSTANCE;
189+
}
190+
private static final ShutdownEmitter INSTANCE = new ShutdownEmitter();
191+
}
192+
/**
 * Internal message registered with {@code watchWith} on the shard region, so it
 * is expected to arrive once the shard region has terminated. Handling it
 * replies {@code "OK"} to the captured actor, registers termination of the actor
 * system for when this member is removed, and makes this node leave the cluster.
 */
private static final class ShutdownShardRegionComplete {
193+
/**
 * Constructor.
 *
 * @param replyTo actor that receives the {@code "OK"} reply when this message is handled
 */
ShutdownShardRegionComplete(final ActorRef replyTo) {
194+
_replyTo = replyTo;
195+
}
196+
197+
// Read directly by the enclosing actor's handler; there is no accessor.
private final ActorRef _replyTo;
198+
}
92199
}

src/main/java/com/arpnetworking/clusteraggregator/GuiceModule.java

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,7 @@ private ActorRef provideHostEmitter(final Injector injector, final ActorSystem s
207207

208208
private ActorRef launchEmitter(final Injector injector, final ActorSystem system, final File pipelineFile, final String name) {
209209
final ActorRef emitterConfigurationProxy = system.actorOf(
210-
ConfigurableActorProxy.props(new RoundRobinEmitterFactory()),
210+
ConfigurableActorProxy.props(new RoundRobinEmitterFactory(_shutdown)),
211211
name);
212212
final ActorConfigurator<EmitterConfiguration> configurator =
213213
new ActorConfigurator<>(emitterConfigurationProxy, EmitterConfiguration.class);
@@ -384,6 +384,13 @@ private boolean provideReaggregationInjectClusterAsHost(final ClusterAggregatorC
384384
return config.getReaggregationInjectClusterAsHost();
385385
}
386386

387+
/**
 * Provides the {@link Duration} bound under the name
 * {@code "healthcheck-shutdown-delay"}.
 *
 * @param config the cluster aggregator configuration to read the delay from
 * @return the value of {@code config.getHealthcheckShutdownDelay()}
 */
@Provides
388+
@Named("healthcheck-shutdown-delay")
389+
@SuppressFBWarnings("UPM_UNCALLED_PRIVATE_METHOD") // Invoked reflectively by Guice
390+
private Duration provideHealthCheckShutdownDelay(final ClusterAggregatorConfiguration config) {
391+
return config.getHealthcheckShutdownDelay();
392+
}
393+
387394
@Provides
388395
@Named("reaggregation-timeout")
389396
@SuppressFBWarnings("UPM_UNCALLED_PRIVATE_METHOD") // Invoked reflectively by Guice
@@ -440,6 +447,11 @@ static List<Sink> createSinks(final ImmutableList<JsonNode> monitoringSinks) {
440447
private static final ObjectMapper OBJECT_MAPPER = ObjectMapperFactory.getInstance();
441448

442449
private static final class RoundRobinEmitterFactory implements ConfiguredLaunchableFactory<Props, EmitterConfiguration> {
450+
/**
451+
* Constructor.
*
* @param shutdown lifecycle registration supplied by the caller.
*     NOTE(review): the constructor body is empty, so this argument is neither
*     stored nor used here - confirm whether that is intentional.
452+
*/
453+
RoundRobinEmitterFactory(final LifecycleRegistration shutdown) {
454+
}
443455

444456
@Override
445457
public Props create(final EmitterConfiguration config) {

src/main/java/com/arpnetworking/clusteraggregator/Main.java

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -37,17 +37,19 @@
3737
import org.apache.pekko.actor.ActorRef;
3838
import org.apache.pekko.actor.ActorSystem;
3939
import org.apache.pekko.http.javadsl.ServerBinding;
40+
import org.apache.pekko.pattern.Patterns;
4041
import org.slf4j.LoggerFactory;
41-
import scala.concurrent.Await;
42-
import scala.concurrent.duration.Duration;
4342

4443
import java.io.File;
44+
import java.time.Duration;
4545
import java.util.List;
4646
import java.util.Locale;
4747
import java.util.Optional;
4848
import java.util.concurrent.CompletionStage;
49+
import java.util.concurrent.ExecutionException;
4950
import java.util.concurrent.Semaphore;
5051
import java.util.concurrent.TimeUnit;
52+
import java.util.concurrent.TimeoutException;
5153

5254
/**
5355
* Entry point for the pekko-based cluster aggregator.
@@ -236,15 +238,21 @@ private void shutdownPekko() {
236238
LOGGER.info()
237239
.setMessage("Stopping Pekko")
238240
.log();
239-
if (_shutdownActor != null) {
240-
_shutdownActor.tell(GracefulShutdownActor.Shutdown.instance(), ActorRef.noSender());
241-
}
241+
242242
try {
243+
if (_shutdownActor != null) {
244+
final CompletionStage<Object> gracefulShutdown =
245+
Patterns.ask(_shutdownActor, GracefulShutdownActor.Shutdown.getInstance(), Duration.ofMinutes(10));
246+
gracefulShutdown.toCompletableFuture().join();
247+
LOGGER.info()
248+
.setMessage("Graceful shutdown actor reported completion")
249+
.log();
250+
}
243251
if (_system != null) {
244-
Await.result(_system.whenTerminated(), SHUTDOWN_TIMEOUT);
252+
_system.getWhenTerminated().toCompletableFuture().get(SHUTDOWN_TIMEOUT.toSeconds(), TimeUnit.SECONDS);
245253
}
246254
// CHECKSTYLE.OFF: IllegalCatch - Prevent program shutdown
247-
} catch (final Exception e) {
255+
} catch (final InterruptedException | TimeoutException | ExecutionException e) {
248256
// CHECKSTYLE.ON: IllegalCatch
249257
LOGGER.warn()
250258
.setMessage("Interrupted at shutdown")
@@ -274,7 +282,7 @@ private static Builder<? extends JsonNodeSource> getFileSourceBuilder(
274282
private volatile List<Database> _databases;
275283

276284
private static final Logger LOGGER = com.arpnetworking.steno.LoggerFactory.getLogger(Main.class);
277-
private static final Duration SHUTDOWN_TIMEOUT = Duration.create(3, TimeUnit.MINUTES);
285+
private static final Duration SHUTDOWN_TIMEOUT = Duration.ofMinutes(3);
278286
private static final SourceTypeLiteral SOURCE_TYPE_LITERAL = new SourceTypeLiteral();
279287
private static final Semaphore SHUTDOWN_SEMAPHORE = new Semaphore(0);
280288
private static final Thread SHUTDOWN_THREAD = new ShutdownThread();

0 commit comments

Comments
 (0)