Skip to content

Commit 3ccd93f

Browse files
authored
Change process kill order for testclusters shutdown (#50216)
The testclusters shutdown code was killing child processes of the ES JVM before the ES JVM. This causes any running ML jobs to be recorded as failed, as the ES JVM notices that they have disconnected from it without being told to stop, as they would if they crashed. In many test suites this doesn't matter because the test cluster will never be restarted, but in the case of upgrade tests it makes it impossible to test what happens when an ML job is running at the time of the upgrade. This change reverses the order of killing the ES process tree such that the parent processes are killed before their children. A list of children is stored before killing the parent so that they can subsequently be killed (if they don't exit by themselves as a side effect of the parent dying). Backport of #50175
1 parent 185cc47 commit 3ccd93f

File tree

2 files changed

+28
-24
lines changed

2 files changed

+28
-24
lines changed

buildSrc/src/main/java/org/elasticsearch/gradle/testclusters/ElasticsearchNode.java

Lines changed: 28 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -772,6 +772,7 @@ public synchronized void stop(boolean tailLogs) {
772772
requireNonNull(esProcess, "Can't stop `" + this + "` as it was not started or already stopped.");
773773
// Test clusters are not reused, don't spend time on a graceful shutdown
774774
stopHandle(esProcess.toHandle(), true);
775+
reaper.unregister(toString());
775776
if (tailLogs) {
776777
logFileContents("Standard output of node", esStdoutFile);
777778
logFileContents("Standard error of node", esStderrFile);
@@ -796,39 +797,43 @@ public void setNameCustomization(Function<String, String> nameCustomizer) {
796797
}
797798

798799
private void stopHandle(ProcessHandle processHandle, boolean forcibly) {
799-
// Stop all children first, ES could actually be a child when there's some wrapper process like on Windows.
800+
// No-op if the process has already exited by itself.
800801
if (processHandle.isAlive() == false) {
801802
LOGGER.info("Process was not running when we tried to terminate it.");
802803
return;
803804
}
804805

805-
// Stop all children first, ES could actually be a child when there's some wrapper process like on Windows.
806-
processHandle.children().forEach(each -> stopHandle(each, forcibly));
806+
// Stop all children last - if the ML processes are killed before the ES JVM then
807+
// they'll be recorded as having failed and won't restart when the cluster restarts.
808+
// ES could actually be a child when there's some wrapper process like on Windows,
809+
// and in that case the ML processes will be grandchildren of the wrapper.
810+
List<ProcessHandle> children = processHandle.children().collect(Collectors.toList());
811+
try {
812+
logProcessInfo(
813+
"Terminating elasticsearch process" + (forcibly ? " forcibly " : "gracefully") + ":",
814+
processHandle.info()
815+
);
807816

808-
logProcessInfo(
809-
"Terminating elasticsearch process" + (forcibly ? " forcibly " : "gracefully") + ":",
810-
processHandle.info()
811-
);
817+
if (forcibly) {
818+
processHandle.destroyForcibly();
819+
} else {
820+
processHandle.destroy();
821+
waitForProcessToExit(processHandle);
822+
if (processHandle.isAlive() == false) {
823+
return;
824+
}
825+
LOGGER.info("process did not terminate after {} {}, stopping it forcefully",
826+
ES_DESTROY_TIMEOUT, ES_DESTROY_TIMEOUT_UNIT);
827+
processHandle.destroyForcibly();
828+
}
812829

813-
if (forcibly) {
814-
processHandle.destroyForcibly();
815-
} else {
816-
processHandle.destroy();
817830
waitForProcessToExit(processHandle);
818-
if (processHandle.isAlive() == false) {
819-
return;
831+
if (processHandle.isAlive()) {
832+
throw new TestClustersException("Was not able to terminate elasticsearch process for " + this);
820833
}
821-
LOGGER.info("process did not terminate after {} {}, stopping it forcefully",
822-
ES_DESTROY_TIMEOUT, ES_DESTROY_TIMEOUT_UNIT);
823-
processHandle.destroyForcibly();
824-
}
825-
826-
waitForProcessToExit(processHandle);
827-
if (processHandle.isAlive()) {
828-
throw new TestClustersException("Was not able to terminate elasticsearch process for " + this);
834+
} finally {
835+
children.forEach(each -> stopHandle(each, forcibly));
829836
}
830-
831-
reaper.unregister(toString());
832837
}
833838

834839
private void logProcessInfo(String prefix, ProcessHandle.Info info) {

x-pack/qa/rolling-upgrade/src/test/java/org/elasticsearch/upgrades/MlMappingsUpgradeIT.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@ protected Collection<String> templatesToWaitFor() {
3838
* The purpose of this test is to ensure that when a job is open through a rolling upgrade we upgrade the results
3939
* index mappings when it is assigned to an upgraded node even if no other ML endpoint is called after the upgrade
4040
*/
41-
@AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/46262")
4241
public void testMappingsUpgrade() throws Exception {
4342

4443
switch (CLUSTER_TYPE) {

0 commit comments

Comments
 (0)