Skip to content

Commit 3b66e6f

Browse files
authored
Log hot threads after cluster cleanup timeout (elastic#122341) (elastic#122462)
In addition to logging the pending cluster tasks after the cluster health request times out during cluster cleanup in REST tests, we should log the hot threads to help identify any issues that could cause tasks to get stuck. Follow-up of elastic#119186. Relates elastic#111632. Relates elastic#111431. Relates elastic#111662.
1 parent 80a090f commit 3b66e6f

File tree

1 file changed

+22
-3
lines changed

1 file changed

+22
-3
lines changed

test/framework/src/main/java/org/elasticsearch/test/rest/ESRestTestCase.java

Lines changed: 22 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1012,14 +1012,21 @@ private void wipeCluster() throws Exception {
10121012

10131013
private void waitForClusterUpdates() throws Exception {
10141014
logger.info("Waiting for all cluster updates up to this moment to be processed");
1015+
10151016
try {
10161017
assertOK(adminClient().performRequest(new Request("GET", "_cluster/health?wait_for_events=languid")));
10171018
} catch (ResponseException e) {
10181019
if (e.getResponse().getStatusLine().getStatusCode() == HttpStatus.SC_REQUEST_TIMEOUT) {
1020+
StringBuilder logMessage = new StringBuilder("Timed out waiting for cluster updates to be processed.");
10191021
final var pendingTasks = getPendingClusterStateTasks();
10201022
if (pendingTasks != null) {
1021-
logger.error("Timed out waiting for cluster updates to be processed, {}", pendingTasks);
1023+
logMessage.append('\n').append(pendingTasks);
1024+
}
1025+
final var hotThreads = getHotThreads();
1026+
if (hotThreads != null) {
1027+
logMessage.append("\nHot threads: ").append(hotThreads);
10221028
}
1029+
logger.error(logMessage.toString());
10231030
}
10241031
throw e;
10251032
}
@@ -1029,8 +1036,8 @@ private static String getPendingClusterStateTasks() {
10291036
try {
10301037
Response response = adminClient().performRequest(new Request("GET", "/_cluster/pending_tasks"));
10311038
List<?> tasks = (List<?>) entityAsMap(response).get("tasks");
1032-
if (false == tasks.isEmpty()) {
1033-
StringBuilder message = new StringBuilder("there are still running tasks:");
1039+
if (tasks.isEmpty() == false) {
1040+
StringBuilder message = new StringBuilder("There are still running tasks:");
10341041
for (Object task : tasks) {
10351042
message.append('\n').append(task.toString());
10361043
}
@@ -1042,6 +1049,18 @@ private static String getPendingClusterStateTasks() {
10421049
return null;
10431050
}
10441051

1052+
private String getHotThreads() {
1053+
try {
1054+
Response response = adminClient().performRequest(
1055+
new Request("GET", "/_nodes/hot_threads?ignore_idle_threads=false&threads=9999")
1056+
);
1057+
return EntityUtils.toString(response.getEntity());
1058+
} catch (IOException e) {
1059+
logger.error("Failed to retrieve hot threads in the cluster during cleanup", e);
1060+
}
1061+
return null;
1062+
}
1063+
10451064
/**
10461065
* This method checks whether ILM policies or templates get recreated after they have been deleted. If so, we are probably deleting
10471066
* them unnecessarily, potentially causing test performance problems. This could happen for example if someone adds a new standard ILM

0 commit comments

Comments
 (0)