Skip to content

Commit ee3542a

Browse files
authored
Log hot threads after cluster cleanup timeout (#122341)
In addition to logging the pending cluster tasks after the cluster health request times out during cluster cleanup in REST tests, we should log the hot threads to help identify any issues that could cause tasks to get stuck. Follow-up of #119186. Relates #111632. Relates #111431. Relates #111662.
1 parent 3c18ea6 commit ee3542a

File tree

1 file changed

+22
-3
lines changed

1 file changed

+22
-3
lines changed

test/framework/src/main/java/org/elasticsearch/test/rest/ESRestTestCase.java

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1014,14 +1014,21 @@ private void wipeCluster() throws Exception {
10141014

10151015
private void waitForClusterUpdates() throws Exception {
10161016
logger.info("Waiting for all cluster updates up to this moment to be processed");
1017+
10171018
try {
10181019
assertOK(adminClient().performRequest(new Request("GET", "_cluster/health?wait_for_events=languid")));
10191020
} catch (ResponseException e) {
10201021
if (e.getResponse().getStatusLine().getStatusCode() == HttpStatus.SC_REQUEST_TIMEOUT) {
1022+
StringBuilder logMessage = new StringBuilder("Timed out waiting for cluster updates to be processed.");
10211023
final var pendingTasks = getPendingClusterStateTasks();
10221024
if (pendingTasks != null) {
1023-
logger.error("Timed out waiting for cluster updates to be processed, {}", pendingTasks);
1025+
logMessage.append('\n').append(pendingTasks);
1026+
}
1027+
final var hotThreads = getHotThreads();
1028+
if (hotThreads != null) {
1029+
logMessage.append("\nHot threads: ").append(hotThreads);
10241030
}
1031+
logger.error(logMessage.toString());
10251032
}
10261033
throw e;
10271034
}
@@ -1031,8 +1038,8 @@ private static String getPendingClusterStateTasks() {
10311038
try {
10321039
Response response = adminClient().performRequest(new Request("GET", "/_cluster/pending_tasks"));
10331040
List<?> tasks = (List<?>) entityAsMap(response).get("tasks");
1034-
if (false == tasks.isEmpty()) {
1035-
StringBuilder message = new StringBuilder("there are still running tasks:");
1041+
if (tasks.isEmpty() == false) {
1042+
StringBuilder message = new StringBuilder("There are still running tasks:");
10361043
for (Object task : tasks) {
10371044
message.append('\n').append(task.toString());
10381045
}
@@ -1044,6 +1051,18 @@ private static String getPendingClusterStateTasks() {
10441051
return null;
10451052
}
10461053

1054+
/**
 * Requests {@code /_nodes/hot_threads} (with {@code ignore_idle_threads=false} and {@code threads=9999}) through the admin client
 * and returns the response body as a string.
 *
 * @return the response entity converted to a string, or {@code null} if the request failed with an {@link IOException}
 */
private String getHotThreads() {
1055+
try {
1056+
Response response = adminClient().performRequest(
1057+
new Request("GET", "/_nodes/hot_threads?ignore_idle_threads=false&threads=9999")
1058+
);
1059+
return EntityUtils.toString(response.getEntity());
1060+
} catch (IOException e) {
1061+
// The failure is only logged, not rethrown; execution falls through to the null return below.
logger.error("Failed to retrieve hot threads in the cluster during cleanup", e);
1062+
}
1063+
return null;
1064+
}
1065+
10471066
/**
10481067
* This method checks whether ILM policies or templates get recreated after they have been deleted. If so, we are probably deleting
10491068
* them unnecessarily, potentially causing test performance problems. This could happen for example if someone adds a new standard ILM

0 commit comments

Comments
 (0)