Log hot threads after cluster cleanup timeout (#122341)

nielsbauman · web-flow · commit ee3542a27c8f · 2025-02-13T04:39:05.000+01:00
In addition to logging the pending cluster tasks after the cluster health request times out during cluster cleanup in REST tests, we should log the hot threads to help identify any issues that could cause tasks to get stuck. Follow-up to #119186. Relates to #111632, #111431, and #111662.
diff --git a/test/framework/src/main/java/org/elasticsearch/test/rest/ESRestTestCase.java b/test/framework/src/main/java/org/elasticsearch/test/rest/ESRestTestCase.java
@@ -1014,14 +1014,21 @@ private void wipeCluster() throws Exception {
 
     private void waitForClusterUpdates() throws Exception {
         logger.info("Waiting for all cluster updates up to this moment to be processed");
+
         try {
             assertOK(adminClient().performRequest(new Request("GET", "_cluster/health?wait_for_events=languid")));
         } catch (ResponseException e) {
             if (e.getResponse().getStatusLine().getStatusCode() == HttpStatus.SC_REQUEST_TIMEOUT) {
+                StringBuilder logMessage = new StringBuilder("Timed out waiting for cluster updates to be processed.");
                 final var pendingTasks = getPendingClusterStateTasks();
                 if (pendingTasks != null) {
-                    logger.error("Timed out waiting for cluster updates to be processed, {}", pendingTasks);
+                    logMessage.append('\n').append(pendingTasks);
+                }
+                final var hotThreads = getHotThreads();
+                if (hotThreads != null) {
+                    logMessage.append("\nHot threads: ").append(hotThreads);
                 }
+                logger.error(logMessage.toString());
             }
             throw e;
         }
@@ -1031,8 +1038,8 @@ private static String getPendingClusterStateTasks() {
         try {
             Response response = adminClient().performRequest(new Request("GET", "/_cluster/pending_tasks"));
             List<?> tasks = (List<?>) entityAsMap(response).get("tasks");
-            if (false == tasks.isEmpty()) {
-                StringBuilder message = new StringBuilder("there are still running tasks:");
+            if (tasks.isEmpty() == false) {
+                StringBuilder message = new StringBuilder("There are still running tasks:");
                 for (Object task : tasks) {
                     message.append('\n').append(task.toString());
                 }
@@ -1044,6 +1051,18 @@ private static String getPendingClusterStateTasks() {
         return null;
     }
 
+    private String getHotThreads() {
+        try {
+            Response response = adminClient().performRequest(
+                new Request("GET", "/_nodes/hot_threads?ignore_idle_threads=false&threads=9999")
+            );
+            return EntityUtils.toString(response.getEntity());
+        } catch (IOException e) {
+            logger.error("Failed to retrieve hot threads in the cluster during cleanup", e);
+        }
+        return null;
+    }
+
     /**
      * This method checks whether ILM policies or templates get recreated after they have been deleted. If so, we are probably deleting
      * them unnecessarily, potentially causing test performance problems. This could happen for example if someone adds a new standard ILM