Commit e793631

[core][autoscaler] Fix RAY_NODE_TYPE_NAME handling when autoscaler is in read-only mode (#58460)
This ensures node type names are correctly reported even when the autoscaler is disabled (read-only mode).

## Description

Autoscaler v2 fails to report Prometheus metrics when operating in read-only mode on KubeRay, raising the following `KeyError`:

```
2025-11-08 12:06:57,402 ERROR autoscaler.py:215 -- 'small-group'
Traceback (most recent call last):
  File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/autoscaler/v2/autoscaler.py", line 200, in update_autoscaling_state
    return Reconciler.reconcile(
  File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/autoscaler/v2/instance_manager/reconciler.py", line 120, in reconcile
    Reconciler._step_next(
  File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/autoscaler/v2/instance_manager/reconciler.py", line 275, in _step_next
    Reconciler._scale_cluster(
  File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/autoscaler/v2/instance_manager/reconciler.py", line 1125, in _scale_cluster
    reply = scheduler.schedule(sched_request)
  File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/autoscaler/v2/scheduler.py", line 933, in schedule
    ResourceDemandScheduler._enforce_max_workers_per_type(ctx)
  File "/home/ray/anaconda3/lib/python3.9/site-packages/ray/autoscaler/v2/scheduler.py", line 1006, in _enforce_max_workers_per_type
    node_config = ctx.get_node_type_configs()[node_type]
KeyError: 'small-group'
```

This happens because `ReadOnlyProviderConfigReader` populates `ctx.get_node_type_configs()` using node IDs as node types. That is correct for local Ray, where `RAY_NODE_TYPE_NAME` is not set, but incorrect for KubeRay, where `RAY_NODE_TYPE_NAME` is set and `ray_node_type_name` is therefore present and expected. As a result, in read-only mode the scheduler looks up a node type name (e.g. `small-group`) that never exists in the populated configs.

This PR fixes the issue by using `ray_node_type_name` when it exists and falling back to the node ID only when it does not.

## Related issues

Fixes #58227

Signed-off-by: Rueian <[email protected]>
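For orientation, here is a minimal sketch of the resolution order described above. It assumes a `node_state` object exposing the `ray_node_type_name` and `node_id` fields used in the diff below; the helper name `resolve_node_type` is illustrative only, and the actual change is the one in `config.py` shown further down.

```python
from ray._common.utils import binary_to_hex
from ray.autoscaler._private.util import format_readonly_node_type


def resolve_node_type(node_state) -> str:
    """Illustrative sketch of the node-type resolution this PR describes."""
    # On KubeRay, RAY_NODE_TYPE_NAME is set, so the reported name
    # (e.g. "small-group") is used directly and matches what the scheduler
    # later looks up in ctx.get_node_type_configs().
    if node_state.ray_node_type_name:
        return node_state.ray_node_type_name
    # On local Ray, no name is set, so fall back to the node-ID-derived
    # read-only node type, as before this PR.
    return format_readonly_node_type(binary_to_hex(node_state.node_id))
```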
1 parent 654feda commit e793631

File tree

3 files changed (+72, −9 lines)


python/ray/autoscaler/_private/readonly/defaults.yaml

Lines changed: 0 additions & 1 deletion
```diff
@@ -13,7 +13,6 @@ available_node_types:
         max_workers: 0
 head_node_type: ray.head.default
 upscaling_speed: 1.0
-idle_timeout_minutes: 0
 #
 # !!! Configurations below are not supported in fake cluster mode !!!
 #
```

python/ray/autoscaler/v2/instance_manager/config.py

Lines changed: 14 additions & 7 deletions
```diff
@@ -522,16 +522,23 @@ def refresh_cached_autoscaling_config(self) -> AutoscalingConfig:
 
         head_node_type = None
         for node_state in ray_cluster_resource_state.node_states:
-            node_type = format_readonly_node_type(binary_to_hex(node_state.node_id))
+            node_type = node_state.ray_node_type_name
+            if not node_type:
+                node_type = format_readonly_node_type(binary_to_hex(node_state.node_id))
+
             if is_head_node(node_state):
                 head_node_type = node_type
 
-            available_node_types[node_type] = {
-                "resources": dict(node_state.total_resources),
-                "min_workers": 0,
-                "max_workers": 0 if is_head_node(node_state) else 1,
-                "node_config": {},
-            }
+            if node_type not in available_node_types:
+                available_node_types[node_type] = {
+                    "resources": dict(node_state.total_resources),
+                    "min_workers": 0,
+                    "max_workers": 0 if is_head_node(node_state) else 1,
+                    "node_config": {},
+                }
+            elif not is_head_node(node_state):
+                available_node_types[node_type]["max_workers"] += 1
+
         if available_node_types:
             self._configs["available_node_types"].update(available_node_types)
             self._configs["max_workers"] = len(available_node_types)
```
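As an illustration of the new aggregation behavior (group name and resources here are hypothetical), a cluster with one head node and two workers that both report `RAY_NODE_TYPE_NAME=small-group` would now produce roughly:

```python
# Hypothetical result of the loop above for 1 head + 2 "small-group" workers.
available_node_types = {
    "ray.head.default": {
        "resources": {"CPU": 1},
        "min_workers": 0,
        "max_workers": 0,  # the head node never counts toward max_workers
        "node_config": {},
    },
    "small-group": {
        "resources": {"CPU": 2},  # taken from the first node of this type
        "min_workers": 0,
        "max_workers": 2,  # 1 at creation + 1 via the new elif branch
        "node_config": {},
    },
}
```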

python/ray/autoscaler/v2/tests/test_config.py

Lines changed: 58 additions & 1 deletion
```diff
@@ -5,9 +5,16 @@
 
 import pytest  # noqa
 
+from ray._common.utils import binary_to_hex
 from ray._private.test_utils import get_test_config_path
 from ray.autoscaler import AUTOSCALER_DIR_PATH
-from ray.autoscaler.v2.instance_manager.config import FileConfigReader, Provider
+from ray.autoscaler._private.util import format_readonly_node_type
+from ray.autoscaler.v2.instance_manager import config as config_mod
+from ray.autoscaler.v2.instance_manager.config import (
+    FileConfigReader,
+    Provider,
+    ReadOnlyProviderConfigReader,
+)
 
 
 @pytest.mark.parametrize(
@@ -179,6 +186,56 @@ def test_read_config():
     assert config_reader.get_cached_autoscaling_config().provider == Provider.GCP
 
 
+def test_readonly_node_type_name_and_fallback(monkeypatch):
+    class _DummyNodeState:
+        def __init__(self, ray_node_type_name, node_id, total_resources):
+            self.ray_node_type_name = ray_node_type_name
+            self.node_id = node_id
+            self.total_resources = total_resources
+
+    class _DummyClusterState:
+        def __init__(self, node_states):
+            self.node_states = node_states
+
+    # Avoid real GCS usage.
+    monkeypatch.setattr(config_mod, "GcsClient", lambda address: object())
+    # Build a cluster with:
+    # - 1 named head type
+    # - 2 named worker types of the same type (aggregation check)
+    # - 1 worker type without name (fallback to node_id-based type)
+    unnamed_worker_id = b"\xab"
+    fallback_name = format_readonly_node_type(binary_to_hex(unnamed_worker_id))
+    nodes = [
+        _DummyNodeState(
+            "ray.head.default", b"\x01", {"CPU": 1, "node:__internal_head__": 1}
+        ),
+        _DummyNodeState("worker.custom", b"\x02", {"CPU": 2}),
+        _DummyNodeState("worker.custom", b"\x03", {"CPU": 2}),
+        _DummyNodeState("", unnamed_worker_id, {"CPU": 3}),
+    ]
+    monkeypatch.setattr(
+        config_mod,
+        "get_cluster_resource_state",
+        lambda _gc: _DummyClusterState(nodes),
+    )
+
+    reader = ReadOnlyProviderConfigReader("dummy:0")
+    reader.refresh_cached_autoscaling_config()
+    cfg = reader.get_cached_autoscaling_config()
+
+    node_types = cfg.get_config("available_node_types")
+    # Head assertions
+    assert "ray.head.default" in node_types
+    assert node_types["ray.head.default"]["max_workers"] == 0
+    assert cfg.get_head_node_type() == "ray.head.default"
+    # Preferred name aggregation
+    assert "worker.custom" in node_types
+    assert node_types["worker.custom"]["max_workers"] == 2
+    # Fallback for unnamed worker
+    assert fallback_name in node_types
+    assert node_types[fallback_name]["max_workers"] == 1
+
+
 if __name__ == "__main__":
     if os.environ.get("PARALLEL_CI"):
         sys.exit(pytest.main(["-n", "auto", "--boxed", "-vs", __file__]))
```
