Commit c9cce6a

ryankert01 authored and bveeramani committed
[Data] DefaultAutoscalerV2 doesn't scale nodes from zero (ray-project#59896)
## Description

Addresses a critical issue in `DefaultAutoscalerV2` where nodes were not being properly scaled from zero. With this update, clusters managed by Ray automatically provision additional nodes when there is workload demand, even when starting from an idle (zero-node) state.

## Related issues

Closes ray-project#59682

---------

Signed-off-by: Hsien-Cheng Huang <[email protected]>
Co-authored-by: Balaji Veeramani <[email protected]>
Signed-off-by: jasonwrwang <[email protected]>
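For orientation, here is a minimal sketch of the discovery logic the diff below introduces: node types are read from the cluster config and seeded with a count of zero, then live worker nodes are counted on top. The `NodeGroupConfig` stand-in and `node_spec_counts` helper are simplified illustrations for this writeup, not Ray's actual protobuf types or the real method.

```python
# Minimal sketch of the scale-from-zero discovery logic (hypothetical
# stand-ins, not Ray's actual protobuf ClusterConfig/NodeGroupConfig types).
from collections import defaultdict
from dataclasses import dataclass
from typing import Dict, List


@dataclass(frozen=True)
class NodeResourceSpec:
    cpu: float
    gpu: float
    mem: float


@dataclass
class NodeGroupConfig:
    resources: Dict[str, float]
    max_count: int


def node_spec_counts(
    node_groups: List[NodeGroupConfig],
    alive_worker_resources: List[Dict[str, float]],
) -> Dict[NodeResourceSpec, int]:
    counts: Dict[NodeResourceSpec, int] = defaultdict(int)

    # Seed every scalable node type from the config with a count of zero,
    # so node types are known even when no workers are running.
    for group in node_groups:
        if not group.resources or group.max_count == 0:
            continue
        counts[
            NodeResourceSpec(
                cpu=group.resources.get("CPU", 0),
                gpu=group.resources.get("GPU", 0),
                mem=group.resources.get("memory", 0),
            )
        ] = 0

    # Overlay the live worker nodes on top of the seeded counts.
    for r in alive_worker_resources:
        counts[NodeResourceSpec(r["CPU"], r.get("GPU", 0), r["memory"])] += 1
    return dict(counts)


# With zero workers, the config-declared type still appears with count 0:
groups = [NodeGroupConfig({"CPU": 4, "memory": 1000}, max_count=10)]
assert node_spec_counts(groups, []) == {NodeResourceSpec(4, 0, 1000): 0}
```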
1 parent 822d333 · commit c9cce6a

File tree

2 files changed: +185 -8 lines changed

python/ray/data/_internal/cluster_autoscaler/default_cluster_autoscaler_v2.py

Lines changed: 25 additions & 5 deletions

```diff
@@ -117,16 +117,36 @@ def __init__(
         # so the first `get_total_resources` call can get the allocated resources.
         self._send_resource_request([])

-    def _get_node_resource_spec_and_count(self) -> Dict[_NodeResourceSpec, int]:
-        """Get the unique node resource specs and their count in the cluster."""
-        # Filter out the head node.
+    def get_node_resource_spec_and_count(self) -> Dict[_NodeResourceSpec, int]:
+        """Get node types from cluster config and count alive nodes.
+
+        Enables scaling from zero by discovering node types from cluster config
+        even when no worker nodes are running.
+        """
+        nodes_resource_spec_count = defaultdict(int)
+
+        # Discover node types from cluster config
+        cluster_config = ray._private.state.state.get_cluster_config()
+        if cluster_config and cluster_config.node_group_configs:
+            for node_group_config in cluster_config.node_group_configs:
+                # Skip if no resources or max_count=0 (cannot scale)
+                if not node_group_config.resources or node_group_config.max_count == 0:
+                    continue
+
+                node_resource_spec = _NodeResourceSpec.of(
+                    cpu=node_group_config.resources.get("CPU", 0),
+                    gpu=node_group_config.resources.get("GPU", 0),
+                    mem=node_group_config.resources.get("memory", 0),
+                )
+                nodes_resource_spec_count[node_resource_spec] = 0
+
+        # Count alive worker nodes
         node_resources = [
             node["Resources"]
             for node in ray.nodes()
             if node["Alive"] and "node:__internal_head__" not in node["Resources"]
         ]

-        nodes_resource_spec_count = defaultdict(int)
         for r in node_resources:
             node_resource_spec = _NodeResourceSpec.of(
                 cpu=r["CPU"], gpu=r.get("GPU", 0), mem=r["memory"]
@@ -168,7 +188,7 @@ def try_trigger_scaling(self):
             return

         resource_request = []
-        node_resource_spec_count = self._get_node_resource_spec_and_count()
+        node_resource_spec_count = self.get_node_resource_spec_and_count()
         debug_msg = ""
         if logger.isEnabledFor(logging.DEBUG):
             debug_msg = (
```
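To show how those counts feed a scale-up, here is a hedged sketch of the request-building step, mirroring the from-zero test in the next file: each discovered node type contributes `scale_up_delta` resource bundles. The `build_scale_up_request` helper is illustrative only; the real `try_trigger_scaling` also gates on utilization thresholds, which this sketch omits.

```python
# Illustrative request-building step (hypothetical helper mirroring the
# from-zero test; the real try_trigger_scaling also checks utilization).
from typing import Dict, List, NamedTuple


class Spec(NamedTuple):  # simplified stand-in for _NodeResourceSpec
    cpu: float
    gpu: float
    mem: float


def build_scale_up_request(
    spec_counts: Dict[Spec, int], scale_up_delta: int
) -> List[Dict[str, float]]:
    request: List[Dict[str, float]] = []
    for spec in spec_counts:
        # One resource bundle per requested node of this type.
        request.extend(
            [{"CPU": spec.cpu, "GPU": spec.gpu, "memory": spec.mem}]
            * scale_up_delta
        )
    return request


# Two node types with zero running nodes and delta=1 yield one bundle each:
counts = {Spec(4, 0, 1000): 0, Spec(8, 2, 2000): 0}
assert build_scale_up_request(counts, 1) == [
    {"CPU": 4, "GPU": 0, "memory": 1000},
    {"CPU": 8, "GPU": 2, "memory": 2000},
]
```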

python/ray/data/tests/test_default_cluster_autoscaler_v2.py

Lines changed: 160 additions & 3 deletions

```diff
@@ -3,6 +3,7 @@
 import pytest

 import ray
+from ray.core.generated import autoscaler_pb2
 from ray.data._internal.cluster_autoscaler.default_cluster_autoscaler_v2 import (
     DefaultClusterAutoscalerV2,
     _NodeResourceSpec,
@@ -72,7 +73,7 @@ def teardown_class(self):
         ray.shutdown()

     def test_get_node_resource_spec_and_count(self):
-        # Test _get_node_resource_spec_and_count
+        # Test get_node_resource_spec_and_count
         autoscaler = DefaultClusterAutoscalerV2(
             resource_manager=MagicMock(),
             execution_id="test_execution_id",
@@ -123,8 +124,13 @@ def test_get_node_resource_spec_and_count(self):
             ): 1,
         }

+        # Patch cluster config to return None
         with patch("ray.nodes", return_value=node_table):
-            assert autoscaler._get_node_resource_spec_and_count() == expected
+            with patch(
+                "ray._private.state.state.get_cluster_config",
+                return_value=None,
+            ):
+                assert autoscaler.get_node_resource_spec_and_count() == expected

     @pytest.mark.parametrize("cpu_util", [0.5, 0.75])
     @pytest.mark.parametrize("gpu_util", [0.5, 0.75])
@@ -154,7 +160,7 @@ def test_try_scale_up_cluster(

         resource_spec1 = _NodeResourceSpec.of(cpu=4, gpu=0, mem=1000)
         resource_spec2 = _NodeResourceSpec.of(cpu=8, gpu=1, mem=1000)
-        autoscaler._get_node_resource_spec_and_count = MagicMock(
+        autoscaler.get_node_resource_spec_and_count = MagicMock(
             return_value={
                 resource_spec1: 2,
                 resource_spec2: 1,
@@ -195,6 +201,157 @@ def test_try_scale_up_cluster(

         _send_resource_request.assert_called_with(expected_resource_request)

+    def test_get_node_resource_spec_and_count_from_zero(self):
+        """Test that get_node_resource_spec_and_count can discover node types
+        from cluster config even when there are zero worker nodes."""
+        autoscaler = DefaultClusterAutoscalerV2(
+            resource_manager=MagicMock(),
+            execution_id="test_execution_id",
+        )
+
+        # Simulate a cluster with only head node (no worker nodes)
+        node_table = [
+            {
+                "Resources": self._head_node,
+                "Alive": True,
+            },
+        ]
+
+        # Create a mock cluster config with 2 worker node types
+        cluster_config = autoscaler_pb2.ClusterConfig()
+
+        # Node type 1: 4 CPU, 0 GPU, 1000 memory
+        node_group_config1 = autoscaler_pb2.NodeGroupConfig()
+        node_group_config1.resources["CPU"] = 4
+        node_group_config1.resources["memory"] = 1000
+        node_group_config1.max_count = 10
+        cluster_config.node_group_configs.append(node_group_config1)
+
+        # Node type 2: 8 CPU, 2 GPU, 2000 memory
+        node_group_config2 = autoscaler_pb2.NodeGroupConfig()
+        node_group_config2.resources["CPU"] = 8
+        node_group_config2.resources["GPU"] = 2
+        node_group_config2.resources["memory"] = 2000
+        node_group_config2.max_count = 5
+        cluster_config.node_group_configs.append(node_group_config2)
+
+        expected = {
+            _NodeResourceSpec.of(cpu=4, gpu=0, mem=1000): 0,
+            _NodeResourceSpec.of(cpu=8, gpu=2, mem=2000): 0,
+        }
+
+        with patch("ray.nodes", return_value=node_table):
+            with patch(
+                "ray._private.state.state.get_cluster_config",
+                return_value=cluster_config,
+            ):
+                result = autoscaler.get_node_resource_spec_and_count()
+                assert result == expected
+
+    @patch(
+        "ray.data._internal.cluster_autoscaler.default_cluster_autoscaler_v2.DefaultClusterAutoscalerV2._send_resource_request"
+    )
+    def test_try_scale_up_cluster_from_zero(self, _send_resource_request):
+        """Test that the autoscaler can scale up from zero worker nodes."""
+        scale_up_threshold = 0.75
+        scale_up_delta = 1
+        # High utilization to trigger scaling
+        utilization = ExecutionResources(cpu=0.9, gpu=0.9, object_store_memory=0.9)
+
+        autoscaler = DefaultClusterAutoscalerV2(
+            resource_manager=MagicMock(),
+            execution_id="test_execution_id",
+            cluster_scaling_up_delta=scale_up_delta,
+            resource_utilization_calculator=StubUtilizationGauge(utilization),
+            cluster_scaling_up_util_threshold=scale_up_threshold,
+        )
+        _send_resource_request.assert_called_with([])
+
+        # Mock the node resource spec with zero counts
+        resource_spec1 = _NodeResourceSpec.of(cpu=4, gpu=0, mem=1000)
+        resource_spec2 = _NodeResourceSpec.of(cpu=8, gpu=2, mem=2000)
+        autoscaler.get_node_resource_spec_and_count = MagicMock(
+            return_value={
+                resource_spec1: 0,  # Zero nodes of this type
+                resource_spec2: 0,  # Zero nodes of this type
+            },
+        )
+
+        autoscaler.try_trigger_scaling()
+
+        # Should request scale_up_delta nodes of each type
+        expected_resource_request = []
+
+        expected_resource_request.extend(
+            [
+                {
+                    "CPU": resource_spec1.cpu,
+                    "GPU": resource_spec1.gpu,
+                    "memory": resource_spec1.mem,
+                }
+            ]
+            * scale_up_delta
+        )
+
+        expected_resource_request.extend(
+            [
+                {
+                    "CPU": resource_spec2.cpu,
+                    "GPU": resource_spec2.gpu,
+                    "memory": resource_spec2.mem,
+                }
+            ]
+            * scale_up_delta
+        )
+
+        _send_resource_request.assert_called_with(expected_resource_request)
+
+    def test_get_node_resource_spec_and_count_skips_max_count_zero(self):
+        """Test that node types with max_count=0 are skipped."""
+        autoscaler = DefaultClusterAutoscalerV2(
+            resource_manager=MagicMock(),
+            execution_id="test_execution_id",
+        )
+
+        # Simulate a cluster with only head node (no worker nodes)
+        node_table = [
+            {
+                "Resources": self._head_node,
+                "Alive": True,
+            },
+        ]
+
+        # Create a mock cluster config with one valid node type and one with max_count=0
+        cluster_config = autoscaler_pb2.ClusterConfig()
+
+        # Node type 1: 4 CPU, 0 GPU, 1000 memory, max_count=10
+        node_group_config1 = autoscaler_pb2.NodeGroupConfig()
+        node_group_config1.resources["CPU"] = 4
+        node_group_config1.resources["memory"] = 1000
+        node_group_config1.max_count = 10
+        cluster_config.node_group_configs.append(node_group_config1)
+
+        # Node type 2: 8 CPU, 2 GPU, 2000 memory, max_count=0 (should be skipped)
+        node_group_config2 = autoscaler_pb2.NodeGroupConfig()
+        node_group_config2.resources["CPU"] = 8
+        node_group_config2.resources["GPU"] = 2
+        node_group_config2.resources["memory"] = 2000
+        node_group_config2.max_count = 0  # This should be skipped
+        cluster_config.node_group_configs.append(node_group_config2)
+
+        # Only the first node type should be discovered
+        expected = {
+            _NodeResourceSpec.of(cpu=4, gpu=0, mem=1000): 0,
+        }
+
+        with patch("ray.nodes", return_value=node_table):
+            with patch(
+                "ray._private.state.state.get_cluster_config",
+                return_value=cluster_config,
+            ):
+                result = autoscaler.get_node_resource_spec_and_count()
+                assert result == expected

 if __name__ == "__main__":
     import sys
```
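As a usage illustration (an assumption for this writeup, not part of the PR), a Ray Data workload like the one below is the kind that previously stalled on a zero-worker autoscaling cluster and should now cause workers to be provisioned on demand:

```python
import ray

# Assumes a cluster launched with autoscaling enabled and min_workers=0.
ray.init()

# A CPU-hungry Ray Data job; with the fixed DefaultAutoscalerV2, running it
# against an idle cluster should now trigger worker provisioning.
ds = ray.data.range(10_000_000).map(lambda row: {"id": row["id"] * 2})
print(ds.count())
```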
