Skip to content

Commit 2696940

Browse files
authored
change: autofill required instance-type label (#68)
* change: autofill required instance-type label
* Add validation for CLI input
* Respect user provided label_selector preferred instance_type
* safer assignment of label
* fix assignment in validator
* use node.* prefix instead of beta.* prefix in label
* Update README.md
1 parent e97a318 commit 2696940

File tree

5 files changed

+290
-2
lines changed

5 files changed

+290
-2
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -147,7 +147,7 @@ hyperpod start-job --job-name <job-name> [--namespace <namespace>] [--job-kind <
147147
* `script-args` (list[string]) - Optional. The list of arguments for entry scripts.
148148
* `environment` (dict[string, string]) - Optional. The environment variables (key-value pairs) to set in the containers.
149149
* `node-count` (int) - Required. The number of nodes (instances) to launch the jobs on.
150-
* `instance-type` (string) - Required. The instance type to launch the job on. Note that the instance types you can use are the available instances within your SageMaker quotas for instances prefixed with `ml`.
150+
* `instance-type` (string) - Required. The instance type to launch the job on. Note that the instance types you can use are the available instances within your SageMaker quotas for instances prefixed with `ml`. If `node.kubernetes.io/instance-type` is provided via the `label-selector` it will take precedence for node selection.
151151
* `tasks-per-node` (int) - Optional. The number of devices to use per instance.
152152
* `label-selector` (dict[string, list[string]]) - Optional. A dictionary of labels and their values that will override the predefined node selection rules based on the SageMaker HyperPod `node-health-status` label and values. If users provide this field, the CLI will launch the job with this customized label selection.
153153
* `deep-health-check-passed-nodes-only` (bool) - Optional. If set to `true`, the job will be launched only on nodes that have the `deep-health-check-status` label with the value `passed`.

src/hyperpod_cli/commands/job.py

Lines changed: 16 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,7 @@
3232
HYPERPOD_KUBERNETES_JOB_PREFIX,
3333
HYPERPOD_MAX_RETRY_ANNOTATION_KEY,
3434
HYPERPOD_NAMESPACE_PREFIX,
35+
INSTANCE_TYPE_LABEL,
3536
KUEUE_JOB_UID_LABEL_KEY,
3637
KUEUE_QUEUE_NAME_LABEL_KEY,
3738
KUEUE_WORKLOAD_PRIORITY_CLASS_LABEL_KEY,
@@ -661,7 +662,7 @@ def start_job(
661662
config["cluster"]["cluster_config"]["volumes"] = volume_mount
662663

663664
if label_selector is not None:
664-
config["cluster"]["cluster_config"]["label_selector"] = label_selector
665+
config["cluster"]["cluster_config"]["label_selector"] = json.loads(label_selector)
665666
elif deep_health_check_passed_nodes_only:
666667
config["cluster"]["cluster_config"]["label_selector"] = (
667668
DEEP_HEALTH_CHECK_PASSED_ONLY_NODE_AFFINITY_DICT
@@ -671,6 +672,20 @@ def start_job(
671672
NODE_AFFINITY_DICT
672673
)
673674

675+
label_selector = config["cluster"]["cluster_config"].setdefault("label_selector",{})
676+
required_labels = label_selector.get("required", {})
677+
preferred_labels = label_selector.get("preferred", {})
678+
679+
if (
680+
not required_labels.get(INSTANCE_TYPE_LABEL) and
681+
not preferred_labels.get(INSTANCE_TYPE_LABEL)
682+
):
683+
if "required" not in label_selector:
684+
label_selector["required"] = {}
685+
label_selector["required"][INSTANCE_TYPE_LABEL] = (
686+
[str(instance_type)]
687+
)
688+
674689
if auto_resume:
675690
# Set max_retry default to 1
676691
if max_retry is None:

src/hyperpod_cli/validators/job_validator.py

Lines changed: 15 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,7 @@
2424
KUEUE_QUEUE_NAME_LABEL_KEY,
2525
HYPERPOD_AUTO_RESUME_ANNOTATION_KEY,
2626
HYPERPOD_MAX_RETRY_ANNOTATION_KEY,
27+
INSTANCE_TYPE_LABEL,
2728
SchedulerType
2829
)
2930
from hyperpod_cli.constants.hyperpod_instance_types import (
@@ -184,6 +185,20 @@ def validate_yaml_content(data):
184185
if custom_labels is not None:
185186
queue_name = custom_labels.get(KUEUE_QUEUE_NAME_LABEL_KEY, None)
186187

188+
label_selector = cluster_config_fields.setdefault("label_selector",{})
189+
required_labels = label_selector.get("required", {})
190+
preferred_labels = label_selector.get("preferred", {})
191+
192+
if (
193+
not required_labels.get(INSTANCE_TYPE_LABEL) and
194+
not preferred_labels.get(INSTANCE_TYPE_LABEL)
195+
):
196+
if "required" not in label_selector:
197+
label_selector["required"] = {}
198+
label_selector["required"][INSTANCE_TYPE_LABEL] = (
199+
[str(instance_type)]
200+
)
201+
187202
auto_resume = False
188203
max_retry = None
189204
if annotations is not None:

test/unit_tests/test_job.py

Lines changed: 163 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -493,6 +493,169 @@ def test_start_job_with_cli_args(
493493
print(f"Exception: {result.exception}")
494494
self.assertEqual(result.exit_code, 0)
495495

496+
@mock.patch('subprocess.run')
497+
@mock.patch("yaml.dump")
498+
@mock.patch("os.path.exists", return_value=True)
499+
@mock.patch("os.remove", return_value=None)
500+
@mock.patch("hyperpod_cli.utils.get_cluster_console_url")
501+
@mock.patch("hyperpod_cli.clients.kubernetes_client.KubernetesClient.__new__")
502+
@mock.patch("hyperpod_cli.commands.job.JobValidator")
503+
@mock.patch("boto3.Session")
504+
def test_start_job_default_label_selector_config(
505+
self,
506+
mock_boto3,
507+
mock_validator_cls,
508+
mock_kubernetes_client,
509+
mock_get_console_link,
510+
mock_remove,
511+
mock_exists,
512+
mock_yaml_dump,
513+
mock_subprocess_run,
514+
):
515+
# Setup mocks
516+
mock_validator = mock_validator_cls.return_value
517+
mock_validator.validate_aws_credential.return_value = True
518+
mock_kubernetes_client.get_current_context_namespace.return_value = "kubeflow"
519+
mock_get_console_link.return_value = "test-console-link"
520+
mock_subprocess_run.return_value = subprocess.CompletedProcess(
521+
args=['some_command'],
522+
returncode=0,
523+
stdout='Command executed successfully',
524+
stderr=''
525+
)
526+
527+
expected_default_label_selector_config = {
528+
"required": {
529+
"sagemaker.amazonaws.com/node-health-status": ["Schedulable"],
530+
"node.kubernetes.io/instance-type": ["ml.c5.xlarge"]
531+
},
532+
"preferred": {"sagemaker.amazonaws.com/deep-health-check-status": ["Passed"]},
533+
"weights": [100],
534+
}
535+
536+
# Capture the yaml.dump calls to inspect the config
537+
configs_dumped = []
538+
def capture_yaml_dump(config, *args, **kwargs):
539+
configs_dumped.append(config)
540+
print(f"Dumped config: {config}")
541+
return None
542+
mock_yaml_dump.side_effect = capture_yaml_dump
543+
544+
# Run the command
545+
result = self.runner.invoke(
546+
start_job,
547+
[
548+
"--job-name", "test-job",
549+
"--instance-type", "ml.c5.xlarge",
550+
"--image", "pytorch:1.9.0-cuda11.1-cudnn8-runtime",
551+
"--node-count", "2",
552+
"--entry-script", "/opt/train/src/train.py",
553+
],
554+
catch_exceptions=False
555+
)
556+
557+
# Verify the command executed successfully
558+
self.assertEqual(result.exit_code, 0)
559+
560+
# Get the config that was generated
561+
self.assertTrue(len(configs_dumped) > 0, "No config was generated")
562+
config = configs_dumped[0] # Get the first config that was dumped
563+
564+
# Verify label_selector configuration
565+
self.assertIn('cluster', config)
566+
self.assertIn('cluster_config', config['cluster'])
567+
self.assertIn('label_selector', config['cluster']['cluster_config'])
568+
569+
self.assertEqual(
570+
config['cluster']['cluster_config']['label_selector'],
571+
expected_default_label_selector_config
572+
)
573+
574+
print(f"Exit code: {result.exit_code}")
575+
print(f"Output: {result.output}")
576+
if result.exception:
577+
print(f"Exception: {result.exception}")
578+
579+
@mock.patch('subprocess.run')
580+
@mock.patch("yaml.dump")
581+
@mock.patch("os.path.exists", return_value=True)
582+
@mock.patch("os.remove", return_value=None)
583+
@mock.patch("hyperpod_cli.utils.get_cluster_console_url")
584+
@mock.patch("hyperpod_cli.clients.kubernetes_client.KubernetesClient.__new__")
585+
@mock.patch("hyperpod_cli.commands.job.JobValidator")
586+
@mock.patch("boto3.Session")
587+
def test_start_job_label_selector_preferred_instance_type(
588+
self,
589+
mock_boto3,
590+
mock_validator_cls,
591+
mock_kubernetes_client,
592+
mock_get_console_link,
593+
mock_remove,
594+
mock_exists,
595+
mock_yaml_dump,
596+
mock_subprocess_run,
597+
):
598+
# Setup mocks
599+
mock_validator = mock_validator_cls.return_value
600+
mock_validator.validate_aws_credential.return_value = True
601+
mock_kubernetes_client.get_current_context_namespace.return_value = "kubeflow"
602+
mock_get_console_link.return_value = "test-console-link"
603+
mock_subprocess_run.return_value = subprocess.CompletedProcess(
604+
args=['some_command'],
605+
returncode=0,
606+
stdout='Command executed successfully',
607+
stderr=''
608+
)
609+
610+
expected_default_label_selector_config = {
611+
"preferred": {"node.kubernetes.io/instance-type": ["ml.c5.xlarge"]},
612+
}
613+
614+
# Capture the yaml.dump calls to inspect the config
615+
configs_dumped = []
616+
def capture_yaml_dump(config, *args, **kwargs):
617+
configs_dumped.append(config)
618+
print(f"Dumped config: {config}")
619+
return None
620+
mock_yaml_dump.side_effect = capture_yaml_dump
621+
622+
# Run the command
623+
result = self.runner.invoke(
624+
start_job,
625+
[
626+
"--job-name", "test-job",
627+
"--instance-type", "ml.c5.xlarge",
628+
"--image", "pytorch:1.9.0-cuda11.1-cudnn8-runtime",
629+
"--node-count", "2",
630+
"--entry-script", "/opt/train/src/train.py",
631+
"--label-selector",
632+
'{"preferred": {"node.kubernetes.io/instance-type": ["ml.c5.xlarge"]}}',
633+
],
634+
catch_exceptions=False
635+
)
636+
637+
# Verify the command executed successfully
638+
self.assertEqual(result.exit_code, 0)
639+
640+
# Get the config that was generated
641+
self.assertTrue(len(configs_dumped) > 0, "No config was generated")
642+
config = configs_dumped[0] # Get the first config that was dumped
643+
644+
# Verify label_selector configuration
645+
self.assertIn('cluster', config)
646+
self.assertIn('cluster_config', config['cluster'])
647+
self.assertIn('label_selector', config['cluster']['cluster_config'])
648+
649+
self.assertEqual(
650+
config['cluster']['cluster_config']['label_selector'],
651+
expected_default_label_selector_config
652+
)
653+
654+
print(f"Exit code: {result.exit_code}")
655+
print(f"Output: {result.output}")
656+
if result.exception:
657+
print(f"Exception: {result.exception}")
658+
496659
@mock.patch('subprocess.run')
497660
@mock.patch("yaml.dump")
498661
@mock.patch("os.path.exists", return_value=True)

test/unit_tests/validators/test_job_validator.py

Lines changed: 95 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1086,6 +1086,101 @@ def test_validate_yaml_content_valid(self):
10861086
}
10871087
result = validate_yaml_content(mock_data)
10881088
self.assertTrue(result)
1089+
1090+
def test_validate_yaml_content_preferred_instance_type_label(self):
1091+
expected_label_selector = {
1092+
"preferred": {
1093+
"node.kubernetes.io/instance-type": [
1094+
"ml.g5.xlarge"
1095+
]
1096+
}
1097+
}
1098+
1099+
# Respect user provided label_selector
1100+
mock_data = {
1101+
"cluster": {
1102+
"cluster_type": "k8s",
1103+
"instance_type": "ml.g5.xlarge",
1104+
"cluster_config": {
1105+
"scheduler": "SageMaker",
1106+
"label_selector": {
1107+
"preferred": {
1108+
"node.kubernetes.io/instance-type": [
1109+
"ml.g5.xlarge"
1110+
]
1111+
}
1112+
}
1113+
},
1114+
}
1115+
}
1116+
1117+
result = validate_yaml_content(mock_data)
1118+
self.assertTrue(result)
1119+
self.assertEqual(
1120+
mock_data["cluster"]["cluster_config"]["label_selector"], expected_label_selector
1121+
)
1122+
1123+
def test_validate_yaml_content_required_instance_type_label(self):
1124+
expected_label_selector = {
1125+
"required": {
1126+
"node.kubernetes.io/instance-type": [
1127+
"ml.g5.xlarge"
1128+
]
1129+
}
1130+
}
1131+
1132+
# User does not provide label_selector
1133+
mock_data = {
1134+
"cluster": {
1135+
"cluster_type": "k8s",
1136+
"instance_type": "ml.g5.xlarge",
1137+
"cluster_config": {
1138+
"scheduler": "SageMaker"
1139+
},
1140+
}
1141+
}
1142+
1143+
result = validate_yaml_content(mock_data)
1144+
self.assertTrue(result)
1145+
self.assertEqual(
1146+
mock_data["cluster"]["cluster_config"]["label_selector"], expected_label_selector
1147+
)
1148+
1149+
expected_label_selector = {
1150+
"required": {
1151+
"sagemaker.amazonaws.com/node-health-status": [
1152+
"Schedulable"
1153+
],
1154+
"node.kubernetes.io/instance-type": [
1155+
"ml.g5.xlarge"
1156+
]
1157+
}
1158+
}
1159+
1160+
# User provides label_selector without instance_type
1161+
mock_data = {
1162+
"cluster": {
1163+
"cluster_type": "k8s",
1164+
"instance_type": "ml.g5.xlarge",
1165+
"cluster_config": {
1166+
"scheduler": "SageMaker",
1167+
"label_selector": {
1168+
"required": {
1169+
"sagemaker.amazonaws.com/node-health-status": [
1170+
"Schedulable"
1171+
]
1172+
}
1173+
}
1174+
},
1175+
}
1176+
}
1177+
1178+
result = validate_yaml_content(mock_data)
1179+
self.assertTrue(result)
1180+
self.assertEqual(
1181+
mock_data["cluster"]["cluster_config"]["label_selector"], expected_label_selector
1182+
)
1183+
10891184

10901185
def test_validate_yaml_content_error_no_cluster(
10911186
self,

0 commit comments

Comments (0)