Skip to content

Commit eca509a

Browse files
fix: corrected delimiter in documentation and updated config file instead of using override-parameters
1 parent f8beddb commit eca509a

File tree

2 files changed

+21
-13
lines changed

2 files changed

+21
-13
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -148,8 +148,8 @@ hyperpod start-job --job-name <job-name> [--namespace <namespace>] [--job-kind <
148148
* `environment` (dict[string, string]) - Optional. The environment variables (key-value pairs) to set in the containers.
149149
* `node-count` (int) - Required. The number of nodes (instances) to launch the jobs on.
150150
* `instance-type` (string) - Required. The instance type to launch the job on. Note that the instance types you can use are the available instances within your SageMaker quotas for instances prefixed with `ml`.
151-
* `pre-script` (string) - Optional. Commands to run before the job starts. Multiple commands should be separated by semicolons.
152-
* `post-script` (string) - Optional. Commands to run after the job completes. Multiple commands should be separated by semicolons.
151+
* `pre-script` (list[string]) - Optional. Commands to run before the job starts. Multiple commands should be separated by commas.
152+
* `post-script` (list[string]) - Optional. Commands to run after the job completes. Multiple commands should be separated by commas.
153153
* `tasks-per-node` (int) - Optional. The number of devices to use per instance.
154154
* `label-selector` (dict[string, list[string]]) - Optional. A dictionary of labels and their values that will override the predefined node selection rules based on the SageMaker HyperPod `node-health-status` label and values. If users provide this field, the CLI will launch the job with this customized label selection.
155155
* `deep-health-check-passed-nodes-only` (bool) - Optional. If set to `true`, the job will be launched only on nodes that have the `deep-health-check-status` label with the value `passed`.

src/hyperpod_cli/commands/job.py

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -720,6 +720,23 @@ def start_job(
720720
custom_labels[KUEUE_WORKLOAD_PRIORITY_CLASS_LABEL_KEY] = priority
721721
priority = None
722722

723+
# Handle pre_script
724+
if pre_script:
725+
_override_or_remove(
726+
config["cluster"]["cluster_config"],
727+
"pre_script",
728+
pre_script.split(',')
729+
)
730+
731+
# Handle post_script
732+
if post_script:
733+
_override_or_remove(
734+
config["cluster"]["cluster_config"],
735+
"post_script",
736+
post_script.split(',')
737+
)
738+
739+
723740
_override_or_remove(
724741
config["cluster"]["cluster_config"],
725742
"custom_labels",
@@ -806,9 +823,7 @@ def start_job(
806823
auto_resume=auto_resume,
807824
label_selector=label_selector,
808825
max_retry=max_retry,
809-
deep_health_check_passed_nodes_only=deep_health_check_passed_nodes_only,
810-
pre_script=pre_script,
811-
post_script=post_script,
826+
deep_health_check_passed_nodes_only=deep_health_check_passed_nodes_only
812827
)
813828
# TODO: Unblock this after fixing customer using EKS cluster.
814829
console_link = utils.get_cluster_console_url()
@@ -989,8 +1004,7 @@ def execute_command(cmd, env=None):
9891004
def start_training_job(recipe, override_parameters, job_name, config_file, launcher_config_path=None, launcher_config_file_name=None,
9901005
pull_policy=None, restart_policy=None, namespace=None,
9911006
service_account_name=None, priority_class_name=None, volumes=None, persistent_volume_claims=None,
992-
auto_resume=None, label_selector=None, max_retry=None, deep_health_check_passed_nodes_only=None,
993-
pre_script=None, post_script=None):
1007+
auto_resume=None, label_selector=None, max_retry=None, deep_health_check_passed_nodes_only=None):
9941008

9951009
logger.info(f"recipe: {recipe}, override_parameters: {override_parameters}, job_name: {job_name}, config_file: {config_file}, launcher_config_path: {launcher_config_path}, launcher_config_file_name: {launcher_config_file_name}")
9961010
env = os.environ.copy()
@@ -1052,12 +1066,6 @@ def start_training_job(recipe, override_parameters, job_name, config_file, launc
10521066
cmd.append(f'+cluster.persistent_volume_claims.{idx}.claimName="{claim_name}"')
10531067
cmd.append(f'+cluster.persistent_volume_claims.{idx}.mountPath="{mount_path}"')
10541068

1055-
if pre_script:
1056-
cmd.append(f'+cluster.pre_script="{pre_script}"')
1057-
1058-
if post_script:
1059-
cmd.append(f'+cluster.post_script="{post_script}"')
1060-
10611069
if label_selector:
10621070
cmd.append(f'+cluster.label_selector={label_selector}')
10631071
elif deep_health_check_passed_nodes_only:

0 commit comments

Comments
 (0)