@@ -257,7 +257,7 @@ def _sanitize_k8s_name(name: str, max_length: int = 63) -> tuple[str, bool]:
257257 if not name :
258258 name = "job"
259259
260- was_modified = name != original . lower () or original != original . lower ()
260+ was_modified = name != original
261261 return name , was_modified
262262
263263
@@ -537,9 +537,10 @@ def _run_nemo_run(
537537 LOG .info (f"Job '{ job_name } ' depends on task handle '{ dep } ' (from reused experiment)" )
538538 elif isinstance (dep , dict ):
539539 # Dict dependency = internal job reference (by job spec object)
540- dep_name = dep .get ("name" )
541- if not dep_name :
542- raise ValueError (f"Job dependency must have a 'name' field: { dep } " )
540+ try :
541+ dep_name = dep ["name" ]
542+ except KeyError as exc :
543+ raise ValueError (f"Job dependency must have a 'name' field: { dep } " ) from exc
543544 if dep_name in job_name_to_handle :
544545 internal_deps .append (job_name_to_handle [dep_name ])
545546 LOG .info (
@@ -695,9 +696,10 @@ def _run_kubernetes(self, dry_run: bool = False, log_dir: Optional[str] = None,
695696 LOG .warning (f"External dependency '{ dep } ' not supported on Kubernetes, skipping" )
696697 elif isinstance (dep , dict ):
697698 # Dict dependency = internal job reference (same as _run_nemo_run)
698- dep_name = dep .get ("name" )
699- if not dep_name :
700- raise ValueError (f"Job dependency must have a 'name' field: { dep } " )
699+ try :
700+ dep_name = dep ["name" ]
701+ except KeyError as exc :
702+ raise ValueError (f"Job dependency must have a 'name' field: { dep } " ) from exc
701703 if dep_name in job_name_to_handle :
702704 dependency_handles .append (job_name_to_handle [dep_name ])
703705 LOG .info (f"Job '{ original_job_name } ' depends on internal job '{ dep_name } '" )
@@ -744,8 +746,8 @@ def _run_kubernetes(self, dry_run: bool = False, log_dir: Optional[str] = None,
744746 LOG .info (f"Waiting for job '{ job_name } ' to complete (sequential mode)..." )
745747 status = backend .wait_for_completion (handle )
746748 LOG .info (f"Job '{ job_name } ' completed with status: { status .value } " )
747- if status == JobStatus .FAILED :
748- raise RuntimeError (f"Job '{ job_name } ' failed , aborting pipeline" )
749+ if status != JobStatus .SUCCEEDED :
750+ raise RuntimeError (f"Job '{ job_name } ' did not succeed (status= { status . value } ) , aborting pipeline" )
749751
750752 if dry_run :
751753 LOG .info ("Dry run complete. No jobs were submitted." )
@@ -808,15 +810,12 @@ def _convert_groups_to_job_spec(
808810 # Prepare the command (evaluates lazy commands)
809811 script , exec_config = self ._prepare_command (command , self .cluster_config )
810812
811- # Get the command string
812- if callable (script .inline ):
813- cmd_result = script .inline ()
814- if isinstance (cmd_result , tuple ):
815- cmd_str , _ = cmd_result
816- else :
817- cmd_str = cmd_result
818- else :
819- cmd_str = script .inline
813+ # _prepare_command() resolves lazy callables; inline is expected to be a string now.
814+ cmd_str = script .inline
815+ if not isinstance (cmd_str , str ):
816+ raise TypeError (
817+ f"Command '{ command .name } ' must resolve to a string inline command, got { type (cmd_str ).__name__ } "
818+ )
820819
821820 # Resolve container image
822821 container_image = self ._resolve_container (exec_config , command , self .cluster_config )
@@ -843,7 +842,15 @@ def _convert_groups_to_job_spec(
843842 # Get ports from script if available
844843 ports = []
845844 if hasattr (script , "port" ):
846- ports = [script .port ]
845+ script_port = script .port
846+ if isinstance (script_port , int ) and 1 <= script_port <= 65535 :
847+ ports = [script_port ]
848+ elif script_port is not None :
849+ LOG .warning (
850+ "Ignoring invalid port value %r on command '%s'; expected int in [1, 65535]" ,
851+ script_port ,
852+ command .name ,
853+ )
847854
848855 # Create container spec
849856 container = ContainerSpec (
@@ -926,7 +933,11 @@ def _print_dry_run_job(self, job_name: str, spec: JobSpec):
926933 LOG .info (f" - { container .name } " )
927934 LOG .info (f" Image: { container .image } " )
928935 LOG .info (f" GPUs: { container .resources .gpus } " )
929- LOG .info (f" Command: { ' ' .join (container .command [:50 ])} ..." )
936+ command_text = " " .join (container .command )
937+ max_chars = 200
938+ if len (command_text ) > max_chars :
939+ command_text = f"{ command_text [:max_chars ]} ..."
940+ LOG .info (f" Command: { command_text } " )
930941 if spec .dependencies :
931942 LOG .info (f"Dependencies: { spec .dependencies } " )
932943 LOG .info (f"Timeout: { spec .timeout_seconds } s" )
0 commit comments