Override env vars if they exist in custom envs coming from commands (#524)

sharabiani · web-flow · commit 414a6e115c0e · 2025-07-10T10:25:34.000+02:00
* env vars become a dictionary and values are overridden

* removed excessive arg

* imported missing modules

* added missing arg

* removed excessive imports

* fixed imports

* fixed dict merge
diff --git a/src/xpk/commands/workload.py b/src/xpk/commands/workload.py
@@ -27,16 +27,12 @@
     setup_k8s_env,
 )
 from ..core.commands import run_command_with_updates, run_commands
-from ..core.config import (
-    VERTEX_TENSORBOARD_FEATURE_FLAG,
-    XPK_CURRENT_VERSION,
-    parse_env_config,
-)
+from ..core.config import (VERTEX_TENSORBOARD_FEATURE_FLAG, XPK_CURRENT_VERSION)
 from ..core.docker_container import (
     get_main_container_docker_image,
     get_user_workload_container,
 )
-from ..core.docker_resources import get_volumes
+from ..core.docker_resources import get_volumes, parse_env_config
 from ..core.gcloud_context import add_zone_and_project
 from ..core.kueue import LOCAL_QUEUE_NAME
 from ..core.monitoring import get_gke_outlier_dashboard
@@ -353,7 +349,7 @@ def workload_create(args) -> None:
     if not tensorboard_config:
       xpk_exit(1)
 
-  parse_env_config(args, tensorboard_config, system)
+  parse_env_config(args, tensorboard_config)
 
   autoprovisioning_args = ''
   autoprovisioning_enabled, return_code = is_autoprovisioning_enabled(
diff --git a/src/xpk/core/config.py b/src/xpk/core/config.py
@@ -15,13 +15,11 @@
 """
 
 import os
-import re
 
 import ruamel.yaml
 
 from ..utils import file
 from ..utils.console import xpk_print
-from .system_characteristics import AcceleratorType, SystemCharacteristics
 
 # This is the version for XPK PyPI package
 __version__ = 'v0.8.0'
@@ -117,65 +115,3 @@ def get_all(
       return None
     val: dict[str, str] = config_yaml[CONFIGS_KEY]
     return val
-
-
-def parse_env_config(args, tensorboard_config, system: SystemCharacteristics):
-  """Parses the environment configurations to the jobset config.
-
-  Args:
-    args: user provided arguments for running the command.
-    tensorboard_config: configuration of Vertex Tensorboard.
-    system: system characteristics.
-  """
-  env = {}
-
-  env_pat = re.compile(r'(^[a-zA-Z_][a-zA-Z0-9_]*?)(?:=(.*))?$', re.M)
-  if args.env_file:
-    print('Setting container environment from', args.env_file)
-    with open(file=args.env_file, mode='r', encoding='utf-8') as f:
-      for match in env_pat.finditer(f.read()):
-        variable = match.group(1)
-        if match.group(2) is not None:
-          env[variable] = match.group(2)
-        else:
-          assert variable in os.environ, (
-              f'Variable {variable} is not set in the current '
-              'environment, a value must be specified.'
-          )
-          env[variable] = os.environ[variable]
-  if args.env:
-    for var in args.env:
-      match = env_pat.match(var)
-      assert match and match.group(2) is not None, (
-          'Invalid environment variable, format must be '
-          f'`--env VARIABLE=value`: {var}'
-      )
-      variable = match.group(1)
-      env[variable] = match.group(2)
-
-  if not args.use_pathways:
-    if args.debug_dump_gcs:
-      if 'XLA_FLAGS' in env:
-        raise ValueError(
-            'Conflict: XLA_FLAGS defined in both --debug_dump_gcs '
-            'and environment file. Please choose one way to define '
-            'XLA_FLAGS.'
-        )
-      env['XLA_FLAGS'] = '--xla_dump_to=/tmp/xla_dump/'
-
-    if tensorboard_config:
-      env['UPLOAD_DATA_TO_TENSORBOARD'] = True
-      for key, value in tensorboard_config.items():
-        env[key.upper()] = value
-
-  if system.accelerator_type == AcceleratorType['GPU']:
-    # For GPUs, it has two more spaces ahead of name and value respectively
-    env_format = '''
-                  - name: {key}
-                    value: "{value}"'''
-  else:
-    env_format = '''
-                - name: {key}
-                  value: "{value}"'''
-
-  args.env = ''.join(env_format.format(key=k, value=v) for k, v in env.items())
diff --git a/src/xpk/core/docker_resources.py b/src/xpk/core/docker_resources.py
@@ -14,6 +14,8 @@
 limitations under the License.
 """
 
+import os
+import re
 from .capacity import H100_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE
 from .cluster import setup_k8s_env
 from .storage import GCS_FUSE_TYPE, GCP_FILESTORE_TYPE, Storage, get_storages_to_mount
@@ -64,6 +66,25 @@ def get_env_container(args, system: SystemCharacteristics) -> str:
     str:
       YAML with the env config for the main container, as a YAML string.
   """
+  if system.accelerator_type == AcceleratorType['GPU']:
+    return get_gpu_env(args, system)
+
+  if system.accelerator_type == AcceleratorType['CPU']:
+    return get_cpu_env(args, system)
+
+  return format_env_dict(args.env, system)  # pytype: disable=bad-return-type
+
+
+def get_gpu_env(args, system) -> str:
+  """Generate environment variables for GPU nodepools
+  Args:
+    args: user provided arguments for running the command; args.env is
+      expected to be a dictionary of environment variables.
+    system: system characteristics
+
+  Returns:
+    str: yaml containing env variables
+  """
   gpu_env_yaml = """
                   - name: REPLICATED_JOB_NAME
                     valueFrom:
@@ -73,8 +94,6 @@ def get_env_container(args, system: SystemCharacteristics) -> str:
                     valueFrom:
                       fieldRef:
                         fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name']
-                  - name: JAX_COORDINATOR_ADDRESS
-                    value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)"
                   - name: NNODES
                     value: "{args.num_nodes}"
                   - name: NODE_RANK
@@ -84,32 +103,37 @@ def get_env_container(args, system: SystemCharacteristics) -> str:
                   - name: USE_GPUDIRECT
                     value: {gpu_direct_name}
                   - name: GPUS_PER_NODE
-                    value: "{system.chips_per_vm}"
-                  - name: JAX_COORDINATOR_PORT
-                    value: "6002"
+                    value: "{chips_per_vm}"
                   - name: COMMAND
                     value: "{args.command}"
-                  {args.env}"""
-
-  if system.accelerator_type == AcceleratorType['GPU']:
-    gpu_direct_name = 'fastrak'
-    if args.device_type == H100_DEVICE_TYPE:
-      gpu_direct_name = 'tcpx'
-    elif args.device_type == H100_MEGA_DEVICE_TYPE:
-      gpu_direct_name = 'tcpxo'
-    elif args.device_type == H200_DEVICE_TYPE:
-      gpu_direct_name = 'rdma'
-    return gpu_env_yaml.format(
-        args=args, system=system, gpu_direct_name=gpu_direct_name
-    )
-
-  if system.accelerator_type == AcceleratorType['CPU']:
-    return get_cpu_env(args.num_slices, args.env, system)
-
-  return args.env  # pytype: disable=bad-return-type
+                  {custom_envs}"""
+
+  gpu_direct_name = 'fastrak'
+  if args.device_type == H100_DEVICE_TYPE:
+    gpu_direct_name = 'tcpx'
+  elif args.device_type == H100_MEGA_DEVICE_TYPE:
+    gpu_direct_name = 'tcpxo'
+  elif args.device_type == H200_DEVICE_TYPE:
+    gpu_direct_name = 'rdma'
+
+  gpu_env_dic = {
+      'JAX_COORDINATOR_PORT': '6002',
+      'JAX_COORDINATOR_ADDRESS': (
+          '$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)'
+      ),
+  }
+
+  args.env = gpu_env_dic | args.env
+
+  return gpu_env_yaml.format(
+      args=args,
+      chips_per_vm=system.chips_per_vm,
+      gpu_direct_name=gpu_direct_name,
+      custom_envs=format_env_dict(args.env, system),
+  )
 
 
-def get_cpu_env(num_slices, env_vars, system) -> str:
+def get_cpu_env(args, system) -> str:
   """Generate environment variables for CPU nodepools
   Args:
     num_slices: Number of slices to be used in the workload.
@@ -132,19 +156,87 @@ def get_cpu_env(num_slices, env_vars, system) -> str:
                   valueFrom:
                     fieldRef:
                       fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index']
-                - name: PROCESSES_IN_JOB
-                  value: "{processes_in_job}"
-                - name: JAX_PROCESS_COUNT
-                  value: "{process_count}"
-                {env_vars}
-                - name: JAX_COORDINATOR_ADDRESS
-                  value: "$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)"
+                {custom_envs}
   """
-  return yaml.format(
-      processes_in_job=system.vms_per_slice,
-      process_count=calculate_process_count(num_slices, system.vms_per_slice),
-      env_vars=env_vars,
-  )
+
+  cpu_env_dic = {
+      'PROCESSES_IN_JOB': str(system.vms_per_slice),
+      'JAX_PROCESS_COUNT': str(
+          calculate_process_count(args.num_slices, system.vms_per_slice)
+      ),
+      'JAX_COORDINATOR_ADDRESS': (
+          '$(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)'
+      ),
+  }
+
+  args.env = cpu_env_dic | args.env
+
+  return yaml.format(custom_envs=format_env_dict(args.env, system))
+
+
+def format_env_dict(env, system: SystemCharacteristics) -> str:
+  if system.accelerator_type == AcceleratorType['GPU']:
+    # For GPUs, it has two more spaces ahead of name and value respectively
+    env_format = '''
+                  - name: {key}
+                    value: "{value}"'''
+  else:
+    env_format = '''
+                - name: {key}
+                  value: "{value}"'''
+  return ''.join(env_format.format(key=k, value=v) for k, v in env.items())
+
+
+def parse_env_config(args, tensorboard_config):
+  """Parses the environment configurations into a dictionary.
+
+  Args:
+    args: user provided arguments for running the command.
+    tensorboard_config: configuration of Vertex Tensorboard; its entries are
+      added to the env (upper-cased keys) unless args.use_pathways is set.
+  """
+  env = {}
+
+  env_pat = re.compile(r'(^[a-zA-Z_][a-zA-Z0-9_]*?)(?:=(.*))?$', re.M)
+  if args.env_file:
+    print('Setting container environment from', args.env_file)
+    with open(file=args.env_file, mode='r', encoding='utf-8') as f:
+      for match in env_pat.finditer(f.read()):
+        variable = match.group(1)
+        if match.group(2) is not None:
+          env[variable] = match.group(2)
+        else:
+          assert variable in os.environ, (
+              f'Variable {variable} is not set in the current '
+              'environment, a value must be specified.'
+          )
+          env[variable] = os.environ[variable]
+  if args.env:
+    for var in args.env:
+      match = env_pat.match(var)
+      assert match and match.group(2) is not None, (
+          'Invalid environment variable, format must be '
+          f'`--env VARIABLE=value`: {var}'
+      )
+      variable = match.group(1)
+      env[variable] = match.group(2)
+
+  if not args.use_pathways:
+    if args.debug_dump_gcs:
+      if 'XLA_FLAGS' in env:
+        raise ValueError(
+            'Conflict: XLA_FLAGS defined in both --debug_dump_gcs '
+            'and environment file. Please choose one way to define '
+            'XLA_FLAGS.'
+        )
+      env['XLA_FLAGS'] = '--xla_dump_to=/tmp/xla_dump/'
+
+    if tensorboard_config:
+      env['UPLOAD_DATA_TO_TENSORBOARD'] = True
+      for key, value in tensorboard_config.items():
+        env[key.upper()] = value
+
+  args.env = env
 
 
 def get_volumes(args, system: SystemCharacteristics) -> str:
diff --git a/src/xpk/core/kjob.py b/src/xpk/core/kjob.py
@@ -40,11 +40,8 @@
     XpkConfig,
 )
 from .network import get_cluster_subnetworks
-from .resources import (
-    AcceleratorType,
-    SystemCharacteristics,
-    get_cluster_system_characteristics,
-)
+from .system_characteristics import AcceleratorType, SystemCharacteristics
+from .resources import get_cluster_system_characteristics
 from .storage import (
     GCS_FUSE_ANNOTATIONS,
     PARALLELSTORE_ANNOTATIONS,
diff --git a/src/xpk/core/pathways.py b/src/xpk/core/pathways.py
@@ -19,8 +19,7 @@
 from ..core.gcloud_context import zone_to_region
 from ..core.nodepool import get_all_nodepools_programmatic
 from ..utils.console import xpk_exit, xpk_print
-from .config import AcceleratorType
-from .system_characteristics import SystemCharacteristics
+from .system_characteristics import AcceleratorType, SystemCharacteristics
 
 
 def add_pw_resource_flavors(args):