@@ -1362,6 +1362,160 @@ Enabling sshGitConfig injects the envvars, volumes, and volumeMounts:
13621362 - emptyDir :
13631363 medium : Memory
13641364 name : dshm
1365+ Harmless environment variables can be set when topologyFileConfigMap is provided :
1366+ 1 : |
1367+ apiVersion : workload .codeflare .dev / v1beta2
1368+ kind : AppWrapper
1369+ metadata :
1370+ annotations :
1371+ workload .codeflare .dev .mlbatch / pytorchGeneratorVersion : 1.1 .9
1372+ labels :
1373+ kueue .x - k8s .io / queue - name : default - queue
1374+ name : my - job
1375+ namespace : my - namespace
1376+ spec :
1377+ components :
1378+ - template :
1379+ apiVersion : kubeflow .org / v1
1380+ kind : PyTorchJob
1381+ metadata :
1382+ name : my - job
1383+ spec :
1384+ pytorchReplicaSpecs :
1385+ Master :
1386+ replicas : 1
1387+ restartPolicy : Never
1388+ template :
1389+ spec :
1390+ affinity :
1391+ nodeAffinity :
1392+ requiredDuringSchedulingIgnoredDuringExecution :
1393+ nodeSelectorTerms :
1394+ - matchExpressions :
1395+ - key : autopilot .ibm .com / gpuhealth
1396+ operator : NotIn
1397+ values :
1398+ - ERR
1399+ - TESTING
1400+ - EVICT
1401+ containers :
1402+ - command :
1403+ - sh
1404+ - - c
1405+ - |
1406+ echo " Environment variables set by the kubeflow training operator:"
1407+ echo $ {MASTER_ADDR }:$ {MASTER_PORT }
1408+ echo " PYTHONUNBUFFERED:" $ {PYTHONUNBUFFERED }
1409+ echo My global rank is $ {RANK } / $ {WORLD_SIZE }
1410+ echo " Other injected environment variables:"
1411+ echo " NVME_MOUNT_PATH: " $ {NVME_MOUNT_PATH }
1412+ #
1413+ # User commands
1414+ #
1415+ git clone https :// github.com/dbarnett/python-helloworld
1416+ cd python - helloworld
1417+ echo executing : torchrun -- nnodes = $ {WORLD_SIZE } -- node_rank = $ {RANK } -- nproc_per_node = 8 -- rdzv_id = 101 -- rdzv_endpoint = " ${MASTER_ADDR}:${MASTER_PORT}" helloworld .py
1418+ torchrun -- nnodes = $ {WORLD_SIZE } -- node_rank = $ {RANK } -- nproc_per_node = 8 -- rdzv_id = 101 -- rdzv_endpoint = " ${MASTER_ADDR}:${MASTER_PORT}" helloworld .py
1419+ env :
1420+ - name : NCCL_TOPO_FILE
1421+ value : / var /run/nvidia-topologyd/virtualTopology.xml
1422+ - name : EXAMPLE_VAR1
1423+ value : " 42"
1424+ image : ghcr .io / foundation - model - stack / base :pytorch - latest - nightly - 20230126
1425+ imagePullPolicy : IfNotPresent
1426+ name : pytorch
1427+ resources :
1428+ limits :
1429+ cpu : 500m
1430+ memory : 1Gi
1431+ nvidia .com / gpu : 8
1432+ nvidia .com / roce_gdr : 0
1433+ requests :
1434+ cpu : 500m
1435+ memory : 1Gi
1436+ nvidia .com / gpu : 8
1437+ nvidia .com / roce_gdr : 0
1438+ volumeMounts :
1439+ - mountPath : / var /run/nvidia-topologyd
1440+ name : topology - volume
1441+ - mountPath : / dev / shm
1442+ name : dshm
1443+ imagePullSecrets : []
1444+ priorityClassName : default - priority
1445+ volumes :
1446+ - configMap :
1447+ name : nvidia - topo - gdr
1448+ name : topology - volume
1449+ - emptyDir :
1450+ medium : Memory
1451+ name : dshm
1452+ Worker :
1453+ replicas : 3
1454+ restartPolicy : Never
1455+ template :
1456+ spec :
1457+ affinity :
1458+ nodeAffinity :
1459+ requiredDuringSchedulingIgnoredDuringExecution :
1460+ nodeSelectorTerms :
1461+ - matchExpressions :
1462+ - key : autopilot .ibm .com / gpuhealth
1463+ operator : NotIn
1464+ values :
1465+ - ERR
1466+ - TESTING
1467+ - EVICT
1468+ containers :
1469+ - command :
1470+ - sh
1471+ - - c
1472+ - |
1473+ echo " Environment variables set by the kubeflow training operator:"
1474+ echo $ {MASTER_ADDR }:$ {MASTER_PORT }
1475+ echo " PYTHONUNBUFFERED:" $ {PYTHONUNBUFFERED }
1476+ echo My global rank is $ {RANK } / $ {WORLD_SIZE }
1477+ echo " Other injected environment variables:"
1478+ echo " NVME_MOUNT_PATH: " $ {NVME_MOUNT_PATH }
1479+ #
1480+ # User commands
1481+ #
1482+ git clone https :// github.com/dbarnett/python-helloworld
1483+ cd python - helloworld
1484+ echo executing : torchrun -- nnodes = $ {WORLD_SIZE } -- node_rank = $ {RANK } -- nproc_per_node = 8 -- rdzv_id = 101 -- rdzv_endpoint = " ${MASTER_ADDR}:${MASTER_PORT}" helloworld .py
1485+ torchrun -- nnodes = $ {WORLD_SIZE } -- node_rank = $ {RANK } -- nproc_per_node = 8 -- rdzv_id = 101 -- rdzv_endpoint = " ${MASTER_ADDR}:${MASTER_PORT}" helloworld .py
1486+ env :
1487+ - name : NCCL_TOPO_FILE
1488+ value : / var /run/nvidia-topologyd/virtualTopology.xml
1489+ - name : EXAMPLE_VAR1
1490+ value : " 42"
1491+ image : ghcr .io / foundation - model - stack / base :pytorch - latest - nightly - 20230126
1492+ imagePullPolicy : IfNotPresent
1493+ name : pytorch
1494+ resources :
1495+ limits :
1496+ cpu : 500m
1497+ memory : 1Gi
1498+ nvidia .com / gpu : 8
1499+ nvidia .com / roce_gdr : 0
1500+ requests :
1501+ cpu : 500m
1502+ memory : 1Gi
1503+ nvidia .com / gpu : 8
1504+ nvidia .com / roce_gdr : 0
1505+ volumeMounts :
1506+ - mountPath : / var /run/nvidia-topologyd
1507+ name : topology - volume
1508+ - mountPath : / dev / shm
1509+ name : dshm
1510+ imagePullSecrets : []
1511+ priorityClassName : default - priority
1512+ volumes :
1513+ - configMap :
1514+ name : nvidia - topo - gdr
1515+ name : topology - volume
1516+ - emptyDir :
1517+ medium : Memory
1518+ name : dshm
13651519scheduler can be set :
13661520 1 : |
13671521 apiVersion : workload .codeflare .dev / v1beta2
0 commit comments