Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/fsdp-regression-test-container.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:
max-parallel: 3
matrix:
cluster: [p5, p5-smhp]
model_config: [llama2_7b, llama2_13b, llama2_70b]
model_config: [llama2_7b, llama2_13b, llama2_70b, llama3_1_8b, llama3_1_70b]
runs-on: [self-hosted, "${{ matrix.cluster }}"]
concurrency:
group: ${{ github.workflow }}-${{ matrix.cluster }}-${{ matrix.model_config }}
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/fsdp-regression-test-venv.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:
max-parallel: 3
matrix:
cluster: [p5, p5-smhp]
model_config: [llama2_7b, llama2_13b, llama2_70b]
model_config: [llama2_7b, llama2_13b, llama2_70b, llama3_1_8b, llama3_1_70b]
runs-on: [self-hosted, "${{ matrix.cluster }}"]
concurrency:
group: ${{ github.workflow }}-${{ matrix.cluster }}-${{ matrix.model_config }}
Expand Down
115 changes: 115 additions & 0 deletions 3.test_cases/pytorch/FSDP/kubernetes/llama3_1_70b-fsdp.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
apiVersion: "kubeflow.org/v1"
kind: PyTorchJob
metadata:
name: llama3-1-70b-fsdp
spec:
elasticPolicy:
rdzvBackend: c10d
minReplicas: 1
maxReplicas: 64
maxRestarts: 100
metrics:
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: 90
pytorchReplicaSpecs:
Worker:
replicas: $NUM_NODES
restartPolicy: OnFailure
template:
metadata:
labels:
app: llama3-1-70b-fsdp
spec:
volumes:
- name: shmem
hostPath:
path: /dev/shm
- name: local
hostPath:
path: /mnt/k8s-disks/0
#nodeSelector:
# node.kubernetes.io/instance-type: "${INSTANCE_TYPE}"
containers:
- name: pytorch
image: ${IMAGE_URI}
imagePullPolicy: Always
resources:
requests:
nvidia.com/gpu: $GPU_PER_NODE
vpc.amazonaws.com/efa: $EFA_PER_NODE
limits:
nvidia.com/gpu: $GPU_PER_NODE
vpc.amazonaws.com/efa: $EFA_PER_NODE
env:
# for P5 FI_* should be commented out
- name: LOGLEVEL
value: "DEBUG"
#- name: FI_PROVIDER
# value: $FI_PROVIDER
#- name: FI_EFA_USE_DEVICE_RDMA
# value: "1"
#- name: FI_EFA_FORK_SAFE
# value: "1"
#- name: FI_LOG_LEVEL
# value: "1"
#- name: FI_EFA_ENABLE_SHM_TRANSFER
# value: "1"
- name: TORCH_DISTRIBUTED_DEBUG
value: "DETAIL"
- name: TORCH_NCCL_ENABLE_MONITORING
value: "1"
- name: TORCH_NCCL_TRACE_BUFFER_SIZE
value: "20000"
- name: TORCH_NCCL_DUMP_ON_TIMEOUT
value: "1"
- name: TORCH_NCCL_DEBUG_INFO_TEMP_FILE
value: "/local/nccl_trace_rank_"
- name: PYTORCH_CUDA_ALLOC_CONF
value: "expandable_segments:True"
- name: NCCL_DEBUG
value: "INFO"
- name: NCCL_SOCKET_IFNAME
value: "^lo"
- name: TORCH_NCCL_ASYNC_ERROR_HANDLING
value: "1"
- name: HF_TOKEN
value: "${HF_TOKEN}"
#- name: TORCH_DIST_INIT_BARRIER
# value: "1"
#- name: NCCL_IGNORE_DISABLED_P2P
# value: "1"
#- name: NCCL_NVLS_ENABLE
# value: "0"
command:
- /usr/local/bin/torchrun
- --nproc_per_node=$GPU_PER_NODE
- --nnodes=$NUM_NODES
- /fsdp/train.py
- --max_context_width=8192
- --num_key_value_heads=8
- --intermediate_size=28672
- --hidden_width=8192
- --num_layers=80
- --num_heads=64
- --model_type=llama_v3
- --tokenizer=hf-internal-testing/llama-tokenizer
- --checkpoint_freq=15
- --validation_freq=10
- --max_steps=30
- --checkpoint_dir=./checkpoints
- --dataset=allenai/c4
- --dataset_config_name=en
- --resume_from_checkpoint=./checkpoints
- --train_batch_size=1
- --val_batch_size=1
- --sharding_strategy=full # https://pytorch.org/docs/stable/fsdp.html
- --offload_activations=1
volumeMounts:
- name: shmem
mountPath: /dev/shm
- name: local
mountPath: /local
115 changes: 115 additions & 0 deletions 3.test_cases/pytorch/FSDP/kubernetes/llama3_1_8b-fsdp.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
apiVersion: "kubeflow.org/v1"
kind: PyTorchJob
metadata:
name: llama3-1-8b-fsdp
spec:
elasticPolicy:
rdzvBackend: c10d
minReplicas: 1
maxReplicas: 64
maxRestarts: 100
metrics:
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: 90
pytorchReplicaSpecs:
Worker:
replicas: $NUM_NODES
restartPolicy: OnFailure
template:
metadata:
labels:
app: llama3-1-8b-fsdp
spec:
volumes:
- name: shmem
hostPath:
path: /dev/shm
- name: local
hostPath:
path: /mnt/k8s-disks/0
#nodeSelector:
# node.kubernetes.io/instance-type: "${INSTANCE_TYPE}"
containers:
- name: pytorch
image: ${IMAGE_URI}
imagePullPolicy: Always
resources:
requests:
nvidia.com/gpu: $GPU_PER_NODE
vpc.amazonaws.com/efa: $EFA_PER_NODE
limits:
nvidia.com/gpu: $GPU_PER_NODE
vpc.amazonaws.com/efa: $EFA_PER_NODE
env:
# for P5 FI_* should be commented out
- name: LOGLEVEL
value: "DEBUG"
#- name: FI_PROVIDER
# value: $FI_PROVIDER
#- name: FI_EFA_USE_DEVICE_RDMA
# value: "1"
#- name: FI_EFA_FORK_SAFE
# value: "1"
#- name: FI_LOG_LEVEL
# value: "1"
#- name: FI_EFA_ENABLE_SHM_TRANSFER
# value: "1"
- name: TORCH_DISTRIBUTED_DEBUG
value: "DETAIL"
- name: TORCH_NCCL_ENABLE_MONITORING
value: "1"
- name: TORCH_NCCL_TRACE_BUFFER_SIZE
value: "20000"
- name: TORCH_NCCL_DUMP_ON_TIMEOUT
value: "1"
- name: TORCH_NCCL_DEBUG_INFO_TEMP_FILE
value: "/local/nccl_trace_rank_"
- name: PYTORCH_CUDA_ALLOC_CONF
value: "expandable_segments:True"
- name: NCCL_DEBUG
value: "INFO"
- name: NCCL_SOCKET_IFNAME
value: "^lo"
- name: TORCH_NCCL_ASYNC_ERROR_HANDLING
value: "1"
- name: HF_TOKEN
value: "${HF_TOKEN}"
#- name: TORCH_DIST_INIT_BARRIER
# value: "1"
#- name: NCCL_IGNORE_DISABLED_P2P
# value: "1"
#- name: NCCL_NVLS_ENABLE
# value: "0"
command:
- /usr/local/bin/torchrun
- --nproc_per_node=$GPU_PER_NODE
- --nnodes=$NUM_NODES
- /fsdp/train.py
- --max_context_width=8192
- --num_key_value_heads=8
- --intermediate_size=14336
- --hidden_width=4096
- --num_layers=32
- --num_heads=32
- --model_type=llama_v3
- --tokenizer=hf-internal-testing/llama-tokenizer
- --checkpoint_freq=50
- --validation_freq=25
- --max_steps=100
- --checkpoint_dir=./checkpoints
- --dataset=allenai/c4
- --dataset_config_name=en
- --resume_from_checkpoint=./checkpoints
- --train_batch_size=1
- --val_batch_size=1
- --sharding_strategy=full # https://pytorch.org/docs/stable/fsdp.html
- --offload_activations=1
volumeMounts:
- name: shmem
mountPath: /dev/shm
- name: local
mountPath: /local
115 changes: 115 additions & 0 deletions 3.test_cases/pytorch/FSDP/kubernetes/llama3_2_1b-fsdp.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
apiVersion: "kubeflow.org/v1"
kind: PyTorchJob
metadata:
name: llama3-2-1b-fsdp
spec:
elasticPolicy:
rdzvBackend: c10d
minReplicas: 1
maxReplicas: 64
maxRestarts: 100
metrics:
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: 90
pytorchReplicaSpecs:
Worker:
replicas: $NUM_NODES
restartPolicy: OnFailure
template:
metadata:
labels:
app: llama3-2-1b-fsdp
spec:
volumes:
- name: shmem
hostPath:
path: /dev/shm
- name: local
hostPath:
path: /mnt/k8s-disks/0
#nodeSelector:
# node.kubernetes.io/instance-type: "${INSTANCE_TYPE}"
containers:
- name: pytorch
image: ${IMAGE_URI}
imagePullPolicy: Always
resources:
requests:
nvidia.com/gpu: $GPU_PER_NODE
vpc.amazonaws.com/efa: $EFA_PER_NODE
limits:
nvidia.com/gpu: $GPU_PER_NODE
vpc.amazonaws.com/efa: $EFA_PER_NODE
env:
# for P5 FI_* should be commented out
- name: LOGLEVEL
value: "DEBUG"
#- name: FI_PROVIDER
# value: $FI_PROVIDER
#- name: FI_EFA_USE_DEVICE_RDMA
# value: "1"
#- name: FI_EFA_FORK_SAFE
# value: "1"
#- name: FI_LOG_LEVEL
# value: "1"
#- name: FI_EFA_ENABLE_SHM_TRANSFER
# value: "1"
- name: TORCH_DISTRIBUTED_DEBUG
value: "DETAIL"
- name: TORCH_NCCL_ENABLE_MONITORING
value: "1"
- name: TORCH_NCCL_TRACE_BUFFER_SIZE
value: "20000"
- name: TORCH_NCCL_DUMP_ON_TIMEOUT
value: "1"
- name: TORCH_NCCL_DEBUG_INFO_TEMP_FILE
value: "/local/nccl_trace_rank_"
- name: PYTORCH_CUDA_ALLOC_CONF
value: "expandable_segments:True"
- name: NCCL_DEBUG
value: "INFO"
- name: NCCL_SOCKET_IFNAME
value: "^lo"
- name: TORCH_NCCL_ASYNC_ERROR_HANDLING
value: "1"
- name: HF_TOKEN
value: "${HF_TOKEN}"
#- name: TORCH_DIST_INIT_BARRIER
# value: "1"
#- name: NCCL_IGNORE_DISABLED_P2P
# value: "1"
#- name: NCCL_NVLS_ENABLE
# value: "0"
command:
- /usr/local/bin/torchrun
- --nproc_per_node=$GPU_PER_NODE
- --nnodes=$NUM_NODES
- /fsdp/train.py
- --max_context_width=8192
- --num_key_value_heads=2
- --intermediate_size=8192
- --hidden_width=2048
- --num_layers=16
- --num_heads=32
- --model_type=llama_v3
- --tokenizer=hf-internal-testing/llama-tokenizer
- --checkpoint_freq=50
- --validation_freq=100
- --max_steps=100
- --checkpoint_dir=./checkpoints
- --dataset=allenai/c4
- --dataset_config_name=en
- --resume_from_checkpoint=./checkpoints
- --train_batch_size=1
- --val_batch_size=1
- --sharding_strategy=full # https://pytorch.org/docs/stable/fsdp.html
- --offload_activations=1
volumeMounts:
- name: shmem
mountPath: /dev/shm
- name: local
mountPath: /local
Loading