Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions cli/jobs/nebulaml/PyTorch_CNN_MNIST/job.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,15 @@ resources:
# https://learn.microsoft.com/en-us/azure/machine-learning/concept-environments
#
# for the latest ACPT image using PyTorch 2.2, CUDA 12.1 on Python 3.10
# environment: azureml:AzureML-acpt-pytorch-2.2-cuda12.1@latest
environment: azureml:AzureML-acpt-pytorch-2.2-cuda12.1@latest
# environment: azureml:AzureML-acpt-pytorch-2.8-cuda12.6@latest
environment: azureml:AzureML-acpt-pytorch-2.8-cuda12.6@latest
# for a specific version
#environment: azureml:AzureML-acpt-pytorch-2.2-cuda12.1:3
#environment: azureml:AzureML-acpt-pytorch-2.8-cuda12.6:1
# for the latest version of a custom environment: (====install Nebula in dockerfile environment=====)
#environment: azureml:ACPT-Extended@latest
# for a specific Docker image (has to be compatible with Azure ML):
#environment:
# image: mcr.microsoft.com/azureml/curated/acpt-pytorch-2.2-cuda12.1:2
# image: mcr.microsoft.com/azureml/curated/acpt-pytorch-2.8-cuda12.6:1

# command that should be run by the job (====install Nebula with command OR use dockerfile environment=====)
command: >-
Expand Down
10 changes: 1 addition & 9 deletions cli/jobs/nebulaml/PyTorch_CNN_MNIST/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR

import nebulaml as nm


class Net(nn.Module):
def __init__(self):
Expand Down Expand Up @@ -169,9 +167,6 @@ def main():
else:
device = torch.device("cpu")

# Nebula Initialization
nm.init(persistent_storage_path="/tmp/test", persistent_time_interval=2)

train_kwargs = {"batch_size": args.batch_size}
test_kwargs = {"batch_size": args.test_batch_size}
if use_cuda:
Expand All @@ -197,10 +192,7 @@ def main():
scheduler.step()

if args.save_model:
# torch.save(model.state_dict(), "mnist_cnn.pt")
# Nebula Save
ckpt = nm.Checkpoint(f"global_step{epoch}", 1)
ckpt.save("mnist_cnn.pt", model.state_dict())
torch.save(model.state_dict(), "mnist_cnn.pt")


if __name__ == "__main__":
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ name: py_torch_hello_world
display_name: PyTorch-hello-world
version: 1

environment: azureml://registries/azureml/environments/acpt-pytorch-2.2-cuda12.1/labels/latest
environment: azureml://registries/azureml/environments/acpt-pytorch-2.8-cuda12.6/labels/latest
command: >-
echo "
RANK: $RANK \n
Expand Down
4 changes: 2 additions & 2 deletions cli/jobs/pipelines/cifar-10/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ jobs:
model_dir:
type: uri_folder
mode: upload
environment: azureml://registries/azureml/environments/acpt-pytorch-2.2-cuda12.1/labels/latest
environment: azureml://registries/azureml/environments/acpt-pytorch-2.8-cuda12.6/labels/latest
compute: azureml:gpu-cluster
distribution:
type: pytorch
Expand All @@ -47,7 +47,7 @@ jobs:
--data-dir ${{inputs.cifar}}
--model-dir ${{inputs.model_dir}}/model
code: src/eval-model
environment: azureml://registries/azureml/environments/acpt-pytorch-2.2-cuda12.1/labels/latest
environment: azureml://registries/azureml/environments/acpt-pytorch-2.8-cuda12.6/labels/latest
compute: azureml:gpu-cluster
distribution:
type: pytorch
Expand Down
2 changes: 1 addition & 1 deletion cli/jobs/single-step/pytorch/cifar-distributed/job.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ inputs:
cifar:
type: uri_folder
path: azureml:cifar-10-example@latest
environment: azureml:AzureML-acpt-pytorch-2.2-cuda12.1@latest
environment: azureml:AzureML-acpt-pytorch-2.8-cuda12.6@latest
compute: azureml:gpu-cluster
distribution:
type: pytorch
Expand Down
2 changes: 1 addition & 1 deletion cli/jobs/single-step/pytorch/word-language-model/job.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ inputs:
corpus:
path: data
mode: download
environment: azureml:AzureML-acpt-pytorch-2.2-cuda12.1@latest
environment: azureml:AzureML-acpt-pytorch-2.8-cuda12.6@latest
compute: azureml:gpu-cluster
display_name: pytorch-word-language-model-example
experiment_name: pytorch-word-language-model-example
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,4 @@ distribution:
process_count_per_instance: 1
resources:
instance_count: 2
environment: azureml://registries/azureml/environments/acpt-pytorch-2.2-cuda12.1/labels/latest
environment: azureml://registries/azureml/environments/acpt-pytorch-2.8-cuda12.6/labels/latest
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,4 @@ distribution:
process_count_per_instance: 1
resources:
instance_count: 2
environment: azureml://registries/azureml/environments/acpt-pytorch-2.2-cuda12.1/labels/latest
environment: azureml://registries/azureml/environments/acpt-pytorch-2.8-cuda12.6/labels/latest
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,7 @@
" code=\"./src\", # local path where the code is stored\n",
" command=\"python train.py --data-dir ${{inputs.cifar}} --epochs ${{inputs.epoch}} --batch-size ${{inputs.batchsize}} --workers ${{inputs.workers}} --learning-rate ${{inputs.lr}} --momentum ${{inputs.momen}} --print-freq ${{inputs.prtfreq}} --model-dir ${{inputs.output}}\",\n",
" inputs=inputs,\n",
" environment=\"azureml:AzureML-acpt-pytorch-2.2-cuda12.1@latest\",\n",
" environment=\"azureml:AzureML-acpt-pytorch-2.8-cuda12.6@latest\",\n",
" instance_count=2, # In this, only 2 node cluster was created.\n",
" distribution={\n",
" \"type\": \"PyTorch\",\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@
},
"outputs": [],
"source": [
"curated_env_name = \"AzureML-acpt-pytorch-2.2-cuda12.1@latest\""
"curated_env_name = \"AzureML-acpt-pytorch-2.8-cuda12.6@latest\""
]
},
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -412,7 +412,7 @@
" # \"register_model_as\": \"places_dev\",\n",
" \"enable_profiling\": False,\n",
" },\n",
" environment=\"AzureML-acpt-pytorch-2.2-cuda12.1@latest\",\n",
" environment=\"AzureML-acpt-pytorch-2.8-cuda12.6@latest\",\n",
" compute=\"gpu-cluster\"\n",
" if (gpu_cluster)\n",
" else None, # No compute needs to be passed to use serverless\n",
Expand Down
Loading