diff --git a/cli/jobs/nebulaml/PyTorch_CNN_MNIST/job.yml b/cli/jobs/nebulaml/PyTorch_CNN_MNIST/job.yml index dfc2e4a831..f836c66aec 100644 --- a/cli/jobs/nebulaml/PyTorch_CNN_MNIST/job.yml +++ b/cli/jobs/nebulaml/PyTorch_CNN_MNIST/job.yml @@ -22,15 +22,15 @@ resources: # https://learn.microsoft.com/en-us/azure/machine-learning/concept-environments # # for the latest ACPT image using PyTorch 2.2, CUDA 12.1 on Python 3.10 -# environment: azureml:AzureML-acpt-pytorch-2.2-cuda12.1@latest -environment: azureml:AzureML-acpt-pytorch-2.2-cuda12.1@latest +# environment: azureml:AzureML-acpt-pytorch-2.8-cuda12.6@latest +environment: azureml:AzureML-acpt-pytorch-2.8-cuda12.6@latest # for a specific version -#environment: azureml:AzureML-acpt-pytorch-2.2-cuda12.1:3 +#environment: azureml:AzureML-acpt-pytorch-2.8-cuda12.6:1 # for the latest version of a custom environment: (====install Nebula in dockerfile environment=====) #environment: azureml:ACPT-Extended@latest # for a specific Docker image (has to be compatible with Azure ML): #environment: -# image: mcr.microsoft.com/azureml/curated/acpt-pytorch-2.2-cuda12.1:2 +# image: mcr.microsoft.com/azureml/curated/acpt-pytorch-2.8-cuda12.6:1 # command that should be run by the job (====install Nebula with comand OR use dockerfile environment=====) command: >- diff --git a/cli/jobs/nebulaml/PyTorch_CNN_MNIST/train.py b/cli/jobs/nebulaml/PyTorch_CNN_MNIST/train.py index d29d4e60c1..e537b5e318 100644 --- a/cli/jobs/nebulaml/PyTorch_CNN_MNIST/train.py +++ b/cli/jobs/nebulaml/PyTorch_CNN_MNIST/train.py @@ -7,8 +7,6 @@ from torchvision import datasets, transforms from torch.optim.lr_scheduler import StepLR -import nebulaml as nm - class Net(nn.Module): def __init__(self): @@ -169,9 +167,6 @@ def main(): else: device = torch.device("cpu") - # Nebula Initialization - nm.init(persistent_storage_path="/tmp/test", persistent_time_interval=2) - train_kwargs = {"batch_size": args.batch_size} test_kwargs = {"batch_size": args.test_batch_size} if use_cuda: @@ -197,10 +192,7 @@ def main(): scheduler.step() if args.save_model: - # torch.save(model.state_dict(), "mnist_cnn.pt") - # Nebula Save - ckpt = nm.Checkpoint(f"global_step{epoch}", 1) - ckpt.save("mnist_cnn.pt", model.state_dict()) + torch.save(model.state_dict(), "mnist_cnn.pt") if __name__ == "__main__": diff --git a/cli/jobs/pipelines-with-components/basics/6b_pytorch_hello_world/component.yml b/cli/jobs/pipelines-with-components/basics/6b_pytorch_hello_world/component.yml index 7e1fe821d5..164629c606 100644 --- a/cli/jobs/pipelines-with-components/basics/6b_pytorch_hello_world/component.yml +++ b/cli/jobs/pipelines-with-components/basics/6b_pytorch_hello_world/component.yml @@ -7,7 +7,7 @@ name: py_torch_hello_world display_name: PyTorch-hello-world version: 1 -environment: azureml://registries/azureml/environments/acpt-pytorch-2.2-cuda12.1/labels/latest +environment: azureml://registries/azureml/environments/acpt-pytorch-2.8-cuda12.6/labels/latest command: >- echo " RANK: $RANK \n diff --git a/cli/jobs/pipelines/cifar-10/pipeline.yml b/cli/jobs/pipelines/cifar-10/pipeline.yml index 60258b47fa..b3e6eaeeed 100644 --- a/cli/jobs/pipelines/cifar-10/pipeline.yml +++ b/cli/jobs/pipelines/cifar-10/pipeline.yml @@ -33,7 +33,7 @@ jobs: model_dir: type: uri_folder mode: upload - environment: azureml://registries/azureml/environments/acpt-pytorch-2.2-cuda12.1/labels/latest + environment: azureml://registries/azureml/environments/acpt-pytorch-2.8-cuda12.6/labels/latest compute: azureml:gpu-cluster distribution: type: pytorch @@ -47,7 +47,7 @@ jobs: --data-dir ${{inputs.cifar}} --model-dir ${{inputs.model_dir}}/model code: src/eval-model - environment: azureml://registries/azureml/environments/acpt-pytorch-2.2-cuda12.1/labels/latest + environment: azureml://registries/azureml/environments/acpt-pytorch-2.8-cuda12.6/labels/latest compute: azureml:gpu-cluster distribution: type: pytorch diff --git a/cli/jobs/single-step/pytorch/cifar-distributed/job.yml b/cli/jobs/single-step/pytorch/cifar-distributed/job.yml index 228268f906..f474140048 100644 --- a/cli/jobs/single-step/pytorch/cifar-distributed/job.yml +++ b/cli/jobs/single-step/pytorch/cifar-distributed/job.yml @@ -11,7 +11,7 @@ inputs: cifar: type: uri_folder path: azureml:cifar-10-example@latest -environment: azureml:AzureML-acpt-pytorch-2.2-cuda12.1@latest +environment: azureml:AzureML-acpt-pytorch-2.8-cuda12.6@latest compute: azureml:gpu-cluster distribution: type: pytorch diff --git a/cli/jobs/single-step/pytorch/word-language-model/job.yml b/cli/jobs/single-step/pytorch/word-language-model/job.yml index 257fb293a7..5e14bab0bb 100644 --- a/cli/jobs/single-step/pytorch/word-language-model/job.yml +++ b/cli/jobs/single-step/pytorch/word-language-model/job.yml @@ -16,7 +16,7 @@ inputs: corpus: path: data mode: download -environment: azureml:AzureML-acpt-pytorch-2.2-cuda12.1@latest +environment: azureml:AzureML-acpt-pytorch-2.8-cuda12.6@latest compute: azureml:gpu-cluster display_name: pytorch-word-language-model-example experiment_name: pytorch-word-language-model-example diff --git a/sdk/python/jobs/pipelines/2b_train_cifar_10_with_pytorch/eval-model.yml b/sdk/python/jobs/pipelines/2b_train_cifar_10_with_pytorch/eval-model.yml index 303bc3facf..46fab4b717 100644 --- a/sdk/python/jobs/pipelines/2b_train_cifar_10_with_pytorch/eval-model.yml +++ b/sdk/python/jobs/pipelines/2b_train_cifar_10_with_pytorch/eval-model.yml @@ -17,4 +17,4 @@ distribution: process_count_per_instance: 1 resources: instance_count: 2 -environment: azureml://registries/azureml/environments/acpt-pytorch-2.2-cuda12.1/labels/latest +environment: azureml://registries/azureml/environments/acpt-pytorch-2.8-cuda12.6/labels/latest diff --git a/sdk/python/jobs/pipelines/2b_train_cifar_10_with_pytorch/train-model.yml b/sdk/python/jobs/pipelines/2b_train_cifar_10_with_pytorch/train-model.yml index cc71833cc9..eba0ba918b 100644 --- a/sdk/python/jobs/pipelines/2b_train_cifar_10_with_pytorch/train-model.yml +++ b/sdk/python/jobs/pipelines/2b_train_cifar_10_with_pytorch/train-model.yml @@ -20,4 +20,4 @@ distribution: process_count_per_instance: 1 resources: instance_count: 2 -environment: azureml://registries/azureml/environments/acpt-pytorch-2.2-cuda12.1/labels/latest +environment: azureml://registries/azureml/environments/acpt-pytorch-2.8-cuda12.6/labels/latest diff --git a/sdk/python/jobs/single-step/pytorch/distributed-training/distributed-cifar10.ipynb b/sdk/python/jobs/single-step/pytorch/distributed-training/distributed-cifar10.ipynb index 4c5a392391..3bbee46c7d 100644 --- a/sdk/python/jobs/single-step/pytorch/distributed-training/distributed-cifar10.ipynb +++ b/sdk/python/jobs/single-step/pytorch/distributed-training/distributed-cifar10.ipynb @@ -354,7 +354,7 @@ " code=\"./src\", # local path where the code is stored\n", " command=\"python train.py --data-dir ${{inputs.cifar}} --epochs ${{inputs.epoch}} --batch-size ${{inputs.batchsize}} --workers ${{inputs.workers}} --learning-rate ${{inputs.lr}} --momentum ${{inputs.momen}} --print-freq ${{inputs.prtfreq}} --model-dir ${{inputs.output}}\",\n", " inputs=inputs,\n", - " environment=\"azureml:AzureML-acpt-pytorch-2.2-cuda12.1@latest\",\n", + " environment=\"azureml:AzureML-acpt-pytorch-2.8-cuda12.6@latest\",\n", " instance_count=2, # In this, only 2 node cluster was created.\n", " distribution={\n", " \"type\": \"PyTorch\",\n", diff --git a/sdk/python/jobs/single-step/pytorch/train-hyperparameter-tune-deploy-with-pytorch/train-hyperparameter-tune-deploy-with-pytorch.ipynb b/sdk/python/jobs/single-step/pytorch/train-hyperparameter-tune-deploy-with-pytorch/train-hyperparameter-tune-deploy-with-pytorch.ipynb index 06aece0e1f..c59d13ce6e 100644 --- a/sdk/python/jobs/single-step/pytorch/train-hyperparameter-tune-deploy-with-pytorch/train-hyperparameter-tune-deploy-with-pytorch.ipynb +++ b/sdk/python/jobs/single-step/pytorch/train-hyperparameter-tune-deploy-with-pytorch/train-hyperparameter-tune-deploy-with-pytorch.ipynb @@ -199,7 +199,7 @@ }, "outputs": [], "source": [ - "curated_env_name = \"AzureML-acpt-pytorch-2.2-cuda12.1@latest\"" + "curated_env_name = \"AzureML-acpt-pytorch-2.8-cuda12.6@latest\"" ] }, { diff --git a/tutorials/e2e-distributed-pytorch-image/e2e-object-classification-distributed-pytorch.ipynb b/tutorials/e2e-distributed-pytorch-image/e2e-object-classification-distributed-pytorch.ipynb index 9955e241be..0414ba68a8 100644 --- a/tutorials/e2e-distributed-pytorch-image/e2e-object-classification-distributed-pytorch.ipynb +++ b/tutorials/e2e-distributed-pytorch-image/e2e-object-classification-distributed-pytorch.ipynb @@ -412,7 +412,7 @@ " # \"register_model_as\": \"places_dev\",\n", " \"enable_profiling\": False,\n", " },\n", - " environment=\"AzureML-acpt-pytorch-2.2-cuda12.1@latest\",\n", + " environment=\"AzureML-acpt-pytorch-2.8-cuda12.6@latest\",\n", " compute=\"gpu-cluster\"\n", " if (gpu_cluster)\n", " else None, # No compute needs to be passed to use serverless\n",