diff --git a/accelerator/roles/intel/tasks/install_ubuntu.yml b/accelerator/roles/intel/tasks/install_ubuntu.yml index c2a175ed7b..38dfc65ae6 100644 --- a/accelerator/roles/intel/tasks/install_ubuntu.yml +++ b/accelerator/roles/intel/tasks/install_ubuntu.yml @@ -19,6 +19,9 @@ - name: Install drivers on Gaudi nodes when: node_has_accelerator block: + - name: make sure required kernel command line are set + ansible.builtin.include_tasks: verify_kernel_boot_cmdline.yml + - name: Gather package facts ansible.builtin.package_facts: manager: auto diff --git a/accelerator/roles/intel/tasks/make_sure_scale_out_interfaces_up.yml b/accelerator/roles/intel/tasks/make_sure_scale_out_interfaces_up.yml index 2161e3e73b..75899f9756 100644 --- a/accelerator/roles/intel/tasks/make_sure_scale_out_interfaces_up.yml +++ b/accelerator/roles/intel/tasks/make_sure_scale_out_interfaces_up.yml @@ -39,11 +39,10 @@ - name: Create gaudi-network.sh when: not check_scale_up_script.stat.exists - ansible.builtin.blockinfile: - path: "{{ intel_scale_up_ports_script_path }}" - create: true + ansible.builtin.copy: + dest: "{{ intel_scale_up_ports_script_path }}" mode: "{{ file_permissions }}" - block: | + content: | #!/bin/bash EXT_PORTS="24" RETRIES=10 diff --git a/accelerator/roles/intel/tasks/verify_kernel_boot_cmdline.yml b/accelerator/roles/intel/tasks/verify_kernel_boot_cmdline.yml new file mode 100644 index 0000000000..d155b276c0 --- /dev/null +++ b/accelerator/roles/intel/tasks/verify_kernel_boot_cmdline.yml @@ -0,0 +1,56 @@ +# Copyright 2025 Intel Corporation. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+# Ensure the node boots with iommu=pt and intel_iommu=on; reboot after a change.
+- name: Read current GRUB_CMDLINE_LINUX
+  ansible.builtin.command: grep '^GRUB_CMDLINE_LINUX=' /etc/default/grub
+  register: grub_cmdline
+  changed_when: false
+
+# backrefs appends to the current value; if no double-quoted
+# GRUB_CMDLINE_LINUX line matches, the file is left untouched.
+- name: Ensure required IOMMU kernel parameters are present
+  ansible.builtin.lineinfile:
+    path: /etc/default/grub
+    regexp: '^GRUB_CMDLINE_LINUX="(.*)"'
+    line: 'GRUB_CMDLINE_LINUX="\1 {{ item }}"'
+    backrefs: true
+  loop:
+    - iommu=pt
+    - intel_iommu=on
+  when: item not in grub_cmdline.stdout
+  register: grub_cmdline_update
+
+# Gate on the edit result, not the earlier grep, to avoid needless reboots.
+- name: Update GRUB configuration
+  ansible.builtin.command: update-grub
+  when: grub_cmdline_update is changed
+  changed_when: true
+
+- name: Prompt for reboot
+  ansible.builtin.pause:
+    seconds: "{{ warning_wait_time }}"
+    prompt: "{{ reboot_warning_msg }}"
+  when: grub_cmdline_update is changed
+
+- name: Initiate reboot
+  when: grub_cmdline_update is changed
+  block:
+    - name: Rebooting node (This task will take some time)
+      ansible.builtin.reboot:
+        reboot_timeout: 600
+  rescue:
+    - name: Failed to reboot node
+      ansible.builtin.fail:
+        msg: "{{ reboot_fail_msg }}"
diff --git a/accelerator/roles/intel/vars/main.yml b/accelerator/roles/intel/vars/main.yml
index 75c56bc44b..b37d5ddaf6 100644
--- 
a/accelerator/roles/intel/vars/main.yml +++ b/accelerator/roles/intel/vars/main.yml @@ -19,6 +19,7 @@ intel_gaudi_kernel_module_to_load: - habanalabs - habanalabs_cn - habanalabs_en + - habanalabs_compat # TODO: move to a central config file intel_habana_packages: @@ -70,3 +71,6 @@ intel_scale_up_ports_service_name: "gaudi-network.service" file_permissions: "0755" svc_file_permissions: "0644" gaudi3_pci_vendor_device_class: "1da3:1060:1200" +warning_wait_time: 30 # Time to wait for user input +reboot_warning_msg: "Changes have been made to the GRUB configuration. Do you want to reboot the system now? Press 'Enter' to continue or 'Ctrl+C' to abort." +reboot_fail_msg: "Failed to reboot the node. Please check the system manually." diff --git a/accelerator/tests/test_vars/test_Gaudi_vars.yml b/accelerator/tests/test_vars/test_Gaudi_vars.yml index 8c775cb3f1..b188ac1872 100644 --- a/accelerator/tests/test_vars/test_Gaudi_vars.yml +++ b/accelerator/tests/test_vars/test_Gaudi_vars.yml @@ -18,7 +18,7 @@ oim_dir: "../" Gaudi_validation_script_path: test_Gaudi_validation.yml inventory: ../inventory.ini -Gaudi_Default_version: "1.19.2" +Gaudi_Default_version: "1.21.1" version_pass: 'Gaudi driver version installed on the nodes matched successfully with the default version' version_fail: 'Gaudi driver version installed on the nodes does not matched with the default version' diff --git a/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/CustomLocalRepo.rst b/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/CustomLocalRepo.rst index 8bec845f8a..d83c981a00 100644 --- a/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/CustomLocalRepo.rst +++ b/docs/source/OmniaInstallGuide/Ubuntu/AdvancedConfigurationsUbuntu/CustomLocalRepo.rst @@ -29,7 +29,7 @@ Use the local repository feature to create a customized set of local repositorie {"name": "telemetry"}, {"name": "ucx", "version": "1.15.0"}, {"name": "openmpi", "version": "4.1.6"}, - 
{"name": "intelgaudi", "version": "1.19.2-32"}, + {"name": "intelgaudi", "version": "1.21.1-16"}, {"name": "csi_driver_powerscale", "version":"v2.13.0"} ], "bcm_roce": [ diff --git a/docs/source/OmniaInstallGuide/Ubuntu/CreateLocalRepo/InputParameters.rst b/docs/source/OmniaInstallGuide/Ubuntu/CreateLocalRepo/InputParameters.rst index fa7fe77f17..8431926279 100644 --- a/docs/source/OmniaInstallGuide/Ubuntu/CreateLocalRepo/InputParameters.rst +++ b/docs/source/OmniaInstallGuide/Ubuntu/CreateLocalRepo/InputParameters.rst @@ -37,7 +37,7 @@ Input parameters for Local Repositories {"name": "telemetry"}, {"name": "ucx", "version": "1.15.0"}, {"name": "openmpi", "version": "4.1.6"}, - {"name": "intelgaudi", "version": "1.19.2-32"}, + {"name": "intelgaudi", "version": "1.21.1-16"}, {"name": "csi_driver_powerscale", "version":"v2.13.0"} {"name": "intel_benchmarks", "version": "2024.1"}, {"name": "amd_benchmarks"} diff --git a/docs/source/OmniaInstallGuide/Ubuntu/CreateLocalRepo/localrepos.rst b/docs/source/OmniaInstallGuide/Ubuntu/CreateLocalRepo/localrepos.rst index 77863af30f..661fe40b19 100644 --- a/docs/source/OmniaInstallGuide/Ubuntu/CreateLocalRepo/localrepos.rst +++ b/docs/source/OmniaInstallGuide/Ubuntu/CreateLocalRepo/localrepos.rst @@ -31,7 +31,7 @@ Configure specific local repositories :: - {"name": "intelgaudi", "version": "1.19.2-32"}, + {"name": "intelgaudi", "version": "1.21.1-16"}, * Add the following line below the ``softwares`` section: diff --git a/docs/source/OmniaInstallGuide/samplefiles.rst b/docs/source/OmniaInstallGuide/samplefiles.rst index d8265183eb..d4c95e3f67 100644 --- a/docs/source/OmniaInstallGuide/samplefiles.rst +++ b/docs/source/OmniaInstallGuide/samplefiles.rst @@ -90,7 +90,7 @@ software_config.json for Ubuntu {"name": "telemetry"}, {"name": "ucx", "version": "1.15.0"}, {"name": "openmpi", "version": "4.1.6"}, - {"name": "intelgaudi", "version": "1.19.2-32"}, + {"name": "intelgaudi", "version": "1.21.1-16"}, {"name": 
"csi_driver_powerscale", "version":"v2.13.0"} ], diff --git a/docs/source/Overview/newfeatures.rst b/docs/source/Overview/newfeatures.rst index 2082cb145d..e4f54f9973 100644 --- a/docs/source/Overview/newfeatures.rst +++ b/docs/source/Overview/newfeatures.rst @@ -23,7 +23,7 @@ New Features +--------------------------+-----------------------------------+-------------------------------+ | Kubespray | 2.27 | 2.25 | +--------------------------+-----------------------------------+-------------------------------+ - | Intel Gaudi driver | 1.19.2 | 1.19.1 | + | Intel Gaudi driver | 1.21.1 | 1.19.2 | +--------------------------+-----------------------------------+-------------------------------+ | CSI PowerScale driver | 2.13.0 | 2.11.0 | +--------------------------+-----------------------------------+-------------------------------+ diff --git a/docs/source/Tables/omnia_installed_software.csv b/docs/source/Tables/omnia_installed_software.csv index f66b9bdfdb..1040d0901d 100644 --- a/docs/source/Tables/omnia_installed_software.csv +++ b/docs/source/Tables/omnia_installed_software.csv @@ -50,12 +50,12 @@ beegfs,BeeGFS (formerly known as FhGFS) END USER LICENSE,Public repository for t beegfs on ubuntu 24.04,BeeGFS (formerly known as FhGFS) END USER LICENSE,Public repository for the BeeGFS Parallel File System,7.4.5,,,, csi powerscale driver ,Apache-2.0 license,CSI Driver for Dell PowerScale,2.13.0,,,, CUDA toolkit,NVIDIA Software License,"The NVIDIA® CUDA® Toolkit provides a development environment for creating high-performance, GPU-accelerated applications.",12.8,,,, -Intel Gaudi3 driver,MIT license,Intel Gaudi3 drivers,1.19.2,,,, +Intel Gaudi3 driver,MIT license,Intel Gaudi3 drivers,1.21.1,,,, FreeIPA (RHEL/Rocky),GNU-General Public License v3.0," FreeIPA, an integrated security information management solution",4.9.11,,,, NVIDIA device plugin,Apache License 2.0,NVIDIA device plugin for Kubernetes,0.14.4,,,, rocm device plugin,Apache License 2.0,Kubernetes (k8s) device 
plugin to enable registration of AMD GPU to a container cluster,0.19.0,,,, xilinx-device-plugin,Apache License 2.0,The AMD-Xilinx device plugin for Kubernetes is a Daemonset deployed on the Kubernetes(k8s) cluster,1.2.0,,,, -habanalabs-k8s-device-plugin,Apache License 2.0,HABANA device plugin for Kubernetes,1.19.2,,,, +habanalabs-k8s-device-plugin,Apache License 2.0,HABANA device plugin for Kubernetes,1.21.1,,,, Jupyterhub,BSD-3-Clause license,Multi-user server for Jupyter notebooks,3.2.0,,,, Kserve,Apache License 2.0,Standardized Serverless ML Inference Platform on Kubernetes,0.13.0,,,, kubeflow,Apache-2.0,A repository for Kustomize manifests,1.9.1,,,, @@ -141,7 +141,7 @@ Tensorflow Nvidia,Apache-2.0 license,An Open Source Machine Learning Framework f Kustomize (RHEL/Rocky 8.8),Apache License 2.0,Customization of kubernetes YAML configurations,5.0.3,,,, Kustomize (ubuntu 22.04/24.04),Apache License 2.0,Customization of kubernetes YAML configurations,5.4.3,,,, nfs-subdir-external-provisioner,Apache License 2.0,Dynamic sub-dir volume provisioner on a remote NFS server.,4.0.18,,,, -habana-container-runtime,Apache License 2.0,Habana container runtime,1.19.2,,,, +habana-container-runtime,Apache License 2.0,Habana container runtime,1.21.1,,,, nvidia-container-toolkit,Apache License 2.0,NVIDIA container runtime library,1.17.5,,,, helm-charts,Apache-2.0 license,The source for Dell Helm charts.,csi-isilon-2.13.0,,,, csi-powerscale,Apache-2.0 license,CSI Driver for Dell PowerScale,2.13.0,,,, diff --git a/docs/source/Tables/software_matrix_rhel_rocky.csv b/docs/source/Tables/software_matrix_rhel_rocky.csv index 4598a18074..5d97710ccb 100644 --- a/docs/source/Tables/software_matrix_rhel_rocky.csv +++ b/docs/source/Tables/software_matrix_rhel_rocky.csv @@ -50,7 +50,7 @@ FreeIPA,GNU-General Public License v3.0," FreeIPA, an integrated security infor NVIDIA device plugin,Apache License 2.0,NVIDIA device plugin for Kubernetes,0.14.4 rocm device plugin,Apache License 
2.0,Kubernetes (k8s) device plugin to enable registration of AMD GPU to a container cluster,0.19.0 xilinx-device-plugin,Apache License 2.0,The AMD-Xilinx device plugin for Kubernetes is a Daemonset deployed on the Kubernetes(k8s) cluster,1.2.0 -habanalabs-k8s-device-plugin,Apache License 2.0,HABANA device plugin for Kubernetes,1.19.2 +habanalabs-k8s-device-plugin,Apache License 2.0,HABANA device plugin for Kubernetes,1.21.1 Jupyterhub,BSD-3-Clause license,Multi-user server for Jupyter notebooks,3.2.0 Kserve,Apache License 2.0,Standardized Serverless ML Inference Platform on Kubernetes,0.13.0 kubeflow,Apache-2.0,A repository for Kustomize manifests,1.9.1 diff --git a/docs/source/Tables/software_matrix_ubuntu_22.csv b/docs/source/Tables/software_matrix_ubuntu_22.csv index 8c19db0eea..7b9fe7b5bd 100644 --- a/docs/source/Tables/software_matrix_ubuntu_22.csv +++ b/docs/source/Tables/software_matrix_ubuntu_22.csv @@ -45,11 +45,11 @@ rccl,MIT License,The ROCm Communication Collectives Library (RCCL) is a stand-al beegfs,BeeGFS (formerly known as FhGFS) END USER LICENSE,Public repository for the BeeGFS Parallel File System,7.4.5 beeGFS-Client,GPLv2,"BeeGFS is a parallel file system, developed and optimized for high-performance computing.",7.4.5 CUDA toolkit,NVIDIA Software License,"The NVIDIA® CUDA® Toolkit provides a development environment for creating high-performance, GPU-accelerated applications.",12.8 -Intel Gaudi3 driver,MIT license,Intel Gaudi3 drivers,1.19.2 +Intel Gaudi3 driver,MIT license,Intel Gaudi3 drivers,1.21.1 NVIDIA device plugin,Apache License 2.0,NVIDIA device plugin for Kubernetes,0.14.4 rocm device plugin,Apache License 2.0,Kubernetes (k8s) device plugin to enable registration of AMD GPU to a container cluster,0.19.0 xilinx-device-plugin,Apache License 2.0,The AMD-Xilinx device plugin for Kubernetes is a Daemonset deployed on the Kubernetes(k8s) cluster,1.2.0 -habanalabs-k8s-device-plugin,Apache License 2.0,HABANA device plugin for Kubernetes,1.19.2 
+habanalabs-k8s-device-plugin,Apache License 2.0,HABANA device plugin for Kubernetes,1.21.1 Jupyterhub,BSD-3-Clause license,Multi-user server for Jupyter notebooks,3.2.0 Kserve,Apache License 2.0,Standardized Serverless ML Inference Platform on Kubernetes,0.13.0 kubeflow,Apache-2.0,A repository for Kustomize manifests,1.9.1 @@ -109,8 +109,8 @@ Tensorflow AMD,Apache-2.0 license,An Open Source Machine Learning Framework for Tensorflow NVIDIA,Apache-2.0 license,An Open Source Machine Learning Framework for Everyone,23.12-tf2-py3 Kustomize,Apache License 2.0,Customization of kubernetes YAML configurations,5.4.3 nfs-subdir-external-provisioner,Apache License 2.0,Dynamic sub-dir volume provisioner on a remote NFS server.,4.0.18 -habana-container-runtime,Apache License 2.0,Habana container runtime,1.19.2 -hccl,Habana Outbound Software License Agreement,The Habana Collective Communications Library (HCCL) is Intel® Gaudi®’s emulation layer of the NVIDIA Collective Communication Library (NCCL) and is included in the Intel Gaudi software suite.,1.19.2 +habana-container-runtime,Apache License 2.0,Habana container runtime,1.21.1 +hccl,Habana Outbound Software License Agreement,The Habana Collective Communications Library (HCCL) is Intel® Gaudi®’s emulation layer of the NVIDIA Collective Communication Library (NCCL) and is included in the Intel Gaudi software suite.,1.21.1 nvidia-container-toolkit,Apache License 2.0,NVIDIA container runtime library,1.17.5 helm-charts,Apache-2.0 license,The source for Dell Helm charts.,csi-isilon-2.13.0 csi-powerscale,Apache-2.0 license,CSI Driver for Dell PowerScale,2.13.0 diff --git a/docs/source/Tables/software_matrix_ubuntu_24.csv b/docs/source/Tables/software_matrix_ubuntu_24.csv index 73b0c3f83c..d3ce7ad482 100644 --- a/docs/source/Tables/software_matrix_ubuntu_24.csv +++ b/docs/source/Tables/software_matrix_ubuntu_24.csv @@ -45,11 +45,11 @@ rccl,MIT License,The ROCm Communication Collectives Library (RCCL) is a stand-al beegfs,BeeGFS 
(formerly known as FhGFS) END USER LICENSE,Public repository for the BeeGFS Parallel File System,7.4.5 Beegfs-Client,GPLv2,"BeeGFS is a parallel file system, developed and optimized for high-performance computing.",7.4.5 CUDA toolkit,NVIDIA Software License,"The NVIDIA® CUDA® Toolkit provides a development environment for creating high-performance, GPU-accelerated applications.",12.8 -Intel Gaudi3 driver,MIT license,Intel Gaudi3 drivers,1.19.2 +Intel Gaudi3 driver,MIT license,Intel Gaudi3 drivers,1.21.1 NVIDIA device plugin,Apache License 2.0,NVIDIA device plugin for Kubernetes,0.14.4 rocm device plugin,Apache License 2.0,Kubernetes (k8s) device plugin to enable registration of AMD GPU to a container cluster,0.19.0 xilinx-device-plugin,Apache License 2.0,The AMD-Xilinx device plugin for Kubernetes is a Daemonset deployed on the Kubernetes(k8s) cluster,1.2.0 -habanalabs-k8s-device-plugin,Apache License 2.0,HABANA device plugin for Kubernetes,1.19.2 +habanalabs-k8s-device-plugin,Apache License 2.0,HABANA device plugin for Kubernetes,1.21.1 Jupyterhub,BSD-3-Clause license,Multi-user server for Jupyter notebooks,3.2.0 Kserve,Apache License 2.0,Standardized Serverless ML Inference Platform on Kubernetes,0.13.0 kubeflow,Apache-2.0,A repository for Kustomize manifests,1.9.1 @@ -109,8 +109,8 @@ Tensorflow AMD,Apache-2.0 license,An Open Source Machine Learning Framework for Tensorflow NVIDIA,Apache-2.0 license,An Open Source Machine Learning Framework for Everyone,23.12-tf2-py3 Kustomize,Apache License 2.0,Customization of kubernetes YAML configurations,5.4.3 nfs-subdir-external-provisioner,Apache License 2.0,Dynamic sub-dir volume provisioner on a remote NFS server.,4.0.18 -habana-container-runtime,Apache License 2.0,Habana container runtime,1.19.2 -hccl,Habana Outbound Software License Agreement,The Habana Collective Communications Library (HCCL) is Intel® Gaudi®’s emulation layer of the NVIDIA Collective Communication Library (NCCL) and is included in the Intel Gaudi 
software suite.,1.19.2 +habana-container-runtime,Apache License 2.0,Habana container runtime,1.21.1 +hccl,Habana Outbound Software License Agreement,The Habana Collective Communications Library (HCCL) is Intel® Gaudi®’s emulation layer of the NVIDIA Collective Communication Library (NCCL) and is included in the Intel Gaudi software suite.,1.21.1 nvidia-container-toolkit,Apache License 2.0,NVIDIA container runtime library,1.17.5 helm-charts,Apache-2.0 license,The source for Dell Helm charts.,csi-isilon-2.13.0 csi-powerscale driver,Apache-2.0 license,CSI Driver for Dell PowerScale,2.13.0 diff --git a/examples/ai_examples/intel/deepSpeed/ds_configuration.yml b/examples/ai_examples/intel/deepSpeed/ds_configuration.yml index b027c99846..3a9e5578c1 100644 --- a/examples/ai_examples/intel/deepSpeed/ds_configuration.yml +++ b/examples/ai_examples/intel/deepSpeed/ds_configuration.yml @@ -14,7 +14,7 @@ spec: template: spec: containers: - - image: vault.habana.ai/gaudi-docker/1.19.2/ubuntu24.04/habanalabs/pytorch-installer-2.5.1:latest + - image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu24.04/habanalabs/pytorch-installer-2.6.0:latest name: gaudi-llm-ds-ft-launcher env: - name: HF_HOME @@ -66,11 +66,11 @@ spec: git clone https://github.com/huggingface/optimum-habana /optimum-habana; cd /optimum-habana; - git checkout v1.15.0; + git checkout v1.17.0; sed -i '194s|deepspeed|deepspeed --force_multi|' optimum/habana/distributed/distributed_runner.py; retry_until_success pip install .; retry_until_success pip install -r examples/language-modeling/requirements.txt; - retry_until_success pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0; + retry_until_success pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.21.0; mpirun --npernode 1 \ --tag-output \ @@ -108,7 +108,7 @@ spec: git clone https://github.com/huggingface/optimum-habana /optimum-habana cd /optimum-habana - git checkout v1.15.0 + git checkout v1.17.0 hf_home_var="os.environ[\"HF_HOME\"] = 
\"${HF_HOME}\"" token_var="os.environ[\"HUGGING_FACE_HUB_TOKEN\"] = \"${HUGGING_FACE_HUB_TOKEN}\"" https_var="os.environ[\"https_proxy\"] = \"${https_proxy}\"" @@ -121,7 +121,7 @@ spec: sed -i "60i\\${no_proxy_var}" examples/language-modeling/run_lora_clm.py retry_until_success pip install . retry_until_success pip install -r examples/language-modeling/requirements.txt - retry_until_success pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0 + retry_until_success pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.21.0 '; eval $(ssh-agent); @@ -172,7 +172,7 @@ spec: spec: hostIPC: true containers: - - image: vault.habana.ai/gaudi-docker/1.19.2/ubuntu24.04/habanalabs/pytorch-installer-2.5.1:latest + - image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu24.04/habanalabs/pytorch-installer-2.6.0:latest name: gaudi-llm-ds-ft-worker command: ["/bin/bash", "-c"] args: diff --git a/examples/ai_examples/intel/vllm/vllm_configuration.yml b/examples/ai_examples/intel/vllm/vllm_configuration.yml index 856b4ff597..5dbe493a56 100644 --- a/examples/ai_examples/intel/vllm/vllm_configuration.yml +++ b/examples/ai_examples/intel/vllm/vllm_configuration.yml @@ -34,7 +34,7 @@ spec: app: vllm-llama-app spec: containers: - - image: vault.habana.ai/gaudi-docker/1.19.2/ubuntu24.04/habanalabs/pytorch-installer-2.5.1:latest + - image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu24.04/habanalabs/pytorch-installer-2.6.0:latest name: vllm-llama-openai imagePullPolicy: Always workingDir: /root @@ -63,7 +63,7 @@ spec: - "/bin/sh" - "-c" - | - git clone -b v0.6.4.post2+Gaudi-1.19.2 https://github.com/HabanaAI/vllm-fork.git + git clone -b v0.7.2+Gaudi-1.21.0 https://github.com/HabanaAI/vllm-fork.git cd vllm-fork pip install -v -r requirements-hpu.txt export VLLM_TARGET_DEVICE=hpu diff --git a/examples/software_config_template/template_ubuntu_22.04_software_config.json b/examples/software_config_template/template_ubuntu_22.04_software_config.json index 800144755c..5e947bbc74 100644 
--- a/examples/software_config_template/template_ubuntu_22.04_software_config.json +++ b/examples/software_config_template/template_ubuntu_22.04_software_config.json @@ -5,7 +5,7 @@ "softwares": [ {"name": "amdgpu", "version": "6.3.1"}, {"name": "bcm_roce", "version": "232.1.133.2"}, - {"name": "intelgaudi", "version": "1.19.2-32"}, + {"name": "intelgaudi", "version": "1.21.1-16"}, {"name": "cuda", "version": "12.8.0"}, {"name": "ofed", "version": "24.01-0.3.3.1"}, {"name": "openldap"}, diff --git a/examples/software_config_template/template_ubuntu_24.04_software_config.json b/examples/software_config_template/template_ubuntu_24.04_software_config.json index a42ebbd35d..dad7e8f3d8 100644 --- a/examples/software_config_template/template_ubuntu_24.04_software_config.json +++ b/examples/software_config_template/template_ubuntu_24.04_software_config.json @@ -5,7 +5,7 @@ "softwares": [ {"name": "amdgpu", "version": "6.3.1"}, {"name": "bcm_roce", "version": "232.1.133.2"}, - {"name": "intelgaudi", "version": "1.19.2-32"}, + {"name": "intelgaudi", "version": "1.21.1-16"}, {"name": "cuda", "version": "12.8.0"}, {"name": "ofed", "version": "24.07-0.6.1.0"}, {"name": "openldap"}, diff --git a/examples/ubuntu_software_config.json b/examples/ubuntu_software_config.json index d431bb4c66..a5ee5d51dc 100644 --- a/examples/ubuntu_software_config.json +++ b/examples/ubuntu_software_config.json @@ -12,7 +12,7 @@ {"name": "jupyter"}, {"name": "pytorch"}, {"name": "tensorflow"}, - {"name": "intelgaudi", "version": "1.19.2-32"} + {"name": "intelgaudi", "version": "1.21.1-16"} ], "bcm_roce": [ diff --git a/input/config/ubuntu/22.04/k8s.json b/input/config/ubuntu/22.04/k8s.json index e379d5b663..fb37b55e8b 100644 --- a/input/config/ubuntu/22.04/k8s.json +++ b/input/config/ubuntu/22.04/k8s.json @@ -250,7 +250,7 @@ }, { "package": "vault.habana.ai/docker-k8s-device-plugin/docker-k8s-device-plugin", - "tag": "1.19.2-32", + "tag": "1.21.1-16", "type": "image" }, { diff --git 
a/input/config/ubuntu/22.04/pytorch.json b/input/config/ubuntu/22.04/pytorch.json index add84a1c63..49f8247178 100644 --- a/input/config/ubuntu/22.04/pytorch.json +++ b/input/config/ubuntu/22.04/pytorch.json @@ -46,7 +46,7 @@ "cluster": [ { - "package": "vault.habana.ai/gaudi-docker/1.19.2/ubuntu22.04/habanalabs/pytorch-installer-2.5.1", + "package": "vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0", "tag": "latest", "type": "image" } diff --git a/input/config/ubuntu/24.04/k8s.json b/input/config/ubuntu/24.04/k8s.json index 9f4d31b3fc..175806eb4f 100644 --- a/input/config/ubuntu/24.04/k8s.json +++ b/input/config/ubuntu/24.04/k8s.json @@ -249,7 +249,7 @@ }, { "package": "vault.habana.ai/docker-k8s-device-plugin/docker-k8s-device-plugin", - "tag": "1.19.2-32", + "tag": "1.21.1-16", "type": "image" }, { diff --git a/input/config/ubuntu/24.04/pytorch.json b/input/config/ubuntu/24.04/pytorch.json index 93c1f3a0ab..2abef2b176 100644 --- a/input/config/ubuntu/24.04/pytorch.json +++ b/input/config/ubuntu/24.04/pytorch.json @@ -46,7 +46,7 @@ "cluster": [ { - "package": "vault.habana.ai/gaudi-docker/1.19.2/ubuntu24.04/habanalabs/pytorch-installer-2.5.1", + "package": "vault.habana.ai/gaudi-docker/1.21.1/ubuntu24.04/habanalabs/pytorch-installer-2.6.0", "tag": "latest", "type": "image" } diff --git a/input/software_config.json b/input/software_config.json index b8cc145bac..999702bc51 100644 --- a/input/software_config.json +++ b/input/software_config.json @@ -12,7 +12,7 @@ {"name": "jupyter"}, {"name": "pytorch"}, {"name": "tensorflow"}, - {"name": "intelgaudi", "version": "1.19.2-32"} + {"name": "intelgaudi", "version": "1.21.1-16"} ], "bcm_roce": [ diff --git a/utils/roles/check_package_lock/files/check_apt_lock.sh b/utils/roles/check_package_lock/files/check_apt_lock.sh old mode 100644 new mode 100755 diff --git a/utils/verify_intel_gaudi/roles/verify_intel_gaudi/tasks/hlqual_gaudi2_validation.yml 
b/utils/verify_intel_gaudi/roles/verify_intel_gaudi/tasks/hlqual_gaudi2_validation.yml index 6678bb52cd..f6c462c86b 100644 --- a/utils/verify_intel_gaudi/roles/verify_intel_gaudi/tasks/hlqual_gaudi2_validation.yml +++ b/utils/verify_intel_gaudi/roles/verify_intel_gaudi/tasks/hlqual_gaudi2_validation.yml @@ -130,23 +130,6 @@ failed_when: "'FAILED' in serdes_base_allgather_test_result.stdout" changed_when: true -- name: Run hl_qual serdes base direct bandwidth test - environment: - __python_cmd: "python{{ pver }}" - LOG_LEVEL_ALL: "{{ verify_intel_gaudi_habana_tests['log_level_all'] }}" - ENABLE_CONSOLE: "{{ verify_intel_gaudi_habana_tests['enable_console'] }}" - HABANA_LOGS: "{{ verify_intel_gaudi_habana_tests['habana_logs'] }}" - HABANALABS_HLTHUNK_TESTS_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habanalabs_hlthunk_tests_bin_path'] }}" - ansible.builtin.shell: | - set -o pipefail - ./hl_qual -gaudi2 -c all -rmod parallel -i 40 -ep 40 -nic_base -test_type dir_bw -dis_mon - args: - executable: /bin/bash - chdir: "{{ verify_intel_gaudi_habana_tests['gaudi2_qual_bin_path'] }}" - register: serdes_base_dirbw_test_result - failed_when: "'FAILED' in serdes_base_dirbw_test_result.stdout" - changed_when: true - - name: Unload habanalabs kernel module community.general.modprobe: name: habanalabs diff --git a/utils/verify_intel_gaudi/roles/verify_intel_gaudi/tasks/hlqual_gaudi3_validation.yml b/utils/verify_intel_gaudi/roles/verify_intel_gaudi/tasks/hlqual_gaudi3_validation.yml index 7631d1817a..f7332fdcec 100644 --- a/utils/verify_intel_gaudi/roles/verify_intel_gaudi/tasks/hlqual_gaudi3_validation.yml +++ b/utils/verify_intel_gaudi/roles/verify_intel_gaudi/tasks/hlqual_gaudi3_validation.yml @@ -130,23 +130,6 @@ failed_when: "'FAILED' in serdes_base_allgather_test_result.stdout" changed_when: true -- name: Run hl_qual serdes base direct bandwidth test - environment: - __python_cmd: "python{{ pver }}" - LOG_LEVEL_ALL: "{{ verify_intel_gaudi_habana_tests['log_level_all'] }}" 
- ENABLE_CONSOLE: "{{ verify_intel_gaudi_habana_tests['enable_console'] }}" - HABANA_LOGS: "{{ verify_intel_gaudi_habana_tests['habana_logs'] }}" - HABANALABS_HLTHUNK_TESTS_BIN_PATH: "{{ verify_intel_gaudi_habana_tests['habanalabs_hlthunk_tests_bin_path'] }}" - ansible.builtin.shell: | - set -o pipefail - ./hl_qual -gaudi3 -c all -rmod parallel -i 40 -ep 40 -nic_base -test_type dir_bw -dis_mon - args: - executable: /bin/bash - chdir: "{{ verify_intel_gaudi_habana_tests['gaudi3_qual_bin_path'] }}" - register: serdes_base_dirbw_test_result - failed_when: "'FAILED' in serdes_base_dirbw_test_result.stdout" - changed_when: true - - name: Run hl_qual HBM DMA stress test environment: __python_cmd: "python{{ pver }}"