diff --git a/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/LICENSE b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/LICENSE
new file mode 100644
index 000000000..46c0c79d9
--- /dev/null
+++ b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/LICENSE
@@ -0,0 +1,35 @@
+Copyright (c) 2025 Oracle and/or its affiliates.
+
+The Universal Permissive License (UPL), Version 1.0
+
+Subject to the condition set forth below, permission is hereby granted to any
+person obtaining a copy of this software, associated documentation and/or data
+(collectively the "Software"), free of charge and under any and all copyright
+rights in the Software, and any and all patent rights owned or freely
+licensable by each licensor hereunder covering either (i) the unmodified
+Software as contributed to or provided by such licensor, or (ii) the Larger
+Works (as defined below), to deal in both
+
+(a) the Software, and
+(b) any piece of software and/or hardware listed in the lrgrwrks.txt file if
+one is included with the Software (each a "Larger Work" to which the Software
+is contributed by such licensors),
+
+without restriction, including without limitation the rights to copy, create
+derivative works of, display, perform, and distribute the Software and make,
+use, sell, offer for sale, import, export, have made, and have sold the
+Software and the Larger Work(s), and to sublicense the foregoing rights on
+either these or other terms.
+
+This license is subject to the following condition:
+The above copyright notice and either this complete permission notice or at
+a minimum a reference to the UPL must be included in all copies or
+substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/README.md b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/README.md
new file mode 100644
index 000000000..dcabe366e
--- /dev/null
+++ b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/README.md
@@ -0,0 +1,132 @@
+# Training LLMs with NVIDIA NeMo using Oracle Container Engine for Kubernetes
+
+This repository demonstrates how to train LLMs with
+[NVIDIA NeMo](https://www.nvidia.com/en-gb/ai-data-science/products/nemo/)
+and [NVIDIA Megatron](https://developer.nvidia.com/megatron-core)
+on Oracle Container Engine for Kubernetes (OKE).
+
+Reference results from NVIDIA for training Llama 3 can be found on the
+[NGC Catalog](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/dgxc-benchmarking/resources/llama3-dgxc-benchmarking).
+
+Reviewed: 18.03.2025
+
+# When to use this asset?
+
+* If you want to get started with training LLMs like Llama 3 on Kubernetes using OCI.
+
+# How to use this asset?
+
+## Prerequisites
+
+* You have access to an Oracle Cloud tenancy.
+* You have access to shapes with NVIDIA GPUs such as H100.
+* You have a HuggingFace account and access to `meta-llama/Llama-3.1-8B-Instruct`.
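+
+As an optional sanity check, you can verify up front that your HuggingFace token has access to the gated Llama 3.1 repository. The snippet below is a minimal sketch; it only fetches the small `config.json` file, and the target directory is arbitrary:
+```sh
+huggingface-cli login
+huggingface-cli download meta-llama/Llama-3.1-8B-Instruct config.json --local-dir /tmp/llama-access-check
+```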
+
+This guide is loosely based on the
+[NVIDIA NeMo Framework Launcher guide for Kubernetes](https://docs.nvidia.com/nemo-framework/user-guide/24.07/playbooks/kubernetes.html).
+
+## Infrastructure Setup
+
+1. Create an OKE cluster according
+   [to the instructions](https://github.com/oracle-quickstart/oci-hpc-oke/tree/main#instructions-for-deploying-an-oke-cluster-with-gpus-and-rdma-connectivity),
+   importing one of the images and creating a GPU partition with BM.GPU.H100.8 nodes.
+
+   The configuration here assumes a minimum of 16 BM.GPU.H100.8 nodes.
+
+   - Ensure that the following setting is selected under the "OKE Cluster" section:
+
+     > Disable OKE GPU device plugin
+
+     as this tutorial will install the GPU operator later.
+
+2. Create a new File System for NFS, and modify the [persistent volume configuration in `pv.yaml`](./files/pv.yaml) to match.
+   Ideally, this will utilize High Performance Mount Targets (HPMT) as described in the following two whitepapers:
+   * [Scale Out OCI File Storage Performance for AI/ML and
+     Data-Intensive Workloads](https://docs.oracle.com/en-us/iaas/Content/Resources/Assets/whitepapers/scale-out-oci-file-storage-performance-for-data-intensive-workloads.pdf)
+   * [File Storage Performance Guide](https://docs.oracle.com/en-us/iaas/Content/Resources/Assets/whitepapers/file-storage-performance-guide.pdf)
+
+3. Install the NVIDIA GPU Operator according to the
+   [NVIDIA NeMo Framework Launcher guide for Kubernetes](https://docs.nvidia.com/nemo-framework/user-guide/24.07/playbooks/kubernetes.html), then install the [Volcano scheduler](https://github.com/volcano-sh/volcano) with:
+   ```sh
+   kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/master/installer/volcano-development.yaml
+   ```
+
+4. Copy the [files in this repository](./files) to the Kubernetes operator node.
+   You can download them from this repository via:
+   ```sh
+   BRANCH=main
+   curl -L https://github.com/oracle-devrel/technology-engineering/archive/refs/heads/${BRANCH}.tar.gz|tar xzf - --strip-components=6 technology-engineering-${BRANCH}/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files
+   ```
+
+   Then modify the values in [`training/values.yaml`](./files/training/values.yaml) to match the storage server and export path.
+
+5. Mount the file system on the Kubernetes operator node. In the following, the mount location is assumed to be `/mnt/data/`.
+
+## Data Preparation and Training
+
+1. Download the tokenizer model from HuggingFace:
+   ```sh
+   mkdir -p /mnt/data/tokenizer
+   huggingface-cli login
+   huggingface-cli download meta-llama/Llama-3.1-8B-Instruct tokenizer_config.json --local-dir /mnt/data/tokenizer
+   huggingface-cli download meta-llama/Llama-3.1-8B-Instruct tokenizer.json --local-dir /mnt/data/tokenizer
+   ```
+
+2. Apply the preprocessing job that will download and tokenize parts of the Pile dataset:
+   ```sh
+   helm install --set num_nodes=1 --set download_data=true "my-preprocessing" ./training
+   ```
+
+   The progress can then be monitored with:
+   ```sh
+   kubectl logs -f megatron-prep-my-preprocessing-mpimaster-0
+   ```
+
+3. Following successful preprocessing, the training can be started with:
+   ```sh
+   helm install --set num_nodes=1 "my-training-v0" ./training
+   ```
+
+   The progress can then be monitored with:
+   ```sh
+   kubectl logs -f megatron-train-my-training-v0-mpimaster-0
+   ```
+
+4. Calculate the training throughput. For this, the following data is required from the training output:
+   ```
+   [NeMo I 2025-03-10 16:24:43 perf_metrics_utils:42] train_step_timing in s: [7.13, 7.12, 7.12, 7.13, 7.13, 7.13, 7.12, 7.13, 7.14, 7.13, 7.14, 7.26, 7.13, 7.13, 7.13, 7.13, 7.15, 7.14, 7.14, 7.13, 7.14, 7.14, 7.14, 7.14, 7.13, 7.14, 7.14, 7.14, 7.14, 7.14]
+   ```
+   This log can be saved into a file with:
+   ```sh
+   kubectl logs megatron-train-my-training-v0-mpimaster-0 > training.log
+   ```
+   and the performance analyzed with:
+   ```sh
+   python3 utils/performance.py training.log
+   ```
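+
+As a quick plausibility check, throughput is just tokens per step divided by step time, where tokens per step = `global_batch_size` × `encoder_seq_length`. The snippet below is a minimal sketch of the same arithmetic the script performs, assuming the single-node run above with `config_llama3_8b.yaml` (global batch size 128 after the Helm override, sequence length 8192, 8 GPUs) and the roughly 7.13 s step time from the sample log:
+```sh
+python3 - <<'EOF'
+# Illustrative numbers taken from the sample single-node run above
+global_batch_size = 128   # num_nodes x base_global_batch_size (Helm override)
+seq_length = 8192         # encoder_seq_length in config_llama3_8b.yaml
+step_time = 7.13          # average train_step_timing in seconds
+num_gpus = 8              # num_nodes x devices
+throughput = global_batch_size * seq_length / step_time
+print(f"{throughput:,.0f} tokens/s total, {throughput / num_gpus:,.0f} tokens/s per GPU")
+EOF
+```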
+
+## Potential Issues
+
+* **PyTorch can't resolve hostnames via c10d**
+
+  If the rendezvous backend for PyTorch fails to connect to an OCI-style
+  hostname for Kubernetes clusters, you can work around this resolution failure by
+  augmenting `/etc/hosts` for every pod.
+
+  For convenience, this is facilitated by appending the host mapping to `mpi.yaml` via
+  ```sh
+  ./utils/host_list.sh >> ./training/files/mpi.yaml
+  ```
+  and afterwards reinstalling the training job via Helm.
+
+# Acknowledgments
+
+- **Author** - Matthias Wolf (GPU Solution Specialist)
+
+# License
+
+Copyright (c) 2025 Oracle and/or its affiliates.
+
+Licensed under the Universal Permissive License (UPL), Version 1.0.
+
+See [LICENSE](https://github.com/oracle-devrel/technology-engineering/blob/main/LICENSE) for more details.
diff --git a/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/.helmignore b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/.helmignore
new file mode 100644
index 000000000..0e8a0eb36
--- /dev/null
+++ b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/.helmignore
@@ -0,0 +1,23 @@
+# Patterns to ignore when building packages.
+# This supports shell glob matching, relative path matching, and
+# negation (prefixed with !). Only one pattern per line.
+.DS_Store
+# Common VCS dirs
+.git/
+.gitignore
+.bzr/
+.bzrignore
+.hg/
+.hgignore
+.svn/
+# Common backup files
+*.swp
+*.bak
+*.tmp
+*.orig
+*~
+# Various IDEs
+.project
+.idea/
+*.tmproj
+.vscode/
diff --git a/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/Chart.yaml b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/Chart.yaml
new file mode 100644
index 000000000..01b27cc34
--- /dev/null
+++ b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/Chart.yaml
@@ -0,0 +1,7 @@
+# Copyright (c) 2025 Oracle and/or its affiliates.
+apiVersion: v2
+name: training
+description: A Helm chart to train LLMs on Kubernetes using NVIDIA NeMo and Megatron
+type: application
+version: 0.1.0
+appVersion: "1.16.0"
diff --git a/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/files/config_llama3_70b.yaml b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/files/config_llama3_70b.yaml
new file mode 100644
index 000000000..eef6460de
--- /dev/null
+++ b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/files/config_llama3_70b.yaml
@@ -0,0 +1,229 @@
+# Copyright (c) 2025 Oracle and/or its affiliates.
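+# Parallelism layout for reference: trainer.num_nodes x trainer.devices = 8 x 8 = 64 GPUs;
+# tensor_model_parallel_size 4 x pipeline_model_parallel_size 4 gives the
+# model_parallel_size of 16 set under exp_manager, and with context_parallel_size 2
+# this leaves a data-parallel size of 64 / (4 x 4 x 2) = 2.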
+run: + name: llama3_70b + results_dir: /mnt/data/results/llama3_70b + time_limit: 0-01:00:00 + dependency: singleton +trainer: + num_nodes: 8 + devices: 8 + accelerator: gpu + precision: bf16-mixed + logger: false + enable_checkpointing: false + use_distributed_sampler: false + max_epochs: null + max_steps: 100 + max_time: '19:23:30:00' + log_every_n_steps: 1 + val_check_interval: 100 + limit_val_batches: 32 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + num_sanity_val_steps: 0 +exp_manager: + explicit_log_dir: /mnt/data/results/llama3_70b/logs + exp_dir: null + name: megatron_llama + create_wandb_logger: false + wandb_logger_kwargs: + project: nemo_llama_pretrain + name: llama3_1_70b + resume_if_exists: false + resume_ignore_no_checkpoint: true + create_checkpoint_callback: false + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: false + save_nemo_on_train_end: false + filename: megatron_llama--{val_loss:.2f}-{step}-{consumed_samples} + model_parallel_size: 16 + log_step_timing: true + step_timing_kwargs: + sync_cuda: true + seconds_to_sleep: 60 +model: + mcore_gpt: true + micro_batch_size: 1 + global_batch_size: 128 + rampup_batch_size: null + tensor_model_parallel_size: 4 + pipeline_model_parallel_size: 4 + virtual_pipeline_model_parallel_size: 5 + context_parallel_size: 2 + encoder_seq_length: 8192 + max_position_embeddings: 8192 + num_layers: 80 + hidden_size: 8192 + ffn_hidden_size: 28672 + num_attention_heads: 64 + num_query_groups: 8 + init_method_std: 0.008944 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + apply_rope_fusion: true + attention_type: multihead + share_embeddings_and_output_weights: false + tokenizer: + library: huggingface + type: /mnt/data/tokenizer + use_fast: true + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + cross_entropy_loss_fusion: true + bias_activation_fusion: true + bias_dropout_add_fusion: true + masked_softmax_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: null + activations_checkpoint_method: null + activations_checkpoint_num_layers: null + num_micro_batches_with_partial_activation_checkpoints: null + activations_checkpoint_layers_per_pipeline: null + sequence_parallel: true + defer_embedding_wgrad_compute: true + wgrad_deferral_limit: 22 + deterministic_mode: false + transformer_engine: true + fp8: false + fp8_e4m3: false + fp8_hybrid: false + fp8_margin: 0 + fp8_interval: 1 + fp8_amax_history_len: 1024 + fp8_amax_compute_algo: max + ub_tp_comm_overlap: true + use_flash_attention: true + overlap_p2p_comm: true + batch_p2p_comm: false + gc_interval: 25 + 
nsys_profile: + enabled: false + trace: + - nvtx + - cuda + start_step: 10 + end_step: 10 + ranks: + - 0 + gen_shape: false + optim: + name: distributed_fused_adam + lr: 0.0003 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + bucket_cap_mb: 75 + overlap_grad_sync: true + overlap_param_sync: true + contiguous_grad_buffer: true + contiguous_param_buffer: true + sched: + name: CosineAnnealing + warmup_steps: 2000 + constant_steps: 0 + min_lr: 2.9999999999999997e-05 + grad_sync_dtype: bf16 + data: + data_impl: mmap + splits_string: 99990,8,2 + seq_length: 8192 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: true + reset_attention_mask: true + eod_mask_loss: false + index_mapping_dir: null + data_prefix: + - 1 + - /mnt/data/pile/my-gpt3_00_text_document + ub_tp_comm_overlap_cfg: + fc1_dgrad: + cga_size: 2 + method: bulk + num_sm: 2 + set_sm_margin: 0 + fc1_fprop: + aggregate: 0 + method: ring_exchange + num_sm: 1 + set_sm_margin: 0 + fc1_wgrad: + cga_size: 2 + method: bulk + num_sm: 4 + set_sm_margin: 0 + fc2_dgrad: + aggregate: 0 + method: ring_exchange + num_sm: 1 + set_sm_margin: 0 + fc2_fprop: + cga_size: 2 + method: pipeline + num_sm: 16 + num_splits: 4 + set_sm_margin: 1 + proj_dgrad: + aggregate: 0 + method: ring_exchange + num_sm: 1 + set_sm_margin: 0 + proj_fprop: + cga_size: 2 + method: pipeline + num_sm: 24 + num_splits: 4 + set_sm_margin: 1 + qkv_dgrad: + cga_size: 2 + method: bulk + num_sm: 4 + set_sm_margin: 0 + qkv_fprop: + aggregate: 0 + method: ring_exchange + num_sm: 1 + set_sm_margin: 0 + qkv_wgrad: + cga_size: 2 + method: bulk + num_sm: 24 + set_sm_margin: 0 diff --git a/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/files/config_llama3_8b.yaml b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/files/config_llama3_8b.yaml new file mode 100644 index 000000000..7f8068831 --- /dev/null +++ b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/files/config_llama3_8b.yaml @@ -0,0 +1,163 @@ +# Copyright (c) 2025 Oracle and/or its affiliates. 
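+# Note for reference: trainer.num_nodes and model.global_batch_size below are only
+# defaults; the Helm chart overrides both on the launcher command line
+# (trainer.num_nodes=<num_nodes>, model.global_batch_size=<num_nodes> x base_global_batch_size
+# from values.yaml), so a single-node run trains with a global batch size of 128.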
+hydra: + searchpath: + - file:///opt/NeMo/examples/nlp/language_modeling/conf +run: + name: llama3_8b + results_dir: /mnt/data/results/llama3_8b + time_limit: 0-01:00:00 + dependency: singleton +trainer: + num_nodes: 4 + devices: 8 + accelerator: gpu + precision: bf16 + logger: false + enable_checkpointing: false + use_distributed_sampler: false + max_epochs: null + max_steps: 100 + max_time: '19:23:30:00' + log_every_n_steps: 1 + val_check_interval: 100 + limit_val_batches: 32 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 +exp_manager: + explicit_log_dir: /mnt/data/results/llama3_8b/logs + exp_dir: null + name: megatron_llama + create_wandb_logger: false + wandb_logger_kwargs: + project: nemo_llama_pretrain + name: llama3_8b_bf16 + resume_if_exists: false + resume_ignore_no_checkpoint: true + create_checkpoint_callback: false + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: false + save_nemo_on_train_end: true + filename: megatron_llama--{val_loss:.2f}-{step}-{consumed_samples} + model_parallel_size: 1 + log_step_timing: true + step_timing_kwargs: + sync_cuda: true + buffer_size: 5 + seconds_to_sleep: 60 +model: + mcore_gpt: true + micro_batch_size: 1 + global_batch_size: 512 + rampup_batch_size: null + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + context_parallel_size: 2 + encoder_seq_length: 8192 + max_position_embeddings: 8192 + num_layers: 32 + hidden_size: 4096 + ffn_hidden_size: 14336 + num_attention_heads: 32 + num_query_groups: 8 + init_method_std: 0.01 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + apply_rope_fusion: true + attention_type: multihead + share_embeddings_and_output_weights: false + tokenizer: + library: huggingface + type: /mnt/data/tokenizer + use_fast: true + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: true + bias_dropout_add_fusion: true + masked_softmax_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: null + activations_checkpoint_method: null + activations_checkpoint_num_layers: null + num_micro_batches_with_partial_activation_checkpoints: null + activations_checkpoint_layers_per_pipeline: null + sequence_parallel: false + transformer_engine: true + fp8: false + fp8_e4m3: false + fp8_hybrid: false + fp8_margin: 0 + fp8_interval: 1 + fp8_amax_history_len: 1024 + fp8_amax_compute_algo: max + ub_tp_comm_overlap: false + use_flash_attention: true + gc_interval: 25 + optim: + name: distributed_fused_adam + lr: 0.0001 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + bucket_cap_mb: 125 + 
overlap_grad_sync: true + overlap_param_sync: true + contiguous_grad_buffer: true + contiguous_param_buffer: true + sched: + name: CosineAnnealing + warmup_steps: 500 + constant_steps: 0 + min_lr: 1.0e-05 + grad_sync_dtype: bf16 + data: + data_impl: mmap + splits_string: 99990,8,2 + seq_length: 8192 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: null + data_prefix: + - 1 + - /mnt/data/pile/my-gpt3_00_text_document diff --git a/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/files/config_llama3_8b_v2.yaml b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/files/config_llama3_8b_v2.yaml new file mode 100644 index 000000000..71dd2992f --- /dev/null +++ b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/files/config_llama3_8b_v2.yaml @@ -0,0 +1,173 @@ +# Copyright (c) 2025 Oracle and/or its affiliates. +run: + name: llama3_8b + results_dir: /mnt/data/results/llama3_8b + time_limit: 0-01:00:00 + dependency: singleton +trainer: + num_nodes: 1 + devices: 8 + accelerator: gpu + precision: bf16 + logger: false + enable_checkpointing: false + use_distributed_sampler: false + max_epochs: null + max_steps: 100 + max_time: '19:23:30:00' + log_every_n_steps: 1 + val_check_interval: 100 + limit_val_batches: 1 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + num_sanity_val_steps: 0 +exp_manager: + explicit_log_dir: /mnt/data/results/llama3_8b/logs + exp_dir: null + name: megatron_llama + create_wandb_logger: false + wandb_logger_kwargs: + project: nemo_llama_pretrain + name: llama3_1_8b + resume_if_exists: false + resume_ignore_no_checkpoint: true + create_checkpoint_callback: false + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: false + save_nemo_on_train_end: false + filename: megatron_llama--{val_loss:.2f}-{step}-{consumed_samples} + model_parallel_size: 1 + log_step_timing: true + step_timing_kwargs: + sync_cuda: true + buffer_size: 5 + seconds_to_sleep: 60 +model: + mcore_gpt: true + micro_batch_size: 1 + global_batch_size: 512 + rampup_batch_size: null + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + context_parallel_size: 2 + encoder_seq_length: 8192 + max_position_embeddings: 8192 + num_layers: 32 + hidden_size: 4096 + ffn_hidden_size: 14336 + num_attention_heads: 32 + num_query_groups: 8 + init_method_std: 0.01 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + apply_rope_fusion: true + cross_entropy_loss_fusion: true + attention_type: multihead + share_embeddings_and_output_weights: false + tokenizer: + library: huggingface + type: /mnt/data/tokenizer + use_fast: true + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + 
fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: true + bias_dropout_add_fusion: true + masked_softmax_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: null + activations_checkpoint_method: null + activations_checkpoint_num_layers: null + num_micro_batches_with_partial_activation_checkpoints: null + activations_checkpoint_layers_per_pipeline: null + sequence_parallel: false + deterministic_mode: false + transformer_engine: true + fp8: false + fp8_e4m3: false + fp8_hybrid: false + fp8_margin: 0 + fp8_interval: 1 + fp8_amax_history_len: 1024 + fp8_amax_compute_algo: max + ub_tp_comm_overlap: false + use_flash_attention: true + gc_interval: 25 + nsys_profile: + enabled: false + trace: + - nvtx + - cuda + start_step: 10 + end_step: 10 + ranks: + - 0 + gen_shape: false + optim: + name: distributed_fused_adam + lr: 0.0003 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + bucket_cap_mb: 75 + overlap_grad_sync: true + overlap_param_sync: true + contiguous_grad_buffer: true + contiguous_param_buffer: true + sched: + name: CosineAnnealing + warmup_steps: 2000 + constant_steps: 0 + min_lr: 2.9999999999999997e-05 + grad_sync_dtype: bf16 + data: + data_impl: mmap + splits_string: 99990,8,2 + seq_length: 8192 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: true + reset_attention_mask: true + eod_mask_loss: false + index_mapping_dir: null + data_prefix: + - 1 + - /mnt/data/pile/my-gpt3_00_text_document diff --git a/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/files/sort_hosts.sh b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/files/sort_hosts.sh new file mode 100755 index 000000000..46fb15f74 --- /dev/null +++ b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/files/sort_hosts.sh @@ -0,0 +1,38 @@ +#!/bin/bash +# Copyright (c) 2025 Oracle and/or its affiliates. + +# Use this script inside a Volcano Kubernetes job. +# +# Assumes that the workers are named "mpiworker", and host management is +# enabled. Currently only reliable for BM H100 shapes, as it uses the block +# rather than rack ID to sort hosts. +# +# This may add 5-10% performance gain by speeding up MPI operations when passed +# to `mpirun` via +# +# ./utils/sort_hosts.sh myhosts.sorted +# mpirun -hostfile myhosts.sorted ... 
+ +for host in ${VC_MPIWORKER_HOSTS//,/ }; do + echo -n "$host " >> myhosts + mpirun --allow-run-as-root \ + -mca plm_rsh_args "-p 2222" \ + --host $host -n 1 bash -c "curl -s http://169.254.169.254/opc/v1/host/| jq .rdmaTopologyData.customerLocalBlock" >> myhosts || echo "none" >> myhosts +done +python << EOF > ${1:-myhosts.sorted} +import sys + +DATA="""\ +$(> /etc/hosts + + export HYDRA_FULL_ERROR=1 + export CUDA_DEVICE_MAX_CONNECTIONS=1 + export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python + + # From NVIDIA SLURM reference setup; + export SLURM_UNBUFFEREDIO=1 + export TORCHX_MAX_RETRIES=0 + export CUDA_DEVICE_MAX_CONNECTIONS=1 + export TOKENIZERS_PARALLELISM=False + export TRANSFORMERS_OFFLINE=1 + export TORCH_NCCL_AVOID_RECORD_STREAMS=1 + export NCCL_NVLS_ENABLE=0 + export NVTE_DP_AMAX_REDUCE_INTERVAL=0 + export NVTE_ASYNC_AMAX_REDUCTION=1 + export NVTE_APPLY_QK_LAYER_SCALING=0 + export NVTE_FLASH_ATTN=0 + export NVTE_FUSED_ATTN=1 + export NEMO_LOG_MEMORY_USAGE=1 + export NVTE_FWD_LAYERNORM_SM_MARGIN=8 + export NVTE_BWD_LAYERNORM_SM_MARGIN=8 + export HYDRA_FULL_ERROR=1 + + # Gloo connectFullMesh failed + export GLOO_SOCKET_IFNAME=eth0 + export TP_SOCKET_IFNAME=eth0 + + python -u /opt/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + --config-path=/config \ + --config-name={{ .Values.training.configuration }} \ + trainer.num_nodes={{ .Values.num_nodes }} \ + model.global_batch_size={{ mul .Values.num_nodes .Values.training.base_global_batch_size }}" + ports: + - { name: mpijob-port, containerPort: 2222, protocol: TCP } + image: nvcr.io/nvidia/nemo:24.09 + name: mpimaster + env: + - name: OMP_NUM_THREADS + value: "14" + envFrom: + - configMapRef: + name: {{ .Release.Name }}-mpi-setup + resources: + limits: + ephemeral-storage: 32Gi + requests: + cpu: 4 + ephemeral-storage: 32Gi + memory: 1Gi + securityContext: + privileged: true + capabilities: + add: + - IPC_LOCK + volumeMounts: + - { mountPath: /dev/infiniband, name: devinf } + - { mountPath: /dev/shm, name: shm } + - { mountPath: /mnt/data, name: workspace, readOnly: false } + workingDir: /workspace + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true + restartPolicy: OnFailure + terminationGracePeriodSeconds: 2 + volumes: + - { name: devinf, hostPath: { path: /dev/infiniband }} + - { name: shm, emptyDir: { medium: Memory, sizeLimit: 32Gi }} + - { name: workspace, persistentVolumeClaim: { claimName: {{ .Release.Name }}-pv }} + - name: mpiworker + replicas: {{ .Values.num_nodes }} + template: + metadata: + spec: + containers: + - command: + - /bin/bash + - -c + - mkdir -p /var/run/sshd; /usr/sbin/sshd -D -p 2222 || sleep 999999999; + image: nvcr.io/nvidia/nemo:24.09 + name: mpiworker + ports: + - { name: mpijob-port, containerPort: 2222, protocol: TCP } + - { name: c10d, containerPort: 29500, protocol: TCP } + envFrom: + - configMapRef: + name: {{ .Release.Name }}-mpi-setup + resources: + limits: + ephemeral-storage: 32Gi + nvidia.com/gpu: 8 + requests: + cpu: 112 + ephemeral-storage: 32Gi + memory: 768Gi + nvidia.com/gpu: 8 + securityContext: + privileged: true + capabilities: + add: + - IPC_LOCK + - CAP_SYS_ADMIN + volumeMounts: + - { mountPath: /dev/infiniband, name: devinf } + - { mountPath: /dev/shm, name: shm } + - { mountPath: /mnt/data, name: workspace, readOnly: false } + - { mountPath: /config, name: config } + workingDir: /workspace + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true + restartPolicy: OnFailure + terminationGracePeriodSeconds: 15 + tolerations: + - { key: nvidia.com/gpu, operator: 
Exists } + volumes: + - { name: devinf, hostPath: { path: /dev/infiniband }} + - { name: shm, emptyDir: { medium: Memory, sizeLimit: 32Gi }} + - { name: workspace, persistentVolumeClaim: { claimName: {{ .Release.Name }}-pv }} + - { name: config, configMap: { name: {{ .Release.Name }}-config }} + # If you need to exclude certain nodes, use this: + # + # affinity: + # nodeAffinity: + # requiredDuringSchedulingIgnoredDuringExecution: + # nodeSelectorTerms: + # - matchExpressions: + # - key: kubernetes.io/hostname + # operator: NotIn + # values: + # - +{{- end }} diff --git a/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/values.yaml b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/values.yaml new file mode 100644 index 000000000..1c27cd173 --- /dev/null +++ b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/values.yaml @@ -0,0 +1,26 @@ +# Copyright (c) 2025 Oracle and/or its affiliates. + +# Enable this for the initial data download and preprocessing +download_data: false + +# Number of nodes to use for training / preprocessing +num_nodes: 16 + +preprocessing: + # How many cpu cores to use on each node to preprocess data + processes_per_node: 16 + # The last file number to process, indexed to 0: change to, e.g., 3 to + # process files 0, 1, 2, 3 + final_file_number: 0 + +training: + # Which configuration file to use + configuration: config_llama3_8b.yaml + # Global Batch Size for a single node; will be multiplied + base_global_batch_size: 128 + +storage: + # Export path of the file storage server + fss_path: /data + # IP Address or DNS name of the file storage server + fss_name: fs1.fss.zone diff --git a/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/utils/host_list.sh b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/utils/host_list.sh new file mode 100644 index 000000000..c69249da5 --- /dev/null +++ b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/utils/host_list.sh @@ -0,0 +1,13 @@ +#!/bin/bash +# Copyright (c) 2025 Oracle and/or its affiliates. + +# This script creates a mapping of node displayname (normally an OCI-style name +# for OKE nodes) to an alias used by Kubernetes. +# +# Use whenever PyTorch's c10d cannot resolve hostnames with: +# +# ./utils/host_list.sh >> mpi.yaml +# kubectl apply -f mpi.yaml + +echo " HOST_LIST: |" +kubectl get nodes -o custom-columns=HOSTNAME:.status.addresses[0].address,NODE:.metadata.labels.displayName --no-headers|sed 's/^/ /' diff --git a/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/utils/performance.py b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/utils/performance.py new file mode 100644 index 000000000..6c59b7bc1 --- /dev/null +++ b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/utils/performance.py @@ -0,0 +1,53 @@ +# Copyright (c) 2025 Oracle and/or its affiliates. +"""Generate a small performance report from PyTorch logs. 
+""" +import argparse +import re +import textwrap +import yaml +from pathlib import Path + + +SETTINGS_RE = re.compile(r'^\s*(devices|num_nodes|global_batch_size|encoder_seq_length): ([0-9]*)$') +TIMING_RE = re.compile(r'train_step_timing in s: \[((?:[0-9.]+(?:, )?)+)\]') + + +parser = argparse.ArgumentParser("calculate training performance based on log files and configuration") +parser.add_argument("logfile", type=Path, help="the log file of the training run") + +args = parser.parse_args() + +num_gpus = 1 +num_tokens = 1 + +settings = set() + +try: + with args.logfile.open() as fd: + for line in fd: + if m := TIMING_RE.search(line): + timings = [float(n) for n in m.group(1).split(", ")] + elif m := SETTINGS_RE.match(line): + setting = m.group(1) + value = int(m.group(2)) + settings.add(setting) + if setting in ("devices", "num_nodes"): + num_gpus *= value + else: + num_tokens *= value +except Exception as e: + parser.error(f"failed to parse log: {e}") + +timing_avg = sum(timings) / len(timings) +throughput = num_tokens / timing_avg + +print( + textwrap.dedent( + f"""\ + Number of GPUs: {num_gpus} + Training step time (seconds per step): {timing_avg} + Total token throughput per second: {throughput} + Total token throughput per GPU per second: {throughput / num_gpus}\ + """ + ) +)