diff --git a/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/LICENSE b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/LICENSE
new file mode 100644
index 000000000..46c0c79d9
--- /dev/null
+++ b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/LICENSE
@@ -0,0 +1,35 @@
+Copyright (c) 2025 Oracle and/or its affiliates.
+
+The Universal Permissive License (UPL), Version 1.0
+
+Subject to the condition set forth below, permission is hereby granted to any
+person obtaining a copy of this software, associated documentation and/or data
+(collectively the "Software"), free of charge and under any and all copyright
+rights in the Software, and any and all patent rights owned or freely
+licensable by each licensor hereunder covering either (i) the unmodified
+Software as contributed to or provided by such licensor, or (ii) the Larger
+Works (as defined below), to deal in both
+
+(a) the Software, and
+(b) any piece of software and/or hardware listed in the lrgrwrks.txt file if
+one is included with the Software (each a "Larger Work" to which the Software
+is contributed by such licensors),
+
+without restriction, including without limitation the rights to copy, create
+derivative works of, display, perform, and distribute the Software and make,
+use, sell, offer for sale, import, export, have made, and have sold the
+Software and the Larger Work(s), and to sublicense the foregoing rights on
+either these or other terms.
+
+This license is subject to the following condition:
+The above copyright notice and either this complete permission notice or at
+a minimum a reference to the UPL must be included in all copies or
+substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/README.md b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/README.md
new file mode 100644
index 000000000..dcabe366e
--- /dev/null
+++ b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/README.md
@@ -0,0 +1,132 @@
+# Training LLMs with NVIDIA NeMo using Oracle Container Engine for Kubernetes
+
+This repository demonstrates how to train LLMs with
+[NVIDIA NeMo](https://www.nvidia.com/en-gb/ai-data-science/products/nemo/)
+and [NVIDIA Megatron](https://developer.nvidia.com/megatron-core)
+on Oracle Container Engine for Kubernetes (OKE).
+
+Reference results from NVIDIA for training Llama 3 can be found on the
+[NGC Catalog](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/dgxc-benchmarking/resources/llama3-dgxc-benchmarking).
+
+Reviewed: 18.03.2025
+
+# When to use this asset?
+
+* If you want to get started with training LLMs like Llama 3 on Kubernetes using OCI.
+
+# How to use this asset?
+
+## Prerequisites
+
+* You have access to an Oracle Cloud tenancy.
+* You have access to shapes with NVIDIA GPUs such as H100.
+* You have a HuggingFace account and access to `meta-llama/Llama-3.1-8B-Instruct`.
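+
+As an optional sanity check, you can verify up front that your HuggingFace token has access to the gated Llama 3.1 repository. The snippet below is a minimal sketch; it only fetches the small `config.json` file, and the target directory is arbitrary:
+```sh
+huggingface-cli login
+huggingface-cli download meta-llama/Llama-3.1-8B-Instruct config.json --local-dir /tmp/llama-access-check
+```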
+
+This guide is loosely based on the
+[NVIDIA NeMo Framework Launcher guide for Kubernetes](https://docs.nvidia.com/nemo-framework/user-guide/24.07/playbooks/kubernetes.html).
+
+## Infrastructure Setup
+
+1. Create an OKE cluster according
+   [to the instructions](https://github.com/oracle-quickstart/oci-hpc-oke/tree/main#instructions-for-deploying-an-oke-cluster-with-gpus-and-rdma-connectivity),
+   importing one of the images and creating a GPU partition with BM.GPU.H100.8 nodes.
+
+   The configuration here assumes a minimum of 16 BM.GPU.H100.8 nodes.
+
+   - Ensure that the following setting is selected under the "OKE Cluster" section:
+
+     > Disable OKE GPU device plugin
+
+     as this tutorial will install the GPU operator later.
+
+2. Create a new File System for NFS, and modify the [persistent volume configuration in `pv.yaml`](./files/pv.yaml) to match.
+   Ideally, this will utilize High Performance Mount Targets (HPMT) as described in the following two whitepapers:
+   * [Scale Out OCI File Storage Performance for AI/ML and
+     Data-Intensive Workloads](https://docs.oracle.com/en-us/iaas/Content/Resources/Assets/whitepapers/scale-out-oci-file-storage-performance-for-data-intensive-workloads.pdf)
+   * [File Storage Performance Guide](https://docs.oracle.com/en-us/iaas/Content/Resources/Assets/whitepapers/file-storage-performance-guide.pdf)
+
+3. Install the NVIDIA GPU Operator according to the
+   [NVIDIA NeMo Framework Launcher guide for Kubernetes](https://docs.nvidia.com/nemo-framework/user-guide/24.07/playbooks/kubernetes.html), then install the [Volcano scheduler](https://github.com/volcano-sh/volcano) with:
+   ```sh
+   kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/master/installer/volcano-development.yaml
+   ```
+
+4. Copy the [files in this repository](./files) to the Kubernetes operator node.
+   You can download them from this repository via:
+   ```sh
+   BRANCH=main
+   curl -L https://github.com/oracle-devrel/technology-engineering/archive/refs/heads/${BRANCH}.tar.gz|tar xzf - --strip-components=6 technology-engineering-${BRANCH}/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files
+   ```
+
+   Then modify the values in [`training/values.yaml`](./files/training/values.yaml) to match the storage server and export path.
+
+5. Mount the file system on the Kubernetes operator node. In the following, the mount location is assumed to be `/mnt/data/`.
+
+## Data Preparation and Training
+
+1. Download the tokenizer model from HuggingFace:
+   ```sh
+   mkdir -p /mnt/data/tokenizer
+   huggingface-cli login
+   huggingface-cli download meta-llama/Llama-3.1-8B-Instruct tokenizer_config.json --local-dir /mnt/data/tokenizer
+   huggingface-cli download meta-llama/Llama-3.1-8B-Instruct tokenizer.json --local-dir /mnt/data/tokenizer
+   ```
+
+2. Apply the preprocessing job that will download and tokenize parts of the Pile dataset:
+   ```sh
+   helm install --set num_nodes=1 --set download_data=true "my-preprocessing" ./training
+   ```
+
+   The progress can then be monitored with:
+   ```sh
+   kubectl logs -f megatron-prep-my-preprocessing-mpimaster-0
+   ```
+
+3. Following successful preprocessing, the training can be started with:
+   ```sh
+   helm install --set num_nodes=1 "my-training-v0" ./training
+   ```
+
+   The progress can then be monitored with:
+   ```sh
+   kubectl logs -f megatron-train-my-training-v0-mpimaster-0
+   ```
+
+4. Calculate the training throughput. For this, the following data is required from the training output:
+   ```
+   [NeMo I 2025-03-10 16:24:43 perf_metrics_utils:42] train_step_timing in s: [7.13, 7.12, 7.12, 7.13, 7.13, 7.13, 7.12, 7.13, 7.14, 7.13, 7.14, 7.26, 7.13, 7.13, 7.13, 7.13, 7.15, 7.14, 7.14, 7.13, 7.14, 7.14, 7.14, 7.14, 7.13, 7.14, 7.14, 7.14, 7.14, 7.14]
+   ```
+   This log can be saved into a file with:
+   ```sh
+   kubectl logs megatron-train-my-training-v0-mpimaster-0 > training.log
+   ```
+   and the performance analyzed with:
+   ```sh
+   python3 utils/performance.py training.log
+   ```
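+
+As a quick plausibility check, throughput is just tokens per step divided by step time, where tokens per step = `global_batch_size` × `encoder_seq_length`. The snippet below is a minimal sketch of the same arithmetic the script performs, assuming the single-node run above with `config_llama3_8b.yaml` (global batch size 128 after the Helm override, sequence length 8192, 8 GPUs) and the roughly 7.13 s step time from the sample log:
+```sh
+python3 - <<'EOF'
+# Illustrative numbers taken from the sample single-node run above
+global_batch_size = 128   # num_nodes x base_global_batch_size (Helm override)
+seq_length = 8192         # encoder_seq_length in config_llama3_8b.yaml
+step_time = 7.13          # average train_step_timing in seconds
+num_gpus = 8              # num_nodes x devices
+throughput = global_batch_size * seq_length / step_time
+print(f"{throughput:,.0f} tokens/s total, {throughput / num_gpus:,.0f} tokens/s per GPU")
+EOF
+```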
+
+## Potential Issues
+
+* **PyTorch can't resolve hostnames via c10d**
+
+  If the rendezvous backend for PyTorch fails to connect to an OCI-style
+  hostname for Kubernetes clusters, you can work around this resolution failure by
+  augmenting `/etc/hosts` for every pod.
+
+  For convenience, this is facilitated by appending the host mapping to `mpi.yaml` via
+  ```sh
+  ./utils/host_list.sh >> ./training/files/mpi.yaml
+  ```
+  and afterwards reinstalling the training job via Helm.
+
+# Acknowledgments
+
+- **Author** - Matthias Wolf (GPU Solution Specialist)
+
+# License
+
+Copyright (c) 2025 Oracle and/or its affiliates.
+
+Licensed under the Universal Permissive License (UPL), Version 1.0.
+
+See [LICENSE](https://github.com/oracle-devrel/technology-engineering/blob/main/LICENSE) for more details.
diff --git a/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/.helmignore b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/.helmignore
new file mode 100644
index 000000000..0e8a0eb36
--- /dev/null
+++ b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/.helmignore
@@ -0,0 +1,23 @@
+# Patterns to ignore when building packages.
+# This supports shell glob matching, relative path matching, and
+# negation (prefixed with !). Only one pattern per line.
+.DS_Store
+# Common VCS dirs
+.git/
+.gitignore
+.bzr/
+.bzrignore
+.hg/
+.hgignore
+.svn/
+# Common backup files
+*.swp
+*.bak
+*.tmp
+*.orig
+*~
+# Various IDEs
+.project
+.idea/
+*.tmproj
+.vscode/
diff --git a/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/Chart.yaml b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/Chart.yaml
new file mode 100644
index 000000000..01b27cc34
--- /dev/null
+++ b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/Chart.yaml
@@ -0,0 +1,7 @@
+# Copyright (c) 2025 Oracle and/or its affiliates.
+apiVersion: v2
+name: training
+description: A Helm chart to train LLMs on Kubernetes using NVIDIA NeMo and Megatron
+type: application
+version: 0.1.0
+appVersion: "1.16.0"
diff --git a/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/files/config_llama3_70b.yaml b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/files/config_llama3_70b.yaml
new file mode 100644
index 000000000..eef6460de
--- /dev/null
+++ b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/files/config_llama3_70b.yaml
@@ -0,0 +1,229 @@
+# Copyright (c) 2025 Oracle and/or its affiliates.
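+# Parallelism layout for reference: trainer.num_nodes x trainer.devices = 8 x 8 = 64 GPUs;
+# tensor_model_parallel_size 4 x pipeline_model_parallel_size 4 gives the
+# model_parallel_size of 16 set under exp_manager, and with context_parallel_size 2
+# this leaves a data-parallel size of 64 / (4 x 4 x 2) = 2.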
+run: + name: llama3_70b + results_dir: /mnt/data/results/llama3_70b + time_limit: 0-01:00:00 + dependency: singleton +trainer: + num_nodes: 8 + devices: 8 + accelerator: gpu + precision: bf16-mixed + logger: false + enable_checkpointing: false + use_distributed_sampler: false + max_epochs: null + max_steps: 100 + max_time: '19:23:30:00' + log_every_n_steps: 1 + val_check_interval: 100 + limit_val_batches: 32 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + num_sanity_val_steps: 0 +exp_manager: + explicit_log_dir: /mnt/data/results/llama3_70b/logs + exp_dir: null + name: megatron_llama + create_wandb_logger: false + wandb_logger_kwargs: + project: nemo_llama_pretrain + name: llama3_1_70b + resume_if_exists: false + resume_ignore_no_checkpoint: true + create_checkpoint_callback: false + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: false + save_nemo_on_train_end: false + filename: megatron_llama--{val_loss:.2f}-{step}-{consumed_samples} + model_parallel_size: 16 + log_step_timing: true + step_timing_kwargs: + sync_cuda: true + seconds_to_sleep: 60 +model: + mcore_gpt: true + micro_batch_size: 1 + global_batch_size: 128 + rampup_batch_size: null + tensor_model_parallel_size: 4 + pipeline_model_parallel_size: 4 + virtual_pipeline_model_parallel_size: 5 + context_parallel_size: 2 + encoder_seq_length: 8192 + max_position_embeddings: 8192 + num_layers: 80 + hidden_size: 8192 + ffn_hidden_size: 28672 + num_attention_heads: 64 + num_query_groups: 8 + init_method_std: 0.008944 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + apply_rope_fusion: true + attention_type: multihead + share_embeddings_and_output_weights: false + tokenizer: + library: huggingface + type: /mnt/data/tokenizer + use_fast: true + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + cross_entropy_loss_fusion: true + bias_activation_fusion: true + bias_dropout_add_fusion: true + masked_softmax_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: null + activations_checkpoint_method: null + activations_checkpoint_num_layers: null + num_micro_batches_with_partial_activation_checkpoints: null + activations_checkpoint_layers_per_pipeline: null + sequence_parallel: true + defer_embedding_wgrad_compute: true + wgrad_deferral_limit: 22 + deterministic_mode: false + transformer_engine: true + fp8: false + fp8_e4m3: false + fp8_hybrid: false + fp8_margin: 0 + fp8_interval: 1 + fp8_amax_history_len: 1024 + fp8_amax_compute_algo: max + ub_tp_comm_overlap: true + use_flash_attention: true + overlap_p2p_comm: true + batch_p2p_comm: false + gc_interval: 25 + 
nsys_profile: + enabled: false + trace: + - nvtx + - cuda + start_step: 10 + end_step: 10 + ranks: + - 0 + gen_shape: false + optim: + name: distributed_fused_adam + lr: 0.0003 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + bucket_cap_mb: 75 + overlap_grad_sync: true + overlap_param_sync: true + contiguous_grad_buffer: true + contiguous_param_buffer: true + sched: + name: CosineAnnealing + warmup_steps: 2000 + constant_steps: 0 + min_lr: 2.9999999999999997e-05 + grad_sync_dtype: bf16 + data: + data_impl: mmap + splits_string: 99990,8,2 + seq_length: 8192 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: true + reset_attention_mask: true + eod_mask_loss: false + index_mapping_dir: null + data_prefix: + - 1 + - /mnt/data/pile/my-gpt3_00_text_document + ub_tp_comm_overlap_cfg: + fc1_dgrad: + cga_size: 2 + method: bulk + num_sm: 2 + set_sm_margin: 0 + fc1_fprop: + aggregate: 0 + method: ring_exchange + num_sm: 1 + set_sm_margin: 0 + fc1_wgrad: + cga_size: 2 + method: bulk + num_sm: 4 + set_sm_margin: 0 + fc2_dgrad: + aggregate: 0 + method: ring_exchange + num_sm: 1 + set_sm_margin: 0 + fc2_fprop: + cga_size: 2 + method: pipeline + num_sm: 16 + num_splits: 4 + set_sm_margin: 1 + proj_dgrad: + aggregate: 0 + method: ring_exchange + num_sm: 1 + set_sm_margin: 0 + proj_fprop: + cga_size: 2 + method: pipeline + num_sm: 24 + num_splits: 4 + set_sm_margin: 1 + qkv_dgrad: + cga_size: 2 + method: bulk + num_sm: 4 + set_sm_margin: 0 + qkv_fprop: + aggregate: 0 + method: ring_exchange + num_sm: 1 + set_sm_margin: 0 + qkv_wgrad: + cga_size: 2 + method: bulk + num_sm: 24 + set_sm_margin: 0 diff --git a/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/files/config_llama3_8b.yaml b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/files/config_llama3_8b.yaml new file mode 100644 index 000000000..7f8068831 --- /dev/null +++ b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/files/config_llama3_8b.yaml @@ -0,0 +1,163 @@ +# Copyright (c) 2025 Oracle and/or its affiliates. 
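+# Note for reference: trainer.num_nodes and model.global_batch_size below are only
+# defaults; the Helm chart overrides both on the launcher command line
+# (trainer.num_nodes=<num_nodes>, model.global_batch_size=<num_nodes> x base_global_batch_size
+# from values.yaml), so a single-node run trains with a global batch size of 128.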
+hydra: + searchpath: + - file:///opt/NeMo/examples/nlp/language_modeling/conf +run: + name: llama3_8b + results_dir: /mnt/data/results/llama3_8b + time_limit: 0-01:00:00 + dependency: singleton +trainer: + num_nodes: 4 + devices: 8 + accelerator: gpu + precision: bf16 + logger: false + enable_checkpointing: false + use_distributed_sampler: false + max_epochs: null + max_steps: 100 + max_time: '19:23:30:00' + log_every_n_steps: 1 + val_check_interval: 100 + limit_val_batches: 32 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 +exp_manager: + explicit_log_dir: /mnt/data/results/llama3_8b/logs + exp_dir: null + name: megatron_llama + create_wandb_logger: false + wandb_logger_kwargs: + project: nemo_llama_pretrain + name: llama3_8b_bf16 + resume_if_exists: false + resume_ignore_no_checkpoint: true + create_checkpoint_callback: false + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: false + save_nemo_on_train_end: true + filename: megatron_llama--{val_loss:.2f}-{step}-{consumed_samples} + model_parallel_size: 1 + log_step_timing: true + step_timing_kwargs: + sync_cuda: true + buffer_size: 5 + seconds_to_sleep: 60 +model: + mcore_gpt: true + micro_batch_size: 1 + global_batch_size: 512 + rampup_batch_size: null + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + context_parallel_size: 2 + encoder_seq_length: 8192 + max_position_embeddings: 8192 + num_layers: 32 + hidden_size: 4096 + ffn_hidden_size: 14336 + num_attention_heads: 32 + num_query_groups: 8 + init_method_std: 0.01 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + apply_rope_fusion: true + attention_type: multihead + share_embeddings_and_output_weights: false + tokenizer: + library: huggingface + type: /mnt/data/tokenizer + use_fast: true + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: true + bias_dropout_add_fusion: true + masked_softmax_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: null + activations_checkpoint_method: null + activations_checkpoint_num_layers: null + num_micro_batches_with_partial_activation_checkpoints: null + activations_checkpoint_layers_per_pipeline: null + sequence_parallel: false + transformer_engine: true + fp8: false + fp8_e4m3: false + fp8_hybrid: false + fp8_margin: 0 + fp8_interval: 1 + fp8_amax_history_len: 1024 + fp8_amax_compute_algo: max + ub_tp_comm_overlap: false + use_flash_attention: true + gc_interval: 25 + optim: + name: distributed_fused_adam + lr: 0.0001 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + bucket_cap_mb: 125 + 
overlap_grad_sync: true + overlap_param_sync: true + contiguous_grad_buffer: true + contiguous_param_buffer: true + sched: + name: CosineAnnealing + warmup_steps: 500 + constant_steps: 0 + min_lr: 1.0e-05 + grad_sync_dtype: bf16 + data: + data_impl: mmap + splits_string: 99990,8,2 + seq_length: 8192 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: null + data_prefix: + - 1 + - /mnt/data/pile/my-gpt3_00_text_document diff --git a/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/files/config_llama3_8b_v2.yaml b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/files/config_llama3_8b_v2.yaml new file mode 100644 index 000000000..71dd2992f --- /dev/null +++ b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/files/config_llama3_8b_v2.yaml @@ -0,0 +1,173 @@ +# Copyright (c) 2025 Oracle and/or its affiliates. +run: + name: llama3_8b + results_dir: /mnt/data/results/llama3_8b + time_limit: 0-01:00:00 + dependency: singleton +trainer: + num_nodes: 1 + devices: 8 + accelerator: gpu + precision: bf16 + logger: false + enable_checkpointing: false + use_distributed_sampler: false + max_epochs: null + max_steps: 100 + max_time: '19:23:30:00' + log_every_n_steps: 1 + val_check_interval: 100 + limit_val_batches: 1 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + num_sanity_val_steps: 0 +exp_manager: + explicit_log_dir: /mnt/data/results/llama3_8b/logs + exp_dir: null + name: megatron_llama + create_wandb_logger: false + wandb_logger_kwargs: + project: nemo_llama_pretrain + name: llama3_1_8b + resume_if_exists: false + resume_ignore_no_checkpoint: true + create_checkpoint_callback: false + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: false + save_nemo_on_train_end: false + filename: megatron_llama--{val_loss:.2f}-{step}-{consumed_samples} + model_parallel_size: 1 + log_step_timing: true + step_timing_kwargs: + sync_cuda: true + buffer_size: 5 + seconds_to_sleep: 60 +model: + mcore_gpt: true + micro_batch_size: 1 + global_batch_size: 512 + rampup_batch_size: null + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + context_parallel_size: 2 + encoder_seq_length: 8192 + max_position_embeddings: 8192 + num_layers: 32 + hidden_size: 4096 + ffn_hidden_size: 14336 + num_attention_heads: 32 + num_query_groups: 8 + init_method_std: 0.01 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + apply_rope_fusion: true + cross_entropy_loss_fusion: true + attention_type: multihead + share_embeddings_and_output_weights: false + tokenizer: + library: huggingface + type: /mnt/data/tokenizer + use_fast: true + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + 
fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: true + bias_dropout_add_fusion: true + masked_softmax_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: null + activations_checkpoint_method: null + activations_checkpoint_num_layers: null + num_micro_batches_with_partial_activation_checkpoints: null + activations_checkpoint_layers_per_pipeline: null + sequence_parallel: false + deterministic_mode: false + transformer_engine: true + fp8: false + fp8_e4m3: false + fp8_hybrid: false + fp8_margin: 0 + fp8_interval: 1 + fp8_amax_history_len: 1024 + fp8_amax_compute_algo: max + ub_tp_comm_overlap: false + use_flash_attention: true + gc_interval: 25 + nsys_profile: + enabled: false + trace: + - nvtx + - cuda + start_step: 10 + end_step: 10 + ranks: + - 0 + gen_shape: false + optim: + name: distributed_fused_adam + lr: 0.0003 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + bucket_cap_mb: 75 + overlap_grad_sync: true + overlap_param_sync: true + contiguous_grad_buffer: true + contiguous_param_buffer: true + sched: + name: CosineAnnealing + warmup_steps: 2000 + constant_steps: 0 + min_lr: 2.9999999999999997e-05 + grad_sync_dtype: bf16 + data: + data_impl: mmap + splits_string: 99990,8,2 + seq_length: 8192 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: true + reset_attention_mask: true + eod_mask_loss: false + index_mapping_dir: null + data_prefix: + - 1 + - /mnt/data/pile/my-gpt3_00_text_document diff --git a/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/files/sort_hosts.sh b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/files/sort_hosts.sh new file mode 100755 index 000000000..46fb15f74 --- /dev/null +++ b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/files/sort_hosts.sh @@ -0,0 +1,38 @@ +#!/bin/bash +# Copyright (c) 2025 Oracle and/or its affiliates. + +# Use this script inside a Volcano Kubernetes job. +# +# Assumes that the workers are named "mpiworker", and host management is +# enabled. Currently only reliable for BM H100 shapes, as it uses the block +# rather than rack ID to sort hosts. +# +# This may add 5-10% performance gain by speeding up MPI operations when passed +# to `mpirun` via +# +# ./utils/sort_hosts.sh myhosts.sorted +# mpirun -hostfile myhosts.sorted ... 
+ +for host in ${VC_MPIWORKER_HOSTS//,/ }; do + echo -n "$host " >> myhosts + mpirun --allow-run-as-root \ + -mca plm_rsh_args "-p 2222" \ + --host $host -n 1 bash -c "curl -s http://169.254.169.254/opc/v1/host/| jq .rdmaTopologyData.customerLocalBlock" >> myhosts || echo "none" >> myhosts +done +python << EOF > ${1:-myhosts.sorted} +import sys + +DATA="""\ +$(> /etc/hosts + + export HYDRA_FULL_ERROR=1 + export CUDA_DEVICE_MAX_CONNECTIONS=1 + export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python + + # From NVIDIA SLURM reference setup; + export SLURM_UNBUFFEREDIO=1 + export TORCHX_MAX_RETRIES=0 + export CUDA_DEVICE_MAX_CONNECTIONS=1 + export TOKENIZERS_PARALLELISM=False + export TRANSFORMERS_OFFLINE=1 + export TORCH_NCCL_AVOID_RECORD_STREAMS=1 + export NCCL_NVLS_ENABLE=0 + export NVTE_DP_AMAX_REDUCE_INTERVAL=0 + export NVTE_ASYNC_AMAX_REDUCTION=1 + export NVTE_APPLY_QK_LAYER_SCALING=0 + export NVTE_FLASH_ATTN=0 + export NVTE_FUSED_ATTN=1 + export NEMO_LOG_MEMORY_USAGE=1 + export NVTE_FWD_LAYERNORM_SM_MARGIN=8 + export NVTE_BWD_LAYERNORM_SM_MARGIN=8 + export HYDRA_FULL_ERROR=1 + + # Gloo connectFullMesh failed + export GLOO_SOCKET_IFNAME=eth0 + export TP_SOCKET_IFNAME=eth0 + + python -u /opt/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + --config-path=/config \ + --config-name={{ .Values.training.configuration }} \ + trainer.num_nodes={{ .Values.num_nodes }} \ + model.global_batch_size={{ mul .Values.num_nodes .Values.training.base_global_batch_size }}" + ports: + - { name: mpijob-port, containerPort: 2222, protocol: TCP } + image: nvcr.io/nvidia/nemo:24.09 + name: mpimaster + env: + - name: OMP_NUM_THREADS + value: "14" + envFrom: + - configMapRef: + name: {{ .Release.Name }}-mpi-setup + resources: + limits: + ephemeral-storage: 32Gi + requests: + cpu: 4 + ephemeral-storage: 32Gi + memory: 1Gi + securityContext: + privileged: true + capabilities: + add: + - IPC_LOCK + volumeMounts: + - { mountPath: /dev/infiniband, name: devinf } + - { mountPath: /dev/shm, name: shm } + - { mountPath: /mnt/data, name: workspace, readOnly: false } + workingDir: /workspace + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true + restartPolicy: OnFailure + terminationGracePeriodSeconds: 2 + volumes: + - { name: devinf, hostPath: { path: /dev/infiniband }} + - { name: shm, emptyDir: { medium: Memory, sizeLimit: 32Gi }} + - { name: workspace, persistentVolumeClaim: { claimName: {{ .Release.Name }}-pv }} + - name: mpiworker + replicas: {{ .Values.num_nodes }} + template: + metadata: + spec: + containers: + - command: + - /bin/bash + - -c + - mkdir -p /var/run/sshd; /usr/sbin/sshd -D -p 2222 || sleep 999999999; + image: nvcr.io/nvidia/nemo:24.09 + name: mpiworker + ports: + - { name: mpijob-port, containerPort: 2222, protocol: TCP } + - { name: c10d, containerPort: 29500, protocol: TCP } + envFrom: + - configMapRef: + name: {{ .Release.Name }}-mpi-setup + resources: + limits: + ephemeral-storage: 32Gi + nvidia.com/gpu: 8 + requests: + cpu: 112 + ephemeral-storage: 32Gi + memory: 768Gi + nvidia.com/gpu: 8 + securityContext: + privileged: true + capabilities: + add: + - IPC_LOCK + - CAP_SYS_ADMIN + volumeMounts: + - { mountPath: /dev/infiniband, name: devinf } + - { mountPath: /dev/shm, name: shm } + - { mountPath: /mnt/data, name: workspace, readOnly: false } + - { mountPath: /config, name: config } + workingDir: /workspace + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true + restartPolicy: OnFailure + terminationGracePeriodSeconds: 15 + tolerations: + - { key: nvidia.com/gpu, operator: 
Exists } + volumes: + - { name: devinf, hostPath: { path: /dev/infiniband }} + - { name: shm, emptyDir: { medium: Memory, sizeLimit: 32Gi }} + - { name: workspace, persistentVolumeClaim: { claimName: {{ .Release.Name }}-pv }} + - { name: config, configMap: { name: {{ .Release.Name }}-config }} + # If you need to exclude certain nodes, use this: + # + # affinity: + # nodeAffinity: + # requiredDuringSchedulingIgnoredDuringExecution: + # nodeSelectorTerms: + # - matchExpressions: + # - key: kubernetes.io/hostname + # operator: NotIn + # values: + # - +{{- end }} diff --git a/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/values.yaml b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/values.yaml new file mode 100644 index 000000000..1c27cd173 --- /dev/null +++ b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/training/values.yaml @@ -0,0 +1,26 @@ +# Copyright (c) 2025 Oracle and/or its affiliates. + +# Enable this for the initial data download and preprocessing +download_data: false + +# Number of nodes to use for training / preprocessing +num_nodes: 16 + +preprocessing: + # How many cpu cores to use on each node to preprocess data + processes_per_node: 16 + # The last file number to process, indexed to 0: change to, e.g., 3 to + # process files 0, 1, 2, 3 + final_file_number: 0 + +training: + # Which configuration file to use + configuration: config_llama3_8b.yaml + # Global Batch Size for a single node; will be multiplied + base_global_batch_size: 128 + +storage: + # Export path of the file storage server + fss_path: /data + # IP Address or DNS name of the file storage server + fss_name: fs1.fss.zone diff --git a/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/utils/host_list.sh b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/utils/host_list.sh new file mode 100644 index 000000000..c69249da5 --- /dev/null +++ b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/utils/host_list.sh @@ -0,0 +1,13 @@ +#!/bin/bash +# Copyright (c) 2025 Oracle and/or its affiliates. + +# This script creates a mapping of node displayname (normally an OCI-style name +# for OKE nodes) to an alias used by Kubernetes. +# +# Use whenever PyTorch's c10d cannot resolve hostnames with: +# +# ./utils/host_list.sh >> mpi.yaml +# kubectl apply -f mpi.yaml + +echo " HOST_LIST: |" +kubectl get nodes -o custom-columns=HOSTNAME:.status.addresses[0].address,NODE:.metadata.labels.displayName --no-headers|sed 's/^/ /' diff --git a/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/utils/performance.py b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/utils/performance.py new file mode 100644 index 000000000..6c59b7bc1 --- /dev/null +++ b/cloud-infrastructure/ai-infra-gpu/ai-infrastructure/nemo-megatron-training-oke/files/utils/performance.py @@ -0,0 +1,53 @@ +# Copyright (c) 2025 Oracle and/or its affiliates. +"""Generate a small performance report from PyTorch logs. 
+""" +import argparse +import re +import textwrap +import yaml +from pathlib import Path + + +SETTINGS_RE = re.compile(r'^\s*(devices|num_nodes|global_batch_size|encoder_seq_length): ([0-9]*)$') +TIMING_RE = re.compile(r'train_step_timing in s: \[((?:[0-9.]+(?:, )?)+)\]') + + +parser = argparse.ArgumentParser("calculate training performance based on log files and configuration") +parser.add_argument("logfile", type=Path, help="the log file of the training run") + +args = parser.parse_args() + +num_gpus = 1 +num_tokens = 1 + +settings = set() + +try: + with args.logfile.open() as fd: + for line in fd: + if m := TIMING_RE.search(line): + timings = [float(n) for n in m.group(1).split(", ")] + elif m := SETTINGS_RE.match(line): + setting = m.group(1) + value = int(m.group(2)) + settings.add(setting) + if setting in ("devices", "num_nodes"): + num_gpus *= value + else: + num_tokens *= value +except Exception as e: + parser.error(f"failed to parse log: {e}") + +timing_avg = sum(timings) / len(timings) +throughput = num_tokens / timing_avg + +print( + textwrap.dedent( + f"""\ + Number of GPUs: {num_gpus} + Training step time (seconds per step): {timing_avg} + Total token throughput per second: {throughput} + Total token throughput per GPU per second: {throughput / num_gpus}\ + """ + ) +)