NVIDIA-NeMo
diff --git a/‎.github/actions/test-template/action.yml‎
Lines changed: 5 additions & 2 deletions b/‎.github/actions/test-template/action.yml‎
Lines changed: 5 additions & 2 deletions
diff --git a/‎.github/workflows/cicd-main.yml‎
Lines changed: 3 additions & 1 deletion b/‎.github/workflows/cicd-main.yml‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎.github/workflows/install-test.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/install-test.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/release-docs.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/release-docs.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎3rdparty/Megatron-LM‎ b/‎3rdparty/Megatron-LM‎
diff --git a/‎docker/Dockerfile.ci‎
Lines changed: 4 additions & 1 deletion b/‎docker/Dockerfile.ci‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎docs/megatron-lm-to-megatron-bridge.md‎
Lines changed: 1 addition & 1 deletion b/‎docs/megatron-lm-to-megatron-bridge.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/training/packed-sequences.md‎
Lines changed: 1 addition & 0 deletions b/‎docs/training/packed-sequences.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/training/resiliency.md‎
Lines changed: 3 additions & 3 deletions b/‎docs/training/resiliency.md‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎examples/conversion/hf_to_megatron_generate_nemotron_vlm.py‎
Lines changed: 2 additions & 1 deletion b/‎examples/conversion/hf_to_megatron_generate_nemotron_vlm.py‎
Lines changed: 2 additions & 1 deletion
@@ -51,8 +51,10 @@ inputs:
     required: true
   test-data-path:
     description: "Test data path"
-    required: false
-    default: "/mnt/datadrive/TestData/nemo-fw/TestData"
+    required: true
+  runner:
+    description: "Runner to use for test"
+    required: true
 
 runs:
   using: "composite"
@@ -103,6 +105,7 @@ runs:
 
     - name: Install uuidgen
       shell: bash -x -e -u -o pipefail {0}
+      if: ${{ contains(inputs.runner, 'aws') }}
       run: |
         apt-get update
         apt-get install -y uuid-runtime
 
@@ -65,7 +65,7 @@ env:
 
 jobs:
   pre-flight:
-    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.69.0
+    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.69.1
     with:
       default_runner_prefix: ${{ vars.DEFAULT_RUNNER_PREFIX }}
       non_nvidia_runner_prefix: ${{ vars.NON_NVIDIA_RUNNER_PREFIX }}
@@ -369,6 +369,7 @@ jobs:
           PAT: ${{ secrets.PAT }}
           container-image: ${{ env.container-registry }}/megatron-bridge:${{ github.sha }}
           test-data-path: ${{ needs.pre-flight.outputs.test_data_path }}
+          runner: ${{ needs.pre-flight.outputs.runner_prefix }}-gpu-x2
 
   cicd-functional-tests:
     strategy:
@@ -464,6 +465,7 @@ jobs:
           PAT: ${{ secrets.PAT }}
           container-image: ${{ env.container-registry }}/megatron-bridge:${{ github.sha }}
           test-data-path: ${{ needs.pre-flight.outputs.test_data_path }}
+          runner: ${{ needs.pre-flight.outputs.runner_prefix }}-gpu-x2
 
   Nemo_CICD_Test:
     needs:
 
@@ -57,7 +57,7 @@ concurrency:
 
 jobs:
   pre-flight:
-    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.69.0
+    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.69.1
     with:
       default_runner_prefix: ${{ vars.INSTALL_TEST_DEFAULT_RUNNER_PREFIX }}
       non_nvidia_runner_prefix: ${{ vars.INSTALL_TEST_NON_NVIDIA_RUNNER_PREFIX }}
 
@@ -58,7 +58,7 @@ jobs:
           dry-run: ${{ inputs.dry-run }}
           artifacts-name: docs-html
           artifacts-path: _build/html
-          emails-csv: ${{ inputs.notify-emails }}
+          emails-csv: ${{ inputs.notify-emails && format('{0},{1}', vars.docs_release_emails, inputs.notify-emails) || vars.docs_release_emails }}
           overwrite-latest-on-tag: false
           run-on-version-tag-only: ${{ github.ref_name != 'main' }}
           request-name: megatron-bridge-publish-docs-${{ github.run_id }}
 
@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-FROM nvcr.io/nvidia/pytorch:25.11-py3
+ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:25.11-py3
+FROM ${BASE_IMAGE} AS megatron_bridge
 WORKDIR /opt/Megatron-Bridge
 ENV PATH="/root/.local/bin:$PATH"
 ENV UV_PROJECT_ENVIRONMENT=/opt/venv
@@ -45,3 +46,5 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
         uv sync --link-mode copy --locked --all-extras --all-groups; \
     fi && \
     uv cache prune
+
+COPY . /opt/Megatron-Bridge
@@ -281,7 +281,7 @@ Additional distributed/optimizer overlap settings:
 | --- | --- | --- |
 | `--error-injection-rate` | `rerun_state_machine.error_injection_rate` | Frequency of injected validation perturbations. |
 | `--error-injection-type` | `rerun_state_machine.error_injection_type` | Kind of injection (correct/transient/persistent). |
-| `--rerun-mode` | `rerun_state_machine.rerun_mode` | Disabled/validate_results/report_stats. |
+| `--rerun-mode` | `rerun_state_machine.rerun_mode` | One of `disabled`, `validate_results`, or `report_determinism_stats`. |
 
 ### Data / Tokenizer args
 
 
@@ -60,6 +60,7 @@ The {py:class}`bridge.data.datasets.packed_sequence.PackedSequenceSpecs` class p
 | `packed_train_data_path` | `str` | `None` | Custom path for packed training dataset file (`.npy` format). |
 | `packed_val_data_path` | `str` | `None` | Custom path for packed validation dataset file (`.npy` format). |
 | `packed_metadata_path` | `str` | `None` | Custom path for packing metadata file (`.jsonl` format). |
+| `pad_seq_to_mult` | `int \| None` | `None` | Pad each sample to a multiple of this value when generating packed datasets (e.g., set to `2 * context_parallel_size` for THD CP). |
 | `pad_cu_seqlens` | `bool` | `False` | Whether to pad `cu_seqlens` to constant size, required for CUDA graphs. |
 
 ### Batch Size Considerations
 
@@ -438,7 +438,7 @@ from megatron.bridge.training.config import RerunStateMachineConfig
 
 # Configure re-run state machine in your config
 config.rerun_state_machine = RerunStateMachineConfig(
-    rerun_mode="validate_results",  # or "report_stats" or "disabled"
+    rerun_mode="validate_results",  # or "report_determinism_stats" or "disabled"
     check_for_nan_in_loss=True,
     check_for_spiky_loss=False,
     error_injection_rate=0,  # For testing only
@@ -450,7 +450,7 @@ config.rerun_state_machine = RerunStateMachineConfig(
 
 | Parameter | Type | Default | Description |
 |-----------|------|---------|-------------|
-| `rerun_mode` | `str` | `"disabled"` | Operating mode: `"disabled"`, `"validate_results"`, or `"report_stats"` |
+| `rerun_mode` | `str` | `"disabled"` | Operating mode: `"disabled"`, `"validate_results"`, or `"report_determinism_stats"` |
 | `check_for_nan_in_loss` | `bool` | `True` | Check for NaN values in loss |
 | `check_for_spiky_loss` | `bool` | `False` | Check for unexpectedly large loss values |
 | `error_injection_rate` | `int` | `0` | Rate for injecting test errors (testing only) |
@@ -463,7 +463,7 @@ config.rerun_state_machine = RerunStateMachineConfig(
 - **Behavior**: Training proceeds normally without any result checking.
 - **Use Case**: When re-run overhead is not acceptable or validation is not needed.
 
-#### 2. Report Stats Mode (`report_stats`)  
+#### 2. Report Determinism Stats Mode (`report_determinism_stats`)  
 - **Purpose**: Collect statistics on computational determinism.
 - **Behavior**: Re-runs every step once to measure variability.
 - **Output**: Reports on computational non-determinism without stopping training.
 
@@ -150,7 +150,8 @@ def process_image_inputs(processor, image_path: Optional[str], prompt: str, syst
             image_paths = image_path.split(",")
             content = []
             for i, path in enumerate(image_paths):
-                content.append({"type": "text", "text": f"{'\n' if i > 0 else ''}Image-{i + 1}: "})
+                prefix = "\n" if i > 0 else ""
+                content.append({"type": "text", "text": f"{prefix}Image-{i + 1}: "})
                 content.append({"type": "image", "image": path})
             content.append({"type": "text", "text": "\n" + prompt})
         else: