Skip to content

Commit 7d33380

Browse files
committed
Merge branch 'romeyn/parquet-sequence-pack' of github.com:NVIDIA-NeMo/Megatron-Bridge into romeyn/parquet-sequence-pack
2 parents e080c5b + e6e40ec commit 7d33380

File tree

449 files changed

+44256
-16889
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

449 files changed

+44256
-16889
lines changed

.coderabbit.yaml

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
# yaml-language-server: $schema=https://coderabbit.ai/integrations/schema.v2.json
17+
# https://docs.coderabbit.ai/getting-started/configure-coderabbit/
18+
# Validator https://docs.coderabbit.ai/configuration/yaml-validator#yaml-validator
19+
# In a PR, comment "@coderabbitai configuration" to get the full config, including defaults.
20+
# Set the language for reviews by using the corresponding ISO language code.
21+
# Default: "en-US"
22+
language: "en-US"
23+
# Settings related to reviews.
24+
# Default: {}
25+
reviews:
26+
# Set the profile for reviews. The assertive profile yields more feedback, some of which may be considered nitpicky.
27+
# Options: chill, assertive
28+
# Default: "chill"
29+
profile: chill
30+
# Add this keyword in the PR/MR title to auto-generate the title.
31+
# Default: "@coderabbitai"
32+
auto_title_placeholder: '@coderabbitai title'
33+
# Auto Title Instructions - Custom instructions for auto-generating the PR/MR title.
34+
# Default: ""
35+
auto_title_instructions: 'Format: "[{modules}] {type}: {description}". Modules: model, recipe, training, data, ckpt, peft, perf, ci, doc, test, build, misc. Use comma to separate multiple modules. Type must be one of: feat, fix, refactor, chore, test. Title should be concise (<= 80 chars). Example: "[model] feat: Add Qwen3 model bridge" or "[recipe, doc] feat: Add Llama 3.1 70B recipe".'
36+
# Set the commit status to 'pending' when the review is in progress and 'success' when it is complete.
37+
# Default: true
38+
commit_status: false
39+
# Generate walkthrough in a markdown collapsible section.
40+
# Default: false
41+
collapse_walkthrough: true
42+
# Generate an assessment of how well the changes address the linked issues in the walkthrough.
43+
# Default: true
44+
assess_linked_issues: true
45+
# Include possibly related issues in the walkthrough.
46+
# Default: true
47+
related_issues: true
48+
# Related PRs - Include possibly related pull requests in the walkthrough.
49+
# Default: true
50+
related_prs: true
51+
# Suggest labels based on the changes in the pull request in the walkthrough.
52+
# Default: true
53+
suggested_labels: true
54+
# Suggest reviewers based on the changes in the pull request in the walkthrough.
55+
# Default: true
56+
suggested_reviewers: true
57+
# Generate a poem in the walkthrough comment.
58+
# Default: true
59+
poem: false
60+
# Post review details on each review. Additionally, post a review status when a review is skipped in certain cases.
61+
# Default: true
62+
review_status: false
63+
# Configuration for pre-merge checks
64+
# Default: {}
65+
pre_merge_checks:
66+
# Custom Pre-merge Checks - Add unique checks to enforce your team's standards before merging a pull request. Each check must have a unique name (up to 50 characters) and clear instructions (up to 10000 characters). Use these to automatically verify coding, security, documentation, or business rules and maintain code quality.
67+
# Default: []
68+
custom_checks:
69+
- name: "Test Results for Major Changes"
70+
mode: "warning" # or "error" to block merges
71+
instructions: |
72+
If this PR contains major changes (such as new features, breaking changes, or significant refactoring), verify that the PR description includes test results or testing information.
73+
If a change could affect numerics or convergence, the PR description should include information demonstrating that there is no regression.
74+
If a change could affect performance, the PR description should include before-and-after performance numbers, as well as the configuration and context in which they apply.
75+
Pass if test results are documented or if the changes are minor.
76+
auto_review:
77+
# Configuration for auto review
78+
# Default: {}
79+
# Automatic Incremental Review - Automatic incremental code review on each push
80+
# Default: true
81+
auto_incremental_review: false
82+
# Review draft PRs/MRs.
83+
# Default: false
84+
drafts: false
85+
# Base branches (other than the default branch) to review. Accepts regex patterns. Use '.*' to match all branches.
86+
# Default: []
87+
base_branches: ["main", "r[0-9].*"]
88+
# Configuration for knowledge base
89+
# Default: {}
90+
knowledge_base:
91+
code_guidelines:
92+
# CodeRabbit will analyse and learn from your organization's code guidelines, which you can mention in the file patterns section. These guidelines will then be used to conduct thorough code reviews.
93+
# Default: {}
94+
enabled: true
95+
# Enabled (describes the `enabled` key above) - Enable CodeRabbit to enforce your organization's coding standards during reviews.
96+
# Default: true
97+
filePatterns:
98+
# File Patterns - Specify files for your coding guideline documents in this section. CodeRabbit will scan these files to understand your team's standards and apply them during code reviews. Multiple files are supported. File names are case-sensitive. Common files such as **/.cursorrules, .github/copilot-instructions.md, .github/instructions/*.instructions.md, **/CLAUDE.md, **/GEMINI.md, **/.cursor/rules/*, **/.windsurfrules, **/.clinerules/*, **/.rules/*, **/AGENT.md, and **/AGENTS.md are included by default.
99+
# Default: []
100+
- "**/CODING_GUIDELINES.md"
101+
- "**/.cursor/rules/*"
102+
- "**/CONTRIBUTING.md"

.dockerignore

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,5 @@ nosetests.xml
1414
coverage.xml
1515
*,cover
1616
*.log
17-
.git
1817
**/*.nemo
1918
**/*.ckpt

.github/actions/test-template/action.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ runs:
132132
--runtime=nvidia --gpus all \
133133
--shm-size=64g \
134134
--cpus=40 \
135+
--env GHA_RUNNER=${{ inputs.runner }} \
135136
--env HYDRA_FULL_ERROR=1 \
136137
--env HF_HOME=/home/TestData/HF_HOME \
137138
--env NEMO_HOME=/home/TestData/nemo_home \

.github/workflows/build-test-publish-wheel.yml

Lines changed: 56 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14-
1514
name: Build, test, and publish a PyPi wheel (to testpypi).
1615

1716
on:
@@ -35,55 +34,62 @@ concurrency:
3534

3635
jobs:
3736
pre-flight:
38-
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.64.2
37+
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.69.1
38+
with:
39+
default_runner_prefix: ${{ vars.DEFAULT_RUNNER_PREFIX }}
40+
non_nvidia_runner_prefix: ${{ vars.NON_NVIDIA_RUNNER_PREFIX }}
41+
default_test_data_path: ${{ vars.DEFAULT_TEST_DATA_PATH }}
42+
non_nvidia_test_data_path: ${{ vars.NON_NVIDIA_TEST_DATA_PATH }}
43+
secrets:
44+
NVIDIA_MANAGEMENT_ORG_PAT: ${{ secrets.NVIDIA_MANAGEMENT_ORG_PAT }}
3945

40-
# build-test-publish-wheel:
41-
# needs: [pre-flight]
42-
# if: |
43-
# !(needs.pre-flight.outputs.docs_only == 'true'
44-
# || needs.pre-flight.outputs.is_deployment_workflow == 'true')
45-
# uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_test_publish_wheel.yml@v0.65.1
46-
# with:
47-
# dry-run: true
48-
# python-package: megatron.bridge
49-
# python-version: "3.10"
50-
# packaging: uv
51-
# no-publish: ${{ !(github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) }}
52-
# has-src-dir: true
53-
# skip-test-wheel: true
54-
# custom-container: nvcr.io/nvidia/pytorch:25.05-py3
55-
# runner: self-hosted-nemo
56-
# no-build-isolation: true
57-
# submodules: recursive
58-
# container-options: "--gpus all --runtime=nvidia"
59-
# secrets:
60-
# TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
61-
# TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
62-
# SLACK_WEBHOOK: ${{ secrets.SLACK_RELEASE_ENDPOINT }}
63-
# SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }}
64-
# GH_TOKEN: ${{ secrets.PAT }}
46+
build-test-publish-wheel:
47+
needs: [pre-flight]
48+
if: |
49+
!(needs.pre-flight.outputs.docs_only == 'true'
50+
|| needs.pre-flight.outputs.is_deployment_workflow == 'true')
51+
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_test_publish_wheel.yml@v0.70.1
52+
with:
53+
dry-run: true
54+
python-package: megatron.bridge
55+
python-version: "3.10"
56+
packaging: uv
57+
no-publish: ${{ !(github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) }}
58+
has-src-dir: true
59+
skip-test-wheel: true
60+
custom-container: nvcr.io/nvidia/pytorch:25.11-py3
61+
runner: ${{ needs.pre-flight.outputs.runner_prefix }}-gpu-x2-container
62+
no-build-isolation: true
63+
submodules: recursive
64+
container-options: "--gpus all --runtime=nvidia"
65+
secrets:
66+
TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
67+
TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
68+
SLACK_WEBHOOK: ${{ secrets.SLACK_RELEASE_ENDPOINT }}
69+
SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }}
70+
GH_TOKEN: ${{ secrets.PAT }}
6571

66-
# build-test-publish-wheel-summary:
67-
# needs: [pre-flight, build-test-publish-wheel]
68-
# if: |
69-
# (
70-
# needs.pre-flight.outputs.docs_only == 'true'
71-
# || needs.pre-flight.outputs.is_deployment_workflow == 'true'
72-
# || always()
73-
# )
74-
# && !cancelled()
75-
# runs-on: ubuntu-latest
76-
# steps:
77-
# - name: Result
78-
# run: |
79-
# FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || echo 0
72+
build-test-publish-wheel-summary:
73+
needs: [pre-flight, build-test-publish-wheel]
74+
if: |
75+
(
76+
needs.pre-flight.outputs.docs_only == 'true'
77+
|| needs.pre-flight.outputs.is_deployment_workflow == 'true'
78+
|| always()
79+
)
80+
&& !cancelled()
81+
runs-on: ubuntu-latest
82+
steps:
83+
- name: Result
84+
run: |
85+
# Count completed jobs whose conclusion is not "success"; if the gh query itself fails, default the count to 0 instead of echoing a stray "0" to the log.
FAILED_JOBS=$(gh run view "$GITHUB_RUN_ID" --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || FAILED_JOBS=0
8086
81-
# if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
82-
# echo "✅ All previous jobs completed successfully"
83-
# exit 0
84-
# else
85-
# echo "❌ Found $FAILED_JOBS failed job(s)"
86-
# # Show which jobs failed
87-
# gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
88-
# exit 1
89-
# fi
87+
if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
88+
echo "✅ All previous jobs completed successfully"
89+
exit 0
90+
else
91+
echo "❌ Found $FAILED_JOBS failed job(s)"
92+
# Show which jobs failed
93+
gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
94+
exit 1
95+
fi

.github/workflows/cicd-main.yml

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -25,32 +25,32 @@ on:
2525
workflow_dispatch:
2626
inputs:
2727
mcore_commit:
28-
description: 'MCore commit SHA to test against'
28+
description: "MCore commit SHA to test against"
2929
required: false
3030
type: string
3131
mcore_branch:
32-
description: 'MCore branch name (for reference)'
32+
description: "MCore branch name (for reference)"
3333
required: false
3434
type: string
3535
mcore_repo:
36-
description: 'MCore repository URL (for fetching from forks)'
36+
description: "MCore repository URL (for fetching from forks)"
3737
required: false
3838
type: string
39-
default: 'https://github.com/NVIDIA/Megatron-LM.git'
39+
default: "https://github.com/NVIDIA/Megatron-LM.git"
4040
test_suite:
41-
description: 'Test suite to run'
41+
description: "Test suite to run"
4242
required: false
4343
type: choice
4444
options:
45-
- 'all'
46-
- 'unit-only'
47-
- 'functional-only'
48-
default: 'all'
45+
- "all"
46+
- "unit-only"
47+
- "functional-only"
48+
default: "all"
4949
triggered_by:
50-
description: 'Trigger source (for tracking)'
50+
description: "Trigger source (for tracking)"
5151
required: false
5252
type: string
53-
default: 'manual'
53+
default: "manual"
5454

5555
concurrency:
5656
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event.label.name || 'main' }}-${{ github.event_name }}
@@ -378,7 +378,7 @@ jobs:
378378
matrix:
379379
include:
380380
- script: L2_Launch_training
381-
timeout: 40
381+
timeout: 50
382382
- script: L2_Launch_converter
383383
- script: L2_Launch_models_deepseek
384384
- script: L2_Launch_models_gemma
@@ -393,19 +393,21 @@ jobs:
393393
- script: L2_Launch_models_nemotron_vl
394394
- script: L2_Launch_models_olmoe
395395
- script: L2_Launch_models_qwen
396-
- script: L2_Launch_models_qwen_quantization
396+
# - script: L2_Launch_models_qwen_quantization
397397
- script: L2_Launch_models_qwen_vl
398+
- script: L2_Launch_recipes_gemma_vl
399+
- script: L2_Launch_recipes_gpt_oss
398400
- script: L2_Launch_recipes_llama_1b
399401
- script: L2_Launch_recipes_llama_3b
400402
- script: L2_Launch_recipes_llama_distill
401-
- script: L2_Launch_recipes_mamba
402403
- script: L2_Launch_recipes_nemotronh
403404
- script: L2_Launch_recipes_qwen
404405
- script: L2_Launch_data
405406
- script: L2_Launch_post_training_quantization
406407
- script: L2_Launch_quantization_aware_training
407408
- script: L2_Launch_quantization_export
408409
- script: L2_Launch_recipes_llama_cuda_graphs
410+
- script: L2_Launch_utils
409411
needs: [pre-flight, cicd-unit-tests]
410412
runs-on: ${{ needs.pre-flight.outputs.runner_prefix }}-gpu-x2
411413
if: |

0 commit comments

Comments
 (0)