Skip to content

Commit 56ecd09

Browse files
authored
Merge branch 'main' into add_ag_groups_to_pg_collection
2 parents d1c1c62 + d31a21f commit 56ecd09

File tree

495 files changed

+46234
-32536
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

495 files changed

+46234
-32536
lines changed

.github/CODEOWNERS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,9 @@ megatron/post_training/ @NVIDIA/post-training
3939

4040
megatron/core/transformer/cuda_graphs.py @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/cuda-graphs
4141

42+
megatron/training/ @NVIDIA/training-adlr @NVIDIA/training-nemo
43+
megatron/training/arguments.py
44+
4245
.gitlab/ @NVIDIA/ci
4346
.github/ @NVIDIA/ci
4447
.gitlab-ci.yml @NVIDIA/ci

.github/actions/action.yml

Lines changed: 24 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -45,12 +45,22 @@ inputs:
4545
PAT:
4646
description: "GitHub Personal Access Token"
4747
required: true
48-
is_ci_workload:
49-
description: "Is CI workload"
50-
required: true
51-
is_merge_group:
52-
description: "Is merge group"
53-
required: true
48+
scope:
49+
description: "Test scope (e.g. mr-github, mr-github-slim)"
50+
required: false
51+
default: "mr-github-slim"
52+
n_repeat:
53+
description: "Number of test repetitions"
54+
required: false
55+
default: "5"
56+
lightweight:
57+
description: "Enable lightweight mode"
58+
required: false
59+
default: "false"
60+
platform:
61+
description: "Platform to run tests on (e.g. dgx_h100, dgx_gb200)"
62+
required: false
63+
default: "dgx_h100"
5464
runs:
5565
using: "composite"
5666
steps:
@@ -96,7 +106,7 @@ runs:
96106
--model unit-tests \
97107
--test-case "${{ inputs.test_case }}" \
98108
--environment dev \
99-
--platform dgx_h100 \
109+
--platform ${{ inputs.platform }} \
100110
--tag ${{ inputs.tag }} \
101111
--container-image ${{ inputs.container-image }} \
102112
--hf-home /mnt/datadrive/TestData/nemo-fw/TestData/HF_HOME
@@ -106,39 +116,6 @@ runs:
106116
echo "$cmd" | tee "job.sh"
107117
echo "::endgroup::"
108118
109-
- name: Get PR info
110-
id: get-pr-info
111-
if: startsWith(github.ref, 'refs/heads/pull-request/')
112-
uses: nv-gha-runners/get-pr-info@main
113-
114-
- name: Install GH CLI
115-
shell: bash -x -e -u -o pipefail {0}
116-
run: |
117-
apt-get update
118-
apt-get install -y gh
119-
120-
- name: Has Run tests label
121-
shell: bash -x -e -u -o pipefail {0}
122-
id: has-run-tests-label
123-
env:
124-
GH_TOKEN: ${{ github.token }}
125-
run: |
126-
PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
127-
HAS_RUN_TESTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "Run tests")') || echo "false"
128-
echo "main=$HAS_RUN_TESTS_LABEL" | tee -a $GITHUB_OUTPUT
129-
130-
- name: Has Run functional tests label
131-
shell: bash -x -e -u -o pipefail {0}
132-
id: has-run-functional-tests-label
133-
env:
134-
GH_TOKEN: ${{ github.token }}
135-
IS_CI_WORKLOAD: ${{ inputs.is_ci_workload }}
136-
run: |
137-
PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
138-
HAS_RUN_FUNCTIONAL_TESTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "Run functional tests")') || echo "$IS_CI_WORKLOAD"
139-
HAS_RUN_FUNCTIONAL_TESTS_LABEL=${HAS_RUN_FUNCTIONAL_TESTS_LABEL:-$IS_CI_WORKLOAD}
140-
echo "main=$HAS_RUN_FUNCTIONAL_TESTS_LABEL" | tee -a $GITHUB_OUTPUT
141-
142119
- name: Create run-script (e2e test)
143120
shell: bash -x -e -u -o pipefail {0}
144121
if: inputs.is_unit_test == 'false'
@@ -150,27 +127,12 @@ runs:
150127
#!/bin/bash
151128
set -euxo pipefail
152129
153-
if [ "${{ inputs.is_merge_group }}" == "true" ]; then
154-
ARGS=(
155-
--scope mr-github
156-
--n-repeat 1
157-
)
158-
elif [ "${{ steps.has-run-tests-label.outputs.main }}" == "true" ]; then
159-
ARGS=(
160-
--scope mr-github
161-
--enable-lightweight-mode
162-
--n-repeat 1
163-
)
164-
elif [ "${{ steps.has-run-functional-tests-label.outputs.main }}" == "true" ]; then
165-
ARGS=(
166-
--scope mr-github
167-
--n-repeat 5
168-
)
169-
else
170-
ARGS=(
171-
--scope mr-github-slim
172-
--n-repeat 5
173-
)
130+
ARGS=(
131+
--scope ${{ inputs.scope }}
132+
--n-repeat ${{ inputs.n_repeat }}
133+
)
134+
if [ "${{ inputs.lightweight }}" == "true" ]; then
135+
ARGS+=(--enable-lightweight-mode)
174136
fi
175137
176138
export PYTHONPATH=$(pwd)
@@ -184,7 +146,7 @@ runs:
184146
--model ${{ inputs.model }} \
185147
--test-case ${{ inputs.test_case }} \
186148
--environment dev \
187-
--platform dgx_h100 \
149+
--platform ${{ inputs.platform }} \
188150
--container-image ${{ inputs.container-image }} \
189151
--data-dir /mnt/datadrive/TestData/megatron-lm/artifacts \
190152
--hf-home /mnt/datadrive/TestData/nemo-fw/TestData/HF_HOME

.github/copy-pr-bot.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
enabled: true
22
auto_sync_draft: false
33
auto_sync_ready: true
4-
trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "CarlosGomes98", "ChenhanYu", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "Phlip79", "QiZhangNV", "RPrenger", "ShriyaRishab", "Victarry", "Wohox", "ZhiyuLi-Nvidia", "ahmadki", "aklife97", "ananthsub", "asolergi-nv", "buptzyb", "chtruong814", "cjld", "cspades", "cuichenx", "deepakn94", "dimapihtar", "dingqingy-nv", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "faradawn", "frsun-nvda", "gautham-kollu", "gdengk", "guyueh1", "huvunvidia", "hxbai", "ilml", "jalbericiola", "janEbert", "jaredcasper", "jenchen13", "jiemingz", "jingqiny-99", "jkamalu", "jon-barker", "jstjohn", "kanz-nv", "kevalmorabia97", "ko3n1g", "ksivaman", "kunlunl", "kvareddy", "kwyss-nvidia", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mchrzanowski", "mehraakash", "mkhona-nvidia", "nanz-nv", "parthmannan", "prajwal1210", "pthombre", "rhewett-nv", "rogerwaleffe", "sajadn", "sanandaraj5597", "sancha", "santhnm2", "sbak5", "shanmugamr1992", "sharathts", "shengf-nv", "shifangx", "shjwudp", "sidsingh-nvidia", "skyw", "sudhakarsingh27", "tdene", "theothermike", "thomasdhc", "tomlifu", "trintamaki", "tylerpoon", "wdykas", "wplf", "xiaoyao0115", "xuwchen", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yeyu-nvidia", "yobibyte", "youngeunkwon0405", "yueshen2016", "yuzhongw-nvidia", "zhongbozhu"]
4+
trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "CarlosGomes98", "ChenhanYu", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "Phlip79", "QiZhangNV", "RPrenger", "ShriyaRishab", "Victarry", "Wohox", "ZhiyuLi-Nvidia", "ahmadki", "aklife97", "ananthsub", "asolergi-nv", "buptzyb", "chtruong814", "cjld", "cspades", "cuichenx", "deepakn94", "dimapihtar", "dingqingy-nv", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "faradawn", "frsun-nvda", "gautham-kollu", "gdengk", "guyueh1", "huvunvidia", "hxbai", "ilml", "jalbericiola", "janEbert", "jaredcasper", "jenchen13", "jiemingz", "jingqiny-99", "jkamalu", "jon-barker", "jstjohn", "kajalj22", "kanz-nv", "kevalmorabia97", "ko3n1g", "ksivaman", "kunlunl", "kvareddy", "kwyss-nvidia", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mchrzanowski", "mehraakash", "mkhona-nvidia", "nanz-nv", "parthmannan", "prajwal1210", "pthombre", "rhewett-nv", "rogerwaleffe", "sajadn", "sanandaraj5597", "sancha", "santhnm2", "sbak5", "shanmugamr1992", "sharathts", "shengf-nv", "shifangx", "shjwudp", "sidsingh-nvidia", "skyw", "sudhakarsingh27", "tdene", "theothermike", "thomasdhc", "tomlifu", "trintamaki", "tylerpoon", "wdykas", "wplf", "xiaoyao0115", "xuwchen", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yeyu-nvidia", "yobibyte", "youngeunkwon0405", "yueshen2016", "yuzhongw-nvidia", "zhongbozhu"]

.github/oncall_schedule.json

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,4 @@
11
[
2-
{
3-
"user": "maanug-nv",
4-
"date": "2026-03-11"
5-
},
6-
{
7-
"user": "dimapihtar",
8-
"date": "2026-03-18"
9-
},
102
{
113
"user": "janEbert",
124
"date": "2026-03-25"
@@ -46,5 +38,13 @@
4638
{
4739
"user": "ilml",
4840
"date": "2026-05-27"
41+
},
42+
{
43+
"user": "janEbert",
44+
"date": "2026-06-03"
45+
},
46+
{
47+
"user": "maanug-nv",
48+
"date": "2026-06-10"
4949
}
5050
]

.github/workflows/_release_library.yml

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -486,13 +486,8 @@ jobs:
486486
secrets: inherit
487487

488488
notify:
489-
needs: [build-test-publish-wheels, create-gh-release]
489+
needs: [build-test-publish-wheels, create-gh-release, bump-next-version]
490490
runs-on: ubuntu-latest
491-
env:
492-
GH_URL: https://github.com/${{ github.repository }}/releases/tag/v${{ needs.build-test-publish-wheels.outputs.version }}
493-
PYPI_URL: https://${{ inputs.dry-run == true && 'test.' || '' }}pypi.org/project/${{ needs.build-test-publish-wheels.outputs.pypi-name }}/${{ needs.build-test-publish-wheels.outputs.version }}/
494-
PROJECT_NAME: Megatron Core
495-
VERSION: ${{ needs.build-test-publish-wheels.outputs.version }}
496491
steps:
497492
- name: Checkout
498493
uses: actions/checkout@v6
@@ -505,10 +500,10 @@ jobs:
505500
uses: ./send-slack-alert/.github/actions/send-slack-alert
506501
env:
507502
MESSAGE: |
508-
${{ inputs.dry-run == true && 'This is a dry-run, nothing actually happened: ' || '' }}We have released `${{ env.VERSION }}` of `NVIDIA ${{ env.PROJECT_NAME }}` 🚀✨🎉
503+
${{ inputs.dry-run == true && 'This is a dry-run, nothing actually happened: ' || '' }}We have released `${{ needs.bump-next-version.outputs.release-version }}` of `NVIDIA Megatron Core` 🚀✨🎉
509504
510-
• <${{ env.GH_URL }}|GitHub release>
511-
• <${{ env.PYPI_URL }}|PyPi release>
505+
• <https://github.com/${{ github.repository }}/releases/tag/core_v${{ needs.bump-next-version.outputs.release-version }}|GitHub release>
506+
• <https://${{ inputs.dry-run == true && 'test.' || '' }}pypi.org/project/megatron-core/${{ needs.bump-next-version.outputs.release-version }}/|PyPi release>
512507
513508
with:
514509
message: ${{ env.MESSAGE }}

.github/workflows/build-docs.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ jobs:
3232
build-docs:
3333
needs: [pre-flight]
3434
if: needs.pre-flight.outputs.is_deployment_workflow != 'true'
35-
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_docs.yml@v0.57.0
35+
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_docs.yml@v0.80.2
3636

3737
build-docs-summary:
3838
needs: [pre-flight, build-docs]

.github/workflows/build-test-publish-wheel.yml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -64,16 +64,16 @@ jobs:
6464
env:
6565
GH_TOKEN: ${{ github.token }}
6666
GITHUB_RUN_ID: ${{ github.run_id }}
67-
SKIPPING_IS_ALLOWED: true
67+
SKIPPING_IS_ALLOWED: false
6868
run: |
69-
FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || echo 0
69+
FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success" and (.name | test("build-and-test-wheels")))] | length') || echo 0
7070
7171
if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
72-
echo "✅ All previous jobs completed successfully"
72+
echo "✅ All build-and-test-wheels jobs completed successfully"
7373
exit 0
7474
else
75-
echo "❌ Found $FAILED_JOBS failed job(s)"
75+
echo "❌ Found $FAILED_JOBS failed build-and-test-wheels job(s)"
7676
# Show which jobs failed
77-
gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
77+
gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success" and (.name | test("build-and-test-wheels"))) | .name'
7878
exit 1
7979
fi

.github/workflows/cicd-approve-test-queue.yml

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ jobs:
5555
env:
5656
GITHUB_TOKEN: ${{ secrets.PAT }}
5757
MAX_CONCURRENCY: ${{ vars.MAX_CONCURRENCY || 1 }}
58-
MAX_CONCURRENCY_EXTERNAL: ${{ vars.MAX_CONCURRENCY_EXTERNAL || 3 }}
58+
MAX_CONCURRENCY_EXTERNAL: ${{ vars.MAX_CONCURRENCY_EXTERNAL || 1 }}
5959
CONTRIBUTOR_TYPE: ${{ matrix.contributor_type }}
6060
SSO_USERS_FILE: users_sso.json
6161
PYTHONUNBUFFERED: 1
@@ -71,7 +71,8 @@ jobs:
7171
REPO = os.environ["GITHUB_REPOSITORY"]
7272
CONTRIBUTOR_TYPE = os.environ["CONTRIBUTOR_TYPE"]
7373
if CONTRIBUTOR_TYPE == "external":
74-
MAX_CONCURRENCY = int(os.environ["MAX_CONCURRENCY_EXTERNAL"]) // 2
74+
# Global limit across all branches — no division needed since we count globally.
75+
MAX_CONCURRENCY = int(os.environ["MAX_CONCURRENCY_EXTERNAL"])
7576
else:
7677
MAX_CONCURRENCY = int(os.environ["MAX_CONCURRENCY"]) // 2
7778
API_BASE = f"https://api.github.com/repos/NVIDIA/Megatron-LM"
@@ -132,6 +133,14 @@ jobs:
132133
base_branch = pr_info.get("base", {}).get("ref")
133134
return base_branch, pr_info
134135
136+
def matches_contributor(workflow_run, contributor_type):
137+
"""Return True if the workflow run matches the contributor type (ignores branch)."""
138+
_, pr_info = get_pr_base_branch(workflow_run)
139+
if pr_info is None:
140+
return False
141+
internal = is_internal_contributor(pr_info)
142+
return (contributor_type == "internal") == internal
143+
135144
def matches_queue(workflow_run, target_branch, contributor_type):
136145
"""
137146
Return True if the workflow run belongs to this queue cell:
@@ -160,11 +169,19 @@ jobs:
160169
queued_workflow_runs = make_request("actions/runs?status=queued").get("workflow_runs", [])
161170
in_progress_workflow_runs = make_request("actions/runs?status=in_progress").get("workflow_runs", [])
162171
163-
# Filter for workflows belonging to PRs targeting ${{ matrix.branch }} with matching contributor type
164-
queued_workflow_runs = [run for run in queued_workflow_runs
165-
if run["name"] == "CICD Megatron-LM" and matches_queue(run, "${{ matrix.branch }}", CONTRIBUTOR_TYPE)]
166-
in_progress_workflow_runs = [run for run in in_progress_workflow_runs
172+
# For external contributors, enforce a single global concurrency limit across ALL branches.
173+
# For internal contributors, enforce per-branch limits as before.
174+
if CONTRIBUTOR_TYPE == "external":
175+
queued_workflow_runs = [run for run in queued_workflow_runs
176+
if run["name"] == "CICD Megatron-LM" and matches_contributor(run, CONTRIBUTOR_TYPE)]
177+
in_progress_workflow_runs = [run for run in in_progress_workflow_runs
178+
if run["name"] == "CICD Megatron-LM" and matches_contributor(run, CONTRIBUTOR_TYPE)]
179+
else:
180+
# Filter for workflows belonging to PRs targeting ${{ matrix.branch }} with matching contributor type
181+
queued_workflow_runs = [run for run in queued_workflow_runs
167182
if run["name"] == "CICD Megatron-LM" and matches_queue(run, "${{ matrix.branch }}", CONTRIBUTOR_TYPE)]
183+
in_progress_workflow_runs = [run for run in in_progress_workflow_runs
184+
if run["name"] == "CICD Megatron-LM" and matches_queue(run, "${{ matrix.branch }}", CONTRIBUTOR_TYPE)]
168185
169186
# Count running and queued workflows
170187
queued_workflows = len(queued_workflow_runs)

0 commit comments

Comments
 (0)