Skip to content

Commit 37568e4

Browse files
Merge branch 'main' into zhiyul/fix/p2p-dealloc-race
2 parents dcc9cc5 + 1d43284 commit 37568e4

File tree

133 files changed

+10703
-5407
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

133 files changed

+10703
-5407
lines changed

.claude/skills/build-and-test/SKILL.md

Lines changed: 400 additions & 0 deletions
Large diffs are not rendered by default.

.cursorrules

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
See CLAUDE.md for all repository guidelines.

.github/actions/action.yml

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -75,16 +75,12 @@ runs:
7575
shell: bash
7676
run: sudo chown -R $(whoami) /home/runner/
7777

78-
- name: Setup python
79-
uses: actions/setup-python@v5
80-
with:
81-
python-version: 3.12
82-
83-
- name: Install uuidgen
78+
- name: Setup python and install dependencies
8479
shell: bash -x -e -u -o pipefail {0}
8580
run: |
8681
apt-get update
87-
apt-get install -y uuid-runtime
82+
apt-get install -y python3.12 python3.12-venv uuid-runtime
83+
curl -LsSf https://astral.sh/uv/install.sh | UV_INSTALL_DIR=/usr/local/bin sh
8884
8985
- name: Create run-script (unit test)
9086
shell: bash -x -e -u -o pipefail {0}
@@ -97,7 +93,6 @@ runs:
9793
export PYTHONPATH=$(pwd)
9894
export NEMORUN_HOME=$(pwd)
9995
export NCCL_DEBUG=INFO
100-
pip install --no-cache-dir "uv<0.9.29"
10196
uv venv .venv
10297
uv cache clean
10398
uv sync --no-cache --only-group test
@@ -137,7 +132,6 @@ runs:
137132
138133
export PYTHONPATH=$(pwd)
139134
export NEMORUN_HOME=$(pwd)
140-
pip install --no-cache-dir "uv<0.9.29"
141135
uv venv .venv
142136
uv cache clean
143137
uv sync --no-cache --only-group test

.github/copy-pr-bot.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
enabled: true
22
auto_sync_draft: false
33
auto_sync_ready: true
4-
trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "CarlosGomes98", "ChenhanYu", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "Phlip79", "QiZhangNV", "RPrenger", "ShriyaRishab", "Victarry", "Wohox", "ZhiyuLi-Nvidia", "ahmadki", "aklife97", "ananthsub", "asolergi-nv", "buptzyb", "chtruong814", "cjld", "cspades", "cuichenx", "deepakn94", "dimapihtar", "dingqingy-nv", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "faradawn", "frsun-nvda", "gautham-kollu", "gdengk", "guyueh1", "huvunvidia", "hxbai", "ilml", "jalbericiola", "janEbert", "jaredcasper", "jenchen13", "jiemingz", "jingqiny-99", "jkamalu", "jon-barker", "jstjohn", "kajalj22", "kanz-nv", "kevalmorabia97", "ko3n1g", "ksivaman", "kunlunl", "kvareddy", "kwyss-nvidia", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mchrzanowski", "mehraakash", "mkhona-nvidia", "nanz-nv", "parthmannan", "prajwal1210", "pthombre", "rhewett-nv", "rogerwaleffe", "sajadn", "sanandaraj5597", "sancha", "santhnm2", "sbak5", "shanmugamr1992", "sharathts", "shengf-nv", "shifangx", "shjwudp", "sidsingh-nvidia", "skyw", "sudhakarsingh27", "tdene", "theothermike", "thomasdhc", "tomlifu", "trintamaki", "tylerpoon", "wdykas", "wplf", "xiaoyao0115", "xuwchen", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yeyu-nvidia", "yobibyte", "youngeunkwon0405", "yueshen2016", "yuzhongw-nvidia", "zhongbozhu"]
4+
trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "CarlosGomes98", "ChenhanYu", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "Phlip79", "QiZhangNV", "RPrenger", "ShriyaRishab", "Victarry", "WanZzzzzz", "Wohox", "ZhiyuLi-Nvidia", "ahmadki", "aklife97", "ananthsub", "asolergi-nv", "buptzyb", "chtruong814", "cjld", "cspades", "cuichenx", "deepakn94", "dimapihtar", "dingqingy-nv", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "faradawn", "frsun-nvda", "gautham-kollu", "gdengk", "guyueh1", "hexinw-nvidia", "huvunvidia", "hxbai", "ilml", "jalbericiola", "janEbert", "jaredcasper", "jenchen13", "jiemingz", "jingqiny-99", "jkamalu", "jon-barker", "jstjohn", "kajalj22", "kanz-nv", "keshavb96", "kevalmorabia97", "ko3n1g", "ksivaman", "kunlunl", "kvareddy", "kwyss-nvidia", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mchrzanowski", "mehraakash", "mkhona-nvidia", "nanz-nv", "parthmannan", "prajwal1210", "pthombre", "rhewett-nv", "rogerwaleffe", "sajadn", "sanandaraj5597", "sancha", "santhnm2", "sbak5", "shanmugamr1992", "sharathts", "sheliang-nv", "shengf-nv", "shifangx", "shjwudp", "sidsingh-nvidia", "skyw", "sudhakarsingh27", "tdene", "theothermike", "thomasdhc", "tomlifu", "trintamaki", "tylerpoon", "wdykas", "wplf", "xiaoyao0115", "xuwchen", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yeyu-nvidia", "yobibyte", "youngeunkwon0405", "yueshen2016", "yuzhongw-nvidia", "zhongbozhu"]

.github/oncall_schedule.json

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,4 @@
11
[
2-
{
3-
"user": "janEbert",
4-
"date": "2026-03-25"
5-
},
62
{
73
"user": "gautham-kollu",
84
"date": "2026-04-01"
@@ -20,31 +16,47 @@
2016
"date": "2026-04-22"
2117
},
2218
{
23-
"user": "BoxiangW",
19+
"user": "maanug-nv",
2420
"date": "2026-04-29"
2521
},
2622
{
27-
"user": "maanug-nv",
23+
"user": "dimapihtar",
2824
"date": "2026-05-06"
2925
},
3026
{
31-
"user": "dimapihtar",
27+
"user": "gautham-kollu",
3228
"date": "2026-05-13"
3329
},
3430
{
35-
"user": "gautham-kollu",
31+
"user": "janEbert",
3632
"date": "2026-05-20"
3733
},
3834
{
3935
"user": "ilml",
4036
"date": "2026-05-27"
4137
},
4238
{
43-
"user": "janEbert",
39+
"user": "Phlip79",
4440
"date": "2026-06-03"
4541
},
4642
{
47-
"user": "maanug-nv",
43+
"user": "asolergi-nv",
4844
"date": "2026-06-10"
45+
},
46+
{
47+
"user": "maanug-nv",
48+
"date": "2026-06-17"
49+
},
50+
{
51+
"user": "dimapihtar",
52+
"date": "2026-06-24"
53+
},
54+
{
55+
"user": "gautham-kollu",
56+
"date": "2026-07-01"
57+
},
58+
{
59+
"user": "janEbert",
60+
"date": "2026-07-08"
4961
}
5062
]

.github/workflows/_build_test_publish_wheel.yml

Lines changed: 11 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -71,10 +71,10 @@ jobs:
7171
pushd $BUILD_DIR
7272
rm LICENSE || true
7373
docker run --rm -v $(pwd):/workspace -w /workspace $IMAGE bash -c '\
74-
for python_version in cp310 cp311 cp312 cp313; do \
74+
for python_version in cp311 cp312 cp313; do \
7575
/opt/python/${python_version}-${python_version}/bin/pip install --upgrade "setuptools<80.0.0,>=77.0.0" build; \
7676
done && \
77-
for python_version in cp310 cp311 cp312 cp313; do \
77+
for python_version in cp311 cp312 cp313; do \
7878
/opt/python/${python_version}-${python_version}/bin/python -m build; \
7979
done \
8080
'
@@ -118,22 +118,20 @@ jobs:
118118
119119
if [ "$PACKAGE" = "megatron-core" ]; then
120120
if [[ "$PLATFORM" == "arm64" ]]; then
121-
for file in dist/$WHEEL_PREFIX*cp310*aarch64.whl; do
122-
pip install --no-cache-dir "$file"
123-
done
121+
WHEEL_GLOB="dist/${WHEEL_PREFIX}*cp312*aarch64.whl"
124122
else
125-
for file in dist/$WHEEL_PREFIX*cp310*x86_64.whl; do
126-
pip install --no-cache-dir "$file"
127-
done
123+
WHEEL_GLOB="dist/${WHEEL_PREFIX}*cp312*x86_64.whl"
128124
fi
129125
else
130-
pip install --no-cache-dir dist/$WHEEL_PREFIX*.whl
126+
WHEEL_GLOB="dist/${WHEEL_PREFIX}*.whl"
131127
fi
132128
133-
sudo rm -rf megatron/
134-
135-
RELEASE_NUMBER=$(python -c "import $ROOTPATH; print($ROOTPATH.__version__)")
136-
test "${{ steps.build-wheel.outputs.expected-release-number }}" == "$RELEASE_NUMBER"
129+
docker run --rm -v $(pwd):/workspace -w /workspace $IMAGE bash -c "\
130+
/opt/python/cp312-cp312/bin/pip install --no-cache-dir $WHEEL_GLOB && \
131+
rm -rf megatron/ && \
132+
RELEASE_NUMBER=\$(/opt/python/cp312-cp312/bin/python -c 'import $ROOTPATH; print($ROOTPATH.__version__)') && \
133+
test '${{ steps.build-wheel.outputs.expected-release-number }}' == \"\$RELEASE_NUMBER\" \
134+
"
137135
138136
- name: Upload wheels
139137
uses: actions/upload-artifact@v6

.github/workflows/claude_review.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,12 @@ jobs:
3333
fetch-depth: 1
3434
ref: ${{ steps.get-pr-head-commit.outputs.sha }}
3535

36+
- name: React to trigger comment
37+
run: |
38+
gh api repos/$REPO/issues/comments/${{ github.event.comment.id }}/reactions \
39+
--method POST \
40+
-f content='eyes'
41+
3642
- name: Run Claude Code Review
3743
uses: anthropics/claude-code-action@v1
3844
with:
@@ -52,6 +58,8 @@ jobs:
5258
- Critical bugs or logic errors
5359
- Typos in code, comments, or strings
5460
- Missing or insufficient test coverage for changed code
61+
- If the PR adds a new feature or significant functionality without corresponding tests, suggest adding tests
62+
- If the PR fixes a bug that was not caught by an existing unit test, suggest adding a regression test to prevent recurrence
5563
- Outdated or inaccurate documentation affected by the changes
5664
5765
Do NOT comment on:

.github/workflows/release-freeze.yml

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,13 +34,16 @@ on:
3434
default: true
3535
jobs:
3636
code-freeze:
37-
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_code_freeze.yml@v0.22.5
37+
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_code_freeze.yml@v0.86.0
3838
with:
39-
library-name: Megatron-Bridge
40-
python-package: megatron.bridge
39+
library-name: Megatron-Core
40+
python-package: megatron.core
4141
release-type: ${{ inputs.release-type }}
4242
freeze-commit: ${{ inputs.freeze-commit }}
4343
dry-run: ${{ inputs.dry-run }}
44+
release-branch-prefix: core_
45+
use-pat: true
4446
secrets:
45-
SLACK_WEBHOOK: ${{ secrets.SLACK_MAIN_CHANNEL_WEBHOOK }}
47+
SLACK_WEBHOOK: ${{ inputs.dry-run && secrets.SLACK_CI_CHANNEL_WEBHOOK ||secrets.SLACK_MAIN_CHANNEL_WEBHOOK }}
4648
SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_TEAM_GROUP_ID }}
49+
PAT: ${{ secrets.PAT }}

AGENTS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
See [CLAUDE.md](CLAUDE.md) for all repository guidelines.

CLAUDE.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Repository Guidelines
2+
3+
## Contributing
4+
5+
### Pull Requests
6+
7+
- All PRs must be created as **drafts**. Use `gh pr create --draft` or the GitHub UI draft option.
8+
- Never push branches directly to `https://github.com/NVIDIA/Megatron-LM`. You must push your branch to a personal fork (e.g. `https://github.com/<your-username>/Megatron-LM`), then open a PR from the fork's branch against `NVIDIA/Megatron-LM`.
9+
- Read [docs/developer/contribute.md](docs/developer/contribute.md) for the full contribution policy, including code style, commit message conventions, and issue guidelines.
10+
11+
### Code Quality
12+
13+
- After editing imports in Python files, always run `uv run isort` on those files to fix import order before committing.

0 commit comments

Comments
 (0)