Skip to content

Commit dd645a8

Browse files
authored
Merge branch 'main' into dnarayanan/latent_moe
2 parents 839624a + 416687f commit dd645a8

File tree

138 files changed

+22158
-5292
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

138 files changed

+22158
-5292
lines changed

.github/CODEOWNERS

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ megatron/core/parallel_state.py @NVIDIA/core-adlr @NVIDIA/core-nemo
3333

3434
megatron/core/post_training/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/post-training
3535

36-
megatron/post_training/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/post-training
36+
megatron/post_training/ @NVIDIA/post-training
3737

3838
.gitlab/ @NVIDIA/ci
3939
.github/ @NVIDIA/ci

.github/actions/action.yml

Lines changed: 35 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -11,28 +11,28 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14-
name: "Test Template"
15-
description: "Template for running NeMo tests in a containerized environment"
14+
name: 'Test Template'
15+
description: 'Template for running NeMo tests in a containerized environment'
1616

1717
inputs:
1818
container-image:
19-
description: "Container image to use for test"
19+
description: 'Container image to use for test'
2020
required: true
2121
timeout:
22-
description: "Max runtime of test in minutes"
22+
description: 'Max runtime of test in minutes'
2323
required: false
24-
default: "30"
24+
default: '30'
2525
script:
26-
description: "Test script to execute"
26+
description: 'Test script to execute'
2727
required: true
2828
is-optional:
29-
description: "Pass this job on failure."
29+
description: 'Pass this job on failure.'
3030
required: false
31-
default: "false"
31+
default: 'false'
3232
is_unit_test:
33-
description: "Upload coverage as unit test"
33+
description: 'Upload coverage as unit test'
3434
required: false
35-
default: "false"
35+
default: 'false'
3636
tag:
3737
description: Latest or legacy test suite
3838
required: true
@@ -43,11 +43,11 @@ inputs:
4343
description: Model to launch
4444
required: false
4545
PAT:
46-
description: "GitHub Personal Access Token"
46+
description: 'GitHub Personal Access Token'
4747
required: true
4848

4949
runs:
50-
using: "composite"
50+
using: 'composite'
5151
steps:
5252
- name: Checkout repository
5353
uses: actions/checkout@v2
@@ -114,6 +114,16 @@ runs:
114114
HAS_RUN_TESTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "Run tests")') || echo "false"
115115
echo "main=$HAS_RUN_TESTS_LABEL" | tee -a $GITHUB_OUTPUT
116116
117+
- name: Has Run functional tests label
118+
shell: bash -x -e -u -o pipefail {0}
119+
id: has-run-functional-tests-label
120+
env:
121+
GH_TOKEN: ${{ github.token }}
122+
run: |
123+
PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
124+
# Keep the fallback inside the command substitution: placed after it, `|| echo "false"`
# only prints to the log and leaves the variable (and the step output) empty on failure.
HAS_RUN_FUNCTIONAL_TESTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "Run functional tests")' || echo "false")
125+
echo "main=$HAS_RUN_FUNCTIONAL_TESTS_LABEL" | tee -a $GITHUB_OUTPUT
126+
117127
- name: Create run-script (e2e test)
118128
shell: bash -x -e -u -o pipefail {0}
119129
if: inputs.is_unit_test == 'false'
@@ -126,16 +136,19 @@ runs:
126136
set -euxo pipefail
127137
128138
if [ "${{ steps.has-run-tests-label.outputs.main }}" == "true" ]; then
129-
ARGS=(
130-
--scope mr-github
131-
--enable-lightweight-mode
132-
)
133-
else
134-
ARGS=(
135-
--scope mr-slim
136-
--enable-lightweight-mode
137-
)
138-
fi
139+
ARGS=(
140+
--scope mr-github
141+
--enable-lightweight-mode
142+
)
143+
elif [ "${{ steps.has-run-functional-tests-label.outputs.main }}" == "true" ]; then
144+
ARGS=(
145+
--scope mr-github
146+
)
147+
else
148+
ARGS=(
149+
--scope mr-github-slim
150+
)
151+
fi
139152
140153
export PYTHONPATH=$(pwd)
141154
export NEMORUN_HOME=$(pwd)
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
name: 'Check NVIDIA SSO Membership'
2+
description: 'Check if a GitHub username exists in the NVIDIA SSO users list from github-audits'
3+
author: 'NVIDIA'
4+
5+
inputs:
6+
username:
7+
description: 'GitHub username to check'
8+
required: true
9+
github_audits_repo:
10+
description: 'Repository containing SSO users file'
11+
required: false
12+
default: 'NVIDIA-GitHub-Management/github-audits'
13+
github_audits_version:
14+
description: 'Release version tag'
15+
required: false
16+
default: 'v0.1.0'
17+
sso_users_filename:
18+
description: 'Filename of SSO users JSON'
19+
required: false
20+
default: 'users_sso.json'
21+
github_token:
22+
description: 'GitHub token with access to github-audits repo'
23+
required: true
24+
25+
outputs:
26+
is_member:
27+
description: 'Boolean - true if user is in NVIDIA SSO list, false otherwise'
28+
value: ${{ steps.check-membership.outputs.is_member }}
29+
is_org_member:
30+
description: 'Boolean - true if user has NVIDIA or NVIDIA-NeMo in org_roles'
31+
value: ${{ steps.check-membership.outputs.is_org_member }}
32+
user_orgs:
33+
description: 'Comma-separated list of orgs user is member of'
34+
value: ${{ steps.check-membership.outputs.user_orgs }}
35+
sso_file_available:
36+
description: 'Boolean - true if SSO file was successfully downloaded'
37+
value: ${{ steps.download-sso.outputs.sso_file_available }}
38+
user_count:
39+
description: 'Number of users in the SSO file (0 if download failed)'
40+
value: ${{ steps.download-sso.outputs.user_count }}
41+
42+
runs:
43+
using: 'composite'
44+
steps:
45+
- name: Download NVIDIA SSO users from github-audits
46+
id: download-sso
47+
shell: bash
48+
env:
49+
GH_TOKEN: ${{ inputs.github_token }}
50+
run: |
51+
echo "Downloading ${{ inputs.sso_users_filename }} from ${{ inputs.github_audits_repo }} ${{ inputs.github_audits_version }} release..."
52+
53+
# Download the release asset using gh CLI
54+
gh release download ${{ inputs.github_audits_version }} \
55+
--repo ${{ inputs.github_audits_repo }} \
56+
--pattern ${{ inputs.sso_users_filename }} \
57+
--clobber 2>&1 || {
58+
echo "ERROR: Failed to download ${{ inputs.sso_users_filename }} from github-audits release"
59+
echo "sso_file_available=false" >> $GITHUB_OUTPUT
60+
echo "user_count=0" >> $GITHUB_OUTPUT
61+
exit 0
62+
}
63+
64+
# Verify file was downloaded and is valid JSON
65+
if [ ! -f ${{ inputs.sso_users_filename }} ]; then
66+
echo "ERROR: ${{ inputs.sso_users_filename }} file not found after download"
67+
echo "sso_file_available=false" >> $GITHUB_OUTPUT
68+
echo "user_count=0" >> $GITHUB_OUTPUT
69+
exit 0
70+
fi
71+
72+
# Validate JSON structure
73+
if ! jq -e 'type == "object"' ${{ inputs.sso_users_filename }} > /dev/null 2>&1; then
74+
echo "ERROR: ${{ inputs.sso_users_filename }} is not a valid JSON object"
75+
echo "sso_file_available=false" >> $GITHUB_OUTPUT
76+
echo "user_count=0" >> $GITHUB_OUTPUT
77+
exit 0
78+
fi
79+
80+
USER_COUNT=$(jq 'length' ${{ inputs.sso_users_filename }})
81+
echo "Successfully downloaded ${{ inputs.sso_users_filename }} with $USER_COUNT NVIDIA SSO users"
82+
echo "sso_file_available=true" >> $GITHUB_OUTPUT
83+
echo "user_count=$USER_COUNT" >> $GITHUB_OUTPUT
84+
85+
- name: Check if user is in SSO list
86+
id: check-membership
87+
shell: bash
88+
env:
  # Hand the inputs to the script through the environment instead of expanding
  # them into the script text, so their contents can never be parsed as shell code.
  CHECK_USERNAME: ${{ inputs.username }}
  SSO_USERS_FILE: ${{ inputs.sso_users_filename }}
run: |
  USERNAME="$CHECK_USERNAME"
  SSO_FILE="$SSO_USERS_FILE"
91+
92+
echo "Checking if $USERNAME is in NVIDIA SSO users list..."
93+
94+
# Check if SSO file is available
95+
if [ "${{ steps.download-sso.outputs.sso_file_available }}" != "true" ] || [ ! -f "$SSO_FILE" ]; then
96+
echo "ERROR: $SSO_FILE not available - cannot check membership"
97+
echo "is_member=false" >> $GITHUB_OUTPUT
98+
echo "is_org_member=false" >> $GITHUB_OUTPUT
99+
echo "user_orgs=" >> $GITHUB_OUTPUT
100+
exit 0
101+
fi
102+
103+
# Check if username exists as a key in the JSON object
104+
if jq -e --arg user "$USERNAME" 'has($user)' "$SSO_FILE" > /dev/null 2>&1; then
105+
echo "$USERNAME found in NVIDIA SSO users"
106+
echo "is_member=true" >> $GITHUB_OUTPUT
107+
108+
# Extract and check org membership
109+
IS_ORG_MEMBER=$(jq -r --arg user "$USERNAME" '
110+
.[$user].org_roles // [] |
111+
map(select(test("^(NVIDIA|NVIDIA-NeMo):Member$"))) |
112+
length > 0
113+
' "$SSO_FILE")
114+
115+
USER_ORGS=$(jq -r --arg user "$USERNAME" '
116+
.[$user].org_roles // [] |
117+
map(split(":")[0]) |
118+
unique |
119+
join(",")
120+
' "$SSO_FILE")
121+
122+
echo "is_org_member=$IS_ORG_MEMBER" >> $GITHUB_OUTPUT
123+
echo "user_orgs=$USER_ORGS" >> $GITHUB_OUTPUT
124+
125+
if [ "$IS_ORG_MEMBER" == "true" ]; then
126+
echo "$USERNAME is a member of NVIDIA or NVIDIA-NeMo org"
127+
else
128+
echo "$USERNAME has @nvidia.com email but is not in NVIDIA or NVIDIA-NeMo org (orgs: $USER_ORGS)"
129+
fi
130+
else
131+
echo "$USERNAME NOT found in NVIDIA SSO users"
132+
echo "is_member=false" >> $GITHUB_OUTPUT
133+
echo "is_org_member=false" >> $GITHUB_OUTPUT
134+
echo "user_orgs=" >> $GITHUB_OUTPUT
135+
fi
136+
137+
branding:
138+
icon: 'shield'
139+
color: 'green'

.github/workflows/_build_test_publish_wheel.yml

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,21 @@
11
on:
22
workflow_call:
3+
inputs:
4+
ref:
5+
required: false
6+
description: Ref (SHA or branch) to release
7+
type: string
8+
default: ${{ github.sha }}
9+
dry-run:
10+
required: false
11+
description: Upload to PyPI Test instance
12+
type: boolean
13+
default: true
14+
no-publish:
15+
required: false
16+
description: Do not publish the wheel
17+
type: boolean
18+
default: true
319
secrets:
420
TWINE_USERNAME:
521
required: true
@@ -26,17 +42,18 @@ jobs:
2642
PACKAGE: ${{ matrix.PACKAGE }}
2743
IMAGE: ${{ matrix.IMAGE }}
2844
PLATFORM: ${{ matrix.PLATFORM }}
45+
PUBLISH_DRYRUN: ${{ inputs.dry-run }}
2946
steps:
3047
- name: Checkout repository
3148
uses: actions/checkout@v4
49+
with:
50+
ref: ${{ inputs.ref }}
3251

3352
- name: Build wheel
3453
id: build-wheel
3554
run: |
3655
set -x
3756
38-
PUBLISH_DRYRUN=yes
39-
4057
if [ "$PACKAGE" = "megatron-core" ]; then
4158
ROOTDIR="megatron/core"
4259
BUILD_DIR="."
@@ -48,7 +65,7 @@ jobs:
4865
exit 1
4966
fi
5067
51-
if [ "$PUBLISH_DRYRUN" = "yes" ]; then
68+
if [ "$PUBLISH_DRYRUN" = "true" ]; then
5269
PRE_RELEASE=$(sed -n "s/.*PRE_RELEASE = '\(.*\)'/\1/p" $ROOTDIR/package_info.py)
5370
sed -i "/^PRE_RELEASE/c\PRE_RELEASE = '${PRE_RELEASE}.dev$((RANDOM % 900000 + 100000))'" $ROOTDIR/package_info.py
5471
fi
@@ -123,26 +140,31 @@ jobs:
123140
- name: Upload wheels
124141
uses: actions/upload-artifact@v4
125142
with:
126-
name: wheels-${{ matrix.PACKAGE }}-${{ matrix.PLATFORM }}
143+
name: wheels-${{ matrix.PACKAGE }}-${{ matrix.PLATFORM }}-${{ inputs.dry-run && 'dry-run' || 'release' }}
127144
path: dist/
128145

129146
publish-wheels:
130147
needs: [build-and-test-wheels]
131148
runs-on: ubuntu-latest
132-
if: github.ref == 'refs/heads/main'
149+
if: inputs.no-publish == false
133150
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) && 'main' || 'public' }}
134151
strategy:
135152
fail-fast: false
136153
matrix:
137154
include:
138-
- PACKAGE: megatron_core
139-
- PACKAGE: megatron_fsdp
155+
- PACKAGE: megatron-core
156+
PLATFORM: arm64
157+
- PACKAGE: megatron-core
158+
PLATFORM: amd64
159+
- PACKAGE: megatron-fsdp
160+
IMAGE: quay.io/pypa/manylinux_2_28_x86_64
140161
env:
141162
PACKAGE: ${{ matrix.PACKAGE }}
142163
steps:
143164
- name: Download wheels
144165
uses: actions/download-artifact@v4
145166
with:
167+
name: wheels-${{ matrix.PACKAGE }}-${{ matrix.PLATFORM }}-${{ inputs.dry-run && 'dry-run' || 'release' }}
146168
path: dist/
147169
merge-multiple: true
148170

0 commit comments

Comments
 (0)