Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Files kept out of the Docker build context.
# There are no `!` exception rules, so the grouping below is purely cosmetic.

# Version-control metadata
.git

# Python bytecode / compiled extension files
__pycache__
*.pyc
*.pyo
*.pyd
.Python

# Local environment directory (presumably a virtualenv -- confirm)
env

# pip bookkeeping files
pip-log.txt
pip-delete-this-directory.txt

# Test, cache and coverage output
.tox
.cache
.coverage
.coverage.*
coverage.xml
nosetests.xml
*,cover

# Logs
*.log

# Large binary artefacts at any depth (presumably model checkpoints -- confirm)
**/*.nemo
**/*.ckpt
9 changes: 9 additions & 0 deletions .flake8
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
[flake8]
max-line-length = 119
# Only the codes listed below are reported.
# flake8 does not support inline comments inside option values (the text after
# the code would be parsed as part of the value), so each description sits on
# its own line above the code it explains.
select =
    # f-string without any placeholders
    F541,
    # local variable 'x' is assigned to but never used
    F841,
    # 'x' imported but unused
    F401,
    # ambiguous variable name 'l'
    E741,
    # undefined name 'x'
    F821,
    # too many leading '#' for block comment
    E266,
7 changes: 7 additions & 0 deletions .github/CODEOWNERS
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Owners for CI, container, lint and packaging files.
.github/        @ko3n1g @chtruong814 @thomasdhc @pablo-garay
docker/         @ko3n1g @chtruong814 @thomasdhc @pablo-garay
.pylintrc.*     @ko3n1g @chtruong814 @thomasdhc @pablo-garay
# `.flake8*` (not `.flake8.*`) so that the plain `.flake8` file is covered as
# well as any suffixed variants such as `.flake8.<name>`.
.flake8*        @ko3n1g @chtruong814 @thomasdhc @pablo-garay
setup.py        @ko3n1g @chtruong814 @thomasdhc @pablo-garay
pyproject.toml  @ko3n1g @chtruong814 @thomasdhc @pablo-garay
requirements/   @ko3n1g @chtruong814 @thomasdhc @pablo-garay
83 changes: 83 additions & 0 deletions .github/actions/test-template/action.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# Copyright (c) 2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# NOTE(review): this content is added as .github/actions/test-template/action.yml,
# yet it declares a reusable workflow (`on: workflow_call` plus `jobs:`) rather
# than action metadata. During review the `name:` / `on:` header was flagged
# ("I think a copy-paste error") and the author agreed ("ah yup."). The content
# matches .github/workflows/_build_container.yml; replace it with the intended
# test-template action definition.
name: "~Build container template"

on:
  workflow_call:
    inputs:
      image-name:
        description: "The name of the image to build"
        type: string
        required: true
      # No description in the original; presumably the Dockerfile to build -- confirm.
      dockerfile:
        type: string
        required: true
      runner:
        description: "The runner to use for the build"
        type: string
        required: false
        default: linux-amd64-gpu-rtxa6000-latest-2-nemo
    # Azure credentials that every caller has to supply.
    secrets:
      AZURE_CLIENT_ID:
        required: true
      AZURE_TENANT_ID:
        required: true
      AZURE_SUBSCRIPTION_ID:
        required: true

jobs:
  # Builds a newline-separated list of build-cache image references, one per
  # recently merged pull request, and exposes it as the `cache-from` output.
  pre-flight:
    runs-on: ubuntu-latest
    outputs:
      cache-from: ${{ steps.cache-from.outputs.LAST_PRS }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Get last merged PR
        id: cache-from
        env:
          GH_TOKEN: ${{ github.token }}
          # Handed to the script through the environment instead of being
          # expanded into the script text, so the value is never parsed as
          # shell code.
          IMAGE_NAME: ${{ inputs.image-name }}
        run: |
          # Query the 100 most recently updated merged PRs and turn each PR
          # number into a build-cache tag.
          LAST_PRS=$(gh api graphql -f query='
            query {
              repository(owner: "NVIDIA", name: "NeMo-LM") {
                pullRequests(states: MERGED, first: 100, orderBy: {field: UPDATED_AT, direction: DESC}) {
                  nodes {
                    number
                  }
                }
              }
            }' | jq -r '.data.repository.pullRequests.nodes[].number' | while read -r number; do
              echo "nemoci.azurecr.io/${IMAGE_NAME}-buildcache:$number"
            done)

          # Multi-line step output, written with the heredoc-style delimiter syntax.
          {
            echo "LAST_PRS<<EOF"
            echo "$LAST_PRS"
            echo "EOF"
          } >> "$GITHUB_OUTPUT"

  build:
    # NOTE(review): this reference looks garbled (it reads like an e-mail
    # obfuscation artefact); restore it to `<workflow file>@<ref>`.
    uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/[email protected]
    needs: [pre-flight]
    # NOTE(review): the `cache-from` output of `pre-flight` is not passed on
    # here -- confirm whether it was meant to be forwarded to the build.
    with:
      image-name: ${{ inputs.image-name }}
      dockerfile: ${{ inputs.dockerfile }}
      image-label: nemo-core
      prune-filter-timerange: 24h
      use-inline-cache: false
      runner: ${{ inputs.runner }}
      has-azure-credentials: true
    secrets:
      AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }}
      AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }}
      AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
29 changes: 29 additions & 0 deletions .github/copy-pr-bot.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# copy-pr-bot settings.
enabled: true

# Sync behaviour: draft PRs are not auto-synced, ready PRs are.
auto_sync_draft: false
auto_sync_ready: true

# Both lists below currently name the same accounts; what each list grants is
# defined by the bot, not by this file.
additional_trustees:
  - ericharper
  - ko3n1g
  - chtruong814
  - thomasdhc
  - pablo-garay
  - adil-a
  - akoumpa
  - ananthsub
  - athitten
  - bernardwin
  - hemildesai
  - maanug-nv

additional_vetters:
  - ericharper
  - ko3n1g
  - chtruong814
  - thomasdhc
  - pablo-garay
  - adil-a
  - akoumpa
  - ananthsub
  - athitten
  - bernardwin
  - hemildesai
  - maanug-nv
83 changes: 83 additions & 0 deletions .github/workflows/_build_container.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# Copyright (c) 2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Reusable workflow (triggered only through `workflow_call`).
name: "~Build container template"

on:
  workflow_call:
    inputs:
      image-name:
        description: "The name of the image to build"
        type: string
        required: true
      # No description in the original; presumably the Dockerfile to build -- confirm.
      dockerfile:
        type: string
        required: true
      runner:
        description: "The runner to use for the build"
        type: string
        required: false
        default: linux-amd64-gpu-rtxa6000-latest-2-nemo
    # Azure credentials that every caller has to supply.
    secrets:
      AZURE_CLIENT_ID:
        required: true
      AZURE_TENANT_ID:
        required: true
      AZURE_SUBSCRIPTION_ID:
        required: true

jobs:
  # Builds a newline-separated list of build-cache image references, one per
  # recently merged pull request, and exposes it as the `cache-from` output.
  pre-flight:
    runs-on: ubuntu-latest
    outputs:
      cache-from: ${{ steps.cache-from.outputs.LAST_PRS }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Get last merged PR
        id: cache-from
        env:
          GH_TOKEN: ${{ github.token }}
          # Handed to the script through the environment instead of being
          # expanded into the script text, so the value is never parsed as
          # shell code.
          IMAGE_NAME: ${{ inputs.image-name }}
        run: |
          # Query the 100 most recently updated merged PRs and turn each PR
          # number into a build-cache tag.
          LAST_PRS=$(gh api graphql -f query='
            query {
              repository(owner: "NVIDIA", name: "NeMo-LM") {
                pullRequests(states: MERGED, first: 100, orderBy: {field: UPDATED_AT, direction: DESC}) {
                  nodes {
                    number
                  }
                }
              }
            }' | jq -r '.data.repository.pullRequests.nodes[].number' | while read -r number; do
              echo "nemoci.azurecr.io/${IMAGE_NAME}-buildcache:$number"
            done)

          # Multi-line step output, written with the heredoc-style delimiter syntax.
          {
            echo "LAST_PRS<<EOF"
            echo "$LAST_PRS"
            echo "EOF"
          } >> "$GITHUB_OUTPUT"

  # Delegates the actual image build to the shared CI template.
  build:
    # NOTE(review): this reference looks garbled (it reads like an e-mail
    # obfuscation artefact); restore it to `<workflow file>@<ref>`.
    uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/[email protected]
    needs: [pre-flight]
    # NOTE(review): the `cache-from` output of `pre-flight` is not passed on
    # here -- confirm whether it was meant to be forwarded to the build.
    with:
      image-name: ${{ inputs.image-name }}
      dockerfile: ${{ inputs.dockerfile }}
      image-label: nemo-core
      prune-filter-timerange: 24h
      use-inline-cache: false
      runner: ${{ inputs.runner }}
      has-azure-credentials: true
    secrets:
      AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }}
      AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }}
      AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
152 changes: 152 additions & 0 deletions .github/workflows/cicd-approve-test-queue.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: Approve Test Queue

on:
  # Manual runs from the Actions UI / API.
  workflow_dispatch:
  # Scheduled run every 5 minutes.
  schedule:
    - cron: "*/5 * * * *"

jobs:
  # Approves waiting "CICD NeMo" runs, oldest first, until the number of
  # queued + in-progress runs reaches MAX_CONCURRENCY.
  approve-queue:
    runs-on: ubuntu-latest
    environment: main
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install requests

      - name: Approve waiting deployments
        env:
          GITHUB_TOKEN: ${{ secrets.PAT }}
          # Falls back to 1 when the repository variable is not set.
          MAX_CONCURRENCY: ${{ vars.MAX_CONCURRENCY || 1 }}
        run: |
          # Quoted delimiter: the shell passes the Python source through verbatim.
          python - <<'EOF'
          import os
          import sys

          import requests


          # GitHub API configuration
          GITHUB_TOKEN = os.environ["GITHUB_TOKEN"]
          REPO = os.environ["GITHUB_REPOSITORY"]
          MAX_CONCURRENCY = int(os.environ["MAX_CONCURRENCY"])
          API_BASE = f"https://api.github.com/repos/{REPO}"

          # Only runs of this workflow are counted and approved.
          WORKFLOW_NAME = "CICD NeMo"
          # Seconds; without a timeout a stalled connection would block the job.
          REQUEST_TIMEOUT = 30

          # Headers for GitHub API
          headers = {
              "Authorization": f"token {GITHUB_TOKEN}",
              "Accept": "application/vnd.github.v3+json",
              "X-GitHub-Api-Version": "2022-11-28",
          }


          def make_request(endpoint, method="GET", data=None):
              """Call the GitHub API.

              Returns the decoded JSON body, or None when the request fails
              (the error and any response body are printed).
              """
              url = f"{API_BASE}/{endpoint}"
              try:
                  if method == "GET":
                      response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
                  else:
                      response = requests.post(url, headers=headers, json=data, timeout=REQUEST_TIMEOUT)
                  response.raise_for_status()
                  return response.json()
              except requests.exceptions.RequestException as e:
                  print(f"Error making request to {endpoint}: {str(e)}")
                  # e.response is None for failures that never produced a response.
                  if e.response is not None:
                      print(f"Response: {e.response.text}")
                  return None


          def list_runs(status):
              """Return the WORKFLOW_NAME runs in `status`; fail the job if the listing fails."""
              payload = make_request(f"actions/runs?status={status}&per_page=100")
              if payload is None:
                  print(f"Could not list workflow runs with status '{status}'")
                  sys.exit(1)
              return [run for run in payload.get("workflow_runs", []) if run["name"] == WORKFLOW_NAME]


          # Get current running and queued workflows
          print("Fetching workflow runs...")
          queued_workflows = len(list_runs("queued"))
          in_progress_workflows = len(list_runs("in_progress"))

          total_workflows = queued_workflows + in_progress_workflows
          print(f"Current queued workflows: {queued_workflows}")
          print(f"Current running workflows: {in_progress_workflows}")
          print(f"Total workflows: {total_workflows}")
          print(f"Max concurrency: {MAX_CONCURRENCY}")

          if total_workflows >= MAX_CONCURRENCY:
              print("Maximum concurrency reached, no new approvals will be made")
              sys.exit(0)

          # Get waiting CI workflows for test environment
          print("Fetching deployments...")
          pending_workflows = list_runs("waiting")

          # Sort deployments by creation date (oldest first)
          print("Sorting workflows...")
          pending_workflows.sort(key=lambda run: run["created_at"])

          # Process each deployment
          print("Processing ...")
          for workflow in pending_workflows:
              if total_workflows >= MAX_CONCURRENCY:
                  print("Maximum concurrency reached, stopping approvals")
                  break

              workflow_id = workflow["id"]
              workflow_name = workflow["display_title"]
              print(f"Approving workflow {workflow_name} with Run Id: {workflow_id}")

              deployment_url = f"actions/runs/{workflow_id}/pending_deployments"
              deployments = make_request(deployment_url)
              if deployments is None:
                  print(f"Failed to fetch pending deployments for run {workflow_id}")
                  sys.exit(1)
              if not deployments:
                  # Nothing left to approve for this run; move on to the next one.
                  print(f"No pending deployments for run {workflow_id}, skipping")
                  continue
              environment_id = deployments[0]["environment"]["id"]

              # Approve the deployment
              status_data = {
                  "environment_ids": [environment_id],
                  "state": "approved",
                  "comment": "Automatically approved by queue manager",
              }
              result = make_request(deployment_url, method="POST", data=status_data)

              if result:
                  total_workflows += 1
              else:
                  print(f"Failed to approve deployment for run {workflow_id} (environment {environment_id})")
                  sys.exit(1)

          EOF

  # Posts a Slack message when the approval job fails.
  notify:
    if: failure()
    runs-on: ubuntu-latest
    needs: [approve-queue]
    steps:
      - name: Notify
        env:
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
          SLACK_WEBHOOK_ADMIN: "<!subteam^${{ secrets.SLACK_WEBHOOK_ADMIN }}>"
          GITHUB_RUN_ID: ${{ github.run_id }}
          GITHUB_REPOSITORY: ${{ github.repository }}
        run: |
          curl -X POST \
            -H 'Content-type: application/json' \
            --data "{\"text\":\":robot_joy: <https://github.com/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}|Test-queue-approval-bot workflow> failed. Please review manually.\n\ncc ${SLACK_WEBHOOK_ADMIN}\"}" \
            "$SLACK_WEBHOOK"
Loading