
Commit ab4e6d9

Merge branch 'main' into cjluo-nv-patch-1
2 parents: 501d8f0 + b233ad1


7 files changed (+108, -49 lines)


.github/CODEOWNERS

Lines changed: 21 additions & 21 deletions
@@ -32,24 +32,24 @@ modelopt/torch/utils @NVIDIA/modelopt-torch-utils-codeowners
 # Examples
 /docker @NVIDIA/modelopt-docker-codeowners
 /README.md @NVIDIA/modelopt-examples-codeowners
-examples @NVIDIA/modelopt-examples-codeowners
-examples/chained_optimizations @NVIDIA/modelopt-torch-nas-prune-codeowners
-examples/cnn_qat @NVIDIA/modelopt-examples-cnn_qat-codeowners
-examples/deepseek @NVIDIA/modelopt-deploy-codeowners
-examples/diffusers @NVIDIA/modelopt-examples-diffusers-codeowners
-examples/gpt-oss @NVIDIA/modelopt-examples-gpt-oss-codeowners
-examples/llm_autodeploy @NVIDIA/modelopt-deploy-codeowners
-examples/llm_distill @NVIDIA/modelopt-torch-distill-codeowners
-examples/llm_eval @NVIDIA/modelopt-examples-llm_ptq-codeowners
-examples/llm_ptq @NVIDIA/modelopt-examples-llm_ptq-codeowners
-examples/llm_qat @NVIDIA/modelopt-examples-llm_qat-codeowners
-examples/llm_sparsity @NVIDIA/modelopt-torch-sparsity-codeowners
-examples/megatron-lm @NVIDIA/modelopt-examples-megatron-codeowners
-examples/model_hub @NVIDIA/modelopt-examples-model_hub-codeowners
-examples/nemo_run @NVIDIA/modelopt-examples-megatron-codeowners
-examples/onnx_ptq @NVIDIA/modelopt-onnx-codeowners
-examples/pruning @NVIDIA/modelopt-torch-nas-prune-codeowners
-examples/speculative_decoding @NVIDIA/modelopt-torch-speculative-codeowners
-examples/vlm_eval @NVIDIA/modelopt-examples-vlm-codeowners
-examples/vlm_ptq @NVIDIA/modelopt-examples-vlm-codeowners
-examples/windows @NVIDIA/modelopt-windows-codeowners
+/examples @NVIDIA/modelopt-examples-codeowners
+/examples/chained_optimizations @NVIDIA/modelopt-torch-nas-prune-codeowners
+/examples/cnn_qat @NVIDIA/modelopt-examples-cnn_qat-codeowners
+/examples/deepseek @NVIDIA/modelopt-deploy-codeowners
+/examples/diffusers @NVIDIA/modelopt-examples-diffusers-codeowners
+/examples/gpt-oss @NVIDIA/modelopt-examples-gpt-oss-codeowners
+/examples/llm_autodeploy @NVIDIA/modelopt-deploy-codeowners
+/examples/llm_distill @NVIDIA/modelopt-torch-distill-codeowners
+/examples/llm_eval @NVIDIA/modelopt-examples-llm_ptq-codeowners
+/examples/llm_ptq @NVIDIA/modelopt-examples-llm_ptq-codeowners
+/examples/llm_qat @NVIDIA/modelopt-examples-llm_qat-codeowners
+/examples/llm_sparsity @NVIDIA/modelopt-torch-sparsity-codeowners
+/examples/megatron-lm @NVIDIA/modelopt-examples-megatron-codeowners
+/examples/model_hub @NVIDIA/modelopt-examples-model_hub-codeowners
+/examples/nemo_run @NVIDIA/modelopt-examples-megatron-codeowners
+/examples/onnx_ptq @NVIDIA/modelopt-onnx-codeowners
+/examples/pruning @NVIDIA/modelopt-torch-nas-prune-codeowners
+/examples/speculative_decoding @NVIDIA/modelopt-torch-speculative-codeowners
+/examples/vlm_eval @NVIDIA/modelopt-examples-vlm-codeowners
+/examples/vlm_ptq @NVIDIA/modelopt-examples-vlm-codeowners
+/examples/windows @NVIDIA/modelopt-windows-codeowners
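Note: the only change in this file is the leading "/" on each examples entry. CODEOWNERS uses gitignore-style matching, so a bare "examples" pattern can match a directory of that name at any depth, while "/examples" is anchored to the repository root. A rough Python illustration of the difference (simplified sketch, not GitHub's actual matcher and not part of this commit):

    def owns(pattern: str, path: str) -> bool:
        # Simplified CODEOWNERS-style matching: a leading "/" anchors the pattern
        # to the repository root; without it, the pattern may match at any depth.
        if pattern.startswith("/"):
            root = pattern[1:]
            return path == root or path.startswith(root + "/")
        parts = path.split("/")
        suffixes = ("/".join(parts[i:]) for i in range(len(parts)))
        return any(s == pattern or s.startswith(pattern + "/") for s in suffixes)

    assert owns("examples", "docs/examples/readme.md")       # unanchored: matches anywhere
    assert not owns("/examples", "docs/examples/readme.md")  # anchored: repository root only
    assert owns("/examples/llm_ptq", "examples/llm_ptq/script.py")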

.github/workflows/gpu_tests.yml

Lines changed: 21 additions & 1 deletion
@@ -22,20 +22,31 @@ jobs:
       any_changed: ${{ steps.changed-tests.outputs.any_changed }}
     steps:
       - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
       - id: get-pr-info
         uses: nv-gha-runners/get-pr-info@main
+      # Get commit from main branch that is present in the PR to use as base for changed files
+      - id: calculate-merge-base
+        env:
+          PR_SHA: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.sha }}
+          BASE_SHA: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}
+        run: |
+          (echo -n "merge-base="; git merge-base "$BASE_SHA" "$PR_SHA") | tee --append "${GITHUB_OUTPUT}"
       - name: Check for changes in test-relevant directories
         id: changed-tests
         uses: step-security/[email protected]
         with:
+          base_sha: ${{ steps.calculate-merge-base.outputs.merge-base }}
+          sha: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.sha }}
           files: |
            .github/workflows/gpu_tests.yml
            modelopt/**
            tests/gpu/**
            tox.ini
            pyproject.toml
            setup.py
-          base_sha: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.ref }}
+          fail_on_initial_diff_error: true
   wait-checks:
     needs: [check-file-changes]
     if: needs.check-file-changes.outputs.any_changed == 'true'
@@ -70,3 +81,12 @@ jobs:
     timeout-minutes: 90
     container: *gpu_container
     steps: *gpu_steps
+  gpu-pr-required-check:
+    # Run even if gpu-tests-pr is skipped
+    if: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && always() }}
+    needs: [check-file-changes, gpu-tests-pr]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Required GPU tests did not succeed
+        if: ${{ needs.check-file-changes.result != 'success' || (needs.check-file-changes.outputs.any_changed == 'true' && needs.gpu-tests-pr.result != 'success') }}
+        run: exit 1
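Two behavioral changes in this workflow: the changed-files step now diffs against the merge-base of the PR head and its base branch (computed from full history via fetch-depth: 0) instead of the base branch ref, and the new gpu-pr-required-check job gives branch protection a single status that still passes when the GPU tests are legitimately skipped. A standalone Python sketch of that gating predicate (illustration only, not part of the workflow):

    def gpu_required_check_fails(check_result: str, any_changed: bool, gpu_result: str) -> bool:
        # Mirrors the `if:` condition on the failing step: fail when change detection
        # itself failed, or when GPU-relevant files changed but the tests did not succeed.
        return check_result != "success" or (any_changed and gpu_result != "success")

    assert not gpu_required_check_fails("success", False, "skipped")  # nothing relevant changed
    assert not gpu_required_check_fails("success", True, "success")   # tests ran and passed
    assert gpu_required_check_fails("success", True, "failure")       # tests ran and failed
    assert gpu_required_check_fails("failure", False, "skipped")      # change detection broke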

.github/workflows/unit_tests.yml

Lines changed: 6 additions & 7 deletions
@@ -4,13 +4,6 @@ name: Unit tests
 on:
   pull_request:
     branches: [main, release/*]
-    paths:
-      - ".github/workflows/unit_tests.yml"
-      - "modelopt/**"
-      - "tests/unit/**"
-      - "pyproject.toml"
-      - "setup.py"
-      - "tox.ini"
   push:
     branches: [main, release/*]
     paths:
@@ -126,3 +119,9 @@ jobs:
           python-version: "3.12"
       - name: Run unit tests
         run: pip install tox && tox -e py312-partial-unit-${{ matrix.test-env }}
+  unit-pr-required-check:
+    if: github.event_name == 'pull_request'
+    needs: [linux, windows, multi-py, multi-torch, multi-transformers, partial-install]
+    runs-on: ubuntu-latest
+    steps:
+      - run: echo "All PR unit test jobs completed"
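With the paths filter removed from the pull_request trigger, the workflow runs on every PR, and unit-pr-required-check becomes the single job to mark as required in branch protection. Because it lists all test jobs in needs: and has no always() condition, it only runs once every one of them has succeeded. A small Python sketch of that needs semantics (illustration only, not part of the workflow):

    def aggregator_job_runs(needed_results: dict[str, str]) -> bool:
        # A job with `needs:` and no `if: always()` is skipped unless every needed
        # job succeeded, so a trivial echo step is enough to serve as the single
        # required status check.
        return all(result == "success" for result in needed_results.values())

    assert aggregator_job_runs({"linux": "success", "windows": "success"})
    assert not aggregator_job_runs({"linux": "success", "windows": "failure"})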

modelopt/onnx/quantization/qdq_utils.py

Lines changed: 9 additions & 4 deletions
@@ -790,8 +790,10 @@ def remove_input_dq_and_output_q(
                 if cons_idx in quantizable_custom_ops[consumer.op_type]["inp"]:
                     consumer.input[cons_idx] = q_node.output[0]
                 else:
-                    q_node_prev = tensor_producers[q_node.input[0]]
-                    consumer.input[cons_idx] = q_node_prev.output[0]
+                    q_node_prev = tensor_producers.get(q_node.input[0], None)
+                    consumer.input[cons_idx] = (
+                        q_node_prev.output[0] if q_node_prev else q_node.input[0]
+                    )
                 break
 
     # Track DequantizeLinear node indices for cleanup
@@ -828,8 +830,11 @@ def remove_input_dq_and_output_q(
             if quantizable_custom_ops[producer.op_type]["out"]:
                 dq_node[0].input[0] = producer.output[0]
             else:
-                dq_node_next = tensor_consumers[dq_node[0].output[0]]
-                dq_node_next[0].input[0] = producer.output[0]
+                dq_node_next = tensor_consumers.get(dq_node[0].output[0], None)
+                if dq_node_next:
+                    dq_node_next[0].input[0] = producer.output[0]
+                else:
+                    dq_node[0].input[0] = producer.output[0]
 
     # Track QuantizeLinear node indices for cleanup
     q_indices.append(node_idx)
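Both hunks swap direct dictionary indexing for .get() with a fallback, so Q/DQ removal no longer raises a KeyError when a QuantizeLinear input has no producer node (e.g. it is a graph input) or a DequantizeLinear output has no consumer (e.g. it is a graph output). A minimal sketch of the pattern with hypothetical names, not the module's real data structures:

    # Hypothetical producer map: tensor name -> name of the node that produces it.
    tensor_producers = {"conv_out": "conv_node"}

    def resolve_rewire_target(tensor_name: str) -> str:
        # Fall back to the tensor itself when it has no producer (a graph input),
        # instead of raising KeyError as plain indexing would.
        producer = tensor_producers.get(tensor_name, None)
        return producer if producer else tensor_name

    assert resolve_rewire_target("conv_out") == "conv_node"       # rewire to the producer
    assert resolve_rewire_target("model_input") == "model_input"  # graph input: keep as-is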

modelopt/onnx/trt_utils.py

Lines changed: 5 additions & 4 deletions
@@ -416,9 +416,10 @@ def interpret_trt_plugins_precision_flag(
             # Will add Q/DQ nodes in the requested I/O indices
             inp_precision_quant = [i for i, p in enumerate(inp_precision) if p in ["int8", "fp8"]]
             out_precision_quant = [i for i, p in enumerate(out_precision) if p in ["int8", "fp8"]]
-            custom_ops_to_quantize[op_type] = {
-                "inp": inp_precision_quant,
-                "out": out_precision_quant,
-            }
+            if inp_precision_quant or out_precision_quant:
+                custom_ops_to_quantize[op_type] = {
+                    "inp": inp_precision_quant,
+                    "out": out_precision_quant,
+                }
 
     return custom_ops_to_cast, custom_ops_to_quantize
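The new guard only registers a custom op for quantization when at least one input or output was requested in int8/fp8, instead of recording entries with two empty index lists. A self-contained sketch of the filtering step with made-up plugin data (illustration only):

    requested = {
        "MyPlugin": {"inp": ["fp16", "fp8"], "out": ["fp16"]},
        "OtherPlugin": {"inp": ["fp16"], "out": ["fp32"]},
    }

    custom_ops_to_quantize = {}
    for op_type, precisions in requested.items():
        inp_quant = [i for i, p in enumerate(precisions["inp"]) if p in ["int8", "fp8"]]
        out_quant = [i for i, p in enumerate(precisions["out"]) if p in ["int8", "fp8"]]
        # Only record the op if there is at least one I/O index to quantize.
        if inp_quant or out_quant:
            custom_ops_to_quantize[op_type] = {"inp": inp_quant, "out": out_quant}

    assert custom_ops_to_quantize == {"MyPlugin": {"inp": [1], "out": []}}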

modelopt/torch/speculative/plugins/megatron_eagle.py

Lines changed: 4 additions & 2 deletions
@@ -524,9 +524,11 @@ def __init__(
         if self._num_aux_hidden_states > 0:
             # Register forward hook to the last EAGLE3 layer to extract the pre-norm hidden_state
             # for eagle3 auto regression.
-            layer = self.decoder.layers[-1]
-            layer.register_forward_hook(self._eagle3_layer_forward_hook)
+            last_layer = self.decoder.layers[-1]
+            last_layer.register_forward_hook(self._eagle3_layer_forward_hook)
 
+            # The first EAGLE3 layer needs to be specialized.
+            layer = self.decoder.layers[0]
             self_attention = layer.self_attention
             if not isinstance(self_attention, SelfAttention):
                 raise ValueError("EAGLE-3 only support SelfAttention (MHA, GQA).")
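The fix keeps the forward hook on the last EAGLE-3 decoder layer (to capture the pre-norm hidden state) while the attention specialization that follows now targets the first layer instead of the last. A generic PyTorch sketch of capturing a module's output with a forward hook, independent of the Megatron classes used here:

    import torch
    import torch.nn as nn

    captured = {}

    def capture_hook(module, inputs, output):
        # Stash the layer's output so a later step can reuse it.
        captured["hidden_state"] = output

    layers = nn.ModuleList([nn.Linear(8, 8) for _ in range(3)])
    layers[-1].register_forward_hook(capture_hook)  # hook only the last layer

    x = torch.randn(2, 8)
    for layer in layers:
        x = layer(x)

    assert torch.equal(captured["hidden_state"], x)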

tests/gpu/torch/speculative/plugins/test_speculative_megatron_modules.py

Lines changed: 42 additions & 10 deletions
@@ -32,6 +32,12 @@
 from modelopt.torch.speculative.plugins.megatron_medusa import _DynamicMedusaGPTModel
 from modelopt.torch.speculative.utils import Tree, get_default_attention_mask_and_position_ids
 
+ALGO_TO_CONFIG = {
+    "eagle1": mtsp.config.EAGLE1_DEFAULT_CFG,
+    "eagle3": mtsp.config.EAGLE3_DEFAULT_CFG,
+    "eagle-mtp": mtsp.config.EAGLE_MTP_DEFAULT_CFG,
+}
+
 
 def _test_speculative_gpt_model(
     algo, num_medusa_heads_or_eagle_layers, activation_func, normalization, rank, size
@@ -64,18 +70,42 @@ def _test_speculative_gpt_model(
 
         # Type checking
         assert isinstance(model, _DynamicMedusaGPTModel)
-    elif algo == "eagle":
-        config = {"eagle_architecture_config": deepcopy(default_eagle_config)}
-        config["eagle_architecture_config"]["hidden_size"] = model.config.hidden_size
-        config["eagle_architecture_config"]["vocab_size"] = model.vocab_size
-        config["eagle_architecture_config"]["draft_vocab_size"] = model.vocab_size
+    elif algo in {"eagle1", "eagle3"}:
+        mtsp_config = ALGO_TO_CONFIG[algo]
+
+        mtsp_config["config"]["eagle_architecture_config"]["num_hidden_layers"] = (
+            num_medusa_heads_or_eagle_layers
+        )
+        mtsp_config["config"]["eagle_architecture_config"]["hidden_size"] = model.config.hidden_size
+        mtsp_config["config"]["eagle_architecture_config"]["vocab_size"] = model.vocab_size
+        mtsp_config["config"]["eagle_architecture_config"]["draft_vocab_size"] = model.vocab_size
 
-        model = mtsp.convert(model, [("eagle", config)])
+        model = mtsp.convert(model, mtsp_config)
 
         # Type checking
         assert isinstance(model, _DynamicEagleGPTModel)
     else:
-        raise ValueError("Only algo={eagle, medusa} are supported!")
+        raise ValueError("Only algo={eagle1, eagle3, medusa} are supported!")
+
+    if algo == "eagle3":
+        first_layer = model.eagle_module.decoder.layers[0]
+        last_layer = model.eagle_module.decoder.layers[-1]
+        # Eagle3 QKV input_dim is 2x of hidden_size
+        assert (
+            first_layer.self_attention.linear_qkv.weight.shape[-1] == model.config.hidden_size * 2
+        )
+        # Eagle3 attention has a forward_pre_hook to handle additional features to be concatenated
+        assert len(first_layer.self_attention._forward_pre_hooks) > 0
+        # Eagle3 last layer has a forward hook to extrat the pre_norm hidden_state
+        assert len(last_layer._forward_hooks) > 0
+    elif algo == "eagle1":
+        first_layer = model.eagle_module.decoder.layers[0]
+        last_layer = model.eagle_module.decoder.layers[-1]
+        # Eagle1 QKV input_dim the same as hidden_size
+        assert first_layer.self_attention.linear_qkv.weight.shape[-1] == model.config.hidden_size
+        # No forward_hook or forward_pre_hook are needed
+        assert len(first_layer.self_attention._forward_pre_hooks) == 0
+        assert len(last_layer._forward_hooks) == 0
 
     # Bfloat16
     model = model.to(torch.bfloat16)
@@ -104,7 +134,7 @@ def _test_speculative_gpt_model(
 
         assert medusa_loss.shape[0] == batch_size
         assert medusa_loss.shape[1] == max_sequence_length
-    elif algo == "eagle":
+    elif algo in {"eagle1", "eagle3"}:
         labels = torch.randint(0, vocab_size, (batch_size, max_sequence_length)).cuda()
         eagle_loss = model(prompt_tokens, position_ids, attention_mask, labels=labels)
 
@@ -115,8 +145,10 @@ def _test_speculative_gpt_model(
 @pytest.mark.parametrize(
     ("algo", "num_medusa_heads_or_eagle_layers", "activation_func", "normalization"),
     [
-        ("eagle", 1, "squared_relu", "LayerNorm"),  # MHA
-        ("eagle", 2, "swiglu", "RMSNorm"),  # GQA
+        ("eagle1", 1, "squared_relu", "LayerNorm"),  # MHA
+        ("eagle1", 2, "swiglu", "RMSNorm"),  # GQA
+        ("eagle3", 1, "swiglu", "RMSNorm"),  # GQA
+        ("eagle3", 2, "swiglu", "RMSNorm"),  # GQA
         ("medusa", 1, "squared_relu", "LayerNorm"),  # MHA
         ("medusa", 2, "swiglu", "RMSNorm"),  # GQA
     ],
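The test now selects a default conversion config from ALGO_TO_CONFIG and overrides the nested "config" -> "eagle_architecture_config" fields before calling mtsp.convert. A minimal, self-contained sketch of that override pattern with a made-up default (copying first so the shared default stays untouched; illustration only, not the commit's code):

    from copy import deepcopy

    # Made-up stand-in for a default conversion config such as EAGLE3_DEFAULT_CFG.
    DEFAULT_CFG = {
        "algorithm": "eagle",
        "config": {"eagle_architecture_config": {"num_hidden_layers": 1, "hidden_size": 0}},
    }

    def build_cfg(num_layers: int, hidden_size: int) -> dict:
        cfg = deepcopy(DEFAULT_CFG)  # copy so repeated test cases do not share mutations
        arch = cfg["config"]["eagle_architecture_config"]
        arch["num_hidden_layers"] = num_layers
        arch["hidden_size"] = hidden_size
        return cfg

    cfg = build_cfg(num_layers=2, hidden_size=256)
    assert cfg["config"]["eagle_architecture_config"]["hidden_size"] == 256
    assert DEFAULT_CFG["config"]["eagle_architecture_config"]["hidden_size"] == 0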
