2 changes: 2 additions & 0 deletions jenkins/L0_MergeRequest.groovy
@@ -719,6 +719,7 @@ def getMultiGpuFileChanged(pipeline, testFilter, globalVars)
"tensorrt_llm/_torch/pyexecutor/_util.py",
"tensorrt_llm/_torch/pyexecutor/model_engine.py",
"tensorrt_llm/_torch/pyexecutor/py_executor.py",
"tensorrt_llm/_torch/auto_deploy/transform/library/sharding.py",
"tensorrt_llm/evaluate/json_mode_eval.py",
"tensorrt_llm/evaluate/mmlu.py",
"tensorrt_llm/executor/",
@@ -740,6 +741,7 @@ def getMultiGpuFileChanged(pipeline, testFilter, globalVars)
"tests/integration/defs/accuracy/test_disaggregated_serving.py",
"tests/unittest/_torch/ray_orchestrator/multi_gpu/",
"tests/integration/defs/examples/test_ray.py",
"tests/integration/defs/accuracy/test_llm_api_autodeploy.py",
"tests/unittest/llmapi/test_async_llm.py",
]

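Note (not part of the diff): `getMultiGpuFileChanged` appears to gate the multi-GPU CI stage on whether any changed file matches one of the listed paths or directory prefixes, which is why the new `sharding.py` transform and the autodeploy accuracy test are added to the list. A rough Python illustration of that kind of prefix matching follows; it is purely hypothetical and not the actual Groovy implementation in `jenkins/L0_MergeRequest.groovy`.

```python
# Hypothetical illustration only -- the real logic is Groovy in
# jenkins/L0_MergeRequest.groovy; names and behavior here are assumptions.
MULTI_GPU_TRIGGER_PATHS = [
    "tensorrt_llm/_torch/auto_deploy/transform/library/sharding.py",
    "tests/integration/defs/accuracy/test_llm_api_autodeploy.py",
    "tensorrt_llm/executor/",  # trailing slash treated as a directory prefix
]


def touches_multi_gpu(changed_files: list[str]) -> bool:
    """Return True if any changed file matches a trigger path or prefix."""
    return any(
        changed == trigger or (trigger.endswith("/") and changed.startswith(trigger))
        for changed in changed_files
        for trigger in MULTI_GPU_TRIGGER_PATHS
    )


print(touches_multi_gpu(["tests/integration/defs/accuracy/test_llm_api_autodeploy.py"]))  # True
```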
12 changes: 7 additions & 5 deletions tensorrt_llm/_torch/auto_deploy/transform/library/sharding.py
@@ -267,7 +267,7 @@ class WeightShardingInfo(ShardingTransformInfo):
min_local_shape: int = 1
layer_type: LayerType = LayerType.MLP
# used for TP sharding of fused weights
fused_weight_dims: Optional[list] = None
fused_weight_dims: Optional[tuple] = None

def quantization_cb(
self,
@@ -1316,7 +1316,7 @@ def _shard_parameter_node(
config: ShardingTransformConfig,
add_dist: bool = False,
min_local_shape: int = 1,
fused_weight_dims: Optional[list] = None,
fused_weight_dims: Optional[tuple] = None,
quantization_cb: Optional[
Callable[[GraphModule, nn.Module, Node, str, torch.Size, int, int, int], None]
] = None,
@@ -1835,7 +1835,7 @@ def _process_ssm_sharding(
config=config,
dist_op=None,
min_local_shape=1,
fused_weight_dims=fused_weight_dims["in_proj"],
fused_weight_dims=tuple(fused_weight_dims["in_proj"]),
layer_type=LayerType.SSM,
)
):
@@ -1904,7 +1904,7 @@ def _process_ssm_sharding(
fused_dims = None
for k, v in fused_weight_dims.items():
if k in weight_key:
fused_dims = v
fused_dims = tuple(v)
break

# Shard the weight tensor (also updates the parameter in the module)
@@ -2089,7 +2089,7 @@ def _determine_fused_weight_dims(
ad_logger.warning(
f"Fused weight dims {fused_weight_dims} do not sum to weight dim {weight_dim}. Skipping."
)
return
return None
chunk_nodes = list(filtered_nodes(linear_node.users, ops=torch.ops.aten.chunk))
if len(chunk_nodes) > 0:
assert len(linear_nodes) == 1
@@ -2098,6 +2098,8 @@ def _determine_fused_weight_dims(
num_chunks = chunk_nodes[0].args[1]
weight_dim = shape(linear_node)[2]
fused_weight_dims = [weight_dim // num_chunks] * num_chunks
if fused_weight_dims is not None:
fused_weight_dims = tuple(fused_weight_dims)
return fused_weight_dims


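Review note: the `list` -> `tuple` switch for `fused_weight_dims` is applied consistently to the dataclass field, the `_shard_parameter_node` signature, every call site, and the normalized return value of `_determine_fused_weight_dims`. One plausible motivation (not stated in the diff) is that tuples are immutable and hashable, so transform-info objects carrying them can be compared, cached, or deduplicated safely. A minimal, self-contained sketch of that property, using an illustrative stand-in class rather than the real `WeightShardingInfo`:

```python
from dataclasses import dataclass
from typing import Optional, Tuple


# Hypothetical stand-in for a sharding transform-info dataclass; field names
# are illustrative only.
@dataclass(frozen=True)
class FusedWeightSpec:
    # A tuple field keeps the dataclass hashable; a list field would raise
    # "TypeError: unhashable type: 'list'" as soon as an instance is hashed.
    fused_weight_dims: Optional[Tuple[int, ...]] = None


spec_a = FusedWeightSpec(fused_weight_dims=(1024, 1024, 512))
spec_b = FusedWeightSpec(fused_weight_dims=(1024, 1024, 512))

assert spec_a == spec_b                # value equality still works
assert hash(spec_a) == hash(spec_b)    # and instances can live in sets/dicts
print({spec_a, spec_b})                # deduplicates to a single entry
```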
4 changes: 3 additions & 1 deletion tests/integration/defs/accuracy/test_llm_api_autodeploy.py
@@ -218,11 +218,13 @@ def test_bf16(self):
task.evaluate(llm)

@pytest.mark.skip_less_device_memory(32000)
def test_fp8(self):
@pytest.mark.parametrize("world_size", [1, 4])
def test_fp8(self, world_size):
kwargs = self.get_default_kwargs()
sampling_params = self.get_default_sampling_params()
with AutoDeployLLM(model=self.MODEL_PATH_FP8,
tokenizer=self.MODEL_PATH_FP8,
world_size=world_size,
**kwargs) as llm:
# Manually set quant_config for FP8 model to get the accuracy threshold
llm.args.quant_config.quant_algo = QuantAlgo.FP8
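The new `@pytest.mark.parametrize("world_size", [1, 4])` decorator makes pytest collect two variants of the test, with the parameter value embedded in the test ID; that is what the `test_fp8[1]` / `test_fp8[4]` entries in the test-db files below refer to. A minimal sketch of the ID behavior (illustrative only, not the real test body, which constructs an `AutoDeployLLM`):

```python
import pytest


# Minimal reproduction of how parametrization changes collected test IDs.
@pytest.mark.parametrize("world_size", [1, 4])
def test_fp8(world_size):
    # pytest collects this as test_fp8[1] and test_fp8[4], which is why the
    # l0_b200.yml / l0_h100.yml entries now carry the [1] / [4] suffixes.
    assert world_size in (1, 4)
```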
3 changes: 2 additions & 1 deletion tests/integration/test_lists/test-db/l0_b200.yml
@@ -95,7 +95,8 @@ l0_b200:
- unittest/_torch/modules/test_fused_moe.py::test_fused_moe_fp8_blockwise_deepgemm[enable_configurable_moe-dtype1-72-256-2560-DefaultMoeRoutingMethod]
# ------------- AutoDeploy tests ---------------
- accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-1]
- accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_fp8
- accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_fp8[1]
- accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_fp8[4]
- unittest/_torch/auto_deploy/unit/singlegpu
- condition:
ranges:
3 changes: 2 additions & 1 deletion tests/integration/test_lists/test-db/l0_h100.yml
@@ -117,7 +117,8 @@ l0_h100:
- accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[True-1]
- accuracy/test_llm_api_autodeploy.py::TestNemotronH::test_auto_dtype[False]
- accuracy/test_llm_api_autodeploy.py::TestNemotronH::test_auto_dtype[True]
- accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_fp8
- accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_fp8[1]
- accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_fp8[4]
- accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16
- examples/test_ad_speculative_decoding.py::test_autodeploy_spec_dec_output[draft_target]
- examples/test_ad_speculative_decoding.py::test_autodeploy_spec_dec_output[eagle3]