|
17 | 17 | from abc import ABC, abstractmethod
|
18 | 18 | from enum import IntEnum
|
19 | 19 | from typing import Dict
|
20 |
| - |
21 | 20 | import pytest
|
22 | 21 | import torch
|
23 | 22 | from cuda.bindings import runtime
|
@@ -1839,7 +1838,7 @@ def cache_permute_indices():
|
1839 | 1838 |
|
1840 | 1839 | @pytest.mark.parametrize("num_tokens", [1, 8, 1024])
|
1841 | 1840 | @pytest.mark.parametrize("hidden_size", [1024, 8192])
|
1842 |
| -@pytest.mark.parametrize("intermediate_size", [2048, 1024, 768, 384]) |
| 1841 | +@pytest.mark.parametrize("intermediate_size", [384, 768, 1024, 2048]) |
1843 | 1842 | @pytest.mark.parametrize(
|
1844 | 1843 | "moe_impl",
|
1845 | 1844 | [
|
@@ -2244,35 +2243,3 @@ def test_moe_quantization_classes(
|
2244 | 2243 | rtol=tolerances["rtol"],
|
2245 | 2244 | percent=tolerances["percent"],
|
2246 | 2245 | )
|
2247 |
| - |
2248 |
| - |
2249 |
| -if __name__ == "__main__": |
2250 |
| - # pytest.main([__file__, "-v"]) |
2251 |
| - routing_config = { |
2252 |
| - "num_experts": 256, |
2253 |
| - "top_k": 8, |
2254 |
| - "padding": 8, |
2255 |
| - "n_groups": 8, |
2256 |
| - "top_k_groups": 4, |
2257 |
| - "routed_scaling": 2.5, |
2258 |
| - "has_routing_bias": True, |
2259 |
| - "routing_method_type": RoutingMethodType.DeepSeekV3, |
2260 |
| - "compatible_moe_impls": [ |
2261 |
| - FP8BlockScaleMoe, |
2262 |
| - ], |
2263 |
| - } |
2264 |
| - weight_processing = { |
2265 |
| - "use_shuffled_weight": False, |
2266 |
| - "layout": WeightLayout.MajorK, |
2267 |
| - "compatible_moe_impls": [FP8BlockScaleMoe], |
2268 |
| - } |
2269 |
| - test_moe_quantization_classes( |
2270 |
| - num_tokens=4, |
2271 |
| - hidden_size=1024, |
2272 |
| - intermediate_size=1024, |
2273 |
| - moe_impl=FP8BlockScaleMoe(), |
2274 |
| - routing_config=routing_config, |
2275 |
| - weight_processing=weight_processing, |
2276 |
| - gated_act_type=GatedActType.SwiGlu, |
2277 |
| - cache_permute_indices=cache_permute_indices, |
2278 |
| - ) |
0 commit comments