Commit 153e376

rebase bug fixes
Signed-off-by: Kinjal Patel <[email protected]>
1 parent 169677c commit 153e376

File tree: 2 files changed (+0, −180 lines)


tests/_test_utils/torch_quantization/quantize_common.py

Lines changed: 0 additions & 97 deletions
@@ -210,103 +210,6 @@ def forward_loop(model):
     )
 
 
-@patch("modelopt.torch.quantization.model_calib.awq_lite", side_effect=_debug_awq_lite)
-def dp_cp_parallel_test_helper(model, config, group, mock_awq_lite):
-    calib_data = model.get_dummy_input().cuda()
-
-    def forward_loop(model):
-        model(calib_data)
-
-    model = mtq.quantize(model, config, forward_loop)
-
-    # Sanity check
-    forward_loop(model)
-
-    # Input quantizer amax
-    if config not in [mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG, mtq.INT4_AWQ_CFG]:
-        _reduce_quantizer_attr(model.fc1.input_quantizer, "amax", dist.ReduceOp.MAX, group=group)
-        _reduce_quantizer_attr(model.fc2.input_quantizer, "amax", dist.ReduceOp.MAX, group=group)
-
-    # Weight quantizer amax
-    if isinstance(model.fc1.weight_quantizer, SequentialQuantizer):
-        for quantizer in model.fc1.weight_quantizer:
-            _reduce_quantizer_attr(quantizer, "amax", dist.ReduceOp.MAX, group=group)
-    else:
-        _reduce_quantizer_attr(model.fc1.weight_quantizer, "amax", dist.ReduceOp.MAX, group=group)
-    if isinstance(model.fc2.weight_quantizer, SequentialQuantizer):
-        for quantizer in model.fc2.weight_quantizer:
-            _reduce_quantizer_attr(quantizer, "amax", dist.ReduceOp.MAX, group=group)
-    else:
-        _reduce_quantizer_attr(model.fc2.weight_quantizer, "amax", dist.ReduceOp.MAX, group=group)
-
-    if config in [mtq.INT4_AWQ_CFG, mtq.W4A8_AWQ_BETA_CFG]:
-        # Check act scale
-        _reduce_quantizer_attr(
-            model.fc1.awq_lite,
-            "act_scale",
-            dist.ReduceOp.AVG,
-            group=group,
-        )
-        _reduce_quantizer_attr(
-            model.fc2.awq_lite,
-            "act_scale",
-            dist.ReduceOp.AVG,
-            group=group,
-        )
-
-
-@patch("modelopt.torch.quantization.model_calib.awq_lite", side_effect=_debug_awq_lite)
-def data_tensor_context_parallel_test_helper(model, config, dp_group, tp_group, mock_awq_lite):
-    # Calib data should be same across each DP rank
-    dp_rank = dist.get_rank(group=dp_group)
-    calib_data = model.get_dummy_input(seed=dp_rank).cuda()
-
-    def forward_loop(model):
-        model(calib_data)
-
-    model = mtq.quantize(model, config, forward_loop)
-
-    def _reduce_quantizer_attr(quantizer, attr=str, op=dist.ReduceOp.MAX):
-        quantizer_attr = getattr(quantizer, attr).clone()
-
-        # Perform all-reduce operations
-        dist.all_reduce(quantizer_attr, op=op, group=tp_group)
-
-        dist.all_reduce(quantizer_attr, op=op, group=dp_group)
-
-        assert torch.allclose(quantizer_attr, getattr(quantizer, attr)), getattr(quantizer, attr)
-
-    # Input quantizer amax
-    if config not in [mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG, mtq.INT4_AWQ_CFG]:
-        _reduce_quantizer_attr(model.fc1.input_quantizer, "amax", dist.ReduceOp.MAX)
-        _reduce_quantizer_attr(model.fc2.input_quantizer, "amax", dist.ReduceOp.MAX)
-
-    # Per-tensor quantization (FP8/NVFP4) expects same amax across row and column parallel ranks
-    # Channel-wise (INT8) only expects same amax across row parallel ranks
-    # Block-wise quantization does not expect same amax across row and column parallel ranks
-    if config in [mtq.FP8_DEFAULT_CFG, mtq.NVFP4_DEFAULT_CFG]:
-        if isinstance(model.fc1.weight_quantizer, SequentialQuantizer):
-            for quantizer in model.fc1.weight_quantizer:
-                _reduce_quantizer_attr(quantizer, "amax", dist.ReduceOp.MAX)
-        else:
-            _reduce_quantizer_attr(model.fc1.weight_quantizer, "amax", dist.ReduceOp.MAX)
-
-    if config in [mtq.FP8_DEFAULT_CFG, mtq.NVFP4_DEFAULT_CFG, mtq.INT8_DEFAULT_CFG]:
-        if isinstance(model.fc2.weight_quantizer, SequentialQuantizer):
-            for quantizer in model.fc2.weight_quantizer:
-                _reduce_quantizer_attr(quantizer, "amax", dist.ReduceOp.MAX)
-        else:
-            _reduce_quantizer_attr(model.fc2.weight_quantizer, "amax", dist.ReduceOp.MAX)
-
-    # Check act scale
-    if config in [mtq.INT4_AWQ_CFG, mtq.W4A8_AWQ_BETA_CFG]:
-        _reduce_quantizer_attr(
-            model.fc1.awq_lite,
-            "act_scale",
-            dist.ReduceOp.AVG,
-        )
-
-
 def auto_quantize_helper(model):
     model, search_state = mtq.auto_quantize(
         model,
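
Note on the first removed helper: dp_cp_parallel_test_helper calls a module-level _reduce_quantizer_attr that lies outside this hunk and is not part of the diff. As a rough sketch only, it presumably follows the same pattern as the nested version defined inside data_tensor_context_parallel_test_helper above, reducing over a single process group passed in by the caller:

import torch
import torch.distributed as dist

def _reduce_quantizer_attr(quantizer, attr, op=dist.ReduceOp.MAX, group=None):
    # Clone the calibrated attribute (e.g. "amax" or "act_scale") so the
    # value stored on the quantizer is left untouched.
    quantizer_attr = getattr(quantizer, attr).clone()
    # Reduce the clone across the given process group.
    dist.all_reduce(quantizer_attr, op=op, group=group)
    # Calibration should already have synchronized the attribute, so the
    # reduced value must match what each rank holds locally.
    assert torch.allclose(quantizer_attr, getattr(quantizer, attr))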

tests/gpu/torch/quantization/plugins/test_megatron.py

Lines changed: 0 additions & 83 deletions
@@ -231,89 +231,6 @@ def test_data_tensor_context_parallel(need_8_gpus, config):
     )
 
 
-# 2. Data Parallel Test
-def _test_data_parallel_helper(config, rank, size):
-    initialize_for_megatron(seed=SEED + rank)  # modify seed so data is different across ranks
-    model = MegatronModel().cuda()
-
-    dp_cp_parallel_test_helper(model, config, get_data_parallel_group())
-
-
-@pytest.mark.parametrize(
-    "config",
-    [
-        mtq.INT8_DEFAULT_CFG,
-        mtq.FP8_DEFAULT_CFG,
-        mtq.W4A8_AWQ_BETA_CFG,
-        mtq.INT8_SMOOTHQUANT_CFG,
-        mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG,
-        mtq.INT4_AWQ_CFG,
-        mtq.NVFP4_DEFAULT_CFG,
-    ],
-)
-def test_data_parallel(need_2_gpus, config):
-    spawn_multiprocess_job(size=2, job=partial(_test_data_parallel_helper, config), backend="nccl")
-
-
-# 3. Context Parallel Test
-def _test_context_parallel_helper(config, rank, size):
-    initialize_for_megatron(
-        context_parallel_size=size, seed=SEED + rank
-    )  # modify seed so data is different across ranks
-    model = MegatronModel(cp_size=size).cuda()
-
-    dp_cp_parallel_test_helper(model, config, get_data_parallel_group(with_context_parallel=True))
-
-
-@pytest.mark.parametrize(
-    "config",
-    [
-        mtq.INT8_DEFAULT_CFG,
-        mtq.FP8_DEFAULT_CFG,
-        mtq.W4A8_AWQ_BETA_CFG,
-        mtq.INT8_SMOOTHQUANT_CFG,
-        mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG,
-        mtq.INT4_AWQ_CFG,
-        mtq.NVFP4_DEFAULT_CFG,
-    ],
-)
-def test_context_parallel(need_2_gpus, config):
-    spawn_multiprocess_job(
-        size=2, job=partial(_test_context_parallel_helper, config), backend="nccl"
-    )
-
-
-# 4. DP=2 + TP=2 + CP=2 Test (on 2*2*2=8 GPUs)
-def _test_data_tensor_context_parallel_helper(config, rank, size):
-    initialize_for_megatron(tensor_model_parallel_size=2, context_parallel_size=2, seed=SEED + rank)
-    model = MegatronModel(tp_size=2, cp_size=2).cuda()
-
-    data_tensor_context_parallel_test_helper(
-        model,
-        config,
-        get_data_parallel_group(with_context_parallel=True),
-        get_tensor_model_parallel_group(),
-    )
-
-
-@pytest.mark.parametrize(
-    "config",
-    [
-        mtq.INT8_DEFAULT_CFG,
-        mtq.FP8_DEFAULT_CFG,
-        mtq.W4A8_AWQ_BETA_CFG,
-        mtq.INT8_SMOOTHQUANT_CFG,
-        mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG,
-        mtq.INT4_AWQ_CFG,
-        mtq.NVFP4_DEFAULT_CFG,
-    ],
-)
-def test_data_tensor_context_parallel(need_8_gpus, config):
-    spawn_multiprocess_job(
-        size=8, job=partial(_test_data_tensor_context_parallel_helper, config), backend="nccl"
-    )
-
-
 def _gpt_model_provider(
     tp_size: int,
     hidden_size=256,
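
For context on how the removed tests were driven: each test_* above hands a rank-parameterized helper to spawn_multiprocess_job from the repo's test utilities, which is not shown in this diff. The sketch below is a hypothetical stand-in built on torch.multiprocessing.spawn, only to illustrate the calling convention (the job is invoked as job(rank, size) on each NCCL rank); it is not the project's actual implementation.

import os
from functools import partial

import torch
import torch.distributed as dist
import torch.multiprocessing as mp

def _worker(rank, size, job, backend):
    # Minimal single-node rendezvous; the real utility may configure this differently.
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    torch.cuda.set_device(rank)
    dist.init_process_group(backend, rank=rank, world_size=size)
    try:
        # e.g. job = partial(_test_data_parallel_helper, config), called as job(rank, size)
        job(rank, size)
    finally:
        dist.destroy_process_group()

def spawn_multiprocess_job(size, job, backend="nccl"):
    # Hypothetical stand-in for the repo's helper of the same name.
    mp.spawn(_worker, args=(size, job, backend), nprocs=size, join=True)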
