2 changes: 2 additions & 0 deletions CHANGELOG.rst
@@ -11,6 +11,8 @@ Model Optimizer Changelog (Linux)
- Add flag ``op_types_to_exclude_fp16`` in ONNX quantization to exclude ops from being converted to FP16/BF16. Alternatively, for custom TensorRT ops, this can also be done by indicating ``'fp32'`` precision in ``trt_plugins_precision``.
- Add LoRA mode support for MCore in a new peft submodule: ``modelopt.torch.peft.update_model(model, LORA_CFG)``.
- Support PTQ and fakequant in vLLM for fast evaluation of arbitrary quantization formats. See ``examples/vllm_serve`` for more details.
- Add support for ``nemotron-post-training-dataset-v2`` and ``nemotron-post-training-dataset-v1`` in ``examples/llm_ptq``. Default to ``nemotron-post-training-dataset-v2`` if no dataset is specified.

@kevalmorabia97 (Collaborator) commented on Oct 10, 2025:
Suggested change:
Before: - Add support for ``nemotron-post-training-dataset-v2`` and ``nemotron-post-training-dataset-v1`` in ``examples/llm_ptq``. Default to ``nemotron-post-training-dataset-v2`` if no dataset is specified.
After: - Add support for ``nemotron-post-training-dataset-v2`` and ``nemotron-post-training-dataset-v1`` in ``examples/llm_ptq``. Default changed from ``cnn_dailymail`` to ``nemotron-post-training-dataset-v2`` if no dataset is specified.

- Allow specifying ``calib_seq`` in ``examples/llm_ptq`` to set the maximum sequence length for calibration.

Contributor:
General question: what is the difference between calib_size and calib_seq?


Collaborator (Author):
calib_size is the number of calibration samples; calib_seq is the maximum sequence length of each calibration sample.
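For illustration only, a minimal sketch of how the two values typically interact when assembling calibration data (hypothetical helper name, assuming a Hugging Face-style tokenizer; this is not the repo's API):

from transformers import AutoTokenizer

def build_calib_inputs(texts, tokenizer, calib_size=512, calib_seq=512):
    # calib_size: how many calibration samples are used.
    # calib_seq: how many tokens each individual sample may contain at most.
    inputs = []
    for text in texts[:calib_size]:
        ids = tokenizer(text, truncation=True, max_length=calib_seq)["input_ids"]
        inputs.append(ids)
    return inputs

# tokenizer = AutoTokenizer.from_pretrained("<model_dir>")
# calib_inputs = build_calib_inputs(calib_texts, tokenizer, calib_size=512, calib_seq=512)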

Contributor:
Nit: @cjluo-nv, should we be more verbose and use --calib_seq_len for clarity?

Collaborator (Author):
ACK. Personally I prefer shorter flags.


0.37 (2025-09-xx)
^^^^^^^^^^^^^^^^^
12 changes: 10 additions & 2 deletions examples/llm_ptq/hf_ptq.py
@@ -297,8 +297,8 @@ def main(args):
)
else:
if args.dataset is None:
args.dataset = ["cnn_dailymail"]
warnings.warn("No dataset specified. Defaulting to cnn_dailymail.")
args.dataset = ["nemotron-post-training-dataset-v2"]
warnings.warn("No dataset specified. Defaulting to nemotron-post-training-dataset-v2.")
tokenizer = get_tokenizer(args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code)

default_padding_side = tokenizer.padding_side
@@ -349,6 +349,7 @@ def main(args):
tokenizer=tokenizer,
batch_size=args.batch_size,
num_samples=args.calib_size,
+ max_sample_length=args.calib_seq,
device=device,
)
model = mts.sparsify(
@@ -390,6 +391,7 @@ def main(args):

args.batch_size = get_max_batch_size(
model,
+ max_sample_length=args.calib_seq,
sample_memory_usage_ratio=sample_memory_usage_ratio if not run_auto_quant else 1.0,
sample_input_single_batch=sample_input_single_batch,
enable_grad=run_auto_quant,
@@ -680,6 +682,12 @@ def output_decode(generated_ids, input_shape):
type=str,
default="512",
)
+ parser.add_argument(
+     "--calib_seq",
+     help="Maximum sequence length for calibration.",
+     type=int,
+     default=512,
+ )
parser.add_argument("--export_path", default="exported_model")
parser.add_argument(
"--dataset",
4 changes: 4 additions & 0 deletions examples/llm_ptq/scripts/huggingface_example.sh
@@ -113,6 +113,10 @@ if [ -n "$GPU_MAX_MEM_PERCENTAGE" ]; then
PTQ_ARGS+=" --gpu_max_mem_percentage=$GPU_MAX_MEM_PERCENTAGE "
fi

if [ -n "$CALIB_SEQ" ]; then
PTQ_ARGS+=" --calib_seq=$CALIB_SEQ "
fi

if ! $VERBOSE; then
PTQ_ARGS+=" --no-verbose "
fi
8 changes: 7 additions & 1 deletion examples/llm_ptq/scripts/parser.sh
@@ -36,7 +36,7 @@ parse_options() {
USE_SEQ_DEVICE_MAP=false

# Parse command-line options
- ARGS=$(getopt -o "" -l "model:,quant:,kv_cache_quant:,tp:,pp:,sparsity:,awq_block_size:,calib:,calib_batch_size:,auto_quantize_bits:,output:,batch:,tasks:,lm_eval_tasks:,lm_eval_limit:,simple_eval_tasks:,trust_remote_code,use_seq_device_map,gpu_max_mem_percentage:,kv_cache_free_gpu_memory_fraction:,low_memory_mode,no-verbose,calib_dataset:" -n "$0" -- "$@")
+ ARGS=$(getopt -o "" -l "model:,quant:,kv_cache_quant:,tp:,pp:,sparsity:,awq_block_size:,calib:,calib_batch_size:,auto_quantize_bits:,output:,batch:,tasks:,lm_eval_tasks:,lm_eval_limit:,simple_eval_tasks:,trust_remote_code,use_seq_device_map,gpu_max_mem_percentage:,kv_cache_free_gpu_memory_fraction:,low_memory_mode,no-verbose,calib_dataset:,calib_seq:" -n "$0" -- "$@")

eval set -- "$ARGS"
while true; do
@@ -64,19 +64,24 @@ parse_options() {
--no-verbose ) VERBOSE=false; shift;;
--low_memory_mode ) LOW_MEMORY_MODE=true; shift;;
--calib_dataset ) CALIB_DATASET="$2"; shift 2;;
+ --calib_seq ) CALIB_SEQ="$2"; shift 2;;
-- ) shift; break ;;
* ) break ;;
esac
done

DEFAULT_CALIB_SIZE=512
+ DEFAULT_CALIB_SEQ=512
DEFAULT_CALIB_BATCH_SIZE=0
DEFAULT_BUILD_MAX_OUTPUT_LEN=1024
DEFAULT_BUILD_MAX_BATCH_SIZE=2

if [ -z "$CALIB_SIZE" ]; then
CALIB_SIZE=$DEFAULT_CALIB_SIZE
fi
if [ -z "$CALIB_SEQ" ]; then
CALIB_SEQ=$DEFAULT_CALIB_SEQ
fi
if [ -z "$CALIB_BATCH_SIZE" ]; then
CALIB_BATCH_SIZE=$DEFAULT_CALIB_BATCH_SIZE
fi
@@ -144,5 +149,6 @@ parse_options() {
echo "kv_cache_free_gpu_memory_fraction: $KV_CACHE_FREE_GPU_MEMORY_FRACTION"
echo "low_memory_mode: $LOW_MEMORY_MODE"
echo "calib_dataset: $CALIB_DATASET"
echo "calib_seq: $CALIB_SEQ"
echo "================="
}
18 changes: 16 additions & 2 deletions modelopt/torch/utils/dataset_utils.py
@@ -52,6 +52,20 @@
+ "\n"
+ sample["output"],
},
"nemotron-post-training-dataset-v2": {
"config": {
"path": "nvidia/Nemotron-Post-Training-Dataset-v2",
"split": ["stem", "chat", "math", "code"],
},
"preprocess": lambda sample: "\n".join(turn["content"] for turn in sample["messages"]),
},
"nemotron-post-training-dataset-v1": {
"config": {
"path": "nvidia/Nemotron-Post-Training-Dataset-v1",
"split": ["stem", "chat", "math", "code", "tool_calling"],
},
"preprocess": lambda sample: "\n".join(turn["content"] for turn in sample["messages"]),
},
"magpie": {
"config": {
"path": "Magpie-Align/Magpie-Pro-MT-300K-v0.1",
@@ -321,10 +335,10 @@ def _get_free_gpu_mem():
return 1
elif target_data_batch < 4:
return 2
- elif target_data_batch < 64:
+ elif target_data_batch < 512:
return target_data_batch // 4 * 4
else:
- return 64
+ return 512


def _process_batch(batch_data, infer_method, max_working_batch_size=None):
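To make the new cap concrete, here is a standalone sketch of the rounding-and-capping branch changed above (the earlier return-1 branch and the surrounding function are omitted; target_data_batch is assumed to be the largest batch that fits in free GPU memory):

def _cap_target_batch(target_data_batch: int) -> int:
    # Mirrors the branch above; this PR raises the cap from 64 to 512.
    if target_data_batch < 4:
        return 2
    elif target_data_batch < 512:
        return target_data_batch // 4 * 4  # round down to a multiple of 4
    return 512

assert _cap_target_batch(3) == 2
assert _cap_target_batch(70) == 68
assert _cap_target_batch(2048) == 512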