diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index d473f0c8c..f712d428c 100755
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -11,6 +11,8 @@ Model Optimizer Changelog (Linux)
 - Add flag ``op_types_to_exclude_fp16`` in ONNX quantization to exclude ops from being converted to FP16/BF16. Alternatively, for custom TensorRT ops, this can also be done by indicating ``'fp32'`` precision in ``trt_plugins_precision``.
 - Add LoRA mode support for MCore in a new peft submodule: ``modelopt.torch.peft.update_model(model, LORA_CFG)``.
 - Support PTQ and fakequant in vLLM for fast evaluation of arbitrary quantization formats. See ``examples/vllm_serve`` for more details.
+- Add support for ``nemotron-post-training-dataset-v2`` and ``nemotron-post-training-dataset-v1`` in ``examples/llm_ptq``. Default to a mix of ``cnn_dailymail`` and ``nemotron-post-training-dataset-v2`` if no dataset is specified.
+- Allow specifying ``calib_seq`` in ``examples/llm_ptq`` to set the maximum sequence length for calibration.
 
 0.37 (2025-09-xx)
 ^^^^^^^^^^^^^^^^^
diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
index da6761252..c55c38abc 100755
--- a/examples/llm_ptq/hf_ptq.py
+++ b/examples/llm_ptq/hf_ptq.py
@@ -297,8 +297,14 @@ def main(args):
         )
     else:
         if args.dataset is None:
-            args.dataset = ["cnn_dailymail"]
-            warnings.warn("No dataset specified. Defaulting to cnn_dailymail.")
+            args.dataset = ["cnn_dailymail", "nemotron-post-training-dataset-v2"]
+            warnings.warn(
+                "No dataset specified. Defaulting to cnn_dailymail and nemotron-post-training-dataset-v2."
+            )
+        # Adjust calib_size to match dataset length by extending or truncating as needed
+        args.calib_size = (args.calib_size + [args.calib_size[-1]] * len(args.dataset))[
+            : len(args.dataset)
+        ]
 
     tokenizer = get_tokenizer(args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code)
     default_padding_side = tokenizer.padding_side
@@ -349,6 +355,7 @@ def main(args):
             tokenizer=tokenizer,
             batch_size=args.batch_size,
             num_samples=args.calib_size,
+            max_sample_length=args.calib_seq,
             device=device,
         )
         model = mts.sparsify(
@@ -390,6 +397,7 @@ def main(args):
 
             args.batch_size = get_max_batch_size(
                 model,
+                max_sample_length=args.calib_seq,
                 sample_memory_usage_ratio=sample_memory_usage_ratio if not run_auto_quant else 1.0,
                 sample_input_single_batch=sample_input_single_batch,
                 enable_grad=run_auto_quant,
@@ -680,6 +688,12 @@ def output_decode(generated_ids, input_shape):
         type=str,
         default="512",
     )
+    parser.add_argument(
+        "--calib_seq",
+        help="Maximum sequence length for calibration.",
+        type=int,
+        default=512,
+    )
     parser.add_argument("--export_path", default="exported_model")
     parser.add_argument(
         "--dataset",
diff --git a/examples/llm_ptq/scripts/huggingface_example.sh b/examples/llm_ptq/scripts/huggingface_example.sh
index 9c741c7df..7b7d6910e 100755
--- a/examples/llm_ptq/scripts/huggingface_example.sh
+++ b/examples/llm_ptq/scripts/huggingface_example.sh
@@ -113,6 +113,10 @@ if [ -n "$GPU_MAX_MEM_PERCENTAGE" ]; then
     PTQ_ARGS+=" --gpu_max_mem_percentage=$GPU_MAX_MEM_PERCENTAGE "
 fi
 
+if [ -n "$CALIB_SEQ" ]; then
+    PTQ_ARGS+=" --calib_seq=$CALIB_SEQ "
+fi
+
 if ! $VERBOSE; then
     PTQ_ARGS+=" --no-verbose "
 fi
diff --git a/examples/llm_ptq/scripts/parser.sh b/examples/llm_ptq/scripts/parser.sh
index cd5b95466..7df601327 100644
--- a/examples/llm_ptq/scripts/parser.sh
+++ b/examples/llm_ptq/scripts/parser.sh
@@ -36,7 +36,7 @@ parse_options() {
     USE_SEQ_DEVICE_MAP=false
 
     # Parse command-line options
-    ARGS=$(getopt -o "" -l "model:,quant:,kv_cache_quant:,tp:,pp:,sparsity:,awq_block_size:,calib:,calib_batch_size:,auto_quantize_bits:,output:,batch:,tasks:,lm_eval_tasks:,lm_eval_limit:,simple_eval_tasks:,trust_remote_code,use_seq_device_map,gpu_max_mem_percentage:,kv_cache_free_gpu_memory_fraction:,low_memory_mode,no-verbose,calib_dataset:" -n "$0" -- "$@")
+    ARGS=$(getopt -o "" -l "model:,quant:,kv_cache_quant:,tp:,pp:,sparsity:,awq_block_size:,calib:,calib_batch_size:,auto_quantize_bits:,output:,batch:,tasks:,lm_eval_tasks:,lm_eval_limit:,simple_eval_tasks:,trust_remote_code,use_seq_device_map,gpu_max_mem_percentage:,kv_cache_free_gpu_memory_fraction:,low_memory_mode,no-verbose,calib_dataset:,calib_seq:" -n "$0" -- "$@")
     eval set -- "$ARGS"
 
     while true; do
@@ -64,12 +64,14 @@ parse_options() {
            --no-verbose ) VERBOSE=false; shift;;
            --low_memory_mode ) LOW_MEMORY_MODE=true; shift;;
            --calib_dataset ) CALIB_DATASET="$2"; shift 2;;
+           --calib_seq ) CALIB_SEQ="$2"; shift 2;;
            -- ) shift; break ;;
            * ) break ;;
        esac
    done
 
    DEFAULT_CALIB_SIZE=512
+   DEFAULT_CALIB_SEQ=512
    DEFAULT_CALIB_BATCH_SIZE=0
    DEFAULT_BUILD_MAX_OUTPUT_LEN=1024
    DEFAULT_BUILD_MAX_BATCH_SIZE=2
@@ -77,6 +79,9 @@ parse_options() {
    if [ -z "$CALIB_SIZE" ]; then
        CALIB_SIZE=$DEFAULT_CALIB_SIZE
    fi
+   if [ -z "$CALIB_SEQ" ]; then
+       CALIB_SEQ=$DEFAULT_CALIB_SEQ
+   fi
    if [ -z "$CALIB_BATCH_SIZE" ]; then
        CALIB_BATCH_SIZE=$DEFAULT_CALIB_BATCH_SIZE
    fi
@@ -144,5 +149,6 @@ parse_options() {
    echo "kv_cache_free_gpu_memory_fraction: $KV_CACHE_FREE_GPU_MEMORY_FRACTION"
    echo "low_memory_mode: $LOW_MEMORY_MODE"
    echo "calib_dataset: $CALIB_DATASET"
+   echo "calib_seq: $CALIB_SEQ"
    echo "================="
 }
diff --git a/modelopt/torch/utils/dataset_utils.py b/modelopt/torch/utils/dataset_utils.py
index 9361e221c..2f8397e7a 100644
--- a/modelopt/torch/utils/dataset_utils.py
+++ b/modelopt/torch/utils/dataset_utils.py
@@ -52,6 +52,20 @@
         + "\n"
         + sample["output"],
     },
+    "nemotron-post-training-dataset-v2": {
+        "config": {
+            "path": "nvidia/Nemotron-Post-Training-Dataset-v2",
+            "split": ["stem", "chat", "math", "code"],
+        },
+        "preprocess": lambda sample: "\n".join(turn["content"] for turn in sample["messages"]),
+    },
+    "nemotron-post-training-dataset-v1": {
+        "config": {
+            "path": "nvidia/Nemotron-Post-Training-Dataset-v1",
+            "split": ["stem", "chat", "math", "code", "tool_calling"],
+        },
+        "preprocess": lambda sample: "\n".join(turn["content"] for turn in sample["messages"]),
+    },
     "magpie": {
         "config": {
             "path": "Magpie-Align/Magpie-Pro-MT-300K-v0.1",
@@ -321,10 +335,10 @@ def _get_free_gpu_mem():
         return 1
     elif target_data_batch < 4:
         return 2
-    elif target_data_batch < 64:
+    elif target_data_batch < 512:
         return target_data_batch // 4 * 4
     else:
-        return 64
+        return 512
 
 
 def _process_batch(batch_data, infer_method, max_working_batch_size=None):
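
Note on the calib_size handling added in hf_ptq.py: the new one-liner pads or truncates the per-dataset calibration sizes so that exactly one value is supplied per dataset. The sketch below is illustrative only; the helper name and example values are mine, not part of the patch, and it assumes args.calib_size has already been parsed into a non-empty list of ints.

# Minimal sketch (assumed helper name) of the calib_size adjustment in hf_ptq.py:
# repeat the last entry, or drop extras, so one size is supplied per dataset.
def pad_or_truncate_calib_size(calib_size: list[int], num_datasets: int) -> list[int]:
    return (calib_size + [calib_size[-1]] * num_datasets)[:num_datasets]

# One size, two datasets -> the size is repeated for the second dataset.
assert pad_or_truncate_calib_size([512], 2) == [512, 512]
# Three sizes, two datasets -> the surplus entry is dropped.
assert pad_or_truncate_calib_size([256, 128, 64], 2) == [256, 128]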
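
Note on the new Nemotron entries in dataset_utils.py: both datasets hold chat-style samples, and the preprocess lambda flattens each sample's "messages" turns into a single newline-joined string for calibration. The snippet below only illustrates that behavior on a fabricated sample; it does not download or depend on the actual nvidia/Nemotron-Post-Training-Dataset-v2 data.

# Illustrative only: the same lambda as in the diff, applied to a made-up sample.
preprocess = lambda sample: "\n".join(turn["content"] for turn in sample["messages"])

fake_sample = {
    "messages": [
        {"role": "user", "content": "What is 2 + 2?"},
        {"role": "assistant", "content": "2 + 2 = 4."},
    ]
}
print(preprocess(fake_sample))
# Prints:
# What is 2 + 2?
# 2 + 2 = 4.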
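
Note on the batch heuristic change in dataset_utils.py: the target batch is still rounded down to a multiple of 4, but the cap is raised from 64 to 512. A standalone sketch follows; the wrapper function and the first condition are paraphrased from context, not copied from the file.

def cap_target_batch(target_data_batch: int) -> int:
    # Mirrors the branch shown in the diff: tiny batches stay small,
    # mid-range batches are rounded down to a multiple of 4,
    # and anything larger is capped at the new 512 limit (previously 64).
    if target_data_batch < 1:
        return 1
    elif target_data_batch < 4:
        return 2
    elif target_data_batch < 512:
        return target_data_batch // 4 * 4
    else:
        return 512

assert cap_target_batch(100) == 100   # already a multiple of 4
assert cap_target_batch(130) == 128   # rounded down to a multiple of 4
assert cap_target_batch(2048) == 512  # capped at 512 instead of 64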