NVIDIA · cjluo-nv · Oct 14, 2025 · Oct 9, 2025 · Oct 13, 2025 · sugunav14
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -11,6 +11,8 @@ Model Optimizer Changelog (Linux)
 - Add flag ``op_types_to_exclude_fp16`` in ONNX quantization to exclude ops from being converted to FP16/BF16. Alternatively, for custom TensorRT ops, this can also be done by indicating ``'fp32'`` precision in ``trt_plugins_precision``.
 - Add LoRA mode support for MCore in a new peft submodule: ``modelopt.torch.peft.update_model(model, LORA_CFG)``.
 - Support PTQ and fakequant in vLLM for fast evaluation of arbitrary quantization formats. See ``examples/vllm_serve`` for more details.
+- Add support for ``nemotron-post-training-dataset-v2`` and ``nemotron-post-training-dataset-v1`` in ``examples/llm_ptq``. Default to a mix of ``cnn_dailymail`` and ``nemotron-post-training-dataset-v2`` if no dataset is specified.
+- Allow specifying ``--calib_seq`` in ``examples/llm_ptq`` to set the maximum sequence length for calibration (default: 512).
 
 0.37 (2025-09-xx)
 ^^^^^^^^^^^^^^^^^

@@ -297,8 +297,14 @@ def main(args):
         )
     else:
         if args.dataset is None:
-            args.dataset = ["cnn_dailymail"]
-            warnings.warn("No dataset specified. Defaulting to cnn_dailymail.")
+            args.dataset = ["cnn_dailymail", "nemotron-post-training-dataset-v2"]
+            warnings.warn(
+                "No dataset specified. Defaulting to cnn_dailymail and nemotron-post-training-dataset-v2."
+            )
+        # Make calib_size have one entry per dataset: pad with its last value or truncate as needed
+        args.calib_size = (args.calib_size + [args.calib_size[-1]] * len(args.dataset))[
+            : len(args.dataset)
+        ]
         tokenizer = get_tokenizer(args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code)
 
         default_padding_side = tokenizer.padding_side
@@ -349,6 +355,7 @@ def main(args):
             tokenizer=tokenizer,
             batch_size=args.batch_size,
             num_samples=args.calib_size,
+            max_sample_length=args.calib_seq,
             device=device,
         )
         model = mts.sparsify(
@@ -390,6 +397,7 @@ def main(args):
 
             args.batch_size = get_max_batch_size(
                 model,
+                max_sample_length=args.calib_seq,
                 sample_memory_usage_ratio=sample_memory_usage_ratio if not run_auto_quant else 1.0,
                 sample_input_single_batch=sample_input_single_batch,
                 enable_grad=run_auto_quant,
@@ -680,6 +688,12 @@ def output_decode(generated_ids, input_shape):
         type=str,
         default="512",
     )
+    parser.add_argument(
+        "--calib_seq",
+        help="Maximum sequence length for calibration.",
+        type=int,
+        default=512,
+    )
     parser.add_argument("--export_path", default="exported_model")
     parser.add_argument(
         "--dataset",

@@ -113,6 +113,10 @@ if [ -n "$GPU_MAX_MEM_PERCENTAGE" ]; then
     PTQ_ARGS+=" --gpu_max_mem_percentage=$GPU_MAX_MEM_PERCENTAGE "
 fi
 
+if [ -n "$CALIB_SEQ" ]; then
+    PTQ_ARGS+=" --calib_seq=$CALIB_SEQ "
+fi
+
 if ! $VERBOSE; then
     PTQ_ARGS+=" --no-verbose "
 fi

@@ -36,7 +36,7 @@ parse_options() {
     USE_SEQ_DEVICE_MAP=false
 
   # Parse command-line options
-  ARGS=$(getopt -o "" -l "model:,quant:,kv_cache_quant:,tp:,pp:,sparsity:,awq_block_size:,calib:,calib_batch_size:,auto_quantize_bits:,output:,batch:,tasks:,lm_eval_tasks:,lm_eval_limit:,simple_eval_tasks:,trust_remote_code,use_seq_device_map,gpu_max_mem_percentage:,kv_cache_free_gpu_memory_fraction:,low_memory_mode,no-verbose,calib_dataset:" -n "$0" -- "$@")
+  ARGS=$(getopt -o "" -l "model:,quant:,kv_cache_quant:,tp:,pp:,sparsity:,awq_block_size:,calib:,calib_batch_size:,auto_quantize_bits:,output:,batch:,tasks:,lm_eval_tasks:,lm_eval_limit:,simple_eval_tasks:,trust_remote_code,use_seq_device_map,gpu_max_mem_percentage:,kv_cache_free_gpu_memory_fraction:,low_memory_mode,no-verbose,calib_dataset:,calib_seq:" -n "$0" -- "$@")
 
   eval set -- "$ARGS"
   while true; do
@@ -64,19 +64,24 @@ parse_options() {
       --no-verbose ) VERBOSE=false; shift;;
       --low_memory_mode ) LOW_MEMORY_MODE=true; shift;;
       --calib_dataset ) CALIB_DATASET="$2"; shift 2;;
+      --calib_seq ) CALIB_SEQ="$2"; shift 2;;
       -- ) shift; break ;;
       * ) break ;;
     esac
   done
 
   DEFAULT_CALIB_SIZE=512
+  DEFAULT_CALIB_SEQ=512
   DEFAULT_CALIB_BATCH_SIZE=0
   DEFAULT_BUILD_MAX_OUTPUT_LEN=1024
   DEFAULT_BUILD_MAX_BATCH_SIZE=2
 
   if [ -z "$CALIB_SIZE" ]; then
     CALIB_SIZE=$DEFAULT_CALIB_SIZE
   fi
+  if [ -z "$CALIB_SEQ" ]; then
+    CALIB_SEQ=$DEFAULT_CALIB_SEQ
+  fi
   if [ -z "$CALIB_BATCH_SIZE" ]; then
     CALIB_BATCH_SIZE=$DEFAULT_CALIB_BATCH_SIZE
   fi
@@ -144,5 +149,6 @@ parse_options() {
   echo "kv_cache_free_gpu_memory_fraction: $KV_CACHE_FREE_GPU_MEMORY_FRACTION"
   echo "low_memory_mode: $LOW_MEMORY_MODE"
   echo "calib_dataset: $CALIB_DATASET"
+  echo "calib_seq: $CALIB_SEQ"
   echo "================="
 }
@@ -52,6 +52,20 @@
         + "\n"
         + sample["output"],
     },
+    "nemotron-post-training-dataset-v2": {
+        "config": {
+            "path": "nvidia/Nemotron-Post-Training-Dataset-v2",
+            "split": ["stem", "chat", "math", "code"],
+        },
+        "preprocess": lambda sample: "\n".join(turn["content"] for turn in sample["messages"]),
+    },
+    "nemotron-post-training-dataset-v1": {
+        "config": {
+            "path": "nvidia/Nemotron-Post-Training-Dataset-v1",
+            "split": ["stem", "chat", "math", "code", "tool_calling"],
+        },
+        "preprocess": lambda sample: "\n".join(turn["content"] for turn in sample["messages"]),
+    },
     "magpie": {
         "config": {
             "path": "Magpie-Align/Magpie-Pro-MT-300K-v0.1",
@@ -321,10 +335,10 @@ def _get_free_gpu_mem():
         return 1
     elif target_data_batch < 4:
         return 2
-    elif target_data_batch < 64:
+    elif target_data_batch < 512:
         return target_data_batch // 4 * 4
     else:
-        return 64
+        return 512
 
 
 def _process_batch(batch_data, infer_method, max_working_batch_size=None):