Skip to content

Commit 86bcc39

Browse files
committed
Use nemotron post training dataset for calibration
Signed-off-by: Chenjie Luo <[email protected]>
1 parent 5b02483 commit 86bcc39

File tree

4 files changed: +37 additions, −5 deletions

examples/llm_ptq/hf_ptq.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -297,8 +297,8 @@ def main(args):
297297
)
298298
else:
299299
if args.dataset is None:
300-
args.dataset = ["cnn_dailymail"]
301-
warnings.warn("No dataset specified. Defaulting to cnn_dailymail.")
300+
args.dataset = ["nemotron-post-training-dataset-v2"]
301+
warnings.warn("No dataset specified. Defaulting to nemotron-post-training-dataset-v2.")
302302
tokenizer = get_tokenizer(args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code)
303303

304304
default_padding_side = tokenizer.padding_side
@@ -349,6 +349,7 @@ def main(args):
349349
tokenizer=tokenizer,
350350
batch_size=args.batch_size,
351351
num_samples=args.calib_size,
352+
max_sample_length=args.calib_seq,
352353
device=device,
353354
)
354355
model = mts.sparsify(
@@ -390,6 +391,7 @@ def main(args):
390391

391392
args.batch_size = get_max_batch_size(
392393
model,
394+
max_sample_length=args.calib_seq,
393395
sample_memory_usage_ratio=sample_memory_usage_ratio if not run_auto_quant else 1.0,
394396
sample_input_single_batch=sample_input_single_batch,
395397
enable_grad=run_auto_quant,
@@ -680,6 +682,12 @@ def output_decode(generated_ids, input_shape):
680682
type=str,
681683
default="512",
682684
)
685+
parser.add_argument(
686+
"--calib_seq",
687+
help="Maximum sequence length for calibration.",
688+
type=int,
689+
default=512,
690+
)
683691
parser.add_argument("--export_path", default="exported_model")
684692
parser.add_argument(
685693
"--dataset",

examples/llm_ptq/scripts/huggingface_example.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,10 @@ if [ -n "$GPU_MAX_MEM_PERCENTAGE" ]; then
113113
PTQ_ARGS+=" --gpu_max_mem_percentage=$GPU_MAX_MEM_PERCENTAGE "
114114
fi
115115

116+
if [ -n "$CALIB_SEQ" ]; then
117+
PTQ_ARGS+=" --calib_seq=$CALIB_SEQ "
118+
fi
119+
116120
if ! $VERBOSE; then
117121
PTQ_ARGS+=" --no-verbose "
118122
fi

examples/llm_ptq/scripts/parser.sh

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ parse_options() {
3636
USE_SEQ_DEVICE_MAP=false
3737

3838
# Parse command-line options
39-
ARGS=$(getopt -o "" -l "model:,quant:,kv_cache_quant:,tp:,pp:,sparsity:,awq_block_size:,calib:,calib_batch_size:,auto_quantize_bits:,output:,batch:,tasks:,lm_eval_tasks:,lm_eval_limit:,simple_eval_tasks:,trust_remote_code,use_seq_device_map,gpu_max_mem_percentage:,kv_cache_free_gpu_memory_fraction:,low_memory_mode,no-verbose,calib_dataset:" -n "$0" -- "$@")
39+
ARGS=$(getopt -o "" -l "model:,quant:,kv_cache_quant:,tp:,pp:,sparsity:,awq_block_size:,calib:,calib_batch_size:,auto_quantize_bits:,output:,batch:,tasks:,lm_eval_tasks:,lm_eval_limit:,simple_eval_tasks:,trust_remote_code,use_seq_device_map,gpu_max_mem_percentage:,kv_cache_free_gpu_memory_fraction:,low_memory_mode,no-verbose,calib_dataset:,calib_seq:" -n "$0" -- "$@")
4040

4141
eval set -- "$ARGS"
4242
while true; do
@@ -64,19 +64,24 @@ parse_options() {
6464
--no-verbose ) VERBOSE=false; shift;;
6565
--low_memory_mode ) LOW_MEMORY_MODE=true; shift;;
6666
--calib_dataset ) CALIB_DATASET="$2"; shift 2;;
67+
--calib_seq ) CALIB_SEQ="$2"; shift 2;;
6768
-- ) shift; break ;;
6869
* ) break ;;
6970
esac
7071
done
7172

7273
DEFAULT_CALIB_SIZE=512
74+
DEFAULT_CALIB_SEQ=512
7375
DEFAULT_CALIB_BATCH_SIZE=0
7476
DEFAULT_BUILD_MAX_OUTPUT_LEN=1024
7577
DEFAULT_BUILD_MAX_BATCH_SIZE=2
7678

7779
if [ -z "$CALIB_SIZE" ]; then
7880
CALIB_SIZE=$DEFAULT_CALIB_SIZE
7981
fi
82+
if [ -z "$CALIB_SEQ" ]; then
83+
CALIB_SEQ=$DEFAULT_CALIB_SEQ
84+
fi
8085
if [ -z "$CALIB_BATCH_SIZE" ]; then
8186
CALIB_BATCH_SIZE=$DEFAULT_CALIB_BATCH_SIZE
8287
fi
@@ -144,5 +149,6 @@ parse_options() {
144149
echo "kv_cache_free_gpu_memory_fraction: $KV_CACHE_FREE_GPU_MEMORY_FRACTION"
145150
echo "low_memory_mode: $LOW_MEMORY_MODE"
146151
echo "calib_dataset: $CALIB_DATASET"
152+
echo "calib_seq: $CALIB_SEQ"
147153
echo "================="
148154
}

modelopt/torch/utils/dataset_utils.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,20 @@
5252
+ "\n"
5353
+ sample["output"],
5454
},
55+
"nemotron-post-training-dataset-v2": {
56+
"config": {
57+
"path": "nvidia/Nemotron-Post-Training-Dataset-v2",
58+
"split": ["stem", "chat", "math", "code"],
59+
},
60+
"preprocess": lambda sample: "\n".join(turn["content"] for turn in sample["messages"]),
61+
},
62+
"nemotron-post-training-dataset-v1": {
63+
"config": {
64+
"path": "nvidia/Nemotron-Post-Training-Dataset-v1",
65+
"split": ["stem", "chat", "math", "code", "tool_calling"],
66+
},
67+
"preprocess": lambda sample: "\n".join(turn["content"] for turn in sample["messages"]),
68+
},
5569
"magpie": {
5670
"config": {
5771
"path": "Magpie-Align/Magpie-Pro-MT-300K-v0.1",
@@ -321,10 +335,10 @@ def _get_free_gpu_mem():
321335
return 1
322336
elif target_data_batch < 4:
323337
return 2
324-
elif target_data_batch < 64:
338+
elif target_data_batch < 512:
325339
return target_data_batch // 4 * 4
326340
else:
327-
return 64
341+
return 512
328342

329343

330344
def _process_batch(batch_data, infer_method, max_working_batch_size=None):

Comments (0)