From c758ad50c563e46890ee72b9dc9697db91bb3204 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Mon, 27 Oct 2025 11:50:00 +0100 Subject: [PATCH 01/81] The main compression function for a model using MIP-based NAS search algorithm. Signed-off-by: Daniel Korzekwa --- modelopt/torch/_compress/README.md | 3 ++ modelopt/torch/_compress/compress.py | 70 ++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) create mode 100644 modelopt/torch/_compress/README.md create mode 100644 modelopt/torch/_compress/compress.py diff --git a/modelopt/torch/_compress/README.md b/modelopt/torch/_compress/README.md new file mode 100644 index 000000000..97afc4491 --- /dev/null +++ b/modelopt/torch/_compress/README.md @@ -0,0 +1,3 @@ +Experimental model compression algorithm based on a Local Neural Architecture Search. +Based on the Puzzle paper: https://arxiv.org/abs/2411.19146 +PoC for Llama 3.1 model. \ No newline at end of file diff --git a/modelopt/torch/_compress/compress.py b/modelopt/torch/_compress/compress.py new file mode 100644 index 000000000..5136d9623 --- /dev/null +++ b/modelopt/torch/_compress/compress.py @@ -0,0 +1,70 @@ +""" + +This module provides the main compression function for a model +using MIP-based NAS search algorithm. +""" + +import build_library_and_stats +import mip_and_realize_models +import pruning_ckpts +import score_pruning_activations +import scoring +from omegaconf import DictConfig +from puzzle_tools.runtime import IRuntime + +# TODO Move initialize_hydra_config_for_dir from tests to main +from tests.utils.test_utils import initialize_hydra_config_for_dir + + +def compress( + hydra_config_dir: str, hydra_config: str, puzzle_dir: str, dataset_path: str, runtime: IRuntime +) -> DictConfig: + """Compress a puzzletron model using the MIP-based NAS search algorithm. + + Args: + hydra_config_dir (str): path to a hydra_config_dir that defines the search space + hydra_config (str): the corresponding hydra config file + puzzle_dir (str): directory with a puzzletron model to compress + dataset_path (str): dataset used for scoring and distillation + runtime: distributed runtime to use to run the compression steps, e.g., + NativeDDP_Runtime(dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10)) + + Returns: + Hydra config object after compressing the model. + The same hydra configuration object is used across all compression steps. 
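    Example (illustrative sketch only: the config name mirrors the accompanying
    test_compress.py test, the paths are placeholders rather than shipped values,
    and the call is expected to run under a torch.distributed launcher):

        import datetime

        import torch
        from puzzle_tools.hydra_utils import register_hydra_resolvers
        from puzzle_tools.runtime import NativeDDP_Runtime

        from modelopt.torch._compress import compress

        register_hydra_resolvers()
        with NativeDDP_Runtime(
            dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10)
        ) as runtime:
            hydra_cfg = compress.compress(
                hydra_config_dir="/path/to/configs",  # dir containing e.g. Llama-3_1-8B.yaml
                hydra_config="Llama-3_1-8B",
                puzzle_dir="/path/to/puzzle_dir",  # must contain ckpts/teacher/ in DeciLM format
                dataset_path="/path/to/dataset",
                runtime=runtime,
            )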
+ @TODO: Investigate if this config object is immutable across steps and clarify + """ + # Step 0: Load puzzletron hydra config + hydra_cfg = initialize_hydra_config_for_dir( + config_dir=hydra_config_dir, + config_name=hydra_config, + overrides=[ + f"puzzle_dir={puzzle_dir}", + f"dataset_path={dataset_path}", + ], + ) + + # Step 1: score_pruning_activations (distributed processing) + score_pruning_activations.launch_score_activations(hydra_cfg, runtime) + + # Step 2: pruning_ckpts (single process) + if runtime.global_rank == 0: + pruning_ckpts.launch_prune_ckpt(hydra_cfg) + runtime.wait_for_everyone() + + # # Step 3: bypass distillation (distributed processing) + # # TODO: Add bypass distillation step + # #run_bypassed_training(hydra_cfg, runtime) + + # Step 4: build_library_and_stats (single process) + if runtime.global_rank == 0: + build_library_and_stats.launch_build_library_and_stats(hydra_cfg) + runtime.wait_for_everyone() + + # Step 5: calc_one_block_scores (distributed processing) + scoring.launch_scoring(hydra_cfg, runtime) + + # Step 6: mip_and_realize_models (distributed processing) + mip_and_realize_models.launch_mip_and_realize_model(hydra_cfg, runtime) + + return hydra_cfg From 8af99036c9cea7daee252f8cb6951778a52e939a Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Mon, 27 Oct 2025 11:54:50 +0100 Subject: [PATCH 02/81] Code formatting Signed-off-by: Daniel Korzekwa --- modelopt/torch/_compress/README.md | 4 ++-- modelopt/torch/_compress/compress.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/modelopt/torch/_compress/README.md b/modelopt/torch/_compress/README.md index 97afc4491..4c6da80e5 100644 --- a/modelopt/torch/_compress/README.md +++ b/modelopt/torch/_compress/README.md @@ -1,3 +1,3 @@ Experimental model compression algorithm based on a Local Neural Architecture Search. -Based on the Puzzle paper: https://arxiv.org/abs/2411.19146 -PoC for Llama 3.1 model. \ No newline at end of file +Based on the Puzzle paper: +PoC for Llama 3.1 model. diff --git a/modelopt/torch/_compress/compress.py b/modelopt/torch/_compress/compress.py index 5136d9623..c0661259c 100644 --- a/modelopt/torch/_compress/compress.py +++ b/modelopt/torch/_compress/compress.py @@ -2,6 +2,7 @@ This module provides the main compression function for a model using MIP-based NAS search algorithm. + """ import build_library_and_stats From 5ba6c2705499d5e9f0f78ddbc43cd254cbaf99d6 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Mon, 27 Oct 2025 12:21:11 +0100 Subject: [PATCH 03/81] Model search space configuration used by test_compress.py test. 
Signed-off-by: Daniel Korzekwa --- .../resources/configs/Llama-3_1-8B.yaml | 108 ++++++++++++++++ .../bypass/bypass_distillation_defaults.yaml | 116 ++++++++++++++++++ .../configs/bypass/llama-3_1-8b_bypass.yaml | 38 ++++++ .../configs/pruning/attn_pruning.yaml | 16 +++ .../configs/pruning/ffn_pruning.yaml | 12 ++ .../configs/pruning/hidden_dim_pruning.yaml | 15 +++ .../configs/pruning/pruning_defaults.yaml | 32 +++++ .../configs/validate_model_defaults.yaml | 15 +++ .../configs/validate_solutions_defaults.yaml | 10 ++ 9 files changed, 362 insertions(+) create mode 100644 tests/gpu/torch/_compress/resources/configs/Llama-3_1-8B.yaml create mode 100644 tests/gpu/torch/_compress/resources/configs/bypass/bypass_distillation_defaults.yaml create mode 100644 tests/gpu/torch/_compress/resources/configs/bypass/llama-3_1-8b_bypass.yaml create mode 100644 tests/gpu/torch/_compress/resources/configs/pruning/attn_pruning.yaml create mode 100644 tests/gpu/torch/_compress/resources/configs/pruning/ffn_pruning.yaml create mode 100644 tests/gpu/torch/_compress/resources/configs/pruning/hidden_dim_pruning.yaml create mode 100644 tests/gpu/torch/_compress/resources/configs/pruning/pruning_defaults.yaml create mode 100644 tests/gpu/torch/_compress/resources/configs/validate_model_defaults.yaml create mode 100644 tests/gpu/torch/_compress/resources/configs/validate_solutions_defaults.yaml diff --git a/tests/gpu/torch/_compress/resources/configs/Llama-3_1-8B.yaml b/tests/gpu/torch/_compress/resources/configs/Llama-3_1-8B.yaml new file mode 100644 index 000000000..98c7b746c --- /dev/null +++ b/tests/gpu/torch/_compress/resources/configs/Llama-3_1-8B.yaml @@ -0,0 +1,108 @@ +defaults: + - pruning: ffn_pruning + - scoring: ../validate_solutions_defaults + - realize_model: ../validate_solutions_defaults + - bypass: llama-3_1-8b_bypass + - override hydra/hydra_logging: disabled + - _self_ + +puzzle_dir: ??? +teacher_dir: ${puzzle_dir}/ckpts/teacher/ +replacement_library_path: ${puzzle_dir}/replacement_library.json +dataset_path: ??? 
# path to v0.4_mini + +skip_realize_model: false + +build_replacement_library: + add_ffn_no_ops: true + add_attention_no_ops: true + +calc_subblock_stats: + batch_sizes: [64, 96, 128] + prefill_seq_len: 4096 + generation_seq_len: 4096 + num_active_tokens_override: # Optional override for sequence lengths + prefill_queue_size: 0 + allocate_prefill_query: false + benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking + merge_with_existing_stats: false + subblock_stats_filename: "subblock_stats.json" + moe_stats_filename: "moe_stats.json" + +scoring: + solutions_to_validate: + skip_existing_solutions: true + + replacement_library_path: ${replacement_library_path} + solutions_path: ${to_path:${puzzle_dir}/single_sequence_replacement_solutions.json} + teacher_dir: ${to_path:${teacher_dir}} + output_dir: ${puzzle_dir}/single_sequence_replacement_solutions--validation + + eval_samples: 2 + micro_batch_size: 1 + dataset_path: ${dataset_path}/valid + seed: 42 + shuffle_seed: 444 + +mip: + single_block_replacement_validation_dir: ${to_path:${scoring.output_dir}} + subblock_stats_path: ${to_path:${puzzle_dir}/${calc_subblock_stats.subblock_stats_filename}} + output_path: ${to_path:${puzzle_dir}/mip/puzzle_solutions} + gathered_metrics_path: + puzzle_profile: + + # puzzle_profile: + objective: metrics.cosine_embedding_loss_hidden_states + bigger_is_better: false + num_solutions: 1 + minimal_diversity: 2 + + subblock_stats_args: + - batch_size: 96 + weights_dtype: torch.bfloat16 + activations_dtype: torch.bfloat16 + kv_cache_dtype: torch.bfloat16 + + report_additional_costs: + - stats.memory_mib + - stats.num_params + - stats.num_kv_heads + - stats.has_attention + - stats.has_ffn + - stats.kv_cache_memory_mib + - stats.attention_memory_mib + - stats.ffn_memory_mib + - stats.ffn_num_params + - stats.attention_num_params + + human_constraints: + target_memory: 780_000 # 78_000 + + mip_constraints: + use_greedy_search: false + is_multi_layer_puzzle: true + metric_overrides: + constrain_search_func: + max_seconds_per_solution: 60 + +realize_model: + teacher_dir: ${to_path:${teacher_dir}} + tokenizer_name: ${to_path:${teacher_dir}} + replacement_library_path: ${replacement_library_path} + save_models: true + solutions_path: # Filled dynamically + + # Validate params + skip_validation: false # To enable validation of the model solution set `skip_validation` as False + eval_samples: 2 + micro_batch_size: 1 + dataset_path: ${dataset_path}/valid + seed: 42 + shuffle_seed: 444 + +nccl_timeout_minutes: ${timedelta_minutes:10} + +# This section redirects Hydra outputs +hydra: + run: + dir: ${puzzle_dir}/hydra_logs/${now:%Y-%m-%d}/${now:%H-%M-%S} diff --git a/tests/gpu/torch/_compress/resources/configs/bypass/bypass_distillation_defaults.yaml b/tests/gpu/torch/_compress/resources/configs/bypass/bypass_distillation_defaults.yaml new file mode 100644 index 000000000..c48f47f69 --- /dev/null +++ b/tests/gpu/torch/_compress/resources/configs/bypass/bypass_distillation_defaults.yaml @@ -0,0 +1,116 @@ +# defaults: +# - ../validate_model_defaults # TODO: Unify this default YAML with KD base YAML, for a "training defaults" configurations + +# Runtime Configuration +dtype: "bf16" # Model precision: bf16 for efficiency, fp32 for stability +seed: 42 # Random seed for reproducibility + +# Experiment Tracking +experiment_id: # Unique identifier for this experiment. 
Will be dynamically set +iter_num: 1 # Current iteration number +step_num: 1 # Current step number within iteration +token_count: 0 # Token count tracker (auto-updated during training) + +# Data Configuration +data: + data_column: "conversation" + block_size: 8192 # Sequence length (tokens per sample) + bos_rate: 0.5 + fim_rate: 0 + fim_spm_rate: 0 + source_datasets_to_discard: [] + load_from_disk: true # Load preprocessed data from disk or from stream + keep_in_memory: false + val_dataset_name: valid + max_eval_samples: 256 + eval_samples_per_process: # Samples per GPU during distributed eval (auto if null) + +# Training Configuration +training: + learning_rate: 1e-4 # Initial learning rate (1e-4 = 0.0001) + training_tokens: 1e+7 # Total training tokens (1B tokens) + micro_batch_size: 4 + val_micro_batch_size: 2 + warmup_ratio: 0.05 + warmup_steps: ${warmup_steps:${.training_tokens},${..data.block_size},${.micro_batch_size},${.warmup_ratio}} # Auto-calculated warmup steps + min_lr_factor: 1e-5 + grad_accumulation_steps: 1 + skip_first_batches: 0 + weight_decay: 0.1 + decay_lr: true + beta1: 0.9 + beta2: 0.95 + use_grad_scaling: false + grad_clip: 1.0 + grad_clip_type: norm + clipping_count: 0 + log_interval: 100 + eval_interval: 100 + +# Model Loading Configuration +resume_checkpoint_path: # Path to resume training from checkpoint +parameter_count: +init_checkpoint_path: # Path to initialize weights from + +model: + student_weights_dtype: "bf16" # Student model weight precision + + model_overrides: + delete_old_checkpoints: true # Clean up old checkpoints to save disk space + save_interval_seconds: 12900 # Save checkpoint every ~3.5 hours + save_interval: 1e+9 # Save checkpoint every 1B steps (effectively disabled) + save_checkpoint_when_done: true # Save final checkpoint when training completes + + # Architecture modifications for student model + model_config_overrides: + ffn: + - intermediate_size: 256 + replace_with_linear: false # Replace with simple linear layer (true/false) + no_op: false # Disable FFN entirely (true/false) + attention: + - n_heads_in_group: 8 # Number of heads per group (for GQA) + replace_with_linear: false # Replace attention with linear layer (true/false) + no_op: false # Disable attention entirely (true/false) + # Sliding window attention length. Commenting this line so that the default value will be used. + #window_length: ??? + +# Model Factory Configuration - Controls student model creation and initialization +model_factory: + factory: gqa_factory_fn # Factory function for creating GQA (Grouped Query Attention) models + block_loss_func: normalized_mse_loss # Loss function for comparing teacher/student blocks. vectorwise_normalized_mse_loss / batched_normalized_mse_loss / normalized_mse_loss + blocks_to_copy_indexes: # Which teacher blocks to copy unchanged (null = determine automatically) + gqa_init_mode: AverageKV # How to initialize K/V heads in GQA. All options here: GQAInitMode + mlp_init_mode: Truncate # MLP initialization. All options here: MlpInitMode + mlp_init_config: # Configuration for MLP initialization (if needed) + activations_log_dir: # Directory with activation statistics (required for PruneByActivationsLog) + linear_init_mode: FromTeacher # How to initialize linear layers: FromTeacher, Random, etc. + student_module_for_bypass: block # Which module to train as student. + submodule_for_loss_calculation: # Specific submodule for loss calc. + keys_to_learn: # What parameters to train. Either "entire_block", or specific submodules. 
Computed dynamically. + +# Validation Configuration +disable_initial_validate: false +validate_teacher_model: true +validate_student_model: true +disable_validation: false # Disable all validation (TODO: Not working yet) +best_val_loss: 1e+9 # Track best validation loss achieved + +# Performance Optimization +compile: false # Use PyTorch compilation (TODO: CURRENTLY NOT WORKING) +disable_fa2: false # Disable Flash Attention 2 (false = use FA2 if available) +teacher_model_load_on_cpu: false + +# Checkpoint Management +save_checkpoint_before_training: true # Save initial checkpoint before training +disable_checkpoint_save: false # Disable all checkpoint saving +save_best_ckpt: true # Save checkpoint when validation improves +kill_after_first_save: false # Exit after first checkpoint save (for testing) +realize_best_or_latest: "latest" + +# Experiment Tracking (Weights & Biases) +wandb_log: false # Enable wandb logging +wandb: + entity: ??? # Must be set: wandb team/user name + mode: ??? # Must be set: "online", "offline", or "disabled" + project: ??? # Must be set: wandb project name + run_name: ??? # Must be set: name for this specific run diff --git a/tests/gpu/torch/_compress/resources/configs/bypass/llama-3_1-8b_bypass.yaml b/tests/gpu/torch/_compress/resources/configs/bypass/llama-3_1-8b_bypass.yaml new file mode 100644 index 000000000..87341e72d --- /dev/null +++ b/tests/gpu/torch/_compress/resources/configs/bypass/llama-3_1-8b_bypass.yaml @@ -0,0 +1,38 @@ +defaults: + - bypass_distillation_defaults + +# Model & Runtime Configuration + +# Data type for model weights and computations (bfloat16 for efficiency) +dtype: "bf16" + +# Unique identifier for this experiment (must be set when running) +experiment_id: + +# Data Configuration Overrides +data: + max_eval_samples: 10 + +# Model Factory Configuration +model_factory: + mlp_init_mode: PruneByActivationsLog + + mlp_init_config: + # REQUIRED: Path to directory containing activation statistics/logs + # This should point to precomputed activation data. + # Replace with the directory you want to init your FFN from. 
+ # Example path for NRT cluster: /lustre/fs1/portfolios/llmservice/projects/llmservice_deci_vlm/users/tkeren/puzzle/lior_exp/puzzle_kd-hidden-dim-4096_tokens-5e9_logits/pruning/pruning_scores/ffn_iterative/20000samples_diverse_mini + activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/ffn_iterative/100samples_diverse_mini + +disable_initial_validate: false + +save_checkpoint_before_training: false + +wandb_log: false +wandb: + # Organization/team name in wandb + entity: nv-aim + # Project name for organizing related experiments + project: puzzletron_bypass_distillation + mode: online + run_name: ${..experiment_id} diff --git a/tests/gpu/torch/_compress/resources/configs/pruning/attn_pruning.yaml b/tests/gpu/torch/_compress/resources/configs/pruning/attn_pruning.yaml new file mode 100644 index 000000000..01886607e --- /dev/null +++ b/tests/gpu/torch/_compress/resources/configs/pruning/attn_pruning.yaml @@ -0,0 +1,16 @@ +defaults: + - pruning_defaults + +activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/attn_${pruning.activation_hooks_kwargs.method}/${pruning.experiment_id} + +activation_hooks_kwargs: + method: independent_kv_head_contribution + optimize_for: memory # IndependentKvHeadContributionHook implementation that consumes less memory + target_layer: "self_attn.o_proj" + layer_input_descriptors_path: + +# n_heads_in_group: 4 +# num_attention_heads: 32 # num query heads +# num_kv_heads: 32 / 4 = 8 # num_query_heads // n_heads_in_group +n_heads_in_group_list: [8, 16, 32] # num_kv_heads = [4, 2, 1] +gqa_init_mode: "PruneKVHeads" diff --git a/tests/gpu/torch/_compress/resources/configs/pruning/ffn_pruning.yaml b/tests/gpu/torch/_compress/resources/configs/pruning/ffn_pruning.yaml new file mode 100644 index 000000000..f0c852eec --- /dev/null +++ b/tests/gpu/torch/_compress/resources/configs/pruning/ffn_pruning.yaml @@ -0,0 +1,12 @@ +defaults: + - pruning_defaults + +activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/ffn_${pruning.activation_hooks_kwargs.method}/${pruning.experiment_id} + +activation_hooks_kwargs: + method: iterative + target_layer: "mlp.down_proj" + layer_input_descriptors_path: + +intermediate_size_list: [256] # teacher_intermediate_size is 14336 +mlp_init_mode: "PruneByActivationsLog" diff --git a/tests/gpu/torch/_compress/resources/configs/pruning/hidden_dim_pruning.yaml b/tests/gpu/torch/_compress/resources/configs/pruning/hidden_dim_pruning.yaml new file mode 100644 index 000000000..407c835d8 --- /dev/null +++ b/tests/gpu/torch/_compress/resources/configs/pruning/hidden_dim_pruning.yaml @@ -0,0 +1,15 @@ +defaults: + - pruning_defaults + +activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/hidden_dim_${pruning.activation_hooks_kwargs.method}/${pruning.experiment_id} + +activation_hooks_kwargs: + method: layer_norm_contribution + target_layer: "layernorm" + +# Hidden dimension pruning specific settings +hidden_size_list: [3072, 2048] # Target hidden sizes to prune to +hidden_size_init_mode: "PruneByChannelRanking" +mlp_init_mode: "Truncate" # TODO, make it work with CopyAsIs/FromTeacher +gqa_init_mode: "AverageKV" # TODO, make it work with CopyAsIs/FromTeacher +linear_init_mode: "FromTeacher" diff --git a/tests/gpu/torch/_compress/resources/configs/pruning/pruning_defaults.yaml b/tests/gpu/torch/_compress/resources/configs/pruning/pruning_defaults.yaml new file mode 100644 index 000000000..0a5eafcff --- /dev/null +++ b/tests/gpu/torch/_compress/resources/configs/pruning/pruning_defaults.yaml @@ -0,0 +1,32 @@ +defaults: + - 
/validate_model_defaults + +model_name_or_path: ${teacher_dir} +experiment_id: ${pruning.eval_samples}samples_diverse_mini +activations_log_dir: ??? +activation_hooks_kwargs: ??? + +# Data: +eval_samples: 100 +micro_batch_size: 4 +dataset_path: ${dataset_path} +val_dataset_name: train + +# Prune ckpts +pruned_ckpts_outpt_dir: ${puzzle_dir}/pruning/${pruning.experiment_id} + +## FFN pruning +ffn_list: +mlp_init_mode: "Truncate" + +## KV-heads pruning +n_heads_in_group_list: +gqa_init_mode: "AverageKV" + +## Hidden dimension pruning +hidden_size_list: +hidden_size_init_mode: "PruneByChannelRanking" +linear_init_mode: "FromTeacher" + +mlp_init_config_yaml: + activations_log_dir: ${pruning.activations_log_dir} diff --git a/tests/gpu/torch/_compress/resources/configs/validate_model_defaults.yaml b/tests/gpu/torch/_compress/resources/configs/validate_model_defaults.yaml new file mode 100644 index 000000000..046ff51f6 --- /dev/null +++ b/tests/gpu/torch/_compress/resources/configs/validate_model_defaults.yaml @@ -0,0 +1,15 @@ +block_size: 8192 +bos_rate: 0.5 +data_column: conversation +val_dataset_name: train +shuffle_seed: 81436 +seed: 42 +fim_rate: 0 +fim_spm_rate: 0 +source_datasets_to_discard: +varlen: false +write_results: false +calc_losses_on_cpu: false +activations_log_dir: +model_name_or_path: +load_dataset_fn: ${get_object:utils.data.dataloaders.load_from_disk_fn} diff --git a/tests/gpu/torch/_compress/resources/configs/validate_solutions_defaults.yaml b/tests/gpu/torch/_compress/resources/configs/validate_solutions_defaults.yaml new file mode 100644 index 000000000..ec1390237 --- /dev/null +++ b/tests/gpu/torch/_compress/resources/configs/validate_solutions_defaults.yaml @@ -0,0 +1,10 @@ +defaults: + - /validate_model_defaults + - _self_ + +solutions_to_validate: +skip_validation: false +save_models: false +bigger_is_better: false +sort_solutions_by: +calculate_full_score_ablations: false From 0bc5d8492886376eda41fb9235081886b1e2ea24 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Mon, 27 Oct 2025 12:21:50 +0100 Subject: [PATCH 04/81] Tokenizer used by test_compress.py test. 
Signed-off-by: Daniel Korzekwa --- .../tokenizer/special_tokens_map.json | 16 ++ .../resources/tokenizer/tokenizer.json | 212 ++++++++++++++++++ .../resources/tokenizer/tokenizer_config.json | 13 ++ 3 files changed, 241 insertions(+) create mode 100644 tests/gpu/torch/_compress/resources/tokenizer/special_tokens_map.json create mode 100644 tests/gpu/torch/_compress/resources/tokenizer/tokenizer.json create mode 100644 tests/gpu/torch/_compress/resources/tokenizer/tokenizer_config.json diff --git a/tests/gpu/torch/_compress/resources/tokenizer/special_tokens_map.json b/tests/gpu/torch/_compress/resources/tokenizer/special_tokens_map.json new file mode 100644 index 000000000..02ee80b61 --- /dev/null +++ b/tests/gpu/torch/_compress/resources/tokenizer/special_tokens_map.json @@ -0,0 +1,16 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tests/gpu/torch/_compress/resources/tokenizer/tokenizer.json b/tests/gpu/torch/_compress/resources/tokenizer/tokenizer.json new file mode 100644 index 000000000..83592e249 --- /dev/null +++ b/tests/gpu/torch/_compress/resources/tokenizer/tokenizer.json @@ -0,0 +1,212 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [], + "normalizer": null, + "pre_tokenizer": { + "type": "Sequence", + "pretokenizers": [ + { + "type": "Split", + "pattern": { + "Regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" + }, + "behavior": "Isolated", + "invert": false + }, + { + "type": "ByteLevel", + "add_prefix_space": false, + "trim_offsets": true, + "use_regex": false + } + ] + }, + "post_processor": { + "type": "Sequence", + "processors": [ + { + "type": "ByteLevel", + "add_prefix_space": true, + "trim_offsets": false, + "use_regex": true + }, + { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "<|begin_of_text|>", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "<|begin_of_text|>", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "<|begin_of_text|>", + "type_id": 1 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "<|begin_of_text|>": { + "id": "<|begin_of_text|>", + "ids": [ + 100 + ], + "tokens": [ + "<|begin_of_text|>" + ] + } + } + } + ] + }, + "decoder": { + "type": "ByteLevel", + "add_prefix_space": true, + "trim_offsets": true, + "use_regex": true + }, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": null, + "continuing_subword_prefix": null, + "end_of_word_suffix": null, + "fuse_unk": false, + "byte_fallback": false, + "ignore_merges": true, + "vocab": { + "!": 0, + "\"": 1, + "#": 2, + "$": 3, + "%": 4, + "&": 5, + "'": 6, + "(": 7, + ")": 8, + "*": 9, + "+": 10, + ",": 11, + "-": 12, + ".": 13, + "/": 14, + "0": 15, + "1": 16, + "2": 17, + "3": 18, + "4": 19, + "5": 20, + "6": 21, + "7": 22, + "8": 23, + "9": 24, + ":": 25, + ";": 26, + "<": 27, + "=": 28, + ">": 29, + "?": 30, + "@": 31, + "A": 32, + "B": 33, + "C": 34, + "D": 35, + "E": 36, + "F": 37, + "G": 38, + "H": 39, + "I": 40, + "J": 41, + "K": 42, + "L": 43, + "M": 44, + "N": 45, + "O": 46, + "P": 47, + "Q": 48, + 
"R": 49, + "S": 50, + "T": 51, + "U": 52, + "V": 53, + "W": 54, + "X": 55, + "Y": 56, + "Z": 57, + "[": 58, + "\\": 59, + "]": 60, + "^": 61, + "_": 62, + "`": 63, + "a": 64, + "b": 65, + "c": 66, + "d": 67, + "e": 68, + "f": 69, + "g": 70, + "h": 71, + "i": 72, + "j": 73, + "k": 74, + "l": 75, + "m": 76, + "n": 77, + "o": 78, + "p": 79, + "q": 80, + "r": 81, + "s": 82, + "t": 83, + "u": 84, + "v": 85, + "w": 86, + "x": 87, + "y": 88, + "z": 89, + "{": 90, + "|": 91, + "}": 92, + "~": 93, + "¡": 94, + "¢": 95, + "£": 96, + "¤": 97, + "¥": 98, + "¦": 99, + "<|begin_of_text|>": 100, + "<|eot_id|>": 101 + }, + "merges": [] + } +} diff --git a/tests/gpu/torch/_compress/resources/tokenizer/tokenizer_config.json b/tests/gpu/torch/_compress/resources/tokenizer/tokenizer_config.json new file mode 100644 index 000000000..754d9e8db --- /dev/null +++ b/tests/gpu/torch/_compress/resources/tokenizer/tokenizer_config.json @@ -0,0 +1,13 @@ +{ + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "tokenizer_class": "PreTrainedTokenizer" +} From 87d4fa5930e79b15fe2751940e519eb937600a00 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Mon, 27 Oct 2025 12:22:30 +0100 Subject: [PATCH 05/81] Tokenizer utility used by test_compress.py test Signed-off-by: Daniel Korzekwa --- .../resources/tokenizer/truncate_tokenizer.py | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 tests/gpu/torch/_compress/resources/tokenizer/truncate_tokenizer.py diff --git a/tests/gpu/torch/_compress/resources/tokenizer/truncate_tokenizer.py b/tests/gpu/torch/_compress/resources/tokenizer/truncate_tokenizer.py new file mode 100644 index 000000000..baac5e14c --- /dev/null +++ b/tests/gpu/torch/_compress/resources/tokenizer/truncate_tokenizer.py @@ -0,0 +1,42 @@ +import json + +# Path to your original and new tokenizer.json +in_path = "./tokenizer.json" +out_path = "./tokenizer_truncated.json" + +# How many top tokens to keep +NUM_TO_KEEP = 100 + +with open(in_path, encoding="utf-8") as f: + tokenizer_data = json.load(f) + +# Get and sort the original vocab by index (frequency proxy) +orig_vocab = tokenizer_data["model"]["vocab"] + +# Sort tokens by their original index (lowest index = assumed most common/important) +sorted_tokens = sorted(orig_vocab.items(), key=lambda item: item[1]) + +# Keep the top N tokens +tokens_to_keep = [tok for tok, idx in sorted_tokens[:NUM_TO_KEEP]] + +# Re-index the selected tokens: 0..N-1 
+small_vocab = {tok: i for i, tok in enumerate(tokens_to_keep)} +tokenizer_data["model"]["vocab"] = small_vocab + +# Update vocab size +if "vocab_size" in tokenizer_data["model"]: + tokenizer_data["model"]["vocab_size"] = len(small_vocab) + +# Optionally remove merges if present and unneeded (mostly for BPE/WordPiece) +if "merges" in tokenizer_data["model"]: + tokenizer_data["model"]["merges"] = [] + +# Remove added_tokens if not needed +if "added_tokens" in tokenizer_data: + tokenizer_data["added_tokens"] = [] + +# Write out the truncated tokenizer.json +with open(out_path, "w", encoding="utf-8") as f: + json.dump(tokenizer_data, f, indent=2, ensure_ascii=False) + +print(f"Truncated tokenizer saved to: {out_path}") From ced1e997431c8d88e452dd56804adcca67d88bca Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Mon, 27 Oct 2025 12:22:46 +0100 Subject: [PATCH 06/81] e2e tests for compress.py Signed-off-by: Daniel Korzekwa --- tests/gpu/torch/_compress/test_compress.py | 186 +++++++++++++++++++++ 1 file changed, 186 insertions(+) create mode 100644 tests/gpu/torch/_compress/test_compress.py diff --git a/tests/gpu/torch/_compress/test_compress.py b/tests/gpu/torch/_compress/test_compress.py new file mode 100644 index 000000000..ddcea6aaf --- /dev/null +++ b/tests/gpu/torch/_compress/test_compress.py @@ -0,0 +1,186 @@ +import datetime +import os +import os.path as osp +import shutil +from pathlib import Path + +import pytest +import torch +from logger import mprint +from puzzle_tools.hydra_utils import register_hydra_resolvers +from puzzle_tools.runtime import NativeDDP_Runtime +from scripts.convert_llama3_to_decilm import convert_llama3_to_decilm +from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM, PreTrainedTokenizerBase + +from modelopt.torch._compress import compress +from tests.integration.puzzle_tools.e2e_puzzletron_test.dummy_dataset import save_dummy_dataset + + +@pytest.fixture(scope="module", autouse=True) +def setup_test_module(): + register_hydra_resolvers() + + +@pytest.fixture +def project_root_path(request: pytest.FixtureRequest) -> Path: + return Path(request.config.rootpath) + + +# The e2e test to compress a model based on Local Neural Architecture Search (Mixed Integer Programing NAS search) +# using a one-click command. +# +# Note: Bypass is disabled now in the test. + +# How to run this test (currently only supported internally at Nvidia). +# +# Have both modelopt and puzzle source code in the same directory: +# /workspace/modelopt +# /workspace/puzzletron +# +# submit_job --partition interactive --time 0 \ +# --image gitlab-master.nvidia.com/deci/puzzletron:trtllm_main \ +# --workdir $MODELOPT SRC DIRECTORY --interactive --gpu 1 +# +# pip install mip +# pip install lru-dict +# +# export PYTHONPATH=$PYTHONPATH:/workspace/puzzletron/v1 +# +# ../puzzletron/v1/scripts/torch_dist_runner.sh \ +# pytest -s -v ./tests/gpu/torch/puzzletron/test_compress_model.py -o addopts="" + + +def test_compress(project_root_path): + # The input to puzzletron.compress(). 
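    # puzzle_dir must be visible to every rank: rank 0 populates it below and the
    # other ranks block on runtime.wait_for_everyone() before compress() is called.
    # hydra_config_dir points at the search-space configs added by this patch series
    # (Llama-3_1-8B.yaml plus the pruning/ and validation defaults).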
+ os.environ["WANDB_DISABLED"] = "true" + puzzle_dir = "/tmp/pytest-shared/test_compress_model" + dataset_path = osp.join(puzzle_dir, "dummy_dataset") + hydra_config_dir = osp.join( + project_root_path, + "tests/gpu/torch/_compress/resources/configs", + ) + + _runtime = NativeDDP_Runtime( + dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10) + ) + + with _runtime as runtime: + # + # Test setup + # + if runtime.global_rank == 0: + # Setup puzzle_dir and dataset + setup_puzzle_dir(puzzle_dir) + save_dummy_dataset(dataset_path) + + # + # Step 1: Create and save a teacher model to compress + # This mimics the normal pipeline where we start with a Llama model + # + tokenizer_path = osp.join( + project_root_path, "tests/gpu/torch/_compress/resources/tokenizer" + ) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) + + # Create a small Llama model (not DeciLM) to match the normal conversion pipeline + hf_ckpt_teacher_dir = "ckpts/teacher" + llama_checkpoint_path = osp.join(puzzle_dir, hf_ckpt_teacher_dir) + create_and_save_small_llama_model( + llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer + ) + + # Use the full conversion pipeline (matches normal usage) + convert_llama3_to_decilm( + input_dir=llama_checkpoint_path, + output_dir=llama_checkpoint_path, + ) + runtime.wait_for_everyone() + + # Compress the model using a one-click approach + compress.compress(hydra_config_dir, "Llama-3_1-8B", puzzle_dir, dataset_path, runtime) + + # + # Check assertions + # + if runtime.global_rank == 0: + # assertions for the score_pruning_activations step 1 + rank = int(os.environ["RANK"]) + rank_filepath = ( + f"pruning/pruning_scores/ffn_iterative/100samples_diverse_mini/rank_{rank}.pth" + ) + assert os.path.isfile(osp.join(puzzle_dir, rank_filepath)) + + # assertions for the pruning_ckpts step 2 + assert os.path.exists(osp.join(puzzle_dir, "ckpts/ffn_256_attn_no_op")) + + # assertions fo bypass distillation step 3 + # TODO: Add bypass distillation step + # assert os.path.exists(osp.join(hydra_cfg.bypass.experiment_dir, "latest/config.json")) + + # assertions for the build_library_and_stats step 4 + assert os.path.isfile(osp.join(puzzle_dir, "replacement_library.json")) + assert os.path.isfile(osp.join(puzzle_dir, "subblock_stats.json")) + + # assertions for the scoring step 5 + solution_0_filepath = osp.join( + puzzle_dir, "single_sequence_replacement_solutions--validation/solution_0.json" + ) + assert os.path.exists(solution_0_filepath) + + # assertions for the mip_and_realize_models step 6 + solution_0_ckpt_config_path = osp.join( + puzzle_dir, + "mip/puzzle_solutions/target_memory_780000MiB/solutions--checkpoints/solution_0/config.json", + ) + assert os.path.exists(solution_0_ckpt_config_path) + assert os.path.exists( + osp.join(puzzle_dir, "mip/puzzle_solutions/target_memory_780000MiB/solutions.json") + ) + + runtime.wait_for_everyone() + + mprint("PYTEST SUMMARY: test_compress_model() test has finished successfully") + + +def create_and_save_small_llama_model( + output_path: str, vocab_size: int, tokenizer: PreTrainedTokenizerBase +): + """ + Create and save a small Llama model for testing the conversion pipeline. + This mimics having a real Llama checkpoint that needs to be converted. 
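    The checkpoint is written in bfloat16 and saved together with the tokenizer and
    config, so the resulting directory can be passed directly to
    convert_llama3_to_decilm; sizing constraints (e.g. intermediate_size divisible
    by 256) are documented in the inline comments below.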
+ """ + os.makedirs(output_path, exist_ok=True) + + # Create a minimal Llama config (small for testing) + # Note: intermediate_size must be divisible by 256 per DeciLM config requirements + # Note: hidden_size must give head_dim >= 8 for Flash Attention 2 compatibility + llama_config = LlamaConfig( + vocab_size=vocab_size, + hidden_size=256, # 32 heads times 8 head_dim = 256 (matches bypass config expectations) + intermediate_size=512, # Must be divisible by 256 + num_hidden_layers=2, + num_attention_heads=32, # Matches original test + num_key_value_heads=8, # GQA: 32÷4=8 (matches original n_heads_in_group=4) + max_position_embeddings=512, + rms_norm_eps=1e-5, + rope_theta=10000.0, + attention_bias=False, + hidden_act="silu", + tie_word_embeddings=False, + ) + + # Create and save the Llama model + model = LlamaForCausalLM(llama_config) + model.to(dtype=torch.bfloat16).save_pretrained(output_path) + + # Save tokenizer + tokenizer.save_pretrained(output_path) + + # Save config + llama_config.save_pretrained(output_path) + + +def setup_puzzle_dir(puzzle_dir: str): + if Path(puzzle_dir).exists(): + shutil.rmtree(puzzle_dir) + Path(puzzle_dir).mkdir(parents=True, exist_ok=True) From 5de0bdc6846d4c707c25440c12daf55993f53169 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Mon, 27 Oct 2025 12:45:28 +0100 Subject: [PATCH 07/81] Add convert_llama3_config_to_decilm_config + unit test Signed-off-by: Daniel Korzekwa --- .../converters/convert_llama3_to_decilm.py | 136 ++++++++++++++++++ ..._convert_llama3_config_to_decilm_config.py | 45 ++++++ 2 files changed, 181 insertions(+) create mode 100644 modelopt/torch/_compress/decilm/converters/convert_llama3_to_decilm.py create mode 100644 tests/gpu/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py diff --git a/modelopt/torch/_compress/decilm/converters/convert_llama3_to_decilm.py b/modelopt/torch/_compress/decilm/converters/convert_llama3_to_decilm.py new file mode 100644 index 000000000..6cdd1f02c --- /dev/null +++ b/modelopt/torch/_compress/decilm/converters/convert_llama3_to_decilm.py @@ -0,0 +1,136 @@ +"""Convert a Llama3 model to a DeciLM model.""" + +#!/usr/bin/env python3 +from pathlib import Path + +from fire import Fire +from puzzle_tools.checkpoint_utils import copy_tokenizer +from puzzle_tools.checkpoint_utils_hf import copy_deci_lm_hf_code +from puzzle_tools.conversion_utils import convert_model_weights_to_decilm +from puzzle_tools.deci_lm_hf_code.configuration_decilm import DeciLMConfig +from transformers import LlamaConfig + +""" +example: + +python -m scripts.hf.convert_llama3_to_decilm \ + --input_dir .../meta-llama/Meta-Llama-3.1-8B-Instruct \ + --output_dir .../meta-llama/Meta-Llama-3.1-8B-Instruct--deci-hf/ +""" + + +def convert_llama3_config_to_decilm_config(config: LlamaConfig) -> DeciLMConfig: + """Convert Llama3 config to DeciLM config format.""" + print("\n=== Converting Llama3 Config to DeciLM Config ===") + + # Get dtype from config - check both dtype and torch_dtype + # Prefer dtype if it's set (not None), otherwise fall back to torch_dtype + dtype = getattr(config, "dtype", None) + if dtype is None: + dtype = getattr(config, "torch_dtype", None) + + # Convert torch.dtype to string if needed (for JSON serialization) + if dtype is not None and hasattr(dtype, "__module__") and "torch" in dtype.__module__: + dtype = str(dtype).replace("torch.", "") + + # Track which global values will be removed (moved to per-layer configs) + print("\n📝 Converting global values to per-layer block_configs:") + 
print( + f" - intermediate_size: {config.intermediate_size} → block_configs[*].ffn.intermediate_size" + ) + print( + f" - num_key_value_heads: {config.num_key_value_heads} → block_configs[*].attention.n_heads_in_group (derived)" + ) + print(f" - hidden_act: {config.hidden_act} → block_configs[*].ffn.hidden_act") + print( + f" - sliding_window: {getattr(config, 'sliding_window', None)} → block_configs[*].attention.window_length" + ) + + # Create block configs for each layer + block_configs = [] + for i in range(config.num_hidden_layers): + # Configure attention + attention_config = { + "no_op": False, + "replace_with_linear": False, + "sparsify": None, + "n_heads_in_group": config.num_attention_heads // config.num_key_value_heads, + "window_length": None, # Llama3 doesn't use sliding window by default + "num_sink_tokens": None, # Llama3 doesn't use sink attention + "use_prefill_window_in_sink_attention": False, + "unshifted_sink": False, + "mamba": None, + "llama4": None, # No Llama4 specific attention for Llama3 + } + + # Configure FFN + ffn_config = { + "no_op": False, + "replace_with_linear": False, + "sparsify": None, + "intermediate_size": config.intermediate_size, + "gated": True, # Llama3 uses SwiGLU + "hidden_act": config.hidden_act, + "moe": None, # Llama3 doesn't use MoE + } + + block_configs.append({"attention": attention_config, "ffn": ffn_config}) + + # Create DeciLM config + decilm_config = DeciLMConfig( + block_configs=block_configs, + hidden_size=config.hidden_size, + max_position_embeddings=config.max_position_embeddings, + num_attention_heads=config.num_attention_heads, + num_hidden_layers=config.num_hidden_layers, + tie_word_embeddings=config.tie_word_embeddings, + vocab_size=config.vocab_size, + rms_norm_eps=config.rms_norm_eps, + attention_bias=config.attention_bias, + o_proj_bias=config.attention_bias, # llama3 bias defined by attention_bias + rope_theta=config.rope_theta, + rope_scaling=config.rope_scaling, + position_embedding_type="rope", # Llama3 uses standard RoPE + architectures=["DeciLMForCausalLM"], + auto_map={ + "AutoConfig": "configuration_decilm.DeciLMConfig", + "AutoModelForCausalLM": "modeling_decilm.DeciLMForCausalLM", + }, + eos_token_id=config.eos_token_id, + bos_token_id=config.bos_token_id, + pad_token_id=config.pad_token_id, + head_dim=getattr(config, "head_dim", config.hidden_size // config.num_attention_heads), + dtype=dtype, + ) + + print(f"\n✓ Created DeciLM config with {len(block_configs)} layers") + print( + "✓ Global per-layer keys (intermediate_size, num_key_value_heads, hidden_act, sliding_window)" + ) + print(" will be removed from saved config and are only in block_configs") + + return decilm_config + + +def convert_configs_in_dirs(input_dir, output_dir): + """Convert the config of a Llama3 model to a DeciLM model.""" + input_dir = Path(input_dir) + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + input_config_path = input_dir / "config.json" + config = LlamaConfig.from_pretrained(input_config_path) + decilm_config = convert_llama3_config_to_decilm_config(config) + decilm_config.save_pretrained(output_dir) + + +def convert_llama3_to_decilm(input_dir, output_dir): + """Convert a Llama3 model to a DeciLM model.""" + convert_configs_in_dirs(input_dir, output_dir) + copy_tokenizer(input_dir, output_dir) + convert_model_weights_to_decilm(input_dir, output_dir) + copy_deci_lm_hf_code(output_dir) + + +if __name__ == "__main__": + Fire(convert_llama3_to_decilm) diff --git 
a/tests/gpu/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py b/tests/gpu/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py new file mode 100644 index 000000000..4bab4d505 --- /dev/null +++ b/tests/gpu/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py @@ -0,0 +1,45 @@ +import json +import os.path as osp +from pathlib import Path + +import pytest +from gpu.torch._compress.test_compress import create_and_save_small_llama_model +from transformers import AutoTokenizer + +from modelopt.torch._compress.decilm.converters.convert_llama3_to_decilm import ( + convert_llama3_to_decilm, +) + + +@pytest.fixture +def project_root_path(request: pytest.FixtureRequest) -> Path: + return Path(request.config.rootpath) + + +def test_convert_llama3_config_to_decilm_config(project_root_path: Path, tmp_path: Path): + tokenizer_path = osp.join(project_root_path, "tests/gpu/torch/_compress/resources/tokenizer") + tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) + + llama_checkpoint_path = tmp_path / "llama_checkpoint" + create_and_save_small_llama_model( + llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer + ) + + # Convert the Llama model to a DeciLM model + decilm_checkpoint_path = tmp_path / "decilm_checkpoint" + convert_llama3_to_decilm( + input_dir=llama_checkpoint_path, + output_dir=decilm_checkpoint_path, + ) + + # Assert that the converted config has the correct number of block_configs + config_path = decilm_checkpoint_path / "config.json" + assert config_path.exists(), f"Config file not found at {config_path}" + + with open(config_path) as f: + decilm_config = json.load(f) + + # Verify block_configs exists and has the correct length + assert "block_configs" in decilm_config, "block_configs not found in converted config" + actual_num_block_configs = len(decilm_config["block_configs"]) + assert actual_num_block_configs == 2 From 800414c275994ac44ac9881d5839f0e9a2aa0c1e Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Mon, 27 Oct 2025 15:35:51 +0100 Subject: [PATCH 08/81] Remove unused bypass distillation config files. 
Signed-off-by: Daniel Korzekwa --- modelopt/torch/_compress/compress.py | 4 - .../resources/configs/Llama-3_1-8B.yaml | 2 +- .../bypass/bypass_distillation_defaults.yaml | 116 ------------------ .../configs/bypass/llama-3_1-8b_bypass.yaml | 38 ------ 4 files changed, 1 insertion(+), 159 deletions(-) delete mode 100644 tests/gpu/torch/_compress/resources/configs/bypass/bypass_distillation_defaults.yaml delete mode 100644 tests/gpu/torch/_compress/resources/configs/bypass/llama-3_1-8b_bypass.yaml diff --git a/modelopt/torch/_compress/compress.py b/modelopt/torch/_compress/compress.py index c0661259c..97819a42b 100644 --- a/modelopt/torch/_compress/compress.py +++ b/modelopt/torch/_compress/compress.py @@ -53,10 +53,6 @@ def compress( pruning_ckpts.launch_prune_ckpt(hydra_cfg) runtime.wait_for_everyone() - # # Step 3: bypass distillation (distributed processing) - # # TODO: Add bypass distillation step - # #run_bypassed_training(hydra_cfg, runtime) - # Step 4: build_library_and_stats (single process) if runtime.global_rank == 0: build_library_and_stats.launch_build_library_and_stats(hydra_cfg) diff --git a/tests/gpu/torch/_compress/resources/configs/Llama-3_1-8B.yaml b/tests/gpu/torch/_compress/resources/configs/Llama-3_1-8B.yaml index 98c7b746c..1d8fac655 100644 --- a/tests/gpu/torch/_compress/resources/configs/Llama-3_1-8B.yaml +++ b/tests/gpu/torch/_compress/resources/configs/Llama-3_1-8B.yaml @@ -2,7 +2,7 @@ defaults: - pruning: ffn_pruning - scoring: ../validate_solutions_defaults - realize_model: ../validate_solutions_defaults - - bypass: llama-3_1-8b_bypass + - bypass: - override hydra/hydra_logging: disabled - _self_ diff --git a/tests/gpu/torch/_compress/resources/configs/bypass/bypass_distillation_defaults.yaml b/tests/gpu/torch/_compress/resources/configs/bypass/bypass_distillation_defaults.yaml deleted file mode 100644 index c48f47f69..000000000 --- a/tests/gpu/torch/_compress/resources/configs/bypass/bypass_distillation_defaults.yaml +++ /dev/null @@ -1,116 +0,0 @@ -# defaults: -# - ../validate_model_defaults # TODO: Unify this default YAML with KD base YAML, for a "training defaults" configurations - -# Runtime Configuration -dtype: "bf16" # Model precision: bf16 for efficiency, fp32 for stability -seed: 42 # Random seed for reproducibility - -# Experiment Tracking -experiment_id: # Unique identifier for this experiment. 
Will be dynamically set -iter_num: 1 # Current iteration number -step_num: 1 # Current step number within iteration -token_count: 0 # Token count tracker (auto-updated during training) - -# Data Configuration -data: - data_column: "conversation" - block_size: 8192 # Sequence length (tokens per sample) - bos_rate: 0.5 - fim_rate: 0 - fim_spm_rate: 0 - source_datasets_to_discard: [] - load_from_disk: true # Load preprocessed data from disk or from stream - keep_in_memory: false - val_dataset_name: valid - max_eval_samples: 256 - eval_samples_per_process: # Samples per GPU during distributed eval (auto if null) - -# Training Configuration -training: - learning_rate: 1e-4 # Initial learning rate (1e-4 = 0.0001) - training_tokens: 1e+7 # Total training tokens (1B tokens) - micro_batch_size: 4 - val_micro_batch_size: 2 - warmup_ratio: 0.05 - warmup_steps: ${warmup_steps:${.training_tokens},${..data.block_size},${.micro_batch_size},${.warmup_ratio}} # Auto-calculated warmup steps - min_lr_factor: 1e-5 - grad_accumulation_steps: 1 - skip_first_batches: 0 - weight_decay: 0.1 - decay_lr: true - beta1: 0.9 - beta2: 0.95 - use_grad_scaling: false - grad_clip: 1.0 - grad_clip_type: norm - clipping_count: 0 - log_interval: 100 - eval_interval: 100 - -# Model Loading Configuration -resume_checkpoint_path: # Path to resume training from checkpoint -parameter_count: -init_checkpoint_path: # Path to initialize weights from - -model: - student_weights_dtype: "bf16" # Student model weight precision - - model_overrides: - delete_old_checkpoints: true # Clean up old checkpoints to save disk space - save_interval_seconds: 12900 # Save checkpoint every ~3.5 hours - save_interval: 1e+9 # Save checkpoint every 1B steps (effectively disabled) - save_checkpoint_when_done: true # Save final checkpoint when training completes - - # Architecture modifications for student model - model_config_overrides: - ffn: - - intermediate_size: 256 - replace_with_linear: false # Replace with simple linear layer (true/false) - no_op: false # Disable FFN entirely (true/false) - attention: - - n_heads_in_group: 8 # Number of heads per group (for GQA) - replace_with_linear: false # Replace attention with linear layer (true/false) - no_op: false # Disable attention entirely (true/false) - # Sliding window attention length. Commenting this line so that the default value will be used. - #window_length: ??? - -# Model Factory Configuration - Controls student model creation and initialization -model_factory: - factory: gqa_factory_fn # Factory function for creating GQA (Grouped Query Attention) models - block_loss_func: normalized_mse_loss # Loss function for comparing teacher/student blocks. vectorwise_normalized_mse_loss / batched_normalized_mse_loss / normalized_mse_loss - blocks_to_copy_indexes: # Which teacher blocks to copy unchanged (null = determine automatically) - gqa_init_mode: AverageKV # How to initialize K/V heads in GQA. All options here: GQAInitMode - mlp_init_mode: Truncate # MLP initialization. All options here: MlpInitMode - mlp_init_config: # Configuration for MLP initialization (if needed) - activations_log_dir: # Directory with activation statistics (required for PruneByActivationsLog) - linear_init_mode: FromTeacher # How to initialize linear layers: FromTeacher, Random, etc. - student_module_for_bypass: block # Which module to train as student. - submodule_for_loss_calculation: # Specific submodule for loss calc. - keys_to_learn: # What parameters to train. Either "entire_block", or specific submodules. 
Computed dynamically. - -# Validation Configuration -disable_initial_validate: false -validate_teacher_model: true -validate_student_model: true -disable_validation: false # Disable all validation (TODO: Not working yet) -best_val_loss: 1e+9 # Track best validation loss achieved - -# Performance Optimization -compile: false # Use PyTorch compilation (TODO: CURRENTLY NOT WORKING) -disable_fa2: false # Disable Flash Attention 2 (false = use FA2 if available) -teacher_model_load_on_cpu: false - -# Checkpoint Management -save_checkpoint_before_training: true # Save initial checkpoint before training -disable_checkpoint_save: false # Disable all checkpoint saving -save_best_ckpt: true # Save checkpoint when validation improves -kill_after_first_save: false # Exit after first checkpoint save (for testing) -realize_best_or_latest: "latest" - -# Experiment Tracking (Weights & Biases) -wandb_log: false # Enable wandb logging -wandb: - entity: ??? # Must be set: wandb team/user name - mode: ??? # Must be set: "online", "offline", or "disabled" - project: ??? # Must be set: wandb project name - run_name: ??? # Must be set: name for this specific run diff --git a/tests/gpu/torch/_compress/resources/configs/bypass/llama-3_1-8b_bypass.yaml b/tests/gpu/torch/_compress/resources/configs/bypass/llama-3_1-8b_bypass.yaml deleted file mode 100644 index 87341e72d..000000000 --- a/tests/gpu/torch/_compress/resources/configs/bypass/llama-3_1-8b_bypass.yaml +++ /dev/null @@ -1,38 +0,0 @@ -defaults: - - bypass_distillation_defaults - -# Model & Runtime Configuration - -# Data type for model weights and computations (bfloat16 for efficiency) -dtype: "bf16" - -# Unique identifier for this experiment (must be set when running) -experiment_id: - -# Data Configuration Overrides -data: - max_eval_samples: 10 - -# Model Factory Configuration -model_factory: - mlp_init_mode: PruneByActivationsLog - - mlp_init_config: - # REQUIRED: Path to directory containing activation statistics/logs - # This should point to precomputed activation data. - # Replace with the directory you want to init your FFN from. 
- # Example path for NRT cluster: /lustre/fs1/portfolios/llmservice/projects/llmservice_deci_vlm/users/tkeren/puzzle/lior_exp/puzzle_kd-hidden-dim-4096_tokens-5e9_logits/pruning/pruning_scores/ffn_iterative/20000samples_diverse_mini - activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/ffn_iterative/100samples_diverse_mini - -disable_initial_validate: false - -save_checkpoint_before_training: false - -wandb_log: false -wandb: - # Organization/team name in wandb - entity: nv-aim - # Project name for organizing related experiments - project: puzzletron_bypass_distillation - mode: online - run_name: ${..experiment_id} From 16abcc9f1643ac372854afcd816b8a37e6356fed Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Mon, 27 Oct 2025 15:38:33 +0100 Subject: [PATCH 09/81] Moving integration tests to tests/experimental to not trigger CICD Signed-off-by: Daniel Korzekwa --- .../torch/_compress/resources/configs/Llama-3_1-8B.yaml | 0 .../_compress/resources/configs/pruning/attn_pruning.yaml | 0 .../_compress/resources/configs/pruning/ffn_pruning.yaml | 0 .../resources/configs/pruning/hidden_dim_pruning.yaml | 0 .../_compress/resources/configs/pruning/pruning_defaults.yaml | 0 .../_compress/resources/configs/validate_model_defaults.yaml | 0 .../resources/configs/validate_solutions_defaults.yaml | 0 .../_compress/resources/tokenizer/special_tokens_map.json | 0 .../torch/_compress/resources/tokenizer/tokenizer.json | 0 .../torch/_compress/resources/tokenizer/tokenizer_config.json | 0 .../torch/_compress/resources/tokenizer/truncate_tokenizer.py | 0 tests/{gpu => experimental}/torch/_compress/test_compress.py | 4 ++-- 12 files changed, 2 insertions(+), 2 deletions(-) rename tests/{gpu => experimental}/torch/_compress/resources/configs/Llama-3_1-8B.yaml (100%) rename tests/{gpu => experimental}/torch/_compress/resources/configs/pruning/attn_pruning.yaml (100%) rename tests/{gpu => experimental}/torch/_compress/resources/configs/pruning/ffn_pruning.yaml (100%) rename tests/{gpu => experimental}/torch/_compress/resources/configs/pruning/hidden_dim_pruning.yaml (100%) rename tests/{gpu => experimental}/torch/_compress/resources/configs/pruning/pruning_defaults.yaml (100%) rename tests/{gpu => experimental}/torch/_compress/resources/configs/validate_model_defaults.yaml (100%) rename tests/{gpu => experimental}/torch/_compress/resources/configs/validate_solutions_defaults.yaml (100%) rename tests/{gpu => experimental}/torch/_compress/resources/tokenizer/special_tokens_map.json (100%) rename tests/{gpu => experimental}/torch/_compress/resources/tokenizer/tokenizer.json (100%) rename tests/{gpu => experimental}/torch/_compress/resources/tokenizer/tokenizer_config.json (100%) rename tests/{gpu => experimental}/torch/_compress/resources/tokenizer/truncate_tokenizer.py (100%) rename tests/{gpu => experimental}/torch/_compress/test_compress.py (97%) diff --git a/tests/gpu/torch/_compress/resources/configs/Llama-3_1-8B.yaml b/tests/experimental/torch/_compress/resources/configs/Llama-3_1-8B.yaml similarity index 100% rename from tests/gpu/torch/_compress/resources/configs/Llama-3_1-8B.yaml rename to tests/experimental/torch/_compress/resources/configs/Llama-3_1-8B.yaml diff --git a/tests/gpu/torch/_compress/resources/configs/pruning/attn_pruning.yaml b/tests/experimental/torch/_compress/resources/configs/pruning/attn_pruning.yaml similarity index 100% rename from tests/gpu/torch/_compress/resources/configs/pruning/attn_pruning.yaml rename to 
tests/experimental/torch/_compress/resources/configs/pruning/attn_pruning.yaml diff --git a/tests/gpu/torch/_compress/resources/configs/pruning/ffn_pruning.yaml b/tests/experimental/torch/_compress/resources/configs/pruning/ffn_pruning.yaml similarity index 100% rename from tests/gpu/torch/_compress/resources/configs/pruning/ffn_pruning.yaml rename to tests/experimental/torch/_compress/resources/configs/pruning/ffn_pruning.yaml diff --git a/tests/gpu/torch/_compress/resources/configs/pruning/hidden_dim_pruning.yaml b/tests/experimental/torch/_compress/resources/configs/pruning/hidden_dim_pruning.yaml similarity index 100% rename from tests/gpu/torch/_compress/resources/configs/pruning/hidden_dim_pruning.yaml rename to tests/experimental/torch/_compress/resources/configs/pruning/hidden_dim_pruning.yaml diff --git a/tests/gpu/torch/_compress/resources/configs/pruning/pruning_defaults.yaml b/tests/experimental/torch/_compress/resources/configs/pruning/pruning_defaults.yaml similarity index 100% rename from tests/gpu/torch/_compress/resources/configs/pruning/pruning_defaults.yaml rename to tests/experimental/torch/_compress/resources/configs/pruning/pruning_defaults.yaml diff --git a/tests/gpu/torch/_compress/resources/configs/validate_model_defaults.yaml b/tests/experimental/torch/_compress/resources/configs/validate_model_defaults.yaml similarity index 100% rename from tests/gpu/torch/_compress/resources/configs/validate_model_defaults.yaml rename to tests/experimental/torch/_compress/resources/configs/validate_model_defaults.yaml diff --git a/tests/gpu/torch/_compress/resources/configs/validate_solutions_defaults.yaml b/tests/experimental/torch/_compress/resources/configs/validate_solutions_defaults.yaml similarity index 100% rename from tests/gpu/torch/_compress/resources/configs/validate_solutions_defaults.yaml rename to tests/experimental/torch/_compress/resources/configs/validate_solutions_defaults.yaml diff --git a/tests/gpu/torch/_compress/resources/tokenizer/special_tokens_map.json b/tests/experimental/torch/_compress/resources/tokenizer/special_tokens_map.json similarity index 100% rename from tests/gpu/torch/_compress/resources/tokenizer/special_tokens_map.json rename to tests/experimental/torch/_compress/resources/tokenizer/special_tokens_map.json diff --git a/tests/gpu/torch/_compress/resources/tokenizer/tokenizer.json b/tests/experimental/torch/_compress/resources/tokenizer/tokenizer.json similarity index 100% rename from tests/gpu/torch/_compress/resources/tokenizer/tokenizer.json rename to tests/experimental/torch/_compress/resources/tokenizer/tokenizer.json diff --git a/tests/gpu/torch/_compress/resources/tokenizer/tokenizer_config.json b/tests/experimental/torch/_compress/resources/tokenizer/tokenizer_config.json similarity index 100% rename from tests/gpu/torch/_compress/resources/tokenizer/tokenizer_config.json rename to tests/experimental/torch/_compress/resources/tokenizer/tokenizer_config.json diff --git a/tests/gpu/torch/_compress/resources/tokenizer/truncate_tokenizer.py b/tests/experimental/torch/_compress/resources/tokenizer/truncate_tokenizer.py similarity index 100% rename from tests/gpu/torch/_compress/resources/tokenizer/truncate_tokenizer.py rename to tests/experimental/torch/_compress/resources/tokenizer/truncate_tokenizer.py diff --git a/tests/gpu/torch/_compress/test_compress.py b/tests/experimental/torch/_compress/test_compress.py similarity index 97% rename from tests/gpu/torch/_compress/test_compress.py rename to 
tests/experimental/torch/_compress/test_compress.py index ddcea6aaf..565c94423 100644 --- a/tests/gpu/torch/_compress/test_compress.py +++ b/tests/experimental/torch/_compress/test_compress.py @@ -57,7 +57,7 @@ def test_compress(project_root_path): dataset_path = osp.join(puzzle_dir, "dummy_dataset") hydra_config_dir = osp.join( project_root_path, - "tests/gpu/torch/_compress/resources/configs", + "tests/experimental/torch/_compress/resources/configs", ) _runtime = NativeDDP_Runtime( @@ -78,7 +78,7 @@ def test_compress(project_root_path): # This mimics the normal pipeline where we start with a Llama model # tokenizer_path = osp.join( - project_root_path, "tests/gpu/torch/_compress/resources/tokenizer" + project_root_path, "tests/experimental/torch/_compress/resources/tokenizer" ) tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) From a5ba1c7023aa471304a1643261cfb6ce8101be67 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Mon, 27 Oct 2025 15:44:59 +0100 Subject: [PATCH 10/81] update docs Signed-off-by: Daniel Korzekwa --- .../_compress/resources/tokenizer/truncate_tokenizer.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/experimental/torch/_compress/resources/tokenizer/truncate_tokenizer.py b/tests/experimental/torch/_compress/resources/tokenizer/truncate_tokenizer.py index baac5e14c..1c911ac76 100644 --- a/tests/experimental/torch/_compress/resources/tokenizer/truncate_tokenizer.py +++ b/tests/experimental/torch/_compress/resources/tokenizer/truncate_tokenizer.py @@ -1,3 +1,8 @@ +""" +This script was used to truncate the tokenizer.json file from Llama 3.1 8B model +to keep only the top 100 most common tokens. +""" + import json # Path to your original and new tokenizer.json From 1bda391134370c65ddb600eacb67f81760f709ae Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Mon, 27 Oct 2025 16:34:42 +0100 Subject: [PATCH 11/81] Replace mprint with print and replace osp.join with path1 / path2 notation. Signed-off-by: Daniel Korzekwa --- .../torch/_compress/test_compress.py | 50 +++++++++---------- 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/tests/experimental/torch/_compress/test_compress.py b/tests/experimental/torch/_compress/test_compress.py index 565c94423..c6547847e 100644 --- a/tests/experimental/torch/_compress/test_compress.py +++ b/tests/experimental/torch/_compress/test_compress.py @@ -1,12 +1,10 @@ import datetime import os -import os.path as osp import shutil from pathlib import Path import pytest import torch -from logger import mprint from puzzle_tools.hydra_utils import register_hydra_resolvers from puzzle_tools.runtime import NativeDDP_Runtime from scripts.convert_llama3_to_decilm import convert_llama3_to_decilm @@ -53,12 +51,9 @@ def project_root_path(request: pytest.FixtureRequest) -> Path: def test_compress(project_root_path): # The input to puzzletron.compress(). 
os.environ["WANDB_DISABLED"] = "true" - puzzle_dir = "/tmp/pytest-shared/test_compress_model" - dataset_path = osp.join(puzzle_dir, "dummy_dataset") - hydra_config_dir = osp.join( - project_root_path, - "tests/experimental/torch/_compress/resources/configs", - ) + puzzle_dir = Path("/tmp/pytest-shared/test_compress_model") + dataset_path = puzzle_dir / "dummy_dataset" + hydra_config_dir = project_root_path / "tests/experimental/torch/_compress/resources/configs" _runtime = NativeDDP_Runtime( dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10) @@ -77,14 +72,15 @@ def test_compress(project_root_path): # Step 1: Create and save a teacher model to compress # This mimics the normal pipeline where we start with a Llama model # - tokenizer_path = osp.join( - project_root_path, "tests/experimental/torch/_compress/resources/tokenizer" + tokenizer_path = ( + project_root_path / "tests/experimental/torch/_compress/resources/tokenizer" ) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) # Create a small Llama model (not DeciLM) to match the normal conversion pipeline hf_ckpt_teacher_dir = "ckpts/teacher" - llama_checkpoint_path = osp.join(puzzle_dir, hf_ckpt_teacher_dir) + llama_checkpoint_path = puzzle_dir / hf_ckpt_teacher_dir create_and_save_small_llama_model( llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer ) @@ -97,7 +93,9 @@ def test_compress(project_root_path): runtime.wait_for_everyone() # Compress the model using a one-click approach - compress.compress(hydra_config_dir, "Llama-3_1-8B", puzzle_dir, dataset_path, runtime) + compress.compress( + str(hydra_config_dir), "Llama-3_1-8B", str(puzzle_dir), str(dataset_path), runtime + ) # # Check assertions @@ -108,38 +106,36 @@ def test_compress(project_root_path): rank_filepath = ( f"pruning/pruning_scores/ffn_iterative/100samples_diverse_mini/rank_{rank}.pth" ) - assert os.path.isfile(osp.join(puzzle_dir, rank_filepath)) + assert os.path.isfile(puzzle_dir / rank_filepath) # assertions for the pruning_ckpts step 2 - assert os.path.exists(osp.join(puzzle_dir, "ckpts/ffn_256_attn_no_op")) - - # assertions fo bypass distillation step 3 - # TODO: Add bypass distillation step - # assert os.path.exists(osp.join(hydra_cfg.bypass.experiment_dir, "latest/config.json")) + assert os.path.exists(puzzle_dir / "ckpts/ffn_256_attn_no_op") # assertions for the build_library_and_stats step 4 - assert os.path.isfile(osp.join(puzzle_dir, "replacement_library.json")) - assert os.path.isfile(osp.join(puzzle_dir, "subblock_stats.json")) + assert os.path.isfile(puzzle_dir / "replacement_library.json") + assert os.path.isfile(puzzle_dir / "subblock_stats.json") # assertions for the scoring step 5 - solution_0_filepath = osp.join( - puzzle_dir, "single_sequence_replacement_solutions--validation/solution_0.json" + solution_0_filepath = ( + puzzle_dir / "single_sequence_replacement_solutions--validation/solution_0.json" ) + assert os.path.exists(solution_0_filepath) # assertions for the mip_and_realize_models step 6 - solution_0_ckpt_config_path = osp.join( - puzzle_dir, - "mip/puzzle_solutions/target_memory_780000MiB/solutions--checkpoints/solution_0/config.json", + solution_0_ckpt_config_path = ( + puzzle_dir + / "mip/puzzle_solutions/target_memory_780000MiB/solutions--checkpoints/solution_0/config.json" ) + assert os.path.exists(solution_0_ckpt_config_path) assert os.path.exists( - osp.join(puzzle_dir, "mip/puzzle_solutions/target_memory_780000MiB/solutions.json") + puzzle_dir / 
"mip/puzzle_solutions/target_memory_780000MiB/solutions.json" ) runtime.wait_for_everyone() - mprint("PYTEST SUMMARY: test_compress_model() test has finished successfully") + print("PYTEST SUMMARY: test_compress_model() test has finished successfully") def create_and_save_small_llama_model( From bb38401971709adb97ba09d1a2017150dfd3c672 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Mon, 27 Oct 2025 16:39:04 +0100 Subject: [PATCH 12/81] Refactor file checking assertions to use .is_file() and .exists() Signed-off-by: Daniel Korzekwa --- .../torch/_compress/test_compress.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tests/experimental/torch/_compress/test_compress.py b/tests/experimental/torch/_compress/test_compress.py index c6547847e..2245bbd4e 100644 --- a/tests/experimental/torch/_compress/test_compress.py +++ b/tests/experimental/torch/_compress/test_compress.py @@ -106,21 +106,22 @@ def test_compress(project_root_path): rank_filepath = ( f"pruning/pruning_scores/ffn_iterative/100samples_diverse_mini/rank_{rank}.pth" ) - assert os.path.isfile(puzzle_dir / rank_filepath) + assert (puzzle_dir / rank_filepath).is_file() # assertions for the pruning_ckpts step 2 - assert os.path.exists(puzzle_dir / "ckpts/ffn_256_attn_no_op") + assert (puzzle_dir / "ckpts/ffn_256_attn_no_op").exists() # assertions for the build_library_and_stats step 4 - assert os.path.isfile(puzzle_dir / "replacement_library.json") - assert os.path.isfile(puzzle_dir / "subblock_stats.json") + + assert (puzzle_dir / "replacement_library.json").is_file() + assert (puzzle_dir / "subblock_stats.json").is_file() # assertions for the scoring step 5 solution_0_filepath = ( puzzle_dir / "single_sequence_replacement_solutions--validation/solution_0.json" ) - assert os.path.exists(solution_0_filepath) + assert solution_0_filepath.exists() # assertions for the mip_and_realize_models step 6 solution_0_ckpt_config_path = ( @@ -128,10 +129,10 @@ def test_compress(project_root_path): / "mip/puzzle_solutions/target_memory_780000MiB/solutions--checkpoints/solution_0/config.json" ) - assert os.path.exists(solution_0_ckpt_config_path) - assert os.path.exists( + assert solution_0_ckpt_config_path.exists() + assert ( puzzle_dir / "mip/puzzle_solutions/target_memory_780000MiB/solutions.json" - ) + ).exists() runtime.wait_for_everyone() From 8415548bc9ee63fcbda02b8680a78db96630c44c Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Mon, 27 Oct 2025 17:41:51 +0100 Subject: [PATCH 13/81] Add a new dependency section to setyp.py for the modelopt.torch._compress module. Signed-off-by: Daniel Korzekwa --- setup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.py b/setup.py index 67bf114ae..cfadd5170 100644 --- a/setup.py +++ b/setup.py @@ -99,6 +99,8 @@ "setuptools>=80", "setuptools-scm>=8", ], + # Dependedencies for modelopt.torch._compress subpackage + "compress": ["fire"], } # create "compound" optional dependencies From b1b18333a3fe6abc14ca5ad92960fdcd27981161 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Mon, 27 Oct 2025 18:43:24 +0100 Subject: [PATCH 14/81] Move test_convert_llama3_config_to_decilm_config.py to tests/experimental/ folder to not be run by CICD yet. 
Signed-off-by: Daniel Korzekwa --- .../converters/test_convert_llama3_config_to_decilm_config.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/{gpu => experimental}/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py (100%) diff --git a/tests/gpu/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py b/tests/experimental/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py similarity index 100% rename from tests/gpu/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py rename to tests/experimental/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py From 6f28e4a75a0cdc66ef84c943f41b421f2d19fb5c Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Mon, 27 Oct 2025 11:20:42 -0700 Subject: [PATCH 15/81] Fix: Add missing LICENSE headers Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- modelopt/torch/_compress/compress.py | 15 +++++++++++++++ .../resources/tokenizer/truncate_tokenizer.py | 15 +++++++++++++++ .../experimental/torch/_compress/test_compress.py | 15 +++++++++++++++ 3 files changed, 45 insertions(+) diff --git a/modelopt/torch/_compress/compress.py b/modelopt/torch/_compress/compress.py index 97819a42b..a3617e37a 100644 --- a/modelopt/torch/_compress/compress.py +++ b/modelopt/torch/_compress/compress.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """ This module provides the main compression function for a model diff --git a/tests/experimental/torch/_compress/resources/tokenizer/truncate_tokenizer.py b/tests/experimental/torch/_compress/resources/tokenizer/truncate_tokenizer.py index 1c911ac76..aedcae4ab 100644 --- a/tests/experimental/torch/_compress/resources/tokenizer/truncate_tokenizer.py +++ b/tests/experimental/torch/_compress/resources/tokenizer/truncate_tokenizer.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """ This script was used to truncate the tokenizer.json file from Llama 3.1 8B model to keep only the top 100 most common tokens. 
diff --git a/tests/experimental/torch/_compress/test_compress.py b/tests/experimental/torch/_compress/test_compress.py index 2245bbd4e..452d2b6f6 100644 --- a/tests/experimental/torch/_compress/test_compress.py +++ b/tests/experimental/torch/_compress/test_compress.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import datetime import os import shutil From 016fb63c0a0283ba15a79dd5a1aa9db42f784e1e Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Tue, 28 Oct 2025 11:35:43 +0100 Subject: [PATCH 16/81] Use spawn_multiprocess_job for test_compress test (to be able to use tmp_path. Signed-off-by: Daniel Korzekwa --- modelopt/torch/_compress/compress.py | 2 +- modelopt/torch/_compress/runtime.py | 539 ++++++++++++++++++ .../torch/_compress/test_compress.py | 35 +- 3 files changed, 563 insertions(+), 13 deletions(-) create mode 100644 modelopt/torch/_compress/runtime.py diff --git a/modelopt/torch/_compress/compress.py b/modelopt/torch/_compress/compress.py index a3617e37a..265fd5eeb 100644 --- a/modelopt/torch/_compress/compress.py +++ b/modelopt/torch/_compress/compress.py @@ -43,7 +43,7 @@ def compress( puzzle_dir (str): directory with a puzzletron model to compress dataset_path (str): dataset used for scoring and distillation runtime: distributed runtime to use to run the compression steps, e.g., - NativeDDP_Runtime(dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10)) + NativeDdpRuntime(dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10)) Returns: Hydra config object after compressing the model. diff --git a/modelopt/torch/_compress/runtime.py b/modelopt/torch/_compress/runtime.py new file mode 100644 index 000000000..e46a48a18 --- /dev/null +++ b/modelopt/torch/_compress/runtime.py @@ -0,0 +1,539 @@ +import os +import random +from abc import ABC, abstractmethod +from collections.abc import Callable, Iterable, Iterator, Sequence +from contextlib import AbstractContextManager, suppress +from datetime import timedelta +from pathlib import Path +from typing import Literal, TypeVar, cast + +import numpy as np +import torch +import torch.distributed +import torch.nn as nn +from torch.utils.data import DataLoader +from tqdm import tqdm +from typing_extensions import override + +PrepareModelsT = TypeVar("PrepareModelsT", bound=Sequence[nn.Module]) +PrepareDataLoaderT = TypeVar("PrepareDataLoaderT", bound=DataLoader) +CompileT = TypeVar("CompileT", bound=nn.Module) +Filter = ( + Literal["main_process", "last", "local_main_process", "local_last", "all"] + | list[int] + | set[int] + | Callable[[int], bool] +) + + +class IRuntime(ABC): + @abstractmethod + def setup(self) -> None: ... + + @abstractmethod + def cleanup(self) -> None: ... + + @abstractmethod + def autocast(self) -> AbstractContextManager: ... + + @abstractmethod + def wait_for_everyone(self) -> None: ... 
+ + @abstractmethod + def set_seed(self, seed: int, device_specific: bool = False) -> int: ... + + @abstractmethod + def prepare_models(self, models: PrepareModelsT) -> PrepareModelsT: ... + + @abstractmethod + def prepare_train_dataloader( + self, train_dataloader: PrepareDataLoaderT + ) -> PrepareDataLoaderT: ... + + @abstractmethod + def prepare_val_dataloader(self, val_dataloader: PrepareDataLoaderT) -> PrepareDataLoaderT: ... + + @abstractmethod + def compile(self, model: CompileT) -> CompileT: ... + + @abstractmethod + def backward(self, loss: torch.Tensor) -> None: ... + + @abstractmethod + def clip_grad_norm_( + self, + parameters: Iterable[torch.Tensor] | torch.Tensor, + max_norm: float, + norm_type: float = 2, + ) -> torch.Tensor: ... + + @abstractmethod + def clip_grad_value_( + self, parameters: Iterable[torch.Tensor] | torch.Tensor, clip_value: float + ) -> None: ... + + @abstractmethod + def save_state(self, path: str | Path) -> None: ... + + @abstractmethod + def load_state(self, path: str | Path) -> None: ... + + @abstractmethod + def skip_first_batches(self, dataloader_iterator: Iterator, num_batches: int) -> None: ... + + @property + @abstractmethod + def sync_gradients(self) -> bool: ... + + @property + @abstractmethod + def device(self) -> torch.device: ... + + @property + @abstractmethod + def is_main_process(self) -> bool: ... + + @property + @abstractmethod + def is_local_main_process(self) -> bool: ... + + @property + @abstractmethod + def is_last_process(self) -> bool: ... + + @property + @abstractmethod + def is_local_last_process(self) -> bool: ... + + @property + @abstractmethod + def local_rank(self) -> int: ... + + @property + @abstractmethod + def global_rank(self) -> int: ... + + @property + @abstractmethod + def local_world_size(self) -> int: ... + + @property + @abstractmethod + def world_size(self) -> int: ... + + @property + @abstractmethod + def dtype(self) -> torch.dtype: ... 
+ + def __enter__(self): + self.setup() + return self + + def __exit__(self, exc_type, exc_value, traceback): + # avoid barrier if exceution errored + if exc_type is None: + self.cleanup() + + # if exc_type is not None: + # raise exc_value + # Handle exceptions if necessary + # pass + + # def __del__(self): + # torch.distributed.barrier() + # torch.distributed.destroy_process_group() + + def check_filter(self, filter_: Filter): + return ( + filter_ == "all" + or (filter_ == "main_process" and self.is_main_process) + or (filter_ == "local_main_process" and self.is_local_main_process) + or (filter_ == "last" and self.is_last_process) + or (filter_ == "local_last" and self.is_local_last_process) + or (isinstance(filter_, (list, set)) and self.global_rank in filter_) + or (callable(filter_) and filter_(self.global_rank)) + ) + + def print( + self, *args, filter_: Filter = "main_process", rank_prefix=False, flush=True, **kwargs + ) -> None: + if not self.check_filter(filter_): + return + + if rank_prefix: + print(f"[global_rank={self.global_rank}]", *args, flush=flush, **kwargs) + else: + print(*args, flush=flush, **kwargs) + + def process_print( + self, *args, filter_: Filter = "all", rank_prefix=True, flush=True, **kwargs + ) -> None: + if not self.check_filter(filter_): + return + + if rank_prefix: + prefix = f"[global_rank={self.global_rank}]" + if len(args) == 1: # avoid out-of-order printing if possible + out = f"{prefix} {args[0]}" + args = (out,) + else: + args = (prefix, *args) + print(*args, flush=flush, **kwargs) + else: + print(*args, flush=flush, **kwargs) + + +class NativeDdpRuntime(IRuntime): + def __init__( + self, + dtype: torch.dtype = torch.float, + torch_distributed_timeout: timedelta | None = None, + ): + self._master_addr = os.environ["MASTER_ADDR"] + self._master_port = int(os.environ["MASTER_PORT"]) + self._local_rank = int(os.environ["LOCAL_RANK"]) + self._global_rank = int(os.environ["RANK"]) + self._local_world_size = int(os.environ["LOCAL_WORLD_SIZE"]) + self._world_size = int(os.environ["WORLD_SIZE"]) + self._device = torch.device(self.local_rank) + self._dtype = dtype + self._torch_distributed_timeout = torch_distributed_timeout + + @override + def setup(self): + torch.cuda.set_device(self._device) + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group( + "cpu:gloo,cuda:nccl", timeout=self._torch_distributed_timeout + ) + input_tensors = [ + torch.tensor([0], dtype=torch.float32, device=self._device) + for _ in range(self.world_size) + ] + output_tensors = [ + torch.tensor([0], dtype=torch.float32, device=self._device) + for _ in range(self.world_size) + ] + torch.distributed.all_to_all(input_tensors, output_tensors) + + @override + def cleanup(self): + with suppress(Exception): + torch.distributed.barrier() + torch.distributed.destroy_process_group() + + @override + def autocast(self) -> AbstractContextManager: + result = torch.autocast(device_type="cuda", dtype=self._dtype, enabled=True) + return result + + @override + def wait_for_everyone(self): + torch.distributed.barrier() + + @override + def set_seed(self, seed: int, device_specific: bool = False) -> int: + """ + Helper function for reproducible behavior to set the seed in `random`, `numpy`, `torch`. + + Args: + seed (`int`): + The seed to set. + device_specific (`bool`, *optional*, defaults to `False`): + Whether to differ the seed on each device slightly with `self.process_index`. 
+ """ + if device_specific: + seed += self.global_rank + + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + return seed + + @override + def prepare_models(self, models: PrepareModelsT) -> PrepareModelsT: + assert all(isinstance(x, nn.Module) for x in models) + new_models = [nn.parallel.DistributedDataParallel(m) for m in models] + new_models = cast("PrepareModelsT", new_models) + return new_models # type: ignore[return-value] + + @override + def prepare_train_dataloader(self, train_dataloader: PrepareDataLoaderT) -> PrepareDataLoaderT: + return train_dataloader + + @override + def prepare_val_dataloader(self, val_dataloader: PrepareDataLoaderT) -> PrepareDataLoaderT: + return val_dataloader + + @override + def compile(self, model: CompileT) -> CompileT: + result = torch.compile(model) + result = cast("CompileT", result) + return result + + @override + def backward(self, loss: torch.Tensor) -> None: + loss.backward() + + @override + def clip_grad_norm_( + self, + parameters: Iterable[torch.Tensor] | torch.Tensor, + max_norm: float, + norm_type: float = 2, + ) -> torch.Tensor: + result = torch.nn.utils.clip_grad_norm_(parameters, max_norm, norm_type=norm_type) + return result + + @override + def clip_grad_value_( + self, parameters: Iterable[torch.Tensor] | torch.Tensor, clip_value: float + ) -> None: + torch.nn.utils.clip_grad_value_(parameters, clip_value) + + @override + def save_state(self, path: str | Path) -> None: + pass + + @override + def load_state(self, path: str | Path) -> None: + pass + + @override + def skip_first_batches(self, dataloader_iterator: Iterator, num_batches: int) -> None: + for _ in tqdm( + range(num_batches), desc=f"rank {self._global_rank}: skip_first_batches({num_batches=})" + ): + next(dataloader_iterator) + + @property + @override + def sync_gradients(self) -> bool: + return True + + @property + @override + def is_main_process(self) -> bool: + result = self.global_rank == 0 + return result + + @property + @override + def is_local_main_process(self) -> bool: + result = self.local_rank == 0 + return result + + @property + @override + def is_last_process(self) -> bool: + result = self.global_rank == self.world_size - 1 + return result + + @property + @override + def is_local_last_process(self) -> bool: + result = self.local_rank == self.local_world_size - 1 + return result + + @property + @override + def local_rank(self) -> int: + return self._local_rank + + @property + @override + def global_rank(self) -> int: + return self._global_rank + + @property + @override + def local_world_size(self) -> int: + return self._local_world_size + + @property + @override + def world_size(self) -> int: + return self._world_size + + @property + @override + def device(self) -> torch.device: + return self._device + + @property + @override + def dtype(self) -> torch.dtype: + return self._dtype + + @property + def master_addr(self) -> str: + return self._master_addr + + @property + def master_port(self) -> int: + return self._master_port + + +class BaseRuntime(IRuntime): + def __init__(self, dtype: torch.dtype = torch.float): + self._device = torch.device(self.local_rank) + self._dtype = dtype + + @override + def setup(self): + torch.cuda.set_device(self._device) + + @override + def cleanup(self): ... + + @override + def autocast(self) -> AbstractContextManager: + result = torch.autocast(device_type="cuda", dtype=self._dtype, enabled=True) + return result + + @override + def wait_for_everyone(self): ... 
+ + @override + def set_seed(self, seed: int, device_specific: bool = False) -> int: + """ + Helper function for reproducible behavior to set the seed in `random`, `numpy`, `torch`. + + Args: + seed (`int`): + The seed to set. + device_specific (`bool`, *optional*, defaults to `False`): + Whether to differ the seed on each device slightly with `self.process_index`. + """ + if device_specific: + seed += self.global_rank + + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + return seed + + @override + def prepare_models(self, models: PrepareModelsT) -> PrepareModelsT: + assert all(isinstance(x, nn.Module) for x in models) + return models + + @override + def prepare_train_dataloader(self, train_dataloader: PrepareDataLoaderT) -> PrepareDataLoaderT: + return train_dataloader + + @override + def prepare_val_dataloader(self, val_dataloader: PrepareDataLoaderT) -> PrepareDataLoaderT: + return val_dataloader + + @override + def compile(self, model: CompileT) -> CompileT: + result = torch.compile(model) + result = cast("CompileT", result) + return result + + @override + def backward(self, loss: torch.Tensor) -> None: + loss.backward() + + @override + def clip_grad_norm_( + self, + parameters: Iterable[torch.Tensor] | torch.Tensor, + max_norm: float, + norm_type: float = 2, + ) -> torch.Tensor: + result = torch.nn.utils.clip_grad_norm_(parameters, max_norm, norm_type=norm_type) + return result + + @override + def clip_grad_value_( + self, parameters: Iterable[torch.Tensor] | torch.Tensor, clip_value: float + ) -> None: + torch.nn.utils.clip_grad_value_(parameters, clip_value) + + @override + def save_state(self, path: str | Path) -> None: + pass + + @override + def load_state(self, path: str | Path) -> None: + pass + + @override + def skip_first_batches(self, dataloader_iterator: Iterator, num_batches: int) -> None: + for _ in tqdm( + range(num_batches), desc=f"rank {self.global_rank}: skip_first_batches({num_batches=})" + ): + next(dataloader_iterator) + + @property + @override + def sync_gradients(self) -> bool: + return True + + @property + @override + def is_main_process(self) -> bool: + result = self.global_rank == 0 + return result + + @property + @override + def is_local_main_process(self) -> bool: + result = self.local_rank == 0 + return result + + @property + @override + def is_last_process(self) -> bool: + result = self.global_rank == self.world_size - 1 + return result + + @property + @override + def is_local_last_process(self) -> bool: + result = self.local_rank == self.local_world_size - 1 + return result + + @property + @override + def local_rank(self) -> int: + return 0 + + @property + @override + def global_rank(self) -> int: + return 0 + + @property + @override + def local_world_size(self) -> int: + return 1 + + @property + @override + def world_size(self) -> int: + return 1 + + @property + @override + def device(self) -> torch.device: + return self._device + + @property + @override + def dtype(self) -> torch.dtype: + return self._dtype + + @property + def master_addr(self) -> str | None: + return None + + @property + def master_port(self) -> int | None: + return None diff --git a/tests/experimental/torch/_compress/test_compress.py b/tests/experimental/torch/_compress/test_compress.py index 452d2b6f6..7e4078d41 100644 --- a/tests/experimental/torch/_compress/test_compress.py +++ b/tests/experimental/torch/_compress/test_compress.py @@ -16,24 +16,21 @@ import datetime import os import shutil +from functools import partial from 
pathlib import Path import pytest import torch +from _test_utils.torch_dist.dist_utils import spawn_multiprocess_job from puzzle_tools.hydra_utils import register_hydra_resolvers -from puzzle_tools.runtime import NativeDDP_Runtime from scripts.convert_llama3_to_decilm import convert_llama3_to_decilm from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM, PreTrainedTokenizerBase from modelopt.torch._compress import compress +from modelopt.torch._compress.runtime import NativeDdpRuntime from tests.integration.puzzle_tools.e2e_puzzletron_test.dummy_dataset import save_dummy_dataset -@pytest.fixture(scope="module", autouse=True) -def setup_test_module(): - register_hydra_resolvers() - - @pytest.fixture def project_root_path(request: pytest.FixtureRequest) -> Path: return Path(request.config.rootpath) @@ -59,18 +56,32 @@ def project_root_path(request: pytest.FixtureRequest) -> Path: # # export PYTHONPATH=$PYTHONPATH:/workspace/puzzletron/v1 # -# ../puzzletron/v1/scripts/torch_dist_runner.sh \ -# pytest -s -v ./tests/gpu/torch/puzzletron/test_compress_model.py -o addopts="" +# pytest -s -v ./tests/experimental/torch/_compress/test_compress.py::test_compress -o addopts="" -def test_compress(project_root_path): - # The input to puzzletron.compress(). +def test_compress(project_root_path: Path, tmp_path: Path): + spawn_multiprocess_job( + size=torch.cuda.device_count(), + job=partial(_test_compress_multiprocess_job, project_root_path, tmp_path), + backend="nccl", + ) + + +def _test_compress_multiprocess_job(project_root_path: Path, tmp_path: Path, rank: int, size: int): + register_hydra_resolvers() + + # Set environment variables expected by NativeDDP_Runtime + os.environ["RANK"] = str(rank) + os.environ["LOCAL_RANK"] = str(rank) + os.environ["WORLD_SIZE"] = str(size) + os.environ["LOCAL_WORLD_SIZE"] = str(size) os.environ["WANDB_DISABLED"] = "true" - puzzle_dir = Path("/tmp/pytest-shared/test_compress_model") + + puzzle_dir = tmp_path dataset_path = puzzle_dir / "dummy_dataset" hydra_config_dir = project_root_path / "tests/experimental/torch/_compress/resources/configs" - _runtime = NativeDDP_Runtime( + _runtime = NativeDdpRuntime( dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10) ) From 0ccf1c43d1bdc454ae911ff613108b3413cdc8d2 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Tue, 28 Oct 2025 14:57:03 +0100 Subject: [PATCH 17/81] Add comments. Signed-off-by: Daniel Korzekwa --- modelopt/torch/_compress/runtime.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/modelopt/torch/_compress/runtime.py b/modelopt/torch/_compress/runtime.py index e46a48a18..46f561a5d 100644 --- a/modelopt/torch/_compress/runtime.py +++ b/modelopt/torch/_compress/runtime.py @@ -1,3 +1,20 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Classes for torch distributed runtime management""" + import os import random from abc import ABC, abstractmethod From 58439ca0273ba38a3b1ff9e010c866ef1794903d Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Tue, 28 Oct 2025 16:35:13 +0100 Subject: [PATCH 18/81] Add _save_dummy_dataset to the test_compress.py Signed-off-by: Daniel Korzekwa --- .../torch/_compress/test_compress.py | 50 ++++++++++++++++--- 1 file changed, 44 insertions(+), 6 deletions(-) diff --git a/tests/experimental/torch/_compress/test_compress.py b/tests/experimental/torch/_compress/test_compress.py index 7e4078d41..e1d2e84a6 100644 --- a/tests/experimental/torch/_compress/test_compress.py +++ b/tests/experimental/torch/_compress/test_compress.py @@ -22,13 +22,13 @@ import pytest import torch from _test_utils.torch_dist.dist_utils import spawn_multiprocess_job +from datasets import Dataset, DatasetDict from puzzle_tools.hydra_utils import register_hydra_resolvers from scripts.convert_llama3_to_decilm import convert_llama3_to_decilm from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM, PreTrainedTokenizerBase from modelopt.torch._compress import compress from modelopt.torch._compress.runtime import NativeDdpRuntime -from tests.integration.puzzle_tools.e2e_puzzletron_test.dummy_dataset import save_dummy_dataset @pytest.fixture @@ -91,8 +91,8 @@ def _test_compress_multiprocess_job(project_root_path: Path, tmp_path: Path, ran # if runtime.global_rank == 0: # Setup puzzle_dir and dataset - setup_puzzle_dir(puzzle_dir) - save_dummy_dataset(dataset_path) + _setup_puzzle_dir(puzzle_dir) + _save_dummy_dataset(dataset_path) # # Step 1: Create and save a teacher model to compress @@ -107,7 +107,7 @@ def _test_compress_multiprocess_job(project_root_path: Path, tmp_path: Path, ran # Create a small Llama model (not DeciLM) to match the normal conversion pipeline hf_ckpt_teacher_dir = "ckpts/teacher" llama_checkpoint_path = puzzle_dir / hf_ckpt_teacher_dir - create_and_save_small_llama_model( + _create_and_save_small_llama_model( llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer ) @@ -165,7 +165,7 @@ def _test_compress_multiprocess_job(project_root_path: Path, tmp_path: Path, ran print("PYTEST SUMMARY: test_compress_model() test has finished successfully") -def create_and_save_small_llama_model( +def _create_and_save_small_llama_model( output_path: str, vocab_size: int, tokenizer: PreTrainedTokenizerBase ): """ @@ -203,7 +203,45 @@ def create_and_save_small_llama_model( llama_config.save_pretrained(output_path) -def setup_puzzle_dir(puzzle_dir: str): +def _setup_puzzle_dir(puzzle_dir: str): if Path(puzzle_dir).exists(): shutil.rmtree(puzzle_dir) Path(puzzle_dir).mkdir(parents=True, exist_ok=True) + + +def _save_dummy_dataset(dataset_path: str): + # dummy sample + sample = [ + {"role": "user", "content": "please cite Lorem Ipsum?"}, + { + "role": "assistant", + "content": ( + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed in blandit ante. " + "Sed tempus erat urna, ac elementum nisl facilisis quis. Aliquam consectetur mollis massa, " + "in elementum sem venenatis posuere. Fusce lorem arcu, egestas vel massa sollicitudin, " + "dictum mollis purus. Proin in ullamcorper elit. Nam tellus nisi, volutpat a mattis vel, " + "pretium in purus. Nunc at lectus facilisis risus scelerisque rhoncus eu nec ex. " + "Maecenas semper, tellus non placerat vulputate, urna felis facilisis diam, " + "sit amet vestibulum erat sapien nec libero. Praesent non massa velit. 
Donec faucibus mi eros. " + "Nam turpis nulla, congue sit amet mi at, porttitor scelerisque elit. Nunc id sodales lorem, " + "nec tincidunt leo. Quisque a neque nec ligula porttitor auctor. " + "Nunc accumsan nunc ac tellus congue vehicula. Praesent tellus eros, luctus non gravida dapibus, " + "faucibus eu ex. Quisque bibendum leo pharetra, tristique est vitae, hendrerit nunc. " + "Duis nec congue dolor. Donec commodo ipsum non efficitur volutpat. " + "Nulla risus nulla, efficitur et urna at, imperdiet sodales lorem. " + "Suspendisse erat est, sollicitudin at nisl tincidunt, vehicula hendrerit lectus. " + "Nam quis nisi ullamcorper, rhoncus massa vel, tempus purus. " + "Duis pulvinar eros vel nulla pellentesque, at dapibus justo laoreet. " + "Praesent tortor orci, vulputate fermentum dapibus nec, feugiat vitae tortor. " + "Donec mollis convallis massa quis iaculis." + ), + }, + ] + + # Prepare train and val splits with sample repeated, 2500 samples are for + # 128 samples with block-size 8192 and LLama3 tokenizer + data = [{"conversation": sample}] * 2500 + + # For train-val splits + data_dict = DatasetDict({"train": Dataset.from_list(data), "valid": Dataset.from_list(data)}) + data_dict.save_to_disk(dataset_path) From 2e5f776bf7d49410ef207180f5b78cf16647f815 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Tue, 28 Oct 2025 16:42:52 +0100 Subject: [PATCH 19/81] Refactoring: Move torch distributed env variables to dist_utils.py Signed-off-by: Daniel Korzekwa --- tests/_test_utils/torch_dist/dist_utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/_test_utils/torch_dist/dist_utils.py b/tests/_test_utils/torch_dist/dist_utils.py index c7407b018..3b85728aa 100644 --- a/tests/_test_utils/torch_dist/dist_utils.py +++ b/tests/_test_utils/torch_dist/dist_utils.py @@ -34,6 +34,10 @@ def init_process(rank, size, job=None, backend="gloo", port=None): """Initialize the distributed environment.""" os.environ["MASTER_ADDR"] = "localhost" + os.environ["RANK"] = str(rank) + os.environ["LOCAL_RANK"] = str(rank) + os.environ["WORLD_SIZE"] = str(size) + os.environ["LOCAL_WORLD_SIZE"] = str(size) port = str(get_free_port()) if port is None else str(port) From 6274db5164a3e8a7c5299c329d9d004b6124f04d Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Tue, 28 Oct 2025 16:44:23 +0100 Subject: [PATCH 20/81] Refactoring: move torch distributed variables to dist_utils Signed-off-by: Daniel Korzekwa --- tests/experimental/torch/_compress/test_compress.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/experimental/torch/_compress/test_compress.py b/tests/experimental/torch/_compress/test_compress.py index e1d2e84a6..e72d8ad34 100644 --- a/tests/experimental/torch/_compress/test_compress.py +++ b/tests/experimental/torch/_compress/test_compress.py @@ -71,10 +71,6 @@ def _test_compress_multiprocess_job(project_root_path: Path, tmp_path: Path, ran register_hydra_resolvers() # Set environment variables expected by NativeDDP_Runtime - os.environ["RANK"] = str(rank) - os.environ["LOCAL_RANK"] = str(rank) - os.environ["WORLD_SIZE"] = str(size) - os.environ["LOCAL_WORLD_SIZE"] = str(size) os.environ["WANDB_DISABLED"] = "true" puzzle_dir = tmp_path @@ -89,7 +85,7 @@ def _test_compress_multiprocess_job(project_root_path: Path, tmp_path: Path, ran # # Test setup # - if runtime.global_rank == 0: + if rank == 0: # Setup puzzle_dir and dataset _setup_puzzle_dir(puzzle_dir) _save_dummy_dataset(dataset_path) @@ -126,7 +122,7 @@ def 
_test_compress_multiprocess_job(project_root_path: Path, tmp_path: Path, ran # # Check assertions # - if runtime.global_rank == 0: + if rank == 0: # assertions for the score_pruning_activations step 1 rank = int(os.environ["RANK"]) rank_filepath = ( From d942e0a4907f9cddf9ddc89a038196126fdbae04 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Tue, 28 Oct 2025 16:54:41 +0100 Subject: [PATCH 21/81] Move os.environ["WANDB_DISABLED"] = "true" to dist_utils.py Signed-off-by: Daniel Korzekwa --- tests/_test_utils/torch_dist/dist_utils.py | 1 + tests/experimental/torch/_compress/test_compress.py | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/_test_utils/torch_dist/dist_utils.py b/tests/_test_utils/torch_dist/dist_utils.py index 3b85728aa..f7160cf28 100644 --- a/tests/_test_utils/torch_dist/dist_utils.py +++ b/tests/_test_utils/torch_dist/dist_utils.py @@ -38,6 +38,7 @@ def init_process(rank, size, job=None, backend="gloo", port=None): os.environ["LOCAL_RANK"] = str(rank) os.environ["WORLD_SIZE"] = str(size) os.environ["LOCAL_WORLD_SIZE"] = str(size) + os.environ["WANDB_DISABLED"] = "true" port = str(get_free_port()) if port is None else str(port) diff --git a/tests/experimental/torch/_compress/test_compress.py b/tests/experimental/torch/_compress/test_compress.py index e72d8ad34..096de4de3 100644 --- a/tests/experimental/torch/_compress/test_compress.py +++ b/tests/experimental/torch/_compress/test_compress.py @@ -70,9 +70,6 @@ def test_compress(project_root_path: Path, tmp_path: Path): def _test_compress_multiprocess_job(project_root_path: Path, tmp_path: Path, rank: int, size: int): register_hydra_resolvers() - # Set environment variables expected by NativeDDP_Runtime - os.environ["WANDB_DISABLED"] = "true" - puzzle_dir = tmp_path dataset_path = puzzle_dir / "dummy_dataset" hydra_config_dir = project_root_path / "tests/experimental/torch/_compress/resources/configs" From f765921d8427f5854dc51af918d5f41d2eac7e5a Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Tue, 28 Oct 2025 18:40:48 +0100 Subject: [PATCH 22/81] Implement integration test for mnt.convert() for the _compress algorithm. 
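
The new test mirrors the setup of test_compress.py: it builds a tiny Llama teacher checkpoint from the truncated test tokenizer and places it under ckpts/teacher inside the temporary puzzle_dir, which is the input expected by the conversion step. Roughly, using the helper reused from test_compress.py:

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    llama_checkpoint_path = puzzle_dir / "ckpts/teacher"
    create_and_save_small_llama_model(
        llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer
    )

The mtn.convert() call itself is wired up in the next commit.
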
Signed-off-by: Daniel Korzekwa --- .../nas/plugins/_compress/test_nas_convert.py | 28 +++++++++++++++++++ tests/gpu/torch/_compress/test_compress.py | 2 ++ 2 files changed, 30 insertions(+) create mode 100644 tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py diff --git a/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py b/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py new file mode 100644 index 000000000..23f8f3cfe --- /dev/null +++ b/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py @@ -0,0 +1,28 @@ +from pathlib import Path + +import pytest +from gpu.torch._compress.test_compress import create_and_save_small_llama_model +from transformers import AutoTokenizer + + +@pytest.fixture +def project_root_path(request: pytest.FixtureRequest) -> Path: + return Path(request.config.rootpath) + + +# +# See tests/gpu/torch/_compress/test_compress.py for instructions on how to run this test +# TODO: Remove those instructions once this test runs automatically on CI +# +def test_nas_convert(project_root_path: Path, tmp_path: Path): + puzzle_dir = tmp_path + + # Create a small Llama model (input to the mnt.convert() - the first model conversion step) + tokenizer_path = project_root_path / "tests/gpu/torch/_compress/resources/tokenizer" + tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) + hf_ckpt_teacher_dir = "ckpts/teacher" + llama_checkpoint_path = puzzle_dir / hf_ckpt_teacher_dir + # TODO: the same as in tests/gpu/torch/_compress/test_compress.py (refactor it) + create_and_save_small_llama_model( + llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer + ) diff --git a/tests/gpu/torch/_compress/test_compress.py b/tests/gpu/torch/_compress/test_compress.py index ddcea6aaf..217acd533 100644 --- a/tests/gpu/torch/_compress/test_compress.py +++ b/tests/gpu/torch/_compress/test_compress.py @@ -48,6 +48,8 @@ def project_root_path(request: pytest.FixtureRequest) -> Path: # # ../puzzletron/v1/scripts/torch_dist_runner.sh \ # pytest -s -v ./tests/gpu/torch/puzzletron/test_compress_model.py -o addopts="" +# +# TODO: Remove those instructions once this test runs automatically on CI def test_compress(project_root_path): From de876d6b409462ad55f5bcc81b2e0f25cc8ece34 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Tue, 28 Oct 2025 19:15:15 +0100 Subject: [PATCH 23/81] Implement mtn.convert() for compress() algorithm. 
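
The plugin registers a "compress" mode with NASModeRegistry. Converting a model in this mode loads the hydra config that defines the search space (config name currently hard-coded to Llama-3_1-8B) and converts the Llama 3 teacher checkpoint under puzzle_dir/ckpts/teacher to the DeciLM format. A minimal usage sketch, with placeholder paths, matching the call added to test_nas_convert.py:

    import modelopt.torch.nas as mtn

    from modelopt.torch.nas.plugins._compress.compress_nas_plugin import CompressModel

    model = CompressModel()
    mtn.convert(
        model,
        mode=[
            (
                "compress",
                {
                    "hydra_config_dir": "/path/to/configs",  # placeholder
                    "puzzle_dir": "/path/to/puzzle_dir",  # placeholder
                    "dataset_path": "",  # not used by this step yet
                },
            )
        ],
    )
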
Signed-off-by: Daniel Korzekwa --- .../plugins/_compress/compress_nas_plugin.py | 103 ++++++++++++++++++ .../nas/plugins/_compress/test_nas_convert.py | 37 ++++++- tests/gpu/torch/_compress/test_compress.py | 12 +- 3 files changed, 148 insertions(+), 4 deletions(-) create mode 100644 modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py diff --git a/modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py b/modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py new file mode 100644 index 000000000..563d8055f --- /dev/null +++ b/modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py @@ -0,0 +1,103 @@ +from torch import nn + +from modelopt.torch.nas.conversion import NASModeRegistry +from modelopt.torch.opt.config import ModeloptBaseConfig, ModeloptField +from modelopt.torch.opt.mode import ( + ConvertEntrypoint, + ConvertReturnType, + MetadataDict, + ModeDescriptor, + RestoreEntrypoint, +) +from modelopt.torch.opt.searcher import BaseSearcher + + +class CompressModel(nn.Module): + pass + + +class CompressConfig(ModeloptBaseConfig): + """Configuration for Compress NAS algorithm.""" + + hydra_config_dir: str = ModeloptField( + default="", + title="", + description="", + ) + + puzzle_dir: str = ModeloptField( + default="", + title="", + description="", + ) + + dataset_path: str = ModeloptField( + default="", + title="", + description="", + ) + + +# TOD: Why is it called SuperNetMLP? +class SuperNetMLP(CompressModel): + """Marker subclass indicating converted/search-space state for CompressConfig. + TODO: Provide better description + """ + + hydra_config_dir: str + puzzle_dir: str + dataset_path: str + + +def convert_compress_model(model: nn.Module, config: CompressConfig) -> ConvertReturnType: + """Convert the model to a search space model.""" + print("=" * 80) + print(f"[convert] before convert:\n{model}") + model.__class__ = SuperNetMLP + model.hydra_config_dir = config.hydra_config_dir + model.puzzle_dir = config.puzzle_dir + model.dataset_path = config.dataset_path + print(f"[convert] after convert:\n{model}") + return model, {} + + +def restore_compress_model( + model: nn.Module, config: CompressConfig, metadata: MetadataDict +) -> nn.Module: + """Reuse convert to produce the same behavior on restore.""" + return convert_compress_model(model, config)[0] + + +@NASModeRegistry.register_mode +class CompressDescriptor(ModeDescriptor): + """Descriptor for the Compress mode.""" + + @property + def name(self) -> str: + """String identifier for this mode.""" + return "compress" + + @property + def config_class(self) -> type[ModeloptBaseConfig]: + """Configuration class for this mode.""" + return CompressConfig + + @property + def search_algorithm(self) -> type[BaseSearcher]: + """Return the associated searcher implementation.""" + raise NotImplementedError("Compress mode does not have a search algorithm.") + + @property + def convert(self) -> ConvertEntrypoint: + """Entrypoint to convert a model.""" + return convert_compress_model + + @property + def restore(self) -> RestoreEntrypoint: + """Entrypoint to restore a model.""" + return restore_compress_model + + @property + def export_mode(self) -> str | None: + """The mode that corresponds to the export mode of this mode.""" + return "export" diff --git a/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py b/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py index 23f8f3cfe..3bd49da69 100644 --- a/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py +++ 
b/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py @@ -4,6 +4,9 @@ from gpu.torch._compress.test_compress import create_and_save_small_llama_model from transformers import AutoTokenizer +import modelopt.torch.nas as mtn +from modelopt.torch.nas.plugins._compress.compress_nas_plugin import CompressModel + @pytest.fixture def project_root_path(request: pytest.FixtureRequest) -> Path: @@ -15,9 +18,19 @@ def project_root_path(request: pytest.FixtureRequest) -> Path: # TODO: Remove those instructions once this test runs automatically on CI # def test_nas_convert(project_root_path: Path, tmp_path: Path): + # + # Step 1: Setup the puzzle_dir, dataset, hydra_config_dir, and input model + # needed for the mnt.convert() step + # puzzle_dir = tmp_path + hydra_config_dir = project_root_path / "tests/gpu/torch/puzzletron/resources/configs" + # dataset_path = puzzle_dir / "dummy_dataset" + + # Setup puzzle_dir and dataset + # setup_puzzle_dir(puzzle_dir) + # save_dummy_dataset(dataset_path) - # Create a small Llama model (input to the mnt.convert() - the first model conversion step) + # Create a small Llama model (input to the mnt.convert() step) tokenizer_path = project_root_path / "tests/gpu/torch/_compress/resources/tokenizer" tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) hf_ckpt_teacher_dir = "ckpts/teacher" @@ -26,3 +39,25 @@ def test_nas_convert(project_root_path: Path, tmp_path: Path): create_and_save_small_llama_model( llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer ) + + # + # Run the mnt.convert() step + # + input_model = CompressModel() + mtn.convert( + input_model, + mode=[ + ( + "compress", + { + "hydra_config_dir": str(hydra_config_dir), + "puzzle_dir": str(puzzle_dir), + "dataset_path": "", # dataset_path, + }, + ) + ], + ) + + # + # Check assertions + # diff --git a/tests/gpu/torch/_compress/test_compress.py b/tests/gpu/torch/_compress/test_compress.py index 217acd533..924e458d9 100644 --- a/tests/gpu/torch/_compress/test_compress.py +++ b/tests/gpu/torch/_compress/test_compress.py @@ -88,7 +88,9 @@ def test_compress(project_root_path): hf_ckpt_teacher_dir = "ckpts/teacher" llama_checkpoint_path = osp.join(puzzle_dir, hf_ckpt_teacher_dir) create_and_save_small_llama_model( - llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer + llama_checkpoint_path, + vocab_size=tokenizer.vocab_size, + tokenizer=tokenizer, ) # Use the full conversion pipeline (matches normal usage) @@ -125,7 +127,8 @@ def test_compress(project_root_path): # assertions for the scoring step 5 solution_0_filepath = osp.join( - puzzle_dir, "single_sequence_replacement_solutions--validation/solution_0.json" + puzzle_dir, + "single_sequence_replacement_solutions--validation/solution_0.json", ) assert os.path.exists(solution_0_filepath) @@ -136,7 +139,10 @@ def test_compress(project_root_path): ) assert os.path.exists(solution_0_ckpt_config_path) assert os.path.exists( - osp.join(puzzle_dir, "mip/puzzle_solutions/target_memory_780000MiB/solutions.json") + osp.join( + puzzle_dir, + "mip/puzzle_solutions/target_memory_780000MiB/solutions.json", + ) ) runtime.wait_for_everyone() From f7fe23cd4cca117d3758707b1829ed6b45faf5a1 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Tue, 28 Oct 2025 20:23:07 +0100 Subject: [PATCH 24/81] Fix broken test - incorrect package names. 
Signed-off-by: Daniel Korzekwa --- .../converters/convert_llama3_to_decilm.py | 18 ++++++++++++++- ..._convert_llama3_config_to_decilm_config.py | 23 ++++++++++++++++--- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/modelopt/torch/_compress/decilm/converters/convert_llama3_to_decilm.py b/modelopt/torch/_compress/decilm/converters/convert_llama3_to_decilm.py index 6cdd1f02c..4b65eeada 100644 --- a/modelopt/torch/_compress/decilm/converters/convert_llama3_to_decilm.py +++ b/modelopt/torch/_compress/decilm/converters/convert_llama3_to_decilm.py @@ -1,4 +1,20 @@ -"""Convert a Llama3 model to a DeciLM model.""" +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Convert a Llama3 model to a DeciLM model.""" #!/usr/bin/env python3 from pathlib import Path diff --git a/tests/experimental/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py b/tests/experimental/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py index 4bab4d505..03c3c4cd6 100644 --- a/tests/experimental/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py +++ b/tests/experimental/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py @@ -1,9 +1,24 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import json import os.path as osp from pathlib import Path import pytest -from gpu.torch._compress.test_compress import create_and_save_small_llama_model +from experimental.torch._compress.test_compress import _create_and_save_small_llama_model from transformers import AutoTokenizer from modelopt.torch._compress.decilm.converters.convert_llama3_to_decilm import ( @@ -17,11 +32,13 @@ def project_root_path(request: pytest.FixtureRequest) -> Path: def test_convert_llama3_config_to_decilm_config(project_root_path: Path, tmp_path: Path): - tokenizer_path = osp.join(project_root_path, "tests/gpu/torch/_compress/resources/tokenizer") + tokenizer_path = osp.join( + project_root_path, "tests/experimental/torch/_compress/resources/tokenizer" + ) tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) llama_checkpoint_path = tmp_path / "llama_checkpoint" - create_and_save_small_llama_model( + _create_and_save_small_llama_model( llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer ) From a2104830c58e4dae5c8c5817306453437a67bd4c Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Tue, 28 Oct 2025 20:48:03 +0100 Subject: [PATCH 25/81] Implementing nas.convert for compress algorithm. Signed-off-by: Daniel Korzekwa --- modelopt/torch/_compress/compress.py | 6 +-- .../plugins/_compress/compress_nas_plugin.py | 39 ++++++++++++++++++ .../nas/plugins/_compress/test_nas_convert.py | 40 ++++++++++++++----- 3 files changed, 73 insertions(+), 12 deletions(-) diff --git a/modelopt/torch/_compress/compress.py b/modelopt/torch/_compress/compress.py index 265fd5eeb..94b15ec88 100644 --- a/modelopt/torch/_compress/compress.py +++ b/modelopt/torch/_compress/compress.py @@ -35,12 +35,12 @@ def compress( hydra_config_dir: str, hydra_config: str, puzzle_dir: str, dataset_path: str, runtime: IRuntime ) -> DictConfig: - """Compress a puzzletron model using the MIP-based NAS search algorithm. + """Compress a compress model using the MIP-based NAS search algorithm. Args: hydra_config_dir (str): path to a hydra_config_dir that defines the search space hydra_config (str): the corresponding hydra config file - puzzle_dir (str): directory with a puzzletron model to compress + puzzle_dir (str): directory with a compress model to compress dataset_path (str): dataset used for scoring and distillation runtime: distributed runtime to use to run the compression steps, e.g., NativeDdpRuntime(dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10)) @@ -50,7 +50,7 @@ def compress( The same hydra configuration object is used across all compression steps. @TODO: Investigate if this config object is immutable across steps and clarify """ - # Step 0: Load puzzletron hydra config + # Step 0: Load hydra config hydra_cfg = initialize_hydra_config_for_dir( config_dir=hydra_config_dir, config_name=hydra_config, diff --git a/modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py b/modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py index 563d8055f..783e5317f 100644 --- a/modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py +++ b/modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py @@ -1,3 +1,21 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pathlib import Path + +from scripts.convert_llama3_to_decilm import convert_llama3_to_decilm from torch import nn from modelopt.torch.nas.conversion import NASModeRegistry @@ -11,6 +29,9 @@ ) from modelopt.torch.opt.searcher import BaseSearcher +# TODO Move initialize_hydra_config_for_dir from tests to main +from tests.utils.test_utils import initialize_hydra_config_for_dir + class CompressModel(nn.Module): pass @@ -57,6 +78,24 @@ def convert_compress_model(model: nn.Module, config: CompressConfig) -> ConvertR model.hydra_config_dir = config.hydra_config_dir model.puzzle_dir = config.puzzle_dir model.dataset_path = config.dataset_path + + # Load hydra config + initialize_hydra_config_for_dir( + config_dir=config.hydra_config_dir, + config_name="Llama-3_1-8B", # TODO: Make it configurable + overrides=[ + f"puzzle_dir={config.puzzle_dir}", + f"dataset_path={config.dataset_path}", + ], + ) + + # Convert Llama3 model to DeciLM model + hf_ckpt_teacher_dir = "ckpts/teacher" + convert_llama3_to_decilm( + input_dir=Path(config.puzzle_dir) / hf_ckpt_teacher_dir, # TODO this should be configurable + output_dir=Path(config.puzzle_dir) / hf_ckpt_teacher_dir, + ) + print(f"[convert] after convert:\n{model}") return model, {} diff --git a/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py b/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py index 3bd49da69..b2f5d3780 100644 --- a/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py +++ b/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py @@ -1,7 +1,27 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
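# The plugin above pulls initialize_hydra_config_for_dir from tests.utils.test_utils
# (with a TODO to move it into the main package). As an assumption about what such a
# helper does -- not the repository's actual implementation -- it can be sketched with
# plain Hydra APIs (assuming Hydra >= 1.2):
from hydra import compose, initialize_config_dir
from omegaconf import DictConfig


def _sketch_initialize_hydra_config_for_dir(
    config_dir: str, config_name: str, overrides: list[str]
) -> DictConfig:
    # Point Hydra at an absolute config directory, then compose the named config with
    # overrides such as "puzzle_dir=..." and "dataset_path=...".
    with initialize_config_dir(config_dir=config_dir, version_base=None):
        return compose(config_name=config_name, overrides=overrides)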
+ from pathlib import Path import pytest -from gpu.torch._compress.test_compress import create_and_save_small_llama_model +from experimental.torch._compress.test_compress import ( + _create_and_save_small_llama_model, + _save_dummy_dataset, + _setup_puzzle_dir, +) +from puzzle_tools.hydra_utils import register_hydra_resolvers from transformers import AutoTokenizer import modelopt.torch.nas as mtn @@ -18,25 +38,27 @@ def project_root_path(request: pytest.FixtureRequest) -> Path: # TODO: Remove those instructions once this test runs automatically on CI # def test_nas_convert(project_root_path: Path, tmp_path: Path): + # Register Hydra custom resolvers (needed for config resolution) + register_hydra_resolvers() + # # Step 1: Setup the puzzle_dir, dataset, hydra_config_dir, and input model # needed for the mnt.convert() step # puzzle_dir = tmp_path - hydra_config_dir = project_root_path / "tests/gpu/torch/puzzletron/resources/configs" - # dataset_path = puzzle_dir / "dummy_dataset" + dataset_path = puzzle_dir / "dummy_dataset" + hydra_config_dir = project_root_path / "tests/experimental/torch/_compress/resources/configs" # Setup puzzle_dir and dataset - # setup_puzzle_dir(puzzle_dir) - # save_dummy_dataset(dataset_path) + _setup_puzzle_dir(puzzle_dir) + _save_dummy_dataset(dataset_path) # Create a small Llama model (input to the mnt.convert() step) - tokenizer_path = project_root_path / "tests/gpu/torch/_compress/resources/tokenizer" + tokenizer_path = project_root_path / "tests/experimental/torch/_compress/resources/tokenizer" tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) hf_ckpt_teacher_dir = "ckpts/teacher" llama_checkpoint_path = puzzle_dir / hf_ckpt_teacher_dir - # TODO: the same as in tests/gpu/torch/_compress/test_compress.py (refactor it) - create_and_save_small_llama_model( + _create_and_save_small_llama_model( llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer ) @@ -52,7 +74,7 @@ def test_nas_convert(project_root_path: Path, tmp_path: Path): { "hydra_config_dir": str(hydra_config_dir), "puzzle_dir": str(puzzle_dir), - "dataset_path": "", # dataset_path, + "dataset_path": str(dataset_path), }, ) ], From 739f868960375b0e60aa56555b2c4959c056725f Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Tue, 28 Oct 2025 20:50:53 +0100 Subject: [PATCH 26/81] Improve docs Signed-off-by: Daniel Korzekwa --- modelopt/torch/_compress/compress.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/modelopt/torch/_compress/compress.py b/modelopt/torch/_compress/compress.py index 265fd5eeb..0722cab73 100644 --- a/modelopt/torch/_compress/compress.py +++ b/modelopt/torch/_compress/compress.py @@ -15,8 +15,8 @@ """ -This module provides the main compression function for a model -using MIP-based NAS search algorithm. +This module provides the main compression function using the puzzle algorithm. +MIP-based NAS search algorithm. """ @@ -35,12 +35,12 @@ def compress( hydra_config_dir: str, hydra_config: str, puzzle_dir: str, dataset_path: str, runtime: IRuntime ) -> DictConfig: - """Compress a puzzletron model using the MIP-based NAS search algorithm. + """Compress a model using the MIP-based NAS search algorithm. 
Args: hydra_config_dir (str): path to a hydra_config_dir that defines the search space hydra_config (str): the corresponding hydra config file - puzzle_dir (str): directory with a puzzletron model to compress + puzzle_dir (str): directory with a model to compress dataset_path (str): dataset used for scoring and distillation runtime: distributed runtime to use to run the compression steps, e.g., NativeDdpRuntime(dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10)) @@ -50,7 +50,7 @@ def compress( The same hydra configuration object is used across all compression steps. @TODO: Investigate if this config object is immutable across steps and clarify """ - # Step 0: Load puzzletron hydra config + # Step 0: Load a hydra config hydra_cfg = initialize_hydra_config_for_dir( config_dir=hydra_config_dir, config_name=hydra_config, From 20a3c5e7704ecdb72d29b1c39fa98d41b15193c7 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Tue, 28 Oct 2025 21:14:36 +0100 Subject: [PATCH 27/81] Code cleanup. Signed-off-by: Daniel Korzekwa --- .../plugins/_compress/compress_nas_plugin.py | 26 +++++++------------ 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py b/modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py index 783e5317f..48fb51565 100644 --- a/modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py +++ b/modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py @@ -59,25 +59,14 @@ class CompressConfig(ModeloptBaseConfig): ) -# TOD: Why is it called SuperNetMLP? -class SuperNetMLP(CompressModel): - """Marker subclass indicating converted/search-space state for CompressConfig. - TODO: Provide better description - """ - - hydra_config_dir: str - puzzle_dir: str - dataset_path: str - - def convert_compress_model(model: nn.Module, config: CompressConfig) -> ConvertReturnType: """Convert the model to a search space model.""" print("=" * 80) print(f"[convert] before convert:\n{model}") - model.__class__ = SuperNetMLP - model.hydra_config_dir = config.hydra_config_dir - model.puzzle_dir = config.puzzle_dir - model.dataset_path = config.dataset_path + + # _runtime = NativeDdpRuntime( + # dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10) + # ) # Load hydra config initialize_hydra_config_for_dir( @@ -96,6 +85,9 @@ def convert_compress_model(model: nn.Module, config: CompressConfig) -> ConvertR output_dir=Path(config.puzzle_dir) / hf_ckpt_teacher_dir, ) + # Score_pruning_activations (distributed processing) + # score_pruning_activations.launch_score_activations(hydra_cfg, runtime) + print(f"[convert] after convert:\n{model}") return model, {} @@ -103,8 +95,8 @@ def convert_compress_model(model: nn.Module, config: CompressConfig) -> ConvertR def restore_compress_model( model: nn.Module, config: CompressConfig, metadata: MetadataDict ) -> nn.Module: - """Reuse convert to produce the same behavior on restore.""" - return convert_compress_model(model, config)[0] + """Restore is not needed for the compress mode as we are not saving any model state""" + return model @NASModeRegistry.register_mode From 1033c81e3828d123939b3bda4949cb5b17d16c06 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Tue, 28 Oct 2025 21:35:41 +0100 Subject: [PATCH 28/81] Fix import Signed-off-by: Daniel Korzekwa --- tests/experimental/torch/_compress/test_compress.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/experimental/torch/_compress/test_compress.py 
b/tests/experimental/torch/_compress/test_compress.py index 096de4de3..db06e6580 100644 --- a/tests/experimental/torch/_compress/test_compress.py +++ b/tests/experimental/torch/_compress/test_compress.py @@ -21,7 +21,7 @@ import pytest import torch -from _test_utils.torch_dist.dist_utils import spawn_multiprocess_job +from _test_utils.torch.distributed.utils import spawn_multiprocess_job from datasets import Dataset, DatasetDict from puzzle_tools.hydra_utils import register_hydra_resolvers from scripts.convert_llama3_to_decilm import convert_llama3_to_decilm From 0680c45439c7aeca813cd323adddde317a6a0e20 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Wed, 29 Oct 2025 12:04:35 +0100 Subject: [PATCH 29/81] simplify code Signed-off-by: Daniel Korzekwa --- .../_compress/decilm/converters/convert_llama3_to_decilm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modelopt/torch/_compress/decilm/converters/convert_llama3_to_decilm.py b/modelopt/torch/_compress/decilm/converters/convert_llama3_to_decilm.py index 4b65eeada..d17e7ef74 100644 --- a/modelopt/torch/_compress/decilm/converters/convert_llama3_to_decilm.py +++ b/modelopt/torch/_compress/decilm/converters/convert_llama3_to_decilm.py @@ -19,6 +19,7 @@ #!/usr/bin/env python3 from pathlib import Path +import torch from fire import Fire from puzzle_tools.checkpoint_utils import copy_tokenizer from puzzle_tools.checkpoint_utils_hf import copy_deci_lm_hf_code @@ -46,7 +47,7 @@ def convert_llama3_config_to_decilm_config(config: LlamaConfig) -> DeciLMConfig: dtype = getattr(config, "torch_dtype", None) # Convert torch.dtype to string if needed (for JSON serialization) - if dtype is not None and hasattr(dtype, "__module__") and "torch" in dtype.__module__: + if dtype is not None and isinstance(dtype, torch.dtype): dtype = str(dtype).replace("torch.", "") # Track which global values will be removed (moved to per-layer configs) From 2d9da30b1a0ba5ca7bc60867b4dd36e96ce3c5cb Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Wed, 29 Oct 2025 12:05:40 +0100 Subject: [PATCH 30/81] implementing compress_nas_plugin Signed-off-by: Daniel Korzekwa --- .../plugins/_compress/compress_nas_plugin.py | 35 +++++++++++++++---- .../nas/plugins/_compress/test_nas_convert.py | 28 +++++++++++++-- 2 files changed, 54 insertions(+), 9 deletions(-) diff --git a/modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py b/modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py index 48fb51565..9ba971f45 100644 --- a/modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py +++ b/modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py @@ -13,11 +13,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
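# A quick illustration of the dtype normalization simplified in PATCH 29 above:
# isinstance(dtype, torch.dtype) replaces the __module__-based check, and the string
# form drops the "torch." prefix so the value stays JSON-serializable in the config.
import torch

_dtype = torch.bfloat16
assert isinstance(_dtype, torch.dtype)
assert str(_dtype).replace("torch.", "") == "bfloat16"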
+import datetime from pathlib import Path +import pruning_ckpts +import score_pruning_activations +import torch from scripts.convert_llama3_to_decilm import convert_llama3_to_decilm from torch import nn +from modelopt.torch._compress.runtime import NativeDdpRuntime from modelopt.torch.nas.conversion import NASModeRegistry from modelopt.torch.opt.config import ModeloptBaseConfig, ModeloptField from modelopt.torch.opt.mode import ( @@ -40,12 +45,24 @@ class CompressModel(nn.Module): class CompressConfig(ModeloptBaseConfig): """Configuration for Compress NAS algorithm.""" + input_model_path: str = ModeloptField( + default="", + title="", + description="", + ) + hydra_config_dir: str = ModeloptField( default="", title="", description="", ) + hydra_config_name: str = ModeloptField( + default="", + title="", + description="", + ) + puzzle_dir: str = ModeloptField( default="", title="", @@ -64,14 +81,14 @@ def convert_compress_model(model: nn.Module, config: CompressConfig) -> ConvertR print("=" * 80) print(f"[convert] before convert:\n{model}") - # _runtime = NativeDdpRuntime( - # dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10) - # ) + runtime = NativeDdpRuntime( + dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10) + ) # Load hydra config - initialize_hydra_config_for_dir( + hydra_cfg = initialize_hydra_config_for_dir( config_dir=config.hydra_config_dir, - config_name="Llama-3_1-8B", # TODO: Make it configurable + config_name=config.hydra_config_name, overrides=[ f"puzzle_dir={config.puzzle_dir}", f"dataset_path={config.dataset_path}", @@ -81,12 +98,16 @@ def convert_compress_model(model: nn.Module, config: CompressConfig) -> ConvertR # Convert Llama3 model to DeciLM model hf_ckpt_teacher_dir = "ckpts/teacher" convert_llama3_to_decilm( - input_dir=Path(config.puzzle_dir) / hf_ckpt_teacher_dir, # TODO this should be configurable + input_dir=config.input_model_path, output_dir=Path(config.puzzle_dir) / hf_ckpt_teacher_dir, ) # Score_pruning_activations (distributed processing) - # score_pruning_activations.launch_score_activations(hydra_cfg, runtime) + score_pruning_activations.launch_score_activations(hydra_cfg, runtime) + + if runtime.global_rank == 0: + pruning_ckpts.launch_prune_ckpt(hydra_cfg) + runtime.wait_for_everyone() print(f"[convert] after convert:\n{model}") return model, {} diff --git a/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py b/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py index b2f5d3780..002d3f81c 100644 --- a/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py +++ b/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py @@ -13,9 +13,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
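# convert_compress_model() above interleaves distributed steps (activation scoring)
# with single-process steps (checkpoint pruning). The recurring pattern, written out
# as a generic sketch against the runtime interface used in this file:
def _run_on_rank0_then_sync(runtime, step_fn, *args):
    if runtime.global_rank == 0:  # only one process performs the step, e.g. writing pruned ckpts
        step_fn(*args)
    runtime.wait_for_everyone()  # all ranks block until rank 0 has finished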
+import os +from functools import partial from pathlib import Path import pytest +import torch +from _test_utils.torch_dist.dist_utils import spawn_multiprocess_job from experimental.torch._compress.test_compress import ( _create_and_save_small_llama_model, _save_dummy_dataset, @@ -38,6 +42,16 @@ def project_root_path(request: pytest.FixtureRequest) -> Path: # TODO: Remove those instructions once this test runs automatically on CI # def test_nas_convert(project_root_path: Path, tmp_path: Path): + spawn_multiprocess_job( + size=torch.cuda.device_count(), + job=partial(_test_nas_convert_multiprocess_job, project_root_path, tmp_path), + backend="nccl", + ) + + +def _test_nas_convert_multiprocess_job( + project_root_path: Path, tmp_path: Path, rank: int, size: int +): # Register Hydra custom resolvers (needed for config resolution) register_hydra_resolvers() @@ -48,6 +62,7 @@ def test_nas_convert(project_root_path: Path, tmp_path: Path): puzzle_dir = tmp_path dataset_path = puzzle_dir / "dummy_dataset" hydra_config_dir = project_root_path / "tests/experimental/torch/_compress/resources/configs" + hydra_config_name = "Llama-3_1-8B" # Setup puzzle_dir and dataset _setup_puzzle_dir(puzzle_dir) @@ -56,8 +71,7 @@ def test_nas_convert(project_root_path: Path, tmp_path: Path): # Create a small Llama model (input to the mnt.convert() step) tokenizer_path = project_root_path / "tests/experimental/torch/_compress/resources/tokenizer" tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) - hf_ckpt_teacher_dir = "ckpts/teacher" - llama_checkpoint_path = puzzle_dir / hf_ckpt_teacher_dir + llama_checkpoint_path = puzzle_dir / "ckpts/llama" _create_and_save_small_llama_model( llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer ) @@ -72,7 +86,9 @@ def test_nas_convert(project_root_path: Path, tmp_path: Path): ( "compress", { + "input_model_path": str(llama_checkpoint_path), "hydra_config_dir": str(hydra_config_dir), + "hydra_config_name": hydra_config_name, "puzzle_dir": str(puzzle_dir), "dataset_path": str(dataset_path), }, @@ -83,3 +99,11 @@ def test_nas_convert(project_root_path: Path, tmp_path: Path): # # Check assertions # + + # assertions for the score_pruning_activations step 1 + rank = int(os.environ["RANK"]) + rank_filepath = f"pruning/pruning_scores/ffn_iterative/100samples_diverse_mini/rank_{rank}.pth" + assert (puzzle_dir / rank_filepath).is_file() + + # assertions for the pruning_ckpts step 2 + assert (puzzle_dir / "ckpts/ffn_256_attn_no_op").exists() From febab440b714ee745ca4464d50f6d795ef145e63 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Wed, 29 Oct 2025 12:11:29 +0100 Subject: [PATCH 31/81] code clean up. 
Signed-off-by: Daniel Korzekwa --- .../torch/nas/plugins/_compress/test_nas_convert.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py b/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py index 002d3f81c..0c4756bd8 100644 --- a/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py +++ b/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py @@ -38,7 +38,7 @@ def project_root_path(request: pytest.FixtureRequest) -> Path: # -# See tests/gpu/torch/_compress/test_compress.py for instructions on how to run this test +# See tests/experimental/torch/_compress/test_compress.py for instructions on how to run this test # TODO: Remove those instructions once this test runs automatically on CI # def test_nas_convert(project_root_path: Path, tmp_path: Path): @@ -56,8 +56,8 @@ def _test_nas_convert_multiprocess_job( register_hydra_resolvers() # - # Step 1: Setup the puzzle_dir, dataset, hydra_config_dir, and input model - # needed for the mnt.convert() step + # Setup the inputs for the nas.convert() step: puzzle_dir, dataset, + # hydra_config_dir/hydra_config_name, and input model # puzzle_dir = tmp_path dataset_path = puzzle_dir / "dummy_dataset" @@ -68,7 +68,7 @@ def _test_nas_convert_multiprocess_job( _setup_puzzle_dir(puzzle_dir) _save_dummy_dataset(dataset_path) - # Create a small Llama model (input to the mnt.convert() step) + # Create a small Llama model tokenizer_path = project_root_path / "tests/experimental/torch/_compress/resources/tokenizer" tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) llama_checkpoint_path = puzzle_dir / "ckpts/llama" @@ -100,10 +100,10 @@ def _test_nas_convert_multiprocess_job( # Check assertions # - # assertions for the score_pruning_activations step 1 + # assertions for the score_pruning_activations step rank = int(os.environ["RANK"]) rank_filepath = f"pruning/pruning_scores/ffn_iterative/100samples_diverse_mini/rank_{rank}.pth" assert (puzzle_dir / rank_filepath).is_file() - # assertions for the pruning_ckpts step 2 + # assertions for the pruning_ckpts step assert (puzzle_dir / "ckpts/ffn_256_attn_no_op").exists() From 86bf394f41c4842d25d2b7c6287c034aea328768 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Wed, 29 Oct 2025 12:21:59 +0100 Subject: [PATCH 32/81] code clean up Signed-off-by: Daniel Korzekwa --- .../plugins/_compress/compress_nas_plugin.py | 27 +++++++++++++------ 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py b/modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py index 9ba971f45..748f33939 100644 --- a/modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py +++ b/modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py @@ -13,6 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +""" +Compress NAS plugin for the Modelopt framework (based on Puzzle algorithm: https://arxiv.org/abs/2411.19146). 
+""" + import datetime from pathlib import Path @@ -39,36 +43,41 @@ class CompressModel(nn.Module): - pass + pass # No model implementation is needed for the compress mode class CompressConfig(ModeloptBaseConfig): """Configuration for Compress NAS algorithm.""" + # Input model path to compress in the HF format input_model_path: str = ModeloptField( default="", title="", description="", ) + # Hydra config directory containing the search space definition hydra_config_dir: str = ModeloptField( default="", title="", description="", ) + # Hydra config name containing the search space definition hydra_config_name: str = ModeloptField( default="", title="", description="", ) + # Directory to save the compressed model and intermediate results puzzle_dir: str = ModeloptField( default="", title="", description="", ) + # Dataset path to use for scoring in prunining and NAS search dataset_path: str = ModeloptField( default="", title="", @@ -77,10 +86,12 @@ class CompressConfig(ModeloptBaseConfig): def convert_compress_model(model: nn.Module, config: CompressConfig) -> ConvertReturnType: - """Convert the model to a search space model.""" - print("=" * 80) - print(f"[convert] before convert:\n{model}") + """1. Convert the model from HF format to DeciLM format. + 2. Score the pruning activations. + 3. Prune the model and save pruned checkpoints + The output of this step will be used by mnt.search() to perform the NAS search. + """ runtime = NativeDdpRuntime( dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10) ) @@ -96,20 +107,20 @@ def convert_compress_model(model: nn.Module, config: CompressConfig) -> ConvertR ) # Convert Llama3 model to DeciLM model - hf_ckpt_teacher_dir = "ckpts/teacher" + hf_ckpt_teacher_dir = "ckpts/teacher" # TODO: make it configurable convert_llama3_to_decilm( input_dir=config.input_model_path, output_dir=Path(config.puzzle_dir) / hf_ckpt_teacher_dir, ) - # Score_pruning_activations (distributed processing) + # Score_pruning_activations (distributed processing) score_pruning_activations.launch_score_activations(hydra_cfg, runtime) + # Prune the model and save pruned checkpoints if runtime.global_rank == 0: pruning_ckpts.launch_prune_ckpt(hydra_cfg) runtime.wait_for_everyone() - print(f"[convert] after convert:\n{model}") return model, {} @@ -137,7 +148,7 @@ def config_class(self) -> type[ModeloptBaseConfig]: @property def search_algorithm(self) -> type[BaseSearcher]: """Return the associated searcher implementation.""" - raise NotImplementedError("Compress mode does not have a search algorithm.") + raise NotImplementedError("Compress mode does not have a search algorithm yet.") @property def convert(self) -> ConvertEntrypoint: From 86e04a06157dec82e1baf380298ddb75200c239e Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Wed, 29 Oct 2025 14:16:48 +0100 Subject: [PATCH 33/81] create conftest.py with shared test logic for compress tests. 
Signed-off-by: Daniel Korzekwa --- .../experimental/torch/_compress/conftest.py | 120 ++++++++++++++++++ ..._convert_llama3_config_to_decilm_config.py | 21 +-- .../torch/_compress/test_compress.py | 110 ++-------------- 3 files changed, 136 insertions(+), 115 deletions(-) create mode 100644 tests/experimental/torch/_compress/conftest.py diff --git a/tests/experimental/torch/_compress/conftest.py b/tests/experimental/torch/_compress/conftest.py new file mode 100644 index 000000000..4dedf5363 --- /dev/null +++ b/tests/experimental/torch/_compress/conftest.py @@ -0,0 +1,120 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import shutil +from pathlib import Path + +import pytest +import torch +from datasets import Dataset, DatasetDict +from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM, PreTrainedTokenizerBase + + +@pytest.fixture +def project_root_path(request: pytest.FixtureRequest) -> Path: + """Fixture providing the project root path for tests.""" + return Path(request.config.rootpath) + + +def create_and_save_small_llama_model( + output_path: str, vocab_size: int, tokenizer: PreTrainedTokenizerBase +): + """ + Create and save a small Llama model for testing the conversion pipeline. + This mimics having a real Llama checkpoint that needs to be converted. + """ + os.makedirs(output_path, exist_ok=True) + + # Create a minimal Llama config (small for testing) + # Note: intermediate_size must be divisible by 256 per DeciLM config requirements + # Note: hidden_size must give head_dim >= 8 for Flash Attention 2 compatibility + llama_config = LlamaConfig( + vocab_size=vocab_size, + hidden_size=256, # 32 heads times 8 head_dim = 256 (matches bypass config expectations) + intermediate_size=512, # Must be divisible by 256 + num_hidden_layers=2, + num_attention_heads=32, # Matches original test + num_key_value_heads=8, # GQA: 32÷4=8 (matches original n_heads_in_group=4) + max_position_embeddings=512, + rms_norm_eps=1e-5, + rope_theta=10000.0, + attention_bias=False, + hidden_act="silu", + tie_word_embeddings=False, + ) + + # Create and save the Llama model + model = LlamaForCausalLM(llama_config) + model.to(dtype=torch.bfloat16).save_pretrained(output_path) + + # Save tokenizer + tokenizer.save_pretrained(output_path) + + # Save config + llama_config.save_pretrained(output_path) + + +def create_tokenizer(project_root_path: Path) -> PreTrainedTokenizerBase: + """ + Create a tokenizer for the Llama model. 
+ """ + tokenizer_path = project_root_path / "tests/experimental/torch/_compress/resources/tokenizer" + tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) + return tokenizer + + +def setup_puzzle_dir(puzzle_dir: str): + if Path(puzzle_dir).exists(): + shutil.rmtree(puzzle_dir) + Path(puzzle_dir).mkdir(parents=True, exist_ok=True) + + +def save_dummy_dataset(dataset_path: str): + # dummy sample + sample = [ + {"role": "user", "content": "please cite Lorem Ipsum?"}, + { + "role": "assistant", + "content": ( + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed in blandit ante. " + "Sed tempus erat urna, ac elementum nisl facilisis quis. Aliquam consectetur mollis massa, " + "in elementum sem venenatis posuere. Fusce lorem arcu, egestas vel massa sollicitudin, " + "dictum mollis purus. Proin in ullamcorper elit. Nam tellus nisi, volutpat a mattis vel, " + "pretium in purus. Nunc at lectus facilisis risus scelerisque rhoncus eu nec ex. " + "Maecenas semper, tellus non placerat vulputate, urna felis facilisis diam, " + "sit amet vestibulum erat sapien nec libero. Praesent non massa velit. Donec faucibus mi eros. " + "Nam turpis nulla, congue sit amet mi at, porttitor scelerisque elit. Nunc id sodales lorem, " + "nec tincidunt leo. Quisque a neque nec ligula porttitor auctor. " + "Nunc accumsan nunc ac tellus congue vehicula. Praesent tellus eros, luctus non gravida dapibus, " + "faucibus eu ex. Quisque bibendum leo pharetra, tristique est vitae, hendrerit nunc. " + "Duis nec congue dolor. Donec commodo ipsum non efficitur volutpat. " + "Nulla risus nulla, efficitur et urna at, imperdiet sodales lorem. " + "Suspendisse erat est, sollicitudin at nisl tincidunt, vehicula hendrerit lectus. " + "Nam quis nisi ullamcorper, rhoncus massa vel, tempus purus. " + "Duis pulvinar eros vel nulla pellentesque, at dapibus justo laoreet. " + "Praesent tortor orci, vulputate fermentum dapibus nec, feugiat vitae tortor. " + "Donec mollis convallis massa quis iaculis." + ), + }, + ] + + # Prepare train and val splits with sample repeated, 2500 samples are for + # 128 samples with block-size 8192 and LLama3 tokenizer + data = [{"conversation": sample}] * 2500 + + # For train-val splits + data_dict = DatasetDict({"train": Dataset.from_list(data), "valid": Dataset.from_list(data)}) + data_dict.save_to_disk(dataset_path) diff --git a/tests/experimental/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py b/tests/experimental/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py index 03c3c4cd6..a1d897ceb 100644 --- a/tests/experimental/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py +++ b/tests/experimental/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py @@ -14,31 +14,22 @@ # limitations under the License. 
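# The shared helpers above are combined by the GPU tests roughly as follows (a condensed
# sketch; puzzle_dir would be a pytest tmp_path and project_root_path the repo fixture):
from experimental.torch._compress.conftest import (
    create_and_save_small_llama_model,
    create_tokenizer,
    save_dummy_dataset,
    setup_puzzle_dir,
)


def _prepare_compress_test_inputs(project_root_path, puzzle_dir):
    setup_puzzle_dir(puzzle_dir)  # start from a clean scratch directory
    save_dummy_dataset(puzzle_dir / "dummy_dataset")  # tiny train/valid splits
    tokenizer = create_tokenizer(project_root_path)  # Llama tokenizer from test resources
    create_and_save_small_llama_model(  # toy Llama checkpoint that will be compressed
        puzzle_dir / "ckpts/teacher",
        vocab_size=tokenizer.vocab_size,
        tokenizer=tokenizer,
    )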
import json -import os.path as osp from pathlib import Path -import pytest -from experimental.torch._compress.test_compress import _create_and_save_small_llama_model -from transformers import AutoTokenizer +from experimental.torch._compress.conftest import ( + create_and_save_small_llama_model, + create_tokenizer, +) from modelopt.torch._compress.decilm.converters.convert_llama3_to_decilm import ( convert_llama3_to_decilm, ) -@pytest.fixture -def project_root_path(request: pytest.FixtureRequest) -> Path: - return Path(request.config.rootpath) - - def test_convert_llama3_config_to_decilm_config(project_root_path: Path, tmp_path: Path): - tokenizer_path = osp.join( - project_root_path, "tests/experimental/torch/_compress/resources/tokenizer" - ) - tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) - + tokenizer = create_tokenizer(project_root_path) llama_checkpoint_path = tmp_path / "llama_checkpoint" - _create_and_save_small_llama_model( + create_and_save_small_llama_model( llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer ) diff --git a/tests/experimental/torch/_compress/test_compress.py b/tests/experimental/torch/_compress/test_compress.py index db06e6580..f36c9ff6b 100644 --- a/tests/experimental/torch/_compress/test_compress.py +++ b/tests/experimental/torch/_compress/test_compress.py @@ -15,27 +15,23 @@ import datetime import os -import shutil from functools import partial from pathlib import Path -import pytest import torch from _test_utils.torch.distributed.utils import spawn_multiprocess_job -from datasets import Dataset, DatasetDict +from experimental.torch._compress.conftest import ( + create_and_save_small_llama_model, + create_tokenizer, + save_dummy_dataset, + setup_puzzle_dir, +) from puzzle_tools.hydra_utils import register_hydra_resolvers from scripts.convert_llama3_to_decilm import convert_llama3_to_decilm -from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM, PreTrainedTokenizerBase from modelopt.torch._compress import compress from modelopt.torch._compress.runtime import NativeDdpRuntime - -@pytest.fixture -def project_root_path(request: pytest.FixtureRequest) -> Path: - return Path(request.config.rootpath) - - # The e2e test to compress a model based on Local Neural Architecture Search (Mixed Integer Programing NAS search) # using a one-click command. 
# @@ -84,23 +80,19 @@ def _test_compress_multiprocess_job(project_root_path: Path, tmp_path: Path, ran # if rank == 0: # Setup puzzle_dir and dataset - _setup_puzzle_dir(puzzle_dir) - _save_dummy_dataset(dataset_path) + setup_puzzle_dir(puzzle_dir) + save_dummy_dataset(dataset_path) # # Step 1: Create and save a teacher model to compress # This mimics the normal pipeline where we start with a Llama model # - tokenizer_path = ( - project_root_path / "tests/experimental/torch/_compress/resources/tokenizer" - ) - - tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) # Create a small Llama model (not DeciLM) to match the normal conversion pipeline + tokenizer = create_tokenizer(project_root_path) hf_ckpt_teacher_dir = "ckpts/teacher" llama_checkpoint_path = puzzle_dir / hf_ckpt_teacher_dir - _create_and_save_small_llama_model( + create_and_save_small_llama_model( llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer ) @@ -156,85 +148,3 @@ def _test_compress_multiprocess_job(project_root_path: Path, tmp_path: Path, ran runtime.wait_for_everyone() print("PYTEST SUMMARY: test_compress_model() test has finished successfully") - - -def _create_and_save_small_llama_model( - output_path: str, vocab_size: int, tokenizer: PreTrainedTokenizerBase -): - """ - Create and save a small Llama model for testing the conversion pipeline. - This mimics having a real Llama checkpoint that needs to be converted. - """ - os.makedirs(output_path, exist_ok=True) - - # Create a minimal Llama config (small for testing) - # Note: intermediate_size must be divisible by 256 per DeciLM config requirements - # Note: hidden_size must give head_dim >= 8 for Flash Attention 2 compatibility - llama_config = LlamaConfig( - vocab_size=vocab_size, - hidden_size=256, # 32 heads times 8 head_dim = 256 (matches bypass config expectations) - intermediate_size=512, # Must be divisible by 256 - num_hidden_layers=2, - num_attention_heads=32, # Matches original test - num_key_value_heads=8, # GQA: 32÷4=8 (matches original n_heads_in_group=4) - max_position_embeddings=512, - rms_norm_eps=1e-5, - rope_theta=10000.0, - attention_bias=False, - hidden_act="silu", - tie_word_embeddings=False, - ) - - # Create and save the Llama model - model = LlamaForCausalLM(llama_config) - model.to(dtype=torch.bfloat16).save_pretrained(output_path) - - # Save tokenizer - tokenizer.save_pretrained(output_path) - - # Save config - llama_config.save_pretrained(output_path) - - -def _setup_puzzle_dir(puzzle_dir: str): - if Path(puzzle_dir).exists(): - shutil.rmtree(puzzle_dir) - Path(puzzle_dir).mkdir(parents=True, exist_ok=True) - - -def _save_dummy_dataset(dataset_path: str): - # dummy sample - sample = [ - {"role": "user", "content": "please cite Lorem Ipsum?"}, - { - "role": "assistant", - "content": ( - "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed in blandit ante. " - "Sed tempus erat urna, ac elementum nisl facilisis quis. Aliquam consectetur mollis massa, " - "in elementum sem venenatis posuere. Fusce lorem arcu, egestas vel massa sollicitudin, " - "dictum mollis purus. Proin in ullamcorper elit. Nam tellus nisi, volutpat a mattis vel, " - "pretium in purus. Nunc at lectus facilisis risus scelerisque rhoncus eu nec ex. " - "Maecenas semper, tellus non placerat vulputate, urna felis facilisis diam, " - "sit amet vestibulum erat sapien nec libero. Praesent non massa velit. Donec faucibus mi eros. " - "Nam turpis nulla, congue sit amet mi at, porttitor scelerisque elit. 
Nunc id sodales lorem, " - "nec tincidunt leo. Quisque a neque nec ligula porttitor auctor. " - "Nunc accumsan nunc ac tellus congue vehicula. Praesent tellus eros, luctus non gravida dapibus, " - "faucibus eu ex. Quisque bibendum leo pharetra, tristique est vitae, hendrerit nunc. " - "Duis nec congue dolor. Donec commodo ipsum non efficitur volutpat. " - "Nulla risus nulla, efficitur et urna at, imperdiet sodales lorem. " - "Suspendisse erat est, sollicitudin at nisl tincidunt, vehicula hendrerit lectus. " - "Nam quis nisi ullamcorper, rhoncus massa vel, tempus purus. " - "Duis pulvinar eros vel nulla pellentesque, at dapibus justo laoreet. " - "Praesent tortor orci, vulputate fermentum dapibus nec, feugiat vitae tortor. " - "Donec mollis convallis massa quis iaculis." - ), - }, - ] - - # Prepare train and val splits with sample repeated, 2500 samples are for - # 128 samples with block-size 8192 and LLama3 tokenizer - data = [{"conversation": sample}] * 2500 - - # For train-val splits - data_dict = DatasetDict({"train": Dataset.from_list(data), "valid": Dataset.from_list(data)}) - data_dict.save_to_disk(dataset_path) From ae6164423e0539ca4640f35f0ffc6fcd67a9a1b6 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Wed, 29 Oct 2025 14:19:10 +0100 Subject: [PATCH 34/81] code cleanup Signed-off-by: Daniel Korzekwa --- tests/experimental/torch/_compress/test_compress.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/experimental/torch/_compress/test_compress.py b/tests/experimental/torch/_compress/test_compress.py index f36c9ff6b..018b78e1a 100644 --- a/tests/experimental/torch/_compress/test_compress.py +++ b/tests/experimental/torch/_compress/test_compress.py @@ -69,6 +69,7 @@ def _test_compress_multiprocess_job(project_root_path: Path, tmp_path: Path, ran puzzle_dir = tmp_path dataset_path = puzzle_dir / "dummy_dataset" hydra_config_dir = project_root_path / "tests/experimental/torch/_compress/resources/configs" + hydra_config_name = "Llama-3_1-8B" _runtime = NativeDdpRuntime( dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10) @@ -105,7 +106,7 @@ def _test_compress_multiprocess_job(project_root_path: Path, tmp_path: Path, ran # Compress the model using a one-click approach compress.compress( - str(hydra_config_dir), "Llama-3_1-8B", str(puzzle_dir), str(dataset_path), runtime + str(hydra_config_dir), hydra_config_name, str(puzzle_dir), str(dataset_path), runtime ) # From 3778ec21e20146a81410cdc7c2e86253d79ec40d Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Wed, 29 Oct 2025 14:51:49 +0100 Subject: [PATCH 35/81] code refactoring Signed-off-by: Daniel Korzekwa --- .../torch/_compress/test_compress.py | 9 +- .../nas/plugins/_compress/test_nas_convert.py | 103 ++++++++++-------- 2 files changed, 64 insertions(+), 48 deletions(-) diff --git a/tests/experimental/torch/_compress/test_compress.py b/tests/experimental/torch/_compress/test_compress.py index 018b78e1a..1cc948c58 100644 --- a/tests/experimental/torch/_compress/test_compress.py +++ b/tests/experimental/torch/_compress/test_compress.py @@ -66,16 +66,19 @@ def test_compress(project_root_path: Path, tmp_path: Path): def _test_compress_multiprocess_job(project_root_path: Path, tmp_path: Path, rank: int, size: int): register_hydra_resolvers() + # + # The inputs for the compress() algorihm. 
+ # puzzle_dir = tmp_path dataset_path = puzzle_dir / "dummy_dataset" hydra_config_dir = project_root_path / "tests/experimental/torch/_compress/resources/configs" hydra_config_name = "Llama-3_1-8B" - _runtime = NativeDdpRuntime( + runtime = NativeDdpRuntime( dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10) ) - with _runtime as runtime: + with runtime as runtime: # # Test setup # @@ -148,4 +151,4 @@ def _test_compress_multiprocess_job(project_root_path: Path, tmp_path: Path, ran runtime.wait_for_everyone() - print("PYTEST SUMMARY: test_compress_model() test has finished successfully") + print("PYTEST SUMMARY: test_compress_model() test has finished successfully") diff --git a/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py b/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py index 0c4756bd8..6bd0e248a 100644 --- a/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py +++ b/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py @@ -13,22 +13,24 @@ # See the License for the specific language governing permissions and # limitations under the License. +import datetime import os from functools import partial from pathlib import Path import pytest import torch -from _test_utils.torch_dist.dist_utils import spawn_multiprocess_job -from experimental.torch._compress.test_compress import ( - _create_and_save_small_llama_model, - _save_dummy_dataset, - _setup_puzzle_dir, +from _test_utils.torch.distributed.utils import spawn_multiprocess_job +from experimental.torch._compress.conftest import ( + create_and_save_small_llama_model, + create_tokenizer, + save_dummy_dataset, + setup_puzzle_dir, ) from puzzle_tools.hydra_utils import register_hydra_resolvers -from transformers import AutoTokenizer import modelopt.torch.nas as mtn +from modelopt.torch._compress.runtime import NativeDdpRuntime from modelopt.torch.nas.plugins._compress.compress_nas_plugin import CompressModel @@ -56,54 +58,65 @@ def _test_nas_convert_multiprocess_job( register_hydra_resolvers() # - # Setup the inputs for the nas.convert() step: puzzle_dir, dataset, - # hydra_config_dir/hydra_config_name, and input model + # The inputs for the nas.convert() step. 
# puzzle_dir = tmp_path + llama_checkpoint_path = puzzle_dir / "ckpts/llama" dataset_path = puzzle_dir / "dummy_dataset" hydra_config_dir = project_root_path / "tests/experimental/torch/_compress/resources/configs" hydra_config_name = "Llama-3_1-8B" - # Setup puzzle_dir and dataset - _setup_puzzle_dir(puzzle_dir) - _save_dummy_dataset(dataset_path) - - # Create a small Llama model - tokenizer_path = project_root_path / "tests/experimental/torch/_compress/resources/tokenizer" - tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) - llama_checkpoint_path = puzzle_dir / "ckpts/llama" - _create_and_save_small_llama_model( - llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer + runtime = NativeDdpRuntime( + dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10) ) - # - # Run the mnt.convert() step - # - input_model = CompressModel() - mtn.convert( - input_model, - mode=[ - ( - "compress", - { - "input_model_path": str(llama_checkpoint_path), - "hydra_config_dir": str(hydra_config_dir), - "hydra_config_name": hydra_config_name, - "puzzle_dir": str(puzzle_dir), - "dataset_path": str(dataset_path), - }, + with runtime as runtime: + if rank == 0: + # Setup puzzle_dir and dataset + setup_puzzle_dir(puzzle_dir) + save_dummy_dataset(dataset_path) + + # Create a small Llama model + tokenizer = create_tokenizer(project_root_path) + create_and_save_small_llama_model( + llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer ) - ], - ) + runtime.wait_for_everyone() + + # + # Run the mnt.convert() step + # + input_model = CompressModel() + mtn.convert( + input_model, + mode=[ + ( + "compress", + { + "puzzle_dir": str(puzzle_dir), + "input_model_path": str(llama_checkpoint_path), + "hydra_config_dir": str(hydra_config_dir), + "hydra_config_name": hydra_config_name, + "dataset_path": str(dataset_path), + }, + ) + ], + ) + + # + # Check assertions + # + if rank == 0: + # assertions for the score_pruning_activations step + rank = int(os.environ["RANK"]) + rank_filepath = ( + f"pruning/pruning_scores/ffn_iterative/100samples_diverse_mini/rank_{rank}.pth" + ) + assert (puzzle_dir / rank_filepath).is_file() - # - # Check assertions - # + # assertions for the pruning_ckpts step + assert (puzzle_dir / "ckpts/ffn_256_attn_no_op").exists() - # assertions for the score_pruning_activations step - rank = int(os.environ["RANK"]) - rank_filepath = f"pruning/pruning_scores/ffn_iterative/100samples_diverse_mini/rank_{rank}.pth" - assert (puzzle_dir / rank_filepath).is_file() + runtime.wait_for_everyone() - # assertions for the pruning_ckpts step - assert (puzzle_dir / "ckpts/ffn_256_attn_no_op").exists() + print("PYTEST SUMMARY: test_nas_convert() test has finished successfully") From d940000ecb2116dc64e68c08cfc5018cfba05d0c Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Wed, 29 Oct 2025 14:56:30 +0100 Subject: [PATCH 36/81] refactoring Signed-off-by: Daniel Korzekwa --- .../_compress => _compress/nas/plugins}/compress_nas_plugin.py | 0 .../_compress => _compress/nas/plugins}/test_nas_convert.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename modelopt/torch/{nas/plugins/_compress => _compress/nas/plugins}/compress_nas_plugin.py (100%) rename tests/experimental/torch/{nas/plugins/_compress => _compress/nas/plugins}/test_nas_convert.py (98%) diff --git a/modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py b/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py similarity index 100% rename from 
modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py rename to modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py diff --git a/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py b/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py similarity index 98% rename from tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py rename to tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py index 6bd0e248a..81b51dcf3 100644 --- a/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py +++ b/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py @@ -30,8 +30,8 @@ from puzzle_tools.hydra_utils import register_hydra_resolvers import modelopt.torch.nas as mtn +from modelopt.torch._compress.nas.plugins.compress_nas_plugin import CompressModel from modelopt.torch._compress.runtime import NativeDdpRuntime -from modelopt.torch.nas.plugins._compress.compress_nas_plugin import CompressModel @pytest.fixture From 0bf9a92763e4125b2dd2b23655abb16040e22b9c Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Wed, 29 Oct 2025 15:02:13 +0100 Subject: [PATCH 37/81] move test utilities from conftest.py to test_utils.py Signed-off-by: Daniel Korzekwa --- .../experimental/torch/_compress/conftest.py | 96 -------------- ..._convert_llama3_config_to_decilm_config.py | 2 +- .../_compress/nas/plugins/test_nas_convert.py | 8 +- .../torch/_compress/test_compress.py | 2 +- .../torch/_compress/test_utils.py | 119 ++++++++++++++++++ 5 files changed, 122 insertions(+), 105 deletions(-) create mode 100644 tests/experimental/torch/_compress/test_utils.py diff --git a/tests/experimental/torch/_compress/conftest.py b/tests/experimental/torch/_compress/conftest.py index 4dedf5363..cae1bfbca 100644 --- a/tests/experimental/torch/_compress/conftest.py +++ b/tests/experimental/torch/_compress/conftest.py @@ -13,108 +13,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os -import shutil from pathlib import Path import pytest -import torch -from datasets import Dataset, DatasetDict -from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM, PreTrainedTokenizerBase @pytest.fixture def project_root_path(request: pytest.FixtureRequest) -> Path: """Fixture providing the project root path for tests.""" return Path(request.config.rootpath) - - -def create_and_save_small_llama_model( - output_path: str, vocab_size: int, tokenizer: PreTrainedTokenizerBase -): - """ - Create and save a small Llama model for testing the conversion pipeline. - This mimics having a real Llama checkpoint that needs to be converted. 
- """ - os.makedirs(output_path, exist_ok=True) - - # Create a minimal Llama config (small for testing) - # Note: intermediate_size must be divisible by 256 per DeciLM config requirements - # Note: hidden_size must give head_dim >= 8 for Flash Attention 2 compatibility - llama_config = LlamaConfig( - vocab_size=vocab_size, - hidden_size=256, # 32 heads times 8 head_dim = 256 (matches bypass config expectations) - intermediate_size=512, # Must be divisible by 256 - num_hidden_layers=2, - num_attention_heads=32, # Matches original test - num_key_value_heads=8, # GQA: 32÷4=8 (matches original n_heads_in_group=4) - max_position_embeddings=512, - rms_norm_eps=1e-5, - rope_theta=10000.0, - attention_bias=False, - hidden_act="silu", - tie_word_embeddings=False, - ) - - # Create and save the Llama model - model = LlamaForCausalLM(llama_config) - model.to(dtype=torch.bfloat16).save_pretrained(output_path) - - # Save tokenizer - tokenizer.save_pretrained(output_path) - - # Save config - llama_config.save_pretrained(output_path) - - -def create_tokenizer(project_root_path: Path) -> PreTrainedTokenizerBase: - """ - Create a tokenizer for the Llama model. - """ - tokenizer_path = project_root_path / "tests/experimental/torch/_compress/resources/tokenizer" - tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) - return tokenizer - - -def setup_puzzle_dir(puzzle_dir: str): - if Path(puzzle_dir).exists(): - shutil.rmtree(puzzle_dir) - Path(puzzle_dir).mkdir(parents=True, exist_ok=True) - - -def save_dummy_dataset(dataset_path: str): - # dummy sample - sample = [ - {"role": "user", "content": "please cite Lorem Ipsum?"}, - { - "role": "assistant", - "content": ( - "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed in blandit ante. " - "Sed tempus erat urna, ac elementum nisl facilisis quis. Aliquam consectetur mollis massa, " - "in elementum sem venenatis posuere. Fusce lorem arcu, egestas vel massa sollicitudin, " - "dictum mollis purus. Proin in ullamcorper elit. Nam tellus nisi, volutpat a mattis vel, " - "pretium in purus. Nunc at lectus facilisis risus scelerisque rhoncus eu nec ex. " - "Maecenas semper, tellus non placerat vulputate, urna felis facilisis diam, " - "sit amet vestibulum erat sapien nec libero. Praesent non massa velit. Donec faucibus mi eros. " - "Nam turpis nulla, congue sit amet mi at, porttitor scelerisque elit. Nunc id sodales lorem, " - "nec tincidunt leo. Quisque a neque nec ligula porttitor auctor. " - "Nunc accumsan nunc ac tellus congue vehicula. Praesent tellus eros, luctus non gravida dapibus, " - "faucibus eu ex. Quisque bibendum leo pharetra, tristique est vitae, hendrerit nunc. " - "Duis nec congue dolor. Donec commodo ipsum non efficitur volutpat. " - "Nulla risus nulla, efficitur et urna at, imperdiet sodales lorem. " - "Suspendisse erat est, sollicitudin at nisl tincidunt, vehicula hendrerit lectus. " - "Nam quis nisi ullamcorper, rhoncus massa vel, tempus purus. " - "Duis pulvinar eros vel nulla pellentesque, at dapibus justo laoreet. " - "Praesent tortor orci, vulputate fermentum dapibus nec, feugiat vitae tortor. " - "Donec mollis convallis massa quis iaculis." 
- ), - }, - ] - - # Prepare train and val splits with sample repeated, 2500 samples are for - # 128 samples with block-size 8192 and LLama3 tokenizer - data = [{"conversation": sample}] * 2500 - - # For train-val splits - data_dict = DatasetDict({"train": Dataset.from_list(data), "valid": Dataset.from_list(data)}) - data_dict.save_to_disk(dataset_path) diff --git a/tests/experimental/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py b/tests/experimental/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py index a1d897ceb..92dad84e4 100644 --- a/tests/experimental/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py +++ b/tests/experimental/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py @@ -16,7 +16,7 @@ import json from pathlib import Path -from experimental.torch._compress.conftest import ( +from experimental.torch._compress.test_utils import ( create_and_save_small_llama_model, create_tokenizer, ) diff --git a/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py b/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py index 81b51dcf3..4a416c833 100644 --- a/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py +++ b/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py @@ -18,10 +18,9 @@ from functools import partial from pathlib import Path -import pytest import torch from _test_utils.torch.distributed.utils import spawn_multiprocess_job -from experimental.torch._compress.conftest import ( +from experimental.torch._compress.test_utils import ( create_and_save_small_llama_model, create_tokenizer, save_dummy_dataset, @@ -34,11 +33,6 @@ from modelopt.torch._compress.runtime import NativeDdpRuntime -@pytest.fixture -def project_root_path(request: pytest.FixtureRequest) -> Path: - return Path(request.config.rootpath) - - # # See tests/experimental/torch/_compress/test_compress.py for instructions on how to run this test # TODO: Remove those instructions once this test runs automatically on CI diff --git a/tests/experimental/torch/_compress/test_compress.py b/tests/experimental/torch/_compress/test_compress.py index 1cc948c58..dd9bac9c5 100644 --- a/tests/experimental/torch/_compress/test_compress.py +++ b/tests/experimental/torch/_compress/test_compress.py @@ -20,7 +20,7 @@ import torch from _test_utils.torch.distributed.utils import spawn_multiprocess_job -from experimental.torch._compress.conftest import ( +from experimental.torch._compress.test_utils import ( create_and_save_small_llama_model, create_tokenizer, save_dummy_dataset, diff --git a/tests/experimental/torch/_compress/test_utils.py b/tests/experimental/torch/_compress/test_utils.py new file mode 100644 index 000000000..21ca622da --- /dev/null +++ b/tests/experimental/torch/_compress/test_utils.py @@ -0,0 +1,119 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import shutil +from pathlib import Path + +import torch +from datasets import Dataset, DatasetDict +from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM, PreTrainedTokenizerBase + + +def create_and_save_small_llama_model( + output_path: str, vocab_size: int, tokenizer: PreTrainedTokenizerBase +): + """ + Create and save a small Llama model for testing the conversion pipeline. + This mimics having a real Llama checkpoint that needs to be converted. + """ + os.makedirs(output_path, exist_ok=True) + + # Create a minimal Llama config (small for testing) + # Note: intermediate_size must be divisible by 256 per DeciLM config requirements + # Note: hidden_size must give head_dim >= 8 for Flash Attention 2 compatibility + llama_config = LlamaConfig( + vocab_size=vocab_size, + hidden_size=256, # 32 heads times 8 head_dim = 256 (matches bypass config expectations) + intermediate_size=512, # Must be divisible by 256 + num_hidden_layers=2, + num_attention_heads=32, # Matches original test + num_key_value_heads=8, # GQA: 32÷4=8 (matches original n_heads_in_group=4) + max_position_embeddings=512, + rms_norm_eps=1e-5, + rope_theta=10000.0, + attention_bias=False, + hidden_act="silu", + tie_word_embeddings=False, + ) + + # Create and save the Llama model + model = LlamaForCausalLM(llama_config) + model.to(dtype=torch.bfloat16).save_pretrained(output_path) + + # Save tokenizer + tokenizer.save_pretrained(output_path) + + # Save config + llama_config.save_pretrained(output_path) + + +def create_tokenizer(project_root_path: Path) -> PreTrainedTokenizerBase: + """ + Create a tokenizer for the Llama model. + """ + tokenizer_path = project_root_path / "tests/experimental/torch/_compress/resources/tokenizer" + tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) + return tokenizer + + +def setup_puzzle_dir(puzzle_dir: str): + """ + Setup puzzle directory by removing existing directory and creating a new one. + """ + if Path(puzzle_dir).exists(): + shutil.rmtree(puzzle_dir) + Path(puzzle_dir).mkdir(parents=True, exist_ok=True) + + +def save_dummy_dataset(dataset_path: str): + """ + Save a dummy dataset for testing purposes. + """ + # dummy sample + sample = [ + {"role": "user", "content": "please cite Lorem Ipsum?"}, + { + "role": "assistant", + "content": ( + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed in blandit ante. " + "Sed tempus erat urna, ac elementum nisl facilisis quis. Aliquam consectetur mollis massa, " + "in elementum sem venenatis posuere. Fusce lorem arcu, egestas vel massa sollicitudin, " + "dictum mollis purus. Proin in ullamcorper elit. Nam tellus nisi, volutpat a mattis vel, " + "pretium in purus. Nunc at lectus facilisis risus scelerisque rhoncus eu nec ex. " + "Maecenas semper, tellus non placerat vulputate, urna felis facilisis diam, " + "sit amet vestibulum erat sapien nec libero. Praesent non massa velit. Donec faucibus mi eros. " + "Nam turpis nulla, congue sit amet mi at, porttitor scelerisque elit. Nunc id sodales lorem, " + "nec tincidunt leo. Quisque a neque nec ligula porttitor auctor. " + "Nunc accumsan nunc ac tellus congue vehicula. Praesent tellus eros, luctus non gravida dapibus, " + "faucibus eu ex. Quisque bibendum leo pharetra, tristique est vitae, hendrerit nunc. " + "Duis nec congue dolor. Donec commodo ipsum non efficitur volutpat. " + "Nulla risus nulla, efficitur et urna at, imperdiet sodales lorem. 
" + "Suspendisse erat est, sollicitudin at nisl tincidunt, vehicula hendrerit lectus. " + "Nam quis nisi ullamcorper, rhoncus massa vel, tempus purus. " + "Duis pulvinar eros vel nulla pellentesque, at dapibus justo laoreet. " + "Praesent tortor orci, vulputate fermentum dapibus nec, feugiat vitae tortor. " + "Donec mollis convallis massa quis iaculis." + ), + }, + ] + + # Prepare train and val splits with sample repeated, 2500 samples are for + # 128 samples with block-size 8192 and LLama3 tokenizer + data = [{"conversation": sample}] * 2500 + + # For train-val splits + data_dict = DatasetDict({"train": Dataset.from_list(data), "valid": Dataset.from_list(data)}) + data_dict.save_to_disk(dataset_path) From b56df9a2746297389f3699a8aa628c488812ef4d Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Wed, 29 Oct 2025 15:06:41 +0100 Subject: [PATCH 38/81] Improve comments Signed-off-by: Daniel Korzekwa --- .../torch/_compress/nas/plugins/compress_nas_plugin.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py b/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py index 748f33939..7b7acbed6 100644 --- a/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py +++ b/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py @@ -162,5 +162,8 @@ def restore(self) -> RestoreEntrypoint: @property def export_mode(self) -> str | None: - """The mode that corresponds to the export mode of this mode.""" - return "export" + """The mode that corresponds to the export mode. + For now, this will be a no-op as there is no modelopt's concept of search space defined + for the compress algorithm. + """ + return "export_nas" From 9bfcc21a91a0f7a8d4a2e568941d289979e63ab6 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Wed, 29 Oct 2025 19:18:01 +0100 Subject: [PATCH 39/81] Added TODO. Signed-off-by: Daniel Korzekwa --- tests/experimental/torch/_compress/test_compress.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/experimental/torch/_compress/test_compress.py b/tests/experimental/torch/_compress/test_compress.py index dd9bac9c5..a95cd0be5 100644 --- a/tests/experimental/torch/_compress/test_compress.py +++ b/tests/experimental/torch/_compress/test_compress.py @@ -94,8 +94,9 @@ def _test_compress_multiprocess_job(project_root_path: Path, tmp_path: Path, ran # Create a small Llama model (not DeciLM) to match the normal conversion pipeline tokenizer = create_tokenizer(project_root_path) - hf_ckpt_teacher_dir = "ckpts/teacher" - llama_checkpoint_path = puzzle_dir / hf_ckpt_teacher_dir + # TODO: change it to "ckpts/llama" once the conversion script is fixed + # Currently, the build replacement library step will fail with such a path. 
+ llama_checkpoint_path = puzzle_dir / "ckpts/teacher" create_and_save_small_llama_model( llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer ) @@ -103,7 +104,7 @@ def _test_compress_multiprocess_job(project_root_path: Path, tmp_path: Path, ran # Use the full conversion pipeline (matches normal usage) convert_llama3_to_decilm( input_dir=llama_checkpoint_path, - output_dir=llama_checkpoint_path, + output_dir=puzzle_dir / "ckpts/teacher", ) runtime.wait_for_everyone() From 1dc89c44c67e7b7797bcce339f7c7047d371514a Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Wed, 29 Oct 2025 23:03:41 +0100 Subject: [PATCH 40/81] Implement mtn.search() for the compress algorithm Signed-off-by: Daniel Korzekwa --- .../nas/plugins/compress_nas_plugin.py | 48 +++++- .../_compress/nas/plugins/test_nas_search.py | 141 ++++++++++++++++++ 2 files changed, 187 insertions(+), 2 deletions(-) create mode 100644 tests/experimental/torch/_compress/nas/plugins/test_nas_search.py diff --git a/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py b/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py index 7b7acbed6..178a6b55d 100644 --- a/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py +++ b/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py @@ -20,8 +20,11 @@ import datetime from pathlib import Path +import build_library_and_stats +import mip_and_realize_models import pruning_ckpts import score_pruning_activations +import scoring import torch from scripts.convert_llama3_to_decilm import convert_llama3_to_decilm from torch import nn @@ -36,7 +39,7 @@ ModeDescriptor, RestoreEntrypoint, ) -from modelopt.torch.opt.searcher import BaseSearcher +from modelopt.torch.opt.searcher import BaseSearcher, SearchStateDict # TODO Move initialize_hydra_config_for_dir from tests to main from tests.utils.test_utils import initialize_hydra_config_for_dir @@ -96,6 +99,12 @@ def convert_compress_model(model: nn.Module, config: CompressConfig) -> ConvertR dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10) ) + # Required for mtn.search() to read NAS configuration + model.hydra_config_dir = config.hydra_config_dir + model.hydra_config_name = config.hydra_config_name + model.puzzle_dir = config.puzzle_dir + model.dataset_path = config.dataset_path + # Load hydra config hydra_cfg = initialize_hydra_config_for_dir( config_dir=config.hydra_config_dir, @@ -148,7 +157,7 @@ def config_class(self) -> type[ModeloptBaseConfig]: @property def search_algorithm(self) -> type[BaseSearcher]: """Return the associated searcher implementation.""" - raise NotImplementedError("Compress mode does not have a search algorithm yet.") + return CompressSearcher @property def convert(self) -> ConvertEntrypoint: @@ -167,3 +176,38 @@ def export_mode(self) -> str | None: for the compress algorithm. 
""" return "export_nas" + + +class CompressSearcher(BaseSearcher): + """Runs NAS search for the Compress mode.""" + + @property + def default_state_dict(self) -> SearchStateDict: + """Not needed for the compress mode as we are not saving any model state""" + return {} + + def run_search(self) -> None: + runtime = NativeDdpRuntime( + dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10) + ) + + # Load hydra config + hydra_cfg = initialize_hydra_config_for_dir( + config_dir=self.model.hydra_config_dir, + config_name=self.model.hydra_config_name, + overrides=[ + f"puzzle_dir={self.model.puzzle_dir}", + f"dataset_path={self.model.dataset_path}", + ], + ) + + # Build_library_and_stats (single process) + if runtime.global_rank == 0: + build_library_and_stats.launch_build_library_and_stats(hydra_cfg) + runtime.wait_for_everyone() + + # Calc_one_block_scores (distributed processing) + scoring.launch_scoring(hydra_cfg, runtime) + + # mip_and_realize_models (distributed processing) + mip_and_realize_models.launch_mip_and_realize_model(hydra_cfg, runtime) diff --git a/tests/experimental/torch/_compress/nas/plugins/test_nas_search.py b/tests/experimental/torch/_compress/nas/plugins/test_nas_search.py new file mode 100644 index 000000000..6b6cd5a24 --- /dev/null +++ b/tests/experimental/torch/_compress/nas/plugins/test_nas_search.py @@ -0,0 +1,141 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# See tests/experimental/torch/_compress/test_compress.py for instructions on how to run this test +# TODO: Remove those instructions once this test runs automatically on CI +# +import datetime +from functools import partial +from pathlib import Path + +import torch +from _test_utils.torch.distributed.utils import spawn_multiprocess_job +from experimental.torch._compress.test_utils import ( + create_and_save_small_llama_model, + create_tokenizer, + save_dummy_dataset, + setup_puzzle_dir, +) +from puzzle_tools.hydra_utils import register_hydra_resolvers + +import modelopt.torch.nas as mtn +from modelopt.torch._compress.nas.plugins.compress_nas_plugin import CompressModel +from modelopt.torch._compress.runtime import NativeDdpRuntime + + +def test_nas_search(project_root_path: Path, tmp_path: Path): + spawn_multiprocess_job( + size=torch.cuda.device_count(), + job=partial(_test_nas_search_multiprocess_job, project_root_path, tmp_path), + backend="nccl", + ) + + +def _test_nas_search_multiprocess_job( + project_root_path: Path, tmp_path: Path, rank: int, size: int +): + # Register Hydra custom resolvers (needed for config resolution) + register_hydra_resolvers() + + # + # The inputs for the nas.convert()/nas.search() steps. 
+ # + puzzle_dir = tmp_path + # TODO: change it to "ckpts/llama" once the conversion script is fixed (internal NVidia modelopt bug: issues/17) + llama_checkpoint_path = puzzle_dir / "ckpts/teacher" + dataset_path = puzzle_dir / "dummy_dataset" + hydra_config_dir = project_root_path / "tests/experimental/torch/_compress/resources/configs" + hydra_config_name = "Llama-3_1-8B" + + runtime = NativeDdpRuntime( + dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10) + ) + + with runtime as runtime: + if rank == 0: + # Setup puzzle_dir and dataset + setup_puzzle_dir(puzzle_dir) + save_dummy_dataset(dataset_path) + + # Create a small Llama model + tokenizer = create_tokenizer(project_root_path) + create_and_save_small_llama_model( + llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer + ) + runtime.wait_for_everyone() + + # + # Run the mnt.convert() step + # + input_model = CompressModel() + + # Converted model is the same as the input model, but with the search space set up: + # (HF model imported to DeciLM format, pruning scores pruned checkpoints and are saved) + converted_model = mtn.convert( + input_model, + mode=[ + ( + "compress", + { + "puzzle_dir": str(puzzle_dir), + "input_model_path": str(llama_checkpoint_path), + "hydra_config_dir": str(hydra_config_dir), + "hydra_config_name": hydra_config_name, + "dataset_path": str(dataset_path), + }, + ) + ], + ) + + # + # Run the mnt.search() step + # + mtn.search( + converted_model, + constraints={}, # this is not used as the search space is defined in the hydra config + dummy_input=None, # Not used + config={}, # this is not used as the search space is defined in the hydra config + ) + + # + # Check assertions for mnt.search() step + # + if rank == 0: + # assertions for the build_library_and_stats step + assert (puzzle_dir / "replacement_library.json").is_file() + assert (puzzle_dir / "subblock_stats.json").is_file() + + # assertions for the scoring step + solution_0_filepath = ( + puzzle_dir / "single_sequence_replacement_solutions--validation/solution_0.json" + ) + + assert solution_0_filepath.exists() + + # assertions for the mip_and_realize_models step + solution_0_ckpt_config_path = ( + puzzle_dir + / "mip/puzzle_solutions/target_memory_780000MiB/solutions--checkpoints/solution_0/config.json" + ) + + assert solution_0_ckpt_config_path.exists() + assert ( + puzzle_dir / "mip/puzzle_solutions/target_memory_780000MiB/solutions.json" + ).exists() + + runtime.wait_for_everyone() + + print("PYTEST SUMMARY: test_nas_search() test has finished successfully") From 6bfa3ece535853325c1ec343095312d6f8f5fe8e Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Wed, 29 Oct 2025 23:17:29 +0100 Subject: [PATCH 41/81] Refactoring Signed-off-by: Daniel Korzekwa --- .../_compress/nas/plugins/test_nas_convert.py | 95 ++++++++++--------- 1 file changed, 52 insertions(+), 43 deletions(-) diff --git a/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py b/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py index 4a416c833..ce285e49d 100644 --- a/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py +++ b/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py @@ -48,54 +48,12 @@ def test_nas_convert(project_root_path: Path, tmp_path: Path): def _test_nas_convert_multiprocess_job( project_root_path: Path, tmp_path: Path, rank: int, size: int ): - # Register Hydra custom resolvers (needed for config resolution) - register_hydra_resolvers() - - # - # The inputs for the 
nas.convert() step. - # - puzzle_dir = tmp_path - llama_checkpoint_path = puzzle_dir / "ckpts/llama" - dataset_path = puzzle_dir / "dummy_dataset" - hydra_config_dir = project_root_path / "tests/experimental/torch/_compress/resources/configs" - hydra_config_name = "Llama-3_1-8B" - runtime = NativeDdpRuntime( dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10) ) with runtime as runtime: - if rank == 0: - # Setup puzzle_dir and dataset - setup_puzzle_dir(puzzle_dir) - save_dummy_dataset(dataset_path) - - # Create a small Llama model - tokenizer = create_tokenizer(project_root_path) - create_and_save_small_llama_model( - llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer - ) - runtime.wait_for_everyone() - - # - # Run the mnt.convert() step - # - input_model = CompressModel() - mtn.convert( - input_model, - mode=[ - ( - "compress", - { - "puzzle_dir": str(puzzle_dir), - "input_model_path": str(llama_checkpoint_path), - "hydra_config_dir": str(hydra_config_dir), - "hydra_config_name": hydra_config_name, - "dataset_path": str(dataset_path), - }, - ) - ], - ) + converted_model, puzzle_dir = run_nas_convert(project_root_path, tmp_path, rank, runtime) # # Check assertions @@ -114,3 +72,54 @@ def _test_nas_convert_multiprocess_job( runtime.wait_for_everyone() print("PYTEST SUMMARY: test_nas_convert() test has finished successfully") + + +def run_nas_convert( + project_root_path: Path, + tmp_path: Path, + rank: int, + runtime, +): + # Register Hydra custom resolvers (needed for config resolution) + register_hydra_resolvers() + + # The inputs for the nas.convert() step. + # + puzzle_dir = tmp_path + llama_checkpoint_path = puzzle_dir / "ckpts/llama" + dataset_path = puzzle_dir / "dummy_dataset" + hydra_config_dir = project_root_path / "tests/experimental/torch/_compress/resources/configs" + hydra_config_name = "Llama-3_1-8B" + + if rank == 0: + # Setup puzzle_dir and dataset + setup_puzzle_dir(puzzle_dir) + save_dummy_dataset(dataset_path) + + # Create a small Llama model + tokenizer = create_tokenizer(project_root_path) + create_and_save_small_llama_model( + llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer + ) + runtime.wait_for_everyone() + + # Run the mnt.convert() step + # + input_model = CompressModel() + converted_model = mtn.convert( + input_model, + mode=[ + ( + "compress", + { + "puzzle_dir": str(puzzle_dir), + "input_model_path": str(llama_checkpoint_path), + "hydra_config_dir": str(hydra_config_dir), + "hydra_config_name": hydra_config_name, + "dataset_path": str(dataset_path), + }, + ) + ], + ) + + return converted_model, puzzle_dir From 6d45e3342241e2f727910c6e3d0ad16ea660859d Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Wed, 29 Oct 2025 23:22:49 +0100 Subject: [PATCH 42/81] code refactoring Signed-off-by: Daniel Korzekwa --- .../_compress/nas/plugins/test_nas_convert.py | 3 +- .../_compress/nas/plugins/test_nas_search.py | 57 +------------------ 2 files changed, 4 insertions(+), 56 deletions(-) diff --git a/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py b/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py index ce285e49d..1bd588582 100644 --- a/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py +++ b/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py @@ -86,7 +86,8 @@ def run_nas_convert( # The inputs for the nas.convert() step. 
# puzzle_dir = tmp_path - llama_checkpoint_path = puzzle_dir / "ckpts/llama" + # TODO: change it to "ckpts/llama" once the conversion script is fixed (internal NVidia modelopt bug: issues/17) + llama_checkpoint_path = puzzle_dir / "ckpts/teacher" dataset_path = puzzle_dir / "dummy_dataset" hydra_config_dir = project_root_path / "tests/experimental/torch/_compress/resources/configs" hydra_config_name = "Llama-3_1-8B" diff --git a/tests/experimental/torch/_compress/nas/plugins/test_nas_search.py b/tests/experimental/torch/_compress/nas/plugins/test_nas_search.py index 6b6cd5a24..c21f3fa1b 100644 --- a/tests/experimental/torch/_compress/nas/plugins/test_nas_search.py +++ b/tests/experimental/torch/_compress/nas/plugins/test_nas_search.py @@ -23,16 +23,9 @@ import torch from _test_utils.torch.distributed.utils import spawn_multiprocess_job -from experimental.torch._compress.test_utils import ( - create_and_save_small_llama_model, - create_tokenizer, - save_dummy_dataset, - setup_puzzle_dir, -) -from puzzle_tools.hydra_utils import register_hydra_resolvers +from experimental.torch._compress.nas.plugins.test_nas_convert import run_nas_convert import modelopt.torch.nas as mtn -from modelopt.torch._compress.nas.plugins.compress_nas_plugin import CompressModel from modelopt.torch._compress.runtime import NativeDdpRuntime @@ -47,58 +40,12 @@ def test_nas_search(project_root_path: Path, tmp_path: Path): def _test_nas_search_multiprocess_job( project_root_path: Path, tmp_path: Path, rank: int, size: int ): - # Register Hydra custom resolvers (needed for config resolution) - register_hydra_resolvers() - - # - # The inputs for the nas.convert()/nas.search() steps. - # - puzzle_dir = tmp_path - # TODO: change it to "ckpts/llama" once the conversion script is fixed (internal NVidia modelopt bug: issues/17) - llama_checkpoint_path = puzzle_dir / "ckpts/teacher" - dataset_path = puzzle_dir / "dummy_dataset" - hydra_config_dir = project_root_path / "tests/experimental/torch/_compress/resources/configs" - hydra_config_name = "Llama-3_1-8B" - runtime = NativeDdpRuntime( dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10) ) with runtime as runtime: - if rank == 0: - # Setup puzzle_dir and dataset - setup_puzzle_dir(puzzle_dir) - save_dummy_dataset(dataset_path) - - # Create a small Llama model - tokenizer = create_tokenizer(project_root_path) - create_and_save_small_llama_model( - llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer - ) - runtime.wait_for_everyone() - - # - # Run the mnt.convert() step - # - input_model = CompressModel() - - # Converted model is the same as the input model, but with the search space set up: - # (HF model imported to DeciLM format, pruning scores pruned checkpoints and are saved) - converted_model = mtn.convert( - input_model, - mode=[ - ( - "compress", - { - "puzzle_dir": str(puzzle_dir), - "input_model_path": str(llama_checkpoint_path), - "hydra_config_dir": str(hydra_config_dir), - "hydra_config_name": hydra_config_name, - "dataset_path": str(dataset_path), - }, - ) - ], - ) + converted_model, puzzle_dir = run_nas_convert(project_root_path, tmp_path, rank, runtime) # # Run the mnt.search() step From f9e09d928dd7e7367173dd77251367dcd83990e8 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Thu, 30 Oct 2025 12:58:47 +0100 Subject: [PATCH 43/81] Correct import paths Signed-off-by: Daniel Korzekwa --- modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py | 4 +++- tests/experimental/torch/_compress/test_compress.py | 4 +++- 2 
files changed, 6 insertions(+), 2 deletions(-) diff --git a/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py b/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py index 178a6b55d..026f1478b 100644 --- a/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py +++ b/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py @@ -26,9 +26,11 @@ import score_pruning_activations import scoring import torch -from scripts.convert_llama3_to_decilm import convert_llama3_to_decilm from torch import nn +from modelopt.torch._compress.decilm.converters.convert_llama3_to_decilm import ( + convert_llama3_to_decilm, +) from modelopt.torch._compress.runtime import NativeDdpRuntime from modelopt.torch.nas.conversion import NASModeRegistry from modelopt.torch.opt.config import ModeloptBaseConfig, ModeloptField diff --git a/tests/experimental/torch/_compress/test_compress.py b/tests/experimental/torch/_compress/test_compress.py index a95cd0be5..bc8f153dd 100644 --- a/tests/experimental/torch/_compress/test_compress.py +++ b/tests/experimental/torch/_compress/test_compress.py @@ -27,9 +27,11 @@ setup_puzzle_dir, ) from puzzle_tools.hydra_utils import register_hydra_resolvers -from scripts.convert_llama3_to_decilm import convert_llama3_to_decilm from modelopt.torch._compress import compress +from modelopt.torch._compress.decilm.converters.convert_llama3_to_decilm import ( + convert_llama3_to_decilm, +) from modelopt.torch._compress.runtime import NativeDdpRuntime # The e2e test to compress a model based on Local Neural Architecture Search (Mixed Integer Programing NAS search) From a0cfd13564ed2ed6f738f5ff4cd828228d85509a Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Thu, 30 Oct 2025 13:32:25 +0100 Subject: [PATCH 44/81] Change llama_checkpoint_path, can't be inside of ckpts folder Signed-off-by: Daniel Korzekwa --- .../torch/_compress/nas/plugins/test_nas_convert.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py b/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py index 1bd588582..3c8e0ebe6 100644 --- a/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py +++ b/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py @@ -86,8 +86,7 @@ def run_nas_convert( # The inputs for the nas.convert() step. 
# puzzle_dir = tmp_path - # TODO: change it to "ckpts/llama" once the conversion script is fixed (internal NVidia modelopt bug: issues/17) - llama_checkpoint_path = puzzle_dir / "ckpts/teacher" + llama_checkpoint_path = puzzle_dir / "input_model/llama" dataset_path = puzzle_dir / "dummy_dataset" hydra_config_dir = project_root_path / "tests/experimental/torch/_compress/resources/configs" hydra_config_name = "Llama-3_1-8B" From 2c2995c99c04a3a09f5c2292da68e46bbc9a8ce0 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Fri, 31 Oct 2025 11:44:35 +0100 Subject: [PATCH 45/81] Initial commit for compress tutorial Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 0 examples/compress/main.py | 15 +++++++++++++++ 2 files changed, 15 insertions(+) create mode 100644 examples/compress/README.md create mode 100644 examples/compress/main.py diff --git a/examples/compress/README.md b/examples/compress/README.md new file mode 100644 index 000000000..e69de29bb diff --git a/examples/compress/main.py b/examples/compress/main.py new file mode 100644 index 000000000..47f1c65a1 --- /dev/null +++ b/examples/compress/main.py @@ -0,0 +1,15 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + From b152689dc2908527d481adeab8016a939b495179 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Fri, 31 Oct 2025 16:12:29 +0100 Subject: [PATCH 46/81] Update compress tutorial and implement main.py for compress tutorial. Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 36 ++++++++++++++++++ examples/compress/main.py | 75 +++++++++++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+) diff --git a/examples/compress/README.md b/examples/compress/README.md index e69de29bb..f123e2ac6 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -0,0 +1,36 @@ +# Compress Algorithm Tutorial + +This tutorial demonstrates how to compress large language models using the compress algorithm based on the [Puzzle paper](https://arxiv.org/abs/2411.19146). + +In this example, we compress Llama 3.2 1B by searching for the optimal `ffn_intermediate_size` across MLP layers, resulting in a heterogeneous architecture while reducing GPU memory usage by 20%. + +## Compress the Model + +```bash +# TODO +torchrun examples/compress/main.py +``` + +## Evaluate Model Accuracy + +```bash +# TODO +``` + +## Re-run MIP Search with Different Memory Constraints + +```bash +# TODO +``` + +## Deploy to TensorRT-LLM + +```bash +# TODO +``` + +## Export to NeMo for Knowledge Distillation + +```bash +# TODO +``` diff --git a/examples/compress/main.py b/examples/compress/main.py index 47f1c65a1..e0fa74db0 100644 --- a/examples/compress/main.py +++ b/examples/compress/main.py @@ -13,3 +13,78 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+""" +Main script for running the compress algorithm on large language models (based on Puzzle paper https://arxiv.org/abs/2411.19146). + +This script provides two modes: +1. Default mode: Runs the full compression pipeline +2. MIP-only mode: Runs only the MIP search and realize models phase + +Usage: + # Full compression pipeline + torchrun main.py --config ./configs/llama_3.2_1B_pruneffn_memory.yaml + + # Only MIP search and realize models phase + torchrun main.py --config ./configs/llama_3.2_1B_pruneffn_memory.yaml --mip-only +""" + +import argparse +from pathlib import Path + + +def parse_args(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser( + description="Compress large language models using the Compress algorithm (based on Puzzle paper https://arxiv.org/abs/2411.19146)" + ) + parser.add_argument( + "--config", + type=str, + required=True, + help="Path to the main config YAML file (e.g., ./configs/llama_3.2_1B_pruneffn_memory.yaml)", + ) + parser.add_argument( + "--mip-only", + action="store_true", + help="Run only the MIP search and realize models phase (skip pruning and NAS scoring)", + ) + + return parser.parse_args() + + +def run_full_compress(hydra_config_path: str): + """Run the full compression pipeline. + + Args: + config_path: Path to the YAML configuration file + """ + hydra_config_path = Path(hydra_config_path).resolve() + # config_dir = str(hydra_config_path.parent) + # config_name = hydra_config_path.stem + + +def run_mip_only(hydra_config_path: str): + """Run only the MIP search and realize models phase. + + This assumes that pruning, replacement library building, NAS scoring, and subblock stats calculation + have already been completed. + + Args: + config_path: Path to the YAML configuration file + """ + hydra_config_path = Path(hydra_config_path).resolve() + # config_dir = str(hydra_config_path.parent) + # config_name = hydra_config_path.stem + + +def main(): + args = parse_args() + + if args.mip_only: + run_mip_only(hydra_config_path=args.config) + else: + run_full_compress(hydra_config_path=args.config) + + +if __name__ == "__main__": + main() From 24e30e6214fb709a3852fbcbc67fd5689fe4814e Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Fri, 31 Oct 2025 16:15:06 +0100 Subject: [PATCH 47/81] Update compress tutorial Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/examples/compress/README.md b/examples/compress/README.md index f123e2ac6..b42db37fa 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -7,8 +7,8 @@ In this example, we compress Llama 3.2 1B by searching for the optimal `ffn_inte ## Compress the Model ```bash -# TODO -torchrun examples/compress/main.py +torchrun --nproc_per_node=8 examples/compress/main.py \ + --config ./examples/compress/configs/llama_3.2_1B_pruneffn_memory.yaml ``` ## Evaluate Model Accuracy @@ -19,10 +19,16 @@ torchrun examples/compress/main.py ## Re-run MIP Search with Different Memory Constraints +If you want to try different memory constraints without re-running the expensive pruning and scoring steps, use the `--mip-only` flag: + ```bash -# TODO +torchrun --nproc_per_node=8 examples/compress/main.py \ + --config ./examples/compress/configs/llama_3.2_1B_pruneffn_memory.yaml \ + --mip-only ``` +This assumes pruning, replacement library building, NAS scoring, and subblock stats calculation have already been completed. 
+ ## Deploy to TensorRT-LLM ```bash From 21f115e52173489b5f972a7f30f7eec2a292ecf3 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Fri, 31 Oct 2025 18:50:46 +0100 Subject: [PATCH 48/81] Create a yaml file for llama 3.2-1B model compression Signed-off-by: Daniel Korzekwa --- .../llama_3.2_1B_pruneffn_memory.yaml | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 examples/compress/configs/llama_3.2_1B_pruneffn_memory/llama_3.2_1B_pruneffn_memory.yaml diff --git a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/llama_3.2_1B_pruneffn_memory.yaml b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/llama_3.2_1B_pruneffn_memory.yaml new file mode 100644 index 000000000..bec15d2f7 --- /dev/null +++ b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/llama_3.2_1B_pruneffn_memory.yaml @@ -0,0 +1,21 @@ +defaults: + - /Llama-3_2-1B + - _self_ + +# Input Hugging Face model to compress +input_hf_model_path: ??? # e.g., "path/to/meta-llama/Llama-3.2-1B" + +# Dataset path for pruning and NAS scoring +dataset_path: ??? # e.g., "path/to/dataset" + +# Working directory for compression outputs +puzzle_dir: ??? # e.g., "path/to/puzzle_dir" + +# MIP memory constraint (in MiB) +mip: + human_constraints: + target_memory: 2_000 # 2 GiB + +# FFN intermediate sizes to search over (heterogeneous architecture) +pruning: + intermediate_size_list: [768, 1024, 1536, 2048, 2560, 4192] # Llama 3.2 1B baseline: 8192 From d19b9ab0823465ba2c48bd755c4c4736470a2dec Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Fri, 31 Oct 2025 18:51:19 +0100 Subject: [PATCH 49/81] fix input model path in the unit test. Signed-off-by: Daniel Korzekwa --- tests/experimental/torch/_compress/test_compress.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/experimental/torch/_compress/test_compress.py b/tests/experimental/torch/_compress/test_compress.py index bc8f153dd..02010f71b 100644 --- a/tests/experimental/torch/_compress/test_compress.py +++ b/tests/experimental/torch/_compress/test_compress.py @@ -98,7 +98,7 @@ def _test_compress_multiprocess_job(project_root_path: Path, tmp_path: Path, ran tokenizer = create_tokenizer(project_root_path) # TODO: change it to "ckpts/llama" once the conversion script is fixed # Currently, the build replacement library step will fail with such a path. 
- llama_checkpoint_path = puzzle_dir / "ckpts/teacher" + llama_checkpoint_path = puzzle_dir / "input_model/llama" create_and_save_small_llama_model( llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer ) From 78d7a870ebb3f0c00f2bc1059562415ff2e4da75 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Fri, 31 Oct 2025 19:32:28 +0100 Subject: [PATCH 50/81] compress tutorial Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 12 +- .../Llama-3_2-8B.yaml | 108 ++++++++++++++++++ .../llama_3.2_1B_pruneffn_memory.yaml | 2 +- .../pruning/attn_pruning.yaml | 16 +++ .../pruning/ffn_pruning.yaml | 12 ++ .../pruning/hidden_dim_pruning.yaml | 15 +++ .../pruning/pruning_defaults.yaml | 32 ++++++ .../validate_model_defaults.yaml | 15 +++ .../validate_solutions_defaults.yaml | 10 ++ examples/compress/main.py | 19 ++- 10 files changed, 234 insertions(+), 7 deletions(-) create mode 100644 examples/compress/configs/llama_3.2_1B_pruneffn_memory/Llama-3_2-8B.yaml create mode 100644 examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/attn_pruning.yaml create mode 100644 examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/ffn_pruning.yaml create mode 100644 examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/hidden_dim_pruning.yaml create mode 100644 examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/pruning_defaults.yaml create mode 100644 examples/compress/configs/llama_3.2_1B_pruneffn_memory/validate_model_defaults.yaml create mode 100644 examples/compress/configs/llama_3.2_1B_pruneffn_memory/validate_solutions_defaults.yaml diff --git a/examples/compress/README.md b/examples/compress/README.md index b42db37fa..0304217c7 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -7,8 +7,8 @@ In this example, we compress Llama 3.2 1B by searching for the optimal `ffn_inte ## Compress the Model ```bash -torchrun --nproc_per_node=8 examples/compress/main.py \ - --config ./examples/compress/configs/llama_3.2_1B_pruneffn_memory.yaml +torchrun examples/compress/main.py \ + --config path/to/llama_3.2_1B_pruneffn_memory.yaml ``` ## Evaluate Model Accuracy @@ -22,8 +22,8 @@ torchrun --nproc_per_node=8 examples/compress/main.py \ If you want to try different memory constraints without re-running the expensive pruning and scoring steps, use the `--mip-only` flag: ```bash -torchrun --nproc_per_node=8 examples/compress/main.py \ - --config ./examples/compress/configs/llama_3.2_1B_pruneffn_memory.yaml \ +torchrun examples/compress/main.py \ + --config path/to/llama_3.2_1B_pruneffn_memory.yaml \ --mip-only ``` @@ -40,3 +40,7 @@ This assumes pruning, replacement library building, NAS scoring, and subblock st ```bash # TODO ``` + +## Advanced usage + +Modify `path/to/Llama-3_2-1B yaml` file for advanced compression scenarios. diff --git a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/Llama-3_2-8B.yaml b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/Llama-3_2-8B.yaml new file mode 100644 index 000000000..1d8fac655 --- /dev/null +++ b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/Llama-3_2-8B.yaml @@ -0,0 +1,108 @@ +defaults: + - pruning: ffn_pruning + - scoring: ../validate_solutions_defaults + - realize_model: ../validate_solutions_defaults + - bypass: + - override hydra/hydra_logging: disabled + - _self_ + +puzzle_dir: ??? +teacher_dir: ${puzzle_dir}/ckpts/teacher/ +replacement_library_path: ${puzzle_dir}/replacement_library.json +dataset_path: ??? 
# path to v0.4_mini + +skip_realize_model: false + +build_replacement_library: + add_ffn_no_ops: true + add_attention_no_ops: true + +calc_subblock_stats: + batch_sizes: [64, 96, 128] + prefill_seq_len: 4096 + generation_seq_len: 4096 + num_active_tokens_override: # Optional override for sequence lengths + prefill_queue_size: 0 + allocate_prefill_query: false + benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking + merge_with_existing_stats: false + subblock_stats_filename: "subblock_stats.json" + moe_stats_filename: "moe_stats.json" + +scoring: + solutions_to_validate: + skip_existing_solutions: true + + replacement_library_path: ${replacement_library_path} + solutions_path: ${to_path:${puzzle_dir}/single_sequence_replacement_solutions.json} + teacher_dir: ${to_path:${teacher_dir}} + output_dir: ${puzzle_dir}/single_sequence_replacement_solutions--validation + + eval_samples: 2 + micro_batch_size: 1 + dataset_path: ${dataset_path}/valid + seed: 42 + shuffle_seed: 444 + +mip: + single_block_replacement_validation_dir: ${to_path:${scoring.output_dir}} + subblock_stats_path: ${to_path:${puzzle_dir}/${calc_subblock_stats.subblock_stats_filename}} + output_path: ${to_path:${puzzle_dir}/mip/puzzle_solutions} + gathered_metrics_path: + puzzle_profile: + + # puzzle_profile: + objective: metrics.cosine_embedding_loss_hidden_states + bigger_is_better: false + num_solutions: 1 + minimal_diversity: 2 + + subblock_stats_args: + - batch_size: 96 + weights_dtype: torch.bfloat16 + activations_dtype: torch.bfloat16 + kv_cache_dtype: torch.bfloat16 + + report_additional_costs: + - stats.memory_mib + - stats.num_params + - stats.num_kv_heads + - stats.has_attention + - stats.has_ffn + - stats.kv_cache_memory_mib + - stats.attention_memory_mib + - stats.ffn_memory_mib + - stats.ffn_num_params + - stats.attention_num_params + + human_constraints: + target_memory: 780_000 # 78_000 + + mip_constraints: + use_greedy_search: false + is_multi_layer_puzzle: true + metric_overrides: + constrain_search_func: + max_seconds_per_solution: 60 + +realize_model: + teacher_dir: ${to_path:${teacher_dir}} + tokenizer_name: ${to_path:${teacher_dir}} + replacement_library_path: ${replacement_library_path} + save_models: true + solutions_path: # Filled dynamically + + # Validate params + skip_validation: false # To enable validation of the model solution set `skip_validation` as False + eval_samples: 2 + micro_batch_size: 1 + dataset_path: ${dataset_path}/valid + seed: 42 + shuffle_seed: 444 + +nccl_timeout_minutes: ${timedelta_minutes:10} + +# This section redirects Hydra outputs +hydra: + run: + dir: ${puzzle_dir}/hydra_logs/${now:%Y-%m-%d}/${now:%H-%M-%S} diff --git a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/llama_3.2_1B_pruneffn_memory.yaml b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/llama_3.2_1B_pruneffn_memory.yaml index bec15d2f7..f7962f0aa 100644 --- a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/llama_3.2_1B_pruneffn_memory.yaml +++ b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/llama_3.2_1B_pruneffn_memory.yaml @@ -1,5 +1,5 @@ defaults: - - /Llama-3_2-1B + - ./Llama-3_2-1B - _self_ # Input Hugging Face model to compress diff --git a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/attn_pruning.yaml b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/attn_pruning.yaml new file mode 100644 index 000000000..01886607e --- /dev/null +++ b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/attn_pruning.yaml @@ 
-0,0 +1,16 @@ +defaults: + - pruning_defaults + +activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/attn_${pruning.activation_hooks_kwargs.method}/${pruning.experiment_id} + +activation_hooks_kwargs: + method: independent_kv_head_contribution + optimize_for: memory # IndependentKvHeadContributionHook implementation that consumes less memory + target_layer: "self_attn.o_proj" + layer_input_descriptors_path: + +# n_heads_in_group: 4 +# num_attention_heads: 32 # num query heads +# num_kv_heads: 32 / 4 = 8 # num_query_heads // n_heads_in_group +n_heads_in_group_list: [8, 16, 32] # num_kv_heads = [4, 2, 1] +gqa_init_mode: "PruneKVHeads" diff --git a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/ffn_pruning.yaml b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/ffn_pruning.yaml new file mode 100644 index 000000000..f0c852eec --- /dev/null +++ b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/ffn_pruning.yaml @@ -0,0 +1,12 @@ +defaults: + - pruning_defaults + +activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/ffn_${pruning.activation_hooks_kwargs.method}/${pruning.experiment_id} + +activation_hooks_kwargs: + method: iterative + target_layer: "mlp.down_proj" + layer_input_descriptors_path: + +intermediate_size_list: [256] # teacher_intermediate_size is 14336 +mlp_init_mode: "PruneByActivationsLog" diff --git a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/hidden_dim_pruning.yaml b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/hidden_dim_pruning.yaml new file mode 100644 index 000000000..407c835d8 --- /dev/null +++ b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/hidden_dim_pruning.yaml @@ -0,0 +1,15 @@ +defaults: + - pruning_defaults + +activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/hidden_dim_${pruning.activation_hooks_kwargs.method}/${pruning.experiment_id} + +activation_hooks_kwargs: + method: layer_norm_contribution + target_layer: "layernorm" + +# Hidden dimension pruning specific settings +hidden_size_list: [3072, 2048] # Target hidden sizes to prune to +hidden_size_init_mode: "PruneByChannelRanking" +mlp_init_mode: "Truncate" # TODO, make it work with CopyAsIs/FromTeacher +gqa_init_mode: "AverageKV" # TODO, make it work with CopyAsIs/FromTeacher +linear_init_mode: "FromTeacher" diff --git a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/pruning_defaults.yaml b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/pruning_defaults.yaml new file mode 100644 index 000000000..0a5eafcff --- /dev/null +++ b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/pruning_defaults.yaml @@ -0,0 +1,32 @@ +defaults: + - /validate_model_defaults + +model_name_or_path: ${teacher_dir} +experiment_id: ${pruning.eval_samples}samples_diverse_mini +activations_log_dir: ??? +activation_hooks_kwargs: ??? 
+ +# Data: +eval_samples: 100 +micro_batch_size: 4 +dataset_path: ${dataset_path} +val_dataset_name: train + +# Prune ckpts +pruned_ckpts_outpt_dir: ${puzzle_dir}/pruning/${pruning.experiment_id} + +## FFN pruning +ffn_list: +mlp_init_mode: "Truncate" + +## KV-heads pruning +n_heads_in_group_list: +gqa_init_mode: "AverageKV" + +## Hidden dimension pruning +hidden_size_list: +hidden_size_init_mode: "PruneByChannelRanking" +linear_init_mode: "FromTeacher" + +mlp_init_config_yaml: + activations_log_dir: ${pruning.activations_log_dir} diff --git a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/validate_model_defaults.yaml b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/validate_model_defaults.yaml new file mode 100644 index 000000000..046ff51f6 --- /dev/null +++ b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/validate_model_defaults.yaml @@ -0,0 +1,15 @@ +block_size: 8192 +bos_rate: 0.5 +data_column: conversation +val_dataset_name: train +shuffle_seed: 81436 +seed: 42 +fim_rate: 0 +fim_spm_rate: 0 +source_datasets_to_discard: +varlen: false +write_results: false +calc_losses_on_cpu: false +activations_log_dir: +model_name_or_path: +load_dataset_fn: ${get_object:utils.data.dataloaders.load_from_disk_fn} diff --git a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/validate_solutions_defaults.yaml b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/validate_solutions_defaults.yaml new file mode 100644 index 000000000..ec1390237 --- /dev/null +++ b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/validate_solutions_defaults.yaml @@ -0,0 +1,10 @@ +defaults: + - /validate_model_defaults + - _self_ + +solutions_to_validate: +skip_validation: false +save_models: false +bigger_is_better: false +sort_solutions_by: +calculate_full_score_ablations: false diff --git a/examples/compress/main.py b/examples/compress/main.py index e0fa74db0..e7e443f0f 100644 --- a/examples/compress/main.py +++ b/examples/compress/main.py @@ -31,6 +31,10 @@ import argparse from pathlib import Path +from puzzle_tools.hydra_utils import register_hydra_resolvers + +from tests.utils.test_utils import initialize_hydra_config_for_dir + def parse_args(): """Parse command line arguments.""" @@ -58,9 +62,20 @@ def run_full_compress(hydra_config_path: str): Args: config_path: Path to the YAML configuration file """ + + # Register Hydra custom resolvers (needed for config resolution) + register_hydra_resolvers() + hydra_config_path = Path(hydra_config_path).resolve() - # config_dir = str(hydra_config_path.parent) - # config_name = hydra_config_path.stem + hydra_config_dir = str(hydra_config_path.parent) + hydra_config_name = hydra_config_path.stem + + # Load hydra config + initialize_hydra_config_for_dir( + config_dir=hydra_config_dir, + config_name=hydra_config_name, + overrides=[], + ) def run_mip_only(hydra_config_path: str): From f71c1b68ea8ca3d268a3c1a21c1e7a107cc480c5 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Fri, 31 Oct 2025 19:43:25 +0100 Subject: [PATCH 51/81] Code refactoring Signed-off-by: Daniel Korzekwa --- .../torch/_compress/nas/plugins/test_nas_convert.py | 6 ++---- .../torch/_compress/nas/plugins/test_nas_search.py | 6 ++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py b/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py index db20686e9..fd0bb2cf1 100644 --- a/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py +++ 
b/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py @@ -48,11 +48,9 @@ def test_nas_convert(project_root_path: Path, tmp_path: Path): def _test_nas_convert_multiprocess_job( project_root_path: Path, tmp_path: Path, rank: int, size: int ): - runtime = NativeDdpRuntime( + with NativeDdpRuntime( dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10) - ) - - with runtime as runtime: + ) as runtime: converted_model, puzzle_dir = run_nas_convert(project_root_path, tmp_path, rank, runtime) # diff --git a/tests/experimental/torch/_compress/nas/plugins/test_nas_search.py b/tests/experimental/torch/_compress/nas/plugins/test_nas_search.py index c21f3fa1b..e6309002a 100644 --- a/tests/experimental/torch/_compress/nas/plugins/test_nas_search.py +++ b/tests/experimental/torch/_compress/nas/plugins/test_nas_search.py @@ -40,11 +40,9 @@ def test_nas_search(project_root_path: Path, tmp_path: Path): def _test_nas_search_multiprocess_job( project_root_path: Path, tmp_path: Path, rank: int, size: int ): - runtime = NativeDdpRuntime( + with NativeDdpRuntime( dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10) - ) - - with runtime as runtime: + ) as runtime: converted_model, puzzle_dir = run_nas_convert(project_root_path, tmp_path, rank, runtime) # From 7eb2fd7d7b3fe1d36e8011f6b3ce986b153f369f Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Fri, 31 Oct 2025 19:47:58 +0100 Subject: [PATCH 52/81] refactoring Signed-off-by: Daniel Korzekwa --- .../nas/plugins/compress_nas_plugin.py | 4 +- .../torch/_compress/test_utils.py | 119 ------------------ 2 files changed, 1 insertion(+), 122 deletions(-) delete mode 100644 tests/experimental/torch/_compress/test_utils.py diff --git a/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py b/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py index 5217c74d7..3b881c2e2 100644 --- a/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py +++ b/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py @@ -31,6 +31,7 @@ from modelopt.torch._compress.decilm.converters.convert_llama3_to_decilm import ( convert_llama3_to_decilm, ) +from modelopt.torch._compress.hydra import initialize_hydra_config_for_dir from modelopt.torch._compress.runtime import NativeDdpRuntime from modelopt.torch.nas.conversion import NASModeRegistry from modelopt.torch.opt.config import ModeloptBaseConfig, ModeloptField @@ -43,9 +44,6 @@ ) from modelopt.torch.opt.searcher import BaseSearcher, SearchStateDict -# TODO Move initialize_hydra_config_for_dir from tests to main -from tests.utils.test_utils import initialize_hydra_config_for_dir - class CompressModel(nn.Module): pass # No model implementation is needed for the compress mode diff --git a/tests/experimental/torch/_compress/test_utils.py b/tests/experimental/torch/_compress/test_utils.py deleted file mode 100644 index 21ca622da..000000000 --- a/tests/experimental/torch/_compress/test_utils.py +++ /dev/null @@ -1,119 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import shutil -from pathlib import Path - -import torch -from datasets import Dataset, DatasetDict -from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM, PreTrainedTokenizerBase - - -def create_and_save_small_llama_model( - output_path: str, vocab_size: int, tokenizer: PreTrainedTokenizerBase -): - """ - Create and save a small Llama model for testing the conversion pipeline. - This mimics having a real Llama checkpoint that needs to be converted. - """ - os.makedirs(output_path, exist_ok=True) - - # Create a minimal Llama config (small for testing) - # Note: intermediate_size must be divisible by 256 per DeciLM config requirements - # Note: hidden_size must give head_dim >= 8 for Flash Attention 2 compatibility - llama_config = LlamaConfig( - vocab_size=vocab_size, - hidden_size=256, # 32 heads times 8 head_dim = 256 (matches bypass config expectations) - intermediate_size=512, # Must be divisible by 256 - num_hidden_layers=2, - num_attention_heads=32, # Matches original test - num_key_value_heads=8, # GQA: 32÷4=8 (matches original n_heads_in_group=4) - max_position_embeddings=512, - rms_norm_eps=1e-5, - rope_theta=10000.0, - attention_bias=False, - hidden_act="silu", - tie_word_embeddings=False, - ) - - # Create and save the Llama model - model = LlamaForCausalLM(llama_config) - model.to(dtype=torch.bfloat16).save_pretrained(output_path) - - # Save tokenizer - tokenizer.save_pretrained(output_path) - - # Save config - llama_config.save_pretrained(output_path) - - -def create_tokenizer(project_root_path: Path) -> PreTrainedTokenizerBase: - """ - Create a tokenizer for the Llama model. - """ - tokenizer_path = project_root_path / "tests/experimental/torch/_compress/resources/tokenizer" - tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) - return tokenizer - - -def setup_puzzle_dir(puzzle_dir: str): - """ - Setup puzzle directory by removing existing directory and creating a new one. - """ - if Path(puzzle_dir).exists(): - shutil.rmtree(puzzle_dir) - Path(puzzle_dir).mkdir(parents=True, exist_ok=True) - - -def save_dummy_dataset(dataset_path: str): - """ - Save a dummy dataset for testing purposes. - """ - # dummy sample - sample = [ - {"role": "user", "content": "please cite Lorem Ipsum?"}, - { - "role": "assistant", - "content": ( - "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed in blandit ante. " - "Sed tempus erat urna, ac elementum nisl facilisis quis. Aliquam consectetur mollis massa, " - "in elementum sem venenatis posuere. Fusce lorem arcu, egestas vel massa sollicitudin, " - "dictum mollis purus. Proin in ullamcorper elit. Nam tellus nisi, volutpat a mattis vel, " - "pretium in purus. Nunc at lectus facilisis risus scelerisque rhoncus eu nec ex. " - "Maecenas semper, tellus non placerat vulputate, urna felis facilisis diam, " - "sit amet vestibulum erat sapien nec libero. Praesent non massa velit. Donec faucibus mi eros. " - "Nam turpis nulla, congue sit amet mi at, porttitor scelerisque elit. Nunc id sodales lorem, " - "nec tincidunt leo. Quisque a neque nec ligula porttitor auctor. 
" - "Nunc accumsan nunc ac tellus congue vehicula. Praesent tellus eros, luctus non gravida dapibus, " - "faucibus eu ex. Quisque bibendum leo pharetra, tristique est vitae, hendrerit nunc. " - "Duis nec congue dolor. Donec commodo ipsum non efficitur volutpat. " - "Nulla risus nulla, efficitur et urna at, imperdiet sodales lorem. " - "Suspendisse erat est, sollicitudin at nisl tincidunt, vehicula hendrerit lectus. " - "Nam quis nisi ullamcorper, rhoncus massa vel, tempus purus. " - "Duis pulvinar eros vel nulla pellentesque, at dapibus justo laoreet. " - "Praesent tortor orci, vulputate fermentum dapibus nec, feugiat vitae tortor. " - "Donec mollis convallis massa quis iaculis." - ), - }, - ] - - # Prepare train and val splits with sample repeated, 2500 samples are for - # 128 samples with block-size 8192 and LLama3 tokenizer - data = [{"conversation": sample}] * 2500 - - # For train-val splits - data_dict = DatasetDict({"train": Dataset.from_list(data), "valid": Dataset.from_list(data)}) - data_dict.save_to_disk(dataset_path) From 3eb39f99dd75deb8a93a77b8479542734217f999 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Fri, 31 Oct 2025 19:51:57 +0100 Subject: [PATCH 53/81] code clean up Signed-off-by: Daniel Korzekwa --- .../experimental/torch/_compress/nas/plugins/test_nas_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/experimental/torch/_compress/nas/plugins/test_nas_search.py b/tests/experimental/torch/_compress/nas/plugins/test_nas_search.py index e6309002a..21526f5ec 100644 --- a/tests/experimental/torch/_compress/nas/plugins/test_nas_search.py +++ b/tests/experimental/torch/_compress/nas/plugins/test_nas_search.py @@ -56,7 +56,7 @@ def _test_nas_search_multiprocess_job( ) # - # Check assertions for mnt.search() step + # Check assertions for mtn.search() step # if rank == 0: # assertions for the build_library_and_stats step From 8360de94f76f753941e27fea07373d8d8f3d6557 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Fri, 31 Oct 2025 22:33:31 +0100 Subject: [PATCH 54/81] Implement compress cli tool. Signed-off-by: Daniel Korzekwa --- .../{Llama-3_2-8B.yaml => Llama-3_2-1B.yaml} | 0 .../llama_3.2_1B_pruneffn_memory.yaml | 10 ++-- examples/compress/main.py | 54 ++++++++++++++----- 3 files changed, 45 insertions(+), 19 deletions(-) rename examples/compress/configs/llama_3.2_1B_pruneffn_memory/{Llama-3_2-8B.yaml => Llama-3_2-1B.yaml} (100%) diff --git a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/Llama-3_2-8B.yaml b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/Llama-3_2-1B.yaml similarity index 100% rename from examples/compress/configs/llama_3.2_1B_pruneffn_memory/Llama-3_2-8B.yaml rename to examples/compress/configs/llama_3.2_1B_pruneffn_memory/Llama-3_2-1B.yaml diff --git a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/llama_3.2_1B_pruneffn_memory.yaml b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/llama_3.2_1B_pruneffn_memory.yaml index f7962f0aa..f3fa86953 100644 --- a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/llama_3.2_1B_pruneffn_memory.yaml +++ b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/llama_3.2_1B_pruneffn_memory.yaml @@ -1,15 +1,15 @@ defaults: - - ./Llama-3_2-1B + - Llama-3_2-1B - _self_ # Input Hugging Face model to compress -input_hf_model_path: ??? # e.g., "path/to/meta-llama/Llama-3.2-1B" +input_hf_model_path: /workspace/hf_models/meta-llama/Llama-3.2-1B # Dataset path for pruning and NAS scoring -dataset_path: ??? 
# e.g., "path/to/dataset" +dataset_path: /workspace/datasets/Nemotron-Post-Training-Dataset-v2 # Working directory for compression outputs -puzzle_dir: ??? # e.g., "path/to/puzzle_dir" +puzzle_dir: /workspace/puzzle_dir # MIP memory constraint (in MiB) mip: @@ -18,4 +18,4 @@ mip: # FFN intermediate sizes to search over (heterogeneous architecture) pruning: - intermediate_size_list: [768, 1024, 1536, 2048, 2560, 4192] # Llama 3.2 1B baseline: 8192 + intermediate_size_list: [256] # Llama 3.2 1B baseline: 8192 diff --git a/examples/compress/main.py b/examples/compress/main.py index e7e443f0f..9fc525b03 100644 --- a/examples/compress/main.py +++ b/examples/compress/main.py @@ -29,10 +29,15 @@ """ import argparse +import datetime from pathlib import Path +import torch from puzzle_tools.hydra_utils import register_hydra_resolvers +import modelopt.torch.nas as mtn +from modelopt.torch._compress.nas.plugins.compress_nas_plugin import CompressModel +from modelopt.torch._compress.runtime import NativeDdpRuntime from tests.utils.test_utils import initialize_hydra_config_for_dir @@ -63,19 +68,39 @@ def run_full_compress(hydra_config_path: str): config_path: Path to the YAML configuration file """ - # Register Hydra custom resolvers (needed for config resolution) - register_hydra_resolvers() - - hydra_config_path = Path(hydra_config_path).resolve() - hydra_config_dir = str(hydra_config_path.parent) - hydra_config_name = hydra_config_path.stem - - # Load hydra config - initialize_hydra_config_for_dir( - config_dir=hydra_config_dir, - config_name=hydra_config_name, - overrides=[], - ) + with NativeDdpRuntime(dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10)): + # Register Hydra custom resolvers (needed for config resolution) + register_hydra_resolvers() + + hydra_config_path = Path(hydra_config_path).resolve() + hydra_config_dir = str(hydra_config_path.parent) + hydra_config_name = hydra_config_path.stem + + # Load hydra config + hydra_cfg = initialize_hydra_config_for_dir( + config_dir=hydra_config_dir, + config_name=hydra_config_name, + overrides=[], + ) + + input_model = CompressModel() + mtn.convert( + input_model, + mode=[ + ( + "compress", + { + "puzzle_dir": str(hydra_cfg.puzzle_dir), + "input_model_path": hydra_cfg.input_hf_model_path, + "hydra_config_dir": hydra_config_dir, + "hydra_config_name": hydra_config_name, + "dataset_path": str(hydra_cfg.dataset_path), + }, + ) + ], + ) + + print(f"\nCompression completed. Output in: {hydra_cfg.puzzle_dir}") def run_mip_only(hydra_config_path: str): @@ -87,7 +112,8 @@ def run_mip_only(hydra_config_path: str): Args: config_path: Path to the YAML configuration file """ - hydra_config_path = Path(hydra_config_path).resolve() + raise NotImplementedError("MIP-only mode is not implemented yet") + # hydra_config_path = Path(hydra_config_path).resolve() # config_dir = str(hydra_config_path.parent) # config_name = hydra_config_path.stem From 9230d81cd77d096819ffcb0aceeff03989551791 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Fri, 31 Oct 2025 22:38:49 +0100 Subject: [PATCH 55/81] Add running mtn.search() to compress cli tool. 
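
This wires the second phase of the ModelOpt NAS API into the CLI: the tool now
calls mtn.convert() (HF-to-DeciLM conversion, activation scoring, pruned
checkpoints) and then mtn.search() (replacement library, block scoring, MIP
search, model realization). A minimal sketch of the resulting call sequence,
with placeholder paths rather than real defaults (in main.py these calls run
inside a NativeDdpRuntime context after the hydra config is loaded):

    # Sketch only - placeholder paths, not real defaults.
    import modelopt.torch.nas as mtn
    from modelopt.torch._compress.nas.plugins.compress_nas_plugin import CompressModel

    # Phase 1: convert (HF -> DeciLM, activation scoring, pruned checkpoints).
    converted_model = mtn.convert(
        CompressModel(),
        mode=[
            (
                "compress",
                {
                    "puzzle_dir": "path/to/puzzle_dir",           # placeholder
                    "input_model_path": "path/to/hf_model",       # placeholder
                    "hydra_config_dir": "path/to/hydra_configs",  # placeholder
                    "hydra_config_name": "my_compress_config",    # placeholder
                    "dataset_path": "path/to/dataset",            # placeholder
                },
            )
        ],
    )

    # Phase 2: search (replacement library, block scoring, MIP, model realization).
    # constraints/dummy_input/config are unused here - the search space and the
    # memory constraint come from the hydra config.
    mtn.search(converted_model, constraints={}, dummy_input=None, config={})

The search space and target memory stay in the hydra YAML, so the Python call
sites do not need to change between experiments.
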
Signed-off-by: Daniel Korzekwa --- examples/compress/main.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/examples/compress/main.py b/examples/compress/main.py index 9fc525b03..396f65a70 100644 --- a/examples/compress/main.py +++ b/examples/compress/main.py @@ -83,8 +83,10 @@ def run_full_compress(hydra_config_path: str): overrides=[], ) + # Convert model (convert from HF to DeciLM, score pruning activations, + # prune the model and save pruned checkpoints) input_model = CompressModel() - mtn.convert( + converted_model = mtn.convert( input_model, mode=[ ( @@ -100,6 +102,15 @@ def run_full_compress(hydra_config_path: str): ], ) + # Run NAS search (build replacement library and compute stats, + # compute one block scores, run MIP and realize models) + mtn.search( + converted_model, + constraints={}, # this is not used as the search space is defined in the hydra config + dummy_input=None, # Not used + config={}, # this is not used as the search space is defined in the hydra config + ) + print(f"\nCompression completed. Output in: {hydra_cfg.puzzle_dir}") From 28b5c13d5beb5b1516651b663cbf386b4f0d8194 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Fri, 31 Oct 2025 22:40:25 +0100 Subject: [PATCH 56/81] update docs Signed-off-by: Daniel Korzekwa --- examples/compress/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/compress/main.py b/examples/compress/main.py index 396f65a70..155490e13 100644 --- a/examples/compress/main.py +++ b/examples/compress/main.py @@ -121,7 +121,7 @@ def run_mip_only(hydra_config_path: str): have already been completed. Args: - config_path: Path to the YAML configuration file + hydra_config_path: Path to the YAML configuration file """ raise NotImplementedError("MIP-only mode is not implemented yet") # hydra_config_path = Path(hydra_config_path).resolve() From a7eba4bcc9df82c4c7ef39a106ec17aad44f6189 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sat, 1 Nov 2025 14:01:05 +0100 Subject: [PATCH 57/81] Replace dummy dataset with Nemotron-Post-Training-Dataset-v2 Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 17 +++++++++++++---- .../validate_model_defaults.yaml | 4 ++-- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/examples/compress/README.md b/examples/compress/README.md index 0304217c7..542423da7 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -6,10 +6,19 @@ In this example, we compress Llama 3.2 1B by searching for the optimal `ffn_inte ## Compress the Model -```bash -torchrun examples/compress/main.py \ - --config path/to/llama_3.2_1B_pruneffn_memory.yaml -``` +1. Specify the `puzzle_dir`, `input_hf_model_path`, `dataset_path`, `intermediate_size_list`, and `target_memory` arguments in the [llama_3.2_1B_pruneffn_memory.yaml](./configs/llama_3.2_1B_pruneffn_memory/llama_3.2_1B_pruneffn_memory.yaml) configuration file. + +2. Download and prepare the dataset: + + ```bash + python -m scripts.prepare_dataset --dataset_name nvidia/Nemotron-Post-Training-Dataset-v2 --output_dir path/to/Nemotron-Post-Training-Dataset-v2 + ``` + +3. 
Run the compression script: + + ```bash + torchrun examples/compress/main.py --config path/to/llama_3.2_1B_pruneffn_memory.yaml + ``` ## Evaluate Model Accuracy diff --git a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/validate_model_defaults.yaml b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/validate_model_defaults.yaml index 046ff51f6..572331a84 100644 --- a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/validate_model_defaults.yaml +++ b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/validate_model_defaults.yaml @@ -1,7 +1,7 @@ block_size: 8192 bos_rate: 0.5 -data_column: conversation -val_dataset_name: train +data_column: messages +val_dataset_name: valid shuffle_seed: 81436 seed: 42 fim_rate: 0 From 21ed59bfb3ed9d3e7e551d8bdd2d09f5cf8934f9 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sat, 1 Nov 2025 14:13:01 +0100 Subject: [PATCH 58/81] Refactoring Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 2 +- modelopt/torch/_compress/__init__.py | 15 +++++ modelopt/torch/_compress/dataset/__init__.py | 15 +++++ .../_compress/dataset/prepare_dataset.py | 64 +++++++++++++++++++ 4 files changed, 95 insertions(+), 1 deletion(-) create mode 100644 modelopt/torch/_compress/__init__.py create mode 100644 modelopt/torch/_compress/dataset/__init__.py create mode 100644 modelopt/torch/_compress/dataset/prepare_dataset.py diff --git a/examples/compress/README.md b/examples/compress/README.md index 542423da7..23b862f67 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -11,7 +11,7 @@ In this example, we compress Llama 3.2 1B by searching for the optimal `ffn_inte 2. Download and prepare the dataset: ```bash - python -m scripts.prepare_dataset --dataset_name nvidia/Nemotron-Post-Training-Dataset-v2 --output_dir path/to/Nemotron-Post-Training-Dataset-v2 + python -m modelopt.torch._compress.dataset.prepare_dataset --dataset_name nvidia/Nemotron-Post-Training-Dataset-v2 --output_dir path/to/Nemotron-Post-Training-Dataset-v2 ``` 3. Run the compression script: diff --git a/modelopt/torch/_compress/__init__.py b/modelopt/torch/_compress/__init__.py new file mode 100644 index 000000000..47f1c65a1 --- /dev/null +++ b/modelopt/torch/_compress/__init__.py @@ -0,0 +1,15 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/modelopt/torch/_compress/dataset/__init__.py b/modelopt/torch/_compress/dataset/__init__.py new file mode 100644 index 000000000..47f1c65a1 --- /dev/null +++ b/modelopt/torch/_compress/dataset/__init__.py @@ -0,0 +1,15 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/modelopt/torch/_compress/dataset/prepare_dataset.py b/modelopt/torch/_compress/dataset/prepare_dataset.py new file mode 100644 index 000000000..49d63d122 --- /dev/null +++ b/modelopt/torch/_compress/dataset/prepare_dataset.py @@ -0,0 +1,64 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import datasets +import fire +import numpy as np +from logger import mprint + + +def process_and_save_dataset( + dataset_name: str, + output_dir: str, + split: tuple = ("code", "math", "stem", "chat"), + overwrite: bool = False, +): + # Check if output_dir contains an existing dataset + dataset_dict_path = os.path.join(output_dir, "dataset_dict.json") + if os.path.exists(output_dir) and os.path.exists(dataset_dict_path): + if not overwrite: + mprint( + f"Output directory '{output_dir}' already contains a dataset. " + "Use '--overwrite True' to overwrite existing data." + ) + return + + ds = datasets.load_dataset(dataset_name, split=split) + ds = datasets.concatenate_datasets(ds) + # Filter out samples with reasoning = on + ds = ds.filter(lambda x: x["reasoning"] == "off") + # Hardcoded for dynamically create a deterministic train-val split + seed = 408 + generator = np.random.RandomState(seed=seed) + ds_split = ds.train_test_split(test_size=0.05, shuffle=True, generator=generator) + # Rename dataset names to follow previous conventions + ds_dict = datasets.DatasetDict( + { + "train": ds_split["train"], + "valid": ds_split["test"], + } + ) + # Save locally + os.makedirs(output_dir, exist_ok=True) + ds_dict.save_to_disk(output_dir) + + mprint(f"Dataset splits:\n{ds_dict}") + mprint(f"Saved processed datasets to {output_dir}") + + +if __name__ == "__main__": + fire.Fire(process_and_save_dataset) From e3ed0a44802379c859cde61737b95d3c95c2e4c4 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sat, 1 Nov 2025 14:14:58 +0100 Subject: [PATCH 59/81] Update docs Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/compress/README.md b/examples/compress/README.md index 23b862f67..13634db32 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -8,7 +8,7 @@ In this example, we compress Llama 3.2 1B by searching for the optimal `ffn_inte 1. 
Specify the `puzzle_dir`, `input_hf_model_path`, `dataset_path`, `intermediate_size_list`, and `target_memory` arguments in the [llama_3.2_1B_pruneffn_memory.yaml](./configs/llama_3.2_1B_pruneffn_memory/llama_3.2_1B_pruneffn_memory.yaml) configuration file. -2. Download and prepare the dataset: +2. Download and prepare the dataset (2.62GB): ```bash python -m modelopt.torch._compress.dataset.prepare_dataset --dataset_name nvidia/Nemotron-Post-Training-Dataset-v2 --output_dir path/to/Nemotron-Post-Training-Dataset-v2 From 9e09e8f1bc02763b125a1799a971f1b61076f89b Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sat, 1 Nov 2025 16:01:00 +0100 Subject: [PATCH 60/81] Refactoring. Change the compress tutorial from Llama 3.2 1B-instruct to Llma 3.1 8B-instruct Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 20 ++++++++++--------- .../Llama-3_1-8B.yaml} | 0 .../llama-3_1-8B_pruneffn_memory.yaml} | 4 ++-- .../pruning/attn_pruning.yaml | 0 .../pruning/ffn_pruning.yaml | 0 .../pruning/hidden_dim_pruning.yaml | 0 .../pruning/pruning_defaults.yaml | 0 .../validate_model_defaults.yaml | 0 .../validate_solutions_defaults.yaml | 0 9 files changed, 13 insertions(+), 11 deletions(-) rename examples/compress/configs/{llama_3.2_1B_pruneffn_memory/Llama-3_2-1B.yaml => llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml} (100%) rename examples/compress/configs/{llama_3.2_1B_pruneffn_memory/llama_3.2_1B_pruneffn_memory.yaml => llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml} (84%) rename examples/compress/configs/{llama_3.2_1B_pruneffn_memory => llama-3_1-8B_pruneffn_memory}/pruning/attn_pruning.yaml (100%) rename examples/compress/configs/{llama_3.2_1B_pruneffn_memory => llama-3_1-8B_pruneffn_memory}/pruning/ffn_pruning.yaml (100%) rename examples/compress/configs/{llama_3.2_1B_pruneffn_memory => llama-3_1-8B_pruneffn_memory}/pruning/hidden_dim_pruning.yaml (100%) rename examples/compress/configs/{llama_3.2_1B_pruneffn_memory => llama-3_1-8B_pruneffn_memory}/pruning/pruning_defaults.yaml (100%) rename examples/compress/configs/{llama_3.2_1B_pruneffn_memory => llama-3_1-8B_pruneffn_memory}/validate_model_defaults.yaml (100%) rename examples/compress/configs/{llama_3.2_1B_pruneffn_memory => llama-3_1-8B_pruneffn_memory}/validate_solutions_defaults.yaml (100%) diff --git a/examples/compress/README.md b/examples/compress/README.md index 13634db32..dcee6f796 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -2,37 +2,39 @@ This tutorial demonstrates how to compress large language models using the compress algorithm based on the [Puzzle paper](https://arxiv.org/abs/2411.19146). -In this example, we compress Llama 3.2 1B by searching for the optimal `ffn_intermediate_size` across MLP layers, resulting in a heterogeneous architecture while reducing GPU memory usage by 20%. +In this example, we compress [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model by searching for the optimal `ffn_intermediate_size` across MLP layers, resulting in a heterogeneous architecture while reducing GPU memory usage by 20%. ## Compress the Model -1. Specify the `puzzle_dir`, `input_hf_model_path`, `dataset_path`, `intermediate_size_list`, and `target_memory` arguments in the [llama_3.2_1B_pruneffn_memory.yaml](./configs/llama_3.2_1B_pruneffn_memory/llama_3.2_1B_pruneffn_memory.yaml) configuration file. +1. 
Specify the `puzzle_dir`, `input_hf_model_path`, `dataset_path`, `intermediate_size_list`, and `target_memory` arguments in the [llama-3_1-8B_pruneffn_memory.yaml](./configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml) configuration file. -2. Download and prepare the dataset (2.62GB): +2. Download and prepare the [Nemotron-Post-Training-Dataset-v2](https://huggingface.co/datasets/nvidia/Nemotron-Post-Training-Dataset-v2). + + dataset split: "code", "math", "stem", "chat", excluding reasoning samples (2.62GB) ```bash python -m modelopt.torch._compress.dataset.prepare_dataset --dataset_name nvidia/Nemotron-Post-Training-Dataset-v2 --output_dir path/to/Nemotron-Post-Training-Dataset-v2 ``` -3. Run the compression script: +3. Run the compression script. ```bash - torchrun examples/compress/main.py --config path/to/llama_3.2_1B_pruneffn_memory.yaml + torchrun examples/compress/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml ``` -## Evaluate Model Accuracy +## Evaluate model accuracy ```bash # TODO ``` -## Re-run MIP Search with Different Memory Constraints +## Re-run MIP Search with different memory constraints If you want to try different memory constraints without re-running the expensive pruning and scoring steps, use the `--mip-only` flag: ```bash torchrun examples/compress/main.py \ - --config path/to/llama_3.2_1B_pruneffn_memory.yaml \ + --config path/to/llama-3_1_8B_pruneffn_memory.yaml \ --mip-only ``` @@ -52,4 +54,4 @@ This assumes pruning, replacement library building, NAS scoring, and subblock st ## Advanced usage -Modify `path/to/Llama-3_2-1B yaml` file for advanced compression scenarios. +Modify `path/to/Llama-3_1-8B yaml` file for advanced compression scenarios. diff --git a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/Llama-3_2-1B.yaml b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml similarity index 100% rename from examples/compress/configs/llama_3.2_1B_pruneffn_memory/Llama-3_2-1B.yaml rename to examples/compress/configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml diff --git a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/llama_3.2_1B_pruneffn_memory.yaml b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml similarity index 84% rename from examples/compress/configs/llama_3.2_1B_pruneffn_memory/llama_3.2_1B_pruneffn_memory.yaml rename to examples/compress/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml index f3fa86953..74af0cad6 100644 --- a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/llama_3.2_1B_pruneffn_memory.yaml +++ b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml @@ -1,9 +1,9 @@ defaults: - - Llama-3_2-1B + - Llama-3_1-8B - _self_ # Input Hugging Face model to compress -input_hf_model_path: /workspace/hf_models/meta-llama/Llama-3.2-1B +input_hf_model_path: /workspace/hf_models/meta-llama/Llama-3.1-8B-Instruct # Dataset path for pruning and NAS scoring dataset_path: /workspace/datasets/Nemotron-Post-Training-Dataset-v2 diff --git a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/attn_pruning.yaml b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/pruning/attn_pruning.yaml similarity index 100% rename from examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/attn_pruning.yaml rename to examples/compress/configs/llama-3_1-8B_pruneffn_memory/pruning/attn_pruning.yaml diff --git 
a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/ffn_pruning.yaml b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/pruning/ffn_pruning.yaml similarity index 100% rename from examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/ffn_pruning.yaml rename to examples/compress/configs/llama-3_1-8B_pruneffn_memory/pruning/ffn_pruning.yaml diff --git a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/hidden_dim_pruning.yaml b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/pruning/hidden_dim_pruning.yaml similarity index 100% rename from examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/hidden_dim_pruning.yaml rename to examples/compress/configs/llama-3_1-8B_pruneffn_memory/pruning/hidden_dim_pruning.yaml diff --git a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/pruning_defaults.yaml b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/pruning/pruning_defaults.yaml similarity index 100% rename from examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/pruning_defaults.yaml rename to examples/compress/configs/llama-3_1-8B_pruneffn_memory/pruning/pruning_defaults.yaml diff --git a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/validate_model_defaults.yaml b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/validate_model_defaults.yaml similarity index 100% rename from examples/compress/configs/llama_3.2_1B_pruneffn_memory/validate_model_defaults.yaml rename to examples/compress/configs/llama-3_1-8B_pruneffn_memory/validate_model_defaults.yaml diff --git a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/validate_solutions_defaults.yaml b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/validate_solutions_defaults.yaml similarity index 100% rename from examples/compress/configs/llama_3.2_1B_pruneffn_memory/validate_solutions_defaults.yaml rename to examples/compress/configs/llama-3_1-8B_pruneffn_memory/validate_solutions_defaults.yaml From abb39f3f325939ea2757c7ddd779fa783626ee00 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sat, 1 Nov 2025 20:41:36 +0100 Subject: [PATCH 61/81] Improve logging Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 15 ++++++- examples/compress/main.py | 4 +- modelopt/torch/_compress/dateutils.py | 41 +++++++++++++++++++ .../nas/plugins/compress_nas_plugin.py | 11 +++++ 4 files changed, 69 insertions(+), 2 deletions(-) create mode 100644 modelopt/torch/_compress/dateutils.py diff --git a/examples/compress/README.md b/examples/compress/README.md index dcee6f796..7a4535441 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -19,7 +19,20 @@ In this example, we compress [meta-llama/Llama-3.1-8B-Instruct](https://huggingf 3. Run the compression script. 
```bash - torchrun examples/compress/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml + torchrun examples/compress/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml 2>&1 | tee ./log.txt |grep "Compress Progress" + ``` + + screen output: + + ```bash + [2025-11-01 19:26:38] Compress Progress 1/8: starting compression pipeline + [2025-11-01 19:26:38] Compress Progress 2/8: converting model from HF to DeciLM + [2025-11-01 19:26:39] Compress Progress 3/8: scoring pruning activations + [2025-11-01 19:26:46] Compress Progress 4/8: pruning the model and saving pruned checkpoints + [2025-11-01 19:26:46] Compress Progress 5/8: building replacement library and calculating subblock statistics + [2025-11-01 19:26:46] Compress Progress 6/8: calculating one block scores + [2025-11-01 19:26:52] Compress Progress 7/8: running MIP and realizing models + [2025-11-01 19:26:59] Compress Progress 8/8: compression pipeline completed ``` ## Evaluate model accuracy diff --git a/examples/compress/main.py b/examples/compress/main.py index 155490e13..95cda0d9a 100644 --- a/examples/compress/main.py +++ b/examples/compress/main.py @@ -36,6 +36,7 @@ from puzzle_tools.hydra_utils import register_hydra_resolvers import modelopt.torch.nas as mtn +from modelopt.torch._compress.dateutils import timestamped from modelopt.torch._compress.nas.plugins.compress_nas_plugin import CompressModel from modelopt.torch._compress.runtime import NativeDdpRuntime from tests.utils.test_utils import initialize_hydra_config_for_dir @@ -68,6 +69,7 @@ def run_full_compress(hydra_config_path: str): config_path: Path to the YAML configuration file """ + print(timestamped("Compress Progress 1/8: starting compression pipeline")) with NativeDdpRuntime(dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10)): # Register Hydra custom resolvers (needed for config resolution) register_hydra_resolvers() @@ -111,7 +113,7 @@ def run_full_compress(hydra_config_path: str): config={}, # this is not used as the search space is defined in the hydra config ) - print(f"\nCompression completed. Output in: {hydra_cfg.puzzle_dir}") + print(timestamped("Compress Progress 8/8: compression pipeline completed")) def run_mip_only(hydra_config_path: str): diff --git a/modelopt/torch/_compress/dateutils.py b/modelopt/torch/_compress/dateutils.py new file mode 100644 index 000000000..76a8aec2a --- /dev/null +++ b/modelopt/torch/_compress/dateutils.py @@ -0,0 +1,41 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Date and time utility functions for the compress module. +""" + +import datetime + + +def get_timestamp() -> str: + """Get a formatted timestamp string for logging. + + Returns: + A formatted timestamp string in the format 'YYYY-MM-DD HH:MM:SS'. 
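+        For example: '2025-11-01 19:26:38'.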
+ """ + return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + +def timestamped(message: str) -> str: + """Add a timestamp prefix to a message. + + Args: + message: The message to prefix with a timestamp. + + Returns: + The message with a timestamp prefix in the format '[YYYY-MM-DD HH:MM:SS] message'. + """ + return f"[{get_timestamp()}] {message}" diff --git a/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py b/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py index 3b881c2e2..aa06c217b 100644 --- a/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py +++ b/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py @@ -28,6 +28,7 @@ import torch from torch import nn +from modelopt.torch._compress.dateutils import timestamped from modelopt.torch._compress.decilm.converters.convert_llama3_to_decilm import ( convert_llama3_to_decilm, ) @@ -116,6 +117,7 @@ def convert_compress_model(model: nn.Module, config: CompressConfig) -> ConvertR ) # Convert Llama3 model to DeciLM model + print(timestamped("Compress Progress 2/8: converting model from HF to DeciLM")) hf_ckpt_teacher_dir = "ckpts/teacher" # TODO: make it configurable convert_llama3_to_decilm( input_dir=config.input_model_path, @@ -123,9 +125,11 @@ def convert_compress_model(model: nn.Module, config: CompressConfig) -> ConvertR ) # Score_pruning_activations (distributed processing) + print(timestamped("Compress Progress 3/8: scoring pruning activations")) score_pruning_activations.launch_score_activations(hydra_cfg, runtime) # Prune the model and save pruned checkpoints + print(timestamped("Compress Progress 4/8: pruning the model and saving pruned checkpoints")) if runtime.global_rank == 0: pruning_ckpts.launch_prune_ckpt(hydra_cfg) runtime.wait_for_everyone() @@ -203,12 +207,19 @@ def run_search(self) -> None: ) # Build_library_and_stats (single process) + print( + timestamped( + "Compress Progress 5/8: building replacement library and calculating subblock statistics" + ) + ) if runtime.global_rank == 0: build_library_and_stats.launch_build_library_and_stats(hydra_cfg) runtime.wait_for_everyone() # Calc_one_block_scores (distributed processing) + print(timestamped("Compress Progress 6/8: calculating one block scores")) scoring.launch_scoring(hydra_cfg, runtime) # mip_and_realize_models (distributed processing) + print(timestamped("Compress Progress 7/8: running MIP and realizing models")) mip_and_realize_models.launch_mip_and_realize_model(hydra_cfg, runtime) From 64b33e24cd89f0c9cdc313e9c690d7d96b7ff9a8 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sat, 1 Nov 2025 21:27:32 +0100 Subject: [PATCH 62/81] Update docs Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/compress/README.md b/examples/compress/README.md index 7a4535441..37c12d723 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -46,9 +46,7 @@ In this example, we compress [meta-llama/Llama-3.1-8B-Instruct](https://huggingf If you want to try different memory constraints without re-running the expensive pruning and scoring steps, use the `--mip-only` flag: ```bash -torchrun examples/compress/main.py \ - --config path/to/llama-3_1_8B_pruneffn_memory.yaml \ - --mip-only +torchrun examples/compress/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml --mip-only 2>&1 | tee ./log.txt |grep "Compress Progress" ``` This assumes pruning, replacement library building, NAS scoring, and subblock stats calculation have 
already been completed. From 21a602ce44049b083e629e0f41aba95eea489ca0 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sat, 1 Nov 2025 21:53:42 +0100 Subject: [PATCH 63/81] Update compress tutorial Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 23 ++++++++++--------- .../llama-3_1-8B_pruneffn_memory.yaml | 2 +- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/examples/compress/README.md b/examples/compress/README.md index 37c12d723..67ad3985a 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -19,20 +19,21 @@ In this example, we compress [meta-llama/Llama-3.1-8B-Instruct](https://huggingf 3. Run the compression script. ```bash - torchrun examples/compress/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml 2>&1 | tee ./log.txt |grep "Compress Progress" + torchrun examples/compress/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml 2>&1 | tee ./log.txt | grep "Compress Progress" ``` - screen output: + This will save the full output to `log.txt` and display the following progress on screen: ```bash - [2025-11-01 19:26:38] Compress Progress 1/8: starting compression pipeline - [2025-11-01 19:26:38] Compress Progress 2/8: converting model from HF to DeciLM - [2025-11-01 19:26:39] Compress Progress 3/8: scoring pruning activations - [2025-11-01 19:26:46] Compress Progress 4/8: pruning the model and saving pruned checkpoints - [2025-11-01 19:26:46] Compress Progress 5/8: building replacement library and calculating subblock statistics - [2025-11-01 19:26:46] Compress Progress 6/8: calculating one block scores - [2025-11-01 19:26:52] Compress Progress 7/8: running MIP and realizing models - [2025-11-01 19:26:59] Compress Progress 8/8: compression pipeline completed + # Produced on a single NVIDIA H100 80GB HBM3 card + [2025-11-01 13:43:10] Compress Progress 1/8: starting compression pipeline + [2025-11-01 13:43:10] Compress Progress 2/8: converting model from HF to DeciLM + [2025-11-01 13:43:30] Compress Progress 3/8: scoring pruning activations + [2025-11-01 13:44:38] Compress Progress 4/8: pruning the model and saving pruned checkpoints + [2025-11-01 13:44:45] Compress Progress 5/8: building replacement library and calculating subblock statistics + [2025-11-01 13:44:46] Compress Progress 6/8: calculating one block scores + [2025-11-01 13:49:29] Compress Progress 7/8: running MIP and realizing models + [2025-11-01 13:52:59] Compress Progress 8/8: compression pipeline completed ``` ## Evaluate model accuracy @@ -46,7 +47,7 @@ In this example, we compress [meta-llama/Llama-3.1-8B-Instruct](https://huggingf If you want to try different memory constraints without re-running the expensive pruning and scoring steps, use the `--mip-only` flag: ```bash -torchrun examples/compress/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml --mip-only 2>&1 | tee ./log.txt |grep "Compress Progress" +torchrun examples/compress/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml --mip-only 2>&1 | tee ./log.txt | grep "Compress Progress" ``` This assumes pruning, replacement library building, NAS scoring, and subblock stats calculation have already been completed. 
diff --git a/examples/compress/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml index 74af0cad6..ab697fd93 100644 --- a/examples/compress/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml +++ b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml @@ -14,7 +14,7 @@ puzzle_dir: /workspace/puzzle_dir # MIP memory constraint (in MiB) mip: human_constraints: - target_memory: 2_000 # 2 GiB + target_memory: 78_000 # 78 GiB # FFN intermediate sizes to search over (heterogeneous architecture) pruning: From 9a381fe39dfe1a6b5fcbafe76be97c9b07dd35f8 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sat, 1 Nov 2025 22:00:31 +0100 Subject: [PATCH 64/81] Update compress tutorial ffn search space Signed-off-by: Daniel Korzekwa --- .../llama-3_1-8B_pruneffn_memory.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/compress/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml index ab697fd93..c9a0cabf3 100644 --- a/examples/compress/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml +++ b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml @@ -18,4 +18,4 @@ mip: # FFN intermediate sizes to search over (heterogeneous architecture) pruning: - intermediate_size_list: [256] # Llama 3.2 1B baseline: 8192 + intermediate_size_list: [3072, 5888, 8704, 11520] # teacher_intermediate_size is 14336 From c47e0af48009ba872a927f53ea9aa0dfdf74c7fb Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sat, 1 Nov 2025 22:15:52 +0100 Subject: [PATCH 65/81] Update tutorial Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/compress/README.md b/examples/compress/README.md index 67ad3985a..8f8db8f04 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -26,14 +26,14 @@ In this example, we compress [meta-llama/Llama-3.1-8B-Instruct](https://huggingf ```bash # Produced on a single NVIDIA H100 80GB HBM3 card - [2025-11-01 13:43:10] Compress Progress 1/8: starting compression pipeline - [2025-11-01 13:43:10] Compress Progress 2/8: converting model from HF to DeciLM - [2025-11-01 13:43:30] Compress Progress 3/8: scoring pruning activations - [2025-11-01 13:44:38] Compress Progress 4/8: pruning the model and saving pruned checkpoints - [2025-11-01 13:44:45] Compress Progress 5/8: building replacement library and calculating subblock statistics - [2025-11-01 13:44:46] Compress Progress 6/8: calculating one block scores - [2025-11-01 13:49:29] Compress Progress 7/8: running MIP and realizing models - [2025-11-01 13:52:59] Compress Progress 8/8: compression pipeline completed + [2025-11-01 14:01:10] Compress Progress 1/8: starting compression pipeline + [2025-11-01 14:01:10] Compress Progress 2/8: converting model from HF to DeciLM + [2025-11-01 14:01:29] Compress Progress 3/8: scoring pruning activations + [2025-11-01 14:02:30] Compress Progress 4/8: pruning the model and saving pruned checkpoints + [2025-11-01 14:03:18] Compress Progress 5/8: building replacement library and calculating subblock statistics + [2025-11-01 14:03:19] Compress Progress 6/8: calculating one block scores + [2025-11-01 14:13:35] Compress Progress 7/8: running MIP and 
realizing models + [2025-11-01 14:13:52] Compress Progress 8/8: compression pipeline completed ``` ## Evaluate model accuracy From ce8d53afd5173978923fd81ae927ff7095eb0c31 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sun, 2 Nov 2025 09:58:06 +0100 Subject: [PATCH 66/81] Implement mip_only mode. Signed-off-by: Daniel Korzekwa --- examples/compress/main.py | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/examples/compress/main.py b/examples/compress/main.py index 95cda0d9a..991af9b69 100644 --- a/examples/compress/main.py +++ b/examples/compress/main.py @@ -32,6 +32,7 @@ import datetime from pathlib import Path +import mip_and_realize_models import torch from puzzle_tools.hydra_utils import register_hydra_resolvers @@ -125,10 +126,30 @@ def run_mip_only(hydra_config_path: str): Args: hydra_config_path: Path to the YAML configuration file """ - raise NotImplementedError("MIP-only mode is not implemented yet") - # hydra_config_path = Path(hydra_config_path).resolve() - # config_dir = str(hydra_config_path.parent) - # config_name = hydra_config_path.stem + + with NativeDdpRuntime( + dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10) + ) as runtime: + # Register Hydra custom resolvers (needed for config resolution) + register_hydra_resolvers() + + hydra_config_path = Path(hydra_config_path).resolve() + hydra_config_dir = str(hydra_config_path.parent) + hydra_config_name = hydra_config_path.stem + + # Load hydra config + hydra_cfg = initialize_hydra_config_for_dir( + config_dir=hydra_config_dir, + config_name=hydra_config_name, + overrides=[], + ) + + # mip_and_realize_models (distributed processing) + # TODO: How to make it part of mnt.search() api, similarly to run_full_compress() API + print(timestamped("Compress Progress 7/8: running MIP and realizing models")) + mip_and_realize_models.launch_mip_and_realize_model(hydra_cfg, runtime) + + print(timestamped("Compress Progress 8/8: compression pipeline completed")) def main(): From c754419d0495e49987d824c48c2a300fc7c0d2d8 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sun, 2 Nov 2025 10:45:01 +0100 Subject: [PATCH 67/81] Improve logging. 
Convert HF to DeciLM checkpoint only once (single-gpu) Signed-off-by: Daniel Korzekwa --- examples/compress/main.py | 6 +-- .../nas/plugins/compress_nas_plugin.py | 37 +++++++++++-------- 2 files changed, 25 insertions(+), 18 deletions(-) diff --git a/examples/compress/main.py b/examples/compress/main.py index 991af9b69..93ea0b8ab 100644 --- a/examples/compress/main.py +++ b/examples/compress/main.py @@ -114,7 +114,7 @@ def run_full_compress(hydra_config_path: str): config={}, # this is not used as the search space is defined in the hydra config ) - print(timestamped("Compress Progress 8/8: compression pipeline completed")) + print(timestamped("Compress Progress 8/8: compression pipeline completed (multi-gpu)")) def run_mip_only(hydra_config_path: str): @@ -146,10 +146,10 @@ def run_mip_only(hydra_config_path: str): # mip_and_realize_models (distributed processing) # TODO: How to make it part of mnt.search() api, similarly to run_full_compress() API - print(timestamped("Compress Progress 7/8: running MIP and realizing models")) + print(timestamped("Compress Progress 7/8: running MIP and realizing models (multi-gpu)")) mip_and_realize_models.launch_mip_and_realize_model(hydra_cfg, runtime) - print(timestamped("Compress Progress 8/8: compression pipeline completed")) + print(timestamped("Compress Progress 8/8: compression pipeline completed (multi-gpu)")) def main(): diff --git a/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py b/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py index aa06c217b..bcaaa1114 100644 --- a/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py +++ b/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py @@ -117,20 +117,27 @@ def convert_compress_model(model: nn.Module, config: CompressConfig) -> ConvertR ) # Convert Llama3 model to DeciLM model - print(timestamped("Compress Progress 2/8: converting model from HF to DeciLM")) - hf_ckpt_teacher_dir = "ckpts/teacher" # TODO: make it configurable - convert_llama3_to_decilm( - input_dir=config.input_model_path, - output_dir=Path(config.puzzle_dir) / hf_ckpt_teacher_dir, - ) + if runtime.global_rank == 0: + print(timestamped("Compress Progress 2/8: converting model from HF to DeciLM (single-gpu)")) + hf_ckpt_teacher_dir = "ckpts/teacher" # TODO: make it configurable + convert_llama3_to_decilm( + input_dir=config.input_model_path, + output_dir=Path(config.puzzle_dir) / hf_ckpt_teacher_dir, + ) + runtime.wait_for_everyone() # Score_pruning_activations (distributed processing) - print(timestamped("Compress Progress 3/8: scoring pruning activations")) + print(timestamped("Compress Progress 3/8: scoring pruning activations (multi-gpu)")) score_pruning_activations.launch_score_activations(hydra_cfg, runtime) # Prune the model and save pruned checkpoints - print(timestamped("Compress Progress 4/8: pruning the model and saving pruned checkpoints")) + if runtime.global_rank == 0: + print( + timestamped( + "Compress Progress 4/8: pruning the model and saving pruned checkpoints (single-gpu)" + ) + ) pruning_ckpts.launch_prune_ckpt(hydra_cfg) runtime.wait_for_everyone() @@ -207,19 +214,19 @@ def run_search(self) -> None: ) # Build_library_and_stats (single process) - print( - timestamped( - "Compress Progress 5/8: building replacement library and calculating subblock statistics" - ) - ) if runtime.global_rank == 0: + print( + timestamped( + "Compress Progress 5/8: building replacement library and subblock statistics (single-gpu)" + ) + ) 
build_library_and_stats.launch_build_library_and_stats(hydra_cfg) runtime.wait_for_everyone() # Calc_one_block_scores (distributed processing) - print(timestamped("Compress Progress 6/8: calculating one block scores")) + print(timestamped("Compress Progress 6/8: calculating one block scores (multi-gpu)")) scoring.launch_scoring(hydra_cfg, runtime) # mip_and_realize_models (distributed processing) - print(timestamped("Compress Progress 7/8: running MIP and realizing models")) + print(timestamped("Compress Progress 7/8: running MIP and realizing models (multi-gpu)")) mip_and_realize_models.launch_mip_and_realize_model(hydra_cfg, runtime) From 6505631acf2e19c3e1a0a7e12be35477018ab93d Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sun, 2 Nov 2025 11:12:44 +0100 Subject: [PATCH 68/81] update docs Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 4 ++-- tests/experimental/torch/_compress/test_compress.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/compress/README.md b/examples/compress/README.md index 8f8db8f04..615703664 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -19,7 +19,7 @@ In this example, we compress [meta-llama/Llama-3.1-8B-Instruct](https://huggingf 3. Run the compression script. ```bash - torchrun examples/compress/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml 2>&1 | tee ./log.txt | grep "Compress Progress" + torchrun --nproc_per_node 1 examples/compress/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml 2>&1 | tee ./log.txt | grep "Compress Progress" ``` This will save the full output to `log.txt` and display the following progress on screen: @@ -47,7 +47,7 @@ In this example, we compress [meta-llama/Llama-3.1-8B-Instruct](https://huggingf If you want to try different memory constraints without re-running the expensive pruning and scoring steps, use the `--mip-only` flag: ```bash -torchrun examples/compress/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml --mip-only 2>&1 | tee ./log.txt | grep "Compress Progress" +torchrun --nproc_per_node 1 examples/compress/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml --mip-only 2>&1 | tee ./log.txt | grep "Compress Progress" ``` This assumes pruning, replacement library building, NAS scoring, and subblock stats calculation have already been completed. diff --git a/tests/experimental/torch/_compress/test_compress.py b/tests/experimental/torch/_compress/test_compress.py index 2ef786d14..0622bbbda 100644 --- a/tests/experimental/torch/_compress/test_compress.py +++ b/tests/experimental/torch/_compress/test_compress.py @@ -52,7 +52,7 @@ # pip install mip # pip install lru-dict # -# export PYTHONPATH=$PYTHONPATH:/workspace/puzzletron/v1 +# export PYTHONPATH=$PYTHONPATH:.:/workspace/puzzletron/v1 # # pytest -s -v ./tests/experimental/torch/_compress/test_compress.py::test_compress -o addopts="" From 734c32cd92c62b4128c008f86117989cd17e62e6 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sun, 2 Nov 2025 13:59:48 +0100 Subject: [PATCH 69/81] Update compress tutorial with --mip_only part. 
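
The --mip-only flow documented here reuses an existing puzzle_dir and repeats
only the MIP search and model realization, so different memory budgets can be
explored without re-running pruning and scoring. A rough sketch of the
underlying calls, mirroring run_mip_only() in examples/compress/main.py
(config paths are illustrative):

    # Sketch only - mirrors run_mip_only(); config paths are illustrative.
    import datetime
    import mip_and_realize_models
    import torch
    from puzzle_tools.hydra_utils import register_hydra_resolvers
    from modelopt.torch._compress.runtime import NativeDdpRuntime
    from tests.utils.test_utils import initialize_hydra_config_for_dir

    with NativeDdpRuntime(
        dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10)
    ) as runtime:
        register_hydra_resolvers()
        hydra_cfg = initialize_hydra_config_for_dir(
            config_dir="path/to/hydra_config_dir",  # illustrative
            config_name="llama-3_1-8B_pruneffn_memory",
            overrides=[],
        )
        # Re-run only MIP search + model realization on the existing puzzle_dir.
        mip_and_realize_models.launch_mip_and_realize_model(hydra_cfg, runtime)

Between runs, only mip.human_constraints.target_memory in the YAML needs to
change.
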
Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 87 ++++++++++++++++++- .../torch/_compress/test_compress.py | 3 - 2 files changed, 85 insertions(+), 5 deletions(-) diff --git a/examples/compress/README.md b/examples/compress/README.md index 615703664..0ea718175 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -36,6 +36,47 @@ In this example, we compress [meta-llama/Llama-3.1-8B-Instruct](https://huggingf [2025-11-01 14:13:52] Compress Progress 8/8: compression pipeline completed ``` + This will generate the following network architecture (see `log.txt`): + + ```bash + block_0: attention gqa_4 ffn intermediate_14336 + block_1: attention gqa_4 ffn intermediate_14336 + block_2: attention gqa_4 ffn intermediate_14336 + block_3: attention gqa_4 ffn intermediate_14336 + block_4: attention gqa_4 ffn intermediate_14336 + block_5: attention gqa_4 ffn intermediate_14336 + block_6: attention gqa_4 ffn intermediate_14336 + block_7: attention gqa_4 ffn intermediate_14336 + block_8: attention gqa_4 ffn intermediate_14336 + block_9: attention gqa_4 ffn intermediate_14336 + block_10: attention gqa_4 ffn intermediate_14336 + block_11: attention gqa_4 ffn intermediate_14336 + block_12: attention gqa_4 ffn intermediate_14336 + block_13: attention gqa_4 ffn intermediate_14336 + block_14: attention gqa_4 ffn intermediate_14336 + block_15: attention gqa_4 ffn intermediate_14336 + block_16: attention gqa_4 ffn intermediate_14336 + block_17: attention no_op ffn intermediate_14336 + block_18: attention no_op ffn intermediate_14336 + block_19: attention no_op ffn intermediate_14336 + block_20: attention no_op ffn intermediate_14336 + block_21: attention no_op ffn intermediate_14336 + block_22: attention no_op ffn intermediate_14336 + block_23: attention no_op ffn intermediate_14336 + block_24: attention no_op ffn intermediate_14336 + block_25: attention no_op ffn intermediate_14336 + block_26: attention no_op ffn intermediate_14336 + block_27: attention no_op ffn intermediate_14336 + block_28: attention no_op ffn intermediate_14336 + block_29: attention gqa_4 ffn intermediate_14336 + block_30: attention gqa_4 ffn intermediate_14336 + block_31: attention gqa_4 ffn intermediate_14336 + + [2025-11-02 04:53:11,332]^[[92m[rank-0]^[[0m[run_puzzle.py:295] Total costs: {'stats.memory_mib': 75796.4140625, 'stats.ffn_num_params': 5637275648, 'stats.num_kv_heads': 160, 'stats.kv_cache_memory_mib': 61440.0, 'stats.ffn_memory_mib': 10752.25, 'stats.attention_memory_mib': 63040.15625, 'stats.attention_num_params': 838942720, 'stats.num_params': 7526895616, 'stats.has_attention': 20, 'stats.has_ffn': 32} + [2025-11-02 04:53:11,341]^[[92m[rank-0]^[[0m[run_puzzle.py:300] /workspace/puzzle_dir/mip/puzzle_solutions/target_memory_78000MiB/solutions.json + [2025-11-02 04:53:11,341]^[[92m[rank-0]^[[0m[mip_and_realize_models.py:49] Realize model for the solution: /workspace/puzzle_dir/mip/puzzle_solutions/target_memory_78000MiB/solutions.json + ``` + ## Evaluate model accuracy ```bash @@ -44,13 +85,55 @@ In this example, we compress [meta-llama/Llama-3.1-8B-Instruct](https://huggingf ## Re-run MIP Search with different memory constraints -If you want to try different memory constraints without re-running the expensive pruning and scoring steps, use the `--mip-only` flag: +If you want to try different memory constraints without re-running the expensive pruning and scoring steps, use the `--mip-only` flag. 
+This assumes pruning, replacement library building, NAS scoring, and subblock stats calculation have already been completed. + +Set `target_memory: 28_000` in `llama-3_1-8B_pruneffn_memory.yaml`. ```bash torchrun --nproc_per_node 1 examples/compress/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml --mip-only 2>&1 | tee ./log.txt | grep "Compress Progress" ``` -This assumes pruning, replacement library building, NAS scoring, and subblock stats calculation have already been completed. +This will generate the following network architecture (see `log.txt`): + +```bash +block_0: attention gqa_4 ffn intermediate_14336 +block_1: attention gqa_4 ffn intermediate_14336 +block_2: attention gqa_4 ffn intermediate_14336 +block_3: attention gqa_4 ffn intermediate_14336 +block_4: attention gqa_4 ffn intermediate_14336 +block_5: attention no_op ffn intermediate_11520 +block_6: attention no_op ffn intermediate_14336 +block_7: attention no_op ffn intermediate_8704 +block_8: attention no_op ffn intermediate_14336 +block_9: attention no_op ffn intermediate_3072 +block_10: attention no_op ffn intermediate_11520 +block_11: attention no_op ffn intermediate_11520 +block_12: attention no_op ffn intermediate_11520 +block_13: attention no_op ffn intermediate_11520 +block_14: attention no_op ffn intermediate_3072 +block_15: attention no_op ffn intermediate_14336 +block_16: attention no_op ffn intermediate_14336 +block_17: attention no_op ffn intermediate_14336 +block_18: attention no_op ffn intermediate_14336 +block_19: attention no_op ffn intermediate_14336 +block_20: attention no_op ffn intermediate_14336 +block_21: attention no_op ffn intermediate_14336 +block_22: attention no_op ffn intermediate_14336 +block_23: attention no_op ffn intermediate_14336 +block_24: attention no_op ffn intermediate_14336 +block_25: attention no_op ffn intermediate_14336 +block_26: attention no_op ffn intermediate_14336 +block_27: attention no_op ffn intermediate_14336 +block_28: attention no_op ffn intermediate_14336 +block_29: attention no_op ffn intermediate_14336 +block_30: attention no_op ffn intermediate_14336 +block_31: attention no_op ffn intermediate_14336 + +[2025-11-02 04:47:51,874]^[[92m[rank-0]^[[0m[run_puzzle.py:295] Total costs: {'stats.memory_mib': 27526.296875, 'stats.num_kv_heads': 40, 'stats.kv_cache_memory_mib': 15360.0, 'stats.has_ffn': 32, 'stats.attention_num_params': 209735680, 'stats.ffn_num_params': 5118230528, 'stats.attention_memory_mib': 15760.0390625, 'stats.num_params': 6378643456, 'stats.has_attention': 5, 'stats.ffn_memory_mib': 9762.25} +[2025-11-02 04:47:51,882]^[[92m[rank-0]^[[0m[run_puzzle.py:300] /workspace/puzzle_dir/mip/puzzle_solutions/target_memory_28000MiB/solutions.json +[2025-11-02 04:47:51,882]^[[92m[rank-0]^[[0m[mip_and_realize_models.py:49] Realize model for the solution: /workspace/puzzle_dir/mip/puzzle_solutions/target_memory_28000MiB/solutions.json +``` ## Deploy to TensorRT-LLM diff --git a/tests/experimental/torch/_compress/test_compress.py b/tests/experimental/torch/_compress/test_compress.py index 0622bbbda..9d009c313 100644 --- a/tests/experimental/torch/_compress/test_compress.py +++ b/tests/experimental/torch/_compress/test_compress.py @@ -49,9 +49,6 @@ # --image gitlab-master.nvidia.com/deci/puzzletron:trtllm_main \ # --workdir $MODELOPT SRC DIRECTORY --interactive --gpu 1 # -# pip install mip -# pip install lru-dict -# # export PYTHONPATH=$PYTHONPATH:.:/workspace/puzzletron/v1 # # pytest -s -v ./tests/experimental/torch/_compress/test_compress.py::test_compress -o 
addopts="" From ee14792fc89b610a96418119ae145f3762b35221 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sun, 2 Nov 2025 19:01:27 +0100 Subject: [PATCH 70/81] Update docs Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/compress/README.md b/examples/compress/README.md index 0ea718175..0cdc10577 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -2,7 +2,7 @@ This tutorial demonstrates how to compress large language models using the compress algorithm based on the [Puzzle paper](https://arxiv.org/abs/2411.19146). -In this example, we compress [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model by searching for the optimal `ffn_intermediate_size` across MLP layers, resulting in a heterogeneous architecture while reducing GPU memory usage by 20%. +In this example, we compress [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model by searching for the optimal `ffn_intermediate_size` across MLP layers, resulting in a heterogeneous architecture while reducing GPU memory usage from 113GB to 76GB. ## Compress the Model From 5dca0aa5eaf399aac0e09368a9cca8e334e8ab0e Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sun, 2 Nov 2025 19:48:12 +0100 Subject: [PATCH 71/81] Update tutorial llama config file. Signed-off-by: Daniel Korzekwa --- .../llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml | 10 ++++++---- tests/experimental/torch/_compress/test_compress.py | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/examples/compress/configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml index 1d8fac655..6d9c90fa9 100644 --- a/examples/compress/configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml +++ b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml @@ -28,6 +28,8 @@ calc_subblock_stats: merge_with_existing_stats: false subblock_stats_filename: "subblock_stats.json" moe_stats_filename: "moe_stats.json" + runtime_stats: + backend: trt_torch scoring: solutions_to_validate: @@ -40,9 +42,9 @@ scoring: eval_samples: 2 micro_batch_size: 1 - dataset_path: ${dataset_path}/valid seed: 42 shuffle_seed: 444 + dataset_path: ${dataset_path} mip: single_block_replacement_validation_dir: ${to_path:${scoring.output_dir}} @@ -76,7 +78,7 @@ mip: - stats.attention_num_params human_constraints: - target_memory: 780_000 # 78_000 + target_memory: 78_000 mip_constraints: use_greedy_search: false @@ -94,11 +96,11 @@ realize_model: # Validate params skip_validation: false # To enable validation of the model solution set `skip_validation` as False - eval_samples: 2 + eval_samples: 128 micro_batch_size: 1 - dataset_path: ${dataset_path}/valid seed: 42 shuffle_seed: 444 + dataset_path: ${dataset_path} nccl_timeout_minutes: ${timedelta_minutes:10} diff --git a/tests/experimental/torch/_compress/test_compress.py b/tests/experimental/torch/_compress/test_compress.py index 9d009c313..1c673da51 100644 --- a/tests/experimental/torch/_compress/test_compress.py +++ b/tests/experimental/torch/_compress/test_compress.py @@ -46,7 +46,7 @@ # /workspace/puzzletron # # submit_job --partition interactive --time 0 \ -# --image gitlab-master.nvidia.com/deci/puzzletron:trtllm_main \ +# --image gitlab-master.nvidia.com/deci/puzzletron:modelopt_main \ # --workdir $MODELOPT SRC DIRECTORY --interactive --gpu 1 # # export 
PYTHONPATH=$PYTHONPATH:.:/workspace/puzzletron/v1 From 5454c59395dcfd67face2186df9729ca3b6c9bf8 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sun, 2 Nov 2025 20:45:30 +0100 Subject: [PATCH 72/81] Update compress tutorial Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 102 ++++++++++-------- .../llama-3_1-8B_pruneffn_memory.yaml | 2 +- 2 files changed, 60 insertions(+), 44 deletions(-) diff --git a/examples/compress/README.md b/examples/compress/README.md index 0cdc10577..db62207a8 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -2,12 +2,14 @@ This tutorial demonstrates how to compress large language models using the compress algorithm based on the [Puzzle paper](https://arxiv.org/abs/2411.19146). -In this example, we compress [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model by searching for the optimal `ffn_intermediate_size` across MLP layers, resulting in a heterogeneous architecture while reducing GPU memory usage from 113GB to 76GB. +In this example, we compress the [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model by searching for the optimal `ffn_intermediate_size` across MLP layers and `attention op/noop`. This results in a heterogeneous architecture while reducing GPU memory usage from 113GB to 96GB (15% reduction) with less than 1% regression in the token_accuracy_top_10 metric. ## Compress the Model 1. Specify the `puzzle_dir`, `input_hf_model_path`, `dataset_path`, `intermediate_size_list`, and `target_memory` arguments in the [llama-3_1-8B_pruneffn_memory.yaml](./configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml) configuration file. + Let's first shoot for 32% GPU memory reduction setting `target_memory = 78_000` GiB. + 2. Download and prepare the [Nemotron-Post-Training-Dataset-v2](https://huggingface.co/datasets/nvidia/Nemotron-Post-Training-Dataset-v2). dataset split: "code", "math", "stem", "chat", excluding reasoning samples (2.62GB) @@ -39,6 +41,7 @@ In this example, we compress [meta-llama/Llama-3.1-8B-Instruct](https://huggingf This will generate the following network architecture (see `log.txt`): ```bash + ... block_0: attention gqa_4 ffn intermediate_14336 block_1: attention gqa_4 ffn intermediate_14336 block_2: attention gqa_4 ffn intermediate_14336 @@ -71,24 +74,31 @@ In this example, we compress [meta-llama/Llama-3.1-8B-Instruct](https://huggingf block_29: attention gqa_4 ffn intermediate_14336 block_30: attention gqa_4 ffn intermediate_14336 block_31: attention gqa_4 ffn intermediate_14336 - + [2025-11-02 04:53:11,332]^[[92m[rank-0]^[[0m[run_puzzle.py:295] Total costs: {'stats.memory_mib': 75796.4140625, 'stats.ffn_num_params': 5637275648, 'stats.num_kv_heads': 160, 'stats.kv_cache_memory_mib': 61440.0, 'stats.ffn_memory_mib': 10752.25, 'stats.attention_memory_mib': 63040.15625, 'stats.attention_num_params': 838942720, 'stats.num_params': 7526895616, 'stats.has_attention': 20, 'stats.has_ffn': 32} - [2025-11-02 04:53:11,341]^[[92m[rank-0]^[[0m[run_puzzle.py:300] /workspace/puzzle_dir/mip/puzzle_solutions/target_memory_78000MiB/solutions.json - [2025-11-02 04:53:11,341]^[[92m[rank-0]^[[0m[mip_and_realize_models.py:49] Realize model for the solution: /workspace/puzzle_dir/mip/puzzle_solutions/target_memory_78000MiB/solutions.json - ``` + ... 
+ ################################################################ + validate_model_and_extract_token_probs(model_name='teacher') + ################################################################ + ... + Average losses = {'lm_loss': 1.118250765837729, 'token_accuracy_top_1': 0.7331905364990234, 'token_accuracy_top_5': 0.9094219207763672, 'token_accuracy_top_10': 0.9423646926879883, + ... + ################################################################ + validate_model_with_kl_div(model_name='solution_0', is_calc_kl_div=True) + ################################################################ + .... + Average losses = {'lm_loss': 1.7577573340386152, 'token_accuracy_top_1': 0.6225490570068359, 'token_accuracy_top_5': 0.846257209777832, 'token_accuracy_top_10': 0.8987817764282227} -## Evaluate model accuracy + ``` -```bash -# TODO -``` + 30% GPU memory reduction leads to nearly 5% regression in token_accuracy_top_10 metric (0.898 / 0.942). Let's rerun MIP search aiming for 15% memory reduction. ## Re-run MIP Search with different memory constraints If you want to try different memory constraints without re-running the expensive pruning and scoring steps, use the `--mip-only` flag. This assumes pruning, replacement library building, NAS scoring, and subblock stats calculation have already been completed. -Set `target_memory: 28_000` in `llama-3_1-8B_pruneffn_memory.yaml`. +Set `target_memory: 96_000` in `llama-3_1-8B_pruneffn_memory.yaml`. ```bash torchrun --nproc_per_node 1 examples/compress/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml --mip-only 2>&1 | tee ./log.txt | grep "Compress Progress" @@ -102,49 +112,55 @@ block_1: attention gqa_4 ffn intermediate_14336 block_2: attention gqa_4 ffn intermediate_14336 block_3: attention gqa_4 ffn intermediate_14336 block_4: attention gqa_4 ffn intermediate_14336 -block_5: attention no_op ffn intermediate_11520 -block_6: attention no_op ffn intermediate_14336 -block_7: attention no_op ffn intermediate_8704 -block_8: attention no_op ffn intermediate_14336 -block_9: attention no_op ffn intermediate_3072 -block_10: attention no_op ffn intermediate_11520 -block_11: attention no_op ffn intermediate_11520 -block_12: attention no_op ffn intermediate_11520 -block_13: attention no_op ffn intermediate_11520 -block_14: attention no_op ffn intermediate_3072 -block_15: attention no_op ffn intermediate_14336 -block_16: attention no_op ffn intermediate_14336 -block_17: attention no_op ffn intermediate_14336 +block_5: attention gqa_4 ffn intermediate_14336 +block_6: attention gqa_4 ffn intermediate_14336 +block_7: attention gqa_4 ffn intermediate_14336 +block_8: attention gqa_4 ffn intermediate_14336 +block_9: attention gqa_4 ffn intermediate_14336 +block_10: attention gqa_4 ffn intermediate_14336 +block_11: attention gqa_4 ffn intermediate_14336 +block_12: attention gqa_4 ffn intermediate_14336 +block_13: attention gqa_4 ffn intermediate_14336 +block_14: attention gqa_4 ffn intermediate_14336 +block_15: attention gqa_4 ffn intermediate_14336 +block_16: attention gqa_4 ffn intermediate_14336 +block_17: attention gqa_4 ffn intermediate_14336 block_18: attention no_op ffn intermediate_14336 block_19: attention no_op ffn intermediate_14336 block_20: attention no_op ffn intermediate_14336 -block_21: attention no_op ffn intermediate_14336 +block_21: attention gqa_4 ffn intermediate_14336 block_22: attention no_op ffn intermediate_14336 block_23: attention no_op ffn intermediate_14336 block_24: attention no_op ffn intermediate_14336 -block_25: attention 
no_op ffn intermediate_14336 -block_26: attention no_op ffn intermediate_14336 -block_27: attention no_op ffn intermediate_14336 -block_28: attention no_op ffn intermediate_14336 -block_29: attention no_op ffn intermediate_14336 -block_30: attention no_op ffn intermediate_14336 -block_31: attention no_op ffn intermediate_14336 - -[2025-11-02 04:47:51,874]^[[92m[rank-0]^[[0m[run_puzzle.py:295] Total costs: {'stats.memory_mib': 27526.296875, 'stats.num_kv_heads': 40, 'stats.kv_cache_memory_mib': 15360.0, 'stats.has_ffn': 32, 'stats.attention_num_params': 209735680, 'stats.ffn_num_params': 5118230528, 'stats.attention_memory_mib': 15760.0390625, 'stats.num_params': 6378643456, 'stats.has_attention': 5, 'stats.ffn_memory_mib': 9762.25} -[2025-11-02 04:47:51,882]^[[92m[rank-0]^[[0m[run_puzzle.py:300] /workspace/puzzle_dir/mip/puzzle_solutions/target_memory_28000MiB/solutions.json -[2025-11-02 04:47:51,882]^[[92m[rank-0]^[[0m[mip_and_realize_models.py:49] Realize model for the solution: /workspace/puzzle_dir/mip/puzzle_solutions/target_memory_28000MiB/solutions.json -``` - -## Deploy to TensorRT-LLM - -```bash -# TODO +block_25: attention gqa_4 ffn intermediate_14336 +block_26: attention gqa_4 ffn intermediate_14336 +block_27: attention gqa_4 ffn intermediate_14336 +block_28: attention gqa_4 ffn intermediate_14336 +block_29: attention gqa_4 ffn intermediate_14336 +block_30: attention gqa_4 ffn intermediate_14336 +block_31: attention gqa_4 ffn intermediate_14336 + +[2025-11-02 11:01:56,443]^[[92m[rank-0]^[[0m[run_puzzle.py:295] Total costs: {'stats.memory_mib': 94708.4609375, 'stats.attention_memory_mib': 81952.203125, 'stats.ffn_memory_mib': 10752.25, 'stats.has_ffn': 32, 'stats.ffn_num_params': 5637275648, 'stats.attention_num_params': 1090625536, 'stats.has_attention': 26, 'stats.kv_cache_memory_mib': 79872.0, 'stats.num_kv_heads': 208, 'stats.num_params': 7778578432} +... 
+################################################################ +validate_model_with_kl_div(model_name='solution_0', is_calc_kl_div=True) +################################################################ +Average losses = {'lm_loss': 1.2425934937782586, 'token_accuracy_top_1': 0.703862190246582, 'token_accuracy_top_5': 0.8954982757568359, 'token_accuracy_top_10': 0.9336576461791992, ``` -## Export to NeMo for Knowledge Distillation +On the other hand, if you set `target_memory: 28_000`, you would observe that for some layers the intermediate FFN size starts to reduce (see `log.txt`): ```bash -# TODO +block_5: attention no_op ffn intermediate_11520 +block_6: attention no_op ffn intermediate_14336 +block_7: attention no_op ffn intermediate_8704 +block_8: attention no_op ffn intermediate_14336 +block_9: attention no_op ffn intermediate_3072 +block_10: attention no_op ffn intermediate_11520 +block_11: attention no_op ffn intermediate_11520 +block_12: attention no_op ffn intermediate_11520 +block_13: attention no_op ffn intermediate_11520 +block_14: attention no_op ffn intermediate_3072 ``` ## Advanced usage diff --git a/examples/compress/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml index c9a0cabf3..cfd7f93e8 100644 --- a/examples/compress/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml +++ b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml @@ -14,7 +14,7 @@ puzzle_dir: /workspace/puzzle_dir # MIP memory constraint (in MiB) mip: human_constraints: - target_memory: 78_000 # 78 GiB + target_memory: 96_000 # 96 GiB # FFN intermediate sizes to search over (heterogeneous architecture) pruning: From b3fd9df191d808d482aa1441002dbe651fd94643 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sun, 2 Nov 2025 20:53:27 +0100 Subject: [PATCH 73/81] Update docs Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/compress/README.md b/examples/compress/README.md index db62207a8..a1cc1eced 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -27,7 +27,7 @@ In this example, we compress the [meta-llama/Llama-3.1-8B-Instruct](https://hugg This will save the full output to `log.txt` and display the following progress on screen: ```bash - # Produced on a single NVIDIA H100 80GB HBM3 card + # Produced on 2x NVIDIA H100 80GB HBM3 [2025-11-01 14:01:10] Compress Progress 1/8: starting compression pipeline [2025-11-01 14:01:10] Compress Progress 2/8: converting model from HF to DeciLM [2025-11-01 14:01:29] Compress Progress 3/8: scoring pruning activations From d4ed34a6f1a4f649b3ee0c611c5939c165e2b72c Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sun, 2 Nov 2025 21:04:21 +0100 Subject: [PATCH 74/81] Update compress setting to increase the number of eval samples. 
Signed-off-by: Daniel Korzekwa --- .../configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml | 2 +- .../pruning/pruning_defaults.yaml | 4 ++-- .../_compress/resources/configs/pruning/ffn_pruning.yaml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/compress/configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml index 6d9c90fa9..70b5304c5 100644 --- a/examples/compress/configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml +++ b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml @@ -40,7 +40,7 @@ scoring: teacher_dir: ${to_path:${teacher_dir}} output_dir: ${puzzle_dir}/single_sequence_replacement_solutions--validation - eval_samples: 2 + eval_samples: 10 # default is 128 micro_batch_size: 1 seed: 42 shuffle_seed: 444 diff --git a/examples/compress/configs/llama-3_1-8B_pruneffn_memory/pruning/pruning_defaults.yaml b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/pruning/pruning_defaults.yaml index 0a5eafcff..5d5307b9c 100644 --- a/examples/compress/configs/llama-3_1-8B_pruneffn_memory/pruning/pruning_defaults.yaml +++ b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/pruning/pruning_defaults.yaml @@ -7,7 +7,7 @@ activations_log_dir: ??? activation_hooks_kwargs: ??? # Data: -eval_samples: 100 +eval_samples: 1000 # default is 10000 micro_batch_size: 4 dataset_path: ${dataset_path} val_dataset_name: train @@ -17,7 +17,7 @@ pruned_ckpts_outpt_dir: ${puzzle_dir}/pruning/${pruning.experiment_id} ## FFN pruning ffn_list: -mlp_init_mode: "Truncate" +mlp_init_mode: "Truncate" # PruneByActivationsLog ## KV-heads pruning n_heads_in_group_list: diff --git a/tests/experimental/torch/_compress/resources/configs/pruning/ffn_pruning.yaml b/tests/experimental/torch/_compress/resources/configs/pruning/ffn_pruning.yaml index f0c852eec..96a8ca72e 100644 --- a/tests/experimental/torch/_compress/resources/configs/pruning/ffn_pruning.yaml +++ b/tests/experimental/torch/_compress/resources/configs/pruning/ffn_pruning.yaml @@ -8,5 +8,5 @@ activation_hooks_kwargs: target_layer: "mlp.down_proj" layer_input_descriptors_path: -intermediate_size_list: [256] # teacher_intermediate_size is 14336 +intermediate_size_list: [3072, 5888, 8704, 11520] # teacher_intermediate_size is 14336 mlp_init_mode: "PruneByActivationsLog" From 99798727ede4fc158763be8cc810c2a9bf09f43e Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sun, 2 Nov 2025 22:25:16 +0100 Subject: [PATCH 75/81] Update compress tutorial Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/examples/compress/README.md b/examples/compress/README.md index a1cc1eced..5efd879a0 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -28,14 +28,14 @@ In this example, we compress the [meta-llama/Llama-3.1-8B-Instruct](https://hugg ```bash # Produced on 2x NVIDIA H100 80GB HBM3 - [2025-11-01 14:01:10] Compress Progress 1/8: starting compression pipeline - [2025-11-01 14:01:10] Compress Progress 2/8: converting model from HF to DeciLM - [2025-11-01 14:01:29] Compress Progress 3/8: scoring pruning activations - [2025-11-01 14:02:30] Compress Progress 4/8: pruning the model and saving pruned checkpoints - [2025-11-01 14:03:18] Compress Progress 5/8: building replacement library and calculating subblock statistics - [2025-11-01 14:03:19] Compress Progress 6/8: calculating one block scores - [2025-11-01 14:13:35] 
Compress Progress 7/8: running MIP and realizing models - [2025-11-01 14:13:52] Compress Progress 8/8: compression pipeline completed + [2025-11-02 12:06:34] Compress Progress 1/8: starting compression pipeline + [2025-11-02 12:06:45] Compress Progress 2/8: converting model from HF to DeciLM (single-gpu) + [2025-11-02 12:07:07] Compress Progress 3/8: scoring pruning activations (multi-gpu) + [2025-11-02 12:11:36] Compress Progress 4/8: pruning the model and saving pruned checkpoints (single-gpu) + [2025-11-02 12:12:20] Compress Progress 5/8: building replacement library and subblock statistics (single-gpu) + [2025-11-02 12:12:21] Compress Progress 6/8: calculating one block scores (multi-gpu) + [2025-11-02 12:50:41] Compress Progress 7/8: running MIP and realizing models (multi-gpu) + [2025-11-02 12:52:34] Compress Progress 8/8: compression pipeline completed (multi-gpu) ``` This will generate the following network architecture (see `log.txt`): @@ -81,14 +81,13 @@ In this example, we compress the [meta-llama/Llama-3.1-8B-Instruct](https://hugg validate_model_and_extract_token_probs(model_name='teacher') ################################################################ ... - Average losses = {'lm_loss': 1.118250765837729, 'token_accuracy_top_1': 0.7331905364990234, 'token_accuracy_top_5': 0.9094219207763672, 'token_accuracy_top_10': 0.9423646926879883, + Average losses = {'lm_loss': 1.118250765837729, 'token_accuracy_top_1': 0.7331905364990234, 'token_accuracy_top_5': 0.9094219207763672, 'token_accuracy_top_10': 0.9423646926879883} ... ################################################################ validate_model_with_kl_div(model_name='solution_0', is_calc_kl_div=True) ################################################################ .... - Average losses = {'lm_loss': 1.7577573340386152, 'token_accuracy_top_1': 0.6225490570068359, 'token_accuracy_top_5': 0.846257209777832, 'token_accuracy_top_10': 0.8987817764282227} - + Average losses = {'lm_loss': 1.7577573340386152, 'token_accuracy_top_1': 0.6225490570068359, 'token_accuracy_top_5': 0.846257209777832, 'token_accuracy_top_10': 0.8987817764282227} ``` 30% GPU memory reduction leads to nearly 5% regression in token_accuracy_top_10 metric (0.898 / 0.942). Let's rerun MIP search aiming for 15% memory reduction. @@ -140,12 +139,12 @@ block_29: attention gqa_4 ffn intermediate_14336 block_30: attention gqa_4 ffn intermediate_14336 block_31: attention gqa_4 ffn intermediate_14336 -[2025-11-02 11:01:56,443]^[[92m[rank-0]^[[0m[run_puzzle.py:295] Total costs: {'stats.memory_mib': 94708.4609375, 'stats.attention_memory_mib': 81952.203125, 'stats.ffn_memory_mib': 10752.25, 'stats.has_ffn': 32, 'stats.ffn_num_params': 5637275648, 'stats.attention_num_params': 1090625536, 'stats.has_attention': 26, 'stats.kv_cache_memory_mib': 79872.0, 'stats.num_kv_heads': 208, 'stats.num_params': 7778578432} +[2025-11-02 12:50:42,024]^[[92m[rank-0]^[[0m[run_puzzle.py:295] Total costs: {'stats.memory_mib': 94708.4609375, 'stats.has_ffn': 32, 'stats.ffn_memory_mib': 10752.25, 'stats.kv_cache_memory_mib': 79872.0, 'stats.attention_num_params': 1090625536, 'stats.ffn_num_params': 5637275648, 'stats.has_attention': 26, 'stats.num_params': 7778578432, 'stats.attention_memory_mib': 81952.203125, 'stats.num_kv_heads': 208} ... 
################################################################ validate_model_with_kl_div(model_name='solution_0', is_calc_kl_div=True) ################################################################ -Average losses = {'lm_loss': 1.2425934937782586, 'token_accuracy_top_1': 0.703862190246582, 'token_accuracy_top_5': 0.8954982757568359, 'token_accuracy_top_10': 0.9336576461791992, +Average losses = {'lm_loss': 1.2425934937782586, 'token_accuracy_top_1': 0.703862190246582, 'token_accuracy_top_5': 0.8954982757568359, 'token_accuracy_top_10': 0.9336576461791992 ``` On the other hand, if you set `target_memory: 28_000`, you would observe that for some layers the intermediate FFN size starts to reduce (see `log.txt`): From 8cb50d45fbe669d82e5eecbfd12c7468c97b3e86 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sun, 2 Nov 2025 22:27:08 +0100 Subject: [PATCH 76/81] Update tutorial Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/compress/README.md b/examples/compress/README.md index 5efd879a0..4d2f99ac9 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -2,7 +2,7 @@ This tutorial demonstrates how to compress large language models using the compress algorithm based on the [Puzzle paper](https://arxiv.org/abs/2411.19146). -In this example, we compress the [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model by searching for the optimal `ffn_intermediate_size` across MLP layers and `attention op/noop`. This results in a heterogeneous architecture while reducing GPU memory usage from 113GB to 96GB (15% reduction) with less than 1% regression in the token_accuracy_top_10 metric. +In this example, we compress the [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model by searching for the optimal `ffn_intermediate_size` across MLP layers and `attention op/noop`. This results in a heterogeneous architecture while reducing GPU memory usage from 113 GiB to 96 GiB (15% reduction) with less than 1% regression in the token_accuracy_top_10 metric. ## Compress the Model From 2856ca1a108c65bf6f2b7fc8edc2af22acbc6b47 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sun, 2 Nov 2025 22:45:05 +0100 Subject: [PATCH 77/81] Update tutorial. Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/compress/README.md b/examples/compress/README.md index 4d2f99ac9..778bf2688 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -27,7 +27,7 @@ In this example, we compress the [meta-llama/Llama-3.1-8B-Instruct](https://hugg This will save the full output to `log.txt` and display the following progress on screen: ```bash - # Produced on 2x NVIDIA H100 80GB HBM3 + # Produced on 8x NVIDIA H100 80GB HBM3 [2025-11-02 12:06:34] Compress Progress 1/8: starting compression pipeline [2025-11-02 12:06:45] Compress Progress 2/8: converting model from HF to DeciLM (single-gpu) [2025-11-02 12:07:07] Compress Progress 3/8: scoring pruning activations (multi-gpu) From 553107af81e0421dead7a8e341f3b820e6d9d834 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sun, 2 Nov 2025 23:22:31 +0100 Subject: [PATCH 78/81] Update compress tutorial. 
Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/compress/README.md b/examples/compress/README.md index 778bf2688..4d2f99ac9 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -27,7 +27,7 @@ In this example, we compress the [meta-llama/Llama-3.1-8B-Instruct](https://hugg This will save the full output to `log.txt` and display the following progress on screen: ```bash - # Produced on 8x NVIDIA H100 80GB HBM3 + # Produced on 2x NVIDIA H100 80GB HBM3 [2025-11-02 12:06:34] Compress Progress 1/8: starting compression pipeline [2025-11-02 12:06:45] Compress Progress 2/8: converting model from HF to DeciLM (single-gpu) [2025-11-02 12:07:07] Compress Progress 3/8: scoring pruning activations (multi-gpu) From 3917a789c10c965e4404896ccb5a516a5a8d9dc9 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Mon, 3 Nov 2025 08:14:14 +0100 Subject: [PATCH 79/81] Add Dockerfile for the compress tutorial Signed-off-by: Daniel Korzekwa --- examples/compress/Dockerfile | 26 ++++++++++++++++++++++++++ examples/compress/README.md | 6 +++++- 2 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 examples/compress/Dockerfile diff --git a/examples/compress/Dockerfile b/examples/compress/Dockerfile new file mode 100644 index 000000000..5a65839de --- /dev/null +++ b/examples/compress/Dockerfile @@ -0,0 +1,26 @@ +# Docker file for compress example + +FROM nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc5 + +# TODO: The MIP solver would not work with this torch version. +# Fix it, otherwise, mamba models will not be supported by the Compress algorithm. +# # Required for mamba_ssm to work (the default torch version in the 1.1.0rc5 does not work) +# RUN pip uninstall -y torch +# RUN pip uninstall -y torchvision +# RUN pip install torch==2.7.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128 + +# # Mamba SSM +# RUN pip install causal-conv1d --no-build-isolation +# RUN pip install mamba_ssm --no-build-isolation + +# Required for puzzletron calc_subblock_stats +RUN pip install hydra-core==1.3.2 +RUN pip install wandb~=0.17.5 +RUN pip install "frozendict>=2.4.4" +RUN pip install fire +RUN pip install mip +RUN pip install lru-dict + +WORKDIR /workspace/ + +COPY . . diff --git a/examples/compress/README.md b/examples/compress/README.md index 4d2f99ac9..f3ff36232 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -4,6 +4,11 @@ This tutorial demonstrates how to compress large language models using the compr In this example, we compress the [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model by searching for the optimal `ffn_intermediate_size` across MLP layers and `attention op/noop`. This results in a heterogeneous architecture while reducing GPU memory usage from 113 GiB to 96 GiB (15% reduction) with less than 1% regression in the token_accuracy_top_10 metric. +## Environment + +- [Dockerfile](./Dockerfile) to use. +- 2x NVIDIA H100 80GB HBM3 (1 card will be good as well). + ## Compress the Model 1. Specify the `puzzle_dir`, `input_hf_model_path`, `dataset_path`, `intermediate_size_list`, and `target_memory` arguments in the [llama-3_1-8B_pruneffn_memory.yaml](./configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml) configuration file. 
@@ -27,7 +32,6 @@ In this example, we compress the [meta-llama/Llama-3.1-8B-Instruct](https://hugg
    This will save the full output to `log.txt` and display the following progress on screen:
 
    ```bash
-   # Produced on 2x NVIDIA H100 80GB HBM3
    [2025-11-02 12:06:34] Compress Progress 1/8: starting compression pipeline
    [2025-11-02 12:06:45] Compress Progress 2/8: converting model from HF to DeciLM (single-gpu)
    [2025-11-02 12:07:07] Compress Progress 3/8: scoring pruning activations (multi-gpu)

From 6e1d910453a73c39e752555589ea46d049c847a0 Mon Sep 17 00:00:00 2001
From: Daniel Korzekwa
Date: Mon, 3 Nov 2025 08:36:15 +0100
Subject: [PATCH 80/81] Update compress tutorial

Signed-off-by: Daniel Korzekwa
---
 examples/compress/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/compress/README.md b/examples/compress/README.md
index f3ff36232..55dab2cda 100644
--- a/examples/compress/README.md
+++ b/examples/compress/README.md
@@ -26,7 +26,7 @@ In this example, we compress the [meta-llama/Llama-3.1-8B-Instruct](https://hugg
 3. Run the compression script.
 
    ```bash
-   torchrun --nproc_per_node 1 examples/compress/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml 2>&1 | tee ./log.txt | grep "Compress Progress"
+   torchrun --nproc_per_node 2 examples/compress/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml 2>&1 | tee ./log.txt | grep "Compress Progress"
    ```
 
    This will save the full output to `log.txt` and display the following progress on screen:
@@ -104,7 +104,7 @@ Set `target_memory: 96_000` in `llama-3_1-8B_pruneffn_memory.yaml`.
 
 ```bash
-torchrun --nproc_per_node 1 examples/compress/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml --mip-only 2>&1 | tee ./log.txt | grep "Compress Progress"
+torchrun --nproc_per_node 2 examples/compress/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml --mip-only 2>&1 | tee ./log.txt | grep "Compress Progress"
 ```
 
 This will generate the following network architecture (see `log.txt`):

From bb91d73a8eaae4c05d71679adbedffe84e82d15c Mon Sep 17 00:00:00 2001
From: Liana Mikaelyan <45925959+LianaMikael@users.noreply.github.com>
Date: Tue, 4 Nov 2025 10:03:27 +0000
Subject: [PATCH 81/81] Update Puzzle Compression Tutorial (#493)

## What does this PR do?

**Type of change:** Documentation

**Overview:** Updated the tutorial with more details on how to choose the required config parameters and added MMLU evaluation.

---------

Signed-off-by: Liana Mikaelyan
---
 examples/compress/README.md | 39 +++++++++++++++++++++++++++++--------
 examples/pruning/README.md | 2 ++
 2 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/examples/compress/README.md b/examples/compress/README.md
index 55dab2cda..0b165f46b 100644
--- a/examples/compress/README.md
+++ b/examples/compress/README.md
@@ -1,8 +1,15 @@
 # Compress Algorithm Tutorial
 
 This tutorial demonstrates how to compress large language models using the compress algorithm based on the [Puzzle paper](https://arxiv.org/abs/2411.19146).
+The goal of the algorithm is to find the optimal modifications to the MLP and attention layers of the model, resulting in a heterogeneous model architecture.
+The supported modifications are:
 
-In this example, we compress the [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model by searching for the optimal `ffn_intermediate_size` across MLP layers and `attention op/noop`. 
This results in a heterogeneous architecture while reducing GPU memory usage from 113 GiB to 96 GiB (15% reduction) with less than 1% regression in the token_accuracy_top_10 metric.
+
+- `ffn_intermediate_size`: different FFN intermediate sizes
+- `attention op/noop`: complete removal of attention layers
+
+To use the Puzzle algorithm effectively, we need to specify the target number of parameters and/or the target memory. The final stage is based on a Mixed-Integer Programming (MIP) algorithm that finds the optimal combination of layer modifications satisfying the target requirements.
+
+In this example, we compress the [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model, reducing GPU memory usage from 113 GiB to 96 GiB (15% reduction) with less than 1% regression in the token_accuracy_top_10 metric.
 
 ## Environment
 
@@ -13,7 +20,11 @@ In this example, we compress the [meta-llama/Llama-3.1-8B-Instruct](https://hugg
 
 1. Specify the `puzzle_dir`, `input_hf_model_path`, `dataset_path`, `intermediate_size_list`, and `target_memory` arguments in the [llama-3_1-8B_pruneffn_memory.yaml](./configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml) configuration file.
 
-   Let's first shoot for 32% GPU memory reduction setting `target_memory = 78_000` GiB.
+   **_NOTE:_**
+   How to choose `intermediate_size_list`?
+   The list specifies the candidate FFN sizes that we wish to search over. It is recommended to choose several pruning sizes (e.g., 15%, 20%, or 30% of the original size). Note that the values must be hardware-friendly (divisible by a multiple of 2) to avoid issues with tensor operations in subsequent steps.
+
+   Let's first shoot for a 32% GPU memory reduction by setting `target_memory = 78_000` MiB (i.e., 78 GiB). This means that the algorithm will choose the candidates with the highest accuracy that also meet the specified requirements.
 
 2. Download and prepare the [Nemotron-Post-Training-Dataset-v2](https://huggingface.co/datasets/nvidia/Nemotron-Post-Training-Dataset-v2).
 
@@ -23,7 +34,7 @@ In this example, we compress the [meta-llama/Llama-3.1-8B-Instruct](https://hugg
    python -m modelopt.torch._compress.dataset.prepare_dataset --dataset_name nvidia/Nemotron-Post-Training-Dataset-v2 --output_dir path/to/Nemotron-Post-Training-Dataset-v2
    ```
 
-3. Run the compression script.
+3. Run the compression script. 
 
    ```bash
    torchrun --nproc_per_node 2 examples/compress/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml 2>&1 | tee ./log.txt | grep "Compress Progress"
@@ -42,7 +53,7 @@ In this example, we compress the [meta-llama/Llama-3.1-8B-Instruct](https://hugg
    [2025-11-02 12:52:34] Compress Progress 8/8: compression pipeline completed (multi-gpu)
    ```
 
-   This will generate the following network architecture (see `log.txt`):
+   Once the process is complete, the resulting network architecture will be recorded in `log.txt` for your review:
 
    ```bash
    ...
@@ -96,12 +107,12 @@ In this example, we compress the [meta-llama/Llama-3.1-8B-Instruct](https://hugg
 
    30% GPU memory reduction leads to nearly 5% regression in token_accuracy_top_10 metric (0.898 / 0.942). Let's rerun MIP search aiming for 15% memory reduction.
 
-## Re-run MIP Search with different memory constraints
+## Re-run MIP Search with different constraints
 
-If you want to try different memory constraints without re-running the expensive pruning and scoring steps, use the `--mip-only` flag.
+If you want to try different constraints without re-running the expensive pruning and scoring steps, use the `--mip-only` flag. 
This assumes pruning, replacement library building, NAS scoring, and subblock stats calculation have already been completed. -Set `target_memory: 96_000` in `llama-3_1-8B_pruneffn_memory.yaml`. +For example, let's set `target_memory: 96_000` in `llama-3_1-8B_pruneffn_memory.yaml`. ```bash torchrun --nproc_per_node 2 examples/compress/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml --mip-only 2>&1 | tee ./log.txt | grep "Compress Progress" @@ -151,7 +162,7 @@ validate_model_with_kl_div(model_name='solution_0', is_calc_kl_div=True) Average losses = {'lm_loss': 1.2425934937782586, 'token_accuracy_top_1': 0.703862190246582, 'token_accuracy_top_5': 0.8954982757568359, 'token_accuracy_top_10': 0.9336576461791992 ``` -On the other hand, if you set `target_memory: 28_000`, you would observe that for some layers the intermediate FFN size starts to reduce (see `log.txt`): +On the other hand, if you set `target_memory: 28_000`, you'll observe that the intermediate FFN sizes are significantly reduced in certain layers (see `log.txt` for details): ```bash block_5: attention no_op ffn intermediate_11520 @@ -166,6 +177,18 @@ block_13: attention no_op ffn intermediate_11520 block_14: attention no_op ffn intermediate_3072 ``` +## Evaluation + +Once the model is ready, you can evaluate it using [Language Model Evaluation Harness](https://pypi.org/project/lm-eval/). For example, run the following to evaluate the model on a subset of [MMLU](https://huggingface.co/datasets/cais/mmlu). + +```bash +lm_eval --model hf \ + --model_args pretrained=path/to/model,dtype=bfloat16,trust_remote_code=true,parallelize=True \ + --tasks mmlu_humanities \ + --num_fewshot 5 \ + --batch_size 4 +``` + ## Advanced usage Modify `path/to/Llama-3_1-8B yaml` file for advanced compression scenarios. diff --git a/examples/pruning/README.md b/examples/pruning/README.md index 3efa9eb79..54f7322b1 100644 --- a/examples/pruning/README.md +++ b/examples/pruning/README.md @@ -23,6 +23,8 @@ This section focuses on applying Model Optimizer's state-of-the-art complementar +For more advanced pruning strategies, such as the [Puzzle methodology](https://arxiv.org/pdf/2411.19146), please see [Puzzle pruning example](https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/feature/compress/examples/compress). + ## Pre-Requisites For Minitron pruning for Megatron-LM / NeMo models, use the NeMo container (e.g., `nvcr.io/nvidia/nemo:25.07`) which has all the dependencies installed.
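
As a quick sanity check before running the MMLU evaluation above, you can load the realized checkpoint directly with Hugging Face `transformers` and generate a few tokens. The snippet below is an illustrative sketch only: `path/to/model` is the same placeholder used in the `lm_eval` command, and it assumes the realized solution is saved as an HF-style checkpoint that needs `trust_remote_code` (as the `lm_eval` arguments suggest).

```python
# Illustrative sanity check for a realized checkpoint; the path is a placeholder.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "path/to/model"  # placeholder: directory of the realized solution

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,  # same dtype as in the lm_eval command
    trust_remote_code=True,      # mirrors trust_remote_code=true used by lm_eval
    device_map="auto",
)

prompt = "The capital of France is"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=16)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```

If the checkpoint loads and produces coherent text, the `lm_eval` command above can then be used to compare the compressed model against the original model under the same settings.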