From c758ad50c563e46890ee72b9dc9697db91bb3204 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Mon, 27 Oct 2025 11:50:00 +0100 Subject: [PATCH 01/81] The main compression function for a model using MIP-based NAS search algorithm. Signed-off-by: Daniel Korzekwa --- modelopt/torch/_compress/README.md | 3 ++ modelopt/torch/_compress/compress.py | 70 ++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) create mode 100644 modelopt/torch/_compress/README.md create mode 100644 modelopt/torch/_compress/compress.py diff --git a/modelopt/torch/_compress/README.md b/modelopt/torch/_compress/README.md new file mode 100644 index 000000000..97afc4491 --- /dev/null +++ b/modelopt/torch/_compress/README.md @@ -0,0 +1,3 @@ +Experimental model compression algorithm based on a Local Neural Architecture Search. +Based on the Puzzle paper: https://arxiv.org/abs/2411.19146 +PoC for Llama 3.1 model. \ No newline at end of file diff --git a/modelopt/torch/_compress/compress.py b/modelopt/torch/_compress/compress.py new file mode 100644 index 000000000..5136d9623 --- /dev/null +++ b/modelopt/torch/_compress/compress.py @@ -0,0 +1,70 @@ +""" + +This module provides the main compression function for a model +using MIP-based NAS search algorithm. +""" + +import build_library_and_stats +import mip_and_realize_models +import pruning_ckpts +import score_pruning_activations +import scoring +from omegaconf import DictConfig +from puzzle_tools.runtime import IRuntime + +# TODO Move initialize_hydra_config_for_dir from tests to main +from tests.utils.test_utils import initialize_hydra_config_for_dir + + +def compress( + hydra_config_dir: str, hydra_config: str, puzzle_dir: str, dataset_path: str, runtime: IRuntime +) -> DictConfig: + """Compress a puzzletron model using the MIP-based NAS search algorithm. + + Args: + hydra_config_dir (str): path to a hydra_config_dir that defines the search space + hydra_config (str): the corresponding hydra config file + puzzle_dir (str): directory with a puzzletron model to compress + dataset_path (str): dataset used for scoring and distillation + runtime: distributed runtime to use to run the compression steps, e.g., + NativeDDP_Runtime(dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10)) + + Returns: + Hydra config object after compressing the model. + The same hydra configuration object is used across all compression steps. 
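    Example (illustrative sketch only: the config name mirrors the accompanying
    test_compress.py test, the paths are placeholders rather than shipped values,
    and the call is expected to run under a torch.distributed launcher):

        import datetime

        import torch
        from puzzle_tools.hydra_utils import register_hydra_resolvers
        from puzzle_tools.runtime import NativeDDP_Runtime

        from modelopt.torch._compress import compress

        register_hydra_resolvers()
        with NativeDDP_Runtime(
            dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10)
        ) as runtime:
            hydra_cfg = compress.compress(
                hydra_config_dir="/path/to/configs",  # dir containing e.g. Llama-3_1-8B.yaml
                hydra_config="Llama-3_1-8B",
                puzzle_dir="/path/to/puzzle_dir",  # must contain ckpts/teacher/ in DeciLM format
                dataset_path="/path/to/dataset",
                runtime=runtime,
            )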
+ @TODO: Investigate if this config object is immutable across steps and clarify + """ + # Step 0: Load puzzletron hydra config + hydra_cfg = initialize_hydra_config_for_dir( + config_dir=hydra_config_dir, + config_name=hydra_config, + overrides=[ + f"puzzle_dir={puzzle_dir}", + f"dataset_path={dataset_path}", + ], + ) + + # Step 1: score_pruning_activations (distributed processing) + score_pruning_activations.launch_score_activations(hydra_cfg, runtime) + + # Step 2: pruning_ckpts (single process) + if runtime.global_rank == 0: + pruning_ckpts.launch_prune_ckpt(hydra_cfg) + runtime.wait_for_everyone() + + # # Step 3: bypass distillation (distributed processing) + # # TODO: Add bypass distillation step + # #run_bypassed_training(hydra_cfg, runtime) + + # Step 4: build_library_and_stats (single process) + if runtime.global_rank == 0: + build_library_and_stats.launch_build_library_and_stats(hydra_cfg) + runtime.wait_for_everyone() + + # Step 5: calc_one_block_scores (distributed processing) + scoring.launch_scoring(hydra_cfg, runtime) + + # Step 6: mip_and_realize_models (distributed processing) + mip_and_realize_models.launch_mip_and_realize_model(hydra_cfg, runtime) + + return hydra_cfg From 8af99036c9cea7daee252f8cb6951778a52e939a Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Mon, 27 Oct 2025 11:54:50 +0100 Subject: [PATCH 02/81] Code formatting Signed-off-by: Daniel Korzekwa --- modelopt/torch/_compress/README.md | 4 ++-- modelopt/torch/_compress/compress.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/modelopt/torch/_compress/README.md b/modelopt/torch/_compress/README.md index 97afc4491..4c6da80e5 100644 --- a/modelopt/torch/_compress/README.md +++ b/modelopt/torch/_compress/README.md @@ -1,3 +1,3 @@ Experimental model compression algorithm based on a Local Neural Architecture Search. -Based on the Puzzle paper: https://arxiv.org/abs/2411.19146 -PoC for Llama 3.1 model. \ No newline at end of file +Based on the Puzzle paper: +PoC for Llama 3.1 model. diff --git a/modelopt/torch/_compress/compress.py b/modelopt/torch/_compress/compress.py index 5136d9623..c0661259c 100644 --- a/modelopt/torch/_compress/compress.py +++ b/modelopt/torch/_compress/compress.py @@ -2,6 +2,7 @@ This module provides the main compression function for a model using MIP-based NAS search algorithm. + """ import build_library_and_stats From 5ba6c2705499d5e9f0f78ddbc43cd254cbaf99d6 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Mon, 27 Oct 2025 12:21:11 +0100 Subject: [PATCH 03/81] Model search space configuration used by test_compress.py test. 
Signed-off-by: Daniel Korzekwa --- .../resources/configs/Llama-3_1-8B.yaml | 108 ++++++++++++++++ .../bypass/bypass_distillation_defaults.yaml | 116 ++++++++++++++++++ .../configs/bypass/llama-3_1-8b_bypass.yaml | 38 ++++++ .../configs/pruning/attn_pruning.yaml | 16 +++ .../configs/pruning/ffn_pruning.yaml | 12 ++ .../configs/pruning/hidden_dim_pruning.yaml | 15 +++ .../configs/pruning/pruning_defaults.yaml | 32 +++++ .../configs/validate_model_defaults.yaml | 15 +++ .../configs/validate_solutions_defaults.yaml | 10 ++ 9 files changed, 362 insertions(+) create mode 100644 tests/gpu/torch/_compress/resources/configs/Llama-3_1-8B.yaml create mode 100644 tests/gpu/torch/_compress/resources/configs/bypass/bypass_distillation_defaults.yaml create mode 100644 tests/gpu/torch/_compress/resources/configs/bypass/llama-3_1-8b_bypass.yaml create mode 100644 tests/gpu/torch/_compress/resources/configs/pruning/attn_pruning.yaml create mode 100644 tests/gpu/torch/_compress/resources/configs/pruning/ffn_pruning.yaml create mode 100644 tests/gpu/torch/_compress/resources/configs/pruning/hidden_dim_pruning.yaml create mode 100644 tests/gpu/torch/_compress/resources/configs/pruning/pruning_defaults.yaml create mode 100644 tests/gpu/torch/_compress/resources/configs/validate_model_defaults.yaml create mode 100644 tests/gpu/torch/_compress/resources/configs/validate_solutions_defaults.yaml diff --git a/tests/gpu/torch/_compress/resources/configs/Llama-3_1-8B.yaml b/tests/gpu/torch/_compress/resources/configs/Llama-3_1-8B.yaml new file mode 100644 index 000000000..98c7b746c --- /dev/null +++ b/tests/gpu/torch/_compress/resources/configs/Llama-3_1-8B.yaml @@ -0,0 +1,108 @@ +defaults: + - pruning: ffn_pruning + - scoring: ../validate_solutions_defaults + - realize_model: ../validate_solutions_defaults + - bypass: llama-3_1-8b_bypass + - override hydra/hydra_logging: disabled + - _self_ + +puzzle_dir: ??? +teacher_dir: ${puzzle_dir}/ckpts/teacher/ +replacement_library_path: ${puzzle_dir}/replacement_library.json +dataset_path: ??? 
# path to v0.4_mini + +skip_realize_model: false + +build_replacement_library: + add_ffn_no_ops: true + add_attention_no_ops: true + +calc_subblock_stats: + batch_sizes: [64, 96, 128] + prefill_seq_len: 4096 + generation_seq_len: 4096 + num_active_tokens_override: # Optional override for sequence lengths + prefill_queue_size: 0 + allocate_prefill_query: false + benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking + merge_with_existing_stats: false + subblock_stats_filename: "subblock_stats.json" + moe_stats_filename: "moe_stats.json" + +scoring: + solutions_to_validate: + skip_existing_solutions: true + + replacement_library_path: ${replacement_library_path} + solutions_path: ${to_path:${puzzle_dir}/single_sequence_replacement_solutions.json} + teacher_dir: ${to_path:${teacher_dir}} + output_dir: ${puzzle_dir}/single_sequence_replacement_solutions--validation + + eval_samples: 2 + micro_batch_size: 1 + dataset_path: ${dataset_path}/valid + seed: 42 + shuffle_seed: 444 + +mip: + single_block_replacement_validation_dir: ${to_path:${scoring.output_dir}} + subblock_stats_path: ${to_path:${puzzle_dir}/${calc_subblock_stats.subblock_stats_filename}} + output_path: ${to_path:${puzzle_dir}/mip/puzzle_solutions} + gathered_metrics_path: + puzzle_profile: + + # puzzle_profile: + objective: metrics.cosine_embedding_loss_hidden_states + bigger_is_better: false + num_solutions: 1 + minimal_diversity: 2 + + subblock_stats_args: + - batch_size: 96 + weights_dtype: torch.bfloat16 + activations_dtype: torch.bfloat16 + kv_cache_dtype: torch.bfloat16 + + report_additional_costs: + - stats.memory_mib + - stats.num_params + - stats.num_kv_heads + - stats.has_attention + - stats.has_ffn + - stats.kv_cache_memory_mib + - stats.attention_memory_mib + - stats.ffn_memory_mib + - stats.ffn_num_params + - stats.attention_num_params + + human_constraints: + target_memory: 780_000 # 78_000 + + mip_constraints: + use_greedy_search: false + is_multi_layer_puzzle: true + metric_overrides: + constrain_search_func: + max_seconds_per_solution: 60 + +realize_model: + teacher_dir: ${to_path:${teacher_dir}} + tokenizer_name: ${to_path:${teacher_dir}} + replacement_library_path: ${replacement_library_path} + save_models: true + solutions_path: # Filled dynamically + + # Validate params + skip_validation: false # To enable validation of the model solution set `skip_validation` as False + eval_samples: 2 + micro_batch_size: 1 + dataset_path: ${dataset_path}/valid + seed: 42 + shuffle_seed: 444 + +nccl_timeout_minutes: ${timedelta_minutes:10} + +# This section redirects Hydra outputs +hydra: + run: + dir: ${puzzle_dir}/hydra_logs/${now:%Y-%m-%d}/${now:%H-%M-%S} diff --git a/tests/gpu/torch/_compress/resources/configs/bypass/bypass_distillation_defaults.yaml b/tests/gpu/torch/_compress/resources/configs/bypass/bypass_distillation_defaults.yaml new file mode 100644 index 000000000..c48f47f69 --- /dev/null +++ b/tests/gpu/torch/_compress/resources/configs/bypass/bypass_distillation_defaults.yaml @@ -0,0 +1,116 @@ +# defaults: +# - ../validate_model_defaults # TODO: Unify this default YAML with KD base YAML, for a "training defaults" configurations + +# Runtime Configuration +dtype: "bf16" # Model precision: bf16 for efficiency, fp32 for stability +seed: 42 # Random seed for reproducibility + +# Experiment Tracking +experiment_id: # Unique identifier for this experiment. 
Will be dynamically set +iter_num: 1 # Current iteration number +step_num: 1 # Current step number within iteration +token_count: 0 # Token count tracker (auto-updated during training) + +# Data Configuration +data: + data_column: "conversation" + block_size: 8192 # Sequence length (tokens per sample) + bos_rate: 0.5 + fim_rate: 0 + fim_spm_rate: 0 + source_datasets_to_discard: [] + load_from_disk: true # Load preprocessed data from disk or from stream + keep_in_memory: false + val_dataset_name: valid + max_eval_samples: 256 + eval_samples_per_process: # Samples per GPU during distributed eval (auto if null) + +# Training Configuration +training: + learning_rate: 1e-4 # Initial learning rate (1e-4 = 0.0001) + training_tokens: 1e+7 # Total training tokens (1B tokens) + micro_batch_size: 4 + val_micro_batch_size: 2 + warmup_ratio: 0.05 + warmup_steps: ${warmup_steps:${.training_tokens},${..data.block_size},${.micro_batch_size},${.warmup_ratio}} # Auto-calculated warmup steps + min_lr_factor: 1e-5 + grad_accumulation_steps: 1 + skip_first_batches: 0 + weight_decay: 0.1 + decay_lr: true + beta1: 0.9 + beta2: 0.95 + use_grad_scaling: false + grad_clip: 1.0 + grad_clip_type: norm + clipping_count: 0 + log_interval: 100 + eval_interval: 100 + +# Model Loading Configuration +resume_checkpoint_path: # Path to resume training from checkpoint +parameter_count: +init_checkpoint_path: # Path to initialize weights from + +model: + student_weights_dtype: "bf16" # Student model weight precision + + model_overrides: + delete_old_checkpoints: true # Clean up old checkpoints to save disk space + save_interval_seconds: 12900 # Save checkpoint every ~3.5 hours + save_interval: 1e+9 # Save checkpoint every 1B steps (effectively disabled) + save_checkpoint_when_done: true # Save final checkpoint when training completes + + # Architecture modifications for student model + model_config_overrides: + ffn: + - intermediate_size: 256 + replace_with_linear: false # Replace with simple linear layer (true/false) + no_op: false # Disable FFN entirely (true/false) + attention: + - n_heads_in_group: 8 # Number of heads per group (for GQA) + replace_with_linear: false # Replace attention with linear layer (true/false) + no_op: false # Disable attention entirely (true/false) + # Sliding window attention length. Commenting this line so that the default value will be used. + #window_length: ??? + +# Model Factory Configuration - Controls student model creation and initialization +model_factory: + factory: gqa_factory_fn # Factory function for creating GQA (Grouped Query Attention) models + block_loss_func: normalized_mse_loss # Loss function for comparing teacher/student blocks. vectorwise_normalized_mse_loss / batched_normalized_mse_loss / normalized_mse_loss + blocks_to_copy_indexes: # Which teacher blocks to copy unchanged (null = determine automatically) + gqa_init_mode: AverageKV # How to initialize K/V heads in GQA. All options here: GQAInitMode + mlp_init_mode: Truncate # MLP initialization. All options here: MlpInitMode + mlp_init_config: # Configuration for MLP initialization (if needed) + activations_log_dir: # Directory with activation statistics (required for PruneByActivationsLog) + linear_init_mode: FromTeacher # How to initialize linear layers: FromTeacher, Random, etc. + student_module_for_bypass: block # Which module to train as student. + submodule_for_loss_calculation: # Specific submodule for loss calc. + keys_to_learn: # What parameters to train. Either "entire_block", or specific submodules. 
Computed dynamically. + +# Validation Configuration +disable_initial_validate: false +validate_teacher_model: true +validate_student_model: true +disable_validation: false # Disable all validation (TODO: Not working yet) +best_val_loss: 1e+9 # Track best validation loss achieved + +# Performance Optimization +compile: false # Use PyTorch compilation (TODO: CURRENTLY NOT WORKING) +disable_fa2: false # Disable Flash Attention 2 (false = use FA2 if available) +teacher_model_load_on_cpu: false + +# Checkpoint Management +save_checkpoint_before_training: true # Save initial checkpoint before training +disable_checkpoint_save: false # Disable all checkpoint saving +save_best_ckpt: true # Save checkpoint when validation improves +kill_after_first_save: false # Exit after first checkpoint save (for testing) +realize_best_or_latest: "latest" + +# Experiment Tracking (Weights & Biases) +wandb_log: false # Enable wandb logging +wandb: + entity: ??? # Must be set: wandb team/user name + mode: ??? # Must be set: "online", "offline", or "disabled" + project: ??? # Must be set: wandb project name + run_name: ??? # Must be set: name for this specific run diff --git a/tests/gpu/torch/_compress/resources/configs/bypass/llama-3_1-8b_bypass.yaml b/tests/gpu/torch/_compress/resources/configs/bypass/llama-3_1-8b_bypass.yaml new file mode 100644 index 000000000..87341e72d --- /dev/null +++ b/tests/gpu/torch/_compress/resources/configs/bypass/llama-3_1-8b_bypass.yaml @@ -0,0 +1,38 @@ +defaults: + - bypass_distillation_defaults + +# Model & Runtime Configuration + +# Data type for model weights and computations (bfloat16 for efficiency) +dtype: "bf16" + +# Unique identifier for this experiment (must be set when running) +experiment_id: + +# Data Configuration Overrides +data: + max_eval_samples: 10 + +# Model Factory Configuration +model_factory: + mlp_init_mode: PruneByActivationsLog + + mlp_init_config: + # REQUIRED: Path to directory containing activation statistics/logs + # This should point to precomputed activation data. + # Replace with the directory you want to init your FFN from. 
+ # Example path for NRT cluster: /lustre/fs1/portfolios/llmservice/projects/llmservice_deci_vlm/users/tkeren/puzzle/lior_exp/puzzle_kd-hidden-dim-4096_tokens-5e9_logits/pruning/pruning_scores/ffn_iterative/20000samples_diverse_mini + activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/ffn_iterative/100samples_diverse_mini + +disable_initial_validate: false + +save_checkpoint_before_training: false + +wandb_log: false +wandb: + # Organization/team name in wandb + entity: nv-aim + # Project name for organizing related experiments + project: puzzletron_bypass_distillation + mode: online + run_name: ${..experiment_id} diff --git a/tests/gpu/torch/_compress/resources/configs/pruning/attn_pruning.yaml b/tests/gpu/torch/_compress/resources/configs/pruning/attn_pruning.yaml new file mode 100644 index 000000000..01886607e --- /dev/null +++ b/tests/gpu/torch/_compress/resources/configs/pruning/attn_pruning.yaml @@ -0,0 +1,16 @@ +defaults: + - pruning_defaults + +activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/attn_${pruning.activation_hooks_kwargs.method}/${pruning.experiment_id} + +activation_hooks_kwargs: + method: independent_kv_head_contribution + optimize_for: memory # IndependentKvHeadContributionHook implementation that consumes less memory + target_layer: "self_attn.o_proj" + layer_input_descriptors_path: + +# n_heads_in_group: 4 +# num_attention_heads: 32 # num query heads +# num_kv_heads: 32 / 4 = 8 # num_query_heads // n_heads_in_group +n_heads_in_group_list: [8, 16, 32] # num_kv_heads = [4, 2, 1] +gqa_init_mode: "PruneKVHeads" diff --git a/tests/gpu/torch/_compress/resources/configs/pruning/ffn_pruning.yaml b/tests/gpu/torch/_compress/resources/configs/pruning/ffn_pruning.yaml new file mode 100644 index 000000000..f0c852eec --- /dev/null +++ b/tests/gpu/torch/_compress/resources/configs/pruning/ffn_pruning.yaml @@ -0,0 +1,12 @@ +defaults: + - pruning_defaults + +activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/ffn_${pruning.activation_hooks_kwargs.method}/${pruning.experiment_id} + +activation_hooks_kwargs: + method: iterative + target_layer: "mlp.down_proj" + layer_input_descriptors_path: + +intermediate_size_list: [256] # teacher_intermediate_size is 14336 +mlp_init_mode: "PruneByActivationsLog" diff --git a/tests/gpu/torch/_compress/resources/configs/pruning/hidden_dim_pruning.yaml b/tests/gpu/torch/_compress/resources/configs/pruning/hidden_dim_pruning.yaml new file mode 100644 index 000000000..407c835d8 --- /dev/null +++ b/tests/gpu/torch/_compress/resources/configs/pruning/hidden_dim_pruning.yaml @@ -0,0 +1,15 @@ +defaults: + - pruning_defaults + +activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/hidden_dim_${pruning.activation_hooks_kwargs.method}/${pruning.experiment_id} + +activation_hooks_kwargs: + method: layer_norm_contribution + target_layer: "layernorm" + +# Hidden dimension pruning specific settings +hidden_size_list: [3072, 2048] # Target hidden sizes to prune to +hidden_size_init_mode: "PruneByChannelRanking" +mlp_init_mode: "Truncate" # TODO, make it work with CopyAsIs/FromTeacher +gqa_init_mode: "AverageKV" # TODO, make it work with CopyAsIs/FromTeacher +linear_init_mode: "FromTeacher" diff --git a/tests/gpu/torch/_compress/resources/configs/pruning/pruning_defaults.yaml b/tests/gpu/torch/_compress/resources/configs/pruning/pruning_defaults.yaml new file mode 100644 index 000000000..0a5eafcff --- /dev/null +++ b/tests/gpu/torch/_compress/resources/configs/pruning/pruning_defaults.yaml @@ -0,0 +1,32 @@ +defaults: + - 
/validate_model_defaults + +model_name_or_path: ${teacher_dir} +experiment_id: ${pruning.eval_samples}samples_diverse_mini +activations_log_dir: ??? +activation_hooks_kwargs: ??? + +# Data: +eval_samples: 100 +micro_batch_size: 4 +dataset_path: ${dataset_path} +val_dataset_name: train + +# Prune ckpts +pruned_ckpts_outpt_dir: ${puzzle_dir}/pruning/${pruning.experiment_id} + +## FFN pruning +ffn_list: +mlp_init_mode: "Truncate" + +## KV-heads pruning +n_heads_in_group_list: +gqa_init_mode: "AverageKV" + +## Hidden dimension pruning +hidden_size_list: +hidden_size_init_mode: "PruneByChannelRanking" +linear_init_mode: "FromTeacher" + +mlp_init_config_yaml: + activations_log_dir: ${pruning.activations_log_dir} diff --git a/tests/gpu/torch/_compress/resources/configs/validate_model_defaults.yaml b/tests/gpu/torch/_compress/resources/configs/validate_model_defaults.yaml new file mode 100644 index 000000000..046ff51f6 --- /dev/null +++ b/tests/gpu/torch/_compress/resources/configs/validate_model_defaults.yaml @@ -0,0 +1,15 @@ +block_size: 8192 +bos_rate: 0.5 +data_column: conversation +val_dataset_name: train +shuffle_seed: 81436 +seed: 42 +fim_rate: 0 +fim_spm_rate: 0 +source_datasets_to_discard: +varlen: false +write_results: false +calc_losses_on_cpu: false +activations_log_dir: +model_name_or_path: +load_dataset_fn: ${get_object:utils.data.dataloaders.load_from_disk_fn} diff --git a/tests/gpu/torch/_compress/resources/configs/validate_solutions_defaults.yaml b/tests/gpu/torch/_compress/resources/configs/validate_solutions_defaults.yaml new file mode 100644 index 000000000..ec1390237 --- /dev/null +++ b/tests/gpu/torch/_compress/resources/configs/validate_solutions_defaults.yaml @@ -0,0 +1,10 @@ +defaults: + - /validate_model_defaults + - _self_ + +solutions_to_validate: +skip_validation: false +save_models: false +bigger_is_better: false +sort_solutions_by: +calculate_full_score_ablations: false From 0bc5d8492886376eda41fb9235081886b1e2ea24 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Mon, 27 Oct 2025 12:21:50 +0100 Subject: [PATCH 04/81] Tokenizer used by test_compress.py test. 
Signed-off-by: Daniel Korzekwa --- .../tokenizer/special_tokens_map.json | 16 ++ .../resources/tokenizer/tokenizer.json | 212 ++++++++++++++++++ .../resources/tokenizer/tokenizer_config.json | 13 ++ 3 files changed, 241 insertions(+) create mode 100644 tests/gpu/torch/_compress/resources/tokenizer/special_tokens_map.json create mode 100644 tests/gpu/torch/_compress/resources/tokenizer/tokenizer.json create mode 100644 tests/gpu/torch/_compress/resources/tokenizer/tokenizer_config.json diff --git a/tests/gpu/torch/_compress/resources/tokenizer/special_tokens_map.json b/tests/gpu/torch/_compress/resources/tokenizer/special_tokens_map.json new file mode 100644 index 000000000..02ee80b61 --- /dev/null +++ b/tests/gpu/torch/_compress/resources/tokenizer/special_tokens_map.json @@ -0,0 +1,16 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tests/gpu/torch/_compress/resources/tokenizer/tokenizer.json b/tests/gpu/torch/_compress/resources/tokenizer/tokenizer.json new file mode 100644 index 000000000..83592e249 --- /dev/null +++ b/tests/gpu/torch/_compress/resources/tokenizer/tokenizer.json @@ -0,0 +1,212 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [], + "normalizer": null, + "pre_tokenizer": { + "type": "Sequence", + "pretokenizers": [ + { + "type": "Split", + "pattern": { + "Regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" + }, + "behavior": "Isolated", + "invert": false + }, + { + "type": "ByteLevel", + "add_prefix_space": false, + "trim_offsets": true, + "use_regex": false + } + ] + }, + "post_processor": { + "type": "Sequence", + "processors": [ + { + "type": "ByteLevel", + "add_prefix_space": true, + "trim_offsets": false, + "use_regex": true + }, + { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "<|begin_of_text|>", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "<|begin_of_text|>", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "<|begin_of_text|>", + "type_id": 1 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "<|begin_of_text|>": { + "id": "<|begin_of_text|>", + "ids": [ + 100 + ], + "tokens": [ + "<|begin_of_text|>" + ] + } + } + } + ] + }, + "decoder": { + "type": "ByteLevel", + "add_prefix_space": true, + "trim_offsets": true, + "use_regex": true + }, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": null, + "continuing_subword_prefix": null, + "end_of_word_suffix": null, + "fuse_unk": false, + "byte_fallback": false, + "ignore_merges": true, + "vocab": { + "!": 0, + "\"": 1, + "#": 2, + "$": 3, + "%": 4, + "&": 5, + "'": 6, + "(": 7, + ")": 8, + "*": 9, + "+": 10, + ",": 11, + "-": 12, + ".": 13, + "/": 14, + "0": 15, + "1": 16, + "2": 17, + "3": 18, + "4": 19, + "5": 20, + "6": 21, + "7": 22, + "8": 23, + "9": 24, + ":": 25, + ";": 26, + "<": 27, + "=": 28, + ">": 29, + "?": 30, + "@": 31, + "A": 32, + "B": 33, + "C": 34, + "D": 35, + "E": 36, + "F": 37, + "G": 38, + "H": 39, + "I": 40, + "J": 41, + "K": 42, + "L": 43, + "M": 44, + "N": 45, + "O": 46, + "P": 47, + "Q": 48, + 
"R": 49, + "S": 50, + "T": 51, + "U": 52, + "V": 53, + "W": 54, + "X": 55, + "Y": 56, + "Z": 57, + "[": 58, + "\\": 59, + "]": 60, + "^": 61, + "_": 62, + "`": 63, + "a": 64, + "b": 65, + "c": 66, + "d": 67, + "e": 68, + "f": 69, + "g": 70, + "h": 71, + "i": 72, + "j": 73, + "k": 74, + "l": 75, + "m": 76, + "n": 77, + "o": 78, + "p": 79, + "q": 80, + "r": 81, + "s": 82, + "t": 83, + "u": 84, + "v": 85, + "w": 86, + "x": 87, + "y": 88, + "z": 89, + "{": 90, + "|": 91, + "}": 92, + "~": 93, + "¡": 94, + "¢": 95, + "£": 96, + "¤": 97, + "¥": 98, + "¦": 99, + "<|begin_of_text|>": 100, + "<|eot_id|>": 101 + }, + "merges": [] + } +} diff --git a/tests/gpu/torch/_compress/resources/tokenizer/tokenizer_config.json b/tests/gpu/torch/_compress/resources/tokenizer/tokenizer_config.json new file mode 100644 index 000000000..754d9e8db --- /dev/null +++ b/tests/gpu/torch/_compress/resources/tokenizer/tokenizer_config.json @@ -0,0 +1,13 @@ +{ + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "tokenizer_class": "PreTrainedTokenizer" +} From 87d4fa5930e79b15fe2751940e519eb937600a00 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Mon, 27 Oct 2025 12:22:30 +0100 Subject: [PATCH 05/81] Tokenizer utility used by test_compress.py test Signed-off-by: Daniel Korzekwa --- .../resources/tokenizer/truncate_tokenizer.py | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 tests/gpu/torch/_compress/resources/tokenizer/truncate_tokenizer.py diff --git a/tests/gpu/torch/_compress/resources/tokenizer/truncate_tokenizer.py b/tests/gpu/torch/_compress/resources/tokenizer/truncate_tokenizer.py new file mode 100644 index 000000000..baac5e14c --- /dev/null +++ b/tests/gpu/torch/_compress/resources/tokenizer/truncate_tokenizer.py @@ -0,0 +1,42 @@ +import json + +# Path to your original and new tokenizer.json +in_path = "./tokenizer.json" +out_path = "./tokenizer_truncated.json" + +# How many top tokens to keep +NUM_TO_KEEP = 100 + +with open(in_path, encoding="utf-8") as f: + tokenizer_data = json.load(f) + +# Get and sort the original vocab by index (frequency proxy) +orig_vocab = tokenizer_data["model"]["vocab"] + +# Sort tokens by their original index (lowest index = assumed most common/important) +sorted_tokens = sorted(orig_vocab.items(), key=lambda item: item[1]) + +# Keep the top N tokens +tokens_to_keep = [tok for tok, idx in sorted_tokens[:NUM_TO_KEEP]] + +# Re-index the selected tokens: 0..N-1 
+small_vocab = {tok: i for i, tok in enumerate(tokens_to_keep)} +tokenizer_data["model"]["vocab"] = small_vocab + +# Update vocab size +if "vocab_size" in tokenizer_data["model"]: + tokenizer_data["model"]["vocab_size"] = len(small_vocab) + +# Optionally remove merges if present and unneeded (mostly for BPE/WordPiece) +if "merges" in tokenizer_data["model"]: + tokenizer_data["model"]["merges"] = [] + +# Remove added_tokens if not needed +if "added_tokens" in tokenizer_data: + tokenizer_data["added_tokens"] = [] + +# Write out the truncated tokenizer.json +with open(out_path, "w", encoding="utf-8") as f: + json.dump(tokenizer_data, f, indent=2, ensure_ascii=False) + +print(f"Truncated tokenizer saved to: {out_path}") From ced1e997431c8d88e452dd56804adcca67d88bca Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Mon, 27 Oct 2025 12:22:46 +0100 Subject: [PATCH 06/81] e2e tests for compress.py Signed-off-by: Daniel Korzekwa --- tests/gpu/torch/_compress/test_compress.py | 186 +++++++++++++++++++++ 1 file changed, 186 insertions(+) create mode 100644 tests/gpu/torch/_compress/test_compress.py diff --git a/tests/gpu/torch/_compress/test_compress.py b/tests/gpu/torch/_compress/test_compress.py new file mode 100644 index 000000000..ddcea6aaf --- /dev/null +++ b/tests/gpu/torch/_compress/test_compress.py @@ -0,0 +1,186 @@ +import datetime +import os +import os.path as osp +import shutil +from pathlib import Path + +import pytest +import torch +from logger import mprint +from puzzle_tools.hydra_utils import register_hydra_resolvers +from puzzle_tools.runtime import NativeDDP_Runtime +from scripts.convert_llama3_to_decilm import convert_llama3_to_decilm +from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM, PreTrainedTokenizerBase + +from modelopt.torch._compress import compress +from tests.integration.puzzle_tools.e2e_puzzletron_test.dummy_dataset import save_dummy_dataset + + +@pytest.fixture(scope="module", autouse=True) +def setup_test_module(): + register_hydra_resolvers() + + +@pytest.fixture +def project_root_path(request: pytest.FixtureRequest) -> Path: + return Path(request.config.rootpath) + + +# The e2e test to compress a model based on Local Neural Architecture Search (Mixed Integer Programing NAS search) +# using a one-click command. +# +# Note: Bypass is disabled now in the test. + +# How to run this test (currently only supported internally at Nvidia). +# +# Have both modelopt and puzzle source code in the same directory: +# /workspace/modelopt +# /workspace/puzzletron +# +# submit_job --partition interactive --time 0 \ +# --image gitlab-master.nvidia.com/deci/puzzletron:trtllm_main \ +# --workdir $MODELOPT SRC DIRECTORY --interactive --gpu 1 +# +# pip install mip +# pip install lru-dict +# +# export PYTHONPATH=$PYTHONPATH:/workspace/puzzletron/v1 +# +# ../puzzletron/v1/scripts/torch_dist_runner.sh \ +# pytest -s -v ./tests/gpu/torch/puzzletron/test_compress_model.py -o addopts="" + + +def test_compress(project_root_path): + # The input to puzzletron.compress(). 
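    # puzzle_dir must be visible to every rank: rank 0 populates it below and the
    # other ranks block on runtime.wait_for_everyone() before compress() is called.
    # hydra_config_dir points at the search-space configs added by this patch series
    # (Llama-3_1-8B.yaml plus the pruning/ and validation defaults).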
+ os.environ["WANDB_DISABLED"] = "true" + puzzle_dir = "/tmp/pytest-shared/test_compress_model" + dataset_path = osp.join(puzzle_dir, "dummy_dataset") + hydra_config_dir = osp.join( + project_root_path, + "tests/gpu/torch/_compress/resources/configs", + ) + + _runtime = NativeDDP_Runtime( + dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10) + ) + + with _runtime as runtime: + # + # Test setup + # + if runtime.global_rank == 0: + # Setup puzzle_dir and dataset + setup_puzzle_dir(puzzle_dir) + save_dummy_dataset(dataset_path) + + # + # Step 1: Create and save a teacher model to compress + # This mimics the normal pipeline where we start with a Llama model + # + tokenizer_path = osp.join( + project_root_path, "tests/gpu/torch/_compress/resources/tokenizer" + ) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) + + # Create a small Llama model (not DeciLM) to match the normal conversion pipeline + hf_ckpt_teacher_dir = "ckpts/teacher" + llama_checkpoint_path = osp.join(puzzle_dir, hf_ckpt_teacher_dir) + create_and_save_small_llama_model( + llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer + ) + + # Use the full conversion pipeline (matches normal usage) + convert_llama3_to_decilm( + input_dir=llama_checkpoint_path, + output_dir=llama_checkpoint_path, + ) + runtime.wait_for_everyone() + + # Compress the model using a one-click approach + compress.compress(hydra_config_dir, "Llama-3_1-8B", puzzle_dir, dataset_path, runtime) + + # + # Check assertions + # + if runtime.global_rank == 0: + # assertions for the score_pruning_activations step 1 + rank = int(os.environ["RANK"]) + rank_filepath = ( + f"pruning/pruning_scores/ffn_iterative/100samples_diverse_mini/rank_{rank}.pth" + ) + assert os.path.isfile(osp.join(puzzle_dir, rank_filepath)) + + # assertions for the pruning_ckpts step 2 + assert os.path.exists(osp.join(puzzle_dir, "ckpts/ffn_256_attn_no_op")) + + # assertions fo bypass distillation step 3 + # TODO: Add bypass distillation step + # assert os.path.exists(osp.join(hydra_cfg.bypass.experiment_dir, "latest/config.json")) + + # assertions for the build_library_and_stats step 4 + assert os.path.isfile(osp.join(puzzle_dir, "replacement_library.json")) + assert os.path.isfile(osp.join(puzzle_dir, "subblock_stats.json")) + + # assertions for the scoring step 5 + solution_0_filepath = osp.join( + puzzle_dir, "single_sequence_replacement_solutions--validation/solution_0.json" + ) + assert os.path.exists(solution_0_filepath) + + # assertions for the mip_and_realize_models step 6 + solution_0_ckpt_config_path = osp.join( + puzzle_dir, + "mip/puzzle_solutions/target_memory_780000MiB/solutions--checkpoints/solution_0/config.json", + ) + assert os.path.exists(solution_0_ckpt_config_path) + assert os.path.exists( + osp.join(puzzle_dir, "mip/puzzle_solutions/target_memory_780000MiB/solutions.json") + ) + + runtime.wait_for_everyone() + + mprint("PYTEST SUMMARY: test_compress_model() test has finished successfully") + + +def create_and_save_small_llama_model( + output_path: str, vocab_size: int, tokenizer: PreTrainedTokenizerBase +): + """ + Create and save a small Llama model for testing the conversion pipeline. + This mimics having a real Llama checkpoint that needs to be converted. 
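    The checkpoint is written in bfloat16 and saved together with the tokenizer and
    config, so the resulting directory can be passed directly to
    convert_llama3_to_decilm; sizing constraints (e.g. intermediate_size divisible
    by 256) are documented in the inline comments below.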
+ """ + os.makedirs(output_path, exist_ok=True) + + # Create a minimal Llama config (small for testing) + # Note: intermediate_size must be divisible by 256 per DeciLM config requirements + # Note: hidden_size must give head_dim >= 8 for Flash Attention 2 compatibility + llama_config = LlamaConfig( + vocab_size=vocab_size, + hidden_size=256, # 32 heads times 8 head_dim = 256 (matches bypass config expectations) + intermediate_size=512, # Must be divisible by 256 + num_hidden_layers=2, + num_attention_heads=32, # Matches original test + num_key_value_heads=8, # GQA: 32÷4=8 (matches original n_heads_in_group=4) + max_position_embeddings=512, + rms_norm_eps=1e-5, + rope_theta=10000.0, + attention_bias=False, + hidden_act="silu", + tie_word_embeddings=False, + ) + + # Create and save the Llama model + model = LlamaForCausalLM(llama_config) + model.to(dtype=torch.bfloat16).save_pretrained(output_path) + + # Save tokenizer + tokenizer.save_pretrained(output_path) + + # Save config + llama_config.save_pretrained(output_path) + + +def setup_puzzle_dir(puzzle_dir: str): + if Path(puzzle_dir).exists(): + shutil.rmtree(puzzle_dir) + Path(puzzle_dir).mkdir(parents=True, exist_ok=True) From 5de0bdc6846d4c707c25440c12daf55993f53169 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Mon, 27 Oct 2025 12:45:28 +0100 Subject: [PATCH 07/81] Add convert_llama3_config_to_decilm_config + unit test Signed-off-by: Daniel Korzekwa --- .../converters/convert_llama3_to_decilm.py | 136 ++++++++++++++++++ ..._convert_llama3_config_to_decilm_config.py | 45 ++++++ 2 files changed, 181 insertions(+) create mode 100644 modelopt/torch/_compress/decilm/converters/convert_llama3_to_decilm.py create mode 100644 tests/gpu/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py diff --git a/modelopt/torch/_compress/decilm/converters/convert_llama3_to_decilm.py b/modelopt/torch/_compress/decilm/converters/convert_llama3_to_decilm.py new file mode 100644 index 000000000..6cdd1f02c --- /dev/null +++ b/modelopt/torch/_compress/decilm/converters/convert_llama3_to_decilm.py @@ -0,0 +1,136 @@ +"""Convert a Llama3 model to a DeciLM model.""" + +#!/usr/bin/env python3 +from pathlib import Path + +from fire import Fire +from puzzle_tools.checkpoint_utils import copy_tokenizer +from puzzle_tools.checkpoint_utils_hf import copy_deci_lm_hf_code +from puzzle_tools.conversion_utils import convert_model_weights_to_decilm +from puzzle_tools.deci_lm_hf_code.configuration_decilm import DeciLMConfig +from transformers import LlamaConfig + +""" +example: + +python -m scripts.hf.convert_llama3_to_decilm \ + --input_dir .../meta-llama/Meta-Llama-3.1-8B-Instruct \ + --output_dir .../meta-llama/Meta-Llama-3.1-8B-Instruct--deci-hf/ +""" + + +def convert_llama3_config_to_decilm_config(config: LlamaConfig) -> DeciLMConfig: + """Convert Llama3 config to DeciLM config format.""" + print("\n=== Converting Llama3 Config to DeciLM Config ===") + + # Get dtype from config - check both dtype and torch_dtype + # Prefer dtype if it's set (not None), otherwise fall back to torch_dtype + dtype = getattr(config, "dtype", None) + if dtype is None: + dtype = getattr(config, "torch_dtype", None) + + # Convert torch.dtype to string if needed (for JSON serialization) + if dtype is not None and hasattr(dtype, "__module__") and "torch" in dtype.__module__: + dtype = str(dtype).replace("torch.", "") + + # Track which global values will be removed (moved to per-layer configs) + print("\n📝 Converting global values to per-layer block_configs:") + 
print( + f" - intermediate_size: {config.intermediate_size} → block_configs[*].ffn.intermediate_size" + ) + print( + f" - num_key_value_heads: {config.num_key_value_heads} → block_configs[*].attention.n_heads_in_group (derived)" + ) + print(f" - hidden_act: {config.hidden_act} → block_configs[*].ffn.hidden_act") + print( + f" - sliding_window: {getattr(config, 'sliding_window', None)} → block_configs[*].attention.window_length" + ) + + # Create block configs for each layer + block_configs = [] + for i in range(config.num_hidden_layers): + # Configure attention + attention_config = { + "no_op": False, + "replace_with_linear": False, + "sparsify": None, + "n_heads_in_group": config.num_attention_heads // config.num_key_value_heads, + "window_length": None, # Llama3 doesn't use sliding window by default + "num_sink_tokens": None, # Llama3 doesn't use sink attention + "use_prefill_window_in_sink_attention": False, + "unshifted_sink": False, + "mamba": None, + "llama4": None, # No Llama4 specific attention for Llama3 + } + + # Configure FFN + ffn_config = { + "no_op": False, + "replace_with_linear": False, + "sparsify": None, + "intermediate_size": config.intermediate_size, + "gated": True, # Llama3 uses SwiGLU + "hidden_act": config.hidden_act, + "moe": None, # Llama3 doesn't use MoE + } + + block_configs.append({"attention": attention_config, "ffn": ffn_config}) + + # Create DeciLM config + decilm_config = DeciLMConfig( + block_configs=block_configs, + hidden_size=config.hidden_size, + max_position_embeddings=config.max_position_embeddings, + num_attention_heads=config.num_attention_heads, + num_hidden_layers=config.num_hidden_layers, + tie_word_embeddings=config.tie_word_embeddings, + vocab_size=config.vocab_size, + rms_norm_eps=config.rms_norm_eps, + attention_bias=config.attention_bias, + o_proj_bias=config.attention_bias, # llama3 bias defined by attention_bias + rope_theta=config.rope_theta, + rope_scaling=config.rope_scaling, + position_embedding_type="rope", # Llama3 uses standard RoPE + architectures=["DeciLMForCausalLM"], + auto_map={ + "AutoConfig": "configuration_decilm.DeciLMConfig", + "AutoModelForCausalLM": "modeling_decilm.DeciLMForCausalLM", + }, + eos_token_id=config.eos_token_id, + bos_token_id=config.bos_token_id, + pad_token_id=config.pad_token_id, + head_dim=getattr(config, "head_dim", config.hidden_size // config.num_attention_heads), + dtype=dtype, + ) + + print(f"\n✓ Created DeciLM config with {len(block_configs)} layers") + print( + "✓ Global per-layer keys (intermediate_size, num_key_value_heads, hidden_act, sliding_window)" + ) + print(" will be removed from saved config and are only in block_configs") + + return decilm_config + + +def convert_configs_in_dirs(input_dir, output_dir): + """Convert the config of a Llama3 model to a DeciLM model.""" + input_dir = Path(input_dir) + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + input_config_path = input_dir / "config.json" + config = LlamaConfig.from_pretrained(input_config_path) + decilm_config = convert_llama3_config_to_decilm_config(config) + decilm_config.save_pretrained(output_dir) + + +def convert_llama3_to_decilm(input_dir, output_dir): + """Convert a Llama3 model to a DeciLM model.""" + convert_configs_in_dirs(input_dir, output_dir) + copy_tokenizer(input_dir, output_dir) + convert_model_weights_to_decilm(input_dir, output_dir) + copy_deci_lm_hf_code(output_dir) + + +if __name__ == "__main__": + Fire(convert_llama3_to_decilm) diff --git 
a/tests/gpu/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py b/tests/gpu/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py new file mode 100644 index 000000000..4bab4d505 --- /dev/null +++ b/tests/gpu/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py @@ -0,0 +1,45 @@ +import json +import os.path as osp +from pathlib import Path + +import pytest +from gpu.torch._compress.test_compress import create_and_save_small_llama_model +from transformers import AutoTokenizer + +from modelopt.torch._compress.decilm.converters.convert_llama3_to_decilm import ( + convert_llama3_to_decilm, +) + + +@pytest.fixture +def project_root_path(request: pytest.FixtureRequest) -> Path: + return Path(request.config.rootpath) + + +def test_convert_llama3_config_to_decilm_config(project_root_path: Path, tmp_path: Path): + tokenizer_path = osp.join(project_root_path, "tests/gpu/torch/_compress/resources/tokenizer") + tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) + + llama_checkpoint_path = tmp_path / "llama_checkpoint" + create_and_save_small_llama_model( + llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer + ) + + # Convert the Llama model to a DeciLM model + decilm_checkpoint_path = tmp_path / "decilm_checkpoint" + convert_llama3_to_decilm( + input_dir=llama_checkpoint_path, + output_dir=decilm_checkpoint_path, + ) + + # Assert that the converted config has the correct number of block_configs + config_path = decilm_checkpoint_path / "config.json" + assert config_path.exists(), f"Config file not found at {config_path}" + + with open(config_path) as f: + decilm_config = json.load(f) + + # Verify block_configs exists and has the correct length + assert "block_configs" in decilm_config, "block_configs not found in converted config" + actual_num_block_configs = len(decilm_config["block_configs"]) + assert actual_num_block_configs == 2 From 800414c275994ac44ac9881d5839f0e9a2aa0c1e Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Mon, 27 Oct 2025 15:35:51 +0100 Subject: [PATCH 08/81] Remove unused bypass distillation config files. 
Signed-off-by: Daniel Korzekwa --- modelopt/torch/_compress/compress.py | 4 - .../resources/configs/Llama-3_1-8B.yaml | 2 +- .../bypass/bypass_distillation_defaults.yaml | 116 ------------------ .../configs/bypass/llama-3_1-8b_bypass.yaml | 38 ------ 4 files changed, 1 insertion(+), 159 deletions(-) delete mode 100644 tests/gpu/torch/_compress/resources/configs/bypass/bypass_distillation_defaults.yaml delete mode 100644 tests/gpu/torch/_compress/resources/configs/bypass/llama-3_1-8b_bypass.yaml diff --git a/modelopt/torch/_compress/compress.py b/modelopt/torch/_compress/compress.py index c0661259c..97819a42b 100644 --- a/modelopt/torch/_compress/compress.py +++ b/modelopt/torch/_compress/compress.py @@ -53,10 +53,6 @@ def compress( pruning_ckpts.launch_prune_ckpt(hydra_cfg) runtime.wait_for_everyone() - # # Step 3: bypass distillation (distributed processing) - # # TODO: Add bypass distillation step - # #run_bypassed_training(hydra_cfg, runtime) - # Step 4: build_library_and_stats (single process) if runtime.global_rank == 0: build_library_and_stats.launch_build_library_and_stats(hydra_cfg) diff --git a/tests/gpu/torch/_compress/resources/configs/Llama-3_1-8B.yaml b/tests/gpu/torch/_compress/resources/configs/Llama-3_1-8B.yaml index 98c7b746c..1d8fac655 100644 --- a/tests/gpu/torch/_compress/resources/configs/Llama-3_1-8B.yaml +++ b/tests/gpu/torch/_compress/resources/configs/Llama-3_1-8B.yaml @@ -2,7 +2,7 @@ defaults: - pruning: ffn_pruning - scoring: ../validate_solutions_defaults - realize_model: ../validate_solutions_defaults - - bypass: llama-3_1-8b_bypass + - bypass: - override hydra/hydra_logging: disabled - _self_ diff --git a/tests/gpu/torch/_compress/resources/configs/bypass/bypass_distillation_defaults.yaml b/tests/gpu/torch/_compress/resources/configs/bypass/bypass_distillation_defaults.yaml deleted file mode 100644 index c48f47f69..000000000 --- a/tests/gpu/torch/_compress/resources/configs/bypass/bypass_distillation_defaults.yaml +++ /dev/null @@ -1,116 +0,0 @@ -# defaults: -# - ../validate_model_defaults # TODO: Unify this default YAML with KD base YAML, for a "training defaults" configurations - -# Runtime Configuration -dtype: "bf16" # Model precision: bf16 for efficiency, fp32 for stability -seed: 42 # Random seed for reproducibility - -# Experiment Tracking -experiment_id: # Unique identifier for this experiment. 
Will be dynamically set -iter_num: 1 # Current iteration number -step_num: 1 # Current step number within iteration -token_count: 0 # Token count tracker (auto-updated during training) - -# Data Configuration -data: - data_column: "conversation" - block_size: 8192 # Sequence length (tokens per sample) - bos_rate: 0.5 - fim_rate: 0 - fim_spm_rate: 0 - source_datasets_to_discard: [] - load_from_disk: true # Load preprocessed data from disk or from stream - keep_in_memory: false - val_dataset_name: valid - max_eval_samples: 256 - eval_samples_per_process: # Samples per GPU during distributed eval (auto if null) - -# Training Configuration -training: - learning_rate: 1e-4 # Initial learning rate (1e-4 = 0.0001) - training_tokens: 1e+7 # Total training tokens (1B tokens) - micro_batch_size: 4 - val_micro_batch_size: 2 - warmup_ratio: 0.05 - warmup_steps: ${warmup_steps:${.training_tokens},${..data.block_size},${.micro_batch_size},${.warmup_ratio}} # Auto-calculated warmup steps - min_lr_factor: 1e-5 - grad_accumulation_steps: 1 - skip_first_batches: 0 - weight_decay: 0.1 - decay_lr: true - beta1: 0.9 - beta2: 0.95 - use_grad_scaling: false - grad_clip: 1.0 - grad_clip_type: norm - clipping_count: 0 - log_interval: 100 - eval_interval: 100 - -# Model Loading Configuration -resume_checkpoint_path: # Path to resume training from checkpoint -parameter_count: -init_checkpoint_path: # Path to initialize weights from - -model: - student_weights_dtype: "bf16" # Student model weight precision - - model_overrides: - delete_old_checkpoints: true # Clean up old checkpoints to save disk space - save_interval_seconds: 12900 # Save checkpoint every ~3.5 hours - save_interval: 1e+9 # Save checkpoint every 1B steps (effectively disabled) - save_checkpoint_when_done: true # Save final checkpoint when training completes - - # Architecture modifications for student model - model_config_overrides: - ffn: - - intermediate_size: 256 - replace_with_linear: false # Replace with simple linear layer (true/false) - no_op: false # Disable FFN entirely (true/false) - attention: - - n_heads_in_group: 8 # Number of heads per group (for GQA) - replace_with_linear: false # Replace attention with linear layer (true/false) - no_op: false # Disable attention entirely (true/false) - # Sliding window attention length. Commenting this line so that the default value will be used. - #window_length: ??? - -# Model Factory Configuration - Controls student model creation and initialization -model_factory: - factory: gqa_factory_fn # Factory function for creating GQA (Grouped Query Attention) models - block_loss_func: normalized_mse_loss # Loss function for comparing teacher/student blocks. vectorwise_normalized_mse_loss / batched_normalized_mse_loss / normalized_mse_loss - blocks_to_copy_indexes: # Which teacher blocks to copy unchanged (null = determine automatically) - gqa_init_mode: AverageKV # How to initialize K/V heads in GQA. All options here: GQAInitMode - mlp_init_mode: Truncate # MLP initialization. All options here: MlpInitMode - mlp_init_config: # Configuration for MLP initialization (if needed) - activations_log_dir: # Directory with activation statistics (required for PruneByActivationsLog) - linear_init_mode: FromTeacher # How to initialize linear layers: FromTeacher, Random, etc. - student_module_for_bypass: block # Which module to train as student. - submodule_for_loss_calculation: # Specific submodule for loss calc. - keys_to_learn: # What parameters to train. Either "entire_block", or specific submodules. 
Computed dynamically. - -# Validation Configuration -disable_initial_validate: false -validate_teacher_model: true -validate_student_model: true -disable_validation: false # Disable all validation (TODO: Not working yet) -best_val_loss: 1e+9 # Track best validation loss achieved - -# Performance Optimization -compile: false # Use PyTorch compilation (TODO: CURRENTLY NOT WORKING) -disable_fa2: false # Disable Flash Attention 2 (false = use FA2 if available) -teacher_model_load_on_cpu: false - -# Checkpoint Management -save_checkpoint_before_training: true # Save initial checkpoint before training -disable_checkpoint_save: false # Disable all checkpoint saving -save_best_ckpt: true # Save checkpoint when validation improves -kill_after_first_save: false # Exit after first checkpoint save (for testing) -realize_best_or_latest: "latest" - -# Experiment Tracking (Weights & Biases) -wandb_log: false # Enable wandb logging -wandb: - entity: ??? # Must be set: wandb team/user name - mode: ??? # Must be set: "online", "offline", or "disabled" - project: ??? # Must be set: wandb project name - run_name: ??? # Must be set: name for this specific run diff --git a/tests/gpu/torch/_compress/resources/configs/bypass/llama-3_1-8b_bypass.yaml b/tests/gpu/torch/_compress/resources/configs/bypass/llama-3_1-8b_bypass.yaml deleted file mode 100644 index 87341e72d..000000000 --- a/tests/gpu/torch/_compress/resources/configs/bypass/llama-3_1-8b_bypass.yaml +++ /dev/null @@ -1,38 +0,0 @@ -defaults: - - bypass_distillation_defaults - -# Model & Runtime Configuration - -# Data type for model weights and computations (bfloat16 for efficiency) -dtype: "bf16" - -# Unique identifier for this experiment (must be set when running) -experiment_id: - -# Data Configuration Overrides -data: - max_eval_samples: 10 - -# Model Factory Configuration -model_factory: - mlp_init_mode: PruneByActivationsLog - - mlp_init_config: - # REQUIRED: Path to directory containing activation statistics/logs - # This should point to precomputed activation data. - # Replace with the directory you want to init your FFN from. 
- # Example path for NRT cluster: /lustre/fs1/portfolios/llmservice/projects/llmservice_deci_vlm/users/tkeren/puzzle/lior_exp/puzzle_kd-hidden-dim-4096_tokens-5e9_logits/pruning/pruning_scores/ffn_iterative/20000samples_diverse_mini - activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/ffn_iterative/100samples_diverse_mini - -disable_initial_validate: false - -save_checkpoint_before_training: false - -wandb_log: false -wandb: - # Organization/team name in wandb - entity: nv-aim - # Project name for organizing related experiments - project: puzzletron_bypass_distillation - mode: online - run_name: ${..experiment_id} From 16abcc9f1643ac372854afcd816b8a37e6356fed Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Mon, 27 Oct 2025 15:38:33 +0100 Subject: [PATCH 09/81] Moving integration tests to tests/experimental to not trigger CICD Signed-off-by: Daniel Korzekwa --- .../torch/_compress/resources/configs/Llama-3_1-8B.yaml | 0 .../_compress/resources/configs/pruning/attn_pruning.yaml | 0 .../_compress/resources/configs/pruning/ffn_pruning.yaml | 0 .../resources/configs/pruning/hidden_dim_pruning.yaml | 0 .../_compress/resources/configs/pruning/pruning_defaults.yaml | 0 .../_compress/resources/configs/validate_model_defaults.yaml | 0 .../resources/configs/validate_solutions_defaults.yaml | 0 .../_compress/resources/tokenizer/special_tokens_map.json | 0 .../torch/_compress/resources/tokenizer/tokenizer.json | 0 .../torch/_compress/resources/tokenizer/tokenizer_config.json | 0 .../torch/_compress/resources/tokenizer/truncate_tokenizer.py | 0 tests/{gpu => experimental}/torch/_compress/test_compress.py | 4 ++-- 12 files changed, 2 insertions(+), 2 deletions(-) rename tests/{gpu => experimental}/torch/_compress/resources/configs/Llama-3_1-8B.yaml (100%) rename tests/{gpu => experimental}/torch/_compress/resources/configs/pruning/attn_pruning.yaml (100%) rename tests/{gpu => experimental}/torch/_compress/resources/configs/pruning/ffn_pruning.yaml (100%) rename tests/{gpu => experimental}/torch/_compress/resources/configs/pruning/hidden_dim_pruning.yaml (100%) rename tests/{gpu => experimental}/torch/_compress/resources/configs/pruning/pruning_defaults.yaml (100%) rename tests/{gpu => experimental}/torch/_compress/resources/configs/validate_model_defaults.yaml (100%) rename tests/{gpu => experimental}/torch/_compress/resources/configs/validate_solutions_defaults.yaml (100%) rename tests/{gpu => experimental}/torch/_compress/resources/tokenizer/special_tokens_map.json (100%) rename tests/{gpu => experimental}/torch/_compress/resources/tokenizer/tokenizer.json (100%) rename tests/{gpu => experimental}/torch/_compress/resources/tokenizer/tokenizer_config.json (100%) rename tests/{gpu => experimental}/torch/_compress/resources/tokenizer/truncate_tokenizer.py (100%) rename tests/{gpu => experimental}/torch/_compress/test_compress.py (97%) diff --git a/tests/gpu/torch/_compress/resources/configs/Llama-3_1-8B.yaml b/tests/experimental/torch/_compress/resources/configs/Llama-3_1-8B.yaml similarity index 100% rename from tests/gpu/torch/_compress/resources/configs/Llama-3_1-8B.yaml rename to tests/experimental/torch/_compress/resources/configs/Llama-3_1-8B.yaml diff --git a/tests/gpu/torch/_compress/resources/configs/pruning/attn_pruning.yaml b/tests/experimental/torch/_compress/resources/configs/pruning/attn_pruning.yaml similarity index 100% rename from tests/gpu/torch/_compress/resources/configs/pruning/attn_pruning.yaml rename to 
tests/experimental/torch/_compress/resources/configs/pruning/attn_pruning.yaml diff --git a/tests/gpu/torch/_compress/resources/configs/pruning/ffn_pruning.yaml b/tests/experimental/torch/_compress/resources/configs/pruning/ffn_pruning.yaml similarity index 100% rename from tests/gpu/torch/_compress/resources/configs/pruning/ffn_pruning.yaml rename to tests/experimental/torch/_compress/resources/configs/pruning/ffn_pruning.yaml diff --git a/tests/gpu/torch/_compress/resources/configs/pruning/hidden_dim_pruning.yaml b/tests/experimental/torch/_compress/resources/configs/pruning/hidden_dim_pruning.yaml similarity index 100% rename from tests/gpu/torch/_compress/resources/configs/pruning/hidden_dim_pruning.yaml rename to tests/experimental/torch/_compress/resources/configs/pruning/hidden_dim_pruning.yaml diff --git a/tests/gpu/torch/_compress/resources/configs/pruning/pruning_defaults.yaml b/tests/experimental/torch/_compress/resources/configs/pruning/pruning_defaults.yaml similarity index 100% rename from tests/gpu/torch/_compress/resources/configs/pruning/pruning_defaults.yaml rename to tests/experimental/torch/_compress/resources/configs/pruning/pruning_defaults.yaml diff --git a/tests/gpu/torch/_compress/resources/configs/validate_model_defaults.yaml b/tests/experimental/torch/_compress/resources/configs/validate_model_defaults.yaml similarity index 100% rename from tests/gpu/torch/_compress/resources/configs/validate_model_defaults.yaml rename to tests/experimental/torch/_compress/resources/configs/validate_model_defaults.yaml diff --git a/tests/gpu/torch/_compress/resources/configs/validate_solutions_defaults.yaml b/tests/experimental/torch/_compress/resources/configs/validate_solutions_defaults.yaml similarity index 100% rename from tests/gpu/torch/_compress/resources/configs/validate_solutions_defaults.yaml rename to tests/experimental/torch/_compress/resources/configs/validate_solutions_defaults.yaml diff --git a/tests/gpu/torch/_compress/resources/tokenizer/special_tokens_map.json b/tests/experimental/torch/_compress/resources/tokenizer/special_tokens_map.json similarity index 100% rename from tests/gpu/torch/_compress/resources/tokenizer/special_tokens_map.json rename to tests/experimental/torch/_compress/resources/tokenizer/special_tokens_map.json diff --git a/tests/gpu/torch/_compress/resources/tokenizer/tokenizer.json b/tests/experimental/torch/_compress/resources/tokenizer/tokenizer.json similarity index 100% rename from tests/gpu/torch/_compress/resources/tokenizer/tokenizer.json rename to tests/experimental/torch/_compress/resources/tokenizer/tokenizer.json diff --git a/tests/gpu/torch/_compress/resources/tokenizer/tokenizer_config.json b/tests/experimental/torch/_compress/resources/tokenizer/tokenizer_config.json similarity index 100% rename from tests/gpu/torch/_compress/resources/tokenizer/tokenizer_config.json rename to tests/experimental/torch/_compress/resources/tokenizer/tokenizer_config.json diff --git a/tests/gpu/torch/_compress/resources/tokenizer/truncate_tokenizer.py b/tests/experimental/torch/_compress/resources/tokenizer/truncate_tokenizer.py similarity index 100% rename from tests/gpu/torch/_compress/resources/tokenizer/truncate_tokenizer.py rename to tests/experimental/torch/_compress/resources/tokenizer/truncate_tokenizer.py diff --git a/tests/gpu/torch/_compress/test_compress.py b/tests/experimental/torch/_compress/test_compress.py similarity index 97% rename from tests/gpu/torch/_compress/test_compress.py rename to 
tests/experimental/torch/_compress/test_compress.py index ddcea6aaf..565c94423 100644 --- a/tests/gpu/torch/_compress/test_compress.py +++ b/tests/experimental/torch/_compress/test_compress.py @@ -57,7 +57,7 @@ def test_compress(project_root_path): dataset_path = osp.join(puzzle_dir, "dummy_dataset") hydra_config_dir = osp.join( project_root_path, - "tests/gpu/torch/_compress/resources/configs", + "tests/experimental/torch/_compress/resources/configs", ) _runtime = NativeDDP_Runtime( @@ -78,7 +78,7 @@ def test_compress(project_root_path): # This mimics the normal pipeline where we start with a Llama model # tokenizer_path = osp.join( - project_root_path, "tests/gpu/torch/_compress/resources/tokenizer" + project_root_path, "tests/experimental/torch/_compress/resources/tokenizer" ) tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) From a5ba1c7023aa471304a1643261cfb6ce8101be67 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Mon, 27 Oct 2025 15:44:59 +0100 Subject: [PATCH 10/81] update docs Signed-off-by: Daniel Korzekwa --- .../_compress/resources/tokenizer/truncate_tokenizer.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/experimental/torch/_compress/resources/tokenizer/truncate_tokenizer.py b/tests/experimental/torch/_compress/resources/tokenizer/truncate_tokenizer.py index baac5e14c..1c911ac76 100644 --- a/tests/experimental/torch/_compress/resources/tokenizer/truncate_tokenizer.py +++ b/tests/experimental/torch/_compress/resources/tokenizer/truncate_tokenizer.py @@ -1,3 +1,8 @@ +""" +This script was used to truncate the tokenizer.json file from Llama 3.1 8B model +to keep only the top 100 most common tokens. +""" + import json # Path to your original and new tokenizer.json From 1bda391134370c65ddb600eacb67f81760f709ae Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Mon, 27 Oct 2025 16:34:42 +0100 Subject: [PATCH 11/81] Replace mprint with print and replace osp.join with path1 / path2 notation. Signed-off-by: Daniel Korzekwa --- .../torch/_compress/test_compress.py | 50 +++++++++---------- 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/tests/experimental/torch/_compress/test_compress.py b/tests/experimental/torch/_compress/test_compress.py index 565c94423..c6547847e 100644 --- a/tests/experimental/torch/_compress/test_compress.py +++ b/tests/experimental/torch/_compress/test_compress.py @@ -1,12 +1,10 @@ import datetime import os -import os.path as osp import shutil from pathlib import Path import pytest import torch -from logger import mprint from puzzle_tools.hydra_utils import register_hydra_resolvers from puzzle_tools.runtime import NativeDDP_Runtime from scripts.convert_llama3_to_decilm import convert_llama3_to_decilm @@ -53,12 +51,9 @@ def project_root_path(request: pytest.FixtureRequest) -> Path: def test_compress(project_root_path): # The input to puzzletron.compress(). 
os.environ["WANDB_DISABLED"] = "true" - puzzle_dir = "/tmp/pytest-shared/test_compress_model" - dataset_path = osp.join(puzzle_dir, "dummy_dataset") - hydra_config_dir = osp.join( - project_root_path, - "tests/experimental/torch/_compress/resources/configs", - ) + puzzle_dir = Path("/tmp/pytest-shared/test_compress_model") + dataset_path = puzzle_dir / "dummy_dataset" + hydra_config_dir = project_root_path / "tests/experimental/torch/_compress/resources/configs" _runtime = NativeDDP_Runtime( dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10) @@ -77,14 +72,15 @@ def test_compress(project_root_path): # Step 1: Create and save a teacher model to compress # This mimics the normal pipeline where we start with a Llama model # - tokenizer_path = osp.join( - project_root_path, "tests/experimental/torch/_compress/resources/tokenizer" + tokenizer_path = ( + project_root_path / "tests/experimental/torch/_compress/resources/tokenizer" ) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) # Create a small Llama model (not DeciLM) to match the normal conversion pipeline hf_ckpt_teacher_dir = "ckpts/teacher" - llama_checkpoint_path = osp.join(puzzle_dir, hf_ckpt_teacher_dir) + llama_checkpoint_path = puzzle_dir / hf_ckpt_teacher_dir create_and_save_small_llama_model( llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer ) @@ -97,7 +93,9 @@ def test_compress(project_root_path): runtime.wait_for_everyone() # Compress the model using a one-click approach - compress.compress(hydra_config_dir, "Llama-3_1-8B", puzzle_dir, dataset_path, runtime) + compress.compress( + str(hydra_config_dir), "Llama-3_1-8B", str(puzzle_dir), str(dataset_path), runtime + ) # # Check assertions @@ -108,38 +106,36 @@ def test_compress(project_root_path): rank_filepath = ( f"pruning/pruning_scores/ffn_iterative/100samples_diverse_mini/rank_{rank}.pth" ) - assert os.path.isfile(osp.join(puzzle_dir, rank_filepath)) + assert os.path.isfile(puzzle_dir / rank_filepath) # assertions for the pruning_ckpts step 2 - assert os.path.exists(osp.join(puzzle_dir, "ckpts/ffn_256_attn_no_op")) - - # assertions fo bypass distillation step 3 - # TODO: Add bypass distillation step - # assert os.path.exists(osp.join(hydra_cfg.bypass.experiment_dir, "latest/config.json")) + assert os.path.exists(puzzle_dir / "ckpts/ffn_256_attn_no_op") # assertions for the build_library_and_stats step 4 - assert os.path.isfile(osp.join(puzzle_dir, "replacement_library.json")) - assert os.path.isfile(osp.join(puzzle_dir, "subblock_stats.json")) + assert os.path.isfile(puzzle_dir / "replacement_library.json") + assert os.path.isfile(puzzle_dir / "subblock_stats.json") # assertions for the scoring step 5 - solution_0_filepath = osp.join( - puzzle_dir, "single_sequence_replacement_solutions--validation/solution_0.json" + solution_0_filepath = ( + puzzle_dir / "single_sequence_replacement_solutions--validation/solution_0.json" ) + assert os.path.exists(solution_0_filepath) # assertions for the mip_and_realize_models step 6 - solution_0_ckpt_config_path = osp.join( - puzzle_dir, - "mip/puzzle_solutions/target_memory_780000MiB/solutions--checkpoints/solution_0/config.json", + solution_0_ckpt_config_path = ( + puzzle_dir + / "mip/puzzle_solutions/target_memory_780000MiB/solutions--checkpoints/solution_0/config.json" ) + assert os.path.exists(solution_0_ckpt_config_path) assert os.path.exists( - osp.join(puzzle_dir, "mip/puzzle_solutions/target_memory_780000MiB/solutions.json") + puzzle_dir / 
"mip/puzzle_solutions/target_memory_780000MiB/solutions.json" ) runtime.wait_for_everyone() - mprint("PYTEST SUMMARY: test_compress_model() test has finished successfully") + print("PYTEST SUMMARY: test_compress_model() test has finished successfully") def create_and_save_small_llama_model( From bb38401971709adb97ba09d1a2017150dfd3c672 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Mon, 27 Oct 2025 16:39:04 +0100 Subject: [PATCH 12/81] Refactor file checking assertions to use .is_file() and .exists() Signed-off-by: Daniel Korzekwa --- .../torch/_compress/test_compress.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tests/experimental/torch/_compress/test_compress.py b/tests/experimental/torch/_compress/test_compress.py index c6547847e..2245bbd4e 100644 --- a/tests/experimental/torch/_compress/test_compress.py +++ b/tests/experimental/torch/_compress/test_compress.py @@ -106,21 +106,22 @@ def test_compress(project_root_path): rank_filepath = ( f"pruning/pruning_scores/ffn_iterative/100samples_diverse_mini/rank_{rank}.pth" ) - assert os.path.isfile(puzzle_dir / rank_filepath) + assert (puzzle_dir / rank_filepath).is_file() # assertions for the pruning_ckpts step 2 - assert os.path.exists(puzzle_dir / "ckpts/ffn_256_attn_no_op") + assert (puzzle_dir / "ckpts/ffn_256_attn_no_op").exists() # assertions for the build_library_and_stats step 4 - assert os.path.isfile(puzzle_dir / "replacement_library.json") - assert os.path.isfile(puzzle_dir / "subblock_stats.json") + + assert (puzzle_dir / "replacement_library.json").is_file() + assert (puzzle_dir / "subblock_stats.json").is_file() # assertions for the scoring step 5 solution_0_filepath = ( puzzle_dir / "single_sequence_replacement_solutions--validation/solution_0.json" ) - assert os.path.exists(solution_0_filepath) + assert solution_0_filepath.exists() # assertions for the mip_and_realize_models step 6 solution_0_ckpt_config_path = ( @@ -128,10 +129,10 @@ def test_compress(project_root_path): / "mip/puzzle_solutions/target_memory_780000MiB/solutions--checkpoints/solution_0/config.json" ) - assert os.path.exists(solution_0_ckpt_config_path) - assert os.path.exists( + assert solution_0_ckpt_config_path.exists() + assert ( puzzle_dir / "mip/puzzle_solutions/target_memory_780000MiB/solutions.json" - ) + ).exists() runtime.wait_for_everyone() From 8415548bc9ee63fcbda02b8680a78db96630c44c Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Mon, 27 Oct 2025 17:41:51 +0100 Subject: [PATCH 13/81] Add a new dependency section to setyp.py for the modelopt.torch._compress module. Signed-off-by: Daniel Korzekwa --- setup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.py b/setup.py index 67bf114ae..cfadd5170 100644 --- a/setup.py +++ b/setup.py @@ -99,6 +99,8 @@ "setuptools>=80", "setuptools-scm>=8", ], + # Dependedencies for modelopt.torch._compress subpackage + "compress": ["fire"], } # create "compound" optional dependencies From b1b18333a3fe6abc14ca5ad92960fdcd27981161 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Mon, 27 Oct 2025 18:43:24 +0100 Subject: [PATCH 14/81] Move test_convert_llama3_config_to_decilm_config.py to tests/experimental/ folder to not be run by CICD yet. 
Signed-off-by: Daniel Korzekwa --- .../converters/test_convert_llama3_config_to_decilm_config.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/{gpu => experimental}/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py (100%) diff --git a/tests/gpu/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py b/tests/experimental/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py similarity index 100% rename from tests/gpu/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py rename to tests/experimental/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py From 6f28e4a75a0cdc66ef84c943f41b421f2d19fb5c Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Mon, 27 Oct 2025 11:20:42 -0700 Subject: [PATCH 15/81] Fix: Add missing LICENSE headers Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- modelopt/torch/_compress/compress.py | 15 +++++++++++++++ .../resources/tokenizer/truncate_tokenizer.py | 15 +++++++++++++++ .../experimental/torch/_compress/test_compress.py | 15 +++++++++++++++ 3 files changed, 45 insertions(+) diff --git a/modelopt/torch/_compress/compress.py b/modelopt/torch/_compress/compress.py index 97819a42b..a3617e37a 100644 --- a/modelopt/torch/_compress/compress.py +++ b/modelopt/torch/_compress/compress.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """ This module provides the main compression function for a model diff --git a/tests/experimental/torch/_compress/resources/tokenizer/truncate_tokenizer.py b/tests/experimental/torch/_compress/resources/tokenizer/truncate_tokenizer.py index 1c911ac76..aedcae4ab 100644 --- a/tests/experimental/torch/_compress/resources/tokenizer/truncate_tokenizer.py +++ b/tests/experimental/torch/_compress/resources/tokenizer/truncate_tokenizer.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """ This script was used to truncate the tokenizer.json file from Llama 3.1 8B model to keep only the top 100 most common tokens. 
diff --git a/tests/experimental/torch/_compress/test_compress.py b/tests/experimental/torch/_compress/test_compress.py index 2245bbd4e..452d2b6f6 100644 --- a/tests/experimental/torch/_compress/test_compress.py +++ b/tests/experimental/torch/_compress/test_compress.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import datetime import os import shutil From 016fb63c0a0283ba15a79dd5a1aa9db42f784e1e Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Tue, 28 Oct 2025 11:35:43 +0100 Subject: [PATCH 16/81] Use spawn_multiprocess_job for test_compress test (to be able to use tmp_path. Signed-off-by: Daniel Korzekwa --- modelopt/torch/_compress/compress.py | 2 +- modelopt/torch/_compress/runtime.py | 539 ++++++++++++++++++ .../torch/_compress/test_compress.py | 35 +- 3 files changed, 563 insertions(+), 13 deletions(-) create mode 100644 modelopt/torch/_compress/runtime.py diff --git a/modelopt/torch/_compress/compress.py b/modelopt/torch/_compress/compress.py index a3617e37a..265fd5eeb 100644 --- a/modelopt/torch/_compress/compress.py +++ b/modelopt/torch/_compress/compress.py @@ -43,7 +43,7 @@ def compress( puzzle_dir (str): directory with a puzzletron model to compress dataset_path (str): dataset used for scoring and distillation runtime: distributed runtime to use to run the compression steps, e.g., - NativeDDP_Runtime(dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10)) + NativeDdpRuntime(dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10)) Returns: Hydra config object after compressing the model. diff --git a/modelopt/torch/_compress/runtime.py b/modelopt/torch/_compress/runtime.py new file mode 100644 index 000000000..e46a48a18 --- /dev/null +++ b/modelopt/torch/_compress/runtime.py @@ -0,0 +1,539 @@ +import os +import random +from abc import ABC, abstractmethod +from collections.abc import Callable, Iterable, Iterator, Sequence +from contextlib import AbstractContextManager, suppress +from datetime import timedelta +from pathlib import Path +from typing import Literal, TypeVar, cast + +import numpy as np +import torch +import torch.distributed +import torch.nn as nn +from torch.utils.data import DataLoader +from tqdm import tqdm +from typing_extensions import override + +PrepareModelsT = TypeVar("PrepareModelsT", bound=Sequence[nn.Module]) +PrepareDataLoaderT = TypeVar("PrepareDataLoaderT", bound=DataLoader) +CompileT = TypeVar("CompileT", bound=nn.Module) +Filter = ( + Literal["main_process", "last", "local_main_process", "local_last", "all"] + | list[int] + | set[int] + | Callable[[int], bool] +) + + +class IRuntime(ABC): + @abstractmethod + def setup(self) -> None: ... + + @abstractmethod + def cleanup(self) -> None: ... + + @abstractmethod + def autocast(self) -> AbstractContextManager: ... + + @abstractmethod + def wait_for_everyone(self) -> None: ... 
+ + @abstractmethod + def set_seed(self, seed: int, device_specific: bool = False) -> int: ... + + @abstractmethod + def prepare_models(self, models: PrepareModelsT) -> PrepareModelsT: ... + + @abstractmethod + def prepare_train_dataloader( + self, train_dataloader: PrepareDataLoaderT + ) -> PrepareDataLoaderT: ... + + @abstractmethod + def prepare_val_dataloader(self, val_dataloader: PrepareDataLoaderT) -> PrepareDataLoaderT: ... + + @abstractmethod + def compile(self, model: CompileT) -> CompileT: ... + + @abstractmethod + def backward(self, loss: torch.Tensor) -> None: ... + + @abstractmethod + def clip_grad_norm_( + self, + parameters: Iterable[torch.Tensor] | torch.Tensor, + max_norm: float, + norm_type: float = 2, + ) -> torch.Tensor: ... + + @abstractmethod + def clip_grad_value_( + self, parameters: Iterable[torch.Tensor] | torch.Tensor, clip_value: float + ) -> None: ... + + @abstractmethod + def save_state(self, path: str | Path) -> None: ... + + @abstractmethod + def load_state(self, path: str | Path) -> None: ... + + @abstractmethod + def skip_first_batches(self, dataloader_iterator: Iterator, num_batches: int) -> None: ... + + @property + @abstractmethod + def sync_gradients(self) -> bool: ... + + @property + @abstractmethod + def device(self) -> torch.device: ... + + @property + @abstractmethod + def is_main_process(self) -> bool: ... + + @property + @abstractmethod + def is_local_main_process(self) -> bool: ... + + @property + @abstractmethod + def is_last_process(self) -> bool: ... + + @property + @abstractmethod + def is_local_last_process(self) -> bool: ... + + @property + @abstractmethod + def local_rank(self) -> int: ... + + @property + @abstractmethod + def global_rank(self) -> int: ... + + @property + @abstractmethod + def local_world_size(self) -> int: ... + + @property + @abstractmethod + def world_size(self) -> int: ... + + @property + @abstractmethod + def dtype(self) -> torch.dtype: ... 
+ + def __enter__(self): + self.setup() + return self + + def __exit__(self, exc_type, exc_value, traceback): + # avoid barrier if exceution errored + if exc_type is None: + self.cleanup() + + # if exc_type is not None: + # raise exc_value + # Handle exceptions if necessary + # pass + + # def __del__(self): + # torch.distributed.barrier() + # torch.distributed.destroy_process_group() + + def check_filter(self, filter_: Filter): + return ( + filter_ == "all" + or (filter_ == "main_process" and self.is_main_process) + or (filter_ == "local_main_process" and self.is_local_main_process) + or (filter_ == "last" and self.is_last_process) + or (filter_ == "local_last" and self.is_local_last_process) + or (isinstance(filter_, (list, set)) and self.global_rank in filter_) + or (callable(filter_) and filter_(self.global_rank)) + ) + + def print( + self, *args, filter_: Filter = "main_process", rank_prefix=False, flush=True, **kwargs + ) -> None: + if not self.check_filter(filter_): + return + + if rank_prefix: + print(f"[global_rank={self.global_rank}]", *args, flush=flush, **kwargs) + else: + print(*args, flush=flush, **kwargs) + + def process_print( + self, *args, filter_: Filter = "all", rank_prefix=True, flush=True, **kwargs + ) -> None: + if not self.check_filter(filter_): + return + + if rank_prefix: + prefix = f"[global_rank={self.global_rank}]" + if len(args) == 1: # avoid out-of-order printing if possible + out = f"{prefix} {args[0]}" + args = (out,) + else: + args = (prefix, *args) + print(*args, flush=flush, **kwargs) + else: + print(*args, flush=flush, **kwargs) + + +class NativeDdpRuntime(IRuntime): + def __init__( + self, + dtype: torch.dtype = torch.float, + torch_distributed_timeout: timedelta | None = None, + ): + self._master_addr = os.environ["MASTER_ADDR"] + self._master_port = int(os.environ["MASTER_PORT"]) + self._local_rank = int(os.environ["LOCAL_RANK"]) + self._global_rank = int(os.environ["RANK"]) + self._local_world_size = int(os.environ["LOCAL_WORLD_SIZE"]) + self._world_size = int(os.environ["WORLD_SIZE"]) + self._device = torch.device(self.local_rank) + self._dtype = dtype + self._torch_distributed_timeout = torch_distributed_timeout + + @override + def setup(self): + torch.cuda.set_device(self._device) + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group( + "cpu:gloo,cuda:nccl", timeout=self._torch_distributed_timeout + ) + input_tensors = [ + torch.tensor([0], dtype=torch.float32, device=self._device) + for _ in range(self.world_size) + ] + output_tensors = [ + torch.tensor([0], dtype=torch.float32, device=self._device) + for _ in range(self.world_size) + ] + torch.distributed.all_to_all(input_tensors, output_tensors) + + @override + def cleanup(self): + with suppress(Exception): + torch.distributed.barrier() + torch.distributed.destroy_process_group() + + @override + def autocast(self) -> AbstractContextManager: + result = torch.autocast(device_type="cuda", dtype=self._dtype, enabled=True) + return result + + @override + def wait_for_everyone(self): + torch.distributed.barrier() + + @override + def set_seed(self, seed: int, device_specific: bool = False) -> int: + """ + Helper function for reproducible behavior to set the seed in `random`, `numpy`, `torch`. + + Args: + seed (`int`): + The seed to set. + device_specific (`bool`, *optional*, defaults to `False`): + Whether to differ the seed on each device slightly with `self.process_index`. 
+ """ + if device_specific: + seed += self.global_rank + + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + return seed + + @override + def prepare_models(self, models: PrepareModelsT) -> PrepareModelsT: + assert all(isinstance(x, nn.Module) for x in models) + new_models = [nn.parallel.DistributedDataParallel(m) for m in models] + new_models = cast("PrepareModelsT", new_models) + return new_models # type: ignore[return-value] + + @override + def prepare_train_dataloader(self, train_dataloader: PrepareDataLoaderT) -> PrepareDataLoaderT: + return train_dataloader + + @override + def prepare_val_dataloader(self, val_dataloader: PrepareDataLoaderT) -> PrepareDataLoaderT: + return val_dataloader + + @override + def compile(self, model: CompileT) -> CompileT: + result = torch.compile(model) + result = cast("CompileT", result) + return result + + @override + def backward(self, loss: torch.Tensor) -> None: + loss.backward() + + @override + def clip_grad_norm_( + self, + parameters: Iterable[torch.Tensor] | torch.Tensor, + max_norm: float, + norm_type: float = 2, + ) -> torch.Tensor: + result = torch.nn.utils.clip_grad_norm_(parameters, max_norm, norm_type=norm_type) + return result + + @override + def clip_grad_value_( + self, parameters: Iterable[torch.Tensor] | torch.Tensor, clip_value: float + ) -> None: + torch.nn.utils.clip_grad_value_(parameters, clip_value) + + @override + def save_state(self, path: str | Path) -> None: + pass + + @override + def load_state(self, path: str | Path) -> None: + pass + + @override + def skip_first_batches(self, dataloader_iterator: Iterator, num_batches: int) -> None: + for _ in tqdm( + range(num_batches), desc=f"rank {self._global_rank}: skip_first_batches({num_batches=})" + ): + next(dataloader_iterator) + + @property + @override + def sync_gradients(self) -> bool: + return True + + @property + @override + def is_main_process(self) -> bool: + result = self.global_rank == 0 + return result + + @property + @override + def is_local_main_process(self) -> bool: + result = self.local_rank == 0 + return result + + @property + @override + def is_last_process(self) -> bool: + result = self.global_rank == self.world_size - 1 + return result + + @property + @override + def is_local_last_process(self) -> bool: + result = self.local_rank == self.local_world_size - 1 + return result + + @property + @override + def local_rank(self) -> int: + return self._local_rank + + @property + @override + def global_rank(self) -> int: + return self._global_rank + + @property + @override + def local_world_size(self) -> int: + return self._local_world_size + + @property + @override + def world_size(self) -> int: + return self._world_size + + @property + @override + def device(self) -> torch.device: + return self._device + + @property + @override + def dtype(self) -> torch.dtype: + return self._dtype + + @property + def master_addr(self) -> str: + return self._master_addr + + @property + def master_port(self) -> int: + return self._master_port + + +class BaseRuntime(IRuntime): + def __init__(self, dtype: torch.dtype = torch.float): + self._device = torch.device(self.local_rank) + self._dtype = dtype + + @override + def setup(self): + torch.cuda.set_device(self._device) + + @override + def cleanup(self): ... + + @override + def autocast(self) -> AbstractContextManager: + result = torch.autocast(device_type="cuda", dtype=self._dtype, enabled=True) + return result + + @override + def wait_for_everyone(self): ... 
+ + @override + def set_seed(self, seed: int, device_specific: bool = False) -> int: + """ + Helper function for reproducible behavior to set the seed in `random`, `numpy`, `torch`. + + Args: + seed (`int`): + The seed to set. + device_specific (`bool`, *optional*, defaults to `False`): + Whether to differ the seed on each device slightly with `self.process_index`. + """ + if device_specific: + seed += self.global_rank + + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + return seed + + @override + def prepare_models(self, models: PrepareModelsT) -> PrepareModelsT: + assert all(isinstance(x, nn.Module) for x in models) + return models + + @override + def prepare_train_dataloader(self, train_dataloader: PrepareDataLoaderT) -> PrepareDataLoaderT: + return train_dataloader + + @override + def prepare_val_dataloader(self, val_dataloader: PrepareDataLoaderT) -> PrepareDataLoaderT: + return val_dataloader + + @override + def compile(self, model: CompileT) -> CompileT: + result = torch.compile(model) + result = cast("CompileT", result) + return result + + @override + def backward(self, loss: torch.Tensor) -> None: + loss.backward() + + @override + def clip_grad_norm_( + self, + parameters: Iterable[torch.Tensor] | torch.Tensor, + max_norm: float, + norm_type: float = 2, + ) -> torch.Tensor: + result = torch.nn.utils.clip_grad_norm_(parameters, max_norm, norm_type=norm_type) + return result + + @override + def clip_grad_value_( + self, parameters: Iterable[torch.Tensor] | torch.Tensor, clip_value: float + ) -> None: + torch.nn.utils.clip_grad_value_(parameters, clip_value) + + @override + def save_state(self, path: str | Path) -> None: + pass + + @override + def load_state(self, path: str | Path) -> None: + pass + + @override + def skip_first_batches(self, dataloader_iterator: Iterator, num_batches: int) -> None: + for _ in tqdm( + range(num_batches), desc=f"rank {self.global_rank}: skip_first_batches({num_batches=})" + ): + next(dataloader_iterator) + + @property + @override + def sync_gradients(self) -> bool: + return True + + @property + @override + def is_main_process(self) -> bool: + result = self.global_rank == 0 + return result + + @property + @override + def is_local_main_process(self) -> bool: + result = self.local_rank == 0 + return result + + @property + @override + def is_last_process(self) -> bool: + result = self.global_rank == self.world_size - 1 + return result + + @property + @override + def is_local_last_process(self) -> bool: + result = self.local_rank == self.local_world_size - 1 + return result + + @property + @override + def local_rank(self) -> int: + return 0 + + @property + @override + def global_rank(self) -> int: + return 0 + + @property + @override + def local_world_size(self) -> int: + return 1 + + @property + @override + def world_size(self) -> int: + return 1 + + @property + @override + def device(self) -> torch.device: + return self._device + + @property + @override + def dtype(self) -> torch.dtype: + return self._dtype + + @property + def master_addr(self) -> str | None: + return None + + @property + def master_port(self) -> int | None: + return None diff --git a/tests/experimental/torch/_compress/test_compress.py b/tests/experimental/torch/_compress/test_compress.py index 452d2b6f6..7e4078d41 100644 --- a/tests/experimental/torch/_compress/test_compress.py +++ b/tests/experimental/torch/_compress/test_compress.py @@ -16,24 +16,21 @@ import datetime import os import shutil +from functools import partial from 
pathlib import Path import pytest import torch +from _test_utils.torch_dist.dist_utils import spawn_multiprocess_job from puzzle_tools.hydra_utils import register_hydra_resolvers -from puzzle_tools.runtime import NativeDDP_Runtime from scripts.convert_llama3_to_decilm import convert_llama3_to_decilm from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM, PreTrainedTokenizerBase from modelopt.torch._compress import compress +from modelopt.torch._compress.runtime import NativeDdpRuntime from tests.integration.puzzle_tools.e2e_puzzletron_test.dummy_dataset import save_dummy_dataset -@pytest.fixture(scope="module", autouse=True) -def setup_test_module(): - register_hydra_resolvers() - - @pytest.fixture def project_root_path(request: pytest.FixtureRequest) -> Path: return Path(request.config.rootpath) @@ -59,18 +56,32 @@ def project_root_path(request: pytest.FixtureRequest) -> Path: # # export PYTHONPATH=$PYTHONPATH:/workspace/puzzletron/v1 # -# ../puzzletron/v1/scripts/torch_dist_runner.sh \ -# pytest -s -v ./tests/gpu/torch/puzzletron/test_compress_model.py -o addopts="" +# pytest -s -v ./tests/experimental/torch/_compress/test_compress.py::test_compress -o addopts="" -def test_compress(project_root_path): - # The input to puzzletron.compress(). +def test_compress(project_root_path: Path, tmp_path: Path): + spawn_multiprocess_job( + size=torch.cuda.device_count(), + job=partial(_test_compress_multiprocess_job, project_root_path, tmp_path), + backend="nccl", + ) + + +def _test_compress_multiprocess_job(project_root_path: Path, tmp_path: Path, rank: int, size: int): + register_hydra_resolvers() + + # Set environment variables expected by NativeDDP_Runtime + os.environ["RANK"] = str(rank) + os.environ["LOCAL_RANK"] = str(rank) + os.environ["WORLD_SIZE"] = str(size) + os.environ["LOCAL_WORLD_SIZE"] = str(size) os.environ["WANDB_DISABLED"] = "true" - puzzle_dir = Path("/tmp/pytest-shared/test_compress_model") + + puzzle_dir = tmp_path dataset_path = puzzle_dir / "dummy_dataset" hydra_config_dir = project_root_path / "tests/experimental/torch/_compress/resources/configs" - _runtime = NativeDDP_Runtime( + _runtime = NativeDdpRuntime( dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10) ) From 0ccf1c43d1bdc454ae911ff613108b3413cdc8d2 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Tue, 28 Oct 2025 14:57:03 +0100 Subject: [PATCH 17/81] Add comments. Signed-off-by: Daniel Korzekwa --- modelopt/torch/_compress/runtime.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/modelopt/torch/_compress/runtime.py b/modelopt/torch/_compress/runtime.py index e46a48a18..46f561a5d 100644 --- a/modelopt/torch/_compress/runtime.py +++ b/modelopt/torch/_compress/runtime.py @@ -1,3 +1,20 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Classes for torch distributed runtime management""" + import os import random from abc import ABC, abstractmethod From 58439ca0273ba38a3b1ff9e010c866ef1794903d Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Tue, 28 Oct 2025 16:35:13 +0100 Subject: [PATCH 18/81] Add _save_dummy_dataset to the test_compress.py Signed-off-by: Daniel Korzekwa --- .../torch/_compress/test_compress.py | 50 ++++++++++++++++--- 1 file changed, 44 insertions(+), 6 deletions(-) diff --git a/tests/experimental/torch/_compress/test_compress.py b/tests/experimental/torch/_compress/test_compress.py index 7e4078d41..e1d2e84a6 100644 --- a/tests/experimental/torch/_compress/test_compress.py +++ b/tests/experimental/torch/_compress/test_compress.py @@ -22,13 +22,13 @@ import pytest import torch from _test_utils.torch_dist.dist_utils import spawn_multiprocess_job +from datasets import Dataset, DatasetDict from puzzle_tools.hydra_utils import register_hydra_resolvers from scripts.convert_llama3_to_decilm import convert_llama3_to_decilm from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM, PreTrainedTokenizerBase from modelopt.torch._compress import compress from modelopt.torch._compress.runtime import NativeDdpRuntime -from tests.integration.puzzle_tools.e2e_puzzletron_test.dummy_dataset import save_dummy_dataset @pytest.fixture @@ -91,8 +91,8 @@ def _test_compress_multiprocess_job(project_root_path: Path, tmp_path: Path, ran # if runtime.global_rank == 0: # Setup puzzle_dir and dataset - setup_puzzle_dir(puzzle_dir) - save_dummy_dataset(dataset_path) + _setup_puzzle_dir(puzzle_dir) + _save_dummy_dataset(dataset_path) # # Step 1: Create and save a teacher model to compress @@ -107,7 +107,7 @@ def _test_compress_multiprocess_job(project_root_path: Path, tmp_path: Path, ran # Create a small Llama model (not DeciLM) to match the normal conversion pipeline hf_ckpt_teacher_dir = "ckpts/teacher" llama_checkpoint_path = puzzle_dir / hf_ckpt_teacher_dir - create_and_save_small_llama_model( + _create_and_save_small_llama_model( llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer ) @@ -165,7 +165,7 @@ def _test_compress_multiprocess_job(project_root_path: Path, tmp_path: Path, ran print("PYTEST SUMMARY: test_compress_model() test has finished successfully") -def create_and_save_small_llama_model( +def _create_and_save_small_llama_model( output_path: str, vocab_size: int, tokenizer: PreTrainedTokenizerBase ): """ @@ -203,7 +203,45 @@ def create_and_save_small_llama_model( llama_config.save_pretrained(output_path) -def setup_puzzle_dir(puzzle_dir: str): +def _setup_puzzle_dir(puzzle_dir: str): if Path(puzzle_dir).exists(): shutil.rmtree(puzzle_dir) Path(puzzle_dir).mkdir(parents=True, exist_ok=True) + + +def _save_dummy_dataset(dataset_path: str): + # dummy sample + sample = [ + {"role": "user", "content": "please cite Lorem Ipsum?"}, + { + "role": "assistant", + "content": ( + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed in blandit ante. " + "Sed tempus erat urna, ac elementum nisl facilisis quis. Aliquam consectetur mollis massa, " + "in elementum sem venenatis posuere. Fusce lorem arcu, egestas vel massa sollicitudin, " + "dictum mollis purus. Proin in ullamcorper elit. Nam tellus nisi, volutpat a mattis vel, " + "pretium in purus. Nunc at lectus facilisis risus scelerisque rhoncus eu nec ex. " + "Maecenas semper, tellus non placerat vulputate, urna felis facilisis diam, " + "sit amet vestibulum erat sapien nec libero. Praesent non massa velit. 
Donec faucibus mi eros. " + "Nam turpis nulla, congue sit amet mi at, porttitor scelerisque elit. Nunc id sodales lorem, " + "nec tincidunt leo. Quisque a neque nec ligula porttitor auctor. " + "Nunc accumsan nunc ac tellus congue vehicula. Praesent tellus eros, luctus non gravida dapibus, " + "faucibus eu ex. Quisque bibendum leo pharetra, tristique est vitae, hendrerit nunc. " + "Duis nec congue dolor. Donec commodo ipsum non efficitur volutpat. " + "Nulla risus nulla, efficitur et urna at, imperdiet sodales lorem. " + "Suspendisse erat est, sollicitudin at nisl tincidunt, vehicula hendrerit lectus. " + "Nam quis nisi ullamcorper, rhoncus massa vel, tempus purus. " + "Duis pulvinar eros vel nulla pellentesque, at dapibus justo laoreet. " + "Praesent tortor orci, vulputate fermentum dapibus nec, feugiat vitae tortor. " + "Donec mollis convallis massa quis iaculis." + ), + }, + ] + + # Prepare train and val splits with sample repeated, 2500 samples are for + # 128 samples with block-size 8192 and LLama3 tokenizer + data = [{"conversation": sample}] * 2500 + + # For train-val splits + data_dict = DatasetDict({"train": Dataset.from_list(data), "valid": Dataset.from_list(data)}) + data_dict.save_to_disk(dataset_path) From 2e5f776bf7d49410ef207180f5b78cf16647f815 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Tue, 28 Oct 2025 16:42:52 +0100 Subject: [PATCH 19/81] Refactoring: Move torch distributed env variables to dist_utils.py Signed-off-by: Daniel Korzekwa --- tests/_test_utils/torch_dist/dist_utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/_test_utils/torch_dist/dist_utils.py b/tests/_test_utils/torch_dist/dist_utils.py index c7407b018..3b85728aa 100644 --- a/tests/_test_utils/torch_dist/dist_utils.py +++ b/tests/_test_utils/torch_dist/dist_utils.py @@ -34,6 +34,10 @@ def init_process(rank, size, job=None, backend="gloo", port=None): """Initialize the distributed environment.""" os.environ["MASTER_ADDR"] = "localhost" + os.environ["RANK"] = str(rank) + os.environ["LOCAL_RANK"] = str(rank) + os.environ["WORLD_SIZE"] = str(size) + os.environ["LOCAL_WORLD_SIZE"] = str(size) port = str(get_free_port()) if port is None else str(port) From 6274db5164a3e8a7c5299c329d9d004b6124f04d Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Tue, 28 Oct 2025 16:44:23 +0100 Subject: [PATCH 20/81] Refactoring: move torch distributed variables to dist_utils Signed-off-by: Daniel Korzekwa --- tests/experimental/torch/_compress/test_compress.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/experimental/torch/_compress/test_compress.py b/tests/experimental/torch/_compress/test_compress.py index e1d2e84a6..e72d8ad34 100644 --- a/tests/experimental/torch/_compress/test_compress.py +++ b/tests/experimental/torch/_compress/test_compress.py @@ -71,10 +71,6 @@ def _test_compress_multiprocess_job(project_root_path: Path, tmp_path: Path, ran register_hydra_resolvers() # Set environment variables expected by NativeDDP_Runtime - os.environ["RANK"] = str(rank) - os.environ["LOCAL_RANK"] = str(rank) - os.environ["WORLD_SIZE"] = str(size) - os.environ["LOCAL_WORLD_SIZE"] = str(size) os.environ["WANDB_DISABLED"] = "true" puzzle_dir = tmp_path @@ -89,7 +85,7 @@ def _test_compress_multiprocess_job(project_root_path: Path, tmp_path: Path, ran # # Test setup # - if runtime.global_rank == 0: + if rank == 0: # Setup puzzle_dir and dataset _setup_puzzle_dir(puzzle_dir) _save_dummy_dataset(dataset_path) @@ -126,7 +122,7 @@ def 
_test_compress_multiprocess_job(project_root_path: Path, tmp_path: Path, ran # # Check assertions # - if runtime.global_rank == 0: + if rank == 0: # assertions for the score_pruning_activations step 1 rank = int(os.environ["RANK"]) rank_filepath = ( From d942e0a4907f9cddf9ddc89a038196126fdbae04 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Tue, 28 Oct 2025 16:54:41 +0100 Subject: [PATCH 21/81] Move os.environ["WANDB_DISABLED"] = "true" to dist_utils.py Signed-off-by: Daniel Korzekwa --- tests/_test_utils/torch_dist/dist_utils.py | 1 + tests/experimental/torch/_compress/test_compress.py | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/_test_utils/torch_dist/dist_utils.py b/tests/_test_utils/torch_dist/dist_utils.py index 3b85728aa..f7160cf28 100644 --- a/tests/_test_utils/torch_dist/dist_utils.py +++ b/tests/_test_utils/torch_dist/dist_utils.py @@ -38,6 +38,7 @@ def init_process(rank, size, job=None, backend="gloo", port=None): os.environ["LOCAL_RANK"] = str(rank) os.environ["WORLD_SIZE"] = str(size) os.environ["LOCAL_WORLD_SIZE"] = str(size) + os.environ["WANDB_DISABLED"] = "true" port = str(get_free_port()) if port is None else str(port) diff --git a/tests/experimental/torch/_compress/test_compress.py b/tests/experimental/torch/_compress/test_compress.py index e72d8ad34..096de4de3 100644 --- a/tests/experimental/torch/_compress/test_compress.py +++ b/tests/experimental/torch/_compress/test_compress.py @@ -70,9 +70,6 @@ def test_compress(project_root_path: Path, tmp_path: Path): def _test_compress_multiprocess_job(project_root_path: Path, tmp_path: Path, rank: int, size: int): register_hydra_resolvers() - # Set environment variables expected by NativeDDP_Runtime - os.environ["WANDB_DISABLED"] = "true" - puzzle_dir = tmp_path dataset_path = puzzle_dir / "dummy_dataset" hydra_config_dir = project_root_path / "tests/experimental/torch/_compress/resources/configs" From f765921d8427f5854dc51af918d5f41d2eac7e5a Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Tue, 28 Oct 2025 18:40:48 +0100 Subject: [PATCH 22/81] Implement integration test for mnt.convert() for the _compress algorithm. 
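
The new test mirrors the setup of test_compress.py: it builds a tiny Llama teacher checkpoint from the truncated test tokenizer and places it under ckpts/teacher inside the temporary puzzle_dir, which is the input expected by the conversion step. Roughly, using the helper reused from test_compress.py:

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    llama_checkpoint_path = puzzle_dir / "ckpts/teacher"
    create_and_save_small_llama_model(
        llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer
    )

The mtn.convert() call itself is wired up in the next commit.
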
Signed-off-by: Daniel Korzekwa --- .../nas/plugins/_compress/test_nas_convert.py | 28 +++++++++++++++++++ tests/gpu/torch/_compress/test_compress.py | 2 ++ 2 files changed, 30 insertions(+) create mode 100644 tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py diff --git a/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py b/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py new file mode 100644 index 000000000..23f8f3cfe --- /dev/null +++ b/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py @@ -0,0 +1,28 @@ +from pathlib import Path + +import pytest +from gpu.torch._compress.test_compress import create_and_save_small_llama_model +from transformers import AutoTokenizer + + +@pytest.fixture +def project_root_path(request: pytest.FixtureRequest) -> Path: + return Path(request.config.rootpath) + + +# +# See tests/gpu/torch/_compress/test_compress.py for instructions on how to run this test +# TODO: Remove those instructions once this test runs automatically on CI +# +def test_nas_convert(project_root_path: Path, tmp_path: Path): + puzzle_dir = tmp_path + + # Create a small Llama model (input to the mnt.convert() - the first model conversion step) + tokenizer_path = project_root_path / "tests/gpu/torch/_compress/resources/tokenizer" + tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) + hf_ckpt_teacher_dir = "ckpts/teacher" + llama_checkpoint_path = puzzle_dir / hf_ckpt_teacher_dir + # TODO: the same as in tests/gpu/torch/_compress/test_compress.py (refactor it) + create_and_save_small_llama_model( + llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer + ) diff --git a/tests/gpu/torch/_compress/test_compress.py b/tests/gpu/torch/_compress/test_compress.py index ddcea6aaf..217acd533 100644 --- a/tests/gpu/torch/_compress/test_compress.py +++ b/tests/gpu/torch/_compress/test_compress.py @@ -48,6 +48,8 @@ def project_root_path(request: pytest.FixtureRequest) -> Path: # # ../puzzletron/v1/scripts/torch_dist_runner.sh \ # pytest -s -v ./tests/gpu/torch/puzzletron/test_compress_model.py -o addopts="" +# +# TODO: Remove those instructions once this test runs automatically on CI def test_compress(project_root_path): From de876d6b409462ad55f5bcc81b2e0f25cc8ece34 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Tue, 28 Oct 2025 19:15:15 +0100 Subject: [PATCH 23/81] Implement mtn.convert() for compress() algorithm. 
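
The plugin registers a "compress" mode with NASModeRegistry. Converting a model in this mode loads the hydra config that defines the search space (config name currently hard-coded to Llama-3_1-8B) and converts the Llama 3 teacher checkpoint under puzzle_dir/ckpts/teacher to the DeciLM format. A minimal usage sketch, with placeholder paths, matching the call added to test_nas_convert.py:

    import modelopt.torch.nas as mtn

    from modelopt.torch.nas.plugins._compress.compress_nas_plugin import CompressModel

    model = CompressModel()
    mtn.convert(
        model,
        mode=[
            (
                "compress",
                {
                    "hydra_config_dir": "/path/to/configs",  # placeholder
                    "puzzle_dir": "/path/to/puzzle_dir",  # placeholder
                    "dataset_path": "",  # not used by this step yet
                },
            )
        ],
    )
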
Signed-off-by: Daniel Korzekwa --- .../plugins/_compress/compress_nas_plugin.py | 103 ++++++++++++++++++ .../nas/plugins/_compress/test_nas_convert.py | 37 ++++++- tests/gpu/torch/_compress/test_compress.py | 12 +- 3 files changed, 148 insertions(+), 4 deletions(-) create mode 100644 modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py diff --git a/modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py b/modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py new file mode 100644 index 000000000..563d8055f --- /dev/null +++ b/modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py @@ -0,0 +1,103 @@ +from torch import nn + +from modelopt.torch.nas.conversion import NASModeRegistry +from modelopt.torch.opt.config import ModeloptBaseConfig, ModeloptField +from modelopt.torch.opt.mode import ( + ConvertEntrypoint, + ConvertReturnType, + MetadataDict, + ModeDescriptor, + RestoreEntrypoint, +) +from modelopt.torch.opt.searcher import BaseSearcher + + +class CompressModel(nn.Module): + pass + + +class CompressConfig(ModeloptBaseConfig): + """Configuration for Compress NAS algorithm.""" + + hydra_config_dir: str = ModeloptField( + default="", + title="", + description="", + ) + + puzzle_dir: str = ModeloptField( + default="", + title="", + description="", + ) + + dataset_path: str = ModeloptField( + default="", + title="", + description="", + ) + + +# TOD: Why is it called SuperNetMLP? +class SuperNetMLP(CompressModel): + """Marker subclass indicating converted/search-space state for CompressConfig. + TODO: Provide better description + """ + + hydra_config_dir: str + puzzle_dir: str + dataset_path: str + + +def convert_compress_model(model: nn.Module, config: CompressConfig) -> ConvertReturnType: + """Convert the model to a search space model.""" + print("=" * 80) + print(f"[convert] before convert:\n{model}") + model.__class__ = SuperNetMLP + model.hydra_config_dir = config.hydra_config_dir + model.puzzle_dir = config.puzzle_dir + model.dataset_path = config.dataset_path + print(f"[convert] after convert:\n{model}") + return model, {} + + +def restore_compress_model( + model: nn.Module, config: CompressConfig, metadata: MetadataDict +) -> nn.Module: + """Reuse convert to produce the same behavior on restore.""" + return convert_compress_model(model, config)[0] + + +@NASModeRegistry.register_mode +class CompressDescriptor(ModeDescriptor): + """Descriptor for the Compress mode.""" + + @property + def name(self) -> str: + """String identifier for this mode.""" + return "compress" + + @property + def config_class(self) -> type[ModeloptBaseConfig]: + """Configuration class for this mode.""" + return CompressConfig + + @property + def search_algorithm(self) -> type[BaseSearcher]: + """Return the associated searcher implementation.""" + raise NotImplementedError("Compress mode does not have a search algorithm.") + + @property + def convert(self) -> ConvertEntrypoint: + """Entrypoint to convert a model.""" + return convert_compress_model + + @property + def restore(self) -> RestoreEntrypoint: + """Entrypoint to restore a model.""" + return restore_compress_model + + @property + def export_mode(self) -> str | None: + """The mode that corresponds to the export mode of this mode.""" + return "export" diff --git a/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py b/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py index 23f8f3cfe..3bd49da69 100644 --- a/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py +++ 
b/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py @@ -4,6 +4,9 @@ from gpu.torch._compress.test_compress import create_and_save_small_llama_model from transformers import AutoTokenizer +import modelopt.torch.nas as mtn +from modelopt.torch.nas.plugins._compress.compress_nas_plugin import CompressModel + @pytest.fixture def project_root_path(request: pytest.FixtureRequest) -> Path: @@ -15,9 +18,19 @@ def project_root_path(request: pytest.FixtureRequest) -> Path: # TODO: Remove those instructions once this test runs automatically on CI # def test_nas_convert(project_root_path: Path, tmp_path: Path): + # + # Step 1: Setup the puzzle_dir, dataset, hydra_config_dir, and input model + # needed for the mnt.convert() step + # puzzle_dir = tmp_path + hydra_config_dir = project_root_path / "tests/gpu/torch/puzzletron/resources/configs" + # dataset_path = puzzle_dir / "dummy_dataset" + + # Setup puzzle_dir and dataset + # setup_puzzle_dir(puzzle_dir) + # save_dummy_dataset(dataset_path) - # Create a small Llama model (input to the mnt.convert() - the first model conversion step) + # Create a small Llama model (input to the mnt.convert() step) tokenizer_path = project_root_path / "tests/gpu/torch/_compress/resources/tokenizer" tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) hf_ckpt_teacher_dir = "ckpts/teacher" @@ -26,3 +39,25 @@ def test_nas_convert(project_root_path: Path, tmp_path: Path): create_and_save_small_llama_model( llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer ) + + # + # Run the mnt.convert() step + # + input_model = CompressModel() + mtn.convert( + input_model, + mode=[ + ( + "compress", + { + "hydra_config_dir": str(hydra_config_dir), + "puzzle_dir": str(puzzle_dir), + "dataset_path": "", # dataset_path, + }, + ) + ], + ) + + # + # Check assertions + # diff --git a/tests/gpu/torch/_compress/test_compress.py b/tests/gpu/torch/_compress/test_compress.py index 217acd533..924e458d9 100644 --- a/tests/gpu/torch/_compress/test_compress.py +++ b/tests/gpu/torch/_compress/test_compress.py @@ -88,7 +88,9 @@ def test_compress(project_root_path): hf_ckpt_teacher_dir = "ckpts/teacher" llama_checkpoint_path = osp.join(puzzle_dir, hf_ckpt_teacher_dir) create_and_save_small_llama_model( - llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer + llama_checkpoint_path, + vocab_size=tokenizer.vocab_size, + tokenizer=tokenizer, ) # Use the full conversion pipeline (matches normal usage) @@ -125,7 +127,8 @@ def test_compress(project_root_path): # assertions for the scoring step 5 solution_0_filepath = osp.join( - puzzle_dir, "single_sequence_replacement_solutions--validation/solution_0.json" + puzzle_dir, + "single_sequence_replacement_solutions--validation/solution_0.json", ) assert os.path.exists(solution_0_filepath) @@ -136,7 +139,10 @@ def test_compress(project_root_path): ) assert os.path.exists(solution_0_ckpt_config_path) assert os.path.exists( - osp.join(puzzle_dir, "mip/puzzle_solutions/target_memory_780000MiB/solutions.json") + osp.join( + puzzle_dir, + "mip/puzzle_solutions/target_memory_780000MiB/solutions.json", + ) ) runtime.wait_for_everyone() From f7fe23cd4cca117d3758707b1829ed6b45faf5a1 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Tue, 28 Oct 2025 20:23:07 +0100 Subject: [PATCH 24/81] Fix broken test - incorrect package names. 
Signed-off-by: Daniel Korzekwa --- .../converters/convert_llama3_to_decilm.py | 18 ++++++++++++++- ..._convert_llama3_config_to_decilm_config.py | 23 ++++++++++++++++--- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/modelopt/torch/_compress/decilm/converters/convert_llama3_to_decilm.py b/modelopt/torch/_compress/decilm/converters/convert_llama3_to_decilm.py index 6cdd1f02c..4b65eeada 100644 --- a/modelopt/torch/_compress/decilm/converters/convert_llama3_to_decilm.py +++ b/modelopt/torch/_compress/decilm/converters/convert_llama3_to_decilm.py @@ -1,4 +1,20 @@ -"""Convert a Llama3 model to a DeciLM model.""" +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Convert a Llama3 model to a DeciLM model.""" #!/usr/bin/env python3 from pathlib import Path diff --git a/tests/experimental/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py b/tests/experimental/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py index 4bab4d505..03c3c4cd6 100644 --- a/tests/experimental/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py +++ b/tests/experimental/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py @@ -1,9 +1,24 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import json import os.path as osp from pathlib import Path import pytest -from gpu.torch._compress.test_compress import create_and_save_small_llama_model +from experimental.torch._compress.test_compress import _create_and_save_small_llama_model from transformers import AutoTokenizer from modelopt.torch._compress.decilm.converters.convert_llama3_to_decilm import ( @@ -17,11 +32,13 @@ def project_root_path(request: pytest.FixtureRequest) -> Path: def test_convert_llama3_config_to_decilm_config(project_root_path: Path, tmp_path: Path): - tokenizer_path = osp.join(project_root_path, "tests/gpu/torch/_compress/resources/tokenizer") + tokenizer_path = osp.join( + project_root_path, "tests/experimental/torch/_compress/resources/tokenizer" + ) tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) llama_checkpoint_path = tmp_path / "llama_checkpoint" - create_and_save_small_llama_model( + _create_and_save_small_llama_model( llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer ) From a2104830c58e4dae5c8c5817306453437a67bd4c Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Tue, 28 Oct 2025 20:48:03 +0100 Subject: [PATCH 25/81] Implementing nas.convert for compress algorithm. Signed-off-by: Daniel Korzekwa --- modelopt/torch/_compress/compress.py | 6 +-- .../plugins/_compress/compress_nas_plugin.py | 39 ++++++++++++++++++ .../nas/plugins/_compress/test_nas_convert.py | 40 ++++++++++++++----- 3 files changed, 73 insertions(+), 12 deletions(-) diff --git a/modelopt/torch/_compress/compress.py b/modelopt/torch/_compress/compress.py index 265fd5eeb..94b15ec88 100644 --- a/modelopt/torch/_compress/compress.py +++ b/modelopt/torch/_compress/compress.py @@ -35,12 +35,12 @@ def compress( hydra_config_dir: str, hydra_config: str, puzzle_dir: str, dataset_path: str, runtime: IRuntime ) -> DictConfig: - """Compress a puzzletron model using the MIP-based NAS search algorithm. + """Compress a compress model using the MIP-based NAS search algorithm. Args: hydra_config_dir (str): path to a hydra_config_dir that defines the search space hydra_config (str): the corresponding hydra config file - puzzle_dir (str): directory with a puzzletron model to compress + puzzle_dir (str): directory with a compress model to compress dataset_path (str): dataset used for scoring and distillation runtime: distributed runtime to use to run the compression steps, e.g., NativeDdpRuntime(dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10)) @@ -50,7 +50,7 @@ def compress( The same hydra configuration object is used across all compression steps. @TODO: Investigate if this config object is immutable across steps and clarify """ - # Step 0: Load puzzletron hydra config + # Step 0: Load hydra config hydra_cfg = initialize_hydra_config_for_dir( config_dir=hydra_config_dir, config_name=hydra_config, diff --git a/modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py b/modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py index 563d8055f..783e5317f 100644 --- a/modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py +++ b/modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py @@ -1,3 +1,21 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pathlib import Path + +from scripts.convert_llama3_to_decilm import convert_llama3_to_decilm from torch import nn from modelopt.torch.nas.conversion import NASModeRegistry @@ -11,6 +29,9 @@ ) from modelopt.torch.opt.searcher import BaseSearcher +# TODO Move initialize_hydra_config_for_dir from tests to main +from tests.utils.test_utils import initialize_hydra_config_for_dir + class CompressModel(nn.Module): pass @@ -57,6 +78,24 @@ def convert_compress_model(model: nn.Module, config: CompressConfig) -> ConvertR model.hydra_config_dir = config.hydra_config_dir model.puzzle_dir = config.puzzle_dir model.dataset_path = config.dataset_path + + # Load hydra config + initialize_hydra_config_for_dir( + config_dir=config.hydra_config_dir, + config_name="Llama-3_1-8B", # TODO: Make it configurable + overrides=[ + f"puzzle_dir={config.puzzle_dir}", + f"dataset_path={config.dataset_path}", + ], + ) + + # Convert Llama3 model to DeciLM model + hf_ckpt_teacher_dir = "ckpts/teacher" + convert_llama3_to_decilm( + input_dir=Path(config.puzzle_dir) / hf_ckpt_teacher_dir, # TODO this should be configurable + output_dir=Path(config.puzzle_dir) / hf_ckpt_teacher_dir, + ) + print(f"[convert] after convert:\n{model}") return model, {} diff --git a/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py b/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py index 3bd49da69..b2f5d3780 100644 --- a/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py +++ b/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py @@ -1,7 +1,27 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
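# The plugin above pulls initialize_hydra_config_for_dir from tests.utils.test_utils
# (with a TODO to move it into the main package). As an assumption about what such a
# helper does -- not the repository's actual implementation -- it can be sketched with
# plain Hydra APIs (assuming Hydra >= 1.2):
from hydra import compose, initialize_config_dir
from omegaconf import DictConfig


def _sketch_initialize_hydra_config_for_dir(
    config_dir: str, config_name: str, overrides: list[str]
) -> DictConfig:
    # Point Hydra at an absolute config directory, then compose the named config with
    # overrides such as "puzzle_dir=..." and "dataset_path=...".
    with initialize_config_dir(config_dir=config_dir, version_base=None):
        return compose(config_name=config_name, overrides=overrides)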
+ from pathlib import Path import pytest -from gpu.torch._compress.test_compress import create_and_save_small_llama_model +from experimental.torch._compress.test_compress import ( + _create_and_save_small_llama_model, + _save_dummy_dataset, + _setup_puzzle_dir, +) +from puzzle_tools.hydra_utils import register_hydra_resolvers from transformers import AutoTokenizer import modelopt.torch.nas as mtn @@ -18,25 +38,27 @@ def project_root_path(request: pytest.FixtureRequest) -> Path: # TODO: Remove those instructions once this test runs automatically on CI # def test_nas_convert(project_root_path: Path, tmp_path: Path): + # Register Hydra custom resolvers (needed for config resolution) + register_hydra_resolvers() + # # Step 1: Setup the puzzle_dir, dataset, hydra_config_dir, and input model # needed for the mnt.convert() step # puzzle_dir = tmp_path - hydra_config_dir = project_root_path / "tests/gpu/torch/puzzletron/resources/configs" - # dataset_path = puzzle_dir / "dummy_dataset" + dataset_path = puzzle_dir / "dummy_dataset" + hydra_config_dir = project_root_path / "tests/experimental/torch/_compress/resources/configs" # Setup puzzle_dir and dataset - # setup_puzzle_dir(puzzle_dir) - # save_dummy_dataset(dataset_path) + _setup_puzzle_dir(puzzle_dir) + _save_dummy_dataset(dataset_path) # Create a small Llama model (input to the mnt.convert() step) - tokenizer_path = project_root_path / "tests/gpu/torch/_compress/resources/tokenizer" + tokenizer_path = project_root_path / "tests/experimental/torch/_compress/resources/tokenizer" tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) hf_ckpt_teacher_dir = "ckpts/teacher" llama_checkpoint_path = puzzle_dir / hf_ckpt_teacher_dir - # TODO: the same as in tests/gpu/torch/_compress/test_compress.py (refactor it) - create_and_save_small_llama_model( + _create_and_save_small_llama_model( llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer ) @@ -52,7 +74,7 @@ def test_nas_convert(project_root_path: Path, tmp_path: Path): { "hydra_config_dir": str(hydra_config_dir), "puzzle_dir": str(puzzle_dir), - "dataset_path": "", # dataset_path, + "dataset_path": str(dataset_path), }, ) ], From 739f868960375b0e60aa56555b2c4959c056725f Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Tue, 28 Oct 2025 20:50:53 +0100 Subject: [PATCH 26/81] Improve docs Signed-off-by: Daniel Korzekwa --- modelopt/torch/_compress/compress.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/modelopt/torch/_compress/compress.py b/modelopt/torch/_compress/compress.py index 265fd5eeb..0722cab73 100644 --- a/modelopt/torch/_compress/compress.py +++ b/modelopt/torch/_compress/compress.py @@ -15,8 +15,8 @@ """ -This module provides the main compression function for a model -using MIP-based NAS search algorithm. +This module provides the main compression function using the puzzle algorithm. +MIP-based NAS search algorithm. """ @@ -35,12 +35,12 @@ def compress( hydra_config_dir: str, hydra_config: str, puzzle_dir: str, dataset_path: str, runtime: IRuntime ) -> DictConfig: - """Compress a puzzletron model using the MIP-based NAS search algorithm. + """Compress a model using the MIP-based NAS search algorithm. 
Args: hydra_config_dir (str): path to a hydra_config_dir that defines the search space hydra_config (str): the corresponding hydra config file - puzzle_dir (str): directory with a puzzletron model to compress + puzzle_dir (str): directory with a model to compress dataset_path (str): dataset used for scoring and distillation runtime: distributed runtime to use to run the compression steps, e.g., NativeDdpRuntime(dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10)) @@ -50,7 +50,7 @@ def compress( The same hydra configuration object is used across all compression steps. @TODO: Investigate if this config object is immutable across steps and clarify """ - # Step 0: Load puzzletron hydra config + # Step 0: Load a hydra config hydra_cfg = initialize_hydra_config_for_dir( config_dir=hydra_config_dir, config_name=hydra_config, From 20a3c5e7704ecdb72d29b1c39fa98d41b15193c7 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Tue, 28 Oct 2025 21:14:36 +0100 Subject: [PATCH 27/81] Code cleanup. Signed-off-by: Daniel Korzekwa --- .../plugins/_compress/compress_nas_plugin.py | 26 +++++++------------ 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py b/modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py index 783e5317f..48fb51565 100644 --- a/modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py +++ b/modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py @@ -59,25 +59,14 @@ class CompressConfig(ModeloptBaseConfig): ) -# TOD: Why is it called SuperNetMLP? -class SuperNetMLP(CompressModel): - """Marker subclass indicating converted/search-space state for CompressConfig. - TODO: Provide better description - """ - - hydra_config_dir: str - puzzle_dir: str - dataset_path: str - - def convert_compress_model(model: nn.Module, config: CompressConfig) -> ConvertReturnType: """Convert the model to a search space model.""" print("=" * 80) print(f"[convert] before convert:\n{model}") - model.__class__ = SuperNetMLP - model.hydra_config_dir = config.hydra_config_dir - model.puzzle_dir = config.puzzle_dir - model.dataset_path = config.dataset_path + + # _runtime = NativeDdpRuntime( + # dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10) + # ) # Load hydra config initialize_hydra_config_for_dir( @@ -96,6 +85,9 @@ def convert_compress_model(model: nn.Module, config: CompressConfig) -> ConvertR output_dir=Path(config.puzzle_dir) / hf_ckpt_teacher_dir, ) + # Score_pruning_activations (distributed processing) + # score_pruning_activations.launch_score_activations(hydra_cfg, runtime) + print(f"[convert] after convert:\n{model}") return model, {} @@ -103,8 +95,8 @@ def convert_compress_model(model: nn.Module, config: CompressConfig) -> ConvertR def restore_compress_model( model: nn.Module, config: CompressConfig, metadata: MetadataDict ) -> nn.Module: - """Reuse convert to produce the same behavior on restore.""" - return convert_compress_model(model, config)[0] + """Restore is not needed for the compress mode as we are not saving any model state""" + return model @NASModeRegistry.register_mode From 1033c81e3828d123939b3bda4949cb5b17d16c06 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Tue, 28 Oct 2025 21:35:41 +0100 Subject: [PATCH 28/81] Fix import Signed-off-by: Daniel Korzekwa --- tests/experimental/torch/_compress/test_compress.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/experimental/torch/_compress/test_compress.py 
b/tests/experimental/torch/_compress/test_compress.py index 096de4de3..db06e6580 100644 --- a/tests/experimental/torch/_compress/test_compress.py +++ b/tests/experimental/torch/_compress/test_compress.py @@ -21,7 +21,7 @@ import pytest import torch -from _test_utils.torch_dist.dist_utils import spawn_multiprocess_job +from _test_utils.torch.distributed.utils import spawn_multiprocess_job from datasets import Dataset, DatasetDict from puzzle_tools.hydra_utils import register_hydra_resolvers from scripts.convert_llama3_to_decilm import convert_llama3_to_decilm From 0680c45439c7aeca813cd323adddde317a6a0e20 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Wed, 29 Oct 2025 12:04:35 +0100 Subject: [PATCH 29/81] simplify code Signed-off-by: Daniel Korzekwa --- .../_compress/decilm/converters/convert_llama3_to_decilm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modelopt/torch/_compress/decilm/converters/convert_llama3_to_decilm.py b/modelopt/torch/_compress/decilm/converters/convert_llama3_to_decilm.py index 4b65eeada..d17e7ef74 100644 --- a/modelopt/torch/_compress/decilm/converters/convert_llama3_to_decilm.py +++ b/modelopt/torch/_compress/decilm/converters/convert_llama3_to_decilm.py @@ -19,6 +19,7 @@ #!/usr/bin/env python3 from pathlib import Path +import torch from fire import Fire from puzzle_tools.checkpoint_utils import copy_tokenizer from puzzle_tools.checkpoint_utils_hf import copy_deci_lm_hf_code @@ -46,7 +47,7 @@ def convert_llama3_config_to_decilm_config(config: LlamaConfig) -> DeciLMConfig: dtype = getattr(config, "torch_dtype", None) # Convert torch.dtype to string if needed (for JSON serialization) - if dtype is not None and hasattr(dtype, "__module__") and "torch" in dtype.__module__: + if dtype is not None and isinstance(dtype, torch.dtype): dtype = str(dtype).replace("torch.", "") # Track which global values will be removed (moved to per-layer configs) From 2d9da30b1a0ba5ca7bc60867b4dd36e96ce3c5cb Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Wed, 29 Oct 2025 12:05:40 +0100 Subject: [PATCH 30/81] implementing compress_nas_plugin Signed-off-by: Daniel Korzekwa --- .../plugins/_compress/compress_nas_plugin.py | 35 +++++++++++++++---- .../nas/plugins/_compress/test_nas_convert.py | 28 +++++++++++++-- 2 files changed, 54 insertions(+), 9 deletions(-) diff --git a/modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py b/modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py index 48fb51565..9ba971f45 100644 --- a/modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py +++ b/modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py @@ -13,11 +13,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
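# A quick illustration of the dtype normalization simplified in PATCH 29 above:
# isinstance(dtype, torch.dtype) replaces the __module__-based check, and the string
# form drops the "torch." prefix so the value stays JSON-serializable in the config.
import torch

_dtype = torch.bfloat16
assert isinstance(_dtype, torch.dtype)
assert str(_dtype).replace("torch.", "") == "bfloat16"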
+import datetime from pathlib import Path +import pruning_ckpts +import score_pruning_activations +import torch from scripts.convert_llama3_to_decilm import convert_llama3_to_decilm from torch import nn +from modelopt.torch._compress.runtime import NativeDdpRuntime from modelopt.torch.nas.conversion import NASModeRegistry from modelopt.torch.opt.config import ModeloptBaseConfig, ModeloptField from modelopt.torch.opt.mode import ( @@ -40,12 +45,24 @@ class CompressModel(nn.Module): class CompressConfig(ModeloptBaseConfig): """Configuration for Compress NAS algorithm.""" + input_model_path: str = ModeloptField( + default="", + title="", + description="", + ) + hydra_config_dir: str = ModeloptField( default="", title="", description="", ) + hydra_config_name: str = ModeloptField( + default="", + title="", + description="", + ) + puzzle_dir: str = ModeloptField( default="", title="", @@ -64,14 +81,14 @@ def convert_compress_model(model: nn.Module, config: CompressConfig) -> ConvertR print("=" * 80) print(f"[convert] before convert:\n{model}") - # _runtime = NativeDdpRuntime( - # dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10) - # ) + runtime = NativeDdpRuntime( + dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10) + ) # Load hydra config - initialize_hydra_config_for_dir( + hydra_cfg = initialize_hydra_config_for_dir( config_dir=config.hydra_config_dir, - config_name="Llama-3_1-8B", # TODO: Make it configurable + config_name=config.hydra_config_name, overrides=[ f"puzzle_dir={config.puzzle_dir}", f"dataset_path={config.dataset_path}", @@ -81,12 +98,16 @@ def convert_compress_model(model: nn.Module, config: CompressConfig) -> ConvertR # Convert Llama3 model to DeciLM model hf_ckpt_teacher_dir = "ckpts/teacher" convert_llama3_to_decilm( - input_dir=Path(config.puzzle_dir) / hf_ckpt_teacher_dir, # TODO this should be configurable + input_dir=config.input_model_path, output_dir=Path(config.puzzle_dir) / hf_ckpt_teacher_dir, ) # Score_pruning_activations (distributed processing) - # score_pruning_activations.launch_score_activations(hydra_cfg, runtime) + score_pruning_activations.launch_score_activations(hydra_cfg, runtime) + + if runtime.global_rank == 0: + pruning_ckpts.launch_prune_ckpt(hydra_cfg) + runtime.wait_for_everyone() print(f"[convert] after convert:\n{model}") return model, {} diff --git a/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py b/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py index b2f5d3780..002d3f81c 100644 --- a/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py +++ b/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py @@ -13,9 +13,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
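# convert_compress_model() above interleaves distributed steps (activation scoring)
# with single-process steps (checkpoint pruning). The recurring pattern, written out
# as a generic sketch against the runtime interface used in this file:
def _run_on_rank0_then_sync(runtime, step_fn, *args):
    if runtime.global_rank == 0:  # only one process performs the step, e.g. writing pruned ckpts
        step_fn(*args)
    runtime.wait_for_everyone()  # all ranks block until rank 0 has finished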
+import os +from functools import partial from pathlib import Path import pytest +import torch +from _test_utils.torch_dist.dist_utils import spawn_multiprocess_job from experimental.torch._compress.test_compress import ( _create_and_save_small_llama_model, _save_dummy_dataset, @@ -38,6 +42,16 @@ def project_root_path(request: pytest.FixtureRequest) -> Path: # TODO: Remove those instructions once this test runs automatically on CI # def test_nas_convert(project_root_path: Path, tmp_path: Path): + spawn_multiprocess_job( + size=torch.cuda.device_count(), + job=partial(_test_nas_convert_multiprocess_job, project_root_path, tmp_path), + backend="nccl", + ) + + +def _test_nas_convert_multiprocess_job( + project_root_path: Path, tmp_path: Path, rank: int, size: int +): # Register Hydra custom resolvers (needed for config resolution) register_hydra_resolvers() @@ -48,6 +62,7 @@ def test_nas_convert(project_root_path: Path, tmp_path: Path): puzzle_dir = tmp_path dataset_path = puzzle_dir / "dummy_dataset" hydra_config_dir = project_root_path / "tests/experimental/torch/_compress/resources/configs" + hydra_config_name = "Llama-3_1-8B" # Setup puzzle_dir and dataset _setup_puzzle_dir(puzzle_dir) @@ -56,8 +71,7 @@ def test_nas_convert(project_root_path: Path, tmp_path: Path): # Create a small Llama model (input to the mnt.convert() step) tokenizer_path = project_root_path / "tests/experimental/torch/_compress/resources/tokenizer" tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) - hf_ckpt_teacher_dir = "ckpts/teacher" - llama_checkpoint_path = puzzle_dir / hf_ckpt_teacher_dir + llama_checkpoint_path = puzzle_dir / "ckpts/llama" _create_and_save_small_llama_model( llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer ) @@ -72,7 +86,9 @@ def test_nas_convert(project_root_path: Path, tmp_path: Path): ( "compress", { + "input_model_path": str(llama_checkpoint_path), "hydra_config_dir": str(hydra_config_dir), + "hydra_config_name": hydra_config_name, "puzzle_dir": str(puzzle_dir), "dataset_path": str(dataset_path), }, @@ -83,3 +99,11 @@ def test_nas_convert(project_root_path: Path, tmp_path: Path): # # Check assertions # + + # assertions for the score_pruning_activations step 1 + rank = int(os.environ["RANK"]) + rank_filepath = f"pruning/pruning_scores/ffn_iterative/100samples_diverse_mini/rank_{rank}.pth" + assert (puzzle_dir / rank_filepath).is_file() + + # assertions for the pruning_ckpts step 2 + assert (puzzle_dir / "ckpts/ffn_256_attn_no_op").exists() From febab440b714ee745ca4464d50f6d795ef145e63 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Wed, 29 Oct 2025 12:11:29 +0100 Subject: [PATCH 31/81] code clean up. 
Signed-off-by: Daniel Korzekwa --- .../torch/nas/plugins/_compress/test_nas_convert.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py b/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py index 002d3f81c..0c4756bd8 100644 --- a/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py +++ b/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py @@ -38,7 +38,7 @@ def project_root_path(request: pytest.FixtureRequest) -> Path: # -# See tests/gpu/torch/_compress/test_compress.py for instructions on how to run this test +# See tests/experimental/torch/_compress/test_compress.py for instructions on how to run this test # TODO: Remove those instructions once this test runs automatically on CI # def test_nas_convert(project_root_path: Path, tmp_path: Path): @@ -56,8 +56,8 @@ def _test_nas_convert_multiprocess_job( register_hydra_resolvers() # - # Step 1: Setup the puzzle_dir, dataset, hydra_config_dir, and input model - # needed for the mnt.convert() step + # Setup the inputs for the nas.convert() step: puzzle_dir, dataset, + # hydra_config_dir/hydra_config_name, and input model # puzzle_dir = tmp_path dataset_path = puzzle_dir / "dummy_dataset" @@ -68,7 +68,7 @@ def _test_nas_convert_multiprocess_job( _setup_puzzle_dir(puzzle_dir) _save_dummy_dataset(dataset_path) - # Create a small Llama model (input to the mnt.convert() step) + # Create a small Llama model tokenizer_path = project_root_path / "tests/experimental/torch/_compress/resources/tokenizer" tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) llama_checkpoint_path = puzzle_dir / "ckpts/llama" @@ -100,10 +100,10 @@ def _test_nas_convert_multiprocess_job( # Check assertions # - # assertions for the score_pruning_activations step 1 + # assertions for the score_pruning_activations step rank = int(os.environ["RANK"]) rank_filepath = f"pruning/pruning_scores/ffn_iterative/100samples_diverse_mini/rank_{rank}.pth" assert (puzzle_dir / rank_filepath).is_file() - # assertions for the pruning_ckpts step 2 + # assertions for the pruning_ckpts step assert (puzzle_dir / "ckpts/ffn_256_attn_no_op").exists() From 86bf394f41c4842d25d2b7c6287c034aea328768 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Wed, 29 Oct 2025 12:21:59 +0100 Subject: [PATCH 32/81] code clean up Signed-off-by: Daniel Korzekwa --- .../plugins/_compress/compress_nas_plugin.py | 27 +++++++++++++------ 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py b/modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py index 9ba971f45..748f33939 100644 --- a/modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py +++ b/modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py @@ -13,6 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +""" +Compress NAS plugin for the Modelopt framework (based on Puzzle algorithm: https://arxiv.org/abs/2411.19146). 
+""" + import datetime from pathlib import Path @@ -39,36 +43,41 @@ class CompressModel(nn.Module): - pass + pass # No model implementation is needed for the compress mode class CompressConfig(ModeloptBaseConfig): """Configuration for Compress NAS algorithm.""" + # Input model path to compress in the HF format input_model_path: str = ModeloptField( default="", title="", description="", ) + # Hydra config directory containing the search space definition hydra_config_dir: str = ModeloptField( default="", title="", description="", ) + # Hydra config name containing the search space definition hydra_config_name: str = ModeloptField( default="", title="", description="", ) + # Directory to save the compressed model and intermediate results puzzle_dir: str = ModeloptField( default="", title="", description="", ) + # Dataset path to use for scoring in prunining and NAS search dataset_path: str = ModeloptField( default="", title="", @@ -77,10 +86,12 @@ class CompressConfig(ModeloptBaseConfig): def convert_compress_model(model: nn.Module, config: CompressConfig) -> ConvertReturnType: - """Convert the model to a search space model.""" - print("=" * 80) - print(f"[convert] before convert:\n{model}") + """1. Convert the model from HF format to DeciLM format. + 2. Score the pruning activations. + 3. Prune the model and save pruned checkpoints + The output of this step will be used by mnt.search() to perform the NAS search. + """ runtime = NativeDdpRuntime( dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10) ) @@ -96,20 +107,20 @@ def convert_compress_model(model: nn.Module, config: CompressConfig) -> ConvertR ) # Convert Llama3 model to DeciLM model - hf_ckpt_teacher_dir = "ckpts/teacher" + hf_ckpt_teacher_dir = "ckpts/teacher" # TODO: make it configurable convert_llama3_to_decilm( input_dir=config.input_model_path, output_dir=Path(config.puzzle_dir) / hf_ckpt_teacher_dir, ) - # Score_pruning_activations (distributed processing) + # Score_pruning_activations (distributed processing) score_pruning_activations.launch_score_activations(hydra_cfg, runtime) + # Prune the model and save pruned checkpoints if runtime.global_rank == 0: pruning_ckpts.launch_prune_ckpt(hydra_cfg) runtime.wait_for_everyone() - print(f"[convert] after convert:\n{model}") return model, {} @@ -137,7 +148,7 @@ def config_class(self) -> type[ModeloptBaseConfig]: @property def search_algorithm(self) -> type[BaseSearcher]: """Return the associated searcher implementation.""" - raise NotImplementedError("Compress mode does not have a search algorithm.") + raise NotImplementedError("Compress mode does not have a search algorithm yet.") @property def convert(self) -> ConvertEntrypoint: From 86e04a06157dec82e1baf380298ddb75200c239e Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Wed, 29 Oct 2025 14:16:48 +0100 Subject: [PATCH 33/81] create conftest.py with shared test logic for compress tests. 
Signed-off-by: Daniel Korzekwa --- .../experimental/torch/_compress/conftest.py | 120 ++++++++++++++++++ ..._convert_llama3_config_to_decilm_config.py | 21 +-- .../torch/_compress/test_compress.py | 110 ++-------------- 3 files changed, 136 insertions(+), 115 deletions(-) create mode 100644 tests/experimental/torch/_compress/conftest.py diff --git a/tests/experimental/torch/_compress/conftest.py b/tests/experimental/torch/_compress/conftest.py new file mode 100644 index 000000000..4dedf5363 --- /dev/null +++ b/tests/experimental/torch/_compress/conftest.py @@ -0,0 +1,120 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import shutil +from pathlib import Path + +import pytest +import torch +from datasets import Dataset, DatasetDict +from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM, PreTrainedTokenizerBase + + +@pytest.fixture +def project_root_path(request: pytest.FixtureRequest) -> Path: + """Fixture providing the project root path for tests.""" + return Path(request.config.rootpath) + + +def create_and_save_small_llama_model( + output_path: str, vocab_size: int, tokenizer: PreTrainedTokenizerBase +): + """ + Create and save a small Llama model for testing the conversion pipeline. + This mimics having a real Llama checkpoint that needs to be converted. + """ + os.makedirs(output_path, exist_ok=True) + + # Create a minimal Llama config (small for testing) + # Note: intermediate_size must be divisible by 256 per DeciLM config requirements + # Note: hidden_size must give head_dim >= 8 for Flash Attention 2 compatibility + llama_config = LlamaConfig( + vocab_size=vocab_size, + hidden_size=256, # 32 heads times 8 head_dim = 256 (matches bypass config expectations) + intermediate_size=512, # Must be divisible by 256 + num_hidden_layers=2, + num_attention_heads=32, # Matches original test + num_key_value_heads=8, # GQA: 32÷4=8 (matches original n_heads_in_group=4) + max_position_embeddings=512, + rms_norm_eps=1e-5, + rope_theta=10000.0, + attention_bias=False, + hidden_act="silu", + tie_word_embeddings=False, + ) + + # Create and save the Llama model + model = LlamaForCausalLM(llama_config) + model.to(dtype=torch.bfloat16).save_pretrained(output_path) + + # Save tokenizer + tokenizer.save_pretrained(output_path) + + # Save config + llama_config.save_pretrained(output_path) + + +def create_tokenizer(project_root_path: Path) -> PreTrainedTokenizerBase: + """ + Create a tokenizer for the Llama model. 
+ """ + tokenizer_path = project_root_path / "tests/experimental/torch/_compress/resources/tokenizer" + tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) + return tokenizer + + +def setup_puzzle_dir(puzzle_dir: str): + if Path(puzzle_dir).exists(): + shutil.rmtree(puzzle_dir) + Path(puzzle_dir).mkdir(parents=True, exist_ok=True) + + +def save_dummy_dataset(dataset_path: str): + # dummy sample + sample = [ + {"role": "user", "content": "please cite Lorem Ipsum?"}, + { + "role": "assistant", + "content": ( + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed in blandit ante. " + "Sed tempus erat urna, ac elementum nisl facilisis quis. Aliquam consectetur mollis massa, " + "in elementum sem venenatis posuere. Fusce lorem arcu, egestas vel massa sollicitudin, " + "dictum mollis purus. Proin in ullamcorper elit. Nam tellus nisi, volutpat a mattis vel, " + "pretium in purus. Nunc at lectus facilisis risus scelerisque rhoncus eu nec ex. " + "Maecenas semper, tellus non placerat vulputate, urna felis facilisis diam, " + "sit amet vestibulum erat sapien nec libero. Praesent non massa velit. Donec faucibus mi eros. " + "Nam turpis nulla, congue sit amet mi at, porttitor scelerisque elit. Nunc id sodales lorem, " + "nec tincidunt leo. Quisque a neque nec ligula porttitor auctor. " + "Nunc accumsan nunc ac tellus congue vehicula. Praesent tellus eros, luctus non gravida dapibus, " + "faucibus eu ex. Quisque bibendum leo pharetra, tristique est vitae, hendrerit nunc. " + "Duis nec congue dolor. Donec commodo ipsum non efficitur volutpat. " + "Nulla risus nulla, efficitur et urna at, imperdiet sodales lorem. " + "Suspendisse erat est, sollicitudin at nisl tincidunt, vehicula hendrerit lectus. " + "Nam quis nisi ullamcorper, rhoncus massa vel, tempus purus. " + "Duis pulvinar eros vel nulla pellentesque, at dapibus justo laoreet. " + "Praesent tortor orci, vulputate fermentum dapibus nec, feugiat vitae tortor. " + "Donec mollis convallis massa quis iaculis." + ), + }, + ] + + # Prepare train and val splits with sample repeated, 2500 samples are for + # 128 samples with block-size 8192 and LLama3 tokenizer + data = [{"conversation": sample}] * 2500 + + # For train-val splits + data_dict = DatasetDict({"train": Dataset.from_list(data), "valid": Dataset.from_list(data)}) + data_dict.save_to_disk(dataset_path) diff --git a/tests/experimental/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py b/tests/experimental/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py index 03c3c4cd6..a1d897ceb 100644 --- a/tests/experimental/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py +++ b/tests/experimental/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py @@ -14,31 +14,22 @@ # limitations under the License. 
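# The shared helpers above are combined by the GPU tests roughly as follows (a condensed
# sketch; puzzle_dir would be a pytest tmp_path and project_root_path the repo fixture):
from experimental.torch._compress.conftest import (
    create_and_save_small_llama_model,
    create_tokenizer,
    save_dummy_dataset,
    setup_puzzle_dir,
)


def _prepare_compress_test_inputs(project_root_path, puzzle_dir):
    setup_puzzle_dir(puzzle_dir)  # start from a clean scratch directory
    save_dummy_dataset(puzzle_dir / "dummy_dataset")  # tiny train/valid splits
    tokenizer = create_tokenizer(project_root_path)  # Llama tokenizer from test resources
    create_and_save_small_llama_model(  # toy Llama checkpoint that will be compressed
        puzzle_dir / "ckpts/teacher",
        vocab_size=tokenizer.vocab_size,
        tokenizer=tokenizer,
    )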
import json -import os.path as osp from pathlib import Path -import pytest -from experimental.torch._compress.test_compress import _create_and_save_small_llama_model -from transformers import AutoTokenizer +from experimental.torch._compress.conftest import ( + create_and_save_small_llama_model, + create_tokenizer, +) from modelopt.torch._compress.decilm.converters.convert_llama3_to_decilm import ( convert_llama3_to_decilm, ) -@pytest.fixture -def project_root_path(request: pytest.FixtureRequest) -> Path: - return Path(request.config.rootpath) - - def test_convert_llama3_config_to_decilm_config(project_root_path: Path, tmp_path: Path): - tokenizer_path = osp.join( - project_root_path, "tests/experimental/torch/_compress/resources/tokenizer" - ) - tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) - + tokenizer = create_tokenizer(project_root_path) llama_checkpoint_path = tmp_path / "llama_checkpoint" - _create_and_save_small_llama_model( + create_and_save_small_llama_model( llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer ) diff --git a/tests/experimental/torch/_compress/test_compress.py b/tests/experimental/torch/_compress/test_compress.py index db06e6580..f36c9ff6b 100644 --- a/tests/experimental/torch/_compress/test_compress.py +++ b/tests/experimental/torch/_compress/test_compress.py @@ -15,27 +15,23 @@ import datetime import os -import shutil from functools import partial from pathlib import Path -import pytest import torch from _test_utils.torch.distributed.utils import spawn_multiprocess_job -from datasets import Dataset, DatasetDict +from experimental.torch._compress.conftest import ( + create_and_save_small_llama_model, + create_tokenizer, + save_dummy_dataset, + setup_puzzle_dir, +) from puzzle_tools.hydra_utils import register_hydra_resolvers from scripts.convert_llama3_to_decilm import convert_llama3_to_decilm -from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM, PreTrainedTokenizerBase from modelopt.torch._compress import compress from modelopt.torch._compress.runtime import NativeDdpRuntime - -@pytest.fixture -def project_root_path(request: pytest.FixtureRequest) -> Path: - return Path(request.config.rootpath) - - # The e2e test to compress a model based on Local Neural Architecture Search (Mixed Integer Programing NAS search) # using a one-click command. 
# @@ -84,23 +80,19 @@ def _test_compress_multiprocess_job(project_root_path: Path, tmp_path: Path, ran # if rank == 0: # Setup puzzle_dir and dataset - _setup_puzzle_dir(puzzle_dir) - _save_dummy_dataset(dataset_path) + setup_puzzle_dir(puzzle_dir) + save_dummy_dataset(dataset_path) # # Step 1: Create and save a teacher model to compress # This mimics the normal pipeline where we start with a Llama model # - tokenizer_path = ( - project_root_path / "tests/experimental/torch/_compress/resources/tokenizer" - ) - - tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) # Create a small Llama model (not DeciLM) to match the normal conversion pipeline + tokenizer = create_tokenizer(project_root_path) hf_ckpt_teacher_dir = "ckpts/teacher" llama_checkpoint_path = puzzle_dir / hf_ckpt_teacher_dir - _create_and_save_small_llama_model( + create_and_save_small_llama_model( llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer ) @@ -156,85 +148,3 @@ def _test_compress_multiprocess_job(project_root_path: Path, tmp_path: Path, ran runtime.wait_for_everyone() print("PYTEST SUMMARY: test_compress_model() test has finished successfully") - - -def _create_and_save_small_llama_model( - output_path: str, vocab_size: int, tokenizer: PreTrainedTokenizerBase -): - """ - Create and save a small Llama model for testing the conversion pipeline. - This mimics having a real Llama checkpoint that needs to be converted. - """ - os.makedirs(output_path, exist_ok=True) - - # Create a minimal Llama config (small for testing) - # Note: intermediate_size must be divisible by 256 per DeciLM config requirements - # Note: hidden_size must give head_dim >= 8 for Flash Attention 2 compatibility - llama_config = LlamaConfig( - vocab_size=vocab_size, - hidden_size=256, # 32 heads times 8 head_dim = 256 (matches bypass config expectations) - intermediate_size=512, # Must be divisible by 256 - num_hidden_layers=2, - num_attention_heads=32, # Matches original test - num_key_value_heads=8, # GQA: 32÷4=8 (matches original n_heads_in_group=4) - max_position_embeddings=512, - rms_norm_eps=1e-5, - rope_theta=10000.0, - attention_bias=False, - hidden_act="silu", - tie_word_embeddings=False, - ) - - # Create and save the Llama model - model = LlamaForCausalLM(llama_config) - model.to(dtype=torch.bfloat16).save_pretrained(output_path) - - # Save tokenizer - tokenizer.save_pretrained(output_path) - - # Save config - llama_config.save_pretrained(output_path) - - -def _setup_puzzle_dir(puzzle_dir: str): - if Path(puzzle_dir).exists(): - shutil.rmtree(puzzle_dir) - Path(puzzle_dir).mkdir(parents=True, exist_ok=True) - - -def _save_dummy_dataset(dataset_path: str): - # dummy sample - sample = [ - {"role": "user", "content": "please cite Lorem Ipsum?"}, - { - "role": "assistant", - "content": ( - "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed in blandit ante. " - "Sed tempus erat urna, ac elementum nisl facilisis quis. Aliquam consectetur mollis massa, " - "in elementum sem venenatis posuere. Fusce lorem arcu, egestas vel massa sollicitudin, " - "dictum mollis purus. Proin in ullamcorper elit. Nam tellus nisi, volutpat a mattis vel, " - "pretium in purus. Nunc at lectus facilisis risus scelerisque rhoncus eu nec ex. " - "Maecenas semper, tellus non placerat vulputate, urna felis facilisis diam, " - "sit amet vestibulum erat sapien nec libero. Praesent non massa velit. Donec faucibus mi eros. " - "Nam turpis nulla, congue sit amet mi at, porttitor scelerisque elit. 
Nunc id sodales lorem, " - "nec tincidunt leo. Quisque a neque nec ligula porttitor auctor. " - "Nunc accumsan nunc ac tellus congue vehicula. Praesent tellus eros, luctus non gravida dapibus, " - "faucibus eu ex. Quisque bibendum leo pharetra, tristique est vitae, hendrerit nunc. " - "Duis nec congue dolor. Donec commodo ipsum non efficitur volutpat. " - "Nulla risus nulla, efficitur et urna at, imperdiet sodales lorem. " - "Suspendisse erat est, sollicitudin at nisl tincidunt, vehicula hendrerit lectus. " - "Nam quis nisi ullamcorper, rhoncus massa vel, tempus purus. " - "Duis pulvinar eros vel nulla pellentesque, at dapibus justo laoreet. " - "Praesent tortor orci, vulputate fermentum dapibus nec, feugiat vitae tortor. " - "Donec mollis convallis massa quis iaculis." - ), - }, - ] - - # Prepare train and val splits with sample repeated, 2500 samples are for - # 128 samples with block-size 8192 and LLama3 tokenizer - data = [{"conversation": sample}] * 2500 - - # For train-val splits - data_dict = DatasetDict({"train": Dataset.from_list(data), "valid": Dataset.from_list(data)}) - data_dict.save_to_disk(dataset_path) From ae6164423e0539ca4640f35f0ffc6fcd67a9a1b6 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Wed, 29 Oct 2025 14:19:10 +0100 Subject: [PATCH 34/81] code cleanup Signed-off-by: Daniel Korzekwa --- tests/experimental/torch/_compress/test_compress.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/experimental/torch/_compress/test_compress.py b/tests/experimental/torch/_compress/test_compress.py index f36c9ff6b..018b78e1a 100644 --- a/tests/experimental/torch/_compress/test_compress.py +++ b/tests/experimental/torch/_compress/test_compress.py @@ -69,6 +69,7 @@ def _test_compress_multiprocess_job(project_root_path: Path, tmp_path: Path, ran puzzle_dir = tmp_path dataset_path = puzzle_dir / "dummy_dataset" hydra_config_dir = project_root_path / "tests/experimental/torch/_compress/resources/configs" + hydra_config_name = "Llama-3_1-8B" _runtime = NativeDdpRuntime( dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10) @@ -105,7 +106,7 @@ def _test_compress_multiprocess_job(project_root_path: Path, tmp_path: Path, ran # Compress the model using a one-click approach compress.compress( - str(hydra_config_dir), "Llama-3_1-8B", str(puzzle_dir), str(dataset_path), runtime + str(hydra_config_dir), hydra_config_name, str(puzzle_dir), str(dataset_path), runtime ) # From 3778ec21e20146a81410cdc7c2e86253d79ec40d Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Wed, 29 Oct 2025 14:51:49 +0100 Subject: [PATCH 35/81] code refactoring Signed-off-by: Daniel Korzekwa --- .../torch/_compress/test_compress.py | 9 +- .../nas/plugins/_compress/test_nas_convert.py | 103 ++++++++++-------- 2 files changed, 64 insertions(+), 48 deletions(-) diff --git a/tests/experimental/torch/_compress/test_compress.py b/tests/experimental/torch/_compress/test_compress.py index 018b78e1a..1cc948c58 100644 --- a/tests/experimental/torch/_compress/test_compress.py +++ b/tests/experimental/torch/_compress/test_compress.py @@ -66,16 +66,19 @@ def test_compress(project_root_path: Path, tmp_path: Path): def _test_compress_multiprocess_job(project_root_path: Path, tmp_path: Path, rank: int, size: int): register_hydra_resolvers() + # + # The inputs for the compress() algorihm. 
+ # puzzle_dir = tmp_path dataset_path = puzzle_dir / "dummy_dataset" hydra_config_dir = project_root_path / "tests/experimental/torch/_compress/resources/configs" hydra_config_name = "Llama-3_1-8B" - _runtime = NativeDdpRuntime( + runtime = NativeDdpRuntime( dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10) ) - with _runtime as runtime: + with runtime as runtime: # # Test setup # @@ -148,4 +151,4 @@ def _test_compress_multiprocess_job(project_root_path: Path, tmp_path: Path, ran runtime.wait_for_everyone() - print("PYTEST SUMMARY: test_compress_model() test has finished successfully") + print("PYTEST SUMMARY: test_compress_model() test has finished successfully") diff --git a/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py b/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py index 0c4756bd8..6bd0e248a 100644 --- a/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py +++ b/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py @@ -13,22 +13,24 @@ # See the License for the specific language governing permissions and # limitations under the License. +import datetime import os from functools import partial from pathlib import Path import pytest import torch -from _test_utils.torch_dist.dist_utils import spawn_multiprocess_job -from experimental.torch._compress.test_compress import ( - _create_and_save_small_llama_model, - _save_dummy_dataset, - _setup_puzzle_dir, +from _test_utils.torch.distributed.utils import spawn_multiprocess_job +from experimental.torch._compress.conftest import ( + create_and_save_small_llama_model, + create_tokenizer, + save_dummy_dataset, + setup_puzzle_dir, ) from puzzle_tools.hydra_utils import register_hydra_resolvers -from transformers import AutoTokenizer import modelopt.torch.nas as mtn +from modelopt.torch._compress.runtime import NativeDdpRuntime from modelopt.torch.nas.plugins._compress.compress_nas_plugin import CompressModel @@ -56,54 +58,65 @@ def _test_nas_convert_multiprocess_job( register_hydra_resolvers() # - # Setup the inputs for the nas.convert() step: puzzle_dir, dataset, - # hydra_config_dir/hydra_config_name, and input model + # The inputs for the nas.convert() step. 
# puzzle_dir = tmp_path + llama_checkpoint_path = puzzle_dir / "ckpts/llama" dataset_path = puzzle_dir / "dummy_dataset" hydra_config_dir = project_root_path / "tests/experimental/torch/_compress/resources/configs" hydra_config_name = "Llama-3_1-8B" - # Setup puzzle_dir and dataset - _setup_puzzle_dir(puzzle_dir) - _save_dummy_dataset(dataset_path) - - # Create a small Llama model - tokenizer_path = project_root_path / "tests/experimental/torch/_compress/resources/tokenizer" - tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) - llama_checkpoint_path = puzzle_dir / "ckpts/llama" - _create_and_save_small_llama_model( - llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer + runtime = NativeDdpRuntime( + dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10) ) - # - # Run the mnt.convert() step - # - input_model = CompressModel() - mtn.convert( - input_model, - mode=[ - ( - "compress", - { - "input_model_path": str(llama_checkpoint_path), - "hydra_config_dir": str(hydra_config_dir), - "hydra_config_name": hydra_config_name, - "puzzle_dir": str(puzzle_dir), - "dataset_path": str(dataset_path), - }, + with runtime as runtime: + if rank == 0: + # Setup puzzle_dir and dataset + setup_puzzle_dir(puzzle_dir) + save_dummy_dataset(dataset_path) + + # Create a small Llama model + tokenizer = create_tokenizer(project_root_path) + create_and_save_small_llama_model( + llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer ) - ], - ) + runtime.wait_for_everyone() + + # + # Run the mnt.convert() step + # + input_model = CompressModel() + mtn.convert( + input_model, + mode=[ + ( + "compress", + { + "puzzle_dir": str(puzzle_dir), + "input_model_path": str(llama_checkpoint_path), + "hydra_config_dir": str(hydra_config_dir), + "hydra_config_name": hydra_config_name, + "dataset_path": str(dataset_path), + }, + ) + ], + ) + + # + # Check assertions + # + if rank == 0: + # assertions for the score_pruning_activations step + rank = int(os.environ["RANK"]) + rank_filepath = ( + f"pruning/pruning_scores/ffn_iterative/100samples_diverse_mini/rank_{rank}.pth" + ) + assert (puzzle_dir / rank_filepath).is_file() - # - # Check assertions - # + # assertions for the pruning_ckpts step + assert (puzzle_dir / "ckpts/ffn_256_attn_no_op").exists() - # assertions for the score_pruning_activations step - rank = int(os.environ["RANK"]) - rank_filepath = f"pruning/pruning_scores/ffn_iterative/100samples_diverse_mini/rank_{rank}.pth" - assert (puzzle_dir / rank_filepath).is_file() + runtime.wait_for_everyone() - # assertions for the pruning_ckpts step - assert (puzzle_dir / "ckpts/ffn_256_attn_no_op").exists() + print("PYTEST SUMMARY: test_nas_convert() test has finished successfully") From d940000ecb2116dc64e68c08cfc5018cfba05d0c Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Wed, 29 Oct 2025 14:56:30 +0100 Subject: [PATCH 36/81] refactoring Signed-off-by: Daniel Korzekwa --- .../_compress => _compress/nas/plugins}/compress_nas_plugin.py | 0 .../_compress => _compress/nas/plugins}/test_nas_convert.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename modelopt/torch/{nas/plugins/_compress => _compress/nas/plugins}/compress_nas_plugin.py (100%) rename tests/experimental/torch/{nas/plugins/_compress => _compress/nas/plugins}/test_nas_convert.py (98%) diff --git a/modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py b/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py similarity index 100% rename from 
modelopt/torch/nas/plugins/_compress/compress_nas_plugin.py rename to modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py diff --git a/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py b/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py similarity index 98% rename from tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py rename to tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py index 6bd0e248a..81b51dcf3 100644 --- a/tests/experimental/torch/nas/plugins/_compress/test_nas_convert.py +++ b/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py @@ -30,8 +30,8 @@ from puzzle_tools.hydra_utils import register_hydra_resolvers import modelopt.torch.nas as mtn +from modelopt.torch._compress.nas.plugins.compress_nas_plugin import CompressModel from modelopt.torch._compress.runtime import NativeDdpRuntime -from modelopt.torch.nas.plugins._compress.compress_nas_plugin import CompressModel @pytest.fixture From 0bf9a92763e4125b2dd2b23655abb16040e22b9c Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Wed, 29 Oct 2025 15:02:13 +0100 Subject: [PATCH 37/81] move test utilities from conftest.py to test_utils.py Signed-off-by: Daniel Korzekwa --- .../experimental/torch/_compress/conftest.py | 96 -------------- ..._convert_llama3_config_to_decilm_config.py | 2 +- .../_compress/nas/plugins/test_nas_convert.py | 8 +- .../torch/_compress/test_compress.py | 2 +- .../torch/_compress/test_utils.py | 119 ++++++++++++++++++ 5 files changed, 122 insertions(+), 105 deletions(-) create mode 100644 tests/experimental/torch/_compress/test_utils.py diff --git a/tests/experimental/torch/_compress/conftest.py b/tests/experimental/torch/_compress/conftest.py index 4dedf5363..cae1bfbca 100644 --- a/tests/experimental/torch/_compress/conftest.py +++ b/tests/experimental/torch/_compress/conftest.py @@ -13,108 +13,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os -import shutil from pathlib import Path import pytest -import torch -from datasets import Dataset, DatasetDict -from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM, PreTrainedTokenizerBase @pytest.fixture def project_root_path(request: pytest.FixtureRequest) -> Path: """Fixture providing the project root path for tests.""" return Path(request.config.rootpath) - - -def create_and_save_small_llama_model( - output_path: str, vocab_size: int, tokenizer: PreTrainedTokenizerBase -): - """ - Create and save a small Llama model for testing the conversion pipeline. - This mimics having a real Llama checkpoint that needs to be converted. 
- """ - os.makedirs(output_path, exist_ok=True) - - # Create a minimal Llama config (small for testing) - # Note: intermediate_size must be divisible by 256 per DeciLM config requirements - # Note: hidden_size must give head_dim >= 8 for Flash Attention 2 compatibility - llama_config = LlamaConfig( - vocab_size=vocab_size, - hidden_size=256, # 32 heads times 8 head_dim = 256 (matches bypass config expectations) - intermediate_size=512, # Must be divisible by 256 - num_hidden_layers=2, - num_attention_heads=32, # Matches original test - num_key_value_heads=8, # GQA: 32÷4=8 (matches original n_heads_in_group=4) - max_position_embeddings=512, - rms_norm_eps=1e-5, - rope_theta=10000.0, - attention_bias=False, - hidden_act="silu", - tie_word_embeddings=False, - ) - - # Create and save the Llama model - model = LlamaForCausalLM(llama_config) - model.to(dtype=torch.bfloat16).save_pretrained(output_path) - - # Save tokenizer - tokenizer.save_pretrained(output_path) - - # Save config - llama_config.save_pretrained(output_path) - - -def create_tokenizer(project_root_path: Path) -> PreTrainedTokenizerBase: - """ - Create a tokenizer for the Llama model. - """ - tokenizer_path = project_root_path / "tests/experimental/torch/_compress/resources/tokenizer" - tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) - return tokenizer - - -def setup_puzzle_dir(puzzle_dir: str): - if Path(puzzle_dir).exists(): - shutil.rmtree(puzzle_dir) - Path(puzzle_dir).mkdir(parents=True, exist_ok=True) - - -def save_dummy_dataset(dataset_path: str): - # dummy sample - sample = [ - {"role": "user", "content": "please cite Lorem Ipsum?"}, - { - "role": "assistant", - "content": ( - "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed in blandit ante. " - "Sed tempus erat urna, ac elementum nisl facilisis quis. Aliquam consectetur mollis massa, " - "in elementum sem venenatis posuere. Fusce lorem arcu, egestas vel massa sollicitudin, " - "dictum mollis purus. Proin in ullamcorper elit. Nam tellus nisi, volutpat a mattis vel, " - "pretium in purus. Nunc at lectus facilisis risus scelerisque rhoncus eu nec ex. " - "Maecenas semper, tellus non placerat vulputate, urna felis facilisis diam, " - "sit amet vestibulum erat sapien nec libero. Praesent non massa velit. Donec faucibus mi eros. " - "Nam turpis nulla, congue sit amet mi at, porttitor scelerisque elit. Nunc id sodales lorem, " - "nec tincidunt leo. Quisque a neque nec ligula porttitor auctor. " - "Nunc accumsan nunc ac tellus congue vehicula. Praesent tellus eros, luctus non gravida dapibus, " - "faucibus eu ex. Quisque bibendum leo pharetra, tristique est vitae, hendrerit nunc. " - "Duis nec congue dolor. Donec commodo ipsum non efficitur volutpat. " - "Nulla risus nulla, efficitur et urna at, imperdiet sodales lorem. " - "Suspendisse erat est, sollicitudin at nisl tincidunt, vehicula hendrerit lectus. " - "Nam quis nisi ullamcorper, rhoncus massa vel, tempus purus. " - "Duis pulvinar eros vel nulla pellentesque, at dapibus justo laoreet. " - "Praesent tortor orci, vulputate fermentum dapibus nec, feugiat vitae tortor. " - "Donec mollis convallis massa quis iaculis." 
- ), - }, - ] - - # Prepare train and val splits with sample repeated, 2500 samples are for - # 128 samples with block-size 8192 and LLama3 tokenizer - data = [{"conversation": sample}] * 2500 - - # For train-val splits - data_dict = DatasetDict({"train": Dataset.from_list(data), "valid": Dataset.from_list(data)}) - data_dict.save_to_disk(dataset_path) diff --git a/tests/experimental/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py b/tests/experimental/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py index a1d897ceb..92dad84e4 100644 --- a/tests/experimental/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py +++ b/tests/experimental/torch/_compress/decilm/converters/test_convert_llama3_config_to_decilm_config.py @@ -16,7 +16,7 @@ import json from pathlib import Path -from experimental.torch._compress.conftest import ( +from experimental.torch._compress.test_utils import ( create_and_save_small_llama_model, create_tokenizer, ) diff --git a/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py b/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py index 81b51dcf3..4a416c833 100644 --- a/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py +++ b/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py @@ -18,10 +18,9 @@ from functools import partial from pathlib import Path -import pytest import torch from _test_utils.torch.distributed.utils import spawn_multiprocess_job -from experimental.torch._compress.conftest import ( +from experimental.torch._compress.test_utils import ( create_and_save_small_llama_model, create_tokenizer, save_dummy_dataset, @@ -34,11 +33,6 @@ from modelopt.torch._compress.runtime import NativeDdpRuntime -@pytest.fixture -def project_root_path(request: pytest.FixtureRequest) -> Path: - return Path(request.config.rootpath) - - # # See tests/experimental/torch/_compress/test_compress.py for instructions on how to run this test # TODO: Remove those instructions once this test runs automatically on CI diff --git a/tests/experimental/torch/_compress/test_compress.py b/tests/experimental/torch/_compress/test_compress.py index 1cc948c58..dd9bac9c5 100644 --- a/tests/experimental/torch/_compress/test_compress.py +++ b/tests/experimental/torch/_compress/test_compress.py @@ -20,7 +20,7 @@ import torch from _test_utils.torch.distributed.utils import spawn_multiprocess_job -from experimental.torch._compress.conftest import ( +from experimental.torch._compress.test_utils import ( create_and_save_small_llama_model, create_tokenizer, save_dummy_dataset, diff --git a/tests/experimental/torch/_compress/test_utils.py b/tests/experimental/torch/_compress/test_utils.py new file mode 100644 index 000000000..21ca622da --- /dev/null +++ b/tests/experimental/torch/_compress/test_utils.py @@ -0,0 +1,119 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import shutil +from pathlib import Path + +import torch +from datasets import Dataset, DatasetDict +from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM, PreTrainedTokenizerBase + + +def create_and_save_small_llama_model( + output_path: str, vocab_size: int, tokenizer: PreTrainedTokenizerBase +): + """ + Create and save a small Llama model for testing the conversion pipeline. + This mimics having a real Llama checkpoint that needs to be converted. + """ + os.makedirs(output_path, exist_ok=True) + + # Create a minimal Llama config (small for testing) + # Note: intermediate_size must be divisible by 256 per DeciLM config requirements + # Note: hidden_size must give head_dim >= 8 for Flash Attention 2 compatibility + llama_config = LlamaConfig( + vocab_size=vocab_size, + hidden_size=256, # 32 heads times 8 head_dim = 256 (matches bypass config expectations) + intermediate_size=512, # Must be divisible by 256 + num_hidden_layers=2, + num_attention_heads=32, # Matches original test + num_key_value_heads=8, # GQA: 32÷4=8 (matches original n_heads_in_group=4) + max_position_embeddings=512, + rms_norm_eps=1e-5, + rope_theta=10000.0, + attention_bias=False, + hidden_act="silu", + tie_word_embeddings=False, + ) + + # Create and save the Llama model + model = LlamaForCausalLM(llama_config) + model.to(dtype=torch.bfloat16).save_pretrained(output_path) + + # Save tokenizer + tokenizer.save_pretrained(output_path) + + # Save config + llama_config.save_pretrained(output_path) + + +def create_tokenizer(project_root_path: Path) -> PreTrainedTokenizerBase: + """ + Create a tokenizer for the Llama model. + """ + tokenizer_path = project_root_path / "tests/experimental/torch/_compress/resources/tokenizer" + tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) + return tokenizer + + +def setup_puzzle_dir(puzzle_dir: str): + """ + Setup puzzle directory by removing existing directory and creating a new one. + """ + if Path(puzzle_dir).exists(): + shutil.rmtree(puzzle_dir) + Path(puzzle_dir).mkdir(parents=True, exist_ok=True) + + +def save_dummy_dataset(dataset_path: str): + """ + Save a dummy dataset for testing purposes. + """ + # dummy sample + sample = [ + {"role": "user", "content": "please cite Lorem Ipsum?"}, + { + "role": "assistant", + "content": ( + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed in blandit ante. " + "Sed tempus erat urna, ac elementum nisl facilisis quis. Aliquam consectetur mollis massa, " + "in elementum sem venenatis posuere. Fusce lorem arcu, egestas vel massa sollicitudin, " + "dictum mollis purus. Proin in ullamcorper elit. Nam tellus nisi, volutpat a mattis vel, " + "pretium in purus. Nunc at lectus facilisis risus scelerisque rhoncus eu nec ex. " + "Maecenas semper, tellus non placerat vulputate, urna felis facilisis diam, " + "sit amet vestibulum erat sapien nec libero. Praesent non massa velit. Donec faucibus mi eros. " + "Nam turpis nulla, congue sit amet mi at, porttitor scelerisque elit. Nunc id sodales lorem, " + "nec tincidunt leo. Quisque a neque nec ligula porttitor auctor. " + "Nunc accumsan nunc ac tellus congue vehicula. Praesent tellus eros, luctus non gravida dapibus, " + "faucibus eu ex. Quisque bibendum leo pharetra, tristique est vitae, hendrerit nunc. " + "Duis nec congue dolor. Donec commodo ipsum non efficitur volutpat. " + "Nulla risus nulla, efficitur et urna at, imperdiet sodales lorem. 
" + "Suspendisse erat est, sollicitudin at nisl tincidunt, vehicula hendrerit lectus. " + "Nam quis nisi ullamcorper, rhoncus massa vel, tempus purus. " + "Duis pulvinar eros vel nulla pellentesque, at dapibus justo laoreet. " + "Praesent tortor orci, vulputate fermentum dapibus nec, feugiat vitae tortor. " + "Donec mollis convallis massa quis iaculis." + ), + }, + ] + + # Prepare train and val splits with sample repeated, 2500 samples are for + # 128 samples with block-size 8192 and LLama3 tokenizer + data = [{"conversation": sample}] * 2500 + + # For train-val splits + data_dict = DatasetDict({"train": Dataset.from_list(data), "valid": Dataset.from_list(data)}) + data_dict.save_to_disk(dataset_path) From b56df9a2746297389f3699a8aa628c488812ef4d Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Wed, 29 Oct 2025 15:06:41 +0100 Subject: [PATCH 38/81] Improve comments Signed-off-by: Daniel Korzekwa --- .../torch/_compress/nas/plugins/compress_nas_plugin.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py b/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py index 748f33939..7b7acbed6 100644 --- a/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py +++ b/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py @@ -162,5 +162,8 @@ def restore(self) -> RestoreEntrypoint: @property def export_mode(self) -> str | None: - """The mode that corresponds to the export mode of this mode.""" - return "export" + """The mode that corresponds to the export mode. + For now, this will be a no-op as there is no modelopt's concept of search space defined + for the compress algorithm. + """ + return "export_nas" From 9bfcc21a91a0f7a8d4a2e568941d289979e63ab6 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Wed, 29 Oct 2025 19:18:01 +0100 Subject: [PATCH 39/81] Added TODO. Signed-off-by: Daniel Korzekwa --- tests/experimental/torch/_compress/test_compress.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/experimental/torch/_compress/test_compress.py b/tests/experimental/torch/_compress/test_compress.py index dd9bac9c5..a95cd0be5 100644 --- a/tests/experimental/torch/_compress/test_compress.py +++ b/tests/experimental/torch/_compress/test_compress.py @@ -94,8 +94,9 @@ def _test_compress_multiprocess_job(project_root_path: Path, tmp_path: Path, ran # Create a small Llama model (not DeciLM) to match the normal conversion pipeline tokenizer = create_tokenizer(project_root_path) - hf_ckpt_teacher_dir = "ckpts/teacher" - llama_checkpoint_path = puzzle_dir / hf_ckpt_teacher_dir + # TODO: change it to "ckpts/llama" once the conversion script is fixed + # Currently, the build replacement library step will fail with such a path. 
+ llama_checkpoint_path = puzzle_dir / "ckpts/teacher" create_and_save_small_llama_model( llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer ) @@ -103,7 +104,7 @@ def _test_compress_multiprocess_job(project_root_path: Path, tmp_path: Path, ran # Use the full conversion pipeline (matches normal usage) convert_llama3_to_decilm( input_dir=llama_checkpoint_path, - output_dir=llama_checkpoint_path, + output_dir=puzzle_dir / "ckpts/teacher", ) runtime.wait_for_everyone() From 1dc89c44c67e7b7797bcce339f7c7047d371514a Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Wed, 29 Oct 2025 23:03:41 +0100 Subject: [PATCH 40/81] Implement mtn.search() for the compress algorithm Signed-off-by: Daniel Korzekwa --- .../nas/plugins/compress_nas_plugin.py | 48 +++++- .../_compress/nas/plugins/test_nas_search.py | 141 ++++++++++++++++++ 2 files changed, 187 insertions(+), 2 deletions(-) create mode 100644 tests/experimental/torch/_compress/nas/plugins/test_nas_search.py diff --git a/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py b/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py index 7b7acbed6..178a6b55d 100644 --- a/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py +++ b/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py @@ -20,8 +20,11 @@ import datetime from pathlib import Path +import build_library_and_stats +import mip_and_realize_models import pruning_ckpts import score_pruning_activations +import scoring import torch from scripts.convert_llama3_to_decilm import convert_llama3_to_decilm from torch import nn @@ -36,7 +39,7 @@ ModeDescriptor, RestoreEntrypoint, ) -from modelopt.torch.opt.searcher import BaseSearcher +from modelopt.torch.opt.searcher import BaseSearcher, SearchStateDict # TODO Move initialize_hydra_config_for_dir from tests to main from tests.utils.test_utils import initialize_hydra_config_for_dir @@ -96,6 +99,12 @@ def convert_compress_model(model: nn.Module, config: CompressConfig) -> ConvertR dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10) ) + # Required for mtn.search() to read NAS configuration + model.hydra_config_dir = config.hydra_config_dir + model.hydra_config_name = config.hydra_config_name + model.puzzle_dir = config.puzzle_dir + model.dataset_path = config.dataset_path + # Load hydra config hydra_cfg = initialize_hydra_config_for_dir( config_dir=config.hydra_config_dir, @@ -148,7 +157,7 @@ def config_class(self) -> type[ModeloptBaseConfig]: @property def search_algorithm(self) -> type[BaseSearcher]: """Return the associated searcher implementation.""" - raise NotImplementedError("Compress mode does not have a search algorithm yet.") + return CompressSearcher @property def convert(self) -> ConvertEntrypoint: @@ -167,3 +176,38 @@ def export_mode(self) -> str | None: for the compress algorithm. 
""" return "export_nas" + + +class CompressSearcher(BaseSearcher): + """Runs NAS search for the Compress mode.""" + + @property + def default_state_dict(self) -> SearchStateDict: + """Not needed for the compress mode as we are not saving any model state""" + return {} + + def run_search(self) -> None: + runtime = NativeDdpRuntime( + dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10) + ) + + # Load hydra config + hydra_cfg = initialize_hydra_config_for_dir( + config_dir=self.model.hydra_config_dir, + config_name=self.model.hydra_config_name, + overrides=[ + f"puzzle_dir={self.model.puzzle_dir}", + f"dataset_path={self.model.dataset_path}", + ], + ) + + # Build_library_and_stats (single process) + if runtime.global_rank == 0: + build_library_and_stats.launch_build_library_and_stats(hydra_cfg) + runtime.wait_for_everyone() + + # Calc_one_block_scores (distributed processing) + scoring.launch_scoring(hydra_cfg, runtime) + + # mip_and_realize_models (distributed processing) + mip_and_realize_models.launch_mip_and_realize_model(hydra_cfg, runtime) diff --git a/tests/experimental/torch/_compress/nas/plugins/test_nas_search.py b/tests/experimental/torch/_compress/nas/plugins/test_nas_search.py new file mode 100644 index 000000000..6b6cd5a24 --- /dev/null +++ b/tests/experimental/torch/_compress/nas/plugins/test_nas_search.py @@ -0,0 +1,141 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# See tests/experimental/torch/_compress/test_compress.py for instructions on how to run this test +# TODO: Remove those instructions once this test runs automatically on CI +# +import datetime +from functools import partial +from pathlib import Path + +import torch +from _test_utils.torch.distributed.utils import spawn_multiprocess_job +from experimental.torch._compress.test_utils import ( + create_and_save_small_llama_model, + create_tokenizer, + save_dummy_dataset, + setup_puzzle_dir, +) +from puzzle_tools.hydra_utils import register_hydra_resolvers + +import modelopt.torch.nas as mtn +from modelopt.torch._compress.nas.plugins.compress_nas_plugin import CompressModel +from modelopt.torch._compress.runtime import NativeDdpRuntime + + +def test_nas_search(project_root_path: Path, tmp_path: Path): + spawn_multiprocess_job( + size=torch.cuda.device_count(), + job=partial(_test_nas_search_multiprocess_job, project_root_path, tmp_path), + backend="nccl", + ) + + +def _test_nas_search_multiprocess_job( + project_root_path: Path, tmp_path: Path, rank: int, size: int +): + # Register Hydra custom resolvers (needed for config resolution) + register_hydra_resolvers() + + # + # The inputs for the nas.convert()/nas.search() steps. 
+ # + puzzle_dir = tmp_path + # TODO: change it to "ckpts/llama" once the conversion script is fixed (internal NVidia modelopt bug: issues/17) + llama_checkpoint_path = puzzle_dir / "ckpts/teacher" + dataset_path = puzzle_dir / "dummy_dataset" + hydra_config_dir = project_root_path / "tests/experimental/torch/_compress/resources/configs" + hydra_config_name = "Llama-3_1-8B" + + runtime = NativeDdpRuntime( + dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10) + ) + + with runtime as runtime: + if rank == 0: + # Setup puzzle_dir and dataset + setup_puzzle_dir(puzzle_dir) + save_dummy_dataset(dataset_path) + + # Create a small Llama model + tokenizer = create_tokenizer(project_root_path) + create_and_save_small_llama_model( + llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer + ) + runtime.wait_for_everyone() + + # + # Run the mnt.convert() step + # + input_model = CompressModel() + + # Converted model is the same as the input model, but with the search space set up: + # (HF model imported to DeciLM format, pruning scores pruned checkpoints and are saved) + converted_model = mtn.convert( + input_model, + mode=[ + ( + "compress", + { + "puzzle_dir": str(puzzle_dir), + "input_model_path": str(llama_checkpoint_path), + "hydra_config_dir": str(hydra_config_dir), + "hydra_config_name": hydra_config_name, + "dataset_path": str(dataset_path), + }, + ) + ], + ) + + # + # Run the mnt.search() step + # + mtn.search( + converted_model, + constraints={}, # this is not used as the search space is defined in the hydra config + dummy_input=None, # Not used + config={}, # this is not used as the search space is defined in the hydra config + ) + + # + # Check assertions for mnt.search() step + # + if rank == 0: + # assertions for the build_library_and_stats step + assert (puzzle_dir / "replacement_library.json").is_file() + assert (puzzle_dir / "subblock_stats.json").is_file() + + # assertions for the scoring step + solution_0_filepath = ( + puzzle_dir / "single_sequence_replacement_solutions--validation/solution_0.json" + ) + + assert solution_0_filepath.exists() + + # assertions for the mip_and_realize_models step + solution_0_ckpt_config_path = ( + puzzle_dir + / "mip/puzzle_solutions/target_memory_780000MiB/solutions--checkpoints/solution_0/config.json" + ) + + assert solution_0_ckpt_config_path.exists() + assert ( + puzzle_dir / "mip/puzzle_solutions/target_memory_780000MiB/solutions.json" + ).exists() + + runtime.wait_for_everyone() + + print("PYTEST SUMMARY: test_nas_search() test has finished successfully") From 6bfa3ece535853325c1ec343095312d6f8f5fe8e Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Wed, 29 Oct 2025 23:17:29 +0100 Subject: [PATCH 41/81] Refactoring Signed-off-by: Daniel Korzekwa --- .../_compress/nas/plugins/test_nas_convert.py | 95 ++++++++++--------- 1 file changed, 52 insertions(+), 43 deletions(-) diff --git a/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py b/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py index 4a416c833..ce285e49d 100644 --- a/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py +++ b/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py @@ -48,54 +48,12 @@ def test_nas_convert(project_root_path: Path, tmp_path: Path): def _test_nas_convert_multiprocess_job( project_root_path: Path, tmp_path: Path, rank: int, size: int ): - # Register Hydra custom resolvers (needed for config resolution) - register_hydra_resolvers() - - # - # The inputs for the 
nas.convert() step. - # - puzzle_dir = tmp_path - llama_checkpoint_path = puzzle_dir / "ckpts/llama" - dataset_path = puzzle_dir / "dummy_dataset" - hydra_config_dir = project_root_path / "tests/experimental/torch/_compress/resources/configs" - hydra_config_name = "Llama-3_1-8B" - runtime = NativeDdpRuntime( dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10) ) with runtime as runtime: - if rank == 0: - # Setup puzzle_dir and dataset - setup_puzzle_dir(puzzle_dir) - save_dummy_dataset(dataset_path) - - # Create a small Llama model - tokenizer = create_tokenizer(project_root_path) - create_and_save_small_llama_model( - llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer - ) - runtime.wait_for_everyone() - - # - # Run the mnt.convert() step - # - input_model = CompressModel() - mtn.convert( - input_model, - mode=[ - ( - "compress", - { - "puzzle_dir": str(puzzle_dir), - "input_model_path": str(llama_checkpoint_path), - "hydra_config_dir": str(hydra_config_dir), - "hydra_config_name": hydra_config_name, - "dataset_path": str(dataset_path), - }, - ) - ], - ) + converted_model, puzzle_dir = run_nas_convert(project_root_path, tmp_path, rank, runtime) # # Check assertions @@ -114,3 +72,54 @@ def _test_nas_convert_multiprocess_job( runtime.wait_for_everyone() print("PYTEST SUMMARY: test_nas_convert() test has finished successfully") + + +def run_nas_convert( + project_root_path: Path, + tmp_path: Path, + rank: int, + runtime, +): + # Register Hydra custom resolvers (needed for config resolution) + register_hydra_resolvers() + + # The inputs for the nas.convert() step. + # + puzzle_dir = tmp_path + llama_checkpoint_path = puzzle_dir / "ckpts/llama" + dataset_path = puzzle_dir / "dummy_dataset" + hydra_config_dir = project_root_path / "tests/experimental/torch/_compress/resources/configs" + hydra_config_name = "Llama-3_1-8B" + + if rank == 0: + # Setup puzzle_dir and dataset + setup_puzzle_dir(puzzle_dir) + save_dummy_dataset(dataset_path) + + # Create a small Llama model + tokenizer = create_tokenizer(project_root_path) + create_and_save_small_llama_model( + llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer + ) + runtime.wait_for_everyone() + + # Run the mnt.convert() step + # + input_model = CompressModel() + converted_model = mtn.convert( + input_model, + mode=[ + ( + "compress", + { + "puzzle_dir": str(puzzle_dir), + "input_model_path": str(llama_checkpoint_path), + "hydra_config_dir": str(hydra_config_dir), + "hydra_config_name": hydra_config_name, + "dataset_path": str(dataset_path), + }, + ) + ], + ) + + return converted_model, puzzle_dir From 6d45e3342241e2f727910c6e3d0ad16ea660859d Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Wed, 29 Oct 2025 23:22:49 +0100 Subject: [PATCH 42/81] code refactoring Signed-off-by: Daniel Korzekwa --- .../_compress/nas/plugins/test_nas_convert.py | 3 +- .../_compress/nas/plugins/test_nas_search.py | 57 +------------------ 2 files changed, 4 insertions(+), 56 deletions(-) diff --git a/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py b/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py index ce285e49d..1bd588582 100644 --- a/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py +++ b/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py @@ -86,7 +86,8 @@ def run_nas_convert( # The inputs for the nas.convert() step. 
# puzzle_dir = tmp_path - llama_checkpoint_path = puzzle_dir / "ckpts/llama" + # TODO: change it to "ckpts/llama" once the conversion script is fixed (internal NVidia modelopt bug: issues/17) + llama_checkpoint_path = puzzle_dir / "ckpts/teacher" dataset_path = puzzle_dir / "dummy_dataset" hydra_config_dir = project_root_path / "tests/experimental/torch/_compress/resources/configs" hydra_config_name = "Llama-3_1-8B" diff --git a/tests/experimental/torch/_compress/nas/plugins/test_nas_search.py b/tests/experimental/torch/_compress/nas/plugins/test_nas_search.py index 6b6cd5a24..c21f3fa1b 100644 --- a/tests/experimental/torch/_compress/nas/plugins/test_nas_search.py +++ b/tests/experimental/torch/_compress/nas/plugins/test_nas_search.py @@ -23,16 +23,9 @@ import torch from _test_utils.torch.distributed.utils import spawn_multiprocess_job -from experimental.torch._compress.test_utils import ( - create_and_save_small_llama_model, - create_tokenizer, - save_dummy_dataset, - setup_puzzle_dir, -) -from puzzle_tools.hydra_utils import register_hydra_resolvers +from experimental.torch._compress.nas.plugins.test_nas_convert import run_nas_convert import modelopt.torch.nas as mtn -from modelopt.torch._compress.nas.plugins.compress_nas_plugin import CompressModel from modelopt.torch._compress.runtime import NativeDdpRuntime @@ -47,58 +40,12 @@ def test_nas_search(project_root_path: Path, tmp_path: Path): def _test_nas_search_multiprocess_job( project_root_path: Path, tmp_path: Path, rank: int, size: int ): - # Register Hydra custom resolvers (needed for config resolution) - register_hydra_resolvers() - - # - # The inputs for the nas.convert()/nas.search() steps. - # - puzzle_dir = tmp_path - # TODO: change it to "ckpts/llama" once the conversion script is fixed (internal NVidia modelopt bug: issues/17) - llama_checkpoint_path = puzzle_dir / "ckpts/teacher" - dataset_path = puzzle_dir / "dummy_dataset" - hydra_config_dir = project_root_path / "tests/experimental/torch/_compress/resources/configs" - hydra_config_name = "Llama-3_1-8B" - runtime = NativeDdpRuntime( dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10) ) with runtime as runtime: - if rank == 0: - # Setup puzzle_dir and dataset - setup_puzzle_dir(puzzle_dir) - save_dummy_dataset(dataset_path) - - # Create a small Llama model - tokenizer = create_tokenizer(project_root_path) - create_and_save_small_llama_model( - llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer - ) - runtime.wait_for_everyone() - - # - # Run the mnt.convert() step - # - input_model = CompressModel() - - # Converted model is the same as the input model, but with the search space set up: - # (HF model imported to DeciLM format, pruning scores pruned checkpoints and are saved) - converted_model = mtn.convert( - input_model, - mode=[ - ( - "compress", - { - "puzzle_dir": str(puzzle_dir), - "input_model_path": str(llama_checkpoint_path), - "hydra_config_dir": str(hydra_config_dir), - "hydra_config_name": hydra_config_name, - "dataset_path": str(dataset_path), - }, - ) - ], - ) + converted_model, puzzle_dir = run_nas_convert(project_root_path, tmp_path, rank, runtime) # # Run the mnt.search() step From f9e09d928dd7e7367173dd77251367dcd83990e8 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Thu, 30 Oct 2025 12:58:47 +0100 Subject: [PATCH 43/81] Correct import paths Signed-off-by: Daniel Korzekwa --- modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py | 4 +++- tests/experimental/torch/_compress/test_compress.py | 4 +++- 2 
files changed, 6 insertions(+), 2 deletions(-) diff --git a/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py b/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py index 178a6b55d..026f1478b 100644 --- a/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py +++ b/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py @@ -26,9 +26,11 @@ import score_pruning_activations import scoring import torch -from scripts.convert_llama3_to_decilm import convert_llama3_to_decilm from torch import nn +from modelopt.torch._compress.decilm.converters.convert_llama3_to_decilm import ( + convert_llama3_to_decilm, +) from modelopt.torch._compress.runtime import NativeDdpRuntime from modelopt.torch.nas.conversion import NASModeRegistry from modelopt.torch.opt.config import ModeloptBaseConfig, ModeloptField diff --git a/tests/experimental/torch/_compress/test_compress.py b/tests/experimental/torch/_compress/test_compress.py index a95cd0be5..bc8f153dd 100644 --- a/tests/experimental/torch/_compress/test_compress.py +++ b/tests/experimental/torch/_compress/test_compress.py @@ -27,9 +27,11 @@ setup_puzzle_dir, ) from puzzle_tools.hydra_utils import register_hydra_resolvers -from scripts.convert_llama3_to_decilm import convert_llama3_to_decilm from modelopt.torch._compress import compress +from modelopt.torch._compress.decilm.converters.convert_llama3_to_decilm import ( + convert_llama3_to_decilm, +) from modelopt.torch._compress.runtime import NativeDdpRuntime # The e2e test to compress a model based on Local Neural Architecture Search (Mixed Integer Programing NAS search) From a0cfd13564ed2ed6f738f5ff4cd828228d85509a Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Thu, 30 Oct 2025 13:32:25 +0100 Subject: [PATCH 44/81] Change llama_checkpoint_path, can't be inside of ckpts folder Signed-off-by: Daniel Korzekwa --- .../torch/_compress/nas/plugins/test_nas_convert.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py b/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py index 1bd588582..3c8e0ebe6 100644 --- a/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py +++ b/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py @@ -86,8 +86,7 @@ def run_nas_convert( # The inputs for the nas.convert() step. 
# puzzle_dir = tmp_path - # TODO: change it to "ckpts/llama" once the conversion script is fixed (internal NVidia modelopt bug: issues/17) - llama_checkpoint_path = puzzle_dir / "ckpts/teacher" + llama_checkpoint_path = puzzle_dir / "input_model/llama" dataset_path = puzzle_dir / "dummy_dataset" hydra_config_dir = project_root_path / "tests/experimental/torch/_compress/resources/configs" hydra_config_name = "Llama-3_1-8B" From 2c2995c99c04a3a09f5c2292da68e46bbc9a8ce0 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Fri, 31 Oct 2025 11:44:35 +0100 Subject: [PATCH 45/81] Initial commit for compress tutorial Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 0 examples/compress/main.py | 15 +++++++++++++++ 2 files changed, 15 insertions(+) create mode 100644 examples/compress/README.md create mode 100644 examples/compress/main.py diff --git a/examples/compress/README.md b/examples/compress/README.md new file mode 100644 index 000000000..e69de29bb diff --git a/examples/compress/main.py b/examples/compress/main.py new file mode 100644 index 000000000..47f1c65a1 --- /dev/null +++ b/examples/compress/main.py @@ -0,0 +1,15 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + From b152689dc2908527d481adeab8016a939b495179 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Fri, 31 Oct 2025 16:12:29 +0100 Subject: [PATCH 46/81] Update compress tutorial and implement main.py for compress tutorial. Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 36 ++++++++++++++++++ examples/compress/main.py | 75 +++++++++++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+) diff --git a/examples/compress/README.md b/examples/compress/README.md index e69de29bb..f123e2ac6 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -0,0 +1,36 @@ +# Compress Algorithm Tutorial + +This tutorial demonstrates how to compress large language models using the compress algorithm based on the [Puzzle paper](https://arxiv.org/abs/2411.19146). + +In this example, we compress Llama 3.2 1B by searching for the optimal `ffn_intermediate_size` across MLP layers, resulting in a heterogeneous architecture while reducing GPU memory usage by 20%. + +## Compress the Model + +```bash +# TODO +torchrun examples/compress/main.py +``` + +## Evaluate Model Accuracy + +```bash +# TODO +``` + +## Re-run MIP Search with Different Memory Constraints + +```bash +# TODO +``` + +## Deploy to TensorRT-LLM + +```bash +# TODO +``` + +## Export to NeMo for Knowledge Distillation + +```bash +# TODO +``` diff --git a/examples/compress/main.py b/examples/compress/main.py index 47f1c65a1..e0fa74db0 100644 --- a/examples/compress/main.py +++ b/examples/compress/main.py @@ -13,3 +13,78 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+""" +Main script for running the compress algorithm on large language models (based on Puzzle paper https://arxiv.org/abs/2411.19146). + +This script provides two modes: +1. Default mode: Runs the full compression pipeline +2. MIP-only mode: Runs only the MIP search and realize models phase + +Usage: + # Full compression pipeline + torchrun main.py --config ./configs/llama_3.2_1B_pruneffn_memory.yaml + + # Only MIP search and realize models phase + torchrun main.py --config ./configs/llama_3.2_1B_pruneffn_memory.yaml --mip-only +""" + +import argparse +from pathlib import Path + + +def parse_args(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser( + description="Compress large language models using the Compress algorithm (based on Puzzle paper https://arxiv.org/abs/2411.19146)" + ) + parser.add_argument( + "--config", + type=str, + required=True, + help="Path to the main config YAML file (e.g., ./configs/llama_3.2_1B_pruneffn_memory.yaml)", + ) + parser.add_argument( + "--mip-only", + action="store_true", + help="Run only the MIP search and realize models phase (skip pruning and NAS scoring)", + ) + + return parser.parse_args() + + +def run_full_compress(hydra_config_path: str): + """Run the full compression pipeline. + + Args: + config_path: Path to the YAML configuration file + """ + hydra_config_path = Path(hydra_config_path).resolve() + # config_dir = str(hydra_config_path.parent) + # config_name = hydra_config_path.stem + + +def run_mip_only(hydra_config_path: str): + """Run only the MIP search and realize models phase. + + This assumes that pruning, replacement library building, NAS scoring, and subblock stats calculation + have already been completed. + + Args: + config_path: Path to the YAML configuration file + """ + hydra_config_path = Path(hydra_config_path).resolve() + # config_dir = str(hydra_config_path.parent) + # config_name = hydra_config_path.stem + + +def main(): + args = parse_args() + + if args.mip_only: + run_mip_only(hydra_config_path=args.config) + else: + run_full_compress(hydra_config_path=args.config) + + +if __name__ == "__main__": + main() From 24e30e6214fb709a3852fbcbc67fd5689fe4814e Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Fri, 31 Oct 2025 16:15:06 +0100 Subject: [PATCH 47/81] Update compress tutorial Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/examples/compress/README.md b/examples/compress/README.md index f123e2ac6..b42db37fa 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -7,8 +7,8 @@ In this example, we compress Llama 3.2 1B by searching for the optimal `ffn_inte ## Compress the Model ```bash -# TODO -torchrun examples/compress/main.py +torchrun --nproc_per_node=8 examples/compress/main.py \ + --config ./examples/compress/configs/llama_3.2_1B_pruneffn_memory.yaml ``` ## Evaluate Model Accuracy @@ -19,10 +19,16 @@ torchrun examples/compress/main.py ## Re-run MIP Search with Different Memory Constraints +If you want to try different memory constraints without re-running the expensive pruning and scoring steps, use the `--mip-only` flag: + ```bash -# TODO +torchrun --nproc_per_node=8 examples/compress/main.py \ + --config ./examples/compress/configs/llama_3.2_1B_pruneffn_memory.yaml \ + --mip-only ``` +This assumes pruning, replacement library building, NAS scoring, and subblock stats calculation have already been completed. 
+ ## Deploy to TensorRT-LLM ```bash From 21f115e52173489b5f972a7f30f7eec2a292ecf3 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Fri, 31 Oct 2025 18:50:46 +0100 Subject: [PATCH 48/81] Create a yaml file for llama 3.2-1B model compression Signed-off-by: Daniel Korzekwa --- .../llama_3.2_1B_pruneffn_memory.yaml | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 examples/compress/configs/llama_3.2_1B_pruneffn_memory/llama_3.2_1B_pruneffn_memory.yaml diff --git a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/llama_3.2_1B_pruneffn_memory.yaml b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/llama_3.2_1B_pruneffn_memory.yaml new file mode 100644 index 000000000..bec15d2f7 --- /dev/null +++ b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/llama_3.2_1B_pruneffn_memory.yaml @@ -0,0 +1,21 @@ +defaults: + - /Llama-3_2-1B + - _self_ + +# Input Hugging Face model to compress +input_hf_model_path: ??? # e.g., "path/to/meta-llama/Llama-3.2-1B" + +# Dataset path for pruning and NAS scoring +dataset_path: ??? # e.g., "path/to/dataset" + +# Working directory for compression outputs +puzzle_dir: ??? # e.g., "path/to/puzzle_dir" + +# MIP memory constraint (in MiB) +mip: + human_constraints: + target_memory: 2_000 # 2 GiB + +# FFN intermediate sizes to search over (heterogeneous architecture) +pruning: + intermediate_size_list: [768, 1024, 1536, 2048, 2560, 4192] # Llama 3.2 1B baseline: 8192 From d19b9ab0823465ba2c48bd755c4c4736470a2dec Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Fri, 31 Oct 2025 18:51:19 +0100 Subject: [PATCH 49/81] fix input model path in the unit test. Signed-off-by: Daniel Korzekwa --- tests/experimental/torch/_compress/test_compress.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/experimental/torch/_compress/test_compress.py b/tests/experimental/torch/_compress/test_compress.py index bc8f153dd..02010f71b 100644 --- a/tests/experimental/torch/_compress/test_compress.py +++ b/tests/experimental/torch/_compress/test_compress.py @@ -98,7 +98,7 @@ def _test_compress_multiprocess_job(project_root_path: Path, tmp_path: Path, ran tokenizer = create_tokenizer(project_root_path) # TODO: change it to "ckpts/llama" once the conversion script is fixed # Currently, the build replacement library step will fail with such a path. 
- llama_checkpoint_path = puzzle_dir / "ckpts/teacher" + llama_checkpoint_path = puzzle_dir / "input_model/llama" create_and_save_small_llama_model( llama_checkpoint_path, vocab_size=tokenizer.vocab_size, tokenizer=tokenizer ) From 78d7a870ebb3f0c00f2bc1059562415ff2e4da75 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Fri, 31 Oct 2025 19:32:28 +0100 Subject: [PATCH 50/81] compress tutorial Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 12 +- .../Llama-3_2-8B.yaml | 108 ++++++++++++++++++ .../llama_3.2_1B_pruneffn_memory.yaml | 2 +- .../pruning/attn_pruning.yaml | 16 +++ .../pruning/ffn_pruning.yaml | 12 ++ .../pruning/hidden_dim_pruning.yaml | 15 +++ .../pruning/pruning_defaults.yaml | 32 ++++++ .../validate_model_defaults.yaml | 15 +++ .../validate_solutions_defaults.yaml | 10 ++ examples/compress/main.py | 19 ++- 10 files changed, 234 insertions(+), 7 deletions(-) create mode 100644 examples/compress/configs/llama_3.2_1B_pruneffn_memory/Llama-3_2-8B.yaml create mode 100644 examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/attn_pruning.yaml create mode 100644 examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/ffn_pruning.yaml create mode 100644 examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/hidden_dim_pruning.yaml create mode 100644 examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/pruning_defaults.yaml create mode 100644 examples/compress/configs/llama_3.2_1B_pruneffn_memory/validate_model_defaults.yaml create mode 100644 examples/compress/configs/llama_3.2_1B_pruneffn_memory/validate_solutions_defaults.yaml diff --git a/examples/compress/README.md b/examples/compress/README.md index b42db37fa..0304217c7 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -7,8 +7,8 @@ In this example, we compress Llama 3.2 1B by searching for the optimal `ffn_inte ## Compress the Model ```bash -torchrun --nproc_per_node=8 examples/compress/main.py \ - --config ./examples/compress/configs/llama_3.2_1B_pruneffn_memory.yaml +torchrun examples/compress/main.py \ + --config path/to/llama_3.2_1B_pruneffn_memory.yaml ``` ## Evaluate Model Accuracy @@ -22,8 +22,8 @@ torchrun --nproc_per_node=8 examples/compress/main.py \ If you want to try different memory constraints without re-running the expensive pruning and scoring steps, use the `--mip-only` flag: ```bash -torchrun --nproc_per_node=8 examples/compress/main.py \ - --config ./examples/compress/configs/llama_3.2_1B_pruneffn_memory.yaml \ +torchrun examples/compress/main.py \ + --config path/to/llama_3.2_1B_pruneffn_memory.yaml \ --mip-only ``` @@ -40,3 +40,7 @@ This assumes pruning, replacement library building, NAS scoring, and subblock st ```bash # TODO ``` + +## Advanced usage + +Modify `path/to/Llama-3_2-1B yaml` file for advanced compression scenarios. diff --git a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/Llama-3_2-8B.yaml b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/Llama-3_2-8B.yaml new file mode 100644 index 000000000..1d8fac655 --- /dev/null +++ b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/Llama-3_2-8B.yaml @@ -0,0 +1,108 @@ +defaults: + - pruning: ffn_pruning + - scoring: ../validate_solutions_defaults + - realize_model: ../validate_solutions_defaults + - bypass: + - override hydra/hydra_logging: disabled + - _self_ + +puzzle_dir: ??? +teacher_dir: ${puzzle_dir}/ckpts/teacher/ +replacement_library_path: ${puzzle_dir}/replacement_library.json +dataset_path: ??? 
# path to v0.4_mini + +skip_realize_model: false + +build_replacement_library: + add_ffn_no_ops: true + add_attention_no_ops: true + +calc_subblock_stats: + batch_sizes: [64, 96, 128] + prefill_seq_len: 4096 + generation_seq_len: 4096 + num_active_tokens_override: # Optional override for sequence lengths + prefill_queue_size: 0 + allocate_prefill_query: false + benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking + merge_with_existing_stats: false + subblock_stats_filename: "subblock_stats.json" + moe_stats_filename: "moe_stats.json" + +scoring: + solutions_to_validate: + skip_existing_solutions: true + + replacement_library_path: ${replacement_library_path} + solutions_path: ${to_path:${puzzle_dir}/single_sequence_replacement_solutions.json} + teacher_dir: ${to_path:${teacher_dir}} + output_dir: ${puzzle_dir}/single_sequence_replacement_solutions--validation + + eval_samples: 2 + micro_batch_size: 1 + dataset_path: ${dataset_path}/valid + seed: 42 + shuffle_seed: 444 + +mip: + single_block_replacement_validation_dir: ${to_path:${scoring.output_dir}} + subblock_stats_path: ${to_path:${puzzle_dir}/${calc_subblock_stats.subblock_stats_filename}} + output_path: ${to_path:${puzzle_dir}/mip/puzzle_solutions} + gathered_metrics_path: + puzzle_profile: + + # puzzle_profile: + objective: metrics.cosine_embedding_loss_hidden_states + bigger_is_better: false + num_solutions: 1 + minimal_diversity: 2 + + subblock_stats_args: + - batch_size: 96 + weights_dtype: torch.bfloat16 + activations_dtype: torch.bfloat16 + kv_cache_dtype: torch.bfloat16 + + report_additional_costs: + - stats.memory_mib + - stats.num_params + - stats.num_kv_heads + - stats.has_attention + - stats.has_ffn + - stats.kv_cache_memory_mib + - stats.attention_memory_mib + - stats.ffn_memory_mib + - stats.ffn_num_params + - stats.attention_num_params + + human_constraints: + target_memory: 780_000 # 78_000 + + mip_constraints: + use_greedy_search: false + is_multi_layer_puzzle: true + metric_overrides: + constrain_search_func: + max_seconds_per_solution: 60 + +realize_model: + teacher_dir: ${to_path:${teacher_dir}} + tokenizer_name: ${to_path:${teacher_dir}} + replacement_library_path: ${replacement_library_path} + save_models: true + solutions_path: # Filled dynamically + + # Validate params + skip_validation: false # To enable validation of the model solution set `skip_validation` as False + eval_samples: 2 + micro_batch_size: 1 + dataset_path: ${dataset_path}/valid + seed: 42 + shuffle_seed: 444 + +nccl_timeout_minutes: ${timedelta_minutes:10} + +# This section redirects Hydra outputs +hydra: + run: + dir: ${puzzle_dir}/hydra_logs/${now:%Y-%m-%d}/${now:%H-%M-%S} diff --git a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/llama_3.2_1B_pruneffn_memory.yaml b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/llama_3.2_1B_pruneffn_memory.yaml index bec15d2f7..f7962f0aa 100644 --- a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/llama_3.2_1B_pruneffn_memory.yaml +++ b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/llama_3.2_1B_pruneffn_memory.yaml @@ -1,5 +1,5 @@ defaults: - - /Llama-3_2-1B + - ./Llama-3_2-1B - _self_ # Input Hugging Face model to compress diff --git a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/attn_pruning.yaml b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/attn_pruning.yaml new file mode 100644 index 000000000..01886607e --- /dev/null +++ b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/attn_pruning.yaml @@ 
-0,0 +1,16 @@ +defaults: + - pruning_defaults + +activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/attn_${pruning.activation_hooks_kwargs.method}/${pruning.experiment_id} + +activation_hooks_kwargs: + method: independent_kv_head_contribution + optimize_for: memory # IndependentKvHeadContributionHook implementation that consumes less memory + target_layer: "self_attn.o_proj" + layer_input_descriptors_path: + +# n_heads_in_group: 4 +# num_attention_heads: 32 # num query heads +# num_kv_heads: 32 / 4 = 8 # num_query_heads // n_heads_in_group +n_heads_in_group_list: [8, 16, 32] # num_kv_heads = [4, 2, 1] +gqa_init_mode: "PruneKVHeads" diff --git a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/ffn_pruning.yaml b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/ffn_pruning.yaml new file mode 100644 index 000000000..f0c852eec --- /dev/null +++ b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/ffn_pruning.yaml @@ -0,0 +1,12 @@ +defaults: + - pruning_defaults + +activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/ffn_${pruning.activation_hooks_kwargs.method}/${pruning.experiment_id} + +activation_hooks_kwargs: + method: iterative + target_layer: "mlp.down_proj" + layer_input_descriptors_path: + +intermediate_size_list: [256] # teacher_intermediate_size is 14336 +mlp_init_mode: "PruneByActivationsLog" diff --git a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/hidden_dim_pruning.yaml b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/hidden_dim_pruning.yaml new file mode 100644 index 000000000..407c835d8 --- /dev/null +++ b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/hidden_dim_pruning.yaml @@ -0,0 +1,15 @@ +defaults: + - pruning_defaults + +activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/hidden_dim_${pruning.activation_hooks_kwargs.method}/${pruning.experiment_id} + +activation_hooks_kwargs: + method: layer_norm_contribution + target_layer: "layernorm" + +# Hidden dimension pruning specific settings +hidden_size_list: [3072, 2048] # Target hidden sizes to prune to +hidden_size_init_mode: "PruneByChannelRanking" +mlp_init_mode: "Truncate" # TODO, make it work with CopyAsIs/FromTeacher +gqa_init_mode: "AverageKV" # TODO, make it work with CopyAsIs/FromTeacher +linear_init_mode: "FromTeacher" diff --git a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/pruning_defaults.yaml b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/pruning_defaults.yaml new file mode 100644 index 000000000..0a5eafcff --- /dev/null +++ b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/pruning_defaults.yaml @@ -0,0 +1,32 @@ +defaults: + - /validate_model_defaults + +model_name_or_path: ${teacher_dir} +experiment_id: ${pruning.eval_samples}samples_diverse_mini +activations_log_dir: ??? +activation_hooks_kwargs: ??? 
+ +# Data: +eval_samples: 100 +micro_batch_size: 4 +dataset_path: ${dataset_path} +val_dataset_name: train + +# Prune ckpts +pruned_ckpts_outpt_dir: ${puzzle_dir}/pruning/${pruning.experiment_id} + +## FFN pruning +ffn_list: +mlp_init_mode: "Truncate" + +## KV-heads pruning +n_heads_in_group_list: +gqa_init_mode: "AverageKV" + +## Hidden dimension pruning +hidden_size_list: +hidden_size_init_mode: "PruneByChannelRanking" +linear_init_mode: "FromTeacher" + +mlp_init_config_yaml: + activations_log_dir: ${pruning.activations_log_dir} diff --git a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/validate_model_defaults.yaml b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/validate_model_defaults.yaml new file mode 100644 index 000000000..046ff51f6 --- /dev/null +++ b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/validate_model_defaults.yaml @@ -0,0 +1,15 @@ +block_size: 8192 +bos_rate: 0.5 +data_column: conversation +val_dataset_name: train +shuffle_seed: 81436 +seed: 42 +fim_rate: 0 +fim_spm_rate: 0 +source_datasets_to_discard: +varlen: false +write_results: false +calc_losses_on_cpu: false +activations_log_dir: +model_name_or_path: +load_dataset_fn: ${get_object:utils.data.dataloaders.load_from_disk_fn} diff --git a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/validate_solutions_defaults.yaml b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/validate_solutions_defaults.yaml new file mode 100644 index 000000000..ec1390237 --- /dev/null +++ b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/validate_solutions_defaults.yaml @@ -0,0 +1,10 @@ +defaults: + - /validate_model_defaults + - _self_ + +solutions_to_validate: +skip_validation: false +save_models: false +bigger_is_better: false +sort_solutions_by: +calculate_full_score_ablations: false diff --git a/examples/compress/main.py b/examples/compress/main.py index e0fa74db0..e7e443f0f 100644 --- a/examples/compress/main.py +++ b/examples/compress/main.py @@ -31,6 +31,10 @@ import argparse from pathlib import Path +from puzzle_tools.hydra_utils import register_hydra_resolvers + +from tests.utils.test_utils import initialize_hydra_config_for_dir + def parse_args(): """Parse command line arguments.""" @@ -58,9 +62,20 @@ def run_full_compress(hydra_config_path: str): Args: config_path: Path to the YAML configuration file """ + + # Register Hydra custom resolvers (needed for config resolution) + register_hydra_resolvers() + hydra_config_path = Path(hydra_config_path).resolve() - # config_dir = str(hydra_config_path.parent) - # config_name = hydra_config_path.stem + hydra_config_dir = str(hydra_config_path.parent) + hydra_config_name = hydra_config_path.stem + + # Load hydra config + initialize_hydra_config_for_dir( + config_dir=hydra_config_dir, + config_name=hydra_config_name, + overrides=[], + ) def run_mip_only(hydra_config_path: str): From f71c1b68ea8ca3d268a3c1a21c1e7a107cc480c5 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Fri, 31 Oct 2025 19:43:25 +0100 Subject: [PATCH 51/81] Code refactoring Signed-off-by: Daniel Korzekwa --- .../torch/_compress/nas/plugins/test_nas_convert.py | 6 ++---- .../torch/_compress/nas/plugins/test_nas_search.py | 6 ++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py b/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py index db20686e9..fd0bb2cf1 100644 --- a/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py +++ 
b/tests/experimental/torch/_compress/nas/plugins/test_nas_convert.py @@ -48,11 +48,9 @@ def test_nas_convert(project_root_path: Path, tmp_path: Path): def _test_nas_convert_multiprocess_job( project_root_path: Path, tmp_path: Path, rank: int, size: int ): - runtime = NativeDdpRuntime( + with NativeDdpRuntime( dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10) - ) - - with runtime as runtime: + ) as runtime: converted_model, puzzle_dir = run_nas_convert(project_root_path, tmp_path, rank, runtime) # diff --git a/tests/experimental/torch/_compress/nas/plugins/test_nas_search.py b/tests/experimental/torch/_compress/nas/plugins/test_nas_search.py index c21f3fa1b..e6309002a 100644 --- a/tests/experimental/torch/_compress/nas/plugins/test_nas_search.py +++ b/tests/experimental/torch/_compress/nas/plugins/test_nas_search.py @@ -40,11 +40,9 @@ def test_nas_search(project_root_path: Path, tmp_path: Path): def _test_nas_search_multiprocess_job( project_root_path: Path, tmp_path: Path, rank: int, size: int ): - runtime = NativeDdpRuntime( + with NativeDdpRuntime( dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10) - ) - - with runtime as runtime: + ) as runtime: converted_model, puzzle_dir = run_nas_convert(project_root_path, tmp_path, rank, runtime) # From 7eb2fd7d7b3fe1d36e8011f6b3ce986b153f369f Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Fri, 31 Oct 2025 19:47:58 +0100 Subject: [PATCH 52/81] refactoring Signed-off-by: Daniel Korzekwa --- .../nas/plugins/compress_nas_plugin.py | 4 +- .../torch/_compress/test_utils.py | 119 ------------------ 2 files changed, 1 insertion(+), 122 deletions(-) delete mode 100644 tests/experimental/torch/_compress/test_utils.py diff --git a/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py b/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py index 5217c74d7..3b881c2e2 100644 --- a/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py +++ b/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py @@ -31,6 +31,7 @@ from modelopt.torch._compress.decilm.converters.convert_llama3_to_decilm import ( convert_llama3_to_decilm, ) +from modelopt.torch._compress.hydra import initialize_hydra_config_for_dir from modelopt.torch._compress.runtime import NativeDdpRuntime from modelopt.torch.nas.conversion import NASModeRegistry from modelopt.torch.opt.config import ModeloptBaseConfig, ModeloptField @@ -43,9 +44,6 @@ ) from modelopt.torch.opt.searcher import BaseSearcher, SearchStateDict -# TODO Move initialize_hydra_config_for_dir from tests to main -from tests.utils.test_utils import initialize_hydra_config_for_dir - class CompressModel(nn.Module): pass # No model implementation is needed for the compress mode diff --git a/tests/experimental/torch/_compress/test_utils.py b/tests/experimental/torch/_compress/test_utils.py deleted file mode 100644 index 21ca622da..000000000 --- a/tests/experimental/torch/_compress/test_utils.py +++ /dev/null @@ -1,119 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import shutil -from pathlib import Path - -import torch -from datasets import Dataset, DatasetDict -from transformers import AutoTokenizer, LlamaConfig, LlamaForCausalLM, PreTrainedTokenizerBase - - -def create_and_save_small_llama_model( - output_path: str, vocab_size: int, tokenizer: PreTrainedTokenizerBase -): - """ - Create and save a small Llama model for testing the conversion pipeline. - This mimics having a real Llama checkpoint that needs to be converted. - """ - os.makedirs(output_path, exist_ok=True) - - # Create a minimal Llama config (small for testing) - # Note: intermediate_size must be divisible by 256 per DeciLM config requirements - # Note: hidden_size must give head_dim >= 8 for Flash Attention 2 compatibility - llama_config = LlamaConfig( - vocab_size=vocab_size, - hidden_size=256, # 32 heads times 8 head_dim = 256 (matches bypass config expectations) - intermediate_size=512, # Must be divisible by 256 - num_hidden_layers=2, - num_attention_heads=32, # Matches original test - num_key_value_heads=8, # GQA: 32÷4=8 (matches original n_heads_in_group=4) - max_position_embeddings=512, - rms_norm_eps=1e-5, - rope_theta=10000.0, - attention_bias=False, - hidden_act="silu", - tie_word_embeddings=False, - ) - - # Create and save the Llama model - model = LlamaForCausalLM(llama_config) - model.to(dtype=torch.bfloat16).save_pretrained(output_path) - - # Save tokenizer - tokenizer.save_pretrained(output_path) - - # Save config - llama_config.save_pretrained(output_path) - - -def create_tokenizer(project_root_path: Path) -> PreTrainedTokenizerBase: - """ - Create a tokenizer for the Llama model. - """ - tokenizer_path = project_root_path / "tests/experimental/torch/_compress/resources/tokenizer" - tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) - return tokenizer - - -def setup_puzzle_dir(puzzle_dir: str): - """ - Setup puzzle directory by removing existing directory and creating a new one. - """ - if Path(puzzle_dir).exists(): - shutil.rmtree(puzzle_dir) - Path(puzzle_dir).mkdir(parents=True, exist_ok=True) - - -def save_dummy_dataset(dataset_path: str): - """ - Save a dummy dataset for testing purposes. - """ - # dummy sample - sample = [ - {"role": "user", "content": "please cite Lorem Ipsum?"}, - { - "role": "assistant", - "content": ( - "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed in blandit ante. " - "Sed tempus erat urna, ac elementum nisl facilisis quis. Aliquam consectetur mollis massa, " - "in elementum sem venenatis posuere. Fusce lorem arcu, egestas vel massa sollicitudin, " - "dictum mollis purus. Proin in ullamcorper elit. Nam tellus nisi, volutpat a mattis vel, " - "pretium in purus. Nunc at lectus facilisis risus scelerisque rhoncus eu nec ex. " - "Maecenas semper, tellus non placerat vulputate, urna felis facilisis diam, " - "sit amet vestibulum erat sapien nec libero. Praesent non massa velit. Donec faucibus mi eros. " - "Nam turpis nulla, congue sit amet mi at, porttitor scelerisque elit. Nunc id sodales lorem, " - "nec tincidunt leo. Quisque a neque nec ligula porttitor auctor. 
" - "Nunc accumsan nunc ac tellus congue vehicula. Praesent tellus eros, luctus non gravida dapibus, " - "faucibus eu ex. Quisque bibendum leo pharetra, tristique est vitae, hendrerit nunc. " - "Duis nec congue dolor. Donec commodo ipsum non efficitur volutpat. " - "Nulla risus nulla, efficitur et urna at, imperdiet sodales lorem. " - "Suspendisse erat est, sollicitudin at nisl tincidunt, vehicula hendrerit lectus. " - "Nam quis nisi ullamcorper, rhoncus massa vel, tempus purus. " - "Duis pulvinar eros vel nulla pellentesque, at dapibus justo laoreet. " - "Praesent tortor orci, vulputate fermentum dapibus nec, feugiat vitae tortor. " - "Donec mollis convallis massa quis iaculis." - ), - }, - ] - - # Prepare train and val splits with sample repeated, 2500 samples are for - # 128 samples with block-size 8192 and LLama3 tokenizer - data = [{"conversation": sample}] * 2500 - - # For train-val splits - data_dict = DatasetDict({"train": Dataset.from_list(data), "valid": Dataset.from_list(data)}) - data_dict.save_to_disk(dataset_path) From 3eb39f99dd75deb8a93a77b8479542734217f999 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Fri, 31 Oct 2025 19:51:57 +0100 Subject: [PATCH 53/81] code clean up Signed-off-by: Daniel Korzekwa --- .../experimental/torch/_compress/nas/plugins/test_nas_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/experimental/torch/_compress/nas/plugins/test_nas_search.py b/tests/experimental/torch/_compress/nas/plugins/test_nas_search.py index e6309002a..21526f5ec 100644 --- a/tests/experimental/torch/_compress/nas/plugins/test_nas_search.py +++ b/tests/experimental/torch/_compress/nas/plugins/test_nas_search.py @@ -56,7 +56,7 @@ def _test_nas_search_multiprocess_job( ) # - # Check assertions for mnt.search() step + # Check assertions for mtn.search() step # if rank == 0: # assertions for the build_library_and_stats step From 8360de94f76f753941e27fea07373d8d8f3d6557 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Fri, 31 Oct 2025 22:33:31 +0100 Subject: [PATCH 54/81] Implement compress cli tool. Signed-off-by: Daniel Korzekwa --- .../{Llama-3_2-8B.yaml => Llama-3_2-1B.yaml} | 0 .../llama_3.2_1B_pruneffn_memory.yaml | 10 ++-- examples/compress/main.py | 54 ++++++++++++++----- 3 files changed, 45 insertions(+), 19 deletions(-) rename examples/compress/configs/llama_3.2_1B_pruneffn_memory/{Llama-3_2-8B.yaml => Llama-3_2-1B.yaml} (100%) diff --git a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/Llama-3_2-8B.yaml b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/Llama-3_2-1B.yaml similarity index 100% rename from examples/compress/configs/llama_3.2_1B_pruneffn_memory/Llama-3_2-8B.yaml rename to examples/compress/configs/llama_3.2_1B_pruneffn_memory/Llama-3_2-1B.yaml diff --git a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/llama_3.2_1B_pruneffn_memory.yaml b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/llama_3.2_1B_pruneffn_memory.yaml index f7962f0aa..f3fa86953 100644 --- a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/llama_3.2_1B_pruneffn_memory.yaml +++ b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/llama_3.2_1B_pruneffn_memory.yaml @@ -1,15 +1,15 @@ defaults: - - ./Llama-3_2-1B + - Llama-3_2-1B - _self_ # Input Hugging Face model to compress -input_hf_model_path: ??? # e.g., "path/to/meta-llama/Llama-3.2-1B" +input_hf_model_path: /workspace/hf_models/meta-llama/Llama-3.2-1B # Dataset path for pruning and NAS scoring -dataset_path: ??? 
# e.g., "path/to/dataset" +dataset_path: /workspace/datasets/Nemotron-Post-Training-Dataset-v2 # Working directory for compression outputs -puzzle_dir: ??? # e.g., "path/to/puzzle_dir" +puzzle_dir: /workspace/puzzle_dir # MIP memory constraint (in MiB) mip: @@ -18,4 +18,4 @@ mip: # FFN intermediate sizes to search over (heterogeneous architecture) pruning: - intermediate_size_list: [768, 1024, 1536, 2048, 2560, 4192] # Llama 3.2 1B baseline: 8192 + intermediate_size_list: [256] # Llama 3.2 1B baseline: 8192 diff --git a/examples/compress/main.py b/examples/compress/main.py index e7e443f0f..9fc525b03 100644 --- a/examples/compress/main.py +++ b/examples/compress/main.py @@ -29,10 +29,15 @@ """ import argparse +import datetime from pathlib import Path +import torch from puzzle_tools.hydra_utils import register_hydra_resolvers +import modelopt.torch.nas as mtn +from modelopt.torch._compress.nas.plugins.compress_nas_plugin import CompressModel +from modelopt.torch._compress.runtime import NativeDdpRuntime from tests.utils.test_utils import initialize_hydra_config_for_dir @@ -63,19 +68,39 @@ def run_full_compress(hydra_config_path: str): config_path: Path to the YAML configuration file """ - # Register Hydra custom resolvers (needed for config resolution) - register_hydra_resolvers() - - hydra_config_path = Path(hydra_config_path).resolve() - hydra_config_dir = str(hydra_config_path.parent) - hydra_config_name = hydra_config_path.stem - - # Load hydra config - initialize_hydra_config_for_dir( - config_dir=hydra_config_dir, - config_name=hydra_config_name, - overrides=[], - ) + with NativeDdpRuntime(dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10)): + # Register Hydra custom resolvers (needed for config resolution) + register_hydra_resolvers() + + hydra_config_path = Path(hydra_config_path).resolve() + hydra_config_dir = str(hydra_config_path.parent) + hydra_config_name = hydra_config_path.stem + + # Load hydra config + hydra_cfg = initialize_hydra_config_for_dir( + config_dir=hydra_config_dir, + config_name=hydra_config_name, + overrides=[], + ) + + input_model = CompressModel() + mtn.convert( + input_model, + mode=[ + ( + "compress", + { + "puzzle_dir": str(hydra_cfg.puzzle_dir), + "input_model_path": hydra_cfg.input_hf_model_path, + "hydra_config_dir": hydra_config_dir, + "hydra_config_name": hydra_config_name, + "dataset_path": str(hydra_cfg.dataset_path), + }, + ) + ], + ) + + print(f"\nCompression completed. Output in: {hydra_cfg.puzzle_dir}") def run_mip_only(hydra_config_path: str): @@ -87,7 +112,8 @@ def run_mip_only(hydra_config_path: str): Args: config_path: Path to the YAML configuration file """ - hydra_config_path = Path(hydra_config_path).resolve() + raise NotImplementedError("MIP-only mode is not implemented yet") + # hydra_config_path = Path(hydra_config_path).resolve() # config_dir = str(hydra_config_path.parent) # config_name = hydra_config_path.stem From 9230d81cd77d096819ffcb0aceeff03989551791 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Fri, 31 Oct 2025 22:38:49 +0100 Subject: [PATCH 55/81] Add running mtn.search() to compress cli tool. 
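
This wires the second phase of the ModelOpt NAS API into the CLI: the tool now
calls mtn.convert() (HF-to-DeciLM conversion, activation scoring, pruned
checkpoints) and then mtn.search() (replacement library, block scoring, MIP
search, model realization). A minimal sketch of the resulting call sequence,
with placeholder paths rather than real defaults (in main.py these calls run
inside a NativeDdpRuntime context after the hydra config is loaded):

    # Sketch only - placeholder paths, not real defaults.
    import modelopt.torch.nas as mtn
    from modelopt.torch._compress.nas.plugins.compress_nas_plugin import CompressModel

    # Phase 1: convert (HF -> DeciLM, activation scoring, pruned checkpoints).
    converted_model = mtn.convert(
        CompressModel(),
        mode=[
            (
                "compress",
                {
                    "puzzle_dir": "path/to/puzzle_dir",           # placeholder
                    "input_model_path": "path/to/hf_model",       # placeholder
                    "hydra_config_dir": "path/to/hydra_configs",  # placeholder
                    "hydra_config_name": "my_compress_config",    # placeholder
                    "dataset_path": "path/to/dataset",            # placeholder
                },
            )
        ],
    )

    # Phase 2: search (replacement library, block scoring, MIP, model realization).
    # constraints/dummy_input/config are unused here - the search space and the
    # memory constraint come from the hydra config.
    mtn.search(converted_model, constraints={}, dummy_input=None, config={})

The search space and target memory stay in the hydra YAML, so the Python call
sites do not need to change between experiments.
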
Signed-off-by: Daniel Korzekwa --- examples/compress/main.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/examples/compress/main.py b/examples/compress/main.py index 9fc525b03..396f65a70 100644 --- a/examples/compress/main.py +++ b/examples/compress/main.py @@ -83,8 +83,10 @@ def run_full_compress(hydra_config_path: str): overrides=[], ) + # Convert model (convert from HF to DeciLM, score pruning activations, + # prune the model and save pruned checkpoints) input_model = CompressModel() - mtn.convert( + converted_model = mtn.convert( input_model, mode=[ ( @@ -100,6 +102,15 @@ def run_full_compress(hydra_config_path: str): ], ) + # Run NAS search (build replacement library and compute stats, + # compute one block scores, run MIP and realize models) + mtn.search( + converted_model, + constraints={}, # this is not used as the search space is defined in the hydra config + dummy_input=None, # Not used + config={}, # this is not used as the search space is defined in the hydra config + ) + print(f"\nCompression completed. Output in: {hydra_cfg.puzzle_dir}") From 28b5c13d5beb5b1516651b663cbf386b4f0d8194 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Fri, 31 Oct 2025 22:40:25 +0100 Subject: [PATCH 56/81] update docs Signed-off-by: Daniel Korzekwa --- examples/compress/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/compress/main.py b/examples/compress/main.py index 396f65a70..155490e13 100644 --- a/examples/compress/main.py +++ b/examples/compress/main.py @@ -121,7 +121,7 @@ def run_mip_only(hydra_config_path: str): have already been completed. Args: - config_path: Path to the YAML configuration file + hydra_config_path: Path to the YAML configuration file """ raise NotImplementedError("MIP-only mode is not implemented yet") # hydra_config_path = Path(hydra_config_path).resolve() From a7eba4bcc9df82c4c7ef39a106ec17aad44f6189 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sat, 1 Nov 2025 14:01:05 +0100 Subject: [PATCH 57/81] Replace dummy dataset with Nemotron-Post-Training-Dataset-v2 Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 17 +++++++++++++---- .../validate_model_defaults.yaml | 4 ++-- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/examples/compress/README.md b/examples/compress/README.md index 0304217c7..542423da7 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -6,10 +6,19 @@ In this example, we compress Llama 3.2 1B by searching for the optimal `ffn_inte ## Compress the Model -```bash -torchrun examples/compress/main.py \ - --config path/to/llama_3.2_1B_pruneffn_memory.yaml -``` +1. Specify the `puzzle_dir`, `input_hf_model_path`, `dataset_path`, `intermediate_size_list`, and `target_memory` arguments in the [llama_3.2_1B_pruneffn_memory.yaml](./configs/llama_3.2_1B_pruneffn_memory/llama_3.2_1B_pruneffn_memory.yaml) configuration file. + +2. Download and prepare the dataset: + + ```bash + python -m scripts.prepare_dataset --dataset_name nvidia/Nemotron-Post-Training-Dataset-v2 --output_dir path/to/Nemotron-Post-Training-Dataset-v2 + ``` + +3. 
Run the compression script: + + ```bash + torchrun examples/compress/main.py --config path/to/llama_3.2_1B_pruneffn_memory.yaml + ``` ## Evaluate Model Accuracy diff --git a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/validate_model_defaults.yaml b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/validate_model_defaults.yaml index 046ff51f6..572331a84 100644 --- a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/validate_model_defaults.yaml +++ b/examples/compress/configs/llama_3.2_1B_pruneffn_memory/validate_model_defaults.yaml @@ -1,7 +1,7 @@ block_size: 8192 bos_rate: 0.5 -data_column: conversation -val_dataset_name: train +data_column: messages +val_dataset_name: valid shuffle_seed: 81436 seed: 42 fim_rate: 0 From 21ed59bfb3ed9d3e7e551d8bdd2d09f5cf8934f9 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sat, 1 Nov 2025 14:13:01 +0100 Subject: [PATCH 58/81] Refactoring Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 2 +- modelopt/torch/_compress/__init__.py | 15 +++++ modelopt/torch/_compress/dataset/__init__.py | 15 +++++ .../_compress/dataset/prepare_dataset.py | 64 +++++++++++++++++++ 4 files changed, 95 insertions(+), 1 deletion(-) create mode 100644 modelopt/torch/_compress/__init__.py create mode 100644 modelopt/torch/_compress/dataset/__init__.py create mode 100644 modelopt/torch/_compress/dataset/prepare_dataset.py diff --git a/examples/compress/README.md b/examples/compress/README.md index 542423da7..23b862f67 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -11,7 +11,7 @@ In this example, we compress Llama 3.2 1B by searching for the optimal `ffn_inte 2. Download and prepare the dataset: ```bash - python -m scripts.prepare_dataset --dataset_name nvidia/Nemotron-Post-Training-Dataset-v2 --output_dir path/to/Nemotron-Post-Training-Dataset-v2 + python -m modelopt.torch._compress.dataset.prepare_dataset --dataset_name nvidia/Nemotron-Post-Training-Dataset-v2 --output_dir path/to/Nemotron-Post-Training-Dataset-v2 ``` 3. Run the compression script: diff --git a/modelopt/torch/_compress/__init__.py b/modelopt/torch/_compress/__init__.py new file mode 100644 index 000000000..47f1c65a1 --- /dev/null +++ b/modelopt/torch/_compress/__init__.py @@ -0,0 +1,15 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/modelopt/torch/_compress/dataset/__init__.py b/modelopt/torch/_compress/dataset/__init__.py new file mode 100644 index 000000000..47f1c65a1 --- /dev/null +++ b/modelopt/torch/_compress/dataset/__init__.py @@ -0,0 +1,15 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/modelopt/torch/_compress/dataset/prepare_dataset.py b/modelopt/torch/_compress/dataset/prepare_dataset.py new file mode 100644 index 000000000..49d63d122 --- /dev/null +++ b/modelopt/torch/_compress/dataset/prepare_dataset.py @@ -0,0 +1,64 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import datasets +import fire +import numpy as np +from logger import mprint + + +def process_and_save_dataset( + dataset_name: str, + output_dir: str, + split: tuple = ("code", "math", "stem", "chat"), + overwrite: bool = False, +): + # Check if output_dir contains an existing dataset + dataset_dict_path = os.path.join(output_dir, "dataset_dict.json") + if os.path.exists(output_dir) and os.path.exists(dataset_dict_path): + if not overwrite: + mprint( + f"Output directory '{output_dir}' already contains a dataset. " + "Use '--overwrite True' to overwrite existing data." + ) + return + + ds = datasets.load_dataset(dataset_name, split=split) + ds = datasets.concatenate_datasets(ds) + # Filter out samples with reasoning = on + ds = ds.filter(lambda x: x["reasoning"] == "off") + # Hardcoded for dynamically create a deterministic train-val split + seed = 408 + generator = np.random.RandomState(seed=seed) + ds_split = ds.train_test_split(test_size=0.05, shuffle=True, generator=generator) + # Rename dataset names to follow previous conventions + ds_dict = datasets.DatasetDict( + { + "train": ds_split["train"], + "valid": ds_split["test"], + } + ) + # Save locally + os.makedirs(output_dir, exist_ok=True) + ds_dict.save_to_disk(output_dir) + + mprint(f"Dataset splits:\n{ds_dict}") + mprint(f"Saved processed datasets to {output_dir}") + + +if __name__ == "__main__": + fire.Fire(process_and_save_dataset) From e3ed0a44802379c859cde61737b95d3c95c2e4c4 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sat, 1 Nov 2025 14:14:58 +0100 Subject: [PATCH 59/81] Update docs Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/compress/README.md b/examples/compress/README.md index 23b862f67..13634db32 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -8,7 +8,7 @@ In this example, we compress Llama 3.2 1B by searching for the optimal `ffn_inte 1. 
Specify the `puzzle_dir`, `input_hf_model_path`, `dataset_path`, `intermediate_size_list`, and `target_memory` arguments in the [llama_3.2_1B_pruneffn_memory.yaml](./configs/llama_3.2_1B_pruneffn_memory/llama_3.2_1B_pruneffn_memory.yaml) configuration file. -2. Download and prepare the dataset: +2. Download and prepare the dataset (2.62GB): ```bash python -m modelopt.torch._compress.dataset.prepare_dataset --dataset_name nvidia/Nemotron-Post-Training-Dataset-v2 --output_dir path/to/Nemotron-Post-Training-Dataset-v2 From 9e09e8f1bc02763b125a1799a971f1b61076f89b Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sat, 1 Nov 2025 16:01:00 +0100 Subject: [PATCH 60/81] Refactoring. Change the compress tutorial from Llama 3.2 1B-instruct to Llma 3.1 8B-instruct Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 20 ++++++++++--------- .../Llama-3_1-8B.yaml} | 0 .../llama-3_1-8B_pruneffn_memory.yaml} | 4 ++-- .../pruning/attn_pruning.yaml | 0 .../pruning/ffn_pruning.yaml | 0 .../pruning/hidden_dim_pruning.yaml | 0 .../pruning/pruning_defaults.yaml | 0 .../validate_model_defaults.yaml | 0 .../validate_solutions_defaults.yaml | 0 9 files changed, 13 insertions(+), 11 deletions(-) rename examples/compress/configs/{llama_3.2_1B_pruneffn_memory/Llama-3_2-1B.yaml => llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml} (100%) rename examples/compress/configs/{llama_3.2_1B_pruneffn_memory/llama_3.2_1B_pruneffn_memory.yaml => llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml} (84%) rename examples/compress/configs/{llama_3.2_1B_pruneffn_memory => llama-3_1-8B_pruneffn_memory}/pruning/attn_pruning.yaml (100%) rename examples/compress/configs/{llama_3.2_1B_pruneffn_memory => llama-3_1-8B_pruneffn_memory}/pruning/ffn_pruning.yaml (100%) rename examples/compress/configs/{llama_3.2_1B_pruneffn_memory => llama-3_1-8B_pruneffn_memory}/pruning/hidden_dim_pruning.yaml (100%) rename examples/compress/configs/{llama_3.2_1B_pruneffn_memory => llama-3_1-8B_pruneffn_memory}/pruning/pruning_defaults.yaml (100%) rename examples/compress/configs/{llama_3.2_1B_pruneffn_memory => llama-3_1-8B_pruneffn_memory}/validate_model_defaults.yaml (100%) rename examples/compress/configs/{llama_3.2_1B_pruneffn_memory => llama-3_1-8B_pruneffn_memory}/validate_solutions_defaults.yaml (100%) diff --git a/examples/compress/README.md b/examples/compress/README.md index 13634db32..dcee6f796 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -2,37 +2,39 @@ This tutorial demonstrates how to compress large language models using the compress algorithm based on the [Puzzle paper](https://arxiv.org/abs/2411.19146). -In this example, we compress Llama 3.2 1B by searching for the optimal `ffn_intermediate_size` across MLP layers, resulting in a heterogeneous architecture while reducing GPU memory usage by 20%. +In this example, we compress [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model by searching for the optimal `ffn_intermediate_size` across MLP layers, resulting in a heterogeneous architecture while reducing GPU memory usage by 20%. ## Compress the Model -1. Specify the `puzzle_dir`, `input_hf_model_path`, `dataset_path`, `intermediate_size_list`, and `target_memory` arguments in the [llama_3.2_1B_pruneffn_memory.yaml](./configs/llama_3.2_1B_pruneffn_memory/llama_3.2_1B_pruneffn_memory.yaml) configuration file. +1. 
Specify the `puzzle_dir`, `input_hf_model_path`, `dataset_path`, `intermediate_size_list`, and `target_memory` arguments in the [llama-3_1-8B_pruneffn_memory.yaml](./configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml) configuration file. -2. Download and prepare the dataset (2.62GB): +2. Download and prepare the [Nemotron-Post-Training-Dataset-v2](https://huggingface.co/datasets/nvidia/Nemotron-Post-Training-Dataset-v2). + + dataset split: "code", "math", "stem", "chat", excluding reasoning samples (2.62GB) ```bash python -m modelopt.torch._compress.dataset.prepare_dataset --dataset_name nvidia/Nemotron-Post-Training-Dataset-v2 --output_dir path/to/Nemotron-Post-Training-Dataset-v2 ``` -3. Run the compression script: +3. Run the compression script. ```bash - torchrun examples/compress/main.py --config path/to/llama_3.2_1B_pruneffn_memory.yaml + torchrun examples/compress/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml ``` -## Evaluate Model Accuracy +## Evaluate model accuracy ```bash # TODO ``` -## Re-run MIP Search with Different Memory Constraints +## Re-run MIP Search with different memory constraints If you want to try different memory constraints without re-running the expensive pruning and scoring steps, use the `--mip-only` flag: ```bash torchrun examples/compress/main.py \ - --config path/to/llama_3.2_1B_pruneffn_memory.yaml \ + --config path/to/llama-3_1_8B_pruneffn_memory.yaml \ --mip-only ``` @@ -52,4 +54,4 @@ This assumes pruning, replacement library building, NAS scoring, and subblock st ## Advanced usage -Modify `path/to/Llama-3_2-1B yaml` file for advanced compression scenarios. +Modify `path/to/Llama-3_1-8B yaml` file for advanced compression scenarios. diff --git a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/Llama-3_2-1B.yaml b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml similarity index 100% rename from examples/compress/configs/llama_3.2_1B_pruneffn_memory/Llama-3_2-1B.yaml rename to examples/compress/configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml diff --git a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/llama_3.2_1B_pruneffn_memory.yaml b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml similarity index 84% rename from examples/compress/configs/llama_3.2_1B_pruneffn_memory/llama_3.2_1B_pruneffn_memory.yaml rename to examples/compress/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml index f3fa86953..74af0cad6 100644 --- a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/llama_3.2_1B_pruneffn_memory.yaml +++ b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml @@ -1,9 +1,9 @@ defaults: - - Llama-3_2-1B + - Llama-3_1-8B - _self_ # Input Hugging Face model to compress -input_hf_model_path: /workspace/hf_models/meta-llama/Llama-3.2-1B +input_hf_model_path: /workspace/hf_models/meta-llama/Llama-3.1-8B-Instruct # Dataset path for pruning and NAS scoring dataset_path: /workspace/datasets/Nemotron-Post-Training-Dataset-v2 diff --git a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/attn_pruning.yaml b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/pruning/attn_pruning.yaml similarity index 100% rename from examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/attn_pruning.yaml rename to examples/compress/configs/llama-3_1-8B_pruneffn_memory/pruning/attn_pruning.yaml diff --git 
a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/ffn_pruning.yaml b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/pruning/ffn_pruning.yaml similarity index 100% rename from examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/ffn_pruning.yaml rename to examples/compress/configs/llama-3_1-8B_pruneffn_memory/pruning/ffn_pruning.yaml diff --git a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/hidden_dim_pruning.yaml b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/pruning/hidden_dim_pruning.yaml similarity index 100% rename from examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/hidden_dim_pruning.yaml rename to examples/compress/configs/llama-3_1-8B_pruneffn_memory/pruning/hidden_dim_pruning.yaml diff --git a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/pruning_defaults.yaml b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/pruning/pruning_defaults.yaml similarity index 100% rename from examples/compress/configs/llama_3.2_1B_pruneffn_memory/pruning/pruning_defaults.yaml rename to examples/compress/configs/llama-3_1-8B_pruneffn_memory/pruning/pruning_defaults.yaml diff --git a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/validate_model_defaults.yaml b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/validate_model_defaults.yaml similarity index 100% rename from examples/compress/configs/llama_3.2_1B_pruneffn_memory/validate_model_defaults.yaml rename to examples/compress/configs/llama-3_1-8B_pruneffn_memory/validate_model_defaults.yaml diff --git a/examples/compress/configs/llama_3.2_1B_pruneffn_memory/validate_solutions_defaults.yaml b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/validate_solutions_defaults.yaml similarity index 100% rename from examples/compress/configs/llama_3.2_1B_pruneffn_memory/validate_solutions_defaults.yaml rename to examples/compress/configs/llama-3_1-8B_pruneffn_memory/validate_solutions_defaults.yaml From abb39f3f325939ea2757c7ddd779fa783626ee00 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sat, 1 Nov 2025 20:41:36 +0100 Subject: [PATCH 61/81] Improve logging Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 15 ++++++- examples/compress/main.py | 4 +- modelopt/torch/_compress/dateutils.py | 41 +++++++++++++++++++ .../nas/plugins/compress_nas_plugin.py | 11 +++++ 4 files changed, 69 insertions(+), 2 deletions(-) create mode 100644 modelopt/torch/_compress/dateutils.py diff --git a/examples/compress/README.md b/examples/compress/README.md index dcee6f796..7a4535441 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -19,7 +19,20 @@ In this example, we compress [meta-llama/Llama-3.1-8B-Instruct](https://huggingf 3. Run the compression script. 
```bash - torchrun examples/compress/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml + torchrun examples/compress/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml 2>&1 | tee ./log.txt |grep "Compress Progress" + ``` + + screen output: + + ```bash + [2025-11-01 19:26:38] Compress Progress 1/8: starting compression pipeline + [2025-11-01 19:26:38] Compress Progress 2/8: converting model from HF to DeciLM + [2025-11-01 19:26:39] Compress Progress 3/8: scoring pruning activations + [2025-11-01 19:26:46] Compress Progress 4/8: pruning the model and saving pruned checkpoints + [2025-11-01 19:26:46] Compress Progress 5/8: building replacement library and calculating subblock statistics + [2025-11-01 19:26:46] Compress Progress 6/8: calculating one block scores + [2025-11-01 19:26:52] Compress Progress 7/8: running MIP and realizing models + [2025-11-01 19:26:59] Compress Progress 8/8: compression pipeline completed ``` ## Evaluate model accuracy diff --git a/examples/compress/main.py b/examples/compress/main.py index 155490e13..95cda0d9a 100644 --- a/examples/compress/main.py +++ b/examples/compress/main.py @@ -36,6 +36,7 @@ from puzzle_tools.hydra_utils import register_hydra_resolvers import modelopt.torch.nas as mtn +from modelopt.torch._compress.dateutils import timestamped from modelopt.torch._compress.nas.plugins.compress_nas_plugin import CompressModel from modelopt.torch._compress.runtime import NativeDdpRuntime from tests.utils.test_utils import initialize_hydra_config_for_dir @@ -68,6 +69,7 @@ def run_full_compress(hydra_config_path: str): config_path: Path to the YAML configuration file """ + print(timestamped("Compress Progress 1/8: starting compression pipeline")) with NativeDdpRuntime(dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10)): # Register Hydra custom resolvers (needed for config resolution) register_hydra_resolvers() @@ -111,7 +113,7 @@ def run_full_compress(hydra_config_path: str): config={}, # this is not used as the search space is defined in the hydra config ) - print(f"\nCompression completed. Output in: {hydra_cfg.puzzle_dir}") + print(timestamped("Compress Progress 8/8: compression pipeline completed")) def run_mip_only(hydra_config_path: str): diff --git a/modelopt/torch/_compress/dateutils.py b/modelopt/torch/_compress/dateutils.py new file mode 100644 index 000000000..76a8aec2a --- /dev/null +++ b/modelopt/torch/_compress/dateutils.py @@ -0,0 +1,41 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Date and time utility functions for the compress module. +""" + +import datetime + + +def get_timestamp() -> str: + """Get a formatted timestamp string for logging. + + Returns: + A formatted timestamp string in the format 'YYYY-MM-DD HH:MM:SS'. 
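+        For example: '2025-11-01 19:26:38'.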
+ """ + return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + +def timestamped(message: str) -> str: + """Add a timestamp prefix to a message. + + Args: + message: The message to prefix with a timestamp. + + Returns: + The message with a timestamp prefix in the format '[YYYY-MM-DD HH:MM:SS] message'. + """ + return f"[{get_timestamp()}] {message}" diff --git a/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py b/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py index 3b881c2e2..aa06c217b 100644 --- a/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py +++ b/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py @@ -28,6 +28,7 @@ import torch from torch import nn +from modelopt.torch._compress.dateutils import timestamped from modelopt.torch._compress.decilm.converters.convert_llama3_to_decilm import ( convert_llama3_to_decilm, ) @@ -116,6 +117,7 @@ def convert_compress_model(model: nn.Module, config: CompressConfig) -> ConvertR ) # Convert Llama3 model to DeciLM model + print(timestamped("Compress Progress 2/8: converting model from HF to DeciLM")) hf_ckpt_teacher_dir = "ckpts/teacher" # TODO: make it configurable convert_llama3_to_decilm( input_dir=config.input_model_path, @@ -123,9 +125,11 @@ def convert_compress_model(model: nn.Module, config: CompressConfig) -> ConvertR ) # Score_pruning_activations (distributed processing) + print(timestamped("Compress Progress 3/8: scoring pruning activations")) score_pruning_activations.launch_score_activations(hydra_cfg, runtime) # Prune the model and save pruned checkpoints + print(timestamped("Compress Progress 4/8: pruning the model and saving pruned checkpoints")) if runtime.global_rank == 0: pruning_ckpts.launch_prune_ckpt(hydra_cfg) runtime.wait_for_everyone() @@ -203,12 +207,19 @@ def run_search(self) -> None: ) # Build_library_and_stats (single process) + print( + timestamped( + "Compress Progress 5/8: building replacement library and calculating subblock statistics" + ) + ) if runtime.global_rank == 0: build_library_and_stats.launch_build_library_and_stats(hydra_cfg) runtime.wait_for_everyone() # Calc_one_block_scores (distributed processing) + print(timestamped("Compress Progress 6/8: calculating one block scores")) scoring.launch_scoring(hydra_cfg, runtime) # mip_and_realize_models (distributed processing) + print(timestamped("Compress Progress 7/8: running MIP and realizing models")) mip_and_realize_models.launch_mip_and_realize_model(hydra_cfg, runtime) From 64b33e24cd89f0c9cdc313e9c690d7d96b7ff9a8 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sat, 1 Nov 2025 21:27:32 +0100 Subject: [PATCH 62/81] Update docs Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/compress/README.md b/examples/compress/README.md index 7a4535441..37c12d723 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -46,9 +46,7 @@ In this example, we compress [meta-llama/Llama-3.1-8B-Instruct](https://huggingf If you want to try different memory constraints without re-running the expensive pruning and scoring steps, use the `--mip-only` flag: ```bash -torchrun examples/compress/main.py \ - --config path/to/llama-3_1_8B_pruneffn_memory.yaml \ - --mip-only +torchrun examples/compress/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml --mip-only 2>&1 | tee ./log.txt |grep "Compress Progress" ``` This assumes pruning, replacement library building, NAS scoring, and subblock stats calculation have 
already been completed. From 21a602ce44049b083e629e0f41aba95eea489ca0 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sat, 1 Nov 2025 21:53:42 +0100 Subject: [PATCH 63/81] Update compress tutorial Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 23 ++++++++++--------- .../llama-3_1-8B_pruneffn_memory.yaml | 2 +- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/examples/compress/README.md b/examples/compress/README.md index 37c12d723..67ad3985a 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -19,20 +19,21 @@ In this example, we compress [meta-llama/Llama-3.1-8B-Instruct](https://huggingf 3. Run the compression script. ```bash - torchrun examples/compress/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml 2>&1 | tee ./log.txt |grep "Compress Progress" + torchrun examples/compress/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml 2>&1 | tee ./log.txt | grep "Compress Progress" ``` - screen output: + This will save the full output to `log.txt` and display the following progress on screen: ```bash - [2025-11-01 19:26:38] Compress Progress 1/8: starting compression pipeline - [2025-11-01 19:26:38] Compress Progress 2/8: converting model from HF to DeciLM - [2025-11-01 19:26:39] Compress Progress 3/8: scoring pruning activations - [2025-11-01 19:26:46] Compress Progress 4/8: pruning the model and saving pruned checkpoints - [2025-11-01 19:26:46] Compress Progress 5/8: building replacement library and calculating subblock statistics - [2025-11-01 19:26:46] Compress Progress 6/8: calculating one block scores - [2025-11-01 19:26:52] Compress Progress 7/8: running MIP and realizing models - [2025-11-01 19:26:59] Compress Progress 8/8: compression pipeline completed + # Produced on a single NVIDIA H100 80GB HBM3 card + [2025-11-01 13:43:10] Compress Progress 1/8: starting compression pipeline + [2025-11-01 13:43:10] Compress Progress 2/8: converting model from HF to DeciLM + [2025-11-01 13:43:30] Compress Progress 3/8: scoring pruning activations + [2025-11-01 13:44:38] Compress Progress 4/8: pruning the model and saving pruned checkpoints + [2025-11-01 13:44:45] Compress Progress 5/8: building replacement library and calculating subblock statistics + [2025-11-01 13:44:46] Compress Progress 6/8: calculating one block scores + [2025-11-01 13:49:29] Compress Progress 7/8: running MIP and realizing models + [2025-11-01 13:52:59] Compress Progress 8/8: compression pipeline completed ``` ## Evaluate model accuracy @@ -46,7 +47,7 @@ In this example, we compress [meta-llama/Llama-3.1-8B-Instruct](https://huggingf If you want to try different memory constraints without re-running the expensive pruning and scoring steps, use the `--mip-only` flag: ```bash -torchrun examples/compress/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml --mip-only 2>&1 | tee ./log.txt |grep "Compress Progress" +torchrun examples/compress/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml --mip-only 2>&1 | tee ./log.txt | grep "Compress Progress" ``` This assumes pruning, replacement library building, NAS scoring, and subblock stats calculation have already been completed. 
diff --git a/examples/compress/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml index 74af0cad6..ab697fd93 100644 --- a/examples/compress/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml +++ b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml @@ -14,7 +14,7 @@ puzzle_dir: /workspace/puzzle_dir # MIP memory constraint (in MiB) mip: human_constraints: - target_memory: 2_000 # 2 GiB + target_memory: 78_000 # 78 GiB # FFN intermediate sizes to search over (heterogeneous architecture) pruning: From 9a381fe39dfe1a6b5fcbafe76be97c9b07dd35f8 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sat, 1 Nov 2025 22:00:31 +0100 Subject: [PATCH 64/81] Update compress tutorial ffn search space Signed-off-by: Daniel Korzekwa --- .../llama-3_1-8B_pruneffn_memory.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/compress/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml index ab697fd93..c9a0cabf3 100644 --- a/examples/compress/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml +++ b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml @@ -18,4 +18,4 @@ mip: # FFN intermediate sizes to search over (heterogeneous architecture) pruning: - intermediate_size_list: [256] # Llama 3.2 1B baseline: 8192 + intermediate_size_list: [3072, 5888, 8704, 11520] # teacher_intermediate_size is 14336 From c47e0af48009ba872a927f53ea9aa0dfdf74c7fb Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sat, 1 Nov 2025 22:15:52 +0100 Subject: [PATCH 65/81] Update tutorial Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/compress/README.md b/examples/compress/README.md index 67ad3985a..8f8db8f04 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -26,14 +26,14 @@ In this example, we compress [meta-llama/Llama-3.1-8B-Instruct](https://huggingf ```bash # Produced on a single NVIDIA H100 80GB HBM3 card - [2025-11-01 13:43:10] Compress Progress 1/8: starting compression pipeline - [2025-11-01 13:43:10] Compress Progress 2/8: converting model from HF to DeciLM - [2025-11-01 13:43:30] Compress Progress 3/8: scoring pruning activations - [2025-11-01 13:44:38] Compress Progress 4/8: pruning the model and saving pruned checkpoints - [2025-11-01 13:44:45] Compress Progress 5/8: building replacement library and calculating subblock statistics - [2025-11-01 13:44:46] Compress Progress 6/8: calculating one block scores - [2025-11-01 13:49:29] Compress Progress 7/8: running MIP and realizing models - [2025-11-01 13:52:59] Compress Progress 8/8: compression pipeline completed + [2025-11-01 14:01:10] Compress Progress 1/8: starting compression pipeline + [2025-11-01 14:01:10] Compress Progress 2/8: converting model from HF to DeciLM + [2025-11-01 14:01:29] Compress Progress 3/8: scoring pruning activations + [2025-11-01 14:02:30] Compress Progress 4/8: pruning the model and saving pruned checkpoints + [2025-11-01 14:03:18] Compress Progress 5/8: building replacement library and calculating subblock statistics + [2025-11-01 14:03:19] Compress Progress 6/8: calculating one block scores + [2025-11-01 14:13:35] Compress Progress 7/8: running MIP and 
realizing models + [2025-11-01 14:13:52] Compress Progress 8/8: compression pipeline completed ``` ## Evaluate model accuracy From ce8d53afd5173978923fd81ae927ff7095eb0c31 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sun, 2 Nov 2025 09:58:06 +0100 Subject: [PATCH 66/81] Implement mip_only mode. Signed-off-by: Daniel Korzekwa --- examples/compress/main.py | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/examples/compress/main.py b/examples/compress/main.py index 95cda0d9a..991af9b69 100644 --- a/examples/compress/main.py +++ b/examples/compress/main.py @@ -32,6 +32,7 @@ import datetime from pathlib import Path +import mip_and_realize_models import torch from puzzle_tools.hydra_utils import register_hydra_resolvers @@ -125,10 +126,30 @@ def run_mip_only(hydra_config_path: str): Args: hydra_config_path: Path to the YAML configuration file """ - raise NotImplementedError("MIP-only mode is not implemented yet") - # hydra_config_path = Path(hydra_config_path).resolve() - # config_dir = str(hydra_config_path.parent) - # config_name = hydra_config_path.stem + + with NativeDdpRuntime( + dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10) + ) as runtime: + # Register Hydra custom resolvers (needed for config resolution) + register_hydra_resolvers() + + hydra_config_path = Path(hydra_config_path).resolve() + hydra_config_dir = str(hydra_config_path.parent) + hydra_config_name = hydra_config_path.stem + + # Load hydra config + hydra_cfg = initialize_hydra_config_for_dir( + config_dir=hydra_config_dir, + config_name=hydra_config_name, + overrides=[], + ) + + # mip_and_realize_models (distributed processing) + # TODO: How to make it part of mnt.search() api, similarly to run_full_compress() API + print(timestamped("Compress Progress 7/8: running MIP and realizing models")) + mip_and_realize_models.launch_mip_and_realize_model(hydra_cfg, runtime) + + print(timestamped("Compress Progress 8/8: compression pipeline completed")) def main(): From c754419d0495e49987d824c48c2a300fc7c0d2d8 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sun, 2 Nov 2025 10:45:01 +0100 Subject: [PATCH 67/81] Improve logging. 
Convert HF to DeciLM checkpoint only once (single-gpu) Signed-off-by: Daniel Korzekwa --- examples/compress/main.py | 6 +-- .../nas/plugins/compress_nas_plugin.py | 37 +++++++++++-------- 2 files changed, 25 insertions(+), 18 deletions(-) diff --git a/examples/compress/main.py b/examples/compress/main.py index 991af9b69..93ea0b8ab 100644 --- a/examples/compress/main.py +++ b/examples/compress/main.py @@ -114,7 +114,7 @@ def run_full_compress(hydra_config_path: str): config={}, # this is not used as the search space is defined in the hydra config ) - print(timestamped("Compress Progress 8/8: compression pipeline completed")) + print(timestamped("Compress Progress 8/8: compression pipeline completed (multi-gpu)")) def run_mip_only(hydra_config_path: str): @@ -146,10 +146,10 @@ def run_mip_only(hydra_config_path: str): # mip_and_realize_models (distributed processing) # TODO: How to make it part of mnt.search() api, similarly to run_full_compress() API - print(timestamped("Compress Progress 7/8: running MIP and realizing models")) + print(timestamped("Compress Progress 7/8: running MIP and realizing models (multi-gpu)")) mip_and_realize_models.launch_mip_and_realize_model(hydra_cfg, runtime) - print(timestamped("Compress Progress 8/8: compression pipeline completed")) + print(timestamped("Compress Progress 8/8: compression pipeline completed (multi-gpu)")) def main(): diff --git a/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py b/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py index aa06c217b..bcaaa1114 100644 --- a/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py +++ b/modelopt/torch/_compress/nas/plugins/compress_nas_plugin.py @@ -117,20 +117,27 @@ def convert_compress_model(model: nn.Module, config: CompressConfig) -> ConvertR ) # Convert Llama3 model to DeciLM model - print(timestamped("Compress Progress 2/8: converting model from HF to DeciLM")) - hf_ckpt_teacher_dir = "ckpts/teacher" # TODO: make it configurable - convert_llama3_to_decilm( - input_dir=config.input_model_path, - output_dir=Path(config.puzzle_dir) / hf_ckpt_teacher_dir, - ) + if runtime.global_rank == 0: + print(timestamped("Compress Progress 2/8: converting model from HF to DeciLM (single-gpu)")) + hf_ckpt_teacher_dir = "ckpts/teacher" # TODO: make it configurable + convert_llama3_to_decilm( + input_dir=config.input_model_path, + output_dir=Path(config.puzzle_dir) / hf_ckpt_teacher_dir, + ) + runtime.wait_for_everyone() # Score_pruning_activations (distributed processing) - print(timestamped("Compress Progress 3/8: scoring pruning activations")) + print(timestamped("Compress Progress 3/8: scoring pruning activations (multi-gpu)")) score_pruning_activations.launch_score_activations(hydra_cfg, runtime) # Prune the model and save pruned checkpoints - print(timestamped("Compress Progress 4/8: pruning the model and saving pruned checkpoints")) + if runtime.global_rank == 0: + print( + timestamped( + "Compress Progress 4/8: pruning the model and saving pruned checkpoints (single-gpu)" + ) + ) pruning_ckpts.launch_prune_ckpt(hydra_cfg) runtime.wait_for_everyone() @@ -207,19 +214,19 @@ def run_search(self) -> None: ) # Build_library_and_stats (single process) - print( - timestamped( - "Compress Progress 5/8: building replacement library and calculating subblock statistics" - ) - ) if runtime.global_rank == 0: + print( + timestamped( + "Compress Progress 5/8: building replacement library and subblock statistics (single-gpu)" + ) + ) 
build_library_and_stats.launch_build_library_and_stats(hydra_cfg) runtime.wait_for_everyone() # Calc_one_block_scores (distributed processing) - print(timestamped("Compress Progress 6/8: calculating one block scores")) + print(timestamped("Compress Progress 6/8: calculating one block scores (multi-gpu)")) scoring.launch_scoring(hydra_cfg, runtime) # mip_and_realize_models (distributed processing) - print(timestamped("Compress Progress 7/8: running MIP and realizing models")) + print(timestamped("Compress Progress 7/8: running MIP and realizing models (multi-gpu)")) mip_and_realize_models.launch_mip_and_realize_model(hydra_cfg, runtime) From 6505631acf2e19c3e1a0a7e12be35477018ab93d Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sun, 2 Nov 2025 11:12:44 +0100 Subject: [PATCH 68/81] update docs Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 4 ++-- tests/experimental/torch/_compress/test_compress.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/compress/README.md b/examples/compress/README.md index 8f8db8f04..615703664 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -19,7 +19,7 @@ In this example, we compress [meta-llama/Llama-3.1-8B-Instruct](https://huggingf 3. Run the compression script. ```bash - torchrun examples/compress/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml 2>&1 | tee ./log.txt | grep "Compress Progress" + torchrun --nproc_per_node 1 examples/compress/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml 2>&1 | tee ./log.txt | grep "Compress Progress" ``` This will save the full output to `log.txt` and display the following progress on screen: @@ -47,7 +47,7 @@ In this example, we compress [meta-llama/Llama-3.1-8B-Instruct](https://huggingf If you want to try different memory constraints without re-running the expensive pruning and scoring steps, use the `--mip-only` flag: ```bash -torchrun examples/compress/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml --mip-only 2>&1 | tee ./log.txt | grep "Compress Progress" +torchrun --nproc_per_node 1 examples/compress/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml --mip-only 2>&1 | tee ./log.txt | grep "Compress Progress" ``` This assumes pruning, replacement library building, NAS scoring, and subblock stats calculation have already been completed. diff --git a/tests/experimental/torch/_compress/test_compress.py b/tests/experimental/torch/_compress/test_compress.py index 2ef786d14..0622bbbda 100644 --- a/tests/experimental/torch/_compress/test_compress.py +++ b/tests/experimental/torch/_compress/test_compress.py @@ -52,7 +52,7 @@ # pip install mip # pip install lru-dict # -# export PYTHONPATH=$PYTHONPATH:/workspace/puzzletron/v1 +# export PYTHONPATH=$PYTHONPATH:.:/workspace/puzzletron/v1 # # pytest -s -v ./tests/experimental/torch/_compress/test_compress.py::test_compress -o addopts="" From 734c32cd92c62b4128c008f86117989cd17e62e6 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sun, 2 Nov 2025 13:59:48 +0100 Subject: [PATCH 69/81] Update compress tutorial with --mip_only part. 
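
The --mip-only flow documented here reuses an existing puzzle_dir and repeats
only the MIP search and model realization, so different memory budgets can be
explored without re-running pruning and scoring. A rough sketch of the
underlying calls, mirroring run_mip_only() in examples/compress/main.py
(config paths are illustrative):

    # Sketch only - mirrors run_mip_only(); config paths are illustrative.
    import datetime
    import mip_and_realize_models
    import torch
    from puzzle_tools.hydra_utils import register_hydra_resolvers
    from modelopt.torch._compress.runtime import NativeDdpRuntime
    from tests.utils.test_utils import initialize_hydra_config_for_dir

    with NativeDdpRuntime(
        dtype=torch.bfloat16, torch_distributed_timeout=datetime.timedelta(10)
    ) as runtime:
        register_hydra_resolvers()
        hydra_cfg = initialize_hydra_config_for_dir(
            config_dir="path/to/hydra_config_dir",  # illustrative
            config_name="llama-3_1-8B_pruneffn_memory",
            overrides=[],
        )
        # Re-run only MIP search + model realization on the existing puzzle_dir.
        mip_and_realize_models.launch_mip_and_realize_model(hydra_cfg, runtime)

Between runs, only mip.human_constraints.target_memory in the YAML needs to
change.
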
Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 87 ++++++++++++++++++- .../torch/_compress/test_compress.py | 3 - 2 files changed, 85 insertions(+), 5 deletions(-) diff --git a/examples/compress/README.md b/examples/compress/README.md index 615703664..0ea718175 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -36,6 +36,47 @@ In this example, we compress [meta-llama/Llama-3.1-8B-Instruct](https://huggingf [2025-11-01 14:13:52] Compress Progress 8/8: compression pipeline completed ``` + This will generate the following network architecture (see `log.txt`): + + ```bash + block_0: attention gqa_4 ffn intermediate_14336 + block_1: attention gqa_4 ffn intermediate_14336 + block_2: attention gqa_4 ffn intermediate_14336 + block_3: attention gqa_4 ffn intermediate_14336 + block_4: attention gqa_4 ffn intermediate_14336 + block_5: attention gqa_4 ffn intermediate_14336 + block_6: attention gqa_4 ffn intermediate_14336 + block_7: attention gqa_4 ffn intermediate_14336 + block_8: attention gqa_4 ffn intermediate_14336 + block_9: attention gqa_4 ffn intermediate_14336 + block_10: attention gqa_4 ffn intermediate_14336 + block_11: attention gqa_4 ffn intermediate_14336 + block_12: attention gqa_4 ffn intermediate_14336 + block_13: attention gqa_4 ffn intermediate_14336 + block_14: attention gqa_4 ffn intermediate_14336 + block_15: attention gqa_4 ffn intermediate_14336 + block_16: attention gqa_4 ffn intermediate_14336 + block_17: attention no_op ffn intermediate_14336 + block_18: attention no_op ffn intermediate_14336 + block_19: attention no_op ffn intermediate_14336 + block_20: attention no_op ffn intermediate_14336 + block_21: attention no_op ffn intermediate_14336 + block_22: attention no_op ffn intermediate_14336 + block_23: attention no_op ffn intermediate_14336 + block_24: attention no_op ffn intermediate_14336 + block_25: attention no_op ffn intermediate_14336 + block_26: attention no_op ffn intermediate_14336 + block_27: attention no_op ffn intermediate_14336 + block_28: attention no_op ffn intermediate_14336 + block_29: attention gqa_4 ffn intermediate_14336 + block_30: attention gqa_4 ffn intermediate_14336 + block_31: attention gqa_4 ffn intermediate_14336 + + [2025-11-02 04:53:11,332]^[[92m[rank-0]^[[0m[run_puzzle.py:295] Total costs: {'stats.memory_mib': 75796.4140625, 'stats.ffn_num_params': 5637275648, 'stats.num_kv_heads': 160, 'stats.kv_cache_memory_mib': 61440.0, 'stats.ffn_memory_mib': 10752.25, 'stats.attention_memory_mib': 63040.15625, 'stats.attention_num_params': 838942720, 'stats.num_params': 7526895616, 'stats.has_attention': 20, 'stats.has_ffn': 32} + [2025-11-02 04:53:11,341]^[[92m[rank-0]^[[0m[run_puzzle.py:300] /workspace/puzzle_dir/mip/puzzle_solutions/target_memory_78000MiB/solutions.json + [2025-11-02 04:53:11,341]^[[92m[rank-0]^[[0m[mip_and_realize_models.py:49] Realize model for the solution: /workspace/puzzle_dir/mip/puzzle_solutions/target_memory_78000MiB/solutions.json + ``` + ## Evaluate model accuracy ```bash @@ -44,13 +85,55 @@ In this example, we compress [meta-llama/Llama-3.1-8B-Instruct](https://huggingf ## Re-run MIP Search with different memory constraints -If you want to try different memory constraints without re-running the expensive pruning and scoring steps, use the `--mip-only` flag: +If you want to try different memory constraints without re-running the expensive pruning and scoring steps, use the `--mip-only` flag. 
+This assumes pruning, replacement library building, NAS scoring, and subblock stats calculation have already been completed. + +Set `target_memory: 28_000` in `llama-3_1-8B_pruneffn_memory.yaml`. ```bash torchrun --nproc_per_node 1 examples/compress/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml --mip-only 2>&1 | tee ./log.txt | grep "Compress Progress" ``` -This assumes pruning, replacement library building, NAS scoring, and subblock stats calculation have already been completed. +This will generate the following network architecture (see `log.txt`): + +```bash +block_0: attention gqa_4 ffn intermediate_14336 +block_1: attention gqa_4 ffn intermediate_14336 +block_2: attention gqa_4 ffn intermediate_14336 +block_3: attention gqa_4 ffn intermediate_14336 +block_4: attention gqa_4 ffn intermediate_14336 +block_5: attention no_op ffn intermediate_11520 +block_6: attention no_op ffn intermediate_14336 +block_7: attention no_op ffn intermediate_8704 +block_8: attention no_op ffn intermediate_14336 +block_9: attention no_op ffn intermediate_3072 +block_10: attention no_op ffn intermediate_11520 +block_11: attention no_op ffn intermediate_11520 +block_12: attention no_op ffn intermediate_11520 +block_13: attention no_op ffn intermediate_11520 +block_14: attention no_op ffn intermediate_3072 +block_15: attention no_op ffn intermediate_14336 +block_16: attention no_op ffn intermediate_14336 +block_17: attention no_op ffn intermediate_14336 +block_18: attention no_op ffn intermediate_14336 +block_19: attention no_op ffn intermediate_14336 +block_20: attention no_op ffn intermediate_14336 +block_21: attention no_op ffn intermediate_14336 +block_22: attention no_op ffn intermediate_14336 +block_23: attention no_op ffn intermediate_14336 +block_24: attention no_op ffn intermediate_14336 +block_25: attention no_op ffn intermediate_14336 +block_26: attention no_op ffn intermediate_14336 +block_27: attention no_op ffn intermediate_14336 +block_28: attention no_op ffn intermediate_14336 +block_29: attention no_op ffn intermediate_14336 +block_30: attention no_op ffn intermediate_14336 +block_31: attention no_op ffn intermediate_14336 + +[2025-11-02 04:47:51,874]^[[92m[rank-0]^[[0m[run_puzzle.py:295] Total costs: {'stats.memory_mib': 27526.296875, 'stats.num_kv_heads': 40, 'stats.kv_cache_memory_mib': 15360.0, 'stats.has_ffn': 32, 'stats.attention_num_params': 209735680, 'stats.ffn_num_params': 5118230528, 'stats.attention_memory_mib': 15760.0390625, 'stats.num_params': 6378643456, 'stats.has_attention': 5, 'stats.ffn_memory_mib': 9762.25} +[2025-11-02 04:47:51,882]^[[92m[rank-0]^[[0m[run_puzzle.py:300] /workspace/puzzle_dir/mip/puzzle_solutions/target_memory_28000MiB/solutions.json +[2025-11-02 04:47:51,882]^[[92m[rank-0]^[[0m[mip_and_realize_models.py:49] Realize model for the solution: /workspace/puzzle_dir/mip/puzzle_solutions/target_memory_28000MiB/solutions.json +``` ## Deploy to TensorRT-LLM diff --git a/tests/experimental/torch/_compress/test_compress.py b/tests/experimental/torch/_compress/test_compress.py index 0622bbbda..9d009c313 100644 --- a/tests/experimental/torch/_compress/test_compress.py +++ b/tests/experimental/torch/_compress/test_compress.py @@ -49,9 +49,6 @@ # --image gitlab-master.nvidia.com/deci/puzzletron:trtllm_main \ # --workdir $MODELOPT SRC DIRECTORY --interactive --gpu 1 # -# pip install mip -# pip install lru-dict -# # export PYTHONPATH=$PYTHONPATH:.:/workspace/puzzletron/v1 # # pytest -s -v ./tests/experimental/torch/_compress/test_compress.py::test_compress -o 
addopts="" From ee14792fc89b610a96418119ae145f3762b35221 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sun, 2 Nov 2025 19:01:27 +0100 Subject: [PATCH 70/81] Update docs Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/compress/README.md b/examples/compress/README.md index 0ea718175..0cdc10577 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -2,7 +2,7 @@ This tutorial demonstrates how to compress large language models using the compress algorithm based on the [Puzzle paper](https://arxiv.org/abs/2411.19146). -In this example, we compress [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model by searching for the optimal `ffn_intermediate_size` across MLP layers, resulting in a heterogeneous architecture while reducing GPU memory usage by 20%. +In this example, we compress [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model by searching for the optimal `ffn_intermediate_size` across MLP layers, resulting in a heterogeneous architecture while reducing GPU memory usage from 113GB to 76GB. ## Compress the Model From 5dca0aa5eaf399aac0e09368a9cca8e334e8ab0e Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sun, 2 Nov 2025 19:48:12 +0100 Subject: [PATCH 71/81] Update tutorial llama config file. Signed-off-by: Daniel Korzekwa --- .../llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml | 10 ++++++---- tests/experimental/torch/_compress/test_compress.py | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/examples/compress/configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml index 1d8fac655..6d9c90fa9 100644 --- a/examples/compress/configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml +++ b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml @@ -28,6 +28,8 @@ calc_subblock_stats: merge_with_existing_stats: false subblock_stats_filename: "subblock_stats.json" moe_stats_filename: "moe_stats.json" + runtime_stats: + backend: trt_torch scoring: solutions_to_validate: @@ -40,9 +42,9 @@ scoring: eval_samples: 2 micro_batch_size: 1 - dataset_path: ${dataset_path}/valid seed: 42 shuffle_seed: 444 + dataset_path: ${dataset_path} mip: single_block_replacement_validation_dir: ${to_path:${scoring.output_dir}} @@ -76,7 +78,7 @@ mip: - stats.attention_num_params human_constraints: - target_memory: 780_000 # 78_000 + target_memory: 78_000 mip_constraints: use_greedy_search: false @@ -94,11 +96,11 @@ realize_model: # Validate params skip_validation: false # To enable validation of the model solution set `skip_validation` as False - eval_samples: 2 + eval_samples: 128 micro_batch_size: 1 - dataset_path: ${dataset_path}/valid seed: 42 shuffle_seed: 444 + dataset_path: ${dataset_path} nccl_timeout_minutes: ${timedelta_minutes:10} diff --git a/tests/experimental/torch/_compress/test_compress.py b/tests/experimental/torch/_compress/test_compress.py index 9d009c313..1c673da51 100644 --- a/tests/experimental/torch/_compress/test_compress.py +++ b/tests/experimental/torch/_compress/test_compress.py @@ -46,7 +46,7 @@ # /workspace/puzzletron # # submit_job --partition interactive --time 0 \ -# --image gitlab-master.nvidia.com/deci/puzzletron:trtllm_main \ +# --image gitlab-master.nvidia.com/deci/puzzletron:modelopt_main \ # --workdir $MODELOPT SRC DIRECTORY --interactive --gpu 1 # # export 
PYTHONPATH=$PYTHONPATH:.:/workspace/puzzletron/v1 From 5454c59395dcfd67face2186df9729ca3b6c9bf8 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sun, 2 Nov 2025 20:45:30 +0100 Subject: [PATCH 72/81] Update compress tutorial Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 102 ++++++++++-------- .../llama-3_1-8B_pruneffn_memory.yaml | 2 +- 2 files changed, 60 insertions(+), 44 deletions(-) diff --git a/examples/compress/README.md b/examples/compress/README.md index 0cdc10577..db62207a8 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -2,12 +2,14 @@ This tutorial demonstrates how to compress large language models using the compress algorithm based on the [Puzzle paper](https://arxiv.org/abs/2411.19146). -In this example, we compress [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model by searching for the optimal `ffn_intermediate_size` across MLP layers, resulting in a heterogeneous architecture while reducing GPU memory usage from 113GB to 76GB. +In this example, we compress the [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model by searching for the optimal `ffn_intermediate_size` across MLP layers and `attention op/noop`. This results in a heterogeneous architecture while reducing GPU memory usage from 113GB to 96GB (15% reduction) with less than 1% regression in the token_accuracy_top_10 metric. ## Compress the Model 1. Specify the `puzzle_dir`, `input_hf_model_path`, `dataset_path`, `intermediate_size_list`, and `target_memory` arguments in the [llama-3_1-8B_pruneffn_memory.yaml](./configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml) configuration file. + Let's first shoot for 32% GPU memory reduction setting `target_memory = 78_000` GiB. + 2. Download and prepare the [Nemotron-Post-Training-Dataset-v2](https://huggingface.co/datasets/nvidia/Nemotron-Post-Training-Dataset-v2). dataset split: "code", "math", "stem", "chat", excluding reasoning samples (2.62GB) @@ -39,6 +41,7 @@ In this example, we compress [meta-llama/Llama-3.1-8B-Instruct](https://huggingf This will generate the following network architecture (see `log.txt`): ```bash + ... block_0: attention gqa_4 ffn intermediate_14336 block_1: attention gqa_4 ffn intermediate_14336 block_2: attention gqa_4 ffn intermediate_14336 @@ -71,24 +74,31 @@ In this example, we compress [meta-llama/Llama-3.1-8B-Instruct](https://huggingf block_29: attention gqa_4 ffn intermediate_14336 block_30: attention gqa_4 ffn intermediate_14336 block_31: attention gqa_4 ffn intermediate_14336 - + [2025-11-02 04:53:11,332]^[[92m[rank-0]^[[0m[run_puzzle.py:295] Total costs: {'stats.memory_mib': 75796.4140625, 'stats.ffn_num_params': 5637275648, 'stats.num_kv_heads': 160, 'stats.kv_cache_memory_mib': 61440.0, 'stats.ffn_memory_mib': 10752.25, 'stats.attention_memory_mib': 63040.15625, 'stats.attention_num_params': 838942720, 'stats.num_params': 7526895616, 'stats.has_attention': 20, 'stats.has_ffn': 32} - [2025-11-02 04:53:11,341]^[[92m[rank-0]^[[0m[run_puzzle.py:300] /workspace/puzzle_dir/mip/puzzle_solutions/target_memory_78000MiB/solutions.json - [2025-11-02 04:53:11,341]^[[92m[rank-0]^[[0m[mip_and_realize_models.py:49] Realize model for the solution: /workspace/puzzle_dir/mip/puzzle_solutions/target_memory_78000MiB/solutions.json - ``` + ... 
+ ################################################################ + validate_model_and_extract_token_probs(model_name='teacher') + ################################################################ + ... + Average losses = {'lm_loss': 1.118250765837729, 'token_accuracy_top_1': 0.7331905364990234, 'token_accuracy_top_5': 0.9094219207763672, 'token_accuracy_top_10': 0.9423646926879883, + ... + ################################################################ + validate_model_with_kl_div(model_name='solution_0', is_calc_kl_div=True) + ################################################################ + .... + Average losses = {'lm_loss': 1.7577573340386152, 'token_accuracy_top_1': 0.6225490570068359, 'token_accuracy_top_5': 0.846257209777832, 'token_accuracy_top_10': 0.8987817764282227} -## Evaluate model accuracy + ``` -```bash -# TODO -``` + 30% GPU memory reduction leads to nearly 5% regression in token_accuracy_top_10 metric (0.898 / 0.942). Let's rerun MIP search aiming for 15% memory reduction. ## Re-run MIP Search with different memory constraints If you want to try different memory constraints without re-running the expensive pruning and scoring steps, use the `--mip-only` flag. This assumes pruning, replacement library building, NAS scoring, and subblock stats calculation have already been completed. -Set `target_memory: 28_000` in `llama-3_1-8B_pruneffn_memory.yaml`. +Set `target_memory: 96_000` in `llama-3_1-8B_pruneffn_memory.yaml`. ```bash torchrun --nproc_per_node 1 examples/compress/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml --mip-only 2>&1 | tee ./log.txt | grep "Compress Progress" @@ -102,49 +112,55 @@ block_1: attention gqa_4 ffn intermediate_14336 block_2: attention gqa_4 ffn intermediate_14336 block_3: attention gqa_4 ffn intermediate_14336 block_4: attention gqa_4 ffn intermediate_14336 -block_5: attention no_op ffn intermediate_11520 -block_6: attention no_op ffn intermediate_14336 -block_7: attention no_op ffn intermediate_8704 -block_8: attention no_op ffn intermediate_14336 -block_9: attention no_op ffn intermediate_3072 -block_10: attention no_op ffn intermediate_11520 -block_11: attention no_op ffn intermediate_11520 -block_12: attention no_op ffn intermediate_11520 -block_13: attention no_op ffn intermediate_11520 -block_14: attention no_op ffn intermediate_3072 -block_15: attention no_op ffn intermediate_14336 -block_16: attention no_op ffn intermediate_14336 -block_17: attention no_op ffn intermediate_14336 +block_5: attention gqa_4 ffn intermediate_14336 +block_6: attention gqa_4 ffn intermediate_14336 +block_7: attention gqa_4 ffn intermediate_14336 +block_8: attention gqa_4 ffn intermediate_14336 +block_9: attention gqa_4 ffn intermediate_14336 +block_10: attention gqa_4 ffn intermediate_14336 +block_11: attention gqa_4 ffn intermediate_14336 +block_12: attention gqa_4 ffn intermediate_14336 +block_13: attention gqa_4 ffn intermediate_14336 +block_14: attention gqa_4 ffn intermediate_14336 +block_15: attention gqa_4 ffn intermediate_14336 +block_16: attention gqa_4 ffn intermediate_14336 +block_17: attention gqa_4 ffn intermediate_14336 block_18: attention no_op ffn intermediate_14336 block_19: attention no_op ffn intermediate_14336 block_20: attention no_op ffn intermediate_14336 -block_21: attention no_op ffn intermediate_14336 +block_21: attention gqa_4 ffn intermediate_14336 block_22: attention no_op ffn intermediate_14336 block_23: attention no_op ffn intermediate_14336 block_24: attention no_op ffn intermediate_14336 -block_25: attention 
no_op ffn intermediate_14336 -block_26: attention no_op ffn intermediate_14336 -block_27: attention no_op ffn intermediate_14336 -block_28: attention no_op ffn intermediate_14336 -block_29: attention no_op ffn intermediate_14336 -block_30: attention no_op ffn intermediate_14336 -block_31: attention no_op ffn intermediate_14336 - -[2025-11-02 04:47:51,874]^[[92m[rank-0]^[[0m[run_puzzle.py:295] Total costs: {'stats.memory_mib': 27526.296875, 'stats.num_kv_heads': 40, 'stats.kv_cache_memory_mib': 15360.0, 'stats.has_ffn': 32, 'stats.attention_num_params': 209735680, 'stats.ffn_num_params': 5118230528, 'stats.attention_memory_mib': 15760.0390625, 'stats.num_params': 6378643456, 'stats.has_attention': 5, 'stats.ffn_memory_mib': 9762.25} -[2025-11-02 04:47:51,882]^[[92m[rank-0]^[[0m[run_puzzle.py:300] /workspace/puzzle_dir/mip/puzzle_solutions/target_memory_28000MiB/solutions.json -[2025-11-02 04:47:51,882]^[[92m[rank-0]^[[0m[mip_and_realize_models.py:49] Realize model for the solution: /workspace/puzzle_dir/mip/puzzle_solutions/target_memory_28000MiB/solutions.json -``` - -## Deploy to TensorRT-LLM - -```bash -# TODO +block_25: attention gqa_4 ffn intermediate_14336 +block_26: attention gqa_4 ffn intermediate_14336 +block_27: attention gqa_4 ffn intermediate_14336 +block_28: attention gqa_4 ffn intermediate_14336 +block_29: attention gqa_4 ffn intermediate_14336 +block_30: attention gqa_4 ffn intermediate_14336 +block_31: attention gqa_4 ffn intermediate_14336 + +[2025-11-02 11:01:56,443]^[[92m[rank-0]^[[0m[run_puzzle.py:295] Total costs: {'stats.memory_mib': 94708.4609375, 'stats.attention_memory_mib': 81952.203125, 'stats.ffn_memory_mib': 10752.25, 'stats.has_ffn': 32, 'stats.ffn_num_params': 5637275648, 'stats.attention_num_params': 1090625536, 'stats.has_attention': 26, 'stats.kv_cache_memory_mib': 79872.0, 'stats.num_kv_heads': 208, 'stats.num_params': 7778578432} +... 
+################################################################ +validate_model_with_kl_div(model_name='solution_0', is_calc_kl_div=True) +################################################################ +Average losses = {'lm_loss': 1.2425934937782586, 'token_accuracy_top_1': 0.703862190246582, 'token_accuracy_top_5': 0.8954982757568359, 'token_accuracy_top_10': 0.9336576461791992, ``` -## Export to NeMo for Knowledge Distillation +On the other hand, if you set `target_memory: 28_000`, you would observe that for some layers the intermediate FFN size starts to reduce (see `log.txt`): ```bash -# TODO +block_5: attention no_op ffn intermediate_11520 +block_6: attention no_op ffn intermediate_14336 +block_7: attention no_op ffn intermediate_8704 +block_8: attention no_op ffn intermediate_14336 +block_9: attention no_op ffn intermediate_3072 +block_10: attention no_op ffn intermediate_11520 +block_11: attention no_op ffn intermediate_11520 +block_12: attention no_op ffn intermediate_11520 +block_13: attention no_op ffn intermediate_11520 +block_14: attention no_op ffn intermediate_3072 ``` ## Advanced usage diff --git a/examples/compress/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml index c9a0cabf3..cfd7f93e8 100644 --- a/examples/compress/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml +++ b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml @@ -14,7 +14,7 @@ puzzle_dir: /workspace/puzzle_dir # MIP memory constraint (in MiB) mip: human_constraints: - target_memory: 78_000 # 78 GiB + target_memory: 96_000 # 96 GiB # FFN intermediate sizes to search over (heterogeneous architecture) pruning: From b3fd9df191d808d482aa1441002dbe651fd94643 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sun, 2 Nov 2025 20:53:27 +0100 Subject: [PATCH 73/81] Update docs Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/compress/README.md b/examples/compress/README.md index db62207a8..a1cc1eced 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -27,7 +27,7 @@ In this example, we compress the [meta-llama/Llama-3.1-8B-Instruct](https://hugg This will save the full output to `log.txt` and display the following progress on screen: ```bash - # Produced on a single NVIDIA H100 80GB HBM3 card + # Produced on 2x NVIDIA H100 80GB HBM3 [2025-11-01 14:01:10] Compress Progress 1/8: starting compression pipeline [2025-11-01 14:01:10] Compress Progress 2/8: converting model from HF to DeciLM [2025-11-01 14:01:29] Compress Progress 3/8: scoring pruning activations From d4ed34a6f1a4f649b3ee0c611c5939c165e2b72c Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sun, 2 Nov 2025 21:04:21 +0100 Subject: [PATCH 74/81] Update compress setting to increase the number of eval samples. 
Signed-off-by: Daniel Korzekwa --- .../configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml | 2 +- .../pruning/pruning_defaults.yaml | 4 ++-- .../_compress/resources/configs/pruning/ffn_pruning.yaml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/compress/configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml index 6d9c90fa9..70b5304c5 100644 --- a/examples/compress/configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml +++ b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml @@ -40,7 +40,7 @@ scoring: teacher_dir: ${to_path:${teacher_dir}} output_dir: ${puzzle_dir}/single_sequence_replacement_solutions--validation - eval_samples: 2 + eval_samples: 10 # default is 128 micro_batch_size: 1 seed: 42 shuffle_seed: 444 diff --git a/examples/compress/configs/llama-3_1-8B_pruneffn_memory/pruning/pruning_defaults.yaml b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/pruning/pruning_defaults.yaml index 0a5eafcff..5d5307b9c 100644 --- a/examples/compress/configs/llama-3_1-8B_pruneffn_memory/pruning/pruning_defaults.yaml +++ b/examples/compress/configs/llama-3_1-8B_pruneffn_memory/pruning/pruning_defaults.yaml @@ -7,7 +7,7 @@ activations_log_dir: ??? activation_hooks_kwargs: ??? # Data: -eval_samples: 100 +eval_samples: 1000 # default is 10000 micro_batch_size: 4 dataset_path: ${dataset_path} val_dataset_name: train @@ -17,7 +17,7 @@ pruned_ckpts_outpt_dir: ${puzzle_dir}/pruning/${pruning.experiment_id} ## FFN pruning ffn_list: -mlp_init_mode: "Truncate" +mlp_init_mode: "Truncate" # PruneByActivationsLog ## KV-heads pruning n_heads_in_group_list: diff --git a/tests/experimental/torch/_compress/resources/configs/pruning/ffn_pruning.yaml b/tests/experimental/torch/_compress/resources/configs/pruning/ffn_pruning.yaml index f0c852eec..96a8ca72e 100644 --- a/tests/experimental/torch/_compress/resources/configs/pruning/ffn_pruning.yaml +++ b/tests/experimental/torch/_compress/resources/configs/pruning/ffn_pruning.yaml @@ -8,5 +8,5 @@ activation_hooks_kwargs: target_layer: "mlp.down_proj" layer_input_descriptors_path: -intermediate_size_list: [256] # teacher_intermediate_size is 14336 +intermediate_size_list: [3072, 5888, 8704, 11520] # teacher_intermediate_size is 14336 mlp_init_mode: "PruneByActivationsLog" From 99798727ede4fc158763be8cc810c2a9bf09f43e Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sun, 2 Nov 2025 22:25:16 +0100 Subject: [PATCH 75/81] Update compress tutorial Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/examples/compress/README.md b/examples/compress/README.md index a1cc1eced..5efd879a0 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -28,14 +28,14 @@ In this example, we compress the [meta-llama/Llama-3.1-8B-Instruct](https://hugg ```bash # Produced on 2x NVIDIA H100 80GB HBM3 - [2025-11-01 14:01:10] Compress Progress 1/8: starting compression pipeline - [2025-11-01 14:01:10] Compress Progress 2/8: converting model from HF to DeciLM - [2025-11-01 14:01:29] Compress Progress 3/8: scoring pruning activations - [2025-11-01 14:02:30] Compress Progress 4/8: pruning the model and saving pruned checkpoints - [2025-11-01 14:03:18] Compress Progress 5/8: building replacement library and calculating subblock statistics - [2025-11-01 14:03:19] Compress Progress 6/8: calculating one block scores - [2025-11-01 14:13:35] 
Compress Progress 7/8: running MIP and realizing models - [2025-11-01 14:13:52] Compress Progress 8/8: compression pipeline completed + [2025-11-02 12:06:34] Compress Progress 1/8: starting compression pipeline + [2025-11-02 12:06:45] Compress Progress 2/8: converting model from HF to DeciLM (single-gpu) + [2025-11-02 12:07:07] Compress Progress 3/8: scoring pruning activations (multi-gpu) + [2025-11-02 12:11:36] Compress Progress 4/8: pruning the model and saving pruned checkpoints (single-gpu) + [2025-11-02 12:12:20] Compress Progress 5/8: building replacement library and subblock statistics (single-gpu) + [2025-11-02 12:12:21] Compress Progress 6/8: calculating one block scores (multi-gpu) + [2025-11-02 12:50:41] Compress Progress 7/8: running MIP and realizing models (multi-gpu) + [2025-11-02 12:52:34] Compress Progress 8/8: compression pipeline completed (multi-gpu) ``` This will generate the following network architecture (see `log.txt`): @@ -81,14 +81,13 @@ In this example, we compress the [meta-llama/Llama-3.1-8B-Instruct](https://hugg validate_model_and_extract_token_probs(model_name='teacher') ################################################################ ... - Average losses = {'lm_loss': 1.118250765837729, 'token_accuracy_top_1': 0.7331905364990234, 'token_accuracy_top_5': 0.9094219207763672, 'token_accuracy_top_10': 0.9423646926879883, + Average losses = {'lm_loss': 1.118250765837729, 'token_accuracy_top_1': 0.7331905364990234, 'token_accuracy_top_5': 0.9094219207763672, 'token_accuracy_top_10': 0.9423646926879883} ... ################################################################ validate_model_with_kl_div(model_name='solution_0', is_calc_kl_div=True) ################################################################ .... - Average losses = {'lm_loss': 1.7577573340386152, 'token_accuracy_top_1': 0.6225490570068359, 'token_accuracy_top_5': 0.846257209777832, 'token_accuracy_top_10': 0.8987817764282227} - + Average losses = {'lm_loss': 1.7577573340386152, 'token_accuracy_top_1': 0.6225490570068359, 'token_accuracy_top_5': 0.846257209777832, 'token_accuracy_top_10': 0.8987817764282227} ``` 30% GPU memory reduction leads to nearly 5% regression in token_accuracy_top_10 metric (0.898 / 0.942). Let's rerun MIP search aiming for 15% memory reduction. @@ -140,12 +139,12 @@ block_29: attention gqa_4 ffn intermediate_14336 block_30: attention gqa_4 ffn intermediate_14336 block_31: attention gqa_4 ffn intermediate_14336 -[2025-11-02 11:01:56,443]^[[92m[rank-0]^[[0m[run_puzzle.py:295] Total costs: {'stats.memory_mib': 94708.4609375, 'stats.attention_memory_mib': 81952.203125, 'stats.ffn_memory_mib': 10752.25, 'stats.has_ffn': 32, 'stats.ffn_num_params': 5637275648, 'stats.attention_num_params': 1090625536, 'stats.has_attention': 26, 'stats.kv_cache_memory_mib': 79872.0, 'stats.num_kv_heads': 208, 'stats.num_params': 7778578432} +[2025-11-02 12:50:42,024]^[[92m[rank-0]^[[0m[run_puzzle.py:295] Total costs: {'stats.memory_mib': 94708.4609375, 'stats.has_ffn': 32, 'stats.ffn_memory_mib': 10752.25, 'stats.kv_cache_memory_mib': 79872.0, 'stats.attention_num_params': 1090625536, 'stats.ffn_num_params': 5637275648, 'stats.has_attention': 26, 'stats.num_params': 7778578432, 'stats.attention_memory_mib': 81952.203125, 'stats.num_kv_heads': 208} ... 
################################################################ validate_model_with_kl_div(model_name='solution_0', is_calc_kl_div=True) ################################################################ -Average losses = {'lm_loss': 1.2425934937782586, 'token_accuracy_top_1': 0.703862190246582, 'token_accuracy_top_5': 0.8954982757568359, 'token_accuracy_top_10': 0.9336576461791992, +Average losses = {'lm_loss': 1.2425934937782586, 'token_accuracy_top_1': 0.703862190246582, 'token_accuracy_top_5': 0.8954982757568359, 'token_accuracy_top_10': 0.9336576461791992 ``` On the other hand, if you set `target_memory: 28_000`, you would observe that for some layers the intermediate FFN size starts to reduce (see `log.txt`): From 8cb50d45fbe669d82e5eecbfd12c7468c97b3e86 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sun, 2 Nov 2025 22:27:08 +0100 Subject: [PATCH 76/81] Update tutorial Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/compress/README.md b/examples/compress/README.md index 5efd879a0..4d2f99ac9 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -2,7 +2,7 @@ This tutorial demonstrates how to compress large language models using the compress algorithm based on the [Puzzle paper](https://arxiv.org/abs/2411.19146). -In this example, we compress the [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model by searching for the optimal `ffn_intermediate_size` across MLP layers and `attention op/noop`. This results in a heterogeneous architecture while reducing GPU memory usage from 113GB to 96GB (15% reduction) with less than 1% regression in the token_accuracy_top_10 metric. +In this example, we compress the [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model by searching for the optimal `ffn_intermediate_size` across MLP layers and `attention op/noop`. This results in a heterogeneous architecture while reducing GPU memory usage from 113 GiB to 96 GiB (15% reduction) with less than 1% regression in the token_accuracy_top_10 metric. ## Compress the Model From 2856ca1a108c65bf6f2b7fc8edc2af22acbc6b47 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sun, 2 Nov 2025 22:45:05 +0100 Subject: [PATCH 77/81] Update tutorial. Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/compress/README.md b/examples/compress/README.md index 4d2f99ac9..778bf2688 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -27,7 +27,7 @@ In this example, we compress the [meta-llama/Llama-3.1-8B-Instruct](https://hugg This will save the full output to `log.txt` and display the following progress on screen: ```bash - # Produced on 2x NVIDIA H100 80GB HBM3 + # Produced on 8x NVIDIA H100 80GB HBM3 [2025-11-02 12:06:34] Compress Progress 1/8: starting compression pipeline [2025-11-02 12:06:45] Compress Progress 2/8: converting model from HF to DeciLM (single-gpu) [2025-11-02 12:07:07] Compress Progress 3/8: scoring pruning activations (multi-gpu) From 553107af81e0421dead7a8e341f3b820e6d9d834 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Sun, 2 Nov 2025 23:22:31 +0100 Subject: [PATCH 78/81] Update compress tutorial. 
Signed-off-by: Daniel Korzekwa --- examples/compress/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/compress/README.md b/examples/compress/README.md index 778bf2688..4d2f99ac9 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -27,7 +27,7 @@ In this example, we compress the [meta-llama/Llama-3.1-8B-Instruct](https://hugg This will save the full output to `log.txt` and display the following progress on screen: ```bash - # Produced on 8x NVIDIA H100 80GB HBM3 + # Produced on 2x NVIDIA H100 80GB HBM3 [2025-11-02 12:06:34] Compress Progress 1/8: starting compression pipeline [2025-11-02 12:06:45] Compress Progress 2/8: converting model from HF to DeciLM (single-gpu) [2025-11-02 12:07:07] Compress Progress 3/8: scoring pruning activations (multi-gpu) From 3917a789c10c965e4404896ccb5a516a5a8d9dc9 Mon Sep 17 00:00:00 2001 From: Daniel Korzekwa Date: Mon, 3 Nov 2025 08:14:14 +0100 Subject: [PATCH 79/81] Add Dockerfile for the compress tutorial Signed-off-by: Daniel Korzekwa --- examples/compress/Dockerfile | 26 ++++++++++++++++++++++++++ examples/compress/README.md | 6 +++++- 2 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 examples/compress/Dockerfile diff --git a/examples/compress/Dockerfile b/examples/compress/Dockerfile new file mode 100644 index 000000000..5a65839de --- /dev/null +++ b/examples/compress/Dockerfile @@ -0,0 +1,26 @@ +# Docker file for compress example + +FROM nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc5 + +# TODO: The MIP solver would not work with this torch version. +# Fix it, otherwise, mamba models will not be supported by the Compress algorithm. +# # Required for mamba_ssm to work (the default torch version in the 1.1.0rc5 does not work) +# RUN pip uninstall -y torch +# RUN pip uninstall -y torchvision +# RUN pip install torch==2.7.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128 + +# # Mamba SSM +# RUN pip install causal-conv1d --no-build-isolation +# RUN pip install mamba_ssm --no-build-isolation + +# Required for puzzletron calc_subblock_stats +RUN pip install hydra-core==1.3.2 +RUN pip install wandb~=0.17.5 +RUN pip install "frozendict>=2.4.4" +RUN pip install fire +RUN pip install mip +RUN pip install lru-dict + +WORKDIR /workspace/ + +COPY . . diff --git a/examples/compress/README.md b/examples/compress/README.md index 4d2f99ac9..f3ff36232 100644 --- a/examples/compress/README.md +++ b/examples/compress/README.md @@ -4,6 +4,11 @@ This tutorial demonstrates how to compress large language models using the compr In this example, we compress the [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model by searching for the optimal `ffn_intermediate_size` across MLP layers and `attention op/noop`. This results in a heterogeneous architecture while reducing GPU memory usage from 113 GiB to 96 GiB (15% reduction) with less than 1% regression in the token_accuracy_top_10 metric. +## Environment + +- [Dockerfile](./Dockerfile) to use. +- 2x NVIDIA H100 80GB HBM3 (1 card will be good as well). + ## Compress the Model 1. Specify the `puzzle_dir`, `input_hf_model_path`, `dataset_path`, `intermediate_size_list`, and `target_memory` arguments in the [llama-3_1-8B_pruneffn_memory.yaml](./configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml) configuration file. 
@@ -27,7 +32,6 @@ In this example, we compress the [meta-llama/Llama-3.1-8B-Instruct](https://hugg
    This will save the full output to `log.txt` and display the following progress on screen:
 
    ```bash
-   # Produced on 2x NVIDIA H100 80GB HBM3
    [2025-11-02 12:06:34] Compress Progress 1/8: starting compression pipeline
    [2025-11-02 12:06:45] Compress Progress 2/8: converting model from HF to DeciLM (single-gpu)
    [2025-11-02 12:07:07] Compress Progress 3/8: scoring pruning activations (multi-gpu)

From 6e1d910453a73c39e752555589ea46d049c847a0 Mon Sep 17 00:00:00 2001
From: Daniel Korzekwa
Date: Mon, 3 Nov 2025 08:36:15 +0100
Subject: [PATCH 80/81] Update compress tutorial

Signed-off-by: Daniel Korzekwa
---
 examples/compress/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/compress/README.md b/examples/compress/README.md
index f3ff36232..55dab2cda 100644
--- a/examples/compress/README.md
+++ b/examples/compress/README.md
@@ -26,7 +26,7 @@ In this example, we compress the [meta-llama/Llama-3.1-8B-Instruct](https://hugg
 3. Run the compression script.
 
    ```bash
-   torchrun --nproc_per_node 1 examples/compress/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml 2>&1 | tee ./log.txt | grep "Compress Progress"
+   torchrun --nproc_per_node 2 examples/compress/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml 2>&1 | tee ./log.txt | grep "Compress Progress"
    ```
 
    This will save the full output to `log.txt` and display the following progress on screen:
@@ -104,7 +104,7 @@ Set `target_memory: 96_000` in `llama-3_1-8B_pruneffn_memory.yaml`.
 
 ```bash
-torchrun --nproc_per_node 1 examples/compress/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml --mip-only 2>&1 | tee ./log.txt | grep "Compress Progress"
+torchrun --nproc_per_node 2 examples/compress/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml --mip-only 2>&1 | tee ./log.txt | grep "Compress Progress"
 ```
 
 This will generate the following network architecture (see `log.txt`):

From bb91d73a8eaae4c05d71679adbedffe84e82d15c Mon Sep 17 00:00:00 2001
From: Liana Mikaelyan <45925959+LianaMikael@users.noreply.github.com>
Date: Tue, 4 Nov 2025 10:03:27 +0000
Subject: [PATCH 81/81] Update Puzzle Compression Tutorial (#493)

## What does this PR do?

**Type of change:** Documentation

**Overview:** Updated the tutorial with more details on how to choose the required config parameters and added MMLU evaluation.

---------

Signed-off-by: Liana Mikaelyan
---
 examples/compress/README.md | 39 +++++++++++++++++++++++++++++--------
 examples/pruning/README.md | 2 ++
 2 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/examples/compress/README.md b/examples/compress/README.md
index 55dab2cda..0b165f46b 100644
--- a/examples/compress/README.md
+++ b/examples/compress/README.md
@@ -1,8 +1,15 @@
 # Compress Algorithm Tutorial
 
 This tutorial demonstrates how to compress large language models using the compress algorithm based on the [Puzzle paper](https://arxiv.org/abs/2411.19146).
+The goal of the algorithm is to find the optimal modifications to the MLP and attention layers of the model, resulting in a heterogeneous model architecture.
+The supported modifications are:
 
-In this example, we compress the [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model by searching for the optimal `ffn_intermediate_size` across MLP layers and `attention op/noop`. 
This results in a heterogeneous architecture while reducing GPU memory usage from 113 GiB to 96 GiB (15% reduction) with less than 1% regression in the token_accuracy_top_10 metric.
+
+- `ffn_intermediate_size`: different FFN intermediate sizes
+- `attention op/noop`: complete removal of attention layers
+
+To use the Puzzle algorithm effectively, we need to specify the target number of parameters and/or the target memory. The final stage is based on a Mixed-Integer Programming (MIP) algorithm that finds the optimal combination of layer modifications satisfying the target requirements.
+
+In this example, we compress the [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model, reducing GPU memory usage from 113 GiB to 96 GiB (15% reduction) with less than 1% regression in the token_accuracy_top_10 metric.
 
 ## Environment
 
@@ -13,7 +20,11 @@ In this example, we compress the [meta-llama/Llama-3.1-8B-Instruct](https://hugg
 
 1. Specify the `puzzle_dir`, `input_hf_model_path`, `dataset_path`, `intermediate_size_list`, and `target_memory` arguments in the [llama-3_1-8B_pruneffn_memory.yaml](./configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml) configuration file.
 
-   Let's first shoot for 32% GPU memory reduction setting `target_memory = 78_000` GiB.
+   **_NOTE:_**
+   How to choose `intermediate_size_list`?
+   The list specifies the candidate FFN sizes that we wish to search over. It is recommended to choose several pruning sizes (e.g., 15%, 20%, or 30% of the original size). Note that the values must be hardware-friendly (divisible by a multiple of 2) to avoid issues with tensor operations in subsequent steps.
+
+   Let's first shoot for a 32% GPU memory reduction by setting `target_memory = 78_000` MiB (i.e., 78 GiB). This means that the algorithm will choose the candidates with the highest accuracy that also meet the specified requirements.
 
 2. Download and prepare the [Nemotron-Post-Training-Dataset-v2](https://huggingface.co/datasets/nvidia/Nemotron-Post-Training-Dataset-v2).
 
@@ -23,7 +34,7 @@ In this example, we compress the [meta-llama/Llama-3.1-8B-Instruct](https://hugg
    python -m modelopt.torch._compress.dataset.prepare_dataset --dataset_name nvidia/Nemotron-Post-Training-Dataset-v2 --output_dir path/to/Nemotron-Post-Training-Dataset-v2
    ```
 
-3. Run the compression script.
+3. Run the compression script. 
 
    ```bash
    torchrun --nproc_per_node 2 examples/compress/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml 2>&1 | tee ./log.txt | grep "Compress Progress"
@@ -42,7 +53,7 @@ In this example, we compress the [meta-llama/Llama-3.1-8B-Instruct](https://hugg
    [2025-11-02 12:52:34] Compress Progress 8/8: compression pipeline completed (multi-gpu)
    ```
 
-   This will generate the following network architecture (see `log.txt`):
+   Once the process is complete, the resulting network architecture will be recorded in `log.txt` for your review:
 
    ```bash
    ...
@@ -96,12 +107,12 @@ In this example, we compress the [meta-llama/Llama-3.1-8B-Instruct](https://hugg
 
    30% GPU memory reduction leads to nearly 5% regression in token_accuracy_top_10 metric (0.898 / 0.942). Let's rerun MIP search aiming for 15% memory reduction.
 
-## Re-run MIP Search with different memory constraints
+## Re-run MIP Search with different constraints
 
-If you want to try different memory constraints without re-running the expensive pruning and scoring steps, use the `--mip-only` flag.
+If you want to try different constraints without re-running the expensive pruning and scoring steps, use the `--mip-only` flag. 
This assumes pruning, replacement library building, NAS scoring, and subblock stats calculation have already been completed. -Set `target_memory: 96_000` in `llama-3_1-8B_pruneffn_memory.yaml`. +For example, let's set `target_memory: 96_000` in `llama-3_1-8B_pruneffn_memory.yaml`. ```bash torchrun --nproc_per_node 2 examples/compress/main.py --config path/to/llama-3_1-8B_pruneffn_memory.yaml --mip-only 2>&1 | tee ./log.txt | grep "Compress Progress" @@ -151,7 +162,7 @@ validate_model_with_kl_div(model_name='solution_0', is_calc_kl_div=True) Average losses = {'lm_loss': 1.2425934937782586, 'token_accuracy_top_1': 0.703862190246582, 'token_accuracy_top_5': 0.8954982757568359, 'token_accuracy_top_10': 0.9336576461791992 ``` -On the other hand, if you set `target_memory: 28_000`, you would observe that for some layers the intermediate FFN size starts to reduce (see `log.txt`): +On the other hand, if you set `target_memory: 28_000`, you'll observe that the intermediate FFN sizes are significantly reduced in certain layers (see `log.txt` for details): ```bash block_5: attention no_op ffn intermediate_11520 @@ -166,6 +177,18 @@ block_13: attention no_op ffn intermediate_11520 block_14: attention no_op ffn intermediate_3072 ``` +## Evaluation + +Once the model is ready, you can evaluate it using [Language Model Evaluation Harness](https://pypi.org/project/lm-eval/). For example, run the following to evaluate the model on a subset of [MMLU](https://huggingface.co/datasets/cais/mmlu). + +```bash +lm_eval --model hf \ + --model_args pretrained=path/to/model,dtype=bfloat16,trust_remote_code=true,parallelize=True \ + --tasks mmlu_humanities \ + --num_fewshot 5 \ + --batch_size 4 +``` + ## Advanced usage Modify `path/to/Llama-3_1-8B yaml` file for advanced compression scenarios. diff --git a/examples/pruning/README.md b/examples/pruning/README.md index 3efa9eb79..54f7322b1 100644 --- a/examples/pruning/README.md +++ b/examples/pruning/README.md @@ -23,6 +23,8 @@ This section focuses on applying Model Optimizer's state-of-the-art complementar +For more advanced pruning strategies, such as the [Puzzle methodology](https://arxiv.org/pdf/2411.19146), please see [Puzzle pruning example](https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/feature/compress/examples/compress). + ## Pre-Requisites For Minitron pruning for Megatron-LM / NeMo models, use the NeMo container (e.g., `nvcr.io/nvidia/nemo:25.07`) which has all the dependencies installed.
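
As a quick sanity check before running the MMLU evaluation above, you can load the realized checkpoint directly with Hugging Face `transformers` and generate a few tokens. The snippet below is an illustrative sketch only: `path/to/model` is the same placeholder used in the `lm_eval` command, and it assumes the realized solution is saved as an HF-style checkpoint that needs `trust_remote_code` (as the `lm_eval` arguments suggest).

```python
# Illustrative sanity check for a realized checkpoint; the path is a placeholder.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "path/to/model"  # placeholder: directory of the realized solution

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,  # same dtype as in the lm_eval command
    trust_remote_code=True,      # mirrors trust_remote_code=true used by lm_eval
    device_map="auto",
)

prompt = "The capital of France is"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=16)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```

If the checkpoint loads and produces coherent text, the `lm_eval` command above can then be used to compare the compressed model against the original model under the same settings.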