 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""Utility functions for running example commands reused in multiple example tests."""
 
 import os
 import subprocess
-import time
 from pathlib import Path
 
-from _test_utils.torch_dist.dist_utils import get_free_port
+from _test_utils.torch.distributed.utils import get_free_port
 
-MODELOPT_ROOT = Path(__file__).parent.parent.parent.parent
+MODELOPT_ROOT = Path(__file__).parents[3]
 
 
-def _extend_cmd_parts(cmd_parts: list[str], **kwargs):
+def extend_cmd_parts(cmd_parts: list[str], **kwargs):
     for key, value in kwargs.items():
         if value is not None:
             cmd_parts.extend([f"--{key}", str(value)])
@@ -32,9 +32,14 @@ def _extend_cmd_parts(cmd_parts: list[str], **kwargs):
     return cmd_parts
 
 
-def run_example_command(cmd_parts: list[str], example_path: str, setup_free_port: bool = False):
+def run_example_command(
+    cmd_parts: list[str],
+    example_path: str,
+    setup_free_port: bool = False,
+    env: dict[str, str] | None = None,
+):
     print(f"[{example_path}] Running command: {cmd_parts}")
-    env = os.environ.copy()
+    env = env or os.environ.copy()
 
     if setup_free_port:
         free_port = get_free_port()
@@ -43,7 +48,9 @@ def run_example_command(cmd_parts: list[str], example_path: str, setup_free_port
     subprocess.run(cmd_parts, cwd=MODELOPT_ROOT / "examples" / example_path, env=env, check=True)
 
 
-def run_command_in_background(cmd_parts, example_path, stdout=None, stderr=None, text=True):
+def run_command_in_background(
+    cmd_parts: list[str], example_path: str, stdout=None, stderr=None, text=True
+):
     print(f"Running command in background: {' '.join(str(part) for part in cmd_parts)}")
     process = subprocess.Popen(
         cmd_parts,
@@ -55,57 +62,7 @@ def run_command_in_background(cmd_parts, example_path, stdout=None, stderr=None,
     return process
 
 
-def run_llm_autodeploy_command(
-    model: str, quant: str, effective_bits: float, output_dir: str, **kwargs
-):
-    # Create temporary directory for saving the quantized checkpoint
-    port = get_free_port()
-    quantized_ckpt_dir = os.path.join(output_dir, "quantized_model")
-    kwargs.update(
-        {
-            "hf_ckpt": model,
-            "quant": quant,
-            "effective_bits": effective_bits,
-            "save_quantized_ckpt": quantized_ckpt_dir,
-            "port": port,
-        }
-    )
-
-    server_handler = None
-    try:
-        # Quantize and deploy the model to the background
-        cmd_parts = _extend_cmd_parts(["scripts/run_auto_quant_and_deploy.sh"], **kwargs)
-        # Pass None to stdout and stderr to see the output in the console
-        server_handler = run_command_in_background(
-            cmd_parts, "llm_autodeploy", stdout=None, stderr=None
-        )
-
-        # Wait for the server to start. We might need to build
-        time.sleep(100)
-
-        # Test the deployment
-        run_example_command(
-            ["python", "api_client.py", "--prompt", "What is AI?", "--port", str(port)],
-            "llm_autodeploy",
-        )
-    finally:
-        if server_handler:
-            server_handler.terminate()
-
-
-def run_torch_onnx_command(*, quantize_mode: str, onnx_save_path: str, calib_size: str, **kwargs):
-    kwargs.update(
-        {
-            "quantize_mode": quantize_mode,
-            "onnx_save_path": onnx_save_path,
-            "calibration_data_size": calib_size,
-        }
-    )
-    cmd_parts = _extend_cmd_parts(["python", "torch_quant_to_onnx.py"], **kwargs)
-    run_example_command(cmd_parts, "onnx_ptq")
-
-
-def run_llm_export_command(
+def run_onnx_llm_export_command(
     *, torch_dir: str, dtype: str, lm_head: str, output_dir: str, calib_size: str, **kwargs
 ):
     kwargs.update(
@@ -117,7 +74,7 @@ def run_llm_export_command(
             "calib_size": calib_size,
         }
     )
-    cmd_parts = _extend_cmd_parts(["python", "llm_export.py"], **kwargs)
+    cmd_parts = extend_cmd_parts(["python", "llm_export.py"], **kwargs)
     run_example_command(cmd_parts, "onnx_ptq")
 
 
@@ -126,7 +83,7 @@ def run_llm_ptq_command(*, model: str, quant: str, **kwargs):
     kwargs.setdefault("tasks", "quant")
     kwargs.setdefault("calib", 16)
 
-    cmd_parts = _extend_cmd_parts(["scripts/huggingface_example.sh", "--no-verbose"], **kwargs)
+    cmd_parts = extend_cmd_parts(["scripts/huggingface_example.sh", "--no-verbose"], **kwargs)
     run_example_command(cmd_parts, "llm_ptq")
 
 
@@ -135,44 +92,5 @@ def run_vlm_ptq_command(*, model: str, quant: str, **kwargs):
     kwargs.setdefault("tasks", "quant")
     kwargs.setdefault("calib", 16)
 
-    cmd_parts = _extend_cmd_parts(["scripts/huggingface_example.sh"], **kwargs)
+    cmd_parts = extend_cmd_parts(["scripts/huggingface_example.sh"], **kwargs)
     run_example_command(cmd_parts, "vlm_ptq")
-
-
-def run_diffusers_cmd(cmd_parts: list[str]):
-    run_example_command(cmd_parts, "diffusers/quantization")
-
-
-def run_llm_sparsity_command(
-    *, model: str, output_dir: str, sparsity_fmt: str = "sparsegpt", **kwargs
-):
-    kwargs.update(
-        {"model_name_or_path": model, "sparsity_fmt": sparsity_fmt, "output_dir": output_dir}
-    )
-    kwargs.setdefault("calib_size", 16)
-    kwargs.setdefault("device", "cuda")
-    kwargs.setdefault("dtype", "fp16")
-    kwargs.setdefault("model_max_length", 1024)
-
-    cmd_parts = _extend_cmd_parts(["python", "hf_pts.py"], **kwargs)
-    run_example_command(cmd_parts, "llm_sparsity")
-
-
-def run_llm_sparsity_ft_command(
-    *, model: str, restore_path: str, output_dir: str, data_path: str, **kwargs
-):
-    kwargs.update(
-        {
-            "model": model,
-            "restore_path": restore_path,
-            "output_dir": output_dir,
-            "data_path": data_path,
-        }
-    )
-    kwargs.setdefault("num_epochs", 0.01)
-    kwargs.setdefault("max_length", 128)
-    kwargs.setdefault("train_bs", 1)
-    kwargs.setdefault("eval_bs", 1)
-
-    cmd_parts = _extend_cmd_parts(["bash", "launch_finetune.sh"], **kwargs)
-    run_example_command(cmd_parts, "llm_sparsity")
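
For context, a minimal usage sketch of the retained helpers after this change, assuming a pytest-style example test; the import path, model name, and quant format below are illustrative assumptions, not part of the commit.

# Hypothetical usage sketch -- not taken from this diff. The import path,
# model name, and quant format are assumptions for illustration only.
from _test_utils.examples.run_command import run_llm_ptq_command


def test_llm_ptq_fp8_smoke():
    # run_llm_ptq_command defaults tasks to "quant" and calib to 16, builds the
    # "--key value" flags via extend_cmd_parts, and runs
    # scripts/huggingface_example.sh inside examples/llm_ptq.
    run_llm_ptq_command(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", quant="fp8")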