Skip to content

Commit 3c92d75

Browse files
committed
model_init: Add --verbose argument and hide TP split by default
1 parent 448a738 commit 3c92d75

File tree

4 files changed

+18
-5
lines changed

4 files changed

+18
-5
lines changed

exllamav3/model/model.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,8 @@ def load_gen(
120120
callback: Callable[[int, int], None] | None = None,
121121
generator: bool = True,
122122
tp_dev_limits: dict | None = None,
123-
tp_backend: str = "native"
123+
tp_backend: str = "native",
124+
verbose: bool = False
124125
):
125126
"""
126127
Load model, generator function. For regular function, call load() with the same arguments
@@ -197,6 +198,9 @@ def load_gen(
197198
198199
:param tp_backend:
199200
str, either "nccl" or "native" (default)
201+
202+
:param verbose:
203+
bool, more info while loading including full TP split
200204
"""
201205

202206
free_mem()
@@ -214,7 +218,7 @@ def load_gen(
214218
"Cannot specify reserve_per_device or use_per_device when loading to single device."
215219
assert not tensor_p, \
216220
"Cannot use tensor_p when loading to single device."
217-
self._load_single(progressbar, device, self.config, self.modules)
221+
self._load_single(progressbar, device, self.config, self.modules, verbose)
218222

219223
# Use/reserve
220224
else:
@@ -264,6 +268,7 @@ def load_gen(
264268
generator,
265269
self.config,
266270
self.modules,
271+
verbose,
267272
)
268273
self.output_device = self.modules[-1].device
269274

@@ -290,6 +295,7 @@ def load_gen(
290295
self.modules,
291296
tp_dev_limits,
292297
tp_backend,
298+
verbose,
293299
)
294300
self.output_device = tp_output_device
295301

exllamav3/model/model_ls.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ def _load_single(
2424
device: torch.device,
2525
config: Config,
2626
modules: list,
27+
verbose: bool
2728
):
2829
with ProgressBar(f"Loading" if progressbar else None, len(modules)) as progress:
2930
for idx, module in enumerate(modules):
@@ -57,6 +58,7 @@ def _load_autosplit(
5758
generator: bool,
5859
config: Config,
5960
modules: list,
61+
verbose: bool
6062
):
6163
current_device_i = 0
6264
backup_shape, backup_dtype = self.default_load_shape_dtype(max_chunk_size)
@@ -65,7 +67,7 @@ def _load_autosplit(
6567
touched_devices = []
6668
params = self.default_load_params()
6769

68-
with ProgressBar(f"Loading" if progressbar else None, len(modules)) as progress:
70+
with ProgressBar(f"Loading (LS)" if progressbar else None, len(modules)) as progress:
6971

7072
for idx, module in enumerate(modules):
7173

exllamav3/model/model_tp.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,7 @@ def _load_tp(
232232
modules: list,
233233
dev_limits: dict | None,
234234
tp_backend: str,
235+
verbose: bool
235236
):
236237
assert use_per_device is None or reserve_per_device is None
237238
if dev_limits is None: dev_limits = {}
@@ -272,7 +273,8 @@ def _load_tp(
272273
dev_limits = dev_limits,
273274
)
274275
allocator.initial_split(max_mem)
275-
allocator.print_split()
276+
if verbose:
277+
allocator.print_split()
276278
plan = allocator.compile_tp_plan()
277279
self.tp_worker_dispatch_wait_multi(self.active_devices, mp_set_plan, (plan, self.active_devices))
278280

@@ -286,7 +288,7 @@ def _load_tp(
286288
)
287289

288290
# Begin loading modules
289-
with (ProgressBar(f"Loading" if progressbar else None, len(modules)) as progress):
291+
with (ProgressBar(f"Loading (TP)" if progressbar else None, len(modules)) as progress):
290292
for idx, module in enumerate(modules):
291293
last_module = module
292294

exllamav3/model_init.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ def add_args(
3434
parser.add_argument("-tp_moe", "--tp_max_parallelism_moe", type = int, help = "(TP) Maximum parallelism for MoE layers", default = None)
3535
parser.add_argument("-tp_linear", "--tp_max_parallelism_linear", type = int, help = "(TP) Maximum parallelism for linear (output) layers", default = None)
3636

37+
parser.add_argument("-v", "--verbose", action = "store_true", help = "Verbose output while loading")
38+
3739
if cache:
3840
parser.add_argument("-cs", "--cache_size", type = int, help = f"Total cache size in tokens, default: {default_cache_size}", default = default_cache_size)
3941
parser.add_argument("-cq", "--cache_quant", type = str, help = "Use quantized cache. Specify either kv_bits or k_bits,v_bits pair")
@@ -158,6 +160,7 @@ def printp(p: bool, s: str):
158160
progressbar = progress,
159161
tp_dev_limits = tp_dev_limits,
160162
tp_backend = args.tp_backend,
163+
verbose = args.verbose,
161164
**kwargs
162165
)
163166

0 commit comments

Comments
 (0)