
Commit 041fffb

fix: move parameter validation before fit_memory_scaling_model (IBM#101)
The launch of `fit_memory_scaling_model` uses the values of `quantize` and `dtype_str`, so those should be validated and defaulted before it is run. Before this change, if `dtype_str` was `None` it would be passed to `fit_memory_scaling_model` as `None`, resulting in an error:

```
Shard 1: Process SpawnProcess-33:
Shard 1: Traceback (most recent call last):
Shard 1:   File "/opt/tgis/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
Shard 1:     self.run()
Shard 1:   File "/opt/tgis/lib/python3.11/multiprocessing/process.py", line 108, in run
Shard 1:     self._target(*self._args, **self._kwargs)
Shard 1:   File "/opt/tgis/lib/python3.11/site-packages/text_generation_server/utils/paged.py", line 37, in fit_memory_scaling_model
Shard 1:     model = get_model(
Shard 1:             ^^^^^^^^^^
Shard 1:   File "/opt/tgis/lib/python3.11/site-packages/text_generation_server/models/__init__.py", line 39, in get_model
Shard 1:     dtype = get_torch_dtype(dtype_str)
Shard 1:             ^^^^^^^^^^^^^^^^^^^^^^^^^^
Shard 1:   File "/opt/tgis/lib/python3.11/site-packages/text_generation_server/utils/dist.py", line 64, in get_torch_dtype
Shard 1:     dt = getattr(torch, dtype_str, None)
Shard 1:          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Shard 1: TypeError: attribute name must be string, not 'NoneType'
```

After this change, a value will always be set before calling `fit_memory_scaling_model`.

Signed-off-by: Travis Johnson <[email protected]>
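For context, a minimal sketch of the failure path and of the fix. Only the `getattr(torch, dtype_str, None)` line is taken from the traceback above; the stand-in helper, its error handling, and the example values are illustrative, not the actual TGIS code:

```python
import torch

def get_torch_dtype(dtype_str):
    # Simplified stand-in for text_generation_server.utils.dist.get_torch_dtype
    # (only the getattr line comes from the traceback). getattr() requires a
    # string attribute name, so dtype_str=None raises
    # "TypeError: attribute name must be string, not 'NoneType'".
    return getattr(torch, dtype_str, None)

dtype_str = None  # previously passed through to fit_memory_scaling_model as-is

# After the fix, the default is applied before the memory-scaling launch, so
# the helper never sees None:
if dtype_str is None:
    dtype_str = "float16" if torch.cuda.is_available() else "float32"

print(get_torch_dtype(dtype_str))  # torch.float16 or torch.float32
```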
1 parent 9b4aea8 commit 041fffb

File tree

1 file changed: +16 −16 lines changed

server/text_generation_server/server.py

Lines changed: 16 additions & 16 deletions
```diff
@@ -273,6 +273,22 @@ async def serve_inner(
     batch_safety_margin: int,
     sharded: bool = False,
 ):
+    if quantize not in [None, "gptq", "bitsandbytes"]:
+        raise ValueError(f"Unrecognized quantization method specified: {quantize}")
+
+    if quantize is None and dtype_str == "int8":
+        print_rank_n("Inferring quantize = bitsandbytes because dtype == int8")
+        quantize = "bitsandbytes"
+
+    cuda_available = torch.cuda.is_available()
+
+    # Default dtype based on device if not provided
+    if dtype_str is None:
+        dtype_str = "float16" if cuda_available else "float32"
+
+    if quantize is not None and not cuda_available:
+        raise ValueError("Quantization requires CUDA")
+
     if ESTIMATE_MEMORY == "auto" and PAGED_ATTENTION:
         # fit memory model using flash model in separate process (ensures GPU memory is entirely cleaned up)
         from text_generation_server.utils.paged import fit_memory_scaling_model
@@ -296,22 +312,6 @@ async def serve_inner(
     ]
     local_url = server_urls[local_rank]
 
-    if quantize not in [None, "gptq", "bitsandbytes"]:
-        raise ValueError(f"Unrecognized quantization method specified: {quantize}")
-
-    # Default dtype based on device if not provided
-    if dtype_str is None:
-        dtype_str = "float16" if torch.cuda.is_available() else "float32"
-
-    if quantize is None and dtype_str == "int8":
-        print_rank_n("Inferring quantize = bitsandbytes because dtype == int8")
-        quantize = "bitsandbytes"
-
-    cuda_available = torch.cuda.is_available()
-
-    if quantize is not None and not cuda_available:
-        raise ValueError("Quantization requires CUDA")
-
     # Set the fraction of cuda/gpu mem available to this process, then load the model
     if cuda_available and cuda_process_memory_fraction < 1:
         torch.cuda.set_per_process_memory_fraction(cuda_process_memory_fraction)
```
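To make the resulting ordering easier to see at a glance, the hunks above amount to the following condensed sketch. The helper name `validate_and_default` is hypothetical (the actual change inlines these statements at the top of `serve_inner`), and the real function takes many more parameters and launches `fit_memory_scaling_model` in a separate process:

```python
def validate_and_default(quantize, dtype_str, cuda_available):
    # Hypothetical helper mirroring the checks the diff moves ahead of the
    # memory-scaling launch; the statements match the added diff lines.
    if quantize not in [None, "gptq", "bitsandbytes"]:
        raise ValueError(f"Unrecognized quantization method specified: {quantize}")
    if quantize is None and dtype_str == "int8":
        quantize = "bitsandbytes"
    # Default dtype based on device if not provided
    if dtype_str is None:
        dtype_str = "float16" if cuda_available else "float32"
    if quantize is not None and not cuda_available:
        raise ValueError("Quantization requires CUDA")
    return quantize, dtype_str

# Validation and defaulting now happen before fit_memory_scaling_model is
# launched, so the child process always receives concrete values:
quantize, dtype_str = validate_and_default(None, None, cuda_available=False)
assert quantize is None and dtype_str == "float32"
```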
