@@ -33,6 +33,7 @@ def __init__(
3333 max_new_tokens : int = 50 ,
3434 devices : int = 1 ,
3535 api_path : Optional [str ] = None ,
36+ generate_strategy : Optional [Literal ["sequential" , "tensor_parallel" ]] = None ,
3637 ) -> None :
3738 if not _LITSERVE_AVAILABLE :
3839 raise ImportError (str (_LITSERVE_AVAILABLE ))
@@ -47,6 +48,7 @@ def __init__(
4748 self .max_new_tokens = max_new_tokens
4849 self .top_p = top_p
4950 self .devices = devices
51+ self .generate_strategy = generate_strategy
5052
5153 def setup (self , device : str ) -> None :
5254 if ":" in device :
@@ -64,7 +66,8 @@ def setup(self, device: str) -> None:
6466 accelerator = accelerator ,
6567 quantize = self .quantize ,
6668 precision = self .precision ,
67- generate_strategy = ("sequential" if self .devices is not None and self .devices > 1 else None ),
69+ generate_strategy = self .generate_strategy
70+ or ("sequential" if self .devices is not None and self .devices > 1 else None ),
6871 )
6972 print ("Model successfully initialized." , file = sys .stderr )
7073
@@ -85,6 +88,7 @@ def __init__(
8588 max_new_tokens : int = 50 ,
8689 devices : int = 1 ,
8790 api_path : Optional [str ] = None ,
91+ generate_strategy : Optional [str ] = None ,
8892 ):
8993 super ().__init__ (
9094 checkpoint_dir ,
@@ -96,6 +100,7 @@ def __init__(
96100 max_new_tokens ,
97101 devices ,
98102 api_path = api_path ,
103+ generate_strategy = generate_strategy ,
99104 )
100105
101106 def setup (self , device : str ):
@@ -128,6 +133,7 @@ def __init__(
128133 max_new_tokens : int = 50 ,
129134 devices : int = 1 ,
130135 api_path : Optional [str ] = None ,
136+ generate_strategy : Optional [str ] = None ,
131137 ):
132138 super ().__init__ (
133139 checkpoint_dir ,
@@ -139,6 +145,7 @@ def __init__(
139145 max_new_tokens ,
140146 devices ,
141147 api_path = api_path ,
148+ generate_strategy = generate_strategy ,
142149 )
143150
144151 def setup (self , device : str ):
@@ -171,6 +178,7 @@ def __init__(
171178 max_new_tokens : int = 50 ,
172179 devices : int = 1 ,
173180 api_path : Optional [str ] = None ,
181+ generate_strategy : Optional [str ] = None ,
174182 ):
175183 super ().__init__ (
176184 checkpoint_dir ,
@@ -182,6 +190,7 @@ def __init__(
182190 max_new_tokens ,
183191 devices ,
184192 api_path = api_path ,
193+ generate_strategy = generate_strategy ,
185194 )
186195
187196 def setup (self , device : str ):
@@ -241,6 +250,7 @@ def run_server(
241250 access_token : Optional [str ] = None ,
242251 api_path : Optional [str ] = "/predict" ,
243252 timeout : int = 30 ,
253+ generate_strategy : Optional [Literal ["sequential" , "tensor_parallel" ]] = None ,
244254) -> None :
245255 """Serve a LitGPT model using LitServe.
246256
@@ -284,6 +294,10 @@ def run_server(
284294 access_token: Optional API token to access models with restrictions.
285295 api_path: The custom API path for the endpoint (e.g., "/my_api/classify").
286296 timeout: Request timeout in seconds. Defaults to 30.
297+ generate_strategy: The generation strategy to use. The "sequential" strategy (default for devices > 1)
298+ allows running models that wouldn't fit in a single card by partitioning the transformer blocks across
299+ all devices and running them sequentially. "tensor_parallel" shards the model using tensor parallelism.
300+ If None (default for devices = 1), the model is not distributed.
287301 """
288302 checkpoint_dir = auto_download_checkpoint (model_name = checkpoint_dir , access_token = access_token )
289303 pprint (locals ())
@@ -301,6 +315,7 @@ def run_server(
301315 max_new_tokens = max_new_tokens ,
302316 devices = devices ,
303317 api_path = api_path ,
318+ generate_strategy = generate_strategy ,
304319 ),
305320 spec = OpenAISpec () if openai_spec else None ,
306321 accelerator = accelerator ,