From 4eac006d89538d675bf8bac8f6d365aac34a1337 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Mon, 19 Jun 2023 23:59:09 +0000 Subject: [PATCH 01/69] Removing load balancing config --- mii/client.py | 6 +-- mii/config.py | 8 ---- mii/deployment.py | 40 ++++++++-------- mii/server.py | 119 ++++++++++++++++++---------------------------- 4 files changed, 70 insertions(+), 103 deletions(-) diff --git a/mii/client.py b/mii/client.py index 5542d93a..b2970a5e 100644 --- a/mii/client.py +++ b/mii/client.py @@ -35,14 +35,14 @@ def mii_query_handle(deployment_name): query_handle: A query handle with a single method `.query(request_dictionary)` using which queries can be sent to the model. """ task_name, mii_configs = _get_deployment_info(deployment_name) - if mii_configs.enable_load_balancing: - return MIIClient(task_name, "localhost", mii_configs.port_number) + return MIIClient(task_name, "localhost", mii_configs.port_number) + """ else: return MIITensorParallelClient( task_name, "localhost", [mii_configs.port_number + i for i in range(mii_configs.tensor_parallel)]) - + """ def create_channel(host, port): return grpc.aio.insecure_channel(f'{host}:{port}', diff --git a/mii/config.py b/mii/config.py index 0f7d24b3..34e93ddd 100644 --- a/mii/config.py +++ b/mii/config.py @@ -55,7 +55,6 @@ class MIIConfig(BaseModel): max_tokens: int = 1024 enable_restful_api: bool = False restful_api_port: int = 51080 - enable_load_balancing: bool = False replica_num: int = 1 hostfile: str = DLTS_HOSTFILE @@ -92,13 +91,6 @@ def checkpoint_dict_valid(cls, value): raise ValueError(f"Missing key={k} in checkpoint_dict") return value - @root_validator - def auto_enable_load_balancing(cls, values): - if values["enable_restful_api"] and not values["enable_load_balancing"]: - logger.warn("Restful API is enabled, enabling Load Balancing") - values["enable_load_balancing"] = True - return values - class Config: validate_all = True validate_assignment = True diff --git a/mii/deployment.py b/mii/deployment.py index d7ec3226..254ac7b5 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -104,26 +104,25 @@ def deploy(task, # add fields for replica deployment lb_config = None - if mii_config.enable_load_balancing: - replica_pool = _allocate_processes(mii_config.hostfile, - mii_config.tensor_parallel, - mii_config.replica_num) - replica_configs = [] - for i, (hostname, gpu_indices) in enumerate(replica_pool): - # Reserver port for a LB proxy when replication is enabled - port_offset = 1 if mii_config.enable_load_balancing else 0 - base_port = mii_config.port_number + i * mii_config.tensor_parallel + port_offset - tensor_parallel_ports = list( - range(base_port, - base_port + mii_config.tensor_parallel)) - torch_dist_port = mii_config.torch_dist_port + i - replica_configs.append( - ReplicaConfig(hostname=hostname, - tensor_parallel_ports=tensor_parallel_ports, - torch_dist_port=torch_dist_port, - gpu_indices=gpu_indices)) - lb_config = LoadBalancerConfig(port=mii_config.port_number, - replica_configs=replica_configs) + replica_pool = _allocate_processes(mii_config.hostfile, + mii_config.tensor_parallel, + mii_config.replica_num) + replica_configs = [] + for i, (hostname, gpu_indices) in enumerate(replica_pool): + # Reserver port for a LB proxy when replication is enabled + port_offset = 1 + base_port = mii_config.port_number + i * mii_config.tensor_parallel + port_offset + tensor_parallel_ports = list( + range(base_port, + base_port + mii_config.tensor_parallel)) + torch_dist_port = mii_config.torch_dist_port + i + 
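+        # mii_config.port_number itself stays reserved for the load balancer
+        # proxy (hence port_offset = 1); replica i then takes the next
+        # tensor_parallel consecutive ports for its shards and a torch_dist_port
+        # offset by the replica index, so replicas never collide on ports.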
replica_configs.append( + ReplicaConfig(hostname=hostname, + tensor_parallel_ports=tensor_parallel_ports, + torch_dist_port=torch_dist_port, + gpu_indices=gpu_indices)) + lb_config = LoadBalancerConfig(port=mii_config.port_number, + replica_configs=replica_configs) create_score_file(deployment_name=deployment_name, deployment_type=deployment_type, @@ -165,6 +164,7 @@ def _allocate_processes(hostfile_path, tensor_parallel, num_replicas): assert resource_pool is not None and len( resource_pool) > 0, f'No hosts found in {hostfile_path}' + print(resource_pool) replica_pool = [] allocated_num = 0 for host, slots in resource_pool.items(): diff --git a/mii/server.py b/mii/server.py index 158b7a5c..626a2522 100644 --- a/mii/server.py +++ b/mii/server.py @@ -48,7 +48,7 @@ def __init__(self, self.port_number = mii_configs.port_number - if mii_configs.enable_load_balancing and mii_configs.hostfile is None: + if mii_configs.hostfile is None: raise ValueError( "hostfile must be provided if enable_load_balancing == True") @@ -60,11 +60,7 @@ def __init__(self, ds_config, mii_configs, lb_config) - deployment = lb_config.replica_configs if mii_configs.enable_load_balancing else [ - ReplicaConfig(hostname='localhost', - tensor_parallel_ports=[mii_configs.port_number], - torch_dist_port=mii_configs.torch_dist_port) - ] + deployment = lb_config.replica_configs self._wait_until_server_is_live(processes, deployment) def _wait_until_server_is_live(self, processes, deployment): @@ -297,78 +293,57 @@ def _initialize_service(self, lb_config): processes = [] - if mii_configs.enable_load_balancing: - - host_gpus = defaultdict(list) - for repl_config in lb_config.replica_configs: - host_gpus[repl_config.hostname].extend(repl_config.gpu_indices) - - # Start replica instances - for i, repl_config in enumerate(lb_config.replica_configs): - hostfile = tempfile.NamedTemporaryFile(delete=False) - hostfile.write( - f'{repl_config.hostname} slots={max(host_gpus[repl_config.hostname])+1}\n' - .encode()) - processes.append( + + host_gpus = defaultdict(list) + for repl_config in lb_config.replica_configs: + host_gpus[repl_config.hostname].extend(repl_config.gpu_indices) + + # Start replica instances + for i, repl_config in enumerate(lb_config.replica_configs): + hostfile = tempfile.NamedTemporaryFile(delete=False) + hostfile.write( + f'{repl_config.hostname} slots={max(host_gpus[repl_config.hostname])+1}\n' + .encode()) + processes.append( self._launch_deepspeed( - deployment_name, - model_name, - model_path, - ds_optimize, - ds_zero, - ds_config, - mii_configs, - hostfile.name, - repl_config.hostname, - repl_config.tensor_parallel_ports[0], - mii_configs.torch_dist_port + (100 * i) + - repl_config.gpu_indices[0], - repl_config.gpu_indices)) + deployment_name, + model_name, + model_path, + ds_optimize, + ds_zero, + ds_config, + mii_configs, + hostfile.name, + repl_config.hostname, + repl_config.tensor_parallel_ports[0], + mii_configs.torch_dist_port + (100 * i) + + repl_config.gpu_indices[0], + repl_config.gpu_indices)) # start load balancer here. # we don't use deepspeed launcher for the load balancer because it does not need a GPU. # The deepspeed launcher determines the number of processes to launch based on GPUs available on the host or CUDA_VISIBLE_DEVICES, # and it is expected to assign one GPU to one process. 
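        # The balancer is a plain subprocess on this host; it listens on
        # mii_configs.port_number and forwards requests to the per-replica
        # tensor-parallel ports recorded in lb_config.replica_configs.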
+ processes.append( + self._launch_load_balancer(deployment_name, + model_name, + model_path, + ds_optimize, + ds_zero, + ds_config, + mii_configs, + lb_config)) + + if mii_configs.enable_restful_api: + # start rest api server processes.append( - self._launch_load_balancer(deployment_name, - model_name, - model_path, - ds_optimize, - ds_zero, - ds_config, - mii_configs, - lb_config)) - - if mii_configs.enable_restful_api: - # start rest api server - processes.append( - self._launch_restful_gateway(deployment_name, - model_name, - model_path, - ds_optimize, - ds_zero, - ds_config, - mii_configs, - mii_configs.port_number)) - - return processes - else: - if self._is_socket_open("localhost", self.port_number): - raise RuntimeError( - f"Server is already running on port {self.port_number}, please shutdown or use different port." - ) + self._launch_restful_gateway(deployment_name, + model_name, + model_path, + ds_optimize, + ds_zero, + ds_config, + mii_configs, + mii_configs.port_number)) - processes.append( - self._launch_deepspeed(deployment_name, - model_name, - model_path, - ds_optimize, - ds_zero, - ds_config, - mii_configs, - '/dev/null', - 'localhost', - mii_configs.port_number, - mii_configs.torch_dist_port, - mii_configs.deploy_rank)) return processes From c68e999000b0669a2e85d64865c762257aa6284c Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Tue, 20 Jun 2023 17:24:13 +0000 Subject: [PATCH 02/69] Reformatting tests --- tests/test_local_deployment.py | 37 +++------------------------------- 1 file changed, 3 insertions(+), 34 deletions(-) diff --git a/tests/test_local_deployment.py b/tests/test_local_deployment.py index d28af701..b7ae8da5 100644 --- a/tests/test_local_deployment.py +++ b/tests/test_local_deployment.py @@ -43,11 +43,6 @@ def load_with_sys_mem(request): return request.param -@pytest.fixture(scope="function", params=[False]) -def enable_load_balancing(request): - return request.param - - @pytest.fixture(scope="function", params=[False]) def enable_restful_api(request): return request.param @@ -83,7 +78,6 @@ def mii_configs( tensor_parallel: int, port_number: int, load_with_sys_mem: bool, - enable_load_balancing: bool, enable_restful_api: bool, restful_api_port: int, ): @@ -91,18 +85,15 @@ def mii_configs( # Create a hostfile for DeepSpeed launcher when load_balancing is enabled hostfile = os.path.join(tmpdir, "hostfile") num_gpu = torch.cuda.device_count() - enable_load_balancing = enable_load_balancing or enable_restful_api - if enable_load_balancing: - with open(hostfile, "w") as f: - f.write(f"localhost slots={num_gpu}") + with open(hostfile, "w") as f: + f.write(f"localhost slots={num_gpu}") return { 'dtype': dtype, 'tensor_parallel': tensor_parallel, 'port_number': port_number, 'load_with_sys_mem': load_with_sys_mem, - 'enable_load_balancing': enable_load_balancing, - 'replica_num': num_gpu * enable_load_balancing // tensor_parallel, + 'replica_num': num_gpu * 1 // tensor_parallel, 'hostfile': hostfile, 'enable_restful_api': enable_restful_api, 'restful_api_port': restful_api_port, @@ -215,28 +206,6 @@ def test_single_GPU(local_deployment, query): assert result -@pytest.mark.local -@pytest.mark.parametrize("enable_load_balancing", [True]) -@pytest.mark.parametrize("tensor_parallel", [1, 2]) -@pytest.mark.parametrize( - "task_name, model_name, query", - [ - ( - "text-generation", - "bigscience/bloom-560m", - { - "query": ["DeepSpeed is the greatest"] - }, - ), - ], -) -def test_load_balancing(local_deployment, query): - generator = 
mii.mii_query_handle(local_deployment.deployment_name) - for _ in range(10): - result = generator.query(query) - assert result - - @pytest.mark.local @pytest.mark.parametrize("enable_restful_api", [True]) @pytest.mark.parametrize("restful_api_port", [28080]) From 5ce1a922fec29ce3d555b9847838ae7cb3ff18ee Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Tue, 20 Jun 2023 18:05:57 +0000 Subject: [PATCH 03/69] Fixed the formatting --- mii/client.py | 1 + mii/config.py | 4 +--- mii/server.py | 20 +++++++++----------- 3 files changed, 11 insertions(+), 14 deletions(-) diff --git a/mii/client.py b/mii/client.py index b2970a5e..964e9624 100644 --- a/mii/client.py +++ b/mii/client.py @@ -44,6 +44,7 @@ def mii_query_handle(deployment_name): [mii_configs.port_number + i for i in range(mii_configs.tensor_parallel)]) """ + def create_channel(host, port): return grpc.aio.insecure_channel(f'{host}:{port}', options=[('grpc.max_send_message_length', diff --git a/mii/config.py b/mii/config.py index 34e93ddd..d9e5aeb7 100644 --- a/mii/config.py +++ b/mii/config.py @@ -5,12 +5,10 @@ import torch from typing import Union, List from enum import Enum -from pydantic import BaseModel, validator, root_validator +from pydantic import BaseModel, validator from deepspeed.launcher.runner import DLTS_HOSTFILE -from .utils import logger - class DtypeEnum(Enum): # The torch dtype must always be the first value (so we return torch.dtype) diff --git a/mii/server.py b/mii/server.py index 626a2522..d62da132 100644 --- a/mii/server.py +++ b/mii/server.py @@ -14,7 +14,6 @@ import mii from mii.utils import get_num_gpus, logger -from mii.config import ReplicaConfig def config_to_b64_str(config): @@ -305,7 +304,7 @@ def _initialize_service(self, f'{repl_config.hostname} slots={max(host_gpus[repl_config.hostname])+1}\n' .encode()) processes.append( - self._launch_deepspeed( + self._launch_deepspeed( deployment_name, model_name, model_path, @@ -316,8 +315,7 @@ def _initialize_service(self, hostfile.name, repl_config.hostname, repl_config.tensor_parallel_ports[0], - mii_configs.torch_dist_port + (100 * i) + - repl_config.gpu_indices[0], + mii_configs.torch_dist_port + (100 * i) + repl_config.gpu_indices[0], repl_config.gpu_indices)) # start load balancer here. @@ -326,13 +324,13 @@ def _initialize_service(self, # and it is expected to assign one GPU to one process. 
processes.append( self._launch_load_balancer(deployment_name, - model_name, - model_path, - ds_optimize, - ds_zero, - ds_config, - mii_configs, - lb_config)) + model_name, + model_path, + ds_optimize, + ds_zero, + ds_config, + mii_configs, + lb_config)) if mii_configs.enable_restful_api: # start rest api server From fa10e19f9612b9452f6d02c0d85e33519e28052c Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Tue, 20 Jun 2023 18:12:01 +0000 Subject: [PATCH 04/69] Removed print statement --- mii/deployment.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mii/deployment.py b/mii/deployment.py index 254ac7b5..e4c36ae6 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -164,7 +164,6 @@ def _allocate_processes(hostfile_path, tensor_parallel, num_replicas): assert resource_pool is not None and len( resource_pool) > 0, f'No hosts found in {hostfile_path}' - print(resource_pool) replica_pool = [] allocated_num = 0 for host, slots in resource_pool.items(): From 8970f4e69a9dd368574f084a6bb246f1ed0e926e Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Mon, 26 Jun 2023 16:54:34 +0000 Subject: [PATCH 05/69] Removing unused import --- mii/server.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mii/server.py b/mii/server.py index f0bdad0b..35c1745d 100644 --- a/mii/server.py +++ b/mii/server.py @@ -14,7 +14,6 @@ import mii from mii.utils import get_num_gpus, logger, get_provider_name -from mii.config import ReplicaConfig def config_to_b64_str(config): From 517bea8c6df19195c088c463d343a6831303afdf Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Mon, 26 Jun 2023 17:39:52 +0000 Subject: [PATCH 06/69] Fixing tests --- tests/test_non_persistent_deployment.py | 30 ++----------------------- 1 file changed, 2 insertions(+), 28 deletions(-) diff --git a/tests/test_non_persistent_deployment.py b/tests/test_non_persistent_deployment.py index 3fd5825b..c2347581 100644 --- a/tests/test_non_persistent_deployment.py +++ b/tests/test_non_persistent_deployment.py @@ -16,22 +16,18 @@ def mii_configs( dtype: str, tensor_parallel: int, load_with_sys_mem: bool, - enable_load_balancing: bool, ): # Create a hostfile for DeepSpeed launcher when load_balancing is enabled hostfile = os.path.join(tmpdir, "hostfile") num_gpu = torch.cuda.device_count() - enable_load_balancing = enable_load_balancing - if enable_load_balancing: - with open(hostfile, "w") as f: - f.write(f"localhost slots={num_gpu}") + with open(hostfile, "w") as f: + f.write(f"localhost slots={num_gpu}") return { 'dtype': dtype, 'tensor_parallel': tensor_parallel, 'load_with_sys_mem': load_with_sys_mem, - 'enable_load_balancing': enable_load_balancing, } @@ -134,25 +130,3 @@ def test_single_GPU(non_persistent_deployment, query): generator = mii.mii_query_handle(non_persistent_deployment.deployment_name) result = generator.query(query) assert result - - -@pytest.mark.local -@pytest.mark.parametrize("enable_load_balancing", [True]) -@pytest.mark.parametrize("expected_failure", [AssertionError]) -@pytest.mark.parametrize("tensor_parallel", [1, 2]) -@pytest.mark.parametrize( - "task_name, model_name, query", - [ - ( - "text-generation", - "bigscience/bloom-560m", - { - "query": ["DeepSpeed is the greatest"] - }, - ), - ], -) -def test_load_balancing(non_persistent_deployment, query): - print(f"TESTING NON_PERSISTENT_DEPLOYMENT: {non_persistent_deployment}") - assert "Cannot use Load Balancing with Non persistent deployment" in str( - non_persistent_deployment.value) From 58dd2b2eb504ddb5170b26f7521cbcdda85bb63f Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: 
Mon, 26 Jun 2023 17:44:13 +0000 Subject: [PATCH 07/69] Fixing merge issue --- mii/deployment.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mii/deployment.py b/mii/deployment.py index 040142e4..fd5a11c9 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -143,7 +143,6 @@ def deploy(task, elif deployment_type == DeploymentType.LOCAL: return _deploy_local(deployment_name, model_path=model_path) elif deployment_type == DeploymentType.NON_PERSISTENT: - assert not mii_config.enable_load_balancing, "Cannot use Load Balancing with Non persistent deployment" assert int(os.getenv('WORLD_SIZE', '1')) == mii_config.tensor_parallel, "World Size does not equal number of tensors. When using non-persistent deployment type, please launch with `deepspeed --num_gpus `" provider = MODEL_PROVIDER_MAP[get_provider_name(model, task)] mii.non_persistent_models[deployment_name] = (load_models( From bb0d5518f11ff122ac17119837e849dc42c31dc6 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Mon, 26 Jun 2023 18:36:06 +0000 Subject: [PATCH 08/69] Creating hostfile when one is not provided --- mii/client.py | 7 ------- mii/server.py | 8 ++++++-- tests/test_local_deployment.py | 2 +- tests/utils.py | 5 ----- 4 files changed, 7 insertions(+), 15 deletions(-) diff --git a/mii/client.py b/mii/client.py index f2fb887e..535b55c8 100644 --- a/mii/client.py +++ b/mii/client.py @@ -41,13 +41,6 @@ def mii_query_handle(deployment_name): task_name, mii_configs = _get_deployment_info(deployment_name) return MIIClient(task_name, "localhost", mii_configs.port_number) - """ - else: - return MIITensorParallelClient( - task_name, - "localhost", - [mii_configs.port_number + i for i in range(mii_configs.tensor_parallel)]) - """ def create_channel(host, port): diff --git a/mii/server.py b/mii/server.py index 35c1745d..c734beba 100644 --- a/mii/server.py +++ b/mii/server.py @@ -9,6 +9,7 @@ import sys import tempfile import time +import torch from pathlib import Path from collections import defaultdict @@ -48,8 +49,11 @@ def __init__(self, self.port_number = mii_configs.port_number if mii_configs.hostfile is None: - raise ValueError( - "hostfile must be provided if enable_load_balancing == True") + hostfile = tempfile.NamedTemporaryFile() + num_gpu = torch.cuda.device_count() + with open(hostfile, "w") as f: + f.write(f"localhost slots={num_gpu}") + mii.configs.hostfile = hostfile processes = self._initialize_service(deployment_name, model_name, diff --git a/tests/test_local_deployment.py b/tests/test_local_deployment.py index ac5befd6..b3cd8218 100644 --- a/tests/test_local_deployment.py +++ b/tests/test_local_deployment.py @@ -53,7 +53,7 @@ def mii_configs( 'tensor_parallel': tensor_parallel, 'port_number': port_number, 'load_with_sys_mem': load_with_sys_mem, - 'replica_num': num_gpu * 1 // tensor_parallel, + 'replica_num': num_gpu // tensor_parallel, 'hostfile': hostfile, 'enable_restful_api': enable_restful_api, 'restful_api_port': restful_api_port, diff --git a/tests/utils.py b/tests/utils.py index 3fd2b950..babec323 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -49,8 +49,3 @@ def ds_config(request): @pytest.fixture(scope="function", params=[None]) def expected_failure(request): return request.param - - -@pytest.fixture(scope="function", params=[False]) -def enable_load_balancing(request): - return request.param From 3823534e07d03c43f91a19c9e531bdb07c2302fd Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Mon, 26 Jun 2023 20:24:00 +0000 Subject: [PATCH 09/69] Fixing import statements removed by merge --- mii/config.py | 
4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mii/config.py b/mii/config.py index 30604e74..2d5d520a 100644 --- a/mii/config.py +++ b/mii/config.py @@ -5,10 +5,12 @@ import torch from typing import Union, List from enum import Enum -from pydantic import BaseModel, validator +from pydantic import BaseModel, validator, root_validator from deepspeed.launcher.runner import DLTS_HOSTFILE +from .utils import logger + class DtypeEnum(Enum): # The torch dtype must always be the first value (so we return torch.dtype) From 6f9b4ad9c81e2ea10102193b8433a89c532f1fe6 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Mon, 26 Jun 2023 20:59:45 +0000 Subject: [PATCH 10/69] Removing load_balancing check --- mii/config.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/mii/config.py b/mii/config.py index 2d5d520a..6a8bac16 100644 --- a/mii/config.py +++ b/mii/config.py @@ -9,8 +9,6 @@ from deepspeed.launcher.runner import DLTS_HOSTFILE -from .utils import logger - class DtypeEnum(Enum): # The torch dtype must always be the first value (so we return torch.dtype) @@ -92,13 +90,6 @@ def checkpoint_dict_valid(cls, value): raise ValueError(f"Missing key={k} in checkpoint_dict") return value - @root_validator - def auto_enable_load_balancing(cls, values): - if values["enable_restful_api"] and not values["enable_load_balancing"]: - logger.warn("Restful API is enabled, enabling Load Balancing") - values["enable_load_balancing"] = True - return values - @root_validator def meta_tensor_or_sys_mem(cls, values): if values.get("meta_tensor") and values.get("load_with_sys_mem"): From 499b9ad83a192a24b47252d6e00ad4f7bf02f0ee Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Mon, 26 Jun 2023 21:39:56 +0000 Subject: [PATCH 11/69] Removing redudant definitions --- mii/deployment.py | 1 - mii/server.py | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/mii/deployment.py b/mii/deployment.py index fd5a11c9..3cadd994 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -105,7 +105,6 @@ def deploy(task, model_path = "model" # add fields for replica deployment - lb_config = None replica_pool = _allocate_processes(mii_config.hostfile, mii_config.tensor_parallel, mii_config.replica_num) diff --git a/mii/server.py b/mii/server.py index c734beba..77e50e26 100644 --- a/mii/server.py +++ b/mii/server.py @@ -63,8 +63,7 @@ def __init__(self, ds_config, mii_configs, lb_config) - deployment = lb_config.replica_configs - self._wait_until_server_is_live(processes, deployment) + self._wait_until_server_is_live(processes, lb_config.replica_configs) def _wait_until_server_is_live(self, processes, deployment): for process, repl_config in zip(processes, deployment): From 5419ef6f649b1ec2bfeb68385786b442cac56fa1 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Mon, 26 Jun 2023 22:23:20 +0000 Subject: [PATCH 12/69] Removing hostfile from test --- tests/test_local_deployment.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/test_local_deployment.py b/tests/test_local_deployment.py index 1acf91c3..3d1ad214 100644 --- a/tests/test_local_deployment.py +++ b/tests/test_local_deployment.py @@ -43,12 +43,7 @@ def mii_configs( restful_api_port: int, ): - # Create a hostfile for DeepSpeed launcher when load_balancing is enabled - hostfile = os.path.join(tmpdir, "hostfile") num_gpu = torch.cuda.device_count() - with open(hostfile, "w") as f: - f.write(f"localhost slots={num_gpu}") - return { 'dtype': dtype, 'tensor_parallel': tensor_parallel, @@ -56,7 +51,6 @@ def mii_configs( 'meta_tensor': 
meta_tensor, 'load_with_sys_mem': load_with_sys_mem, 'replica_num': num_gpu // tensor_parallel, - 'hostfile': hostfile, 'enable_restful_api': enable_restful_api, 'restful_api_port': restful_api_port, } From a70b6de25a835cefd969d01dd917290db892cbfa Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Mon, 26 Jun 2023 23:10:33 +0000 Subject: [PATCH 13/69] Removing hostfile from non-persistent test --- tests/test_non_persistent_deployment.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tests/test_non_persistent_deployment.py b/tests/test_non_persistent_deployment.py index c2347581..50861493 100644 --- a/tests/test_non_persistent_deployment.py +++ b/tests/test_non_persistent_deployment.py @@ -4,7 +4,6 @@ # DeepSpeed Team import pytest import os -import torch from types import SimpleNamespace from .utils import * # noqa: F401 import mii @@ -18,12 +17,6 @@ def mii_configs( load_with_sys_mem: bool, ): - # Create a hostfile for DeepSpeed launcher when load_balancing is enabled - hostfile = os.path.join(tmpdir, "hostfile") - num_gpu = torch.cuda.device_count() - with open(hostfile, "w") as f: - f.write(f"localhost slots={num_gpu}") - return { 'dtype': dtype, 'tensor_parallel': tensor_parallel, From eea658beb7e1ffff5b6b6e778880bd7d70f7e914 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Tue, 27 Jun 2023 21:33:42 +0000 Subject: [PATCH 14/69] initial changes --- mii/__init__.py | 1 + mii/config.py | 14 ++++++- mii/constants.py | 2 +- mii/deployment.py | 80 +++++++++++++++++------------------- mii/models/score/generate.py | 59 +++++++++++++------------- mii/server.py | 9 +--- 6 files changed, 83 insertions(+), 82 deletions(-) diff --git a/mii/__init__.py b/mii/__init__.py index ab409d4c..6b9740e7 100644 --- a/mii/__init__.py +++ b/mii/__init__.py @@ -15,6 +15,7 @@ __version__ = "0.0.0" non_persistent_models = {} +multi_model_deployments = {} try: from .version import __version__ except ImportError: diff --git a/mii/config.py b/mii/config.py index 6a8bac16..19889740 100644 --- a/mii/config.py +++ b/mii/config.py @@ -123,4 +123,16 @@ class LoadBalancerConfig(BaseModel): class Config: validate_all = True - validate_assignment = True +validate_assignment = True + + +class Deployment(BaseModel): + deployment_name: str + task: str + model: str + enable_deepspeed: bool = True + enable_zero: bool = True + GPU_index_map: dict = None + mii_config: dict = None + ds_config: dict = None + version: int = 1 diff --git a/mii/constants.py b/mii/constants.py index ba4cfa2f..29493433 100644 --- a/mii/constants.py +++ b/mii/constants.py @@ -94,7 +94,7 @@ class ModelProvider(enum.Enum): DEPLOYMENT_NAME_KEY = 'deployment_name' MODEL_PATH_KEY = 'model_path' LOAD_BALANCER_CONFIG_KEY = 'load_balancer_config' - +DEPLOYMENT_TAG_KEY = 'deployment_tag' ENABLE_DEEPSPEED_KEY = 'ds_optimize' ENABLE_DEEPSPEED_ZERO_KEY = 'ds_zero' DEEPSPEED_CONFIG_KEY = 'ds_config' diff --git a/mii/deployment.py b/mii/deployment.py index 3cadd994..afb5abf9 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -16,16 +16,10 @@ from .config import ReplicaConfig, LoadBalancerConfig -def deploy(task, - model, - deployment_name, +def deploy(deployment_tag, + deployments, deployment_type=DeploymentType.LOCAL, - model_path=None, - enable_deepspeed=True, - enable_zero=False, - ds_config=None, - mii_config={}, - version=1): + model_path=None): """Deploy a task using specified model. 
For usage examples see: mii/examples/local/text-generation-example.py @@ -66,15 +60,19 @@ def deploy(task, If deployment_type is `LOCAL`, returns just the name of the deployment that can be used to create a query handle using `mii.mii_query_handle(deployment_name)` """ - + mii.multi_model_deployments[deployment_tag] = deployments + ports = set() # parse and validate mii config - mii_config = mii.config.MIIConfig(**mii_config) - if enable_zero: - if ds_config.get("fp16", {}).get("enabled", False): - assert (mii_config.dtype == torch.half), "MII Config Error: MII dtype and ZeRO dtype must match" - else: - assert (mii_config.dtype == torch.float), "MII Config Error: MII dtype and ZeRO dtype must match" - assert not (enable_deepspeed and enable_zero), "MII Config Error: DeepSpeed and ZeRO cannot both be enabled, select only one" + for deployment in deployments: + mii_config = mii.config.MIIConfig(**deployment.mii_config) + assert mii_config.port_number not in ports, f"duplicate port numbers not allowed - {mii.config.port_number}" + ports.add(mii_config.port_number) + if deployment.enable_zero: + if deployment.ds_config.get("fp16", {}).get("enabled", False): + assert (mii_config.dtype == torch.half), "MII Config Error: MII dtype and ZeRO dtype must match" + else: + assert (mii_config.dtype == torch.float), "MII Config Error: MII dtype and ZeRO dtype must match" + assert not (enable_deepspeed and enable_zero), "MII Config Error: DeepSpeed and ZeRO cannot both be enabled, select only one" # aml only allows certain characters for deployment names if deployment_type == DeploymentType.AML: @@ -82,21 +80,22 @@ def deploy(task, string.digits + '-') assert set(deployment_name) <= allowed_chars, "AML deployment names can only contain a-z, A-Z, 0-9, and '-'" - task = mii.utils.get_task(task) + for deployment in deployments: + deployment.task = mii.utils.get_task(deployment.task) - if not mii_config.skip_model_check: - mii.utils.check_if_task_and_model_is_valid(task, model) - if enable_deepspeed: - mii.utils.check_if_task_and_model_is_supported(task, model) + if not mii_config.skip_model_check: + mii.utils.check_if_task_and_model_is_valid(deployment.task, deployment.model) + if enable_deepspeed: + mii.utils.check_if_task_and_model_is_supported(deployment.task, deployment.model) - if enable_deepspeed: - logger.info( - f"************* MII is using DeepSpeed Optimizations to accelerate your model *************" - ) - else: - logger.info( - f"************* DeepSpeed Optimizations not enabled. Please use enable_deepspeed to get better performance *************" - ) + if enable_deepspeed: + logger.info( + f"************* MII is using DeepSpeed Optimizations to accelerate your model: {deployment.model} *************" + ) + else: + logger.info( + f"************* DeepSpeed Optimizations not enabled. 
Please use enable_deepspeed to get better performance for: {deployment.model} *************" + ) # In local deployments use default path if no model path set if model_path is None and deployment_type == DeploymentType.LOCAL: @@ -126,21 +125,16 @@ def deploy(task, replica_configs=replica_configs) if deployment_type != DeploymentType.NON_PERSISTENT: - create_score_file(deployment_name=deployment_name, + create_score_file(deployment_tag=deployment_tag, + deployments=deployments, deployment_type=deployment_type, - task=task, - model_name=model, - ds_optimize=enable_deepspeed, - ds_zero=enable_zero, - ds_config=ds_config, - mii_config=mii_config, model_path=model_path, lb_config=lb_config) if deployment_type == DeploymentType.AML: - _deploy_aml(deployment_name=deployment_name, model_name=model, version=version) + _deploy_aml(deployment_tag=deployment_tag, model_name=model, version=version) elif deployment_type == DeploymentType.LOCAL: - return _deploy_local(deployment_name, model_path=model_path) + return _deploy_local(deployment_tag, model_path=model_path) elif deployment_type == DeploymentType.NON_PERSISTENT: assert int(os.getenv('WORLD_SIZE', '1')) == mii_config.tensor_parallel, "World Size does not equal number of tensors. When using non-persistent deployment type, please launch with `deepspeed --num_gpus `" provider = MODEL_PROVIDER_MAP[get_provider_name(model, task)] @@ -157,14 +151,14 @@ def deploy(task, raise Exception(f"Unknown deployment type: {deployment_type}") -def _deploy_local(deployment_name, model_path): - mii.utils.import_score_file(deployment_name).init() +def _deploy_local(deployment_tag, model_path): + mii.utils.import_score_file(deployment_tag).init() -def _deploy_aml(deployment_name, model_name, version): +def _deploy_aml(deployment_tag, model_name, version): acr_name = mii.aml_related.utils.get_acr_name() mii.aml_related.utils.generate_aml_scripts(acr_name=acr_name, - deployment_name=deployment_name, + deployment_name=deployment_tag, model_name=model_name, version=version) print( diff --git a/mii/models/score/generate.py b/mii/models/score/generate.py index 1184d70e..6da8f3d9 100644 --- a/mii/models/score/generate.py +++ b/mii/models/score/generate.py @@ -9,53 +9,52 @@ from mii.constants import DeploymentType -def create_score_file(deployment_name, +def create_score_file(deployment_tag, deployment_type, - task, - model_name, - ds_optimize, - ds_zero, - ds_config, - mii_config, + deployments, model_path, - lb_config): + lb_config) + config_dict = {} - config_dict[mii.constants.DEPLOYMENT_NAME_KEY] = deployment_name - config_dict[mii.constants.TASK_NAME_KEY] = mii.utils.get_task_name(task) - config_dict[mii.constants.MODEL_NAME_KEY] = model_name - config_dict[mii.constants.ENABLE_DEEPSPEED_KEY] = ds_optimize - config_dict[mii.constants.MII_CONFIGS_KEY] = mii_config.dict() - config_dict[mii.constants.ENABLE_DEEPSPEED_ZERO_KEY] = ds_zero - config_dict[mii.constants.DEEPSPEED_CONFIG_KEY] = ds_config config_dict[mii.constants.MODEL_PATH_KEY] = model_path - - if lb_config is not None: - config_dict[mii.constants.LOAD_BALANCER_CONFIG_KEY] = lb_config - - if len(mii.__path__) > 1: - logger.warning( - f"Detected mii path as multiple sources: {mii.__path__}, might cause unknown behavior" - ) + config_dict[mii.constants.DEPLOYMENT_TAG_KEY] = deployment_tag + for deployment in deployments: + config_dict[deployment.deployment_name] = {} + config_dict[deployment.deployment_name][mii.constants.DEPLOYMENT_NAME_KEY] = deployment_name + 
config_dict[deployment.deployment_name][mii.constants.TASK_NAME_KEY] = mii.utils.get_task_name(task) + config_dict[deployment.deployment_name][mii.constants.MODEL_NAME_KEY] = model_name + config_dict[deployment.deployment_name][mii.constants.ENABLE_DEEPSPEED_KEY] = ds_optimize + config_dict[deployment.deployment_name][mii.constants.MII_CONFIGS_KEY] = mii_config.dict() + config_dict[deployment.deployment_name][mii.constants.ENABLE_DEEPSPEED_ZERO_KEY] = ds_zero + config_dict[deployment.deployment_name][mii.constants.DEEPSPEED_CONFIG_KEY] = ds_config + + if lb_config is not None: + config_dict[deployment.deployment_name][mii.constants.LOAD_BALANCER_CONFIG_KEY] = lb_config + + if len(mii.__path__) > 1: + logger.warning( + f"Detected mii path as multiple sources: {mii.__path__}, might cause unknown behavior" + ) with open(os.path.join(mii.__path__[0], - "models/score/score_template.py"), - "r") as fd: + "models/score/score_template.py"), + "r") as fd: score_src = fd.read() # update score file w. global config dict source_with_config = f"{score_src}\n" source_with_config += f"configs = {pprint.pformat(config_dict, indent=4)}" - with open(generated_score_path(deployment_name, deployment_type), "w") as fd: - fd.write(source_with_config) + with open(generated_score_path(deployment_tag, deployment_type), "w") as fd: + fd.write(source_with_config): fd.write("\n") -def generated_score_path(deployment_name, deployment_type): +def generated_score_path(deployment_tag, deployment_type): if deployment_type == DeploymentType.LOCAL: - score_path = os.path.join(mii.utils.mii_cache_path(), deployment_name) + score_path = os.path.join(mii.utils.mii_cache_path(), deployment_tag) elif deployment_type == DeploymentType.AML: - score_path = os.path.join(mii.aml_related.utils.aml_output_path(deployment_name), + score_path = os.path.join(mii.aml_related.utils.aml_output_path(deployment_tag), "code") if not os.path.isdir(score_path): os.makedirs(score_path) diff --git a/mii/server.py b/mii/server.py index 77e50e26..61b5f9cb 100644 --- a/mii/server.py +++ b/mii/server.py @@ -29,14 +29,9 @@ def config_to_b64_str(config): class MIIServer(): '''Initialize the model, setup the server for the model under model_path''' def __init__(self, - deployment_name, - task_name, - model_name, + deployment_tag, + deployments, model_path, - ds_optimize=True, - ds_zero=False, - ds_config=None, - mii_configs={}, lb_config=None): mii_configs = mii.config.MIIConfig(**mii_configs) From c21c31bf87f9c3dbbc0550ac09076a764f736d36 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Wed, 28 Jun 2023 18:02:17 +0000 Subject: [PATCH 15/69] Maintaining current behavior --- mii/config.py | 1 + mii/deployment.py | 18 ++++++++++++++++-- mii/models/score/generate.py | 4 ++-- mii/models/score/score_template.py | 2 ++ mii/server.py | 29 +++++++++++------------------ 5 files changed, 32 insertions(+), 22 deletions(-) diff --git a/mii/config.py b/mii/config.py index 19889740..1e74df85 100644 --- a/mii/config.py +++ b/mii/config.py @@ -107,6 +107,7 @@ class Config: class ReplicaConfig(BaseModel): + deployment_name: str = "" hostname: str = "" tensor_parallel_ports: List[int] = [] torch_dist_port: int = None diff --git a/mii/deployment.py b/mii/deployment.py index afb5abf9..c0175c09 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -16,8 +16,15 @@ from .config import ReplicaConfig, LoadBalancerConfig -def deploy(deployment_tag, - deployments, +def deploy(task=None, + model=None, + deployment_name=None, + enable_deepspeed=True, + enable_zero=False, + 
ds_config=None, + mii_config={}, + deployment_tag=None, + deployments=[], deployment_type=DeploymentType.LOCAL, model_path=None): """Deploy a task using specified model. For usage examples see: @@ -60,6 +67,13 @@ def deploy(deployment_tag, If deployment_type is `LOCAL`, returns just the name of the deployment that can be used to create a query handle using `mii.mii_query_handle(deployment_name)` """ + if len(deployments == 0): + assert model is not None and task is not None and deployment_name is not None, "model, task, and deployment name must be set to deploy sigular model" + deployments = [Deployment(deployment_name, task, model, enable_deepspeed, enable_zero, None, mii_config, ds_config, version)] + deployment_tag = deployment_name + "_tag" + else: + assert deployment_tag is not None, "deployment_tag must be set to deploy multiple models" + mii.multi_model_deployments[deployment_tag] = deployments ports = set() # parse and validate mii config diff --git a/mii/models/score/generate.py b/mii/models/score/generate.py index 6da8f3d9..68036960 100644 --- a/mii/models/score/generate.py +++ b/mii/models/score/generate.py @@ -13,7 +13,7 @@ def create_score_file(deployment_tag, deployment_type, deployments, model_path, - lb_config) + lb_config): config_dict = {} config_dict[mii.constants.MODEL_PATH_KEY] = model_path @@ -46,7 +46,7 @@ def create_score_file(deployment_tag, source_with_config += f"configs = {pprint.pformat(config_dict, indent=4)}" with open(generated_score_path(deployment_tag, deployment_type), "w") as fd: - fd.write(source_with_config): + fd.write(source_with_config) fd.write("\n") diff --git a/mii/models/score/score_template.py b/mii/models/score/score_template.py index 04e47fae..80c220df 100644 --- a/mii/models/score/score_template.py +++ b/mii/models/score/score_template.py @@ -16,6 +16,8 @@ def init(): model_path = mii.utils.full_model_path(configs[mii.constants.MODEL_PATH_KEY]) + deployment_tag = configs[mii.constants.DEPLOYMENT_TAG_KEY] + deployments = mii.multi_model_deployments[deployment_tag] deployment_name = configs[mii.constants.DEPLOYMENT_NAME_KEY] model_name = configs[mii.constants.MODEL_NAME_KEY] diff --git a/mii/server.py b/mii/server.py index 7c2e58dd..c7aff4de 100644 --- a/mii/server.py +++ b/mii/server.py @@ -50,13 +50,9 @@ def __init__(self, f.write(f"localhost slots={num_gpu}") mii.configs.hostfile = hostfile - processes = self._initialize_service(deployment_name, - model_name, + processes = self._initialize_service(deployment_tag, + deployments, model_path, - ds_optimize, - ds_zero, - ds_config, - mii_configs, lb_config) self._wait_until_server_is_live(processes, lb_config.replica_configs) @@ -273,13 +269,9 @@ def _launch_deepspeed(self, ds_launch_str=ds_launch_str) def _initialize_service(self, - deployment_name, - model_name, + deployment_tag, + deployments, model_path, - ds_optimize, - ds_zero, - ds_config, - mii_configs, lb_config): processes = [] @@ -290,19 +282,20 @@ def _initialize_service(self, # Start replica instances for i, repl_config in enumerate(lb_config.replica_configs): + name = repl_config.deployment_name hostfile = tempfile.NamedTemporaryFile(delete=False) hostfile.write( f'{repl_config.hostname} slots={max(host_gpus[repl_config.hostname])+1}\n' .encode()) processes.append( self._launch_deepspeed( - deployment_name, - model_name, + name, + deployments[name].model, model_path, - ds_optimize, - ds_zero, - ds_config, - mii_configs, + deployments[name].enable_deepspeed, + deployments[name].enable_zero, + deployments[name].ds_config, + 
deployments[name].mii_configs, hostfile.name, repl_config.hostname, repl_config.tensor_parallel_ports[0], From f5253298654fce56156c2750c6184f7b967ddfe8 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Wed, 28 Jun 2023 19:04:12 +0000 Subject: [PATCH 16/69] Reading from score file --- mii/config.py | 2 +- mii/deployment.py | 4 ++-- mii/models/score/score_template.py | 4 +++- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/mii/config.py b/mii/config.py index 1e74df85..954cecd7 100644 --- a/mii/config.py +++ b/mii/config.py @@ -134,6 +134,6 @@ class Deployment(BaseModel): enable_deepspeed: bool = True enable_zero: bool = True GPU_index_map: dict = None - mii_config: dict = None + mii_config: MIIConfig = None ds_config: dict = None version: int = 1 diff --git a/mii/deployment.py b/mii/deployment.py index c0175c09..a8998a9e 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -67,8 +67,8 @@ def deploy(task=None, If deployment_type is `LOCAL`, returns just the name of the deployment that can be used to create a query handle using `mii.mii_query_handle(deployment_name)` """ - if len(deployments == 0): - assert model is not None and task is not None and deployment_name is not None, "model, task, and deployment name must be set to deploy sigular model" + if not deployments: + assert all((model, task, deployment_name)), "model, task, and deployment name must be set to deploy singular model" deployments = [Deployment(deployment_name, task, model, enable_deepspeed, enable_zero, None, mii_config, ds_config, version)] deployment_tag = deployment_name + "_tag" else: diff --git a/mii/models/score/score_template.py b/mii/models/score/score_template.py index 80c220df..0681ac2f 100644 --- a/mii/models/score/score_template.py +++ b/mii/models/score/score_template.py @@ -17,7 +17,9 @@ def init(): model_path = mii.utils.full_model_path(configs[mii.constants.MODEL_PATH_KEY]) deployment_tag = configs[mii.constants.DEPLOYMENT_TAG_KEY] - deployments = mii.multi_model_deployments[deployment_tag] + deployments = [] + for deployment in configs.values(): + deployments.append(Deployment(deployment[mii.constants.DEPLOYMENT_NAME_KEY], deployment[mii.constants.TASK_NAME_KEY], deployment[mii.constants.DEPLOYMENT_MODEL_NAME_KEY], deloyment[mii.constants.ENABLE_DEEPSPEED_KEY], deployment[mii.constants.ENABLE_DEEPSPEED_ZERO_KEY], None, deployment[mii.constants.MII_CONFIGS_KEY], deployment[mii.constants.DS_CONFIG_KEY], 1)) deployment_name = configs[mii.constants.DEPLOYMENT_NAME_KEY] model_name = configs[mii.constants.MODEL_NAME_KEY] From 3c0937f2a07bb8b71d1864f5a299bf9b52124211 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Wed, 28 Jun 2023 21:36:41 +0000 Subject: [PATCH 17/69] fixing syntax errors --- mii/__init__.py | 2 +- mii/config.py | 4 ++-- mii/deployment.py | 7 ++++--- mii/grpc_related/modelresponse_server.py | 1 + 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/mii/__init__.py b/mii/__init__.py index 6b9740e7..5c84d1dc 100644 --- a/mii/__init__.py +++ b/mii/__init__.py @@ -10,7 +10,7 @@ from .constants import DeploymentType, Tasks from .aml_related.utils import aml_output_path -from .config import MIIConfig, LoadBalancerConfig +from .config import MIIConfig, LoadBalancerConfig, Deployment from .grpc_related.proto import modelresponse_pb2_grpc __version__ = "0.0.0" diff --git a/mii/config.py b/mii/config.py index 954cecd7..531ee800 100644 --- a/mii/config.py +++ b/mii/config.py @@ -132,8 +132,8 @@ class Deployment(BaseModel): task: str model: str enable_deepspeed: bool = True - 
enable_zero: bool = True + enable_zero: bool = False GPU_index_map: dict = None - mii_config: MIIConfig = None + mii_config: MIIConfig = MIIConfig.parse_obj({}) ds_config: dict = None version: int = 1 diff --git a/mii/deployment.py b/mii/deployment.py index a8998a9e..01dbd71b 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -78,9 +78,10 @@ def deploy(task=None, ports = set() # parse and validate mii config for deployment in deployments: - mii_config = mii.config.MIIConfig(**deployment.mii_config) - assert mii_config.port_number not in ports, f"duplicate port numbers not allowed - {mii.config.port_number}" - ports.add(mii_config.port_number) + mii_config = deployment.mii_config + print(mii_config) + assert mii_config.port_number not in ports, f"duplicate port numbers not allowed - {mii_config.port_number}" + #ports.add(mii_config.port_number) if deployment.enable_zero: if deployment.ds_config.get("fp16", {}).get("enabled", False): assert (mii_config.dtype == torch.half), "MII Config Error: MII dtype and ZeRO dtype must match" diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index 4a0a5d00..47325f6c 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -173,6 +173,7 @@ def __init__(self, task_name, replica_configs): replica.tensor_parallel_ports) for replica in replica_configs ] + print(self.stubs) self.counter = AtomicCounter() self.task = get_task(task_name) self.replica_sessions = {} From 156ac8391f54fb95a6f204d88366b287c930ca1d Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Wed, 28 Jun 2023 22:58:26 +0000 Subject: [PATCH 18/69] Fixing more syntax errors --- mii/deployment.py | 37 ++++++++++++++++-------------- mii/models/score/generate.py | 14 +++++------ mii/models/score/score_template.py | 20 ++++++++-------- mii/server.py | 23 +++++++++++-------- 4 files changed, 52 insertions(+), 42 deletions(-) diff --git a/mii/deployment.py b/mii/deployment.py index 01dbd71b..ecdb95c2 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -119,30 +119,33 @@ def deploy(task=None, model_path = "model" # add fields for replica deployment - replica_pool = _allocate_processes(mii_config.hostfile, - mii_config.tensor_parallel, - mii_config.replica_num) replica_configs = [] - for i, (hostname, gpu_indices) in enumerate(replica_pool): - # Reserver port for a LB proxy when replication is enabled - port_offset = 1 - base_port = mii_config.port_number + i * mii_config.tensor_parallel + port_offset - tensor_parallel_ports = list( - range(base_port, - base_port + mii_config.tensor_parallel)) - torch_dist_port = mii_config.torch_dist_port + i - replica_configs.append( - ReplicaConfig(hostname=hostname, - tensor_parallel_ports=tensor_parallel_ports, - torch_dist_port=torch_dist_port, - gpu_indices=gpu_indices)) + for deployment in deployments: + mii_config = deployment.mii_config + replica_pool = _allocate_processes(mii_config.hostfile, + mii_config.tensor_parallel, + mii_config.replica_num) + + for i, (hostname, gpu_indices) in enumerate(replica_pool): + # Reserver port for a LB proxy when replication is enabled + port_offset = 1 + base_port = mii_config.port_number + i * mii_config.tensor_parallel + port_offset + tensor_parallel_ports = list( + range(base_port, + base_port + mii_config.tensor_parallel)) + torch_dist_port = mii_config.torch_dist_port + i + replica_configs.append( + ReplicaConfig(hostname=hostname, + tensor_parallel_ports=tensor_parallel_ports, + torch_dist_port=torch_dist_port, + 
gpu_indices=gpu_indices)) lb_config = LoadBalancerConfig(port=mii_config.port_number, replica_configs=replica_configs) if deployment_type != DeploymentType.NON_PERSISTENT: create_score_file(deployment_tag=deployment_tag, - deployments=deployments, deployment_type=deployment_type, + deployments=deployments, model_path=model_path, lb_config=lb_config) diff --git a/mii/models/score/generate.py b/mii/models/score/generate.py index 68036960..27716bd6 100644 --- a/mii/models/score/generate.py +++ b/mii/models/score/generate.py @@ -20,13 +20,13 @@ def create_score_file(deployment_tag, config_dict[mii.constants.DEPLOYMENT_TAG_KEY] = deployment_tag for deployment in deployments: config_dict[deployment.deployment_name] = {} - config_dict[deployment.deployment_name][mii.constants.DEPLOYMENT_NAME_KEY] = deployment_name - config_dict[deployment.deployment_name][mii.constants.TASK_NAME_KEY] = mii.utils.get_task_name(task) - config_dict[deployment.deployment_name][mii.constants.MODEL_NAME_KEY] = model_name - config_dict[deployment.deployment_name][mii.constants.ENABLE_DEEPSPEED_KEY] = ds_optimize - config_dict[deployment.deployment_name][mii.constants.MII_CONFIGS_KEY] = mii_config.dict() - config_dict[deployment.deployment_name][mii.constants.ENABLE_DEEPSPEED_ZERO_KEY] = ds_zero - config_dict[deployment.deployment_name][mii.constants.DEEPSPEED_CONFIG_KEY] = ds_config + config_dict[deployment.deployment_name][mii.constants.DEPLOYMENT_NAME_KEY] = deployment.deployment_name + config_dict[deployment.deployment_name][mii.constants.TASK_NAME_KEY] = mii.utils.get_task_name(deployment.task) + config_dict[deployment.deployment_name][mii.constants.MODEL_NAME_KEY] = deployment.model + config_dict[deployment.deployment_name][mii.constants.ENABLE_DEEPSPEED_KEY] = deployment.enable_deepspeed + config_dict[deployment.deployment_name][mii.constants.MII_CONFIGS_KEY] = deployment.mii_config.dict() + config_dict[deployment.deployment_name][mii.constants.ENABLE_DEEPSPEED_ZERO_KEY] = deployment.enable_zero + config_dict[deployment.deployment_name][mii.constants.DEEPSPEED_CONFIG_KEY] = deployment.ds_config if lb_config is not None: config_dict[deployment.deployment_name][mii.constants.LOAD_BALANCER_CONFIG_KEY] = lb_config diff --git a/mii/models/score/score_template.py b/mii/models/score/score_template.py index 0681ac2f..7127a8ee 100644 --- a/mii/models/score/score_template.py +++ b/mii/models/score/score_template.py @@ -19,23 +19,25 @@ def init(): deployment_tag = configs[mii.constants.DEPLOYMENT_TAG_KEY] deployments = [] for deployment in configs.values(): - deployments.append(Deployment(deployment[mii.constants.DEPLOYMENT_NAME_KEY], deployment[mii.constants.TASK_NAME_KEY], deployment[mii.constants.DEPLOYMENT_MODEL_NAME_KEY], deloyment[mii.constants.ENABLE_DEEPSPEED_KEY], deployment[mii.constants.ENABLE_DEEPSPEED_ZERO_KEY], None, deployment[mii.constants.MII_CONFIGS_KEY], deployment[mii.constants.DS_CONFIG_KEY], 1)) - + if not isinstance(deployment, dict): + continue + print(f"\nDEPLOYMENT ->{configs.values()}") + data = {'deployment_name': deployment[mii.constants.DEPLOYMENT_NAME_KEY], 'task': deployment[mii.constants.TASK_NAME_KEY], 'model': deployment[mii.constants.MODEL_NAME_KEY], 'enable_deepspeed': deployment[mii.constants.ENABLE_DEEPSPEED_KEY], 'enable_zero': deployment[mii.constants.ENABLE_DEEPSPEED_ZERO_KEY], 'GPU_index_map': None, 'mii_config': deployment[mii.constants.MII_CONFIGS_KEY], 'ds_config': deployment[mii.constants.DEEPSPEED_CONFIG_KEY], 'version': 1} + deployments.append(mii.Deployment.parse_obj(data)) + 
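+    # deployments now holds one mii.Deployment rebuilt from each per-model
+    # entry that was serialized into the score file's config dict.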
+ print(f"WITHIN INIT {deployments}") + """ deployment_name = configs[mii.constants.DEPLOYMENT_NAME_KEY] model_name = configs[mii.constants.MODEL_NAME_KEY] task_name = configs[mii.constants.TASK_NAME_KEY] assert model_name is not None, "The model name should be set before calling init" assert task_name is not None, "The task name should be set before calling init" + """ - mii.MIIServer(deployment_name, - task_name, - model_name, + mii.MIIServer(deployment_tag, + deployments, model_path, - ds_optimize=configs[mii.constants.ENABLE_DEEPSPEED_KEY], - ds_zero=configs[mii.constants.ENABLE_DEEPSPEED_ZERO_KEY], - ds_config=configs[mii.constants.DEEPSPEED_CONFIG_KEY], - mii_configs=configs[mii.constants.MII_CONFIGS_KEY], lb_config=configs.get(mii.constants.LOAD_BALANCER_CONFIG_KEY, None)) diff --git a/mii/server.py b/mii/server.py index c7aff4de..496ecd01 100644 --- a/mii/server.py +++ b/mii/server.py @@ -34,14 +34,14 @@ def __init__(self, model_path, lb_config=None): - mii_configs = mii.config.MIIConfig(**mii_configs) + #mii_configs = mii.config.MIIConfig(**mii_configs) - self.task = mii.utils.get_task(task_name) + #self.task = mii.utils.get_task(task_name) - self.num_gpus = get_num_gpus(mii_configs) - assert self.num_gpus > 0, "GPU count must be greater than 0" + for deployment in deployments: + assert get_num_gpus(deployment.mii_configs) > 0, f"GPU count for {deployment.deployment_name} must be greater than 0" - self.port_number = mii_configs.port_number + #self.port_number = mii_configs.port_number if mii_configs.hostfile is None: hostfile = tempfile.NamedTemporaryFile(delete=False) @@ -104,12 +104,17 @@ def _build_server_args(self, port): # serialize mii config b64_config_str = config_to_b64_str(mii_configs) - - server_args_str = f"--deployment-name {deployment_name} --task-name {mii.utils.get_task_name(self.task)} --model {model_name} --model-path {model_path} --port {port}" + + task = "" + for deployment in deployments: + if deployment_name == deployment.deployment_name: + task = deployment.task + break + server_args_str = f"--deployment-name {deployment_name} --task-name {mii.utils.get_task_name(task)} --model {model_name} --model-path {model_path} --port {port}" server_args_str += " --ds-optimize" if ds_optimize else "" # XXX: fetch model provider based on model name in a more general way - provider = get_provider_name(model_name, self.task) + provider = get_provider_name(model_name, task) server_args_str += f" --provider {provider}" server_args_str += f" --config {b64_config_str}" @@ -134,7 +139,7 @@ def create_config_from_dict(tmpdir, config_dict): f"Expected a string path to an existing deepspeed config, or a dictionary. 
Received: {ds_config}" ) server_args_str += f" --ds-config {ds_config_path}" - printable_config = f"task-name {mii.utils.get_task_name(self.task)} model {model_name} model-path {model_path} port {self.port_number} provider {provider}" + printable_config = f"task-name task model {model_name} model-path {model_path} port 50050 provider {provider}" logger.info(f"MII using multi-gpu deepspeed launcher:\n" + self.print_helper(printable_config)) return server_args_str From 38e270ec2a8e70f330ff5d802f95cb429ce5eb94 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Thu, 29 Jun 2023 18:19:11 +0000 Subject: [PATCH 19/69] Fixing more syntax issues --- mii/__init__.py | 1 - mii/deployment.py | 4 ++-- mii/models/score/generate.py | 4 ++-- mii/server.py | 43 ++++++++++++++++++++---------------- 4 files changed, 28 insertions(+), 24 deletions(-) diff --git a/mii/__init__.py b/mii/__init__.py index 5c84d1dc..b0008c06 100644 --- a/mii/__init__.py +++ b/mii/__init__.py @@ -15,7 +15,6 @@ __version__ = "0.0.0" non_persistent_models = {} -multi_model_deployments = {} try: from .version import __version__ except ImportError: diff --git a/mii/deployment.py b/mii/deployment.py index ecdb95c2..60a4c8cf 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -74,7 +74,6 @@ def deploy(task=None, else: assert deployment_tag is not None, "deployment_tag must be set to deploy multiple models" - mii.multi_model_deployments[deployment_tag] = deployments ports = set() # parse and validate mii config for deployment in deployments: @@ -135,7 +134,8 @@ def deploy(task=None, base_port + mii_config.tensor_parallel)) torch_dist_port = mii_config.torch_dist_port + i replica_configs.append( - ReplicaConfig(hostname=hostname, + ReplicaConfig(deployment_name = deployment.deployment_name, + hostname=hostname, tensor_parallel_ports=tensor_parallel_ports, torch_dist_port=torch_dist_port, gpu_indices=gpu_indices)) diff --git a/mii/models/score/generate.py b/mii/models/score/generate.py index 27716bd6..ecbfeea7 100644 --- a/mii/models/score/generate.py +++ b/mii/models/score/generate.py @@ -28,8 +28,8 @@ def create_score_file(deployment_tag, config_dict[deployment.deployment_name][mii.constants.ENABLE_DEEPSPEED_ZERO_KEY] = deployment.enable_zero config_dict[deployment.deployment_name][mii.constants.DEEPSPEED_CONFIG_KEY] = deployment.ds_config - if lb_config is not None: - config_dict[deployment.deployment_name][mii.constants.LOAD_BALANCER_CONFIG_KEY] = lb_config + if lb_config is not None: + config_dict[mii.constants.LOAD_BALANCER_CONFIG_KEY] = lb_config if len(mii.__path__) > 1: logger.warning( diff --git a/mii/server.py b/mii/server.py index 496ecd01..3325bc48 100644 --- a/mii/server.py +++ b/mii/server.py @@ -37,18 +37,17 @@ def __init__(self, #mii_configs = mii.config.MIIConfig(**mii_configs) #self.task = mii.utils.get_task(task_name) - + self.deployments = deployments for deployment in deployments: - assert get_num_gpus(deployment.mii_configs) > 0, f"GPU count for {deployment.deployment_name} must be greater than 0" - - #self.port_number = mii_configs.port_number - - if mii_configs.hostfile is None: - hostfile = tempfile.NamedTemporaryFile(delete=False) - num_gpu = torch.cuda.device_count() - with open(hostfile, "w") as f: - f.write(f"localhost slots={num_gpu}") - mii.configs.hostfile = hostfile + assert get_num_gpus(deployment.mii_config) > 0, f"GPU count for {deployment.deployment_name} must be greater than 0" + mii_configs = deployment.mii_config + deployment.task = mii.utils.get_task(deployment.task) + if mii_configs.hostfile 
is None: + hostfile = tempfile.NamedTemporaryFile(delete=False) + num_gpu = torch.cuda.device_count() + with open(hostfile, "w") as f: + f.write(f"localhost slots={num_gpu}") + mii.configs.hostfile = hostfile processes = self._initialize_service(deployment_tag, deployments, @@ -106,7 +105,7 @@ def _build_server_args(self, b64_config_str = config_to_b64_str(mii_configs) task = "" - for deployment in deployments: + for deployment in self.deployments: if deployment_name == deployment.deployment_name: task = deployment.task break @@ -139,7 +138,7 @@ def create_config_from_dict(tmpdir, config_dict): f"Expected a string path to an existing deepspeed config, or a dictionary. Received: {ds_config}" ) server_args_str += f" --ds-config {ds_config_path}" - printable_config = f"task-name task model {model_name} model-path {model_path} port 50050 provider {provider}" + printable_config = f"task-name {task} model {model_name} model-path {model_path} port {port} provider {provider}" logger.info(f"MII using multi-gpu deepspeed launcher:\n" + self.print_helper(printable_config)) return server_args_str @@ -288,6 +287,12 @@ def _initialize_service(self, # Start replica instances for i, repl_config in enumerate(lb_config.replica_configs): name = repl_config.deployment_name + deployment = None + print (f"IN SERVER NAME -> {name}") + for dep in deployments: + print(f"\nDEPLOYMENT_NAME {dep.deployment_name}") + if dep.deployment_name == name: + deployment = dep hostfile = tempfile.NamedTemporaryFile(delete=False) hostfile.write( f'{repl_config.hostname} slots={max(host_gpus[repl_config.hostname])+1}\n' @@ -295,16 +300,16 @@ def _initialize_service(self, processes.append( self._launch_deepspeed( name, - deployments[name].model, + deployment.model, model_path, - deployments[name].enable_deepspeed, - deployments[name].enable_zero, - deployments[name].ds_config, - deployments[name].mii_configs, + deployment.enable_deepspeed, + deployment.enable_zero, + deployment.ds_config, + deployment.mii_config, hostfile.name, repl_config.hostname, repl_config.tensor_parallel_ports[0], - mii_configs.torch_dist_port + (100 * i) + repl_config.gpu_indices[0], + deployment.mii_config.torch_dist_port + (100 * i) + repl_config.gpu_indices[0], repl_config.gpu_indices)) # start load balancer here. 
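
A note on the per-deployment lookups in the patch above: both _build_server_args and _initialize_service resolve a deployment by scanning self.deployments for a matching deployment_name. The snippet below is an illustrative sketch only, not code from this series; it assumes Deployment-like objects that expose a deployment_name attribute, and the DeploymentIndex name is hypothetical. A name-keyed dict gives the same result without repeating the linear scan.

from typing import Dict, List


class DeploymentIndex:
    """Maps deployment_name -> deployment object for O(1) lookups."""
    def __init__(self, deployments: List):
        # assumes each deployment exposes .deployment_name, as the
        # Deployment objects used in the patches above do
        self._by_name: Dict[str, object] = {
            d.deployment_name: d for d in deployments
        }

    def get(self, deployment_name: str):
        # raises KeyError if a replica config names an unknown deployment
        return self._by_name[deployment_name]


# usage sketch inside _initialize_service:
#   index = DeploymentIndex(deployments)
#   for i, repl_config in enumerate(lb_config.replica_configs):
#       deployment = index.get(repl_config.deployment_name)
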
From 4d4e0d8795a4db46d39246960e6cc4284f3072b6 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Thu, 29 Jun 2023 18:43:09 +0000 Subject: [PATCH 20/69] initial lb changes --- mii/grpc_related/modelresponse_server.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index 47325f6c..a92dcb2a 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -168,11 +168,12 @@ def __init__(self, task_name, replica_configs): super().__init__() self.asyncio_loop = asyncio.get_event_loop() - self.stubs = [ - ParallelStubInvoker(replica.hostname, - replica.tensor_parallel_ports) - for replica in replica_configs - ] + self.stubs = {} + for repl in replica_configs: + stubs[repl.deployment_name] = [ParallelStubInvoker(replica.hostname, + replica.tensor_parallel_ports) + for replica in replica_configs + ] print(self.stubs) self.counter = AtomicCounter() self.task = get_task(task_name) From f801b360e5dcd4fe1a49129d65f634a208325013 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Thu, 29 Jun 2023 20:59:25 +0000 Subject: [PATCH 21/69] More load balancing changes --- mii/grpc_related/modelresponse_server.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index a92dcb2a..dc5772b5 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -169,16 +169,24 @@ def __init__(self, task_name, replica_configs): self.asyncio_loop = asyncio.get_event_loop() self.stubs = {} + self.counter = {} for repl in replica_configs: - stubs[repl.deployment_name] = [ParallelStubInvoker(replica.hostname, + stubs[repl.deployment_name] = [] + self.counter[repl.deployment_name] = AtomicCounter() + + + for repl in replica_configs: + stubs[repl.deployment_name].extend(ParallelStubInvoker(replica.hostname, replica.tensor_parallel_ports) for replica in replica_configs - ] + ) print(self.stubs) + """ self.counter = AtomicCounter() self.task = get_task(task_name) self.replica_sessions = {} - + """ + # Start the asyncio loop in a separate thread def run_asyncio_loop(loop): asyncio.set_event_loop(loop) From fd4e2ed030d817a5df12d8b979c46c9be23aa28b Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Fri, 30 Jun 2023 18:18:43 +0000 Subject: [PATCH 22/69] LB changes and syntax --- mii/grpc_related/modelresponse_server.py | 29 +++++++++++-------- mii/server.py | 36 +++++++++++++----------- 2 files changed, 36 insertions(+), 29 deletions(-) diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index dc5772b5..bbf10857 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -171,16 +171,15 @@ def __init__(self, task_name, replica_configs): self.stubs = {} self.counter = {} for repl in replica_configs: - stubs[repl.deployment_name] = [] + self.stubs[repl.deployment_name] = [] self.counter[repl.deployment_name] = AtomicCounter() for repl in replica_configs: - stubs[repl.deployment_name].extend(ParallelStubInvoker(replica.hostname, + self.stubs[repl.deployment_name].extend(ParallelStubInvoker(replica.hostname, replica.tensor_parallel_ports) - for replica in replica_configs - ) - print(self.stubs) + for replica in replica_configs if replica.deployment_name == repl.deployment_name) + print(f"\nSTUBS-> {self.stubs}\nCOUNTERS-> {self.counter}") """ self.counter 
= AtomicCounter() self.task = get_task(task_name) @@ -200,7 +199,14 @@ def choose_stub(self, call_count): def intercept_service(self, continuation, handler_call_details): next_handler = continuation(handler_call_details) assert next_handler.unary_unary is not None - + deployment_name = "" + #USE KWARGS LIKE THEY ARE USED TO MAKE SESSIONS TO GET THE DEPLOYMENT NAME TO HASH THE COUNTERS/STUBS + kwargs = unpack_proto_query_kwargs(request_proto.query_kwargs) + assert "deployment_name" in kwargs, "Must include deployment_name in kwargs for query" + deployment_name = kwargs['deployment_name'] + + print(f"\nDEPLOYMENT NAME WITHIN INTERCEPTOR -> {deployment_name}") + def invoke_intercept_method(request_proto, context): method_name = _get_grpc_method_name(handler_call_details.method) @@ -211,30 +217,29 @@ def invoke_intercept_method(request_proto, context): self.asyncio_loop.call_soon_threadsafe(self.asyncio_loop.stop) return next_handler.unary_unary(request_proto, context) - call_count = self.counter.get_and_increment() - replica_index = call_count % len(self.stubs) + call_count = self.counter[deployment_name].get_and_increment() + replica_index = call_count % len(self.stubs[deployment_name]) if method_name == CREATE_SESSION_METHOD: if request_proto.session_id in self.sessions: raise ValueError( f"session {request_proto.session_id} already exists") self.replica_sessions[request_proto.session_id] = replica_index - self.stubs[replica_index].invoke(CREATE_SESSION_METHOD, request_proto) + self.stubs[deployment_name][replica_index].invoke(CREATE_SESSION_METHOD, request_proto) return google_dot_protobuf_dot_empty__pb2.Empty() if method_name == DESTROY_SESSION_METHOD: replica_index = self.replica_sessions.pop(request_proto.session_id) - self.stubs[replica_index].invoke(DESTROY_SESSION_METHOD, request_proto) + self.stubs[deployment_name][replica_index].invoke(DESTROY_SESSION_METHOD, request_proto) return google_dot_protobuf_dot_empty__pb2.Empty() - kwargs = unpack_proto_query_kwargs(request_proto.query_kwargs) if "session_id" in kwargs: session_id = kwargs["session_id"] if session_id not in self.replica_sessions: raise ValueError(f"session not found") replica_index = self.replica_sessions[session_id] - ret = self.stubs[replica_index].invoke(method_name, request_proto) + ret = self.stubs[deployment_name][replica_index].invoke(method_name, request_proto) return ret return grpc.unary_unary_rpc_method_handler( diff --git a/mii/server.py b/mii/server.py index 3325bc48..dc0768b8 100644 --- a/mii/server.py +++ b/mii/server.py @@ -317,25 +317,27 @@ def _initialize_service(self, # The deepspeed launcher determines the number of processes to launch based on GPUs available on the host or CUDA_VISIBLE_DEVICES, # and it is expected to assign one GPU to one process. 
processes.append( - self._launch_load_balancer(deployment_name, - model_name, + self._launch_load_balancer(self.deployments[0].deployment_name, + self.deployments[0].model, model_path, - ds_optimize, - ds_zero, - ds_config, - mii_configs, + self.deployments[0].enable_deepspeed, + self.deployments[0].enable_zero, + self.deployments[0].ds_config, + self.deployments[0].mii_config, lb_config)) - if mii_configs.enable_restful_api: - # start rest api server - processes.append( - self._launch_restful_gateway(deployment_name, - model_name, - model_path, - ds_optimize, - ds_zero, - ds_config, - mii_configs, - mii_configs.port_number)) + for deployment in self.deployments: + if deployment.mii_config.enable_restful_api: + # start rest api server + processes.append( + self._launch_restful_gateway(deployment.deployment_name, + deployment.model, + model_path, + deployment.enable_deepspeed, + deployment.enable_zero, + deployment.ds_config, + deployment.mii_config, + deployment.mii_config.port_number)) + break return processes From 0a3b7e5cab714a1466dc7264432e68b6101dc289 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Fri, 30 Jun 2023 21:13:25 +0000 Subject: [PATCH 23/69] Refactor client, and unpack request in load balancer --- mii/client.py | 15 +++++----- mii/grpc_related/modelresponse_server.py | 36 +++++++++++++++++++----- 2 files changed, 37 insertions(+), 14 deletions(-) diff --git a/mii/client.py b/mii/client.py index 535b55c8..31216f47 100644 --- a/mii/client.py +++ b/mii/client.py @@ -12,17 +12,17 @@ from mii.method_table import GRPC_METHOD_TABLE -def _get_deployment_info(deployment_name): - configs = mii.utils.import_score_file(deployment_name).configs - task = configs[mii.constants.TASK_NAME_KEY] - mii_configs_dict = configs[mii.constants.MII_CONFIGS_KEY] +def _get_deployment_info(deployment_tag, deployment_name): + configs = mii.utils.import_score_file(deployment_tag).configs + task = configs[deployment_name][mii.constants.TASK_NAME_KEY] + mii_configs_dict = configs[deployment_name][mii.constants.MII_CONFIGS_KEY] mii_configs = mii.config.MIIConfig(**mii_configs_dict) assert task is not None, "The task name should be set before calling init" return task, mii_configs -def mii_query_handle(deployment_name): +def mii_query_handle(deployment_tag, deployment_name): """Get a query handle for a local deployment: mii/examples/local/gpt2-query-example.py @@ -39,7 +39,7 @@ def mii_query_handle(deployment_name): inference_pipeline, task = mii.non_persistent_models[deployment_name] return MIINonPersistentClient(task, deployment_name) - task_name, mii_configs = _get_deployment_info(deployment_name) + task_name, mii_configs = _get_deployment_info(deployment_tag, deployment_name) return MIIClient(task_name, "localhost", mii_configs.port_number) @@ -60,7 +60,8 @@ def __init__(self, task_name, host, port): channel = create_channel(host, port) self.stub = modelresponse_pb2_grpc.ModelResponseStub(channel) self.task = get_task(task_name) - + + print(f"IN CLEINT TASK -> {self.task}\n STUB -> {self.stub}") async def _request_async_response(self, request_dict, **query_kwargs): if self.task not in GRPC_METHOD_TABLE: raise ValueError(f"unknown task: {self.task}") diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index bbf10857..6792af63 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -16,7 +16,7 @@ from mii.constants import GRPC_MAX_MSG_SIZE, CREATE_SESSION_METHOD, DESTROY_SESSION_METHOD, TERMINATE_METHOD, 
LB_MAX_WORKER_THREADS, SERVER_SHUTDOWN_TIMEOUT, Tasks from mii.method_table import GRPC_METHOD_TABLE from mii.client import create_channel -from mii.utils import get_task, unpack_proto_query_kwargs +from mii.utils import get_task, unpack_proto_query_kwargs, kwarg_dict_to_proto class ServiceBase(modelresponse_pb2_grpc.ModelResponseServicer): @@ -199,15 +199,37 @@ def choose_stub(self, call_count): def intercept_service(self, continuation, handler_call_details): next_handler = continuation(handler_call_details) assert next_handler.unary_unary is not None - deployment_name = "" #USE KWARGS LIKE THEY ARE USED TO MAKE SESSIONS TO GET THE DEPLOYMENT NAME TO HASH THE COUNTERS/STUBS - kwargs = unpack_proto_query_kwargs(request_proto.query_kwargs) - assert "deployment_name" in kwargs, "Must include deployment_name in kwargs for query" - deployment_name = kwargs['deployment_name'] - print(f"\nDEPLOYMENT NAME WITHIN INTERCEPTOR -> {deployment_name}") - def invoke_intercept_method(request_proto, context): + kwargs = unpack_proto_query_kwargs(request_proto.query_kwargs) + assert "deployment_name" in kwargs, "Must include deployment_name in kwargs for query" + deployment_name = kwargs.get('deployment_name') + del kwargs['deployment_name'] + kwargs = kwarg_dict_to_proto(**kwargs) + task = None + for repl in replica_configs: + if repl.deployment_name == deployment_name: + task = repl.task + break + method = GRPC_METHOD_TABLE[get_task(task)] + if method_name == "ConversationalReply": + request_dict = {} + request_dict['text'] = request_proto.text + request_dict['conversation_id'] = getattr(request_proto, 'conversation_id') + request_dict['past_user_inputs'] = request_proto.past_user_inputs + request_dict['generated_responses'] = request_proto.generated_responses + request_proto = method.pack_request_to_proto(request_dict, kwargs) + + elif method_name == "QuestionAndAnswerReply": + request_dict = {} + request_dict['question'] = request_proto.question + request_dict['context'] = requet_proto.context + request_proto = method.pack_request_to_proto(request_dict, kwargs) + else + request_proto = method.pack_request_to_proto(request_proto.query, kwargs) + + print(f"\nDEPLOYMENT NAME WITHIN INTERCEPTOR -> {deployment_name}") method_name = _get_grpc_method_name(handler_call_details.method) if method_name == TERMINATE_METHOD: From 6523c0477c204ce7a02a81d837fd1af4ba0587e3 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Mon, 3 Jul 2023 17:08:15 +0000 Subject: [PATCH 24/69] First working queries --- examples/multi_model/query.py | 11 ++++++ examples/multi_model/shutdown.py | 2 ++ .../text-generation-bloom560m-example.py | 18 ++++++++++ mii/config.py | 3 +- mii/deployment.py | 3 +- mii/grpc_related/modelresponse_server.py | 35 ++++++++++++------- mii/method_table.py | 3 ++ 7 files changed, 60 insertions(+), 15 deletions(-) create mode 100644 examples/multi_model/query.py create mode 100644 examples/multi_model/shutdown.py create mode 100644 examples/multi_model/text-generation-bloom560m-example.py diff --git a/examples/multi_model/query.py b/examples/multi_model/query.py new file mode 100644 index 00000000..052c293d --- /dev/null +++ b/examples/multi_model/query.py @@ -0,0 +1,11 @@ +import mii +import time +generator = mii.mii_query_handle("first_test", "bloom560m_deployment") +result = generator.query({"query": ["DeepSpeed is", "Seattle is"]}, do_sample=True, max_new_tokens = 30, deployment_name = "bloom560m_deployment") +print(result) + +time.sleep(5) +generator2 = mii.mii_query_handle("first_test", 
"microsoft/DialogRPT-human-vs-rand_deployment") +result = generator2.query({'query': "DeepSpeed is the greatest"}, deployment_name = "microsoft/DialogRPT-human-vs-rand_deployment") +print(result) + diff --git a/examples/multi_model/shutdown.py b/examples/multi_model/shutdown.py new file mode 100644 index 00000000..5f082f2f --- /dev/null +++ b/examples/multi_model/shutdown.py @@ -0,0 +1,2 @@ +import mii +mii.terminate("bloom560m_deployment") diff --git a/examples/multi_model/text-generation-bloom560m-example.py b/examples/multi_model/text-generation-bloom560m-example.py new file mode 100644 index 00000000..6b5d25fe --- /dev/null +++ b/examples/multi_model/text-generation-bloom560m-example.py @@ -0,0 +1,18 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +import mii + +deployments = [] +mii_configs1 = {"tensor_parallel": 1, "dtype": "fp16"} +deployments.append(mii.Deployment(task='text-generation', + model="bigscience/bloom-560m", + deployment_name="bloom560m_deployment", + mii_config=mii.config.MIIConfig(**mii_configs1))) + +# gpt2 +name = "microsoft/DialogRPT-human-vs-rand" +deployments.append(mii.Deployment(task='text-classification', model=name, deployment_name=name + "_deployment")) + +mii.deploy(deployment_tag="first_test", deployments=deployments) diff --git a/mii/config.py b/mii/config.py index 531ee800..e425a2e6 100644 --- a/mii/config.py +++ b/mii/config.py @@ -6,7 +6,7 @@ from typing import Union, List from enum import Enum from pydantic import BaseModel, validator, root_validator - +from .constants import Tasks from deepspeed.launcher.runner import DLTS_HOSTFILE @@ -107,6 +107,7 @@ class Config: class ReplicaConfig(BaseModel): + task: str = "" deployment_name: str = "" hostname: str = "" tensor_parallel_ports: List[int] = [] diff --git a/mii/deployment.py b/mii/deployment.py index 60a4c8cf..7f35c264 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -134,7 +134,8 @@ def deploy(task=None, base_port + mii_config.tensor_parallel)) torch_dist_port = mii_config.torch_dist_port + i replica_configs.append( - ReplicaConfig(deployment_name = deployment.deployment_name, + ReplicaConfig(task=get_task_name(deployment.task), + deployment_name = deployment.deployment_name, hostname=hostname, tensor_parallel_ports=tensor_parallel_ports, torch_dist_port=torch_dist_port, diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index 6792af63..1db6f77f 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -170,6 +170,7 @@ def __init__(self, task_name, replica_configs): self.stubs = {} self.counter = {} + self.replica_configs = replica_configs for repl in replica_configs: self.stubs[repl.deployment_name] = [] self.counter[repl.deployment_name] = AtomicCounter() @@ -202,42 +203,48 @@ def intercept_service(self, continuation, handler_call_details): #USE KWARGS LIKE THEY ARE USED TO MAKE SESSIONS TO GET THE DEPLOYMENT NAME TO HASH THE COUNTERS/STUBS def invoke_intercept_method(request_proto, context): + method_name = _get_grpc_method_name(handler_call_details.method) kwargs = unpack_proto_query_kwargs(request_proto.query_kwargs) assert "deployment_name" in kwargs, "Must include deployment_name in kwargs for query" deployment_name = kwargs.get('deployment_name') - del kwargs['deployment_name'] - kwargs = kwarg_dict_to_proto(**kwargs) + kwargs.pop('deployment_name', None) task = None - for repl in replica_configs: + for repl in self.replica_configs: if 
repl.deployment_name == deployment_name: task = repl.task break + print(f"\nTASK ->{task}") method = GRPC_METHOD_TABLE[get_task(task)] + new_request = None if method_name == "ConversationalReply": request_dict = {} request_dict['text'] = request_proto.text - request_dict['conversation_id'] = getattr(request_proto, 'conversation_id') + val = getattr(request_proto, 'conversation_id') + request_dict['conversation_id'] = int(val) if val is not None else None request_dict['past_user_inputs'] = request_proto.past_user_inputs request_dict['generated_responses'] = request_proto.generated_responses - request_proto = method.pack_request_to_proto(request_dict, kwargs) + new_request = method.pack_request_to_proto(request_dict, **kwargs) elif method_name == "QuestionAndAnswerReply": request_dict = {} request_dict['question'] = request_proto.question request_dict['context'] = requet_proto.context - request_proto = method.pack_request_to_proto(request_dict, kwargs) - else - request_proto = method.pack_request_to_proto(request_proto.query, kwargs) + new_request = method.pack_request_to_proto(request_dict, **kwargs) + else: + request_dict = {} + request_dict["query"] = list(request_proto.request) if method_name == "GeneratorReply" or method_name == "Txt2ImgReply" else str(request_proto.request) + print(f"HERE request_dict -> {request_dict}\nKWARGS-> {kwargs}") + new_request = method.pack_request_to_proto(request_dict, **kwargs) + print("done?") print(f"\nDEPLOYMENT NAME WITHIN INTERCEPTOR -> {deployment_name}") - method_name = _get_grpc_method_name(handler_call_details.method) if method_name == TERMINATE_METHOD: for stub in self.stubs: stub.invoke(TERMINATE_METHOD, google_dot_protobuf_dot_empty__pb2.Empty()) self.asyncio_loop.call_soon_threadsafe(self.asyncio_loop.stop) - return next_handler.unary_unary(request_proto, context) + return next_handler.unary_unary(new_request, context) call_count = self.counter[deployment_name].get_and_increment() replica_index = call_count % len(self.stubs[deployment_name]) @@ -247,12 +254,12 @@ def invoke_intercept_method(request_proto, context): raise ValueError( f"session {request_proto.session_id} already exists") self.replica_sessions[request_proto.session_id] = replica_index - self.stubs[deployment_name][replica_index].invoke(CREATE_SESSION_METHOD, request_proto) + self.stubs[deployment_name][replica_index].invoke(CREATE_SESSION_METHOD, new_request) return google_dot_protobuf_dot_empty__pb2.Empty() if method_name == DESTROY_SESSION_METHOD: replica_index = self.replica_sessions.pop(request_proto.session_id) - self.stubs[deployment_name][replica_index].invoke(DESTROY_SESSION_METHOD, request_proto) + self.stubs[deployment_name][replica_index].invoke(DESTROY_SESSION_METHOD, new_request) return google_dot_protobuf_dot_empty__pb2.Empty() if "session_id" in kwargs: @@ -261,7 +268,9 @@ def invoke_intercept_method(request_proto, context): raise ValueError(f"session not found") replica_index = self.replica_sessions[session_id] - ret = self.stubs[deployment_name][replica_index].invoke(method_name, request_proto) + assert new_request is not None, "test" + print("ASSERT DONE") + ret = self.stubs[deployment_name][replica_index].invoke(method_name, new_request) return ret return grpc.unary_unary_rpc_method_handler( diff --git a/mii/method_table.py b/mii/method_table.py index c412f446..8dfea390 100644 --- a/mii/method_table.py +++ b/mii/method_table.py @@ -23,6 +23,9 @@ def single_string_response_to_proto(self, response, time_taken, model_time_taken def 
multi_string_request_to_proto(self, request_dict, **query_kwargs): + temp = kwarg_dict_to_proto(query_kwargs) + print(f"FINE {temp}\nrd->{request_dict}") + print(isinstance(request_dict['query'], list)) return modelresponse_pb2.MultiStringRequest( request=request_dict['query'] if isinstance(request_dict['query'], list) else [request_dict['query']], From 06b40f5ee3c469261610b52d6dfac4153f28000b Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Mon, 3 Jul 2023 21:53:37 +0000 Subject: [PATCH 25/69] Fixing conversational and q&a args --- mii/grpc_related/modelresponse_server.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index 1db6f77f..a77d000b 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -213,22 +213,22 @@ def invoke_intercept_method(request_proto, context): if repl.deployment_name == deployment_name: task = repl.task break - print(f"\nTASK ->{task}") + print(f"\nTASK ->{task}\nMETHOD NAME-> {method_name}") method = GRPC_METHOD_TABLE[get_task(task)] new_request = None if method_name == "ConversationalReply": request_dict = {} - request_dict['text'] = request_proto.text + request_dict['text'] = str(request_proto.text) val = getattr(request_proto, 'conversation_id') request_dict['conversation_id'] = int(val) if val is not None else None - request_dict['past_user_inputs'] = request_proto.past_user_inputs - request_dict['generated_responses'] = request_proto.generated_responses + request_dict['past_user_inputs'] = list(request_proto.past_user_inputs) + request_dict['generated_responses'] = list(request_proto.generated_responses) new_request = method.pack_request_to_proto(request_dict, **kwargs) elif method_name == "QuestionAndAnswerReply": request_dict = {} - request_dict['question'] = request_proto.question - request_dict['context'] = requet_proto.context + request_dict['question'] = str(request_proto.question) + request_dict['context'] = str(requet_proto.context) new_request = method.pack_request_to_proto(request_dict, **kwargs) else: request_dict = {} @@ -270,6 +270,7 @@ def invoke_intercept_method(request_proto, context): assert new_request is not None, "test" print("ASSERT DONE") + print(new_request.query_kwargs) ret = self.stubs[deployment_name][replica_index].invoke(method_name, new_request) return ret From 96d0dcb8d332153ad28df639c999e70dc6baff3f Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Wed, 5 Jul 2023 18:33:46 +0000 Subject: [PATCH 26/69] Updates to _allocate_processes and fixing example --- examples/multi_model/query.py | 11 ++++++++ .../text-generation-bloom560m-example.py | 17 ++++++++++-- mii/client.py | 1 - mii/deployment.py | 26 ++++++++++++++----- mii/grpc_related/modelresponse_server.py | 2 +- 5 files changed, 46 insertions(+), 11 deletions(-) diff --git a/examples/multi_model/query.py b/examples/multi_model/query.py index 052c293d..e4bfd8d9 100644 --- a/examples/multi_model/query.py +++ b/examples/multi_model/query.py @@ -1,5 +1,6 @@ import mii import time + generator = mii.mii_query_handle("first_test", "bloom560m_deployment") result = generator.query({"query": ["DeepSpeed is", "Seattle is"]}, do_sample=True, max_new_tokens = 30, deployment_name = "bloom560m_deployment") print(result) @@ -9,3 +10,13 @@ result = generator2.query({'query': "DeepSpeed is the greatest"}, deployment_name = "microsoft/DialogRPT-human-vs-rand_deployment") print(result) +time.sleep(5) + +generator3 = 
mii.mii_query_handle("first_test", "microsoft/DialoGPT-large_deployment") +result = generator3.query({'text': "DeepSpeed is the greatest", + 'conversation_id': 3, + 'past_user_inputs': [], + 'generated_responses': [] + }, deployment_name= "microsoft/DialoGPT-large_deployment") +print(result) + diff --git a/examples/multi_model/text-generation-bloom560m-example.py b/examples/multi_model/text-generation-bloom560m-example.py index 6b5d25fe..b1d0d6b6 100644 --- a/examples/multi_model/text-generation-bloom560m-example.py +++ b/examples/multi_model/text-generation-bloom560m-example.py @@ -4,15 +4,28 @@ # DeepSpeed Team import mii +gpu_index_map1 = {'master': [0]} +gpu_index_map2 = {'master': [1]} +gpu_index_map3 = {'master': [0, 1]} + deployments = [] -mii_configs1 = {"tensor_parallel": 1, "dtype": "fp16"} +mii_configs1 = {"tensor_parallel": 2, "dtype": "fp16"} deployments.append(mii.Deployment(task='text-generation', model="bigscience/bloom-560m", deployment_name="bloom560m_deployment", + GPU_index_map=gpu_index_map3, mii_config=mii.config.MIIConfig(**mii_configs1))) # gpt2 name = "microsoft/DialogRPT-human-vs-rand" -deployments.append(mii.Deployment(task='text-classification', model=name, deployment_name=name + "_deployment")) +deployments.append(mii.Deployment(task='text-classification', model=name, deployment_name=name + "_deployment", GPU_index_map=gpu_index_map2)) + +mii_configs2 = {"tensor_parallel": 1} + + +name = "microsoft/DialoGPT-large" + +deployments.append(mii.Deployment(task='conversational', model=name, deployment_name=name + "_deployment", GPU_index_map=gpu_index_map1, mii_config=mii.config.MIIConfig(**mii_configs2))) + mii.deploy(deployment_tag="first_test", deployments=deployments) diff --git a/mii/client.py b/mii/client.py index 31216f47..fe884ed1 100644 --- a/mii/client.py +++ b/mii/client.py @@ -61,7 +61,6 @@ def __init__(self, task_name, host, port): self.stub = modelresponse_pb2_grpc.ModelResponseStub(channel) self.task = get_task(task_name) - print(f"IN CLEINT TASK -> {self.task}\n STUB -> {self.stub}") async def _request_async_response(self, request_dict, **query_kwargs): if self.task not in GRPC_METHOD_TABLE: raise ValueError(f"unknown task: {self.task}") diff --git a/mii/deployment.py b/mii/deployment.py index 7f35c264..330acd51 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -74,13 +74,9 @@ def deploy(task=None, else: assert deployment_tag is not None, "deployment_tag must be set to deploy multiple models" - ports = set() # parse and validate mii config for deployment in deployments: mii_config = deployment.mii_config - print(mii_config) - assert mii_config.port_number not in ports, f"duplicate port numbers not allowed - {mii_config.port_number}" - #ports.add(mii_config.port_number) if deployment.enable_zero: if deployment.ds_config.get("fp16", {}).get("enabled", False): assert (mii_config.dtype == torch.half), "MII Config Error: MII dtype and ZeRO dtype must match" @@ -119,19 +115,25 @@ def deploy(task=None, # add fields for replica deployment replica_configs = [] + ports = set() for deployment in deployments: mii_config = deployment.mii_config replica_pool = _allocate_processes(mii_config.hostfile, mii_config.tensor_parallel, - mii_config.replica_num) + mii_config.replica_num, + deployment.GPU_index_map) for i, (hostname, gpu_indices) in enumerate(replica_pool): # Reserver port for a LB proxy when replication is enabled port_offset = 1 base_port = mii_config.port_number + i * mii_config.tensor_parallel + port_offset + if base_port in ports: + base_port = 
max(ports) + 1 tensor_parallel_ports = list( range(base_port, base_port + mii_config.tensor_parallel)) + for i in range(base_port, base_port + mii_config.tensor_parallel): + ports.add(i) torch_dist_port = mii_config.torch_dist_port + i replica_configs.append( ReplicaConfig(task=get_task_name(deployment.task), @@ -186,12 +188,22 @@ def _deploy_aml(deployment_tag, model_name, version): print("Please run 'deploy.sh' to bring your deployment online") -def _allocate_processes(hostfile_path, tensor_parallel, num_replicas): +def _allocate_processes(hostfile_path, tensor_parallel, num_replicas, gpu_index_map=None): resource_pool = fetch_hostfile(hostfile_path) assert resource_pool is not None and len( resource_pool) > 0, f'No hosts found in {hostfile_path}' - + replica_pool = [] + + if gpu_index_map is not None: + assert len(gpu_index_map) == num_replicas, "Number of Hosts must match number of replicas" + for host in gpu_index_map: + assert host in resource_pool, f"Host: {host} was not found" + assert resource_pool[host] >= tensor_parallel, f"Host {host} has {slots} slot(s), but {tensor_parallel} slot(s) are required" + for host in gpu_index_map: + replica_pool.append((host, gpu_index_map[host])) + return replica_pool + allocated_num = 0 for host, slots in resource_pool.items(): available_on_host = slots diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index a77d000b..7ca739aa 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -175,7 +175,7 @@ def __init__(self, task_name, replica_configs): self.stubs[repl.deployment_name] = [] self.counter[repl.deployment_name] = AtomicCounter() - + print(replica_configs) for repl in replica_configs: self.stubs[repl.deployment_name].extend(ParallelStubInvoker(replica.hostname, replica.tensor_parallel_ports) From ab41d24b6445a22e1e171f544c9c4ab16ee333d7 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Wed, 5 Jul 2023 21:36:10 +0000 Subject: [PATCH 27/69] Adding host map for allocating processes and formatting --- examples/multi_model/query.py | 31 +++++++---- examples/multi_model/shutdown.py | 5 ++ .../text-generation-bloom560m-example.py | 26 ++++++---- mii/client.py | 2 +- mii/config.py | 3 +- mii/deployment.py | 51 ++++++++++++------- mii/grpc_related/modelresponse_server.py | 37 +++++++++----- mii/models/score/generate.py | 27 ++++++---- mii/models/score/score_template.py | 12 ++++- mii/server.py | 19 +++---- 10 files changed, 140 insertions(+), 73 deletions(-) diff --git a/examples/multi_model/query.py b/examples/multi_model/query.py index e4bfd8d9..37c55476 100644 --- a/examples/multi_model/query.py +++ b/examples/multi_model/query.py @@ -1,22 +1,35 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import mii import time generator = mii.mii_query_handle("first_test", "bloom560m_deployment") -result = generator.query({"query": ["DeepSpeed is", "Seattle is"]}, do_sample=True, max_new_tokens = 30, deployment_name = "bloom560m_deployment") +result = generator.query({"query": ["DeepSpeed is", + "Seattle is"]}, + do_sample=True, + max_new_tokens=30, + deployment_name="bloom560m_deployment") print(result) time.sleep(5) -generator2 = mii.mii_query_handle("first_test", "microsoft/DialogRPT-human-vs-rand_deployment") -result = generator2.query({'query': "DeepSpeed is the greatest"}, deployment_name = "microsoft/DialogRPT-human-vs-rand_deployment") +generator2 = mii.mii_query_handle("first_test", + "microsoft/DialogRPT-human-vs-rand_deployment") +result = generator2.query({'query': "DeepSpeed is the greatest"}, + deployment_name="microsoft/DialogRPT-human-vs-rand_deployment") print(result) time.sleep(5) generator3 = mii.mii_query_handle("first_test", "microsoft/DialoGPT-large_deployment") -result = generator3.query({'text': "DeepSpeed is the greatest", - 'conversation_id': 3, - 'past_user_inputs': [], - 'generated_responses': [] - }, deployment_name= "microsoft/DialoGPT-large_deployment") +result = generator3.query( + { + 'text': "DeepSpeed is the greatest", + 'conversation_id': 3, + 'past_user_inputs': [], + 'generated_responses': [] + }, + deployment_name="microsoft/DialoGPT-large_deployment") print(result) - diff --git a/examples/multi_model/shutdown.py b/examples/multi_model/shutdown.py index 5f082f2f..11e0b4b9 100644 --- a/examples/multi_model/shutdown.py +++ b/examples/multi_model/shutdown.py @@ -1,2 +1,7 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import mii + mii.terminate("bloom560m_deployment") diff --git a/examples/multi_model/text-generation-bloom560m-example.py b/examples/multi_model/text-generation-bloom560m-example.py index b1d0d6b6..a5dc202c 100644 --- a/examples/multi_model/text-generation-bloom560m-example.py +++ b/examples/multi_model/text-generation-bloom560m-example.py @@ -10,22 +10,30 @@ deployments = [] mii_configs1 = {"tensor_parallel": 2, "dtype": "fp16"} -deployments.append(mii.Deployment(task='text-generation', - model="bigscience/bloom-560m", - deployment_name="bloom560m_deployment", - GPU_index_map=gpu_index_map3, - mii_config=mii.config.MIIConfig(**mii_configs1))) +deployments.append( + mii.Deployment(task='text-generation', + model="bigscience/bloom-560m", + deployment_name="bloom560m_deployment", + GPU_index_map=gpu_index_map3, + mii_config=mii.config.MIIConfig(**mii_configs1))) # gpt2 name = "microsoft/DialogRPT-human-vs-rand" -deployments.append(mii.Deployment(task='text-classification', model=name, deployment_name=name + "_deployment", GPU_index_map=gpu_index_map2)) +deployments.append( + mii.Deployment(task='text-classification', + model=name, + deployment_name=name + "_deployment", + GPU_index_map=gpu_index_map2)) mii_configs2 = {"tensor_parallel": 1} - name = "microsoft/DialoGPT-large" -deployments.append(mii.Deployment(task='conversational', model=name, deployment_name=name + "_deployment", GPU_index_map=gpu_index_map1, mii_config=mii.config.MIIConfig(**mii_configs2))) - +deployments.append( + mii.Deployment(task='conversational', + model=name, + deployment_name=name + "_deployment", + GPU_index_map=gpu_index_map1, + mii_config=mii.config.MIIConfig(**mii_configs2))) mii.deploy(deployment_tag="first_test", deployments=deployments) diff 
--git a/mii/client.py b/mii/client.py index fe884ed1..3a314384 100644 --- a/mii/client.py +++ b/mii/client.py @@ -60,7 +60,7 @@ def __init__(self, task_name, host, port): channel = create_channel(host, port) self.stub = modelresponse_pb2_grpc.ModelResponseStub(channel) self.task = get_task(task_name) - + async def _request_async_response(self, request_dict, **query_kwargs): if self.task not in GRPC_METHOD_TABLE: raise ValueError(f"unknown task: {self.task}") diff --git a/mii/config.py b/mii/config.py index e425a2e6..4eb6b597 100644 --- a/mii/config.py +++ b/mii/config.py @@ -6,7 +6,6 @@ from typing import Union, List from enum import Enum from pydantic import BaseModel, validator, root_validator -from .constants import Tasks from deepspeed.launcher.runner import DLTS_HOSTFILE @@ -125,6 +124,8 @@ class LoadBalancerConfig(BaseModel): class Config: validate_all = True + + validate_assignment = True diff --git a/mii/deployment.py b/mii/deployment.py index 330acd51..95be4c01 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -13,7 +13,7 @@ from .utils import logger, get_task_name, get_provider_name from .models.score import create_score_file from .models import load_models -from .config import ReplicaConfig, LoadBalancerConfig +from .config import ReplicaConfig, LoadBalancerConfig, Deployment def deploy(task=None, @@ -26,7 +26,8 @@ def deploy(task=None, deployment_tag=None, deployments=[], deployment_type=DeploymentType.LOCAL, - model_path=None): + model_path=None, + version=1): """Deploy a task using specified model. For usage examples see: mii/examples/local/text-generation-example.py @@ -69,7 +70,17 @@ def deploy(task=None, """ if not deployments: assert all((model, task, deployment_name)), "model, task, and deployment name must be set to deploy singular model" - deployments = [Deployment(deployment_name, task, model, enable_deepspeed, enable_zero, None, mii_config, ds_config, version)] + deployments = [ + Deployment(deployment_name, + task, + model, + enable_deepspeed, + enable_zero, + None, + mii_config, + ds_config, + version) + ] deployment_tag = deployment_name + "_tag" else: assert deployment_tag is not None, "deployment_tag must be set to deploy multiple models" @@ -96,15 +107,17 @@ def deploy(task=None, if not mii_config.skip_model_check: mii.utils.check_if_task_and_model_is_valid(deployment.task, deployment.model) if enable_deepspeed: - mii.utils.check_if_task_and_model_is_supported(deployment.task, deployment.model) + mii.utils.check_if_task_and_model_is_supported( + deployment.task, + deployment.model) if enable_deepspeed: logger.info( - f"************* MII is using DeepSpeed Optimizations to accelerate your model: {deployment.model} *************" + f"************* MII is using DeepSpeed Optimizations to accelerate your model: {deployment.model} *************" ) else: logger.info( - f"************* DeepSpeed Optimizations not enabled. Please use enable_deepspeed to get better performance for: {deployment.model} *************" + f"************* DeepSpeed Optimizations not enabled. 
Please use enable_deepspeed to get better performance for: {deployment.model} *************" ) # In local deployments use default path if no model path set @@ -115,7 +128,7 @@ def deploy(task=None, # add fields for replica deployment replica_configs = [] - ports = set() + port_map = {} for deployment in deployments: mii_config = deployment.mii_config replica_pool = _allocate_processes(mii_config.hostfile, @@ -125,19 +138,21 @@ def deploy(task=None, for i, (hostname, gpu_indices) in enumerate(replica_pool): # Reserver port for a LB proxy when replication is enabled + if hostname not in port_map: + port_map[hostname] = set() port_offset = 1 base_port = mii_config.port_number + i * mii_config.tensor_parallel + port_offset - if base_port in ports: - base_port = max(ports) + 1 + if base_port in port_map[hostname]: + base_port = max(port_map[hostname]) + 1 tensor_parallel_ports = list( range(base_port, - base_port + mii_config.tensor_parallel)) + base_port + mii_config.tensor_parallel)) for i in range(base_port, base_port + mii_config.tensor_parallel): - ports.add(i) + port_map[hostname].add(i) torch_dist_port = mii_config.torch_dist_port + i replica_configs.append( ReplicaConfig(task=get_task_name(deployment.task), - deployment_name = deployment.deployment_name, + deployment_name=deployment.deployment_name, hostname=hostname, tensor_parallel_ports=tensor_parallel_ports, torch_dist_port=torch_dist_port, @@ -183,23 +198,25 @@ def _deploy_aml(deployment_tag, model_name, version): model_name=model_name, version=version) print( - f"AML deployment assets at {mii.aml_related.utils.aml_output_path(deployment_name)}" + f"AML deployment assets at {mii.aml_related.utils.aml_output_path(deployment_tag)}" ) print("Please run 'deploy.sh' to bring your deployment online") -def _allocate_processes(hostfile_path, tensor_parallel, num_replicas, gpu_index_map=None): +def _allocate_processes(hostfile_path, + tensor_parallel, + num_replicas, + gpu_index_map=None): resource_pool = fetch_hostfile(hostfile_path) assert resource_pool is not None and len( resource_pool) > 0, f'No hosts found in {hostfile_path}' - + replica_pool = [] if gpu_index_map is not None: - assert len(gpu_index_map) == num_replicas, "Number of Hosts must match number of replicas" for host in gpu_index_map: assert host in resource_pool, f"Host: {host} was not found" - assert resource_pool[host] >= tensor_parallel, f"Host {host} has {slots} slot(s), but {tensor_parallel} slot(s) are required" + assert resource_pool[host] >= tensor_parallel, f"Host {host} has {resource_pool[host]} slot(s), but {tensor_parallel} slot(s) are required" for host in gpu_index_map: replica_pool.append((host, gpu_index_map[host])) return replica_pool diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index 7ca739aa..6e97c085 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -16,7 +16,7 @@ from mii.constants import GRPC_MAX_MSG_SIZE, CREATE_SESSION_METHOD, DESTROY_SESSION_METHOD, TERMINATE_METHOD, LB_MAX_WORKER_THREADS, SERVER_SHUTDOWN_TIMEOUT, Tasks from mii.method_table import GRPC_METHOD_TABLE from mii.client import create_channel -from mii.utils import get_task, unpack_proto_query_kwargs, kwarg_dict_to_proto +from mii.utils import get_task, unpack_proto_query_kwargs class ServiceBase(modelresponse_pb2_grpc.ModelResponseServicer): @@ -177,16 +177,18 @@ def __init__(self, task_name, replica_configs): print(replica_configs) for repl in replica_configs: - 
self.stubs[repl.deployment_name].extend(ParallelStubInvoker(replica.hostname, - replica.tensor_parallel_ports) - for replica in replica_configs if replica.deployment_name == repl.deployment_name) + self.stubs[repl.deployment_name].extend( + ParallelStubInvoker(replica.hostname, + replica.tensor_parallel_ports) + for replica in replica_configs + if replica.deployment_name == repl.deployment_name) print(f"\nSTUBS-> {self.stubs}\nCOUNTERS-> {self.counter}") """ self.counter = AtomicCounter() self.task = get_task(task_name) self.replica_sessions = {} """ - + # Start the asyncio loop in a separate thread def run_asyncio_loop(loop): asyncio.set_event_loop(loop) @@ -200,6 +202,7 @@ def choose_stub(self, call_count): def intercept_service(self, continuation, handler_call_details): next_handler = continuation(handler_call_details) assert next_handler.unary_unary is not None + #USE KWARGS LIKE THEY ARE USED TO MAKE SESSIONS TO GET THE DEPLOYMENT NAME TO HASH THE COUNTERS/STUBS def invoke_intercept_method(request_proto, context): @@ -219,20 +222,24 @@ def invoke_intercept_method(request_proto, context): if method_name == "ConversationalReply": request_dict = {} request_dict['text'] = str(request_proto.text) - val = getattr(request_proto, 'conversation_id') + val = getattr(request_proto, 'conversation_id') request_dict['conversation_id'] = int(val) if val is not None else None request_dict['past_user_inputs'] = list(request_proto.past_user_inputs) - request_dict['generated_responses'] = list(request_proto.generated_responses) + request_dict['generated_responses'] = list( + request_proto.generated_responses) new_request = method.pack_request_to_proto(request_dict, **kwargs) elif method_name == "QuestionAndAnswerReply": request_dict = {} request_dict['question'] = str(request_proto.question) - request_dict['context'] = str(requet_proto.context) + request_dict['context'] = str(request_proto.context) new_request = method.pack_request_to_proto(request_dict, **kwargs) else: request_dict = {} - request_dict["query"] = list(request_proto.request) if method_name == "GeneratorReply" or method_name == "Txt2ImgReply" else str(request_proto.request) + request_dict["query"] = list( + request_proto.request + ) if method_name == "GeneratorReply" or method_name == "Txt2ImgReply" else str( + request_proto.request) print(f"HERE request_dict -> {request_dict}\nKWARGS-> {kwargs}") new_request = method.pack_request_to_proto(request_dict, **kwargs) print("done?") @@ -254,12 +261,16 @@ def invoke_intercept_method(request_proto, context): raise ValueError( f"session {request_proto.session_id} already exists") self.replica_sessions[request_proto.session_id] = replica_index - self.stubs[deployment_name][replica_index].invoke(CREATE_SESSION_METHOD, new_request) + self.stubs[deployment_name][replica_index].invoke( + CREATE_SESSION_METHOD, + new_request) return google_dot_protobuf_dot_empty__pb2.Empty() if method_name == DESTROY_SESSION_METHOD: replica_index = self.replica_sessions.pop(request_proto.session_id) - self.stubs[deployment_name][replica_index].invoke(DESTROY_SESSION_METHOD, new_request) + self.stubs[deployment_name][replica_index].invoke( + DESTROY_SESSION_METHOD, + new_request) return google_dot_protobuf_dot_empty__pb2.Empty() if "session_id" in kwargs: @@ -271,7 +282,9 @@ def invoke_intercept_method(request_proto, context): assert new_request is not None, "test" print("ASSERT DONE") print(new_request.query_kwargs) - ret = self.stubs[deployment_name][replica_index].invoke(method_name, new_request) + ret = 
self.stubs[deployment_name][replica_index].invoke( + method_name, + new_request) return ret return grpc.unary_unary_rpc_method_handler( diff --git a/mii/models/score/generate.py b/mii/models/score/generate.py index ecbfeea7..ecd15ffe 100644 --- a/mii/models/score/generate.py +++ b/mii/models/score/generate.py @@ -14,19 +14,26 @@ def create_score_file(deployment_tag, deployments, model_path, lb_config): - + config_dict = {} config_dict[mii.constants.MODEL_PATH_KEY] = model_path config_dict[mii.constants.DEPLOYMENT_TAG_KEY] = deployment_tag for deployment in deployments: config_dict[deployment.deployment_name] = {} - config_dict[deployment.deployment_name][mii.constants.DEPLOYMENT_NAME_KEY] = deployment.deployment_name - config_dict[deployment.deployment_name][mii.constants.TASK_NAME_KEY] = mii.utils.get_task_name(deployment.task) - config_dict[deployment.deployment_name][mii.constants.MODEL_NAME_KEY] = deployment.model - config_dict[deployment.deployment_name][mii.constants.ENABLE_DEEPSPEED_KEY] = deployment.enable_deepspeed - config_dict[deployment.deployment_name][mii.constants.MII_CONFIGS_KEY] = deployment.mii_config.dict() - config_dict[deployment.deployment_name][mii.constants.ENABLE_DEEPSPEED_ZERO_KEY] = deployment.enable_zero - config_dict[deployment.deployment_name][mii.constants.DEEPSPEED_CONFIG_KEY] = deployment.ds_config + config_dict[deployment.deployment_name][ + mii.constants.DEPLOYMENT_NAME_KEY] = deployment.deployment_name + config_dict[deployment.deployment_name][ + mii.constants.TASK_NAME_KEY] = mii.utils.get_task_name(deployment.task) + config_dict[deployment.deployment_name][ + mii.constants.MODEL_NAME_KEY] = deployment.model + config_dict[deployment.deployment_name][ + mii.constants.ENABLE_DEEPSPEED_KEY] = deployment.enable_deepspeed + config_dict[deployment.deployment_name][ + mii.constants.MII_CONFIGS_KEY] = deployment.mii_config.dict() + config_dict[deployment.deployment_name][ + mii.constants.ENABLE_DEEPSPEED_ZERO_KEY] = deployment.enable_zero + config_dict[deployment.deployment_name][ + mii.constants.DEEPSPEED_CONFIG_KEY] = deployment.ds_config if lb_config is not None: config_dict[mii.constants.LOAD_BALANCER_CONFIG_KEY] = lb_config @@ -37,8 +44,8 @@ def create_score_file(deployment_tag, ) with open(os.path.join(mii.__path__[0], - "models/score/score_template.py"), - "r") as fd: + "models/score/score_template.py"), + "r") as fd: score_src = fd.read() # update score file w. 
global config dict diff --git a/mii/models/score/score_template.py b/mii/models/score/score_template.py index 7127a8ee..2faa1ebc 100644 --- a/mii/models/score/score_template.py +++ b/mii/models/score/score_template.py @@ -22,7 +22,17 @@ def init(): if not isinstance(deployment, dict): continue print(f"\nDEPLOYMENT ->{configs.values()}") - data = {'deployment_name': deployment[mii.constants.DEPLOYMENT_NAME_KEY], 'task': deployment[mii.constants.TASK_NAME_KEY], 'model': deployment[mii.constants.MODEL_NAME_KEY], 'enable_deepspeed': deployment[mii.constants.ENABLE_DEEPSPEED_KEY], 'enable_zero': deployment[mii.constants.ENABLE_DEEPSPEED_ZERO_KEY], 'GPU_index_map': None, 'mii_config': deployment[mii.constants.MII_CONFIGS_KEY], 'ds_config': deployment[mii.constants.DEEPSPEED_CONFIG_KEY], 'version': 1} + data = { + 'deployment_name': deployment[mii.constants.DEPLOYMENT_NAME_KEY], + 'task': deployment[mii.constants.TASK_NAME_KEY], + 'model': deployment[mii.constants.MODEL_NAME_KEY], + 'enable_deepspeed': deployment[mii.constants.ENABLE_DEEPSPEED_KEY], + 'enable_zero': deployment[mii.constants.ENABLE_DEEPSPEED_ZERO_KEY], + 'GPU_index_map': None, + 'mii_config': deployment[mii.constants.MII_CONFIGS_KEY], + 'ds_config': deployment[mii.constants.DEEPSPEED_CONFIG_KEY], + 'version': 1 + } deployments.append(mii.Deployment.parse_obj(data)) print(f"WITHIN INIT {deployments}") diff --git a/mii/server.py b/mii/server.py index dc0768b8..76ad9443 100644 --- a/mii/server.py +++ b/mii/server.py @@ -28,11 +28,7 @@ def config_to_b64_str(config): class MIIServer(): '''Initialize the model, setup the server for the model under model_path''' - def __init__(self, - deployment_tag, - deployments, - model_path, - lb_config=None): + def __init__(self, deployment_tag, deployments, model_path, lb_config=None): #mii_configs = mii.config.MIIConfig(**mii_configs) @@ -103,7 +99,7 @@ def _build_server_args(self, port): # serialize mii config b64_config_str = config_to_b64_str(mii_configs) - + task = "" for deployment in self.deployments: if deployment_name == deployment.deployment_name: @@ -272,11 +268,7 @@ def _launch_deepspeed(self, "MII server", ds_launch_str=ds_launch_str) - def _initialize_service(self, - deployment_tag, - deployments, - model_path, - lb_config): + def _initialize_service(self, deployment_tag, deployments, model_path, lb_config): processes = [] @@ -288,7 +280,7 @@ def _initialize_service(self, for i, repl_config in enumerate(lb_config.replica_configs): name = repl_config.deployment_name deployment = None - print (f"IN SERVER NAME -> {name}") + print(f"IN SERVER NAME -> {name}") for dep in deployments: print(f"\nDEPLOYMENT_NAME {dep.deployment_name}") if dep.deployment_name == name: @@ -309,7 +301,8 @@ def _initialize_service(self, hostfile.name, repl_config.hostname, repl_config.tensor_parallel_ports[0], - deployment.mii_config.torch_dist_port + (100 * i) + repl_config.gpu_indices[0], + deployment.mii_config.torch_dist_port + (100 * i) + + repl_config.gpu_indices[0], repl_config.gpu_indices)) # start load balancer here. 
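
A note on the routing added to the load balancer in the patch above: each deployment now keeps its own stub list and its own atomic counter, and requests are spread round-robin over that deployment's replicas. The snippet below is a minimal standalone sketch of that idea, not code from this series; it assumes ReplicaConfig-like objects with deployment_name, hostname and tensor_parallel_ports fields, and the RoundRobinRouter name is hypothetical.

import itertools
import threading


class RoundRobinRouter:
    def __init__(self, replica_configs):
        # group replica endpoints by deployment_name, mirroring the
        # per-deployment stubs built in the interceptor above
        self._replicas = {}
        for repl in replica_configs:
            self._replicas.setdefault(repl.deployment_name, []).append(
                (repl.hostname, repl.tensor_parallel_ports))
        # one counter per deployment so deployments rotate independently
        self._counters = {name: itertools.count() for name in self._replicas}
        self._lock = threading.Lock()

    def choose(self, deployment_name):
        replicas = self._replicas[deployment_name]
        with self._lock:
            call_count = next(self._counters[deployment_name])
        return replicas[call_count % len(replicas)]


# usage sketch inside invoke_intercept_method:
#   host, ports = router.choose(deployment_name)
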
From 8673a9adc7e22b17154a1f6ebb857bc304d41420 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Wed, 5 Jul 2023 22:48:33 +0000 Subject: [PATCH 28/69] Fixing terminate functionality --- examples/multi_model/shutdown.py | 2 +- mii/client.py | 4 ++-- mii/grpc_related/modelresponse_server.py | 17 +++++++++-------- mii/terminate.py | 6 +++--- 4 files changed, 15 insertions(+), 14 deletions(-) diff --git a/examples/multi_model/shutdown.py b/examples/multi_model/shutdown.py index 11e0b4b9..4de143e6 100644 --- a/examples/multi_model/shutdown.py +++ b/examples/multi_model/shutdown.py @@ -4,4 +4,4 @@ # DeepSpeed Team import mii -mii.terminate("bloom560m_deployment") +mii.terminate("first_test", "bloom560m_deployment") diff --git a/mii/client.py b/mii/client.py index 3a314384..b58049c7 100644 --- a/mii/client.py +++ b/mii/client.py @@ -188,7 +188,7 @@ def terminate(self): del mii.non_persistent_models[self.deployment_name] -def terminate_restful_gateway(deployment_name): - _, mii_configs = _get_deployment_info(deployment_name) +def terminate_restful_gateway(deployment_tag, deployment_name): + _, mii_configs = _get_deployment_info(deployment_tag, deployment_name) if mii_configs.enable_restful_api: requests.get(f"http://localhost:{mii_configs.restful_api_port}/terminate") diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index 6e97c085..441faffd 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -207,8 +207,16 @@ def intercept_service(self, continuation, handler_call_details): def invoke_intercept_method(request_proto, context): method_name = _get_grpc_method_name(handler_call_details.method) + if method_name == TERMINATE_METHOD: + for deployment in self.stubs: + for stub in self.stubs[deployment]: + stub.invoke(TERMINATE_METHOD, + google_dot_protobuf_dot_empty__pb2.Empty()) + self.asyncio_loop.call_soon_threadsafe(self.asyncio_loop.stop) + return next_handler.unary_unary(request_proto, context) kwargs = unpack_proto_query_kwargs(request_proto.query_kwargs) - assert "deployment_name" in kwargs, "Must include deployment_name in kwargs for query" + if method_name != TERMINATE_METHOD: + assert "deployment_name" in kwargs, "Must include deployment_name in kwargs for query" deployment_name = kwargs.get('deployment_name') kwargs.pop('deployment_name', None) task = None @@ -246,13 +254,6 @@ def invoke_intercept_method(request_proto, context): print(f"\nDEPLOYMENT NAME WITHIN INTERCEPTOR -> {deployment_name}") - if method_name == TERMINATE_METHOD: - for stub in self.stubs: - stub.invoke(TERMINATE_METHOD, - google_dot_protobuf_dot_empty__pb2.Empty()) - self.asyncio_loop.call_soon_threadsafe(self.asyncio_loop.stop) - return next_handler.unary_unary(new_request, context) - call_count = self.counter[deployment_name].get_and_increment() replica_index = call_count % len(self.stubs[deployment_name]) diff --git a/mii/terminate.py b/mii/terminate.py index 167c5a5a..94fa7a77 100644 --- a/mii/terminate.py +++ b/mii/terminate.py @@ -7,9 +7,9 @@ import mii -def terminate(deployment_name): +def terminate(deployment_tag, deployment_name): mii.utils.logger.info(f"Terminating server for {deployment_name}") - generator = mii.mii_query_handle(deployment_name) + generator = mii.mii_query_handle(deployment_tag, deployment_name) if (deployment_name in mii.non_persistent_models): generator.terminate() return @@ -24,4 +24,4 @@ def terminate(deployment_name): pass generator.terminate() - 
mii.client.terminate_restful_gateway(deployment_name) + mii.client.terminate_restful_gateway(deployment_tag, deployment_name) From 8d09b3757ba1327247979f1f8bb414bfa3ee1aa1 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Thu, 6 Jul 2023 18:27:32 +0000 Subject: [PATCH 29/69] Refactored client --- examples/multi_model/query.py | 25 +++++++------- mii/client.py | 64 ++++++++++++++++++++++++++--------- 2 files changed, 60 insertions(+), 29 deletions(-) diff --git a/examples/multi_model/query.py b/examples/multi_model/query.py index 37c55476..2e06f159 100644 --- a/examples/multi_model/query.py +++ b/examples/multi_model/query.py @@ -6,30 +6,29 @@ import mii import time -generator = mii.mii_query_handle("first_test", "bloom560m_deployment") -result = generator.query({"query": ["DeepSpeed is", - "Seattle is"]}, - do_sample=True, - max_new_tokens=30, - deployment_name="bloom560m_deployment") +generator = mii.mii_query_handle("first_test") +result = generator.query( + {"query": ["DeepSpeed is", + "Seattle is"]}, + "bloom560m_deployment", + do_sample=True, + max_new_tokens=30, +) print(result) time.sleep(5) -generator2 = mii.mii_query_handle("first_test", - "microsoft/DialogRPT-human-vs-rand_deployment") -result = generator2.query({'query': "DeepSpeed is the greatest"}, - deployment_name="microsoft/DialogRPT-human-vs-rand_deployment") +result = generator.query({'query': "DeepSpeed is the greatest"}, + "microsoft/DialogRPT-human-vs-rand_deployment") print(result) time.sleep(5) -generator3 = mii.mii_query_handle("first_test", "microsoft/DialoGPT-large_deployment") -result = generator3.query( +result = generator.query( { 'text': "DeepSpeed is the greatest", 'conversation_id': 3, 'past_user_inputs': [], 'generated_responses': [] }, - deployment_name="microsoft/DialoGPT-large_deployment") + "microsoft/DialoGPT-large_deployment") print(result) diff --git a/mii/client.py b/mii/client.py index b58049c7..e4e7ed50 100644 --- a/mii/client.py +++ b/mii/client.py @@ -12,17 +12,28 @@ from mii.method_table import GRPC_METHOD_TABLE -def _get_deployment_info(deployment_tag, deployment_name): +def _get_deployment_info(deployment_tag): + deployments = [] configs = mii.utils.import_score_file(deployment_tag).configs + for deployment in configs: + if not isinstance(configs[deployment], dict): + continue + print("here") + deployments.append(configs[deployment]) + mii_configs_dict = configs[deployment][mii.constants.MII_CONFIGS_KEY] + mii_configs = mii.config.MIIConfig(**mii_configs_dict) + return deployments + """ task = configs[deployment_name][mii.constants.TASK_NAME_KEY] mii_configs_dict = configs[deployment_name][mii.constants.MII_CONFIGS_KEY] mii_configs = mii.config.MIIConfig(**mii_configs_dict) assert task is not None, "The task name should be set before calling init" return task, mii_configs + """ -def mii_query_handle(deployment_tag, deployment_name): +def mii_query_handle(deployment_tag, deployment_name=None): """Get a query handle for a local deployment: mii/examples/local/gpt2-query-example.py @@ -35,12 +46,15 @@ def mii_query_handle(deployment_tag, deployment_name): query_handle: A query handle with a single method `.query(request_dictionary)` using which queries can be sent to the model. 
""" - if deployment_name in mii.non_persistent_models: + if deployment_name is not None and deployment_name in mii.non_persistent_models: inference_pipeline, task = mii.non_persistent_models[deployment_name] return MIINonPersistentClient(task, deployment_name) - task_name, mii_configs = _get_deployment_info(deployment_tag, deployment_name) - return MIIClient(task_name, "localhost", mii_configs.port_number) + deployments = _get_deployment_info(deployment_tag) + print(deployments) + mii_configs_dict = deployments[0][mii.constants.MII_CONFIGS_KEY] + mii_configs = mii.config.MIIConfig(**mii_configs_dict) + return MIIClient(deployments, "localhost", mii_configs.port_number) def create_channel(host, port): @@ -55,24 +69,32 @@ class MIIClient(): """ Client to send queries to a single endpoint. """ - def __init__(self, task_name, host, port): + def __init__(self, deployments, host, port): self.asyncio_loop = asyncio.get_event_loop() channel = create_channel(host, port) self.stub = modelresponse_pb2_grpc.ModelResponseStub(channel) - self.task = get_task(task_name) + #self.task = get_task(task_name) + self.deployments = deployments - async def _request_async_response(self, request_dict, **query_kwargs): - if self.task not in GRPC_METHOD_TABLE: - raise ValueError(f"unknown task: {self.task}") + async def _request_async_response(self, request_dict, task, **query_kwargs): + if task not in GRPC_METHOD_TABLE: + raise ValueError(f"unknown task: {task}") - task_methods = GRPC_METHOD_TABLE[self.task] + task_methods = GRPC_METHOD_TABLE[task] proto_request = task_methods.pack_request_to_proto(request_dict, **query_kwargs) proto_response = await getattr(self.stub, task_methods.method)(proto_request) return task_methods.unpack_response_from_proto(proto_response) - def query(self, request_dict, **query_kwargs): + def query(self, request_dict, deployment_name, **query_kwargs): + task = None + for deployment in self.deployments: + if deployment[mii.constants.DEPLOYMENT_NAME_KEY] == deployment_name: + task = get_task(deployment[mii.constants.TASK_NAME_KEY]) + break + query_kwargs['deployment_name'] = deployment_name return self.asyncio_loop.run_until_complete( self._request_async_response(request_dict, + task, **query_kwargs)) async def terminate_async(self): @@ -86,8 +108,13 @@ async def create_session_async(self, session_id): return await self.stub.CreateSession( modelresponse_pb2.SessionID(session_id=session_id)) - def create_session(self, session_id): - assert self.task == Tasks.TEXT_GENERATION, f"Session creation only available for task '{Tasks.TEXT_GENERATION}'." + def create_session(self, session_id, deployment_name): + task = None + for deployment in self.deployments: + if deployment[mii.constants.DEPLOYMENT_NAME_KEY] == deployment_name: + task = get_task(deployment[mii.constants.TASK_NAME_KEY]) + break + assert task == Tasks.TEXT_GENERATION, f"Session creation only available for task '{Tasks.TEXT_GENERATION}'." return self.asyncio_loop.run_until_complete( self.create_session_async(session_id)) @@ -95,8 +122,13 @@ async def destroy_session_async(self, session_id): await self.stub.DestroySession(modelresponse_pb2.SessionID(session_id=session_id) ) - def destroy_session(self, session_id): - assert self.task == Tasks.TEXT_GENERATION, f"Session deletion only available for task '{Tasks.TEXT_GENERATION}'." 
+ def destroy_session(self, session_id, deployment_name): + task = None + for deployment in self.deployments: + if deployment[mii.constants.DEPLOYMENT_NAME_KEY] == deployment_name: + task = get_task(deployment[mii.constants.TASK_NAME_KEY]) + break + assert task == Tasks.TEXT_GENERATION, f"Session deletion only available for task '{Tasks.TEXT_GENERATION}'." self.asyncio_loop.run_until_complete(self.destroy_session_async(session_id)) From 7a136d6a2d88390690323b5874fa719e32649f48 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Thu, 6 Jul 2023 21:11:09 +0000 Subject: [PATCH 30/69] More Refactoring and q/a example --- examples/multi_model/query.py | 8 ++++++++ examples/multi_model/shutdown.py | 2 +- .../text-generation-bloom560m-example.py | 7 +++++++ mii/client.py | 19 +++++++++++-------- mii/deployment.py | 2 +- mii/terminate.py | 14 +++++++------- 6 files changed, 35 insertions(+), 17 deletions(-) diff --git a/examples/multi_model/query.py b/examples/multi_model/query.py index 2e06f159..377dcf1f 100644 --- a/examples/multi_model/query.py +++ b/examples/multi_model/query.py @@ -32,3 +32,11 @@ }, "microsoft/DialoGPT-large_deployment") print(result) + +results = generator.query( + { + 'question': "What is the greatest?", + 'context': "DeepSpeed is the greatest" + }, + "deepset/roberta-large-squad2" + "-qa-deployment") +print(results) diff --git a/examples/multi_model/shutdown.py b/examples/multi_model/shutdown.py index 4de143e6..281389c4 100644 --- a/examples/multi_model/shutdown.py +++ b/examples/multi_model/shutdown.py @@ -4,4 +4,4 @@ # DeepSpeed Team import mii -mii.terminate("first_test", "bloom560m_deployment") +mii.terminate("first_test") diff --git a/examples/multi_model/text-generation-bloom560m-example.py b/examples/multi_model/text-generation-bloom560m-example.py index a5dc202c..f070195e 100644 --- a/examples/multi_model/text-generation-bloom560m-example.py +++ b/examples/multi_model/text-generation-bloom560m-example.py @@ -36,4 +36,11 @@ GPU_index_map=gpu_index_map1, mii_config=mii.config.MIIConfig(**mii_configs2))) +name = "deepset/roberta-large-squad2" +deployments.append( + mii.Deployment(task="question-answering", + model=name, + deployment_name=name + "-qa-deployment", + GPU_index_map=gpu_index_map2)) + mii.deploy(deployment_tag="first_test", deployments=deployments) diff --git a/mii/client.py b/mii/client.py index e4e7ed50..2a9e2f1a 100644 --- a/mii/client.py +++ b/mii/client.py @@ -18,7 +18,6 @@ def _get_deployment_info(deployment_tag): for deployment in configs: if not isinstance(configs[deployment], dict): continue - print("here") deployments.append(configs[deployment]) mii_configs_dict = configs[deployment][mii.constants.MII_CONFIGS_KEY] mii_configs = mii.config.MIIConfig(**mii_configs_dict) @@ -33,7 +32,7 @@ def _get_deployment_info(deployment_tag): """ -def mii_query_handle(deployment_tag, deployment_name=None): +def mii_query_handle(deployment_tag): """Get a query handle for a local deployment: mii/examples/local/gpt2-query-example.py @@ -46,12 +45,11 @@ def mii_query_handle(deployment_tag, deployment_name=None): query_handle: A query handle with a single method `.query(request_dictionary)` using which queries can be sent to the model. 
""" - if deployment_name is not None and deployment_name in mii.non_persistent_models: + if deployment_tag in mii.non_persistent_models: inference_pipeline, task = mii.non_persistent_models[deployment_name] return MIINonPersistentClient(task, deployment_name) deployments = _get_deployment_info(deployment_tag) - print(deployments) mii_configs_dict = deployments[0][mii.constants.MII_CONFIGS_KEY] mii_configs = mii.config.MIIConfig(**mii_configs_dict) return MIIClient(deployments, "localhost", mii_configs.port_number) @@ -86,6 +84,8 @@ async def _request_async_response(self, request_dict, task, **query_kwargs): return task_methods.unpack_response_from_proto(proto_response) def query(self, request_dict, deployment_name, **query_kwargs): + if deployment_name is None: #mii.terminate() + return len(self.deployments) task = None for deployment in self.deployments: if deployment[mii.constants.DEPLOYMENT_NAME_KEY] == deployment_name: @@ -220,7 +220,10 @@ def terminate(self): del mii.non_persistent_models[self.deployment_name] -def terminate_restful_gateway(deployment_tag, deployment_name): - _, mii_configs = _get_deployment_info(deployment_tag, deployment_name) - if mii_configs.enable_restful_api: - requests.get(f"http://localhost:{mii_configs.restful_api_port}/terminate") +def terminate_restful_gateway(deployment_tag): + deployments = _get_deployment_info(deployment_tag) + for deployment in deployments: + mii_configs_dict = deployment[mii.constants.MII_CONFIGS_KEY] + mii_configs = mii.config.MIIConfig(**mii_configs_dict) + if mii_configs.enable_restful_api: + requests.get(f"http://localhost:{mii_configs.restful_api_port}/terminate") diff --git a/mii/deployment.py b/mii/deployment.py index 95be4c01..4d638537 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -81,7 +81,7 @@ def deploy(task=None, ds_config, version) ] - deployment_tag = deployment_name + "_tag" + deployment_tag = deployment_name else: assert deployment_tag is not None, "deployment_tag must be set to deploy multiple models" diff --git a/mii/terminate.py b/mii/terminate.py index 94fa7a77..0a2b82b4 100644 --- a/mii/terminate.py +++ b/mii/terminate.py @@ -7,21 +7,21 @@ import mii -def terminate(deployment_tag, deployment_name): - mii.utils.logger.info(f"Terminating server for {deployment_name}") - generator = mii.mii_query_handle(deployment_tag, deployment_name) - if (deployment_name in mii.non_persistent_models): +def terminate(deployment_tag): + mii.utils.logger.info(f"Terminating server for {deployment_tag}") + generator = mii.mii_query_handle(deployment_tag) + if (deployment_tag in mii.non_persistent_models): generator.terminate() return try: - generator.query({'query': ''}) + generator.query({'query': ''}, None) except grpc.aio._call.AioRpcError as error: if error._code == grpc.StatusCode.UNAVAILABLE: - mii.utils.logger.warn(f"Server for {deployment_name} not found") + mii.utils.logger.warn(f"Server for {deployment_tag} not found") else: pass except (KeyError, TypeError) as error: pass generator.terminate() - mii.client.terminate_restful_gateway(deployment_tag, deployment_name) + mii.client.terminate_restful_gateway(deployment_tag) From 2c6ec08299262a9d0954541f56e4b1d3b9020e6a Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Thu, 6 Jul 2023 21:55:48 +0000 Subject: [PATCH 31/69] Reformatting to maintain previous syntax --- examples/multi_model/query.py | 2 +- mii/client.py | 48 +++++++++++++++++++++-------------- mii/deployment.py | 18 ++++++------- 3 files changed, 39 insertions(+), 29 deletions(-) diff --git 
a/examples/multi_model/query.py b/examples/multi_model/query.py index 377dcf1f..2be15c8b 100644 --- a/examples/multi_model/query.py +++ b/examples/multi_model/query.py @@ -37,6 +37,6 @@ { 'question': "What is the greatest?", 'context': "DeepSpeed is the greatest" - }, + }, "deepset/roberta-large-squad2" + "-qa-deployment") print(results) diff --git a/mii/client.py b/mii/client.py index 2a9e2f1a..2282fc27 100644 --- a/mii/client.py +++ b/mii/client.py @@ -46,8 +46,8 @@ def mii_query_handle(deployment_tag): """ if deployment_tag in mii.non_persistent_models: - inference_pipeline, task = mii.non_persistent_models[deployment_name] - return MIINonPersistentClient(task, deployment_name) + inference_pipeline, task = mii.non_persistent_models[deployment_tag] + return MIINonPersistentClient(task, deployment_tag) deployments = _get_deployment_info(deployment_tag) mii_configs_dict = deployments[0][mii.constants.MII_CONFIGS_KEY] @@ -83,14 +83,16 @@ async def _request_async_response(self, request_dict, task, **query_kwargs): proto_response = await getattr(self.stub, task_methods.method)(proto_request) return task_methods.unpack_response_from_proto(proto_response) - def query(self, request_dict, deployment_name, **query_kwargs): - if deployment_name is None: #mii.terminate() - return len(self.deployments) + def query(self, request_dict, deployment_name=None, **query_kwargs): task = None - for deployment in self.deployments: - if deployment[mii.constants.DEPLOYMENT_NAME_KEY] == deployment_name: - task = get_task(deployment[mii.constants.TASK_NAME_KEY]) - break + if deployment_name is None: #mii.terminate() or single model + deployment_name = self.deployments[0][mii.constants.DEPLOYMENT_NAME_KEY] + task = get_task(self.deployments[0][mii.constants.TASK_NAME_KEY]) + else: + for deployment in self.deployments: + if deployment[mii.constants.DEPLOYMENT_NAME_KEY] == deployment_name: + task = get_task(deployment[mii.constants.TASK_NAME_KEY]) + break query_kwargs['deployment_name'] = deployment_name return self.asyncio_loop.run_until_complete( self._request_async_response(request_dict, @@ -108,12 +110,16 @@ async def create_session_async(self, session_id): return await self.stub.CreateSession( modelresponse_pb2.SessionID(session_id=session_id)) - def create_session(self, session_id, deployment_name): + def create_session(self, session_id, deployment_name=None): task = None - for deployment in self.deployments: - if deployment[mii.constants.DEPLOYMENT_NAME_KEY] == deployment_name: - task = get_task(deployment[mii.constants.TASK_NAME_KEY]) - break + if deployment_name is None: #mii.terminate() or single model + deployment_name = self.deployments[0][mii.constants.DEPLOYMENT_NAME_KEY] + task = get_task(self.deployments[0][mii.constants.TASK_NAME_KEY]) + else: + for deployment in self.deployments: + if deployment[mii.constants.DEPLOYMENT_NAME_KEY] == deployment_name: + task = get_task(deployment[mii.constants.TASK_NAME_KEY]) + break assert task == Tasks.TEXT_GENERATION, f"Session creation only available for task '{Tasks.TEXT_GENERATION}'." 
return self.asyncio_loop.run_until_complete( self.create_session_async(session_id)) @@ -122,12 +128,16 @@ async def destroy_session_async(self, session_id): await self.stub.DestroySession(modelresponse_pb2.SessionID(session_id=session_id) ) - def destroy_session(self, session_id, deployment_name): + def destroy_session(self, session_id, deployment_name=None): task = None - for deployment in self.deployments: - if deployment[mii.constants.DEPLOYMENT_NAME_KEY] == deployment_name: - task = get_task(deployment[mii.constants.TASK_NAME_KEY]) - break + if deployment_name is None: #mii.terminate() or single model + deployment_name = self.deployments[0][mii.constants.DEPLOYMENT_NAME_KEY] + task = get_task(self.deployments[0][mii.constants.TASK_NAME_KEY]) + else: + for deployment in self.deployments: + if deployment[mii.constants.DEPLOYMENT_NAME_KEY] == deployment_name: + task = get_task(deployment[mii.constants.TASK_NAME_KEY]) + break assert task == Tasks.TEXT_GENERATION, f"Session deletion only available for task '{Tasks.TEXT_GENERATION}'." self.asyncio_loop.run_until_complete(self.destroy_session_async(session_id)) diff --git a/mii/deployment.py b/mii/deployment.py index 4d638537..986186f3 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -71,15 +71,15 @@ def deploy(task=None, if not deployments: assert all((model, task, deployment_name)), "model, task, and deployment name must be set to deploy singular model" deployments = [ - Deployment(deployment_name, - task, - model, - enable_deepspeed, - enable_zero, - None, - mii_config, - ds_config, - version) + Deployment(deployment_name=deployment_name, + task=task, + model=model, + enable_deepspeed=enable_deepspeed, + enable_zero=enable_zero, + GPU_index_map=None, + mii_config=mii.config.MIIConfig(**mii_config), + ds_config=ds_config, + version=version) ] deployment_tag = deployment_name else: From 0cb88a9f2696d4a4aae0231af72eb4df1503b969 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Thu, 6 Jul 2023 22:01:06 +0000 Subject: [PATCH 32/69] Removing print/debug statements --- mii/grpc_related/modelresponse_server.py | 9 --------- mii/method_table.py | 3 --- mii/models/score/score_template.py | 3 --- mii/server.py | 2 -- 4 files changed, 17 deletions(-) diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index 441faffd..0531f68a 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -175,14 +175,12 @@ def __init__(self, task_name, replica_configs): self.stubs[repl.deployment_name] = [] self.counter[repl.deployment_name] = AtomicCounter() - print(replica_configs) for repl in replica_configs: self.stubs[repl.deployment_name].extend( ParallelStubInvoker(replica.hostname, replica.tensor_parallel_ports) for replica in replica_configs if replica.deployment_name == repl.deployment_name) - print(f"\nSTUBS-> {self.stubs}\nCOUNTERS-> {self.counter}") """ self.counter = AtomicCounter() self.task = get_task(task_name) @@ -224,7 +222,6 @@ def invoke_intercept_method(request_proto, context): if repl.deployment_name == deployment_name: task = repl.task break - print(f"\nTASK ->{task}\nMETHOD NAME-> {method_name}") method = GRPC_METHOD_TABLE[get_task(task)] new_request = None if method_name == "ConversationalReply": @@ -248,11 +245,7 @@ def invoke_intercept_method(request_proto, context): request_proto.request ) if method_name == "GeneratorReply" or method_name == "Txt2ImgReply" else str( request_proto.request) - print(f"HERE request_dict -> {request_dict}\nKWARGS-> 
{kwargs}") new_request = method.pack_request_to_proto(request_dict, **kwargs) - print("done?") - - print(f"\nDEPLOYMENT NAME WITHIN INTERCEPTOR -> {deployment_name}") call_count = self.counter[deployment_name].get_and_increment() replica_index = call_count % len(self.stubs[deployment_name]) @@ -281,8 +274,6 @@ def invoke_intercept_method(request_proto, context): replica_index = self.replica_sessions[session_id] assert new_request is not None, "test" - print("ASSERT DONE") - print(new_request.query_kwargs) ret = self.stubs[deployment_name][replica_index].invoke( method_name, new_request) diff --git a/mii/method_table.py b/mii/method_table.py index 8dfea390..c412f446 100644 --- a/mii/method_table.py +++ b/mii/method_table.py @@ -23,9 +23,6 @@ def single_string_response_to_proto(self, response, time_taken, model_time_taken def multi_string_request_to_proto(self, request_dict, **query_kwargs): - temp = kwarg_dict_to_proto(query_kwargs) - print(f"FINE {temp}\nrd->{request_dict}") - print(isinstance(request_dict['query'], list)) return modelresponse_pb2.MultiStringRequest( request=request_dict['query'] if isinstance(request_dict['query'], list) else [request_dict['query']], diff --git a/mii/models/score/score_template.py b/mii/models/score/score_template.py index 2faa1ebc..94eb6ca8 100644 --- a/mii/models/score/score_template.py +++ b/mii/models/score/score_template.py @@ -21,7 +21,6 @@ def init(): for deployment in configs.values(): if not isinstance(deployment, dict): continue - print(f"\nDEPLOYMENT ->{configs.values()}") data = { 'deployment_name': deployment[mii.constants.DEPLOYMENT_NAME_KEY], 'task': deployment[mii.constants.TASK_NAME_KEY], @@ -34,8 +33,6 @@ def init(): 'version': 1 } deployments.append(mii.Deployment.parse_obj(data)) - - print(f"WITHIN INIT {deployments}") """ deployment_name = configs[mii.constants.DEPLOYMENT_NAME_KEY] model_name = configs[mii.constants.MODEL_NAME_KEY] diff --git a/mii/server.py b/mii/server.py index 76ad9443..3ac9ee08 100644 --- a/mii/server.py +++ b/mii/server.py @@ -280,9 +280,7 @@ def _initialize_service(self, deployment_tag, deployments, model_path, lb_config for i, repl_config in enumerate(lb_config.replica_configs): name = repl_config.deployment_name deployment = None - print(f"IN SERVER NAME -> {name}") for dep in deployments: - print(f"\nDEPLOYMENT_NAME {dep.deployment_name}") if dep.deployment_name == name: deployment = dep hostfile = tempfile.NamedTemporaryFile(delete=False) From 7c0ee125f94856227d9719ee2f1023a116fd6e1e Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Thu, 6 Jul 2023 22:44:18 +0000 Subject: [PATCH 33/69] Fixing non-persistent deloyments --- mii/deployment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mii/deployment.py b/mii/deployment.py index 986186f3..2fe30830 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -175,7 +175,7 @@ def deploy(task=None, assert int(os.getenv('WORLD_SIZE', '1')) == mii_config.tensor_parallel, "World Size does not equal number of tensors. 
When using non-persistent deployment type, please launch with `deepspeed --num_gpus `" provider = MODEL_PROVIDER_MAP[get_provider_name(model, task)] mii.non_persistent_models[deployment_name] = (load_models( - get_task_name(task), + task, model, model_path, enable_deepspeed, From 7a956d5260a38537f4dc7c9dc2449dd231fe787b Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Fri, 7 Jul 2023 22:34:16 +0000 Subject: [PATCH 34/69] Refactoring Load balancer launch --- mii/client.py | 1 + mii/grpc_related/modelresponse_server.py | 7 +++--- mii/launch/load_balance_server.py | 31 ++++++++++++++++++++++++ mii/launch/multi_gpu_server.py | 12 +-------- mii/launch/utils.py | 15 ++++++++++++ mii/server.py | 29 ++++++++-------------- 6 files changed, 61 insertions(+), 34 deletions(-) create mode 100644 mii/launch/load_balance_server.py create mode 100644 mii/launch/utils.py diff --git a/mii/client.py b/mii/client.py index 2282fc27..cc25b77b 100644 --- a/mii/client.py +++ b/mii/client.py @@ -86,6 +86,7 @@ async def _request_async_response(self, request_dict, task, **query_kwargs): def query(self, request_dict, deployment_name=None, **query_kwargs): task = None if deployment_name is None: #mii.terminate() or single model + #assert len(self.deployments) == 1, "Must pass deployment_name to query when using multiple deployments" deployment_name = self.deployments[0][mii.constants.DEPLOYMENT_NAME_KEY] task = get_task(self.deployments[0][mii.constants.TASK_NAME_KEY]) else: diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index 0531f68a..01bc1310 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -164,7 +164,7 @@ def invoke(self, method_name, proto_request): class LoadBalancingInterceptor(grpc.ServerInterceptor): - def __init__(self, task_name, replica_configs): + def __init__(self, replica_configs): super().__init__() self.asyncio_loop = asyncio.get_event_loop() @@ -306,11 +306,10 @@ def serve_inference(inference_pipeline, port): _do_serve(ModelResponse(inference_pipeline), port) -def serve_load_balancing(task_name, lb_config): +def serve_load_balancing(lb_config): _do_serve(ServiceBase(), lb_config.port, - [LoadBalancingInterceptor(task_name, - lb_config.replica_configs)]) + [LoadBalancingInterceptor(lb_config.replica_configs)]) if __name__ == '__main__': diff --git a/mii/launch/load_balance_server.py b/mii/launch/load_balance_server.py new file mode 100644 index 00000000..01de3822 --- /dev/null +++ b/mii/launch/load_balance_server.py @@ -0,0 +1,31 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +import argparse + +from mii import LoadBalancerConfig + +from mii.grpc_related.modelresponse_server import serve_load_balancing +from .utils import decode_config_from_str + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--load-balancer", + type=str, + default=None, + help="base64 encoded load balancer config") + + args = parser.parse_args() + assert args.load_balancer is not None, "lb_config required to use load balancer" + lb_config_dict = decode_config_from_str(args.load_balancer) + lb_config = LoadBalancerConfig(**lb_config_dict) + + print(f"Starting load balancer on port: {lb_config.port}") + serve_load_balancing(lb_config) + + +if __name__ == "__main__": + # python -m mii.launch.load_balance_server + main() diff --git a/mii/launch/multi_gpu_server.py b/mii/launch/multi_gpu_server.py index 27878725..1f7fc00a 100644 --- a/mii/launch/multi_gpu_server.py +++ b/mii/launch/multi_gpu_server.py @@ -5,23 +5,13 @@ import os import argparse import mii -import base64 -import json from mii import MIIConfig, LoadBalancerConfig from mii.models.load_models import load_models from mii.grpc_related.modelresponse_server import serve_inference, serve_load_balancing from mii.grpc_related.restful_gateway import RestfulGatewayThread - - -def decode_config_from_str(config_str): - # str -> bytes - b64_bytes = config_str.encode() - # decode b64 bytes -> json bytes - config_bytes = base64.urlsafe_b64decode(b64_bytes) - # convert json bytes -> str -> dict - return json.loads(config_bytes.decode()) +from .utils import decode_config_from_str def main(): diff --git a/mii/launch/utils.py b/mii/launch/utils.py new file mode 100644 index 00000000..9e039409 --- /dev/null +++ b/mii/launch/utils.py @@ -0,0 +1,15 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +import base64 +import json + + +def decode_config_from_str(config_str): + # str -> bytes + b64_bytes = config_str.encode() + # decode b64 bytes -> json bytes + config_bytes = base64.urlsafe_b64decode(b64_bytes) + # convert json bytes -> str -> dict + return json.loads(config_bytes.decode()) diff --git a/mii/server.py b/mii/server.py index 3ac9ee08..ceaf2912 100644 --- a/mii/server.py +++ b/mii/server.py @@ -152,19 +152,17 @@ def print_helper(self, args): printable_string += " " + "-" * 60 return printable_string - def _launch_load_balancer(self, - deployment_name, - model_name, - model_path, - ds_optimize, - ds_zero, - ds_config, - mii_configs, - lb_config): + def _launch_load_balancer(self, model_path, lb_config): # serialize mii config b64_config_str = config_to_b64_str(lb_config) - + launch_str = f"{sys.executable} -m mii.launch.load_balance_server --load-balancer {b64_config_str}" + cmd = launch_str.split(" ") + mii_env = os.environ.copy() + mii_env["TRANSFORMERS_CACHE"] = model_path + logger.info(f"load balancer server launch: {cmd}") + return subprocess.Popen(cmd, env=mii_env) + """ return self._launch_server_process( deployment_name, model_name, @@ -176,6 +174,7 @@ def _launch_load_balancer(self, mii_configs.port_number, "load balancer", ex_server_args=[f"--load-balancer {b64_config_str}"]) + """ def _launch_restful_gateway(self, deployment_name, @@ -307,15 +306,7 @@ def _initialize_service(self, deployment_tag, deployments, model_path, lb_config # we don't use deepspeed launcher for the load balancer because it does not need a GPU. 
# The deepspeed launcher determines the number of processes to launch based on GPUs available on the host or CUDA_VISIBLE_DEVICES, # and it is expected to assign one GPU to one process. - processes.append( - self._launch_load_balancer(self.deployments[0].deployment_name, - self.deployments[0].model, - model_path, - self.deployments[0].enable_deepspeed, - self.deployments[0].enable_zero, - self.deployments[0].ds_config, - self.deployments[0].mii_config, - lb_config)) + processes.append(self._launch_load_balancer(model_path, lb_config)) for deployment in self.deployments: if deployment.mii_config.enable_restful_api: From f8cfe28f0a0edf8f23b6b2b17f0b01ea9e5c8a0d Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Mon, 10 Jul 2023 20:52:35 +0000 Subject: [PATCH 35/69] Fixing restful gateway client --- mii/client.py | 4 ++-- mii/grpc_related/restful_gateway.py | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/mii/client.py b/mii/client.py index cc25b77b..a33b71ba 100644 --- a/mii/client.py +++ b/mii/client.py @@ -87,8 +87,8 @@ def query(self, request_dict, deployment_name=None, **query_kwargs): task = None if deployment_name is None: #mii.terminate() or single model #assert len(self.deployments) == 1, "Must pass deployment_name to query when using multiple deployments" - deployment_name = self.deployments[0][mii.constants.DEPLOYMENT_NAME_KEY] - task = get_task(self.deployments[0][mii.constants.TASK_NAME_KEY]) + deployment_name = self.deployments[0]['deployment_name'] + task = get_task(self.deployments[0]['task_name']) else: for deployment in self.deployments: if deployment[mii.constants.DEPLOYMENT_NAME_KEY] == deployment_name: diff --git a/mii/grpc_related/restful_gateway.py b/mii/grpc_related/restful_gateway.py index e8cfa934..d3dc53da 100644 --- a/mii/grpc_related/restful_gateway.py +++ b/mii/grpc_related/restful_gateway.py @@ -19,7 +19,9 @@ def shutdown(thread): def createRestfulGatewayApp(deployment_name, task, mii_config, server_thread): # client must be thread-safe - client = mii.MIIClient(task, "localhost", mii_config.port_number) + client = mii.mii_query_handle(deployment_name) + + #client = mii.MIIClient(deployment_name, "localhost", mii_config.port_number) class RestfulGatewayService(Resource): def __init__(self): From 079807d8255f27119d3007f1fb62861645ec9c41 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Mon, 10 Jul 2023 22:23:53 +0000 Subject: [PATCH 36/69] Fixing replica issue --- mii/grpc_related/modelresponse_server.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index 01bc1310..9fb4608f 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -176,11 +176,9 @@ def __init__(self, replica_configs): self.counter[repl.deployment_name] = AtomicCounter() for repl in replica_configs: - self.stubs[repl.deployment_name].extend( - ParallelStubInvoker(replica.hostname, - replica.tensor_parallel_ports) - for replica in replica_configs - if replica.deployment_name == repl.deployment_name) + self.stubs[repl.deployment_name].append( + ParallelStubInvoker(repl.hostname, + repl.tensor_parallel_ports)) """ self.counter = AtomicCounter() self.task = get_task(task_name) From ea1e47e2bf2a3b03964e76e3cb309044a6e9dc87 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Mon, 10 Jul 2023 22:42:35 +0000 Subject: [PATCH 37/69] Fixing non persistent client --- mii/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/mii/client.py b/mii/client.py index a33b71ba..60c6ae44 100644 --- a/mii/client.py +++ b/mii/client.py @@ -200,7 +200,7 @@ def destroy_session(self, session_id): class MIINonPersistentClient(): def __init__(self, task, deployment_name): - self.task = task + self.task = get_task(task) self.deployment_name = deployment_name def query(self, request_dict, **query_kwargs): From 98b61290c7bc5c55251adb5d7b2834bcfebc9545 Mon Sep 17 00:00:00 2001 From: Mahesh Sinha <31616939+msinha251@users.noreply.github.com> Date: Wed, 12 Jul 2023 00:43:04 +0200 Subject: [PATCH 38/69] Adding trust_remote_code support (#203) - [Enhancement](https://github.com/microsoft/DeepSpeed-MII/issues/181) Co-authored-by: Mahesh Sinha Co-authored-by: Michael Wyatt --- mii/config.py | 1 + mii/models/providers/huggingface.py | 1 + 2 files changed, 2 insertions(+) diff --git a/mii/config.py b/mii/config.py index 6a8bac16..2714cb40 100644 --- a/mii/config.py +++ b/mii/config.py @@ -56,6 +56,7 @@ class MIIConfig(BaseModel): restful_api_port: int = 51080 replica_num: int = 1 hostfile: str = DLTS_HOSTFILE + trust_remote_code: bool = False @validator("deploy_rank") def deploy_valid(cls, field_value, values): diff --git a/mii/models/providers/huggingface.py b/mii/models/providers/huggingface.py index c04a6829..27f456aa 100644 --- a/mii/models/providers/huggingface.py +++ b/mii/models/providers/huggingface.py @@ -194,5 +194,6 @@ def hf_provider(model_path, model_name, task_name, mii_config): framework="pt", use_auth_token=mii_config.hf_auth_token, torch_dtype=mii_config.dtype, + trust_remote_code=mii_config.trust_remote_code, ) return inference_pipeline From daab5e68a9494e43c8bb70cfc17d5b9f6d77c6e7 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Wed, 12 Jul 2023 22:58:47 +0000 Subject: [PATCH 39/69] Refactoring --- ...eration-bloom560m-example.py => deploy.py} | 13 ++-- examples/multi_model/query.py | 17 +++--- examples/multi_model/shutdown.py | 2 +- mii/__init__.py | 2 +- mii/client.py | 61 ++++++------------- mii/config.py | 2 +- mii/deployment.py | 22 +++---- mii/grpc_related/modelresponse_server.py | 10 ++- mii/models/score/generate.py | 8 +-- mii/models/score/score_template.py | 2 +- 10 files changed, 57 insertions(+), 82 deletions(-) rename examples/multi_model/{text-generation-bloom560m-example.py => deploy.py} (87%) diff --git a/examples/multi_model/text-generation-bloom560m-example.py b/examples/multi_model/deploy.py similarity index 87% rename from examples/multi_model/text-generation-bloom560m-example.py rename to examples/multi_model/deploy.py index f070195e..455ed498 100644 --- a/examples/multi_model/text-generation-bloom560m-example.py +++ b/examples/multi_model/deploy.py @@ -9,11 +9,15 @@ gpu_index_map3 = {'master': [0, 1]} deployments = [] + mii_configs1 = {"tensor_parallel": 2, "dtype": "fp16"} +mii_configs2 = {"tensor_parallel": 1} + +name = "bigscience/bloom-560m" deployments.append( mii.Deployment(task='text-generation', - model="bigscience/bloom-560m", - deployment_name="bloom560m_deployment", + model=name, + deployment_name=name + "_deployment", GPU_index_map=gpu_index_map3, mii_config=mii.config.MIIConfig(**mii_configs1))) @@ -25,10 +29,7 @@ deployment_name=name + "_deployment", GPU_index_map=gpu_index_map2)) -mii_configs2 = {"tensor_parallel": 1} - name = "microsoft/DialoGPT-large" - deployments.append( mii.Deployment(task='conversational', model=name, @@ -43,4 +44,4 @@ deployment_name=name + "-qa-deployment", GPU_index_map=gpu_index_map2)) -mii.deploy(deployment_tag="first_test", 
deployments=deployments) +mii.deploy(deployment_tag="multi_models", deployments=deployments) diff --git a/examples/multi_model/query.py b/examples/multi_model/query.py index 2be15c8b..519953a1 100644 --- a/examples/multi_model/query.py +++ b/examples/multi_model/query.py @@ -4,9 +4,9 @@ # DeepSpeed Team import mii -import time -generator = mii.mii_query_handle("first_test") +results = [] +generator = mii.mii_query_handle("multi_models") result = generator.query( {"query": ["DeepSpeed is", "Seattle is"]}, @@ -14,14 +14,11 @@ do_sample=True, max_new_tokens=30, ) -print(result) +results.append(result) -time.sleep(5) result = generator.query({'query': "DeepSpeed is the greatest"}, "microsoft/DialogRPT-human-vs-rand_deployment") -print(result) - -time.sleep(5) +results.append(result) result = generator.query( { @@ -31,12 +28,12 @@ 'generated_responses': [] }, "microsoft/DialoGPT-large_deployment") -print(result) +results.append(result) -results = generator.query( +result = generator.query( { 'question': "What is the greatest?", 'context': "DeepSpeed is the greatest" }, "deepset/roberta-large-squad2" + "-qa-deployment") -print(results) +results.append(result) diff --git a/examples/multi_model/shutdown.py b/examples/multi_model/shutdown.py index 281389c4..6b718a4d 100644 --- a/examples/multi_model/shutdown.py +++ b/examples/multi_model/shutdown.py @@ -4,4 +4,4 @@ # DeepSpeed Team import mii -mii.terminate("first_test") +mii.terminate("multi_models") diff --git a/mii/__init__.py b/mii/__init__.py index b0008c06..66748a56 100644 --- a/mii/__init__.py +++ b/mii/__init__.py @@ -10,7 +10,7 @@ from .constants import DeploymentType, Tasks from .aml_related.utils import aml_output_path -from .config import MIIConfig, LoadBalancerConfig, Deployment +from .config import MIIConfig, LoadBalancerConfig, DeploymentConfig from .grpc_related.proto import modelresponse_pb2_grpc __version__ = "0.0.0" diff --git a/mii/client.py b/mii/client.py index 60c6ae44..13131cbb 100644 --- a/mii/client.py +++ b/mii/client.py @@ -12,24 +12,14 @@ from mii.method_table import GRPC_METHOD_TABLE -def _get_deployment_info(deployment_tag): +def _get_deployment_configs(deployment_tag): deployments = [] configs = mii.utils.import_score_file(deployment_tag).configs for deployment in configs: if not isinstance(configs[deployment], dict): continue deployments.append(configs[deployment]) - mii_configs_dict = configs[deployment][mii.constants.MII_CONFIGS_KEY] - mii_configs = mii.config.MIIConfig(**mii_configs_dict) return deployments - """ - task = configs[deployment_name][mii.constants.TASK_NAME_KEY] - mii_configs_dict = configs[deployment_name][mii.constants.MII_CONFIGS_KEY] - mii_configs = mii.config.MIIConfig(**mii_configs_dict) - - assert task is not None, "The task name should be set before calling init" - return task, mii_configs - """ def mii_query_handle(deployment_tag): @@ -49,7 +39,7 @@ def mii_query_handle(deployment_tag): inference_pipeline, task = mii.non_persistent_models[deployment_tag] return MIINonPersistentClient(task, deployment_tag) - deployments = _get_deployment_info(deployment_tag) + deployments = _get_deployment_configs(deployment_tag) mii_configs_dict = deployments[0][mii.constants.MII_CONFIGS_KEY] mii_configs = mii.config.MIIConfig(**mii_configs_dict) return MIIClient(deployments, "localhost", mii_configs.port_number) @@ -74,6 +64,20 @@ def __init__(self, deployments, host, port): #self.task = get_task(task_name) self.deployments = deployments + def _get_deployment_task(self, deployment_name=None): + task = None 
+ if deployment_name is None: #mii.terminate() or single model + assert len(self.deployments) == 1, "Must pass deployment_name to query when using multiple deployments" + deployment_name = self.deployments[0][mii.constants.DEPLOYMENT_NAME_KEY] + task = get_task(self.deployments[0][mii.constants.TASK_NAME_KEY]) + else: + for deployment in self.deployments: + if deployment[mii.constants.DEPLOYMENT_NAME_KEY] == deployment_name: + task = get_task(deployment[mii.constants.TASK_NAME_KEY]) + break + assert False, f"{deployment_name} not found in list of deployments" + return deployment_name, task + async def _request_async_response(self, request_dict, task, **query_kwargs): if task not in GRPC_METHOD_TABLE: raise ValueError(f"unknown task: {task}") @@ -84,16 +88,7 @@ async def _request_async_response(self, request_dict, task, **query_kwargs): return task_methods.unpack_response_from_proto(proto_response) def query(self, request_dict, deployment_name=None, **query_kwargs): - task = None - if deployment_name is None: #mii.terminate() or single model - #assert len(self.deployments) == 1, "Must pass deployment_name to query when using multiple deployments" - deployment_name = self.deployments[0]['deployment_name'] - task = get_task(self.deployments[0]['task_name']) - else: - for deployment in self.deployments: - if deployment[mii.constants.DEPLOYMENT_NAME_KEY] == deployment_name: - task = get_task(deployment[mii.constants.TASK_NAME_KEY]) - break + deployment_name, task = self._get_deployment_task(deployment_name) query_kwargs['deployment_name'] = deployment_name return self.asyncio_loop.run_until_complete( self._request_async_response(request_dict, @@ -112,15 +107,7 @@ async def create_session_async(self, session_id): modelresponse_pb2.SessionID(session_id=session_id)) def create_session(self, session_id, deployment_name=None): - task = None - if deployment_name is None: #mii.terminate() or single model - deployment_name = self.deployments[0][mii.constants.DEPLOYMENT_NAME_KEY] - task = get_task(self.deployments[0][mii.constants.TASK_NAME_KEY]) - else: - for deployment in self.deployments: - if deployment[mii.constants.DEPLOYMENT_NAME_KEY] == deployment_name: - task = get_task(deployment[mii.constants.TASK_NAME_KEY]) - break + deployment_name, task = self._get_deployment_task(deployment_name) assert task == Tasks.TEXT_GENERATION, f"Session creation only available for task '{Tasks.TEXT_GENERATION}'." return self.asyncio_loop.run_until_complete( self.create_session_async(session_id)) @@ -130,15 +117,7 @@ async def destroy_session_async(self, session_id): ) def destroy_session(self, session_id, deployment_name=None): - task = None - if deployment_name is None: #mii.terminate() or single model - deployment_name = self.deployments[0][mii.constants.DEPLOYMENT_NAME_KEY] - task = get_task(self.deployments[0][mii.constants.TASK_NAME_KEY]) - else: - for deployment in self.deployments: - if deployment[mii.constants.DEPLOYMENT_NAME_KEY] == deployment_name: - task = get_task(deployment[mii.constants.TASK_NAME_KEY]) - break + deployment_name, task = self._get_deployment_task(deployment_name) assert task == Tasks.TEXT_GENERATION, f"Session deletion only available for task '{Tasks.TEXT_GENERATION}'." 
self.asyncio_loop.run_until_complete(self.destroy_session_async(session_id)) @@ -232,7 +211,7 @@ def terminate(self): def terminate_restful_gateway(deployment_tag): - deployments = _get_deployment_info(deployment_tag) + deployments = _get_deployment_configs(deployment_tag) for deployment in deployments: mii_configs_dict = deployment[mii.constants.MII_CONFIGS_KEY] mii_configs = mii.config.MIIConfig(**mii_configs_dict) diff --git a/mii/config.py b/mii/config.py index 4eb6b597..6d7be86e 100644 --- a/mii/config.py +++ b/mii/config.py @@ -129,7 +129,7 @@ class Config: validate_assignment = True -class Deployment(BaseModel): +class DeploymentConfig(BaseModel): deployment_name: str task: str model: str diff --git a/mii/deployment.py b/mii/deployment.py index 2fe30830..dc970035 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -13,7 +13,7 @@ from .utils import logger, get_task_name, get_provider_name from .models.score import create_score_file from .models import load_models -from .config import ReplicaConfig, LoadBalancerConfig, Deployment +from .config import ReplicaConfig, LoadBalancerConfig, DeploymentConfig def deploy(task=None, @@ -71,15 +71,15 @@ def deploy(task=None, if not deployments: assert all((model, task, deployment_name)), "model, task, and deployment name must be set to deploy singular model" deployments = [ - Deployment(deployment_name=deployment_name, - task=task, - model=model, - enable_deepspeed=enable_deepspeed, - enable_zero=enable_zero, - GPU_index_map=None, - mii_config=mii.config.MIIConfig(**mii_config), - ds_config=ds_config, - version=version) + DeploymentConfig(deployment_name=deployment_name, + task=task, + model=model, + enable_deepspeed=enable_deepspeed, + enable_zero=enable_zero, + GPU_index_map=None, + mii_config=mii.config.MIIConfig(**mii_config), + ds_config=ds_config, + version=version) ] deployment_tag = deployment_name else: @@ -129,6 +129,7 @@ def deploy(task=None, # add fields for replica deployment replica_configs = [] port_map = {} + port_offset = 1 for deployment in deployments: mii_config = deployment.mii_config replica_pool = _allocate_processes(mii_config.hostfile, @@ -140,7 +141,6 @@ def deploy(task=None, # Reserver port for a LB proxy when replication is enabled if hostname not in port_map: port_map[hostname] = set() - port_offset = 1 base_port = mii_config.port_number + i * mii_config.tensor_parallel + port_offset if base_port in port_map[hostname]: base_port = max(port_map[hostname]) + 1 diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index 9fb4608f..20007b91 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -171,9 +171,11 @@ def __init__(self, replica_configs): self.stubs = {} self.counter = {} self.replica_configs = replica_configs + self.tasks = {} for repl in replica_configs: self.stubs[repl.deployment_name] = [] self.counter[repl.deployment_name] = AtomicCounter() + self.tasks[repl.deployment_name] = repl.task for repl in replica_configs: self.stubs[repl.deployment_name].append( @@ -215,11 +217,8 @@ def invoke_intercept_method(request_proto, context): assert "deployment_name" in kwargs, "Must include deployment_name in kwargs for query" deployment_name = kwargs.get('deployment_name') kwargs.pop('deployment_name', None) - task = None - for repl in self.replica_configs: - if repl.deployment_name == deployment_name: - task = repl.task - break + task = self.tasks[deployment_name] + assert task is not None, f"task for {deployment_name} not 
found" method = GRPC_METHOD_TABLE[get_task(task)] new_request = None if method_name == "ConversationalReply": @@ -271,7 +270,6 @@ def invoke_intercept_method(request_proto, context): raise ValueError(f"session not found") replica_index = self.replica_sessions[session_id] - assert new_request is not None, "test" ret = self.stubs[deployment_name][replica_index].invoke( method_name, new_request) diff --git a/mii/models/score/generate.py b/mii/models/score/generate.py index ecd15ffe..7a3ae021 100644 --- a/mii/models/score/generate.py +++ b/mii/models/score/generate.py @@ -38,10 +38,10 @@ def create_score_file(deployment_tag, if lb_config is not None: config_dict[mii.constants.LOAD_BALANCER_CONFIG_KEY] = lb_config - if len(mii.__path__) > 1: - logger.warning( - f"Detected mii path as multiple sources: {mii.__path__}, might cause unknown behavior" - ) + if len(mii.__path__) > 1: + logger.warning( + f"Detected mii path as multiple sources: {mii.__path__}, might cause unknown behavior" + ) with open(os.path.join(mii.__path__[0], "models/score/score_template.py"), diff --git a/mii/models/score/score_template.py b/mii/models/score/score_template.py index 94eb6ca8..83b46de3 100644 --- a/mii/models/score/score_template.py +++ b/mii/models/score/score_template.py @@ -32,7 +32,7 @@ def init(): 'ds_config': deployment[mii.constants.DEEPSPEED_CONFIG_KEY], 'version': 1 } - deployments.append(mii.Deployment.parse_obj(data)) + deployments.append(mii.DeploymentConfig.parse_obj(data)) """ deployment_name = configs[mii.constants.DEPLOYMENT_NAME_KEY] model_name = configs[mii.constants.MODEL_NAME_KEY] From 84073f9f60086c1f335cbc983bbcf3d0c3c29f87 Mon Sep 17 00:00:00 2001 From: TosinSeg <90005810+TosinSeg@users.noreply.github.com> Date: Wed, 12 Jul 2023 15:59:43 -0700 Subject: [PATCH 40/69] Update mii/models/score/generate.py Co-authored-by: Michael Wyatt --- mii/models/score/generate.py | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/mii/models/score/generate.py b/mii/models/score/generate.py index 7a3ae021..dc73fdb9 100644 --- a/mii/models/score/generate.py +++ b/mii/models/score/generate.py @@ -19,21 +19,16 @@ def create_score_file(deployment_tag, config_dict[mii.constants.MODEL_PATH_KEY] = model_path config_dict[mii.constants.DEPLOYMENT_TAG_KEY] = deployment_tag for deployment in deployments: - config_dict[deployment.deployment_name] = {} - config_dict[deployment.deployment_name][ - mii.constants.DEPLOYMENT_NAME_KEY] = deployment.deployment_name - config_dict[deployment.deployment_name][ - mii.constants.TASK_NAME_KEY] = mii.utils.get_task_name(deployment.task) - config_dict[deployment.deployment_name][ - mii.constants.MODEL_NAME_KEY] = deployment.model - config_dict[deployment.deployment_name][ - mii.constants.ENABLE_DEEPSPEED_KEY] = deployment.enable_deepspeed - config_dict[deployment.deployment_name][ - mii.constants.MII_CONFIGS_KEY] = deployment.mii_config.dict() - config_dict[deployment.deployment_name][ - mii.constants.ENABLE_DEEPSPEED_ZERO_KEY] = deployment.enable_zero - config_dict[deployment.deployment_name][ - mii.constants.DEEPSPEED_CONFIG_KEY] = deployment.ds_config + deployment_config = { + mii.constants.DEPLOYMENT_NAME_KEY: deployment.deployment_name, + mii.constants.TASK_NAME_KEY: mii.utils.get_task_name(deployment.task), + mii.constants.MODEL_NAME_KEY: deployment.model, + mii.constants.ENABLE_DEEPSPEED_KEY: deployment.enable_deepspeed, + mii.constants.MII_CONFIGS_KEY: deployment.mii_config.dict(), + mii.constants.ENABLE_DEEPSPEED_ZERO_KEY: 
deployment.enable_zero, + mii.constants.DEEPSPEED_CONFIG_KEY: deployment.ds_config, + } + config_dict[deployment.deployment_name] = deployment_config if lb_config is not None: config_dict[mii.constants.LOAD_BALANCER_CONFIG_KEY] = lb_config From b4edc2bd8617fee80286b00e1deb223533837db3 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Thu, 13 Jul 2023 23:29:32 +0000 Subject: [PATCH 41/69] Refactoring Load Balancer and request_proto --- examples/multi_model/deploy.py | 8 +- examples/multi_model/query.py | 21 +- mii/client.py | 10 +- mii/grpc_related/modelresponse_server.py | 16 +- mii/grpc_related/proto/modelresponse.proto | 8 + mii/grpc_related/proto/modelresponse_pb2.py | 97 ++-- .../proto/modelresponse_pb2_grpc.py | 520 +++++++----------- mii/method_table.py | 12 +- 8 files changed, 301 insertions(+), 391 deletions(-) diff --git a/examples/multi_model/deploy.py b/examples/multi_model/deploy.py index 455ed498..03d2a3a0 100644 --- a/examples/multi_model/deploy.py +++ b/examples/multi_model/deploy.py @@ -15,7 +15,7 @@ name = "bigscience/bloom-560m" deployments.append( - mii.Deployment(task='text-generation', + mii.DeploymentConfig(task='text-generation', model=name, deployment_name=name + "_deployment", GPU_index_map=gpu_index_map3, @@ -24,14 +24,14 @@ # gpt2 name = "microsoft/DialogRPT-human-vs-rand" deployments.append( - mii.Deployment(task='text-classification', + mii.DeploymentConfig(task='text-classification', model=name, deployment_name=name + "_deployment", GPU_index_map=gpu_index_map2)) name = "microsoft/DialoGPT-large" deployments.append( - mii.Deployment(task='conversational', + mii.DeploymentConfig(task='conversational', model=name, deployment_name=name + "_deployment", GPU_index_map=gpu_index_map1, @@ -39,7 +39,7 @@ name = "deepset/roberta-large-squad2" deployments.append( - mii.Deployment(task="question-answering", + mii.DeploymentConfig(task="question-answering", model=name, deployment_name=name + "-qa-deployment", GPU_index_map=gpu_index_map2)) diff --git a/examples/multi_model/query.py b/examples/multi_model/query.py index 519953a1..caf85934 100644 --- a/examples/multi_model/query.py +++ b/examples/multi_model/query.py @@ -9,15 +9,16 @@ generator = mii.mii_query_handle("multi_models") result = generator.query( {"query": ["DeepSpeed is", - "Seattle is"]}, - "bloom560m_deployment", + "Seattle is"], + + "deployment_name": "bigscience/bloom-560m_deployment" + }, do_sample=True, max_new_tokens=30, ) results.append(result) -result = generator.query({'query': "DeepSpeed is the greatest"}, - "microsoft/DialogRPT-human-vs-rand_deployment") +result = generator.query({'query': "DeepSpeed is the greatest", "deployment_name": "microsoft/DialogRPT-human-vs-rand_deployment"}) results.append(result) result = generator.query( @@ -25,15 +26,15 @@ 'text': "DeepSpeed is the greatest", 'conversation_id': 3, 'past_user_inputs': [], - 'generated_responses': [] - }, - "microsoft/DialoGPT-large_deployment") + 'generated_responses': [], + "deployment_name": "microsoft/DialoGPT-large_deployment" + }) results.append(result) result = generator.query( { 'question': "What is the greatest?", - 'context': "DeepSpeed is the greatest" - }, - "deepset/roberta-large-squad2" + "-qa-deployment") + 'context': "DeepSpeed is the greatest", + "deployment_name": "deepset/roberta-large-squad2" + "-qa-deployment" + }) results.append(result) diff --git a/mii/client.py b/mii/client.py index 13131cbb..d71dce89 100644 --- a/mii/client.py +++ b/mii/client.py @@ -74,7 +74,7 @@ def _get_deployment_task(self, 
deployment_name=None):
             for deployment in self.deployments:
                 if deployment[mii.constants.DEPLOYMENT_NAME_KEY] == deployment_name:
                     task = get_task(deployment[mii.constants.TASK_NAME_KEY])
-                    break
+                    return deployment_name, task
             assert False, f"{deployment_name} not found in list of deployments"
         return deployment_name, task
 
@@ -87,9 +87,9 @@ async def _request_async_response(self, request_dict, task, **query_kwargs):
         proto_response = await getattr(self.stub, task_methods.method)(proto_request)
         return task_methods.unpack_response_from_proto(proto_response)
 
-    def query(self, request_dict, deployment_name=None, **query_kwargs):
+    def query(self, request_dict, **query_kwargs):
+        deployment_name = request_dict.get('deployment_name')
         deployment_name, task = self._get_deployment_task(deployment_name)
-        query_kwargs['deployment_name'] = deployment_name
         return self.asyncio_loop.run_until_complete(
             self._request_async_response(request_dict,
                                          task,
@@ -107,6 +107,8 @@ async def create_session_async(self, session_id):
             modelresponse_pb2.SessionID(session_id=session_id))
 
     def create_session(self, session_id, deployment_name=None):
+        if len(self.deployments) > 1:
+            assert deployment_name is not None, "Deployment name must be passed in to create session when there are multiple models"
         deployment_name, task = self._get_deployment_task(deployment_name)
         assert task == Tasks.TEXT_GENERATION, f"Session creation only available for task '{Tasks.TEXT_GENERATION}'."
         return self.asyncio_loop.run_until_complete(
@@ -117,6 +119,8 @@ async def destroy_session_async(self, session_id):
                                        )
 
     def destroy_session(self, session_id, deployment_name=None):
+        if len(self.deployments) > 1:
+            assert deployment_name is not None, "Deployment name must be passed in to destroy session when there are multiple models"
         deployment_name, task = self._get_deployment_task(deployment_name)
         assert task == Tasks.TEXT_GENERATION, f"Session deletion only available for task '{Tasks.TEXT_GENERATION}'."
self.asyncio_loop.run_until_complete(self.destroy_session_async(session_id)) diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index 20007b91..6f123a7f 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -212,6 +212,8 @@ def invoke_intercept_method(request_proto, context): google_dot_protobuf_dot_empty__pb2.Empty()) self.asyncio_loop.call_soon_threadsafe(self.asyncio_loop.stop) return next_handler.unary_unary(request_proto, context) + deployment_name = getattr(request_proto, 'deployment_name') + """ kwargs = unpack_proto_query_kwargs(request_proto.query_kwargs) if method_name != TERMINATE_METHOD: assert "deployment_name" in kwargs, "Must include deployment_name in kwargs for query" @@ -243,7 +245,7 @@ def invoke_intercept_method(request_proto, context): ) if method_name == "GeneratorReply" or method_name == "Txt2ImgReply" else str( request_proto.request) new_request = method.pack_request_to_proto(request_dict, **kwargs) - + """ call_count = self.counter[deployment_name].get_and_increment() replica_index = call_count % len(self.stubs[deployment_name]) @@ -254,25 +256,25 @@ def invoke_intercept_method(request_proto, context): self.replica_sessions[request_proto.session_id] = replica_index self.stubs[deployment_name][replica_index].invoke( CREATE_SESSION_METHOD, - new_request) + request_proto) return google_dot_protobuf_dot_empty__pb2.Empty() if method_name == DESTROY_SESSION_METHOD: replica_index = self.replica_sessions.pop(request_proto.session_id) self.stubs[deployment_name][replica_index].invoke( DESTROY_SESSION_METHOD, - new_request) + request_proto) return google_dot_protobuf_dot_empty__pb2.Empty() - - if "session_id" in kwargs: - session_id = kwargs["session_id"] + + if "session_id" in request_proto.query_kwargs: + session_id = request_proto.query_kwargs["session_id"] if session_id not in self.replica_sessions: raise ValueError(f"session not found") replica_index = self.replica_sessions[session_id] ret = self.stubs[deployment_name][replica_index].invoke( method_name, - new_request) + request_proto) return ret return grpc.unary_unary_rpc_method_handler( diff --git a/mii/grpc_related/proto/modelresponse.proto b/mii/grpc_related/proto/modelresponse.proto index a0698899..ce55522b 100644 --- a/mii/grpc_related/proto/modelresponse.proto +++ b/mii/grpc_related/proto/modelresponse.proto @@ -52,29 +52,34 @@ message SessionID { message SingleStringRequest { string request = 1; map query_kwargs = 2; + optional string deployment_name = 3; } message MultiStringRequest { repeated string request = 1; map query_kwargs = 2; + optional string deployment_name = 3; } message SingleStringReply { string response = 1; float time_taken = 2; float model_time_taken = 3; + optional string deployment_name = 4; } message MultiStringReply { repeated string response = 1; float time_taken = 2; float model_time_taken = 3; + optional string deployment_name = 4; } message QARequest { string question = 1; string context = 2; map query_kwargs = 3; + optional string deployment_name = 4; } message ConversationRequest { @@ -83,6 +88,7 @@ message ConversationRequest { repeated string past_user_inputs = 3; repeated string generated_responses = 4; map query_kwargs = 5; + optional string deployment_name = 6; } message ConversationReply { @@ -91,6 +97,7 @@ message ConversationReply { repeated string generated_responses = 3; float time_taken = 4; float model_time_taken = 5; + optional string deployment_name = 6; } message ImageReply 
{ @@ -100,4 +107,5 @@ message ImageReply { int64 size_w = 4; int64 size_h = 5; float time_taken = 6; + optional string deployment_name = 7; } diff --git a/mii/grpc_related/proto/modelresponse_pb2.py b/mii/grpc_related/proto/modelresponse_pb2.py index 76b1f994..53305ca5 100644 --- a/mii/grpc_related/proto/modelresponse_pb2.py +++ b/mii/grpc_related/proto/modelresponse_pb2.py @@ -1,66 +1,63 @@ -# Copyright (c) Microsoft Corporation. -# SPDX-License-Identifier: Apache-2.0 - -# DeepSpeed Team - +# -*- coding: utf-8 -*- # Generated by the protocol buffer compiler. DO NOT EDIT! # source: modelresponse.proto """Generated protocol buffer code.""" -from google.protobuf.internal import builder as _builder from google.protobuf import descriptor as _descriptor from google.protobuf import descriptor_pool as _descriptor_pool from google.protobuf import symbol_database as _symbol_database +from google.protobuf.internal import builder as _builder # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() + from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x42\x0e\n\x0coneof_values\"\x1f\n\tSessionID\x12\x12\n\nsession_id\x18\x01 \x01(\t\"\xbb\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\"\xb9\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\"S\n\x11SingleStringReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\"R\n\x10MultiStringReply\x12\x10\n\x08response\x18\x01 \x03(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\"\xb9\x01\n\tQARequest\x12\x10\n\x08question\x18\x01 \x01(\t\x12\x0f\n\x07\x63ontext\x18\x02 \x01(\t\x12?\n\x0cquery_kwargs\x18\x03 \x03(\x0b\x32).modelresponse.QARequest.QueryKwargsEntry\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\"\xa1\x02\n\x13\x43onversationRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x1c\n\x0f\x63onversation_id\x18\x02 \x01(\x03H\x00\x88\x01\x01\x12\x18\n\x10past_user_inputs\x18\x03 \x03(\t\x12\x1b\n\x13generated_responses\x18\x04 \x03(\t\x12I\n\x0cquery_kwargs\x18\x05 \x03(\x0b\x32\x33.modelresponse.ConversationRequest.QueryKwargsEntry\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_conversation_id\"\x91\x01\n\x11\x43onversationReply\x12\x17\n\x0f\x63onversation_id\x18\x01 \x01(\x03\x12\x18\n\x10past_user_inputs\x18\x02 \x03(\t\x12\x1b\n\x13generated_responses\x18\x03 \x03(\t\x12\x12\n\ntime_taken\x18\x04 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x05 
\x01(\x02\"}\n\nImageReply\x12\x0e\n\x06images\x18\x01 \x03(\x0c\x12\x1d\n\x15nsfw_content_detected\x18\x02 \x03(\x08\x12\x0c\n\x04mode\x18\x03 \x01(\t\x12\x0e\n\x06size_w\x18\x04 \x01(\x03\x12\x0e\n\x06size_h\x18\x05 \x01(\x03\x12\x12\n\ntime_taken\x18\x06 \x01(\x02\x32\xd4\x06\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x62\x06proto3' -) -_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) -_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'modelresponse_pb2', globals()) +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x42\x0e\n\x0coneof_values\"\x1f\n\tSessionID\x12\x12\n\nsession_id\x18\x01 \x01(\t\"\xed\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\x85\x01\n\x11SingleStringReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x84\x01\n\x10MultiStringReply\x12\x10\n\x08response\x18\x01 \x03(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\tQARequest\x12\x10\n\x08question\x18\x01 \x01(\t\x12\x0f\n\x07\x63ontext\x18\x02 \x01(\t\x12?\n\x0cquery_kwargs\x18\x03 \x03(\x0b\x32).modelresponse.QARequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 
\x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xd3\x02\n\x13\x43onversationRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x1c\n\x0f\x63onversation_id\x18\x02 \x01(\x03H\x00\x88\x01\x01\x12\x18\n\x10past_user_inputs\x18\x03 \x03(\t\x12\x1b\n\x13generated_responses\x18\x04 \x03(\t\x12I\n\x0cquery_kwargs\x18\x05 \x03(\x0b\x32\x33.modelresponse.ConversationRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x01\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_conversation_idB\x12\n\x10_deployment_name\"\xc3\x01\n\x11\x43onversationReply\x12\x17\n\x0f\x63onversation_id\x18\x01 \x01(\x03\x12\x18\n\x10past_user_inputs\x18\x02 \x03(\t\x12\x1b\n\x13generated_responses\x18\x03 \x03(\t\x12\x12\n\ntime_taken\x18\x04 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x05 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xaf\x01\n\nImageReply\x12\x0e\n\x06images\x18\x01 \x03(\x0c\x12\x1d\n\x15nsfw_content_detected\x18\x02 \x03(\x08\x12\x0c\n\x04mode\x18\x03 \x01(\t\x12\x0e\n\x06size_w\x18\x04 \x01(\x03\x12\x0e\n\x06size_h\x18\x05 \x01(\x03\x12\x12\n\ntime_taken\x18\x06 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x07 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name2\xd4\x06\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x62\x06proto3') + +_globals = globals() +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'modelresponse_pb2', _globals) if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None - _SINGLESTRINGREQUEST_QUERYKWARGSENTRY._options = None - _SINGLESTRINGREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' - _MULTISTRINGREQUEST_QUERYKWARGSENTRY._options = None - _MULTISTRINGREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' - _QAREQUEST_QUERYKWARGSENTRY._options = None - _QAREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' - _CONVERSATIONREQUEST_QUERYKWARGSENTRY._options = None - _CONVERSATIONREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' - _VALUE._serialized_start = 67 - _VALUE._serialized_end = 162 - _SESSIONID._serialized_start = 164 - _SESSIONID._serialized_end = 195 - _SINGLESTRINGREQUEST._serialized_start = 198 - 
_SINGLESTRINGREQUEST._serialized_end = 385 - _SINGLESTRINGREQUEST_QUERYKWARGSENTRY._serialized_start = 313 - _SINGLESTRINGREQUEST_QUERYKWARGSENTRY._serialized_end = 385 - _MULTISTRINGREQUEST._serialized_start = 388 - _MULTISTRINGREQUEST._serialized_end = 573 - _MULTISTRINGREQUEST_QUERYKWARGSENTRY._serialized_start = 313 - _MULTISTRINGREQUEST_QUERYKWARGSENTRY._serialized_end = 385 - _SINGLESTRINGREPLY._serialized_start = 575 - _SINGLESTRINGREPLY._serialized_end = 658 - _MULTISTRINGREPLY._serialized_start = 660 - _MULTISTRINGREPLY._serialized_end = 742 - _QAREQUEST._serialized_start = 745 - _QAREQUEST._serialized_end = 930 - _QAREQUEST_QUERYKWARGSENTRY._serialized_start = 313 - _QAREQUEST_QUERYKWARGSENTRY._serialized_end = 385 - _CONVERSATIONREQUEST._serialized_start = 933 - _CONVERSATIONREQUEST._serialized_end = 1222 - _CONVERSATIONREQUEST_QUERYKWARGSENTRY._serialized_start = 313 - _CONVERSATIONREQUEST_QUERYKWARGSENTRY._serialized_end = 385 - _CONVERSATIONREPLY._serialized_start = 1225 - _CONVERSATIONREPLY._serialized_end = 1370 - _IMAGEREPLY._serialized_start = 1372 - _IMAGEREPLY._serialized_end = 1497 - _MODELRESPONSE._serialized_start = 1500 - _MODELRESPONSE._serialized_end = 2352 + DESCRIPTOR._options = None + _SINGLESTRINGREQUEST_QUERYKWARGSENTRY._options = None + _SINGLESTRINGREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' + _MULTISTRINGREQUEST_QUERYKWARGSENTRY._options = None + _MULTISTRINGREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' + _QAREQUEST_QUERYKWARGSENTRY._options = None + _QAREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' + _CONVERSATIONREQUEST_QUERYKWARGSENTRY._options = None + _CONVERSATIONREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' + _globals['_VALUE']._serialized_start=67 + _globals['_VALUE']._serialized_end=162 + _globals['_SESSIONID']._serialized_start=164 + _globals['_SESSIONID']._serialized_end=195 + _globals['_SINGLESTRINGREQUEST']._serialized_start=198 + _globals['_SINGLESTRINGREQUEST']._serialized_end=435 + _globals['_SINGLESTRINGREQUEST_QUERYKWARGSENTRY']._serialized_start=343 + _globals['_SINGLESTRINGREQUEST_QUERYKWARGSENTRY']._serialized_end=415 + _globals['_MULTISTRINGREQUEST']._serialized_start=438 + _globals['_MULTISTRINGREQUEST']._serialized_end=673 + _globals['_MULTISTRINGREQUEST_QUERYKWARGSENTRY']._serialized_start=343 + _globals['_MULTISTRINGREQUEST_QUERYKWARGSENTRY']._serialized_end=415 + _globals['_SINGLESTRINGREPLY']._serialized_start=676 + _globals['_SINGLESTRINGREPLY']._serialized_end=809 + _globals['_MULTISTRINGREPLY']._serialized_start=812 + _globals['_MULTISTRINGREPLY']._serialized_end=944 + _globals['_QAREQUEST']._serialized_start=947 + _globals['_QAREQUEST']._serialized_end=1182 + _globals['_QAREQUEST_QUERYKWARGSENTRY']._serialized_start=343 + _globals['_QAREQUEST_QUERYKWARGSENTRY']._serialized_end=415 + _globals['_CONVERSATIONREQUEST']._serialized_start=1185 + _globals['_CONVERSATIONREQUEST']._serialized_end=1524 + _globals['_CONVERSATIONREQUEST_QUERYKWARGSENTRY']._serialized_start=343 + _globals['_CONVERSATIONREQUEST_QUERYKWARGSENTRY']._serialized_end=415 + _globals['_CONVERSATIONREPLY']._serialized_start=1527 + _globals['_CONVERSATIONREPLY']._serialized_end=1722 + _globals['_IMAGEREPLY']._serialized_start=1725 + _globals['_IMAGEREPLY']._serialized_end=1900 + _globals['_MODELRESPONSE']._serialized_start=1903 + _globals['_MODELRESPONSE']._serialized_end=2755 # @@protoc_insertion_point(module_scope) diff --git a/mii/grpc_related/proto/modelresponse_pb2_grpc.py 
b/mii/grpc_related/proto/modelresponse_pb2_grpc.py index 95cfa825..683e4962 100644 --- a/mii/grpc_related/proto/modelresponse_pb2_grpc.py +++ b/mii/grpc_related/proto/modelresponse_pb2_grpc.py @@ -1,8 +1,3 @@ -# Copyright (c) Microsoft Corporation. -# SPDX-License-Identifier: Apache-2.0 - -# DeepSpeed Team - # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! """Client and server classes corresponding to protobuf-defined services.""" import grpc @@ -13,6 +8,7 @@ class ModelResponseStub(object): """Missing associated documentation comment in .proto file.""" + def __init__(self, channel): """Constructor. @@ -20,60 +16,60 @@ def __init__(self, channel): channel: A grpc.Channel. """ self.Terminate = channel.unary_unary( - '/modelresponse.ModelResponse/Terminate', - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty. - SerializeToString, - response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - ) + '/modelresponse.ModelResponse/Terminate', + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) self.CreateSession = channel.unary_unary( - '/modelresponse.ModelResponse/CreateSession', - request_serializer=modelresponse__pb2.SessionID.SerializeToString, - response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - ) + '/modelresponse.ModelResponse/CreateSession', + request_serializer=modelresponse__pb2.SessionID.SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) self.DestroySession = channel.unary_unary( - '/modelresponse.ModelResponse/DestroySession', - request_serializer=modelresponse__pb2.SessionID.SerializeToString, - response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - ) + '/modelresponse.ModelResponse/DestroySession', + request_serializer=modelresponse__pb2.SessionID.SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) self.GeneratorReply = channel.unary_unary( - '/modelresponse.ModelResponse/GeneratorReply', - request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.MultiStringReply.FromString, - ) + '/modelresponse.ModelResponse/GeneratorReply', + request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.MultiStringReply.FromString, + ) self.ClassificationReply = channel.unary_unary( - '/modelresponse.ModelResponse/ClassificationReply', - request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) + '/modelresponse.ModelResponse/ClassificationReply', + request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) self.QuestionAndAnswerReply = channel.unary_unary( - '/modelresponse.ModelResponse/QuestionAndAnswerReply', - request_serializer=modelresponse__pb2.QARequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) + '/modelresponse.ModelResponse/QuestionAndAnswerReply', + request_serializer=modelresponse__pb2.QARequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) self.FillMaskReply = channel.unary_unary( - '/modelresponse.ModelResponse/FillMaskReply', - 
request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) + '/modelresponse.ModelResponse/FillMaskReply', + request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) self.TokenClassificationReply = channel.unary_unary( - '/modelresponse.ModelResponse/TokenClassificationReply', - request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) + '/modelresponse.ModelResponse/TokenClassificationReply', + request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) self.ConversationalReply = channel.unary_unary( - '/modelresponse.ModelResponse/ConversationalReply', - request_serializer=modelresponse__pb2.ConversationRequest.SerializeToString, - response_deserializer=modelresponse__pb2.ConversationReply.FromString, - ) + '/modelresponse.ModelResponse/ConversationalReply', + request_serializer=modelresponse__pb2.ConversationRequest.SerializeToString, + response_deserializer=modelresponse__pb2.ConversationReply.FromString, + ) self.Txt2ImgReply = channel.unary_unary( - '/modelresponse.ModelResponse/Txt2ImgReply', - request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.ImageReply.FromString, - ) + '/modelresponse.ModelResponse/Txt2ImgReply', + request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.ImageReply.FromString, + ) class ModelResponseServicer(object): """Missing associated documentation comment in .proto file.""" + def Terminate(self, request, context): """Missing associated documentation comment in .proto file.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) @@ -137,334 +133,232 @@ def Txt2ImgReply(self, request, context): def add_ModelResponseServicer_to_server(servicer, server): rpc_method_handlers = { - 'Terminate': - grpc.unary_unary_rpc_method_handler( - servicer.Terminate, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. - SerializeToString, - ), - 'CreateSession': - grpc.unary_unary_rpc_method_handler( - servicer.CreateSession, - request_deserializer=modelresponse__pb2.SessionID.FromString, - response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. - SerializeToString, - ), - 'DestroySession': - grpc.unary_unary_rpc_method_handler( - servicer.DestroySession, - request_deserializer=modelresponse__pb2.SessionID.FromString, - response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. 
- SerializeToString, - ), - 'GeneratorReply': - grpc.unary_unary_rpc_method_handler( - servicer.GeneratorReply, - request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, - response_serializer=modelresponse__pb2.MultiStringReply.SerializeToString, - ), - 'ClassificationReply': - grpc.unary_unary_rpc_method_handler( - servicer.ClassificationReply, - request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'QuestionAndAnswerReply': - grpc.unary_unary_rpc_method_handler( - servicer.QuestionAndAnswerReply, - request_deserializer=modelresponse__pb2.QARequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'FillMaskReply': - grpc.unary_unary_rpc_method_handler( - servicer.FillMaskReply, - request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'TokenClassificationReply': - grpc.unary_unary_rpc_method_handler( - servicer.TokenClassificationReply, - request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'ConversationalReply': - grpc.unary_unary_rpc_method_handler( - servicer.ConversationalReply, - request_deserializer=modelresponse__pb2.ConversationRequest.FromString, - response_serializer=modelresponse__pb2.ConversationReply.SerializeToString, - ), - 'Txt2ImgReply': - grpc.unary_unary_rpc_method_handler( - servicer.Txt2ImgReply, - request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, - response_serializer=modelresponse__pb2.ImageReply.SerializeToString, - ), + 'Terminate': grpc.unary_unary_rpc_method_handler( + servicer.Terminate, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + ), + 'CreateSession': grpc.unary_unary_rpc_method_handler( + servicer.CreateSession, + request_deserializer=modelresponse__pb2.SessionID.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + ), + 'DestroySession': grpc.unary_unary_rpc_method_handler( + servicer.DestroySession, + request_deserializer=modelresponse__pb2.SessionID.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + ), + 'GeneratorReply': grpc.unary_unary_rpc_method_handler( + servicer.GeneratorReply, + request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, + response_serializer=modelresponse__pb2.MultiStringReply.SerializeToString, + ), + 'ClassificationReply': grpc.unary_unary_rpc_method_handler( + servicer.ClassificationReply, + request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'QuestionAndAnswerReply': grpc.unary_unary_rpc_method_handler( + servicer.QuestionAndAnswerReply, + request_deserializer=modelresponse__pb2.QARequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'FillMaskReply': grpc.unary_unary_rpc_method_handler( + servicer.FillMaskReply, + request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'TokenClassificationReply': grpc.unary_unary_rpc_method_handler( + servicer.TokenClassificationReply, + 
request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'ConversationalReply': grpc.unary_unary_rpc_method_handler( + servicer.ConversationalReply, + request_deserializer=modelresponse__pb2.ConversationRequest.FromString, + response_serializer=modelresponse__pb2.ConversationReply.SerializeToString, + ), + 'Txt2ImgReply': grpc.unary_unary_rpc_method_handler( + servicer.Txt2ImgReply, + request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, + response_serializer=modelresponse__pb2.ImageReply.SerializeToString, + ), } - generic_handler = grpc.method_handlers_generic_handler('modelresponse.ModelResponse', - rpc_method_handlers) - server.add_generic_rpc_handlers((generic_handler, )) + generic_handler = grpc.method_handlers_generic_handler( + 'modelresponse.ModelResponse', rpc_method_handlers) + server.add_generic_rpc_handlers((generic_handler,)) -# This class is part of an EXPERIMENTAL API. + # This class is part of an EXPERIMENTAL API. class ModelResponse(object): """Missing associated documentation comment in .proto file.""" + @staticmethod def Terminate(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, target, - '/modelresponse.ModelResponse/Terminate', + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/Terminate', google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, google_dot_protobuf_dot_empty__pb2.Empty.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod def CreateSession(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, target, - '/modelresponse.ModelResponse/CreateSession', + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/CreateSession', modelresponse__pb2.SessionID.SerializeToString, google_dot_protobuf_dot_empty__pb2.Empty.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod def DestroySession(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, target, - '/modelresponse.ModelResponse/DestroySession', + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, 
'/modelresponse.ModelResponse/DestroySession', modelresponse__pb2.SessionID.SerializeToString, google_dot_protobuf_dot_empty__pb2.Empty.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod def GeneratorReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, target, - '/modelresponse.ModelResponse/GeneratorReply', + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/GeneratorReply', modelresponse__pb2.MultiStringRequest.SerializeToString, modelresponse__pb2.MultiStringReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod def ClassificationReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, target, - '/modelresponse.ModelResponse/ClassificationReply', + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/ClassificationReply', modelresponse__pb2.SingleStringRequest.SerializeToString, modelresponse__pb2.SingleStringReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod def QuestionAndAnswerReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, target, - '/modelresponse.ModelResponse/QuestionAndAnswerReply', + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/QuestionAndAnswerReply', modelresponse__pb2.QARequest.SerializeToString, modelresponse__pb2.SingleStringReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod def FillMaskReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, target, - '/modelresponse.ModelResponse/FillMaskReply', + options=(), + channel_credentials=None, + 
call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/FillMaskReply', modelresponse__pb2.SingleStringRequest.SerializeToString, modelresponse__pb2.SingleStringReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod def TokenClassificationReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, target, - '/modelresponse.ModelResponse/TokenClassificationReply', + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/TokenClassificationReply', modelresponse__pb2.SingleStringRequest.SerializeToString, modelresponse__pb2.SingleStringReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod def ConversationalReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, target, - '/modelresponse.ModelResponse/ConversationalReply', + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/ConversationalReply', modelresponse__pb2.ConversationRequest.SerializeToString, modelresponse__pb2.ConversationReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod def Txt2ImgReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, target, - '/modelresponse.ModelResponse/Txt2ImgReply', + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/Txt2ImgReply', modelresponse__pb2.MultiStringRequest.SerializeToString, modelresponse__pb2.ImageReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) diff --git a/mii/method_table.py b/mii/method_table.py index c412f446..f7f87d28 100644 --- a/mii/method_table.py +++ b/mii/method_table.py @@ -13,7 +13,8 @@ def 
single_string_request_to_proto(self, request_dict, **query_kwargs): return modelresponse_pb2.SingleStringRequest( request=request_dict['query'], - query_kwargs=kwarg_dict_to_proto(query_kwargs)) + query_kwargs=kwarg_dict_to_proto(query_kwargs), + deployment_name=request_dict.get('deployment_name')) def single_string_response_to_proto(self, response, time_taken, model_time_taken): @@ -26,7 +27,8 @@ def multi_string_request_to_proto(self, request_dict, **query_kwargs): return modelresponse_pb2.MultiStringRequest( request=request_dict['query'] if isinstance(request_dict['query'], list) else [request_dict['query']], - query_kwargs=kwarg_dict_to_proto(query_kwargs)) + query_kwargs=kwarg_dict_to_proto(query_kwargs), + deployment_name=request_dict.get('deployment_name')) def proto_request_to_single_input(self, request): @@ -143,7 +145,8 @@ def pack_request_to_proto(self, request_dict, **query_kwargs): return modelresponse_pb2.QARequest( question=request_dict['question'], context=request_dict['context'], - query_kwargs=kwarg_dict_to_proto(query_kwargs)) + query_kwargs=kwarg_dict_to_proto(query_kwargs), + deployment_name=request_dict.get('deployment_name')) def unpack_request_from_proto(self, request): kwargs = unpack_proto_query_kwargs(request.query_kwargs) @@ -222,7 +225,8 @@ def pack_request_to_proto(self, request_dict, **query_kwargs): if 'conversation_id' in request_dict else None, past_user_inputs=request_dict['past_user_inputs'], generated_responses=request_dict['generated_responses'], - query_kwargs=kwarg_dict_to_proto(query_kwargs)) + query_kwargs=kwarg_dict_to_proto(query_kwargs), + deployment_name=request_dict.get('deployment_name')) class Text2ImgMethods(TaskMethods): From 63461949e0e44be45958dba53da9f24f7983e793 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Thu, 13 Jul 2023 23:37:38 +0000 Subject: [PATCH 42/69] Formatting --- examples/multi_model/deploy.py | 28 +- examples/multi_model/query.py | 44 +- mii/grpc_related/modelresponse_server.py | 3 +- mii/grpc_related/proto/modelresponse_pb2.py | 90 +-- .../proto/modelresponse_pb2_grpc.py | 520 +++++++++++------- 5 files changed, 400 insertions(+), 285 deletions(-) diff --git a/examples/multi_model/deploy.py b/examples/multi_model/deploy.py index 03d2a3a0..f0408da7 100644 --- a/examples/multi_model/deploy.py +++ b/examples/multi_model/deploy.py @@ -16,32 +16,32 @@ name = "bigscience/bloom-560m" deployments.append( mii.DeploymentConfig(task='text-generation', - model=name, - deployment_name=name + "_deployment", - GPU_index_map=gpu_index_map3, - mii_config=mii.config.MIIConfig(**mii_configs1))) + model=name, + deployment_name=name + "_deployment", + GPU_index_map=gpu_index_map3, + mii_config=mii.config.MIIConfig(**mii_configs1))) # gpt2 name = "microsoft/DialogRPT-human-vs-rand" deployments.append( mii.DeploymentConfig(task='text-classification', - model=name, - deployment_name=name + "_deployment", - GPU_index_map=gpu_index_map2)) + model=name, + deployment_name=name + "_deployment", + GPU_index_map=gpu_index_map2)) name = "microsoft/DialoGPT-large" deployments.append( mii.DeploymentConfig(task='conversational', - model=name, - deployment_name=name + "_deployment", - GPU_index_map=gpu_index_map1, - mii_config=mii.config.MIIConfig(**mii_configs2))) + model=name, + deployment_name=name + "_deployment", + GPU_index_map=gpu_index_map1, + mii_config=mii.config.MIIConfig(**mii_configs2))) name = "deepset/roberta-large-squad2" deployments.append( mii.DeploymentConfig(task="question-answering", - model=name, - deployment_name=name + 
"-qa-deployment", - GPU_index_map=gpu_index_map2)) + model=name, + deployment_name=name + "-qa-deployment", + GPU_index_map=gpu_index_map2)) mii.deploy(deployment_tag="multi_models", deployments=deployments) diff --git a/examples/multi_model/query.py b/examples/multi_model/query.py index caf85934..bf760b49 100644 --- a/examples/multi_model/query.py +++ b/examples/multi_model/query.py @@ -8,33 +8,39 @@ results = [] generator = mii.mii_query_handle("multi_models") result = generator.query( - {"query": ["DeepSpeed is", - "Seattle is"], - - "deployment_name": "bigscience/bloom-560m_deployment" + { + "query": ["DeepSpeed is", + "Seattle is"], + "deployment_name": "bigscience/bloom-560m_deployment" }, do_sample=True, max_new_tokens=30, ) results.append(result) -result = generator.query({'query': "DeepSpeed is the greatest", "deployment_name": "microsoft/DialogRPT-human-vs-rand_deployment"}) +result = generator.query({ + 'query': + "DeepSpeed is the greatest", + "deployment_name": + "microsoft/DialogRPT-human-vs-rand_deployment" +}) results.append(result) -result = generator.query( - { - 'text': "DeepSpeed is the greatest", - 'conversation_id': 3, - 'past_user_inputs': [], - 'generated_responses': [], - "deployment_name": "microsoft/DialoGPT-large_deployment" - }) +result = generator.query({ + 'text': "DeepSpeed is the greatest", + 'conversation_id': 3, + 'past_user_inputs': [], + 'generated_responses': [], + "deployment_name": "microsoft/DialoGPT-large_deployment" +}) results.append(result) -result = generator.query( - { - 'question': "What is the greatest?", - 'context': "DeepSpeed is the greatest", - "deployment_name": "deepset/roberta-large-squad2" + "-qa-deployment" - }) +result = generator.query({ + 'question': + "What is the greatest?", + 'context': + "DeepSpeed is the greatest", + "deployment_name": + "deepset/roberta-large-squad2" + "-qa-deployment" +}) results.append(result) diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index 6f123a7f..94eec2d2 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -16,7 +16,6 @@ from mii.constants import GRPC_MAX_MSG_SIZE, CREATE_SESSION_METHOD, DESTROY_SESSION_METHOD, TERMINATE_METHOD, LB_MAX_WORKER_THREADS, SERVER_SHUTDOWN_TIMEOUT, Tasks from mii.method_table import GRPC_METHOD_TABLE from mii.client import create_channel -from mii.utils import get_task, unpack_proto_query_kwargs class ServiceBase(modelresponse_pb2_grpc.ModelResponseServicer): @@ -265,7 +264,7 @@ def invoke_intercept_method(request_proto, context): DESTROY_SESSION_METHOD, request_proto) return google_dot_protobuf_dot_empty__pb2.Empty() - + if "session_id" in request_proto.query_kwargs: session_id = request_proto.query_kwargs["session_id"] if session_id not in self.replica_sessions: diff --git a/mii/grpc_related/proto/modelresponse_pb2.py b/mii/grpc_related/proto/modelresponse_pb2.py index 53305ca5..7802b7f2 100644 --- a/mii/grpc_related/proto/modelresponse_pb2.py +++ b/mii/grpc_related/proto/modelresponse_pb2.py @@ -1,4 +1,8 @@ -# -*- coding: utf-8 -*- +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + # Generated by the protocol buffer compiler. DO NOT EDIT! 
# source: modelresponse.proto """Generated protocol buffer code.""" @@ -10,54 +14,54 @@ _sym_db = _symbol_database.Default() - from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 - -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x42\x0e\n\x0coneof_values\"\x1f\n\tSessionID\x12\x12\n\nsession_id\x18\x01 \x01(\t\"\xed\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\x85\x01\n\x11SingleStringReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x84\x01\n\x10MultiStringReply\x12\x10\n\x08response\x18\x01 \x03(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\tQARequest\x12\x10\n\x08question\x18\x01 \x01(\t\x12\x0f\n\x07\x63ontext\x18\x02 \x01(\t\x12?\n\x0cquery_kwargs\x18\x03 \x03(\x0b\x32).modelresponse.QARequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xd3\x02\n\x13\x43onversationRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x1c\n\x0f\x63onversation_id\x18\x02 \x01(\x03H\x00\x88\x01\x01\x12\x18\n\x10past_user_inputs\x18\x03 \x03(\t\x12\x1b\n\x13generated_responses\x18\x04 \x03(\t\x12I\n\x0cquery_kwargs\x18\x05 \x03(\x0b\x32\x33.modelresponse.ConversationRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x01\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_conversation_idB\x12\n\x10_deployment_name\"\xc3\x01\n\x11\x43onversationReply\x12\x17\n\x0f\x63onversation_id\x18\x01 \x01(\x03\x12\x18\n\x10past_user_inputs\x18\x02 \x03(\t\x12\x1b\n\x13generated_responses\x18\x03 \x03(\t\x12\x12\n\ntime_taken\x18\x04 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x05 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xaf\x01\n\nImageReply\x12\x0e\n\x06images\x18\x01 \x03(\x0c\x12\x1d\n\x15nsfw_content_detected\x18\x02 \x03(\x08\x12\x0c\n\x04mode\x18\x03 \x01(\t\x12\x0e\n\x06size_w\x18\x04 \x01(\x03\x12\x0e\n\x06size_h\x18\x05 
\x01(\x03\x12\x12\n\ntime_taken\x18\x06 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x07 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name2\xd4\x06\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x62\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( + b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x42\x0e\n\x0coneof_values\"\x1f\n\tSessionID\x12\x12\n\nsession_id\x18\x01 \x01(\t\"\xed\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\x85\x01\n\x11SingleStringReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x84\x01\n\x10MultiStringReply\x12\x10\n\x08response\x18\x01 \x03(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\tQARequest\x12\x10\n\x08question\x18\x01 \x01(\t\x12\x0f\n\x07\x63ontext\x18\x02 \x01(\t\x12?\n\x0cquery_kwargs\x18\x03 \x03(\x0b\x32).modelresponse.QARequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xd3\x02\n\x13\x43onversationRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x1c\n\x0f\x63onversation_id\x18\x02 
\x01(\x03H\x00\x88\x01\x01\x12\x18\n\x10past_user_inputs\x18\x03 \x03(\t\x12\x1b\n\x13generated_responses\x18\x04 \x03(\t\x12I\n\x0cquery_kwargs\x18\x05 \x03(\x0b\x32\x33.modelresponse.ConversationRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x01\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_conversation_idB\x12\n\x10_deployment_name\"\xc3\x01\n\x11\x43onversationReply\x12\x17\n\x0f\x63onversation_id\x18\x01 \x01(\x03\x12\x18\n\x10past_user_inputs\x18\x02 \x03(\t\x12\x1b\n\x13generated_responses\x18\x03 \x03(\t\x12\x12\n\ntime_taken\x18\x04 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x05 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xaf\x01\n\nImageReply\x12\x0e\n\x06images\x18\x01 \x03(\x0c\x12\x1d\n\x15nsfw_content_detected\x18\x02 \x03(\x08\x12\x0c\n\x04mode\x18\x03 \x01(\t\x12\x0e\n\x06size_w\x18\x04 \x01(\x03\x12\x0e\n\x06size_h\x18\x05 \x01(\x03\x12\x12\n\ntime_taken\x18\x06 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x07 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name2\xd4\x06\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x62\x06proto3' +) _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'modelresponse_pb2', _globals) if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None - _SINGLESTRINGREQUEST_QUERYKWARGSENTRY._options = None - _SINGLESTRINGREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' - _MULTISTRINGREQUEST_QUERYKWARGSENTRY._options = None - _MULTISTRINGREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' - _QAREQUEST_QUERYKWARGSENTRY._options = None - _QAREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' - _CONVERSATIONREQUEST_QUERYKWARGSENTRY._options = None - _CONVERSATIONREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' - _globals['_VALUE']._serialized_start=67 - _globals['_VALUE']._serialized_end=162 - _globals['_SESSIONID']._serialized_start=164 - _globals['_SESSIONID']._serialized_end=195 - _globals['_SINGLESTRINGREQUEST']._serialized_start=198 - _globals['_SINGLESTRINGREQUEST']._serialized_end=435 - _globals['_SINGLESTRINGREQUEST_QUERYKWARGSENTRY']._serialized_start=343 - _globals['_SINGLESTRINGREQUEST_QUERYKWARGSENTRY']._serialized_end=415 - _globals['_MULTISTRINGREQUEST']._serialized_start=438 - 
_globals['_MULTISTRINGREQUEST']._serialized_end=673 - _globals['_MULTISTRINGREQUEST_QUERYKWARGSENTRY']._serialized_start=343 - _globals['_MULTISTRINGREQUEST_QUERYKWARGSENTRY']._serialized_end=415 - _globals['_SINGLESTRINGREPLY']._serialized_start=676 - _globals['_SINGLESTRINGREPLY']._serialized_end=809 - _globals['_MULTISTRINGREPLY']._serialized_start=812 - _globals['_MULTISTRINGREPLY']._serialized_end=944 - _globals['_QAREQUEST']._serialized_start=947 - _globals['_QAREQUEST']._serialized_end=1182 - _globals['_QAREQUEST_QUERYKWARGSENTRY']._serialized_start=343 - _globals['_QAREQUEST_QUERYKWARGSENTRY']._serialized_end=415 - _globals['_CONVERSATIONREQUEST']._serialized_start=1185 - _globals['_CONVERSATIONREQUEST']._serialized_end=1524 - _globals['_CONVERSATIONREQUEST_QUERYKWARGSENTRY']._serialized_start=343 - _globals['_CONVERSATIONREQUEST_QUERYKWARGSENTRY']._serialized_end=415 - _globals['_CONVERSATIONREPLY']._serialized_start=1527 - _globals['_CONVERSATIONREPLY']._serialized_end=1722 - _globals['_IMAGEREPLY']._serialized_start=1725 - _globals['_IMAGEREPLY']._serialized_end=1900 - _globals['_MODELRESPONSE']._serialized_start=1903 - _globals['_MODELRESPONSE']._serialized_end=2755 + DESCRIPTOR._options = None + _SINGLESTRINGREQUEST_QUERYKWARGSENTRY._options = None + _SINGLESTRINGREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' + _MULTISTRINGREQUEST_QUERYKWARGSENTRY._options = None + _MULTISTRINGREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' + _QAREQUEST_QUERYKWARGSENTRY._options = None + _QAREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' + _CONVERSATIONREQUEST_QUERYKWARGSENTRY._options = None + _CONVERSATIONREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' + _globals['_VALUE']._serialized_start = 67 + _globals['_VALUE']._serialized_end = 162 + _globals['_SESSIONID']._serialized_start = 164 + _globals['_SESSIONID']._serialized_end = 195 + _globals['_SINGLESTRINGREQUEST']._serialized_start = 198 + _globals['_SINGLESTRINGREQUEST']._serialized_end = 435 + _globals['_SINGLESTRINGREQUEST_QUERYKWARGSENTRY']._serialized_start = 343 + _globals['_SINGLESTRINGREQUEST_QUERYKWARGSENTRY']._serialized_end = 415 + _globals['_MULTISTRINGREQUEST']._serialized_start = 438 + _globals['_MULTISTRINGREQUEST']._serialized_end = 673 + _globals['_MULTISTRINGREQUEST_QUERYKWARGSENTRY']._serialized_start = 343 + _globals['_MULTISTRINGREQUEST_QUERYKWARGSENTRY']._serialized_end = 415 + _globals['_SINGLESTRINGREPLY']._serialized_start = 676 + _globals['_SINGLESTRINGREPLY']._serialized_end = 809 + _globals['_MULTISTRINGREPLY']._serialized_start = 812 + _globals['_MULTISTRINGREPLY']._serialized_end = 944 + _globals['_QAREQUEST']._serialized_start = 947 + _globals['_QAREQUEST']._serialized_end = 1182 + _globals['_QAREQUEST_QUERYKWARGSENTRY']._serialized_start = 343 + _globals['_QAREQUEST_QUERYKWARGSENTRY']._serialized_end = 415 + _globals['_CONVERSATIONREQUEST']._serialized_start = 1185 + _globals['_CONVERSATIONREQUEST']._serialized_end = 1524 + _globals['_CONVERSATIONREQUEST_QUERYKWARGSENTRY']._serialized_start = 343 + _globals['_CONVERSATIONREQUEST_QUERYKWARGSENTRY']._serialized_end = 415 + _globals['_CONVERSATIONREPLY']._serialized_start = 1527 + _globals['_CONVERSATIONREPLY']._serialized_end = 1722 + _globals['_IMAGEREPLY']._serialized_start = 1725 + _globals['_IMAGEREPLY']._serialized_end = 1900 + _globals['_MODELRESPONSE']._serialized_start = 1903 + _globals['_MODELRESPONSE']._serialized_end = 2755 # @@protoc_insertion_point(module_scope) diff --git 
a/mii/grpc_related/proto/modelresponse_pb2_grpc.py b/mii/grpc_related/proto/modelresponse_pb2_grpc.py index 683e4962..95cfa825 100644 --- a/mii/grpc_related/proto/modelresponse_pb2_grpc.py +++ b/mii/grpc_related/proto/modelresponse_pb2_grpc.py @@ -1,3 +1,8 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! """Client and server classes corresponding to protobuf-defined services.""" import grpc @@ -8,7 +13,6 @@ class ModelResponseStub(object): """Missing associated documentation comment in .proto file.""" - def __init__(self, channel): """Constructor. @@ -16,60 +20,60 @@ def __init__(self, channel): channel: A grpc.Channel. """ self.Terminate = channel.unary_unary( - '/modelresponse.ModelResponse/Terminate', - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - ) + '/modelresponse.ModelResponse/Terminate', + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty. + SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) self.CreateSession = channel.unary_unary( - '/modelresponse.ModelResponse/CreateSession', - request_serializer=modelresponse__pb2.SessionID.SerializeToString, - response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - ) + '/modelresponse.ModelResponse/CreateSession', + request_serializer=modelresponse__pb2.SessionID.SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) self.DestroySession = channel.unary_unary( - '/modelresponse.ModelResponse/DestroySession', - request_serializer=modelresponse__pb2.SessionID.SerializeToString, - response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - ) + '/modelresponse.ModelResponse/DestroySession', + request_serializer=modelresponse__pb2.SessionID.SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) self.GeneratorReply = channel.unary_unary( - '/modelresponse.ModelResponse/GeneratorReply', - request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.MultiStringReply.FromString, - ) + '/modelresponse.ModelResponse/GeneratorReply', + request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.MultiStringReply.FromString, + ) self.ClassificationReply = channel.unary_unary( - '/modelresponse.ModelResponse/ClassificationReply', - request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) + '/modelresponse.ModelResponse/ClassificationReply', + request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) self.QuestionAndAnswerReply = channel.unary_unary( - '/modelresponse.ModelResponse/QuestionAndAnswerReply', - request_serializer=modelresponse__pb2.QARequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) + '/modelresponse.ModelResponse/QuestionAndAnswerReply', + request_serializer=modelresponse__pb2.QARequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) self.FillMaskReply = channel.unary_unary( - '/modelresponse.ModelResponse/FillMaskReply', - 
request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) + '/modelresponse.ModelResponse/FillMaskReply', + request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) self.TokenClassificationReply = channel.unary_unary( - '/modelresponse.ModelResponse/TokenClassificationReply', - request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) + '/modelresponse.ModelResponse/TokenClassificationReply', + request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) self.ConversationalReply = channel.unary_unary( - '/modelresponse.ModelResponse/ConversationalReply', - request_serializer=modelresponse__pb2.ConversationRequest.SerializeToString, - response_deserializer=modelresponse__pb2.ConversationReply.FromString, - ) + '/modelresponse.ModelResponse/ConversationalReply', + request_serializer=modelresponse__pb2.ConversationRequest.SerializeToString, + response_deserializer=modelresponse__pb2.ConversationReply.FromString, + ) self.Txt2ImgReply = channel.unary_unary( - '/modelresponse.ModelResponse/Txt2ImgReply', - request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.ImageReply.FromString, - ) + '/modelresponse.ModelResponse/Txt2ImgReply', + request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.ImageReply.FromString, + ) class ModelResponseServicer(object): """Missing associated documentation comment in .proto file.""" - def Terminate(self, request, context): """Missing associated documentation comment in .proto file.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) @@ -133,232 +137,334 @@ def Txt2ImgReply(self, request, context): def add_ModelResponseServicer_to_server(servicer, server): rpc_method_handlers = { - 'Terminate': grpc.unary_unary_rpc_method_handler( - servicer.Terminate, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - ), - 'CreateSession': grpc.unary_unary_rpc_method_handler( - servicer.CreateSession, - request_deserializer=modelresponse__pb2.SessionID.FromString, - response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - ), - 'DestroySession': grpc.unary_unary_rpc_method_handler( - servicer.DestroySession, - request_deserializer=modelresponse__pb2.SessionID.FromString, - response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - ), - 'GeneratorReply': grpc.unary_unary_rpc_method_handler( - servicer.GeneratorReply, - request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, - response_serializer=modelresponse__pb2.MultiStringReply.SerializeToString, - ), - 'ClassificationReply': grpc.unary_unary_rpc_method_handler( - servicer.ClassificationReply, - request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'QuestionAndAnswerReply': grpc.unary_unary_rpc_method_handler( - servicer.QuestionAndAnswerReply, - request_deserializer=modelresponse__pb2.QARequest.FromString, - 
response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'FillMaskReply': grpc.unary_unary_rpc_method_handler( - servicer.FillMaskReply, - request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'TokenClassificationReply': grpc.unary_unary_rpc_method_handler( - servicer.TokenClassificationReply, - request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'ConversationalReply': grpc.unary_unary_rpc_method_handler( - servicer.ConversationalReply, - request_deserializer=modelresponse__pb2.ConversationRequest.FromString, - response_serializer=modelresponse__pb2.ConversationReply.SerializeToString, - ), - 'Txt2ImgReply': grpc.unary_unary_rpc_method_handler( - servicer.Txt2ImgReply, - request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, - response_serializer=modelresponse__pb2.ImageReply.SerializeToString, - ), + 'Terminate': + grpc.unary_unary_rpc_method_handler( + servicer.Terminate, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. + SerializeToString, + ), + 'CreateSession': + grpc.unary_unary_rpc_method_handler( + servicer.CreateSession, + request_deserializer=modelresponse__pb2.SessionID.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. + SerializeToString, + ), + 'DestroySession': + grpc.unary_unary_rpc_method_handler( + servicer.DestroySession, + request_deserializer=modelresponse__pb2.SessionID.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. + SerializeToString, + ), + 'GeneratorReply': + grpc.unary_unary_rpc_method_handler( + servicer.GeneratorReply, + request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, + response_serializer=modelresponse__pb2.MultiStringReply.SerializeToString, + ), + 'ClassificationReply': + grpc.unary_unary_rpc_method_handler( + servicer.ClassificationReply, + request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'QuestionAndAnswerReply': + grpc.unary_unary_rpc_method_handler( + servicer.QuestionAndAnswerReply, + request_deserializer=modelresponse__pb2.QARequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'FillMaskReply': + grpc.unary_unary_rpc_method_handler( + servicer.FillMaskReply, + request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'TokenClassificationReply': + grpc.unary_unary_rpc_method_handler( + servicer.TokenClassificationReply, + request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'ConversationalReply': + grpc.unary_unary_rpc_method_handler( + servicer.ConversationalReply, + request_deserializer=modelresponse__pb2.ConversationRequest.FromString, + response_serializer=modelresponse__pb2.ConversationReply.SerializeToString, + ), + 'Txt2ImgReply': + grpc.unary_unary_rpc_method_handler( + servicer.Txt2ImgReply, + request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, + response_serializer=modelresponse__pb2.ImageReply.SerializeToString, + ), } - generic_handler = 
grpc.method_handlers_generic_handler( - 'modelresponse.ModelResponse', rpc_method_handlers) - server.add_generic_rpc_handlers((generic_handler,)) + generic_handler = grpc.method_handlers_generic_handler('modelresponse.ModelResponse', + rpc_method_handlers) + server.add_generic_rpc_handlers((generic_handler, )) - # This class is part of an EXPERIMENTAL API. +# This class is part of an EXPERIMENTAL API. class ModelResponse(object): """Missing associated documentation comment in .proto file.""" - @staticmethod def Terminate(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/Terminate', + '/modelresponse.ModelResponse/Terminate', google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, google_dot_protobuf_dot_empty__pb2.Empty.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def CreateSession(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/CreateSession', + '/modelresponse.ModelResponse/CreateSession', modelresponse__pb2.SessionID.SerializeToString, google_dot_protobuf_dot_empty__pb2.Empty.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def DestroySession(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/DestroySession', + '/modelresponse.ModelResponse/DestroySession', modelresponse__pb2.SessionID.SerializeToString, google_dot_protobuf_dot_empty__pb2.Empty.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def GeneratorReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - 
channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/GeneratorReply', + '/modelresponse.ModelResponse/GeneratorReply', modelresponse__pb2.MultiStringRequest.SerializeToString, modelresponse__pb2.MultiStringReply.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def ClassificationReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/ClassificationReply', + '/modelresponse.ModelResponse/ClassificationReply', modelresponse__pb2.SingleStringRequest.SerializeToString, modelresponse__pb2.SingleStringReply.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def QuestionAndAnswerReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/QuestionAndAnswerReply', + '/modelresponse.ModelResponse/QuestionAndAnswerReply', modelresponse__pb2.QARequest.SerializeToString, modelresponse__pb2.SingleStringReply.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def FillMaskReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/FillMaskReply', + '/modelresponse.ModelResponse/FillMaskReply', modelresponse__pb2.SingleStringRequest.SerializeToString, modelresponse__pb2.SingleStringReply.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def TokenClassificationReply(request, + target, + options=(), + channel_credentials=None, + 
call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/TokenClassificationReply', + '/modelresponse.ModelResponse/TokenClassificationReply', modelresponse__pb2.SingleStringRequest.SerializeToString, modelresponse__pb2.SingleStringReply.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def ConversationalReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/ConversationalReply', + '/modelresponse.ModelResponse/ConversationalReply', modelresponse__pb2.ConversationRequest.SerializeToString, modelresponse__pb2.ConversationReply.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def Txt2ImgReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/Txt2ImgReply', + '/modelresponse.ModelResponse/Txt2ImgReply', modelresponse__pb2.MultiStringRequest.SerializeToString, modelresponse__pb2.ImageReply.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) From 94b66993702d54223f377cdcd6a5af19f524236b Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Fri, 14 Jul 2023 16:51:58 +0000 Subject: [PATCH 43/69] Fixing the client --- mii/client.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mii/client.py b/mii/client.py index d71dce89..5d18b81c 100644 --- a/mii/client.py +++ b/mii/client.py @@ -90,6 +90,7 @@ async def _request_async_response(self, request_dict, task, **query_kwargs): def query(self, request_dict, **query_kwargs): deployment_name = request_dict.get('deployment_name') deployment_name, task = self._get_deployment_task(deployment_name) + request_dict['deployment_name'] = deployment_name return self.asyncio_loop.run_until_complete( self._request_async_response(request_dict, task, From 710c20b901634dff1602a1e53f59010022d52b2c Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Fri, 21 
Jul 2023 00:02:42 +0000 Subject: [PATCH 44/69] Initial partial deployment commit --- mii/__init__.py | 5 + mii/client.py | 54 +- mii/config.py | 1 + mii/constants.py | 3 +- mii/deployment.py | 55 +- mii/grpc_related/modelresponse_server.py | 22 +- mii/grpc_related/proto/modelresponse.proto | 4 + mii/grpc_related/proto/modelresponse_pb2.py | 92 ++- .../proto/modelresponse_pb2_grpc.py | 581 ++++++++---------- 9 files changed, 439 insertions(+), 378 deletions(-) diff --git a/mii/__init__.py b/mii/__init__.py index 66748a56..aba912f3 100644 --- a/mii/__init__.py +++ b/mii/__init__.py @@ -15,6 +15,11 @@ __version__ = "0.0.0" non_persistent_models = {} +port_map = {} +deployment_tag = "" +lb_config = None +model_path = None +deployment_type = None try: from .version import __version__ except ImportError: diff --git a/mii/client.py b/mii/client.py index 5d18b81c..f76105ed 100644 --- a/mii/client.py +++ b/mii/client.py @@ -8,8 +8,10 @@ import mii from mii.utils import get_task from mii.grpc_related.proto import modelresponse_pb2, modelresponse_pb2_grpc -from mii.constants import GRPC_MAX_MSG_SIZE, Tasks +from mii.constants import GRPC_MAX_MSG_SIZE, Tasks, DeploymentType from mii.method_table import GRPC_METHOD_TABLE +from mii.deployment import allocate_processes, create_score_file +from mii.config import DeploymentConfig def _get_deployment_configs(deployment_tag): @@ -18,6 +20,7 @@ def _get_deployment_configs(deployment_tag): for deployment in configs: if not isinstance(configs[deployment], dict): continue + configs[deployment][mii.constants.DEPLOYED_KEY] = True deployments.append(configs[deployment]) return deployments @@ -60,7 +63,7 @@ class MIIClient(): def __init__(self, deployments, host, port): self.asyncio_loop = asyncio.get_event_loop() channel = create_channel(host, port) - self.stub = modelresponse_pb2_grpc.ModelResponseStub(channel) + self.stub = modelresponse_pb2_grpc.DeploymentManagementStub(channel) #self.task = get_task(task_name) self.deployments = deployments @@ -126,7 +129,54 @@ def destroy_session(self, session_id, deployment_name=None): assert task == Tasks.TEXT_GENERATION, f"Session deletion only available for task '{Tasks.TEXT_GENERATION}'." 
self.asyncio_loop.run_until_complete(self.destroy_session_async(session_id)) + async def add_models_async(self, request=None): + await getattr(self.stub, "AddDeployment")(modelresponse_pb2.google_dot_protobuf_dot_empty__pb2.Empty()) + + def add_models(self, + task=None, + model=None, + deployment_name=None, + enable_deepspeed=True, + enable_zero=False, + ds_config=None, + mii_config={}, + deployment_tag=None, + deployments=[], + deployment_type=DeploymentType.LOCAL, + model_path=None, + version=1): + if not deployments: + assert all((model, task, deployment_name)), "model, task, and deployment name must be set to deploy singular model" + deployments = [ + DeploymentConfig(deployment_name=deployment_name, + task=task, + model=model, + enable_deepspeed=enable_deepspeed, + enable_zero=enable_zero, + GPU_index_map=None, + mii_config=mii.config.MIIConfig(**mii_config), + ds_config=ds_config, + version=version, + deployed=False) + ] + """ + deployment_tag = mii.deployment_tag + lb_config = allocate_processes(deployments) + if mii.lb_config is not None: + mii.lb_config.replica_configs.extend(lb_config.replica_configs) + else: + mii.lb_config = lb_config + self.deployments.extend(deployments) + if mii.model_path is None and deployment_type == DeploymentType.LOCAL: + mii.model_path = MII_MODEL_PATH_DEFAULT + elif mii.model_path is None and deployment_type == DeploymentType.AML: + model_path = "model" + create_score_file(deployment_tag=deployment_tag, deployment_type=mii.deployment_type, deployments=self.deployments, model_path=mii.model_path, lb_config=mii.lb_config) + if mii.deployment_type == DeploymentType.Local: + mii.utils.import_score_file(deployment_tag).init() + """ + self.asyncio_loop.run_until_complete(self.add_models_async()) class MIITensorParallelClient(): """ Client to send queries to multiple endpoints in parallel. 
diff --git a/mii/config.py b/mii/config.py index d7a246c0..ea3fbe43 100644 --- a/mii/config.py +++ b/mii/config.py @@ -140,3 +140,4 @@ class DeploymentConfig(BaseModel): mii_config: MIIConfig = MIIConfig.parse_obj({}) ds_config: dict = None version: int = 1 + deployed: bool = False diff --git a/mii/constants.py b/mii/constants.py index 29493433..baffdcf9 100644 --- a/mii/constants.py +++ b/mii/constants.py @@ -99,7 +99,7 @@ class ModelProvider(enum.Enum): ENABLE_DEEPSPEED_ZERO_KEY = 'ds_zero' DEEPSPEED_CONFIG_KEY = 'ds_config' CHECKPOINT_KEY = "checkpoint" - +DEPLOYED_KEY = "deployed" MII_CACHE_PATH = "MII_CACHE_PATH" MII_CACHE_PATH_DEFAULT = "/tmp/mii_cache" @@ -118,6 +118,7 @@ class ModelProvider(enum.Enum): TERMINATE_METHOD = "Terminate" CREATE_SESSION_METHOD = "CreateSession" DESTROY_SESSION_METHOD = "DestroySession" +ADD_DEPLOYMENT_METHOD = "AddDeployment" LB_MAX_WORKER_THREADS = 32 diff --git a/mii/deployment.py b/mii/deployment.py index dc970035..ae539b10 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -68,6 +68,7 @@ def deploy(task=None, If deployment_type is `LOCAL`, returns just the name of the deployment that can be used to create a query handle using `mii.mii_query_handle(deployment_name)` """ + mii.deployment_type = deployment_type if not deployments: assert all((model, task, deployment_name)), "model, task, and deployment name must be set to deploy singular model" deployments = [ @@ -79,12 +80,13 @@ def deploy(task=None, GPU_index_map=None, mii_config=mii.config.MIIConfig(**mii_config), ds_config=ds_config, - version=version) + version=version, + deployed=False) ] deployment_tag = deployment_name else: assert deployment_tag is not None, "deployment_tag must be set to deploy multiple models" - + mii.deployment_tag = deployment_tag # parse and validate mii config for deployment in deployments: mii_config = deployment.mii_config @@ -125,10 +127,10 @@ def deploy(task=None, model_path = MII_MODEL_PATH_DEFAULT elif model_path is None and deployment_type == DeploymentType.AML: model_path = "model" - + + mii.model_path = model_path # add fields for replica deployment replica_configs = [] - port_map = {} port_offset = 1 for deployment in deployments: mii_config = deployment.mii_config @@ -139,16 +141,16 @@ def deploy(task=None, for i, (hostname, gpu_indices) in enumerate(replica_pool): # Reserver port for a LB proxy when replication is enabled - if hostname not in port_map: - port_map[hostname] = set() + if hostname not in mii.port_map: + mii.port_map[hostname] = set() base_port = mii_config.port_number + i * mii_config.tensor_parallel + port_offset - if base_port in port_map[hostname]: - base_port = max(port_map[hostname]) + 1 + if base_port in mii.port_map[hostname]: + base_port = max(mii.port_map[hostname]) + 1 tensor_parallel_ports = list( range(base_port, base_port + mii_config.tensor_parallel)) for i in range(base_port, base_port + mii_config.tensor_parallel): - port_map[hostname].add(i) + mii.port_map[hostname].add(i) torch_dist_port = mii_config.torch_dist_port + i replica_configs.append( ReplicaConfig(task=get_task_name(deployment.task), @@ -166,7 +168,7 @@ def deploy(task=None, deployments=deployments, model_path=model_path, lb_config=lb_config) - + if deployment_type == DeploymentType.AML: _deploy_aml(deployment_tag=deployment_tag, model_name=model, version=version) elif deployment_type == DeploymentType.LOCAL: @@ -186,6 +188,39 @@ def deploy(task=None, else: raise Exception(f"Unknown deployment type: {deployment_type}") +def allocate_processes(deployments): + 
replica_configs = [] + port_offset = 1 + for deployment in deployments: + mii_config = deployment.mii_config + replica_pool = _allocate_processes(mii_config.hostfile, + mii_config.tensor_parallel, + mii_config.replica_num, + deployment.GPU_index_map) + + for i, (hostname, gpu_indices) in enumerate(replica_pool): + # Reserver port for a LB proxy when replication is enabled + if hostname not in mii.port_map: + mii.port_map[hostname] = set() + base_port = mii_config.port_number + i * mii_config.tensor_parallel + port_offset + if base_port in mii.port_map[hostname]: + base_port = max(mii.port_map[hostname]) + 1 + tensor_parallel_ports = list( + range(base_port, + base_port + mii_config.tensor_parallel)) + for i in range(base_port, base_port + mii_config.tensor_parallel): + mii.port_map[hostname].add(i) + torch_dist_port = mii_config.torch_dist_port + i + replica_configs.append( + ReplicaConfig(task=get_task_name(deployment.task), + deployment_name=deployment.deployment_name, + hostname=hostname, + tensor_parallel_ports=tensor_parallel_ports, + torch_dist_port=torch_dist_port, + gpu_indices=gpu_indices)) + lb_config = LoadBalancerConfig(port=mii_config.port_number, + replica_configs=replica_configs) + return lb_config def _deploy_local(deployment_tag, model_path): mii.utils.import_score_file(deployment_tag).init() diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index 94eec2d2..ab55ae32 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -13,7 +13,7 @@ import threading import time -from mii.constants import GRPC_MAX_MSG_SIZE, CREATE_SESSION_METHOD, DESTROY_SESSION_METHOD, TERMINATE_METHOD, LB_MAX_WORKER_THREADS, SERVER_SHUTDOWN_TIMEOUT, Tasks +from mii.constants import GRPC_MAX_MSG_SIZE, ADD_DEPLOYMENT_METHOD, CREATE_SESSION_METHOD, DESTROY_SESSION_METHOD, TERMINATE_METHOD, LB_MAX_WORKER_THREADS, SERVER_SHUTDOWN_TIMEOUT, Tasks from mii.method_table import GRPC_METHOD_TABLE from mii.client import create_channel @@ -32,6 +32,11 @@ def Terminate(self, request, context): def get_stop_event(self): return self._stop_event +class DeploymentManagement(ServiceBase, modelresponse_pb2_grpc.DeploymentManagementServicer): + def AddDeployment(self, request, context): + print("TESTING ADD DEPLOYMENT") + return google_dot_protobuf_dot_empty__pb2.Empty() + class ModelResponse(ServiceBase): """ @@ -142,7 +147,7 @@ def __init__(self, host, ports): self.stubs = [] for port in ports: channel = create_channel(host, port) - stub = modelresponse_pb2_grpc.ModelResponseStub(channel) + stub = modelresponse_pb2_grpc.DeploymentManagementStub(channel) self.stubs.append(stub) self.asyncio_loop = asyncio.get_event_loop() @@ -198,12 +203,19 @@ def choose_stub(self, call_count): def intercept_service(self, continuation, handler_call_details): next_handler = continuation(handler_call_details) + print(next_handler) assert next_handler.unary_unary is not None #USE KWARGS LIKE THEY ARE USED TO MAKE SESSIONS TO GET THE DEPLOYMENT NAME TO HASH THE COUNTERS/STUBS def invoke_intercept_method(request_proto, context): method_name = _get_grpc_method_name(handler_call_details.method) + if method_name == ADD_DEPLOYMENT_METHOD: + for name in self.stubs: + for stub in self.stubs[name]: + stub.invoke(ADD_DEPLOYMENT_METHOD, request_proto) + return google_dot_protobuf_dot_empty__pb2.Empty() + if method_name == TERMINATE_METHOD: for deployment in self.stubs: for stub in self.stubs[deployment]: @@ -290,7 +302,7 @@ def _do_serve(service_impl, port, 
interceptors=[]): GRPC_MAX_MSG_SIZE), ('grpc.max_receive_message_length', GRPC_MAX_MSG_SIZE)]) - modelresponse_pb2_grpc.add_ModelResponseServicer_to_server(service_impl, server) + modelresponse_pb2_grpc.add_DeploymentManagementServicer_to_server(service_impl, server) server.add_insecure_port(f'[::]:{port}') print(f"About to start server") server.start() @@ -300,11 +312,11 @@ def _do_serve(service_impl, port, interceptors=[]): def serve_inference(inference_pipeline, port): - _do_serve(ModelResponse(inference_pipeline), port) + _do_serve(DeploymentManagement(), port) def serve_load_balancing(lb_config): - _do_serve(ServiceBase(), + _do_serve(DeploymentManagement(), lb_config.port, [LoadBalancingInterceptor(lb_config.replica_configs)]) diff --git a/mii/grpc_related/proto/modelresponse.proto b/mii/grpc_related/proto/modelresponse.proto index ce55522b..c622074e 100644 --- a/mii/grpc_related/proto/modelresponse.proto +++ b/mii/grpc_related/proto/modelresponse.proto @@ -36,6 +36,10 @@ service ModelResponse { rpc Txt2ImgReply(MultiStringRequest) returns (ImageReply) {} } +service DeploymentManagement { + rpc AddDeployment(google.protobuf.Empty) returns (google.protobuf.Empty) {} +} + message Value { oneof oneof_values { string svalue = 1; diff --git a/mii/grpc_related/proto/modelresponse_pb2.py b/mii/grpc_related/proto/modelresponse_pb2.py index 7802b7f2..515ebb80 100644 --- a/mii/grpc_related/proto/modelresponse_pb2.py +++ b/mii/grpc_related/proto/modelresponse_pb2.py @@ -1,8 +1,4 @@ -# Copyright (c) Microsoft Corporation. -# SPDX-License-Identifier: Apache-2.0 - -# DeepSpeed Team - +# -*- coding: utf-8 -*- # Generated by the protocol buffer compiler. DO NOT EDIT! # source: modelresponse.proto """Generated protocol buffer code.""" @@ -14,54 +10,56 @@ _sym_db = _symbol_database.Default() + from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x42\x0e\n\x0coneof_values\"\x1f\n\tSessionID\x12\x12\n\nsession_id\x18\x01 \x01(\t\"\xed\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\x85\x01\n\x11SingleStringReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x84\x01\n\x10MultiStringReply\x12\x10\n\x08response\x18\x01 \x03(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 
\x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\tQARequest\x12\x10\n\x08question\x18\x01 \x01(\t\x12\x0f\n\x07\x63ontext\x18\x02 \x01(\t\x12?\n\x0cquery_kwargs\x18\x03 \x03(\x0b\x32).modelresponse.QARequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xd3\x02\n\x13\x43onversationRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x1c\n\x0f\x63onversation_id\x18\x02 \x01(\x03H\x00\x88\x01\x01\x12\x18\n\x10past_user_inputs\x18\x03 \x03(\t\x12\x1b\n\x13generated_responses\x18\x04 \x03(\t\x12I\n\x0cquery_kwargs\x18\x05 \x03(\x0b\x32\x33.modelresponse.ConversationRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x01\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_conversation_idB\x12\n\x10_deployment_name\"\xc3\x01\n\x11\x43onversationReply\x12\x17\n\x0f\x63onversation_id\x18\x01 \x01(\x03\x12\x18\n\x10past_user_inputs\x18\x02 \x03(\t\x12\x1b\n\x13generated_responses\x18\x03 \x03(\t\x12\x12\n\ntime_taken\x18\x04 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x05 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xaf\x01\n\nImageReply\x12\x0e\n\x06images\x18\x01 \x03(\x0c\x12\x1d\n\x15nsfw_content_detected\x18\x02 \x03(\x08\x12\x0c\n\x04mode\x18\x03 \x01(\t\x12\x0e\n\x06size_w\x18\x04 \x01(\x03\x12\x0e\n\x06size_h\x18\x05 \x01(\x03\x12\x12\n\ntime_taken\x18\x06 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x07 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name2\xd4\x06\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x62\x06proto3' -) + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x42\x0e\n\x0coneof_values\"\x1f\n\tSessionID\x12\x12\n\nsession_id\x18\x01 \x01(\t\"\xed\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 
\x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\x85\x01\n\x11SingleStringReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x84\x01\n\x10MultiStringReply\x12\x10\n\x08response\x18\x01 \x03(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\tQARequest\x12\x10\n\x08question\x18\x01 \x01(\t\x12\x0f\n\x07\x63ontext\x18\x02 \x01(\t\x12?\n\x0cquery_kwargs\x18\x03 \x03(\x0b\x32).modelresponse.QARequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xd3\x02\n\x13\x43onversationRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x1c\n\x0f\x63onversation_id\x18\x02 \x01(\x03H\x00\x88\x01\x01\x12\x18\n\x10past_user_inputs\x18\x03 \x03(\t\x12\x1b\n\x13generated_responses\x18\x04 \x03(\t\x12I\n\x0cquery_kwargs\x18\x05 \x03(\x0b\x32\x33.modelresponse.ConversationRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x01\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_conversation_idB\x12\n\x10_deployment_name\"\xc3\x01\n\x11\x43onversationReply\x12\x17\n\x0f\x63onversation_id\x18\x01 \x01(\x03\x12\x18\n\x10past_user_inputs\x18\x02 \x03(\t\x12\x1b\n\x13generated_responses\x18\x03 \x03(\t\x12\x12\n\ntime_taken\x18\x04 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x05 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xaf\x01\n\nImageReply\x12\x0e\n\x06images\x18\x01 \x03(\x0c\x12\x1d\n\x15nsfw_content_detected\x18\x02 \x03(\x08\x12\x0c\n\x04mode\x18\x03 \x01(\t\x12\x0e\n\x06size_w\x18\x04 \x01(\x03\x12\x0e\n\x06size_h\x18\x05 \x01(\x03\x12\x12\n\ntime_taken\x18\x06 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x07 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name2\xd4\x06\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a 
.modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x32Y\n\x14\x44\x65ploymentManagement\x12\x41\n\rAddDeployment\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x62\x06proto3') _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'modelresponse_pb2', _globals) if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None - _SINGLESTRINGREQUEST_QUERYKWARGSENTRY._options = None - _SINGLESTRINGREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' - _MULTISTRINGREQUEST_QUERYKWARGSENTRY._options = None - _MULTISTRINGREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' - _QAREQUEST_QUERYKWARGSENTRY._options = None - _QAREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' - _CONVERSATIONREQUEST_QUERYKWARGSENTRY._options = None - _CONVERSATIONREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' - _globals['_VALUE']._serialized_start = 67 - _globals['_VALUE']._serialized_end = 162 - _globals['_SESSIONID']._serialized_start = 164 - _globals['_SESSIONID']._serialized_end = 195 - _globals['_SINGLESTRINGREQUEST']._serialized_start = 198 - _globals['_SINGLESTRINGREQUEST']._serialized_end = 435 - _globals['_SINGLESTRINGREQUEST_QUERYKWARGSENTRY']._serialized_start = 343 - _globals['_SINGLESTRINGREQUEST_QUERYKWARGSENTRY']._serialized_end = 415 - _globals['_MULTISTRINGREQUEST']._serialized_start = 438 - _globals['_MULTISTRINGREQUEST']._serialized_end = 673 - _globals['_MULTISTRINGREQUEST_QUERYKWARGSENTRY']._serialized_start = 343 - _globals['_MULTISTRINGREQUEST_QUERYKWARGSENTRY']._serialized_end = 415 - _globals['_SINGLESTRINGREPLY']._serialized_start = 676 - _globals['_SINGLESTRINGREPLY']._serialized_end = 809 - _globals['_MULTISTRINGREPLY']._serialized_start = 812 - _globals['_MULTISTRINGREPLY']._serialized_end = 944 - _globals['_QAREQUEST']._serialized_start = 947 - _globals['_QAREQUEST']._serialized_end = 1182 - _globals['_QAREQUEST_QUERYKWARGSENTRY']._serialized_start = 343 - _globals['_QAREQUEST_QUERYKWARGSENTRY']._serialized_end = 415 - _globals['_CONVERSATIONREQUEST']._serialized_start = 1185 - _globals['_CONVERSATIONREQUEST']._serialized_end = 1524 - _globals['_CONVERSATIONREQUEST_QUERYKWARGSENTRY']._serialized_start = 343 - _globals['_CONVERSATIONREQUEST_QUERYKWARGSENTRY']._serialized_end = 415 - _globals['_CONVERSATIONREPLY']._serialized_start = 1527 - _globals['_CONVERSATIONREPLY']._serialized_end = 1722 - _globals['_IMAGEREPLY']._serialized_start = 1725 - _globals['_IMAGEREPLY']._serialized_end = 1900 - _globals['_MODELRESPONSE']._serialized_start = 1903 - _globals['_MODELRESPONSE']._serialized_end = 2755 + DESCRIPTOR._options = None + _SINGLESTRINGREQUEST_QUERYKWARGSENTRY._options = None + _SINGLESTRINGREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' + _MULTISTRINGREQUEST_QUERYKWARGSENTRY._options = None + _MULTISTRINGREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' + _QAREQUEST_QUERYKWARGSENTRY._options = None + 
_QAREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' + _CONVERSATIONREQUEST_QUERYKWARGSENTRY._options = None + _CONVERSATIONREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' + _globals['_VALUE']._serialized_start=67 + _globals['_VALUE']._serialized_end=162 + _globals['_SESSIONID']._serialized_start=164 + _globals['_SESSIONID']._serialized_end=195 + _globals['_SINGLESTRINGREQUEST']._serialized_start=198 + _globals['_SINGLESTRINGREQUEST']._serialized_end=435 + _globals['_SINGLESTRINGREQUEST_QUERYKWARGSENTRY']._serialized_start=343 + _globals['_SINGLESTRINGREQUEST_QUERYKWARGSENTRY']._serialized_end=415 + _globals['_MULTISTRINGREQUEST']._serialized_start=438 + _globals['_MULTISTRINGREQUEST']._serialized_end=673 + _globals['_MULTISTRINGREQUEST_QUERYKWARGSENTRY']._serialized_start=343 + _globals['_MULTISTRINGREQUEST_QUERYKWARGSENTRY']._serialized_end=415 + _globals['_SINGLESTRINGREPLY']._serialized_start=676 + _globals['_SINGLESTRINGREPLY']._serialized_end=809 + _globals['_MULTISTRINGREPLY']._serialized_start=812 + _globals['_MULTISTRINGREPLY']._serialized_end=944 + _globals['_QAREQUEST']._serialized_start=947 + _globals['_QAREQUEST']._serialized_end=1182 + _globals['_QAREQUEST_QUERYKWARGSENTRY']._serialized_start=343 + _globals['_QAREQUEST_QUERYKWARGSENTRY']._serialized_end=415 + _globals['_CONVERSATIONREQUEST']._serialized_start=1185 + _globals['_CONVERSATIONREQUEST']._serialized_end=1524 + _globals['_CONVERSATIONREQUEST_QUERYKWARGSENTRY']._serialized_start=343 + _globals['_CONVERSATIONREQUEST_QUERYKWARGSENTRY']._serialized_end=415 + _globals['_CONVERSATIONREPLY']._serialized_start=1527 + _globals['_CONVERSATIONREPLY']._serialized_end=1722 + _globals['_IMAGEREPLY']._serialized_start=1725 + _globals['_IMAGEREPLY']._serialized_end=1900 + _globals['_MODELRESPONSE']._serialized_start=1903 + _globals['_MODELRESPONSE']._serialized_end=2755 + _globals['_DEPLOYMENTMANAGEMENT']._serialized_start=2757 + _globals['_DEPLOYMENTMANAGEMENT']._serialized_end=2846 # @@protoc_insertion_point(module_scope) diff --git a/mii/grpc_related/proto/modelresponse_pb2_grpc.py b/mii/grpc_related/proto/modelresponse_pb2_grpc.py index 95cfa825..438fa0c2 100644 --- a/mii/grpc_related/proto/modelresponse_pb2_grpc.py +++ b/mii/grpc_related/proto/modelresponse_pb2_grpc.py @@ -1,8 +1,3 @@ -# Copyright (c) Microsoft Corporation. -# SPDX-License-Identifier: Apache-2.0 - -# DeepSpeed Team - # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! """Client and server classes corresponding to protobuf-defined services.""" import grpc @@ -13,6 +8,7 @@ class ModelResponseStub(object): """Missing associated documentation comment in .proto file.""" + def __init__(self, channel): """Constructor. @@ -20,60 +16,60 @@ def __init__(self, channel): channel: A grpc.Channel. """ self.Terminate = channel.unary_unary( - '/modelresponse.ModelResponse/Terminate', - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty. 
- SerializeToString, - response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - ) + '/modelresponse.ModelResponse/Terminate', + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) self.CreateSession = channel.unary_unary( - '/modelresponse.ModelResponse/CreateSession', - request_serializer=modelresponse__pb2.SessionID.SerializeToString, - response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - ) + '/modelresponse.ModelResponse/CreateSession', + request_serializer=modelresponse__pb2.SessionID.SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) self.DestroySession = channel.unary_unary( - '/modelresponse.ModelResponse/DestroySession', - request_serializer=modelresponse__pb2.SessionID.SerializeToString, - response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - ) + '/modelresponse.ModelResponse/DestroySession', + request_serializer=modelresponse__pb2.SessionID.SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) self.GeneratorReply = channel.unary_unary( - '/modelresponse.ModelResponse/GeneratorReply', - request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.MultiStringReply.FromString, - ) + '/modelresponse.ModelResponse/GeneratorReply', + request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.MultiStringReply.FromString, + ) self.ClassificationReply = channel.unary_unary( - '/modelresponse.ModelResponse/ClassificationReply', - request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) + '/modelresponse.ModelResponse/ClassificationReply', + request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) self.QuestionAndAnswerReply = channel.unary_unary( - '/modelresponse.ModelResponse/QuestionAndAnswerReply', - request_serializer=modelresponse__pb2.QARequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) + '/modelresponse.ModelResponse/QuestionAndAnswerReply', + request_serializer=modelresponse__pb2.QARequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) self.FillMaskReply = channel.unary_unary( - '/modelresponse.ModelResponse/FillMaskReply', - request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) + '/modelresponse.ModelResponse/FillMaskReply', + request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) self.TokenClassificationReply = channel.unary_unary( - '/modelresponse.ModelResponse/TokenClassificationReply', - request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) + '/modelresponse.ModelResponse/TokenClassificationReply', + request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) self.ConversationalReply = channel.unary_unary( - 
'/modelresponse.ModelResponse/ConversationalReply', - request_serializer=modelresponse__pb2.ConversationRequest.SerializeToString, - response_deserializer=modelresponse__pb2.ConversationReply.FromString, - ) + '/modelresponse.ModelResponse/ConversationalReply', + request_serializer=modelresponse__pb2.ConversationRequest.SerializeToString, + response_deserializer=modelresponse__pb2.ConversationReply.FromString, + ) self.Txt2ImgReply = channel.unary_unary( - '/modelresponse.ModelResponse/Txt2ImgReply', - request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.ImageReply.FromString, - ) + '/modelresponse.ModelResponse/Txt2ImgReply', + request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.ImageReply.FromString, + ) class ModelResponseServicer(object): """Missing associated documentation comment in .proto file.""" + def Terminate(self, request, context): """Missing associated documentation comment in .proto file.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) @@ -137,334 +133,293 @@ def Txt2ImgReply(self, request, context): def add_ModelResponseServicer_to_server(servicer, server): rpc_method_handlers = { - 'Terminate': - grpc.unary_unary_rpc_method_handler( - servicer.Terminate, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. - SerializeToString, - ), - 'CreateSession': - grpc.unary_unary_rpc_method_handler( - servicer.CreateSession, - request_deserializer=modelresponse__pb2.SessionID.FromString, - response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. - SerializeToString, - ), - 'DestroySession': - grpc.unary_unary_rpc_method_handler( - servicer.DestroySession, - request_deserializer=modelresponse__pb2.SessionID.FromString, - response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. 
- SerializeToString, - ), - 'GeneratorReply': - grpc.unary_unary_rpc_method_handler( - servicer.GeneratorReply, - request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, - response_serializer=modelresponse__pb2.MultiStringReply.SerializeToString, - ), - 'ClassificationReply': - grpc.unary_unary_rpc_method_handler( - servicer.ClassificationReply, - request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'QuestionAndAnswerReply': - grpc.unary_unary_rpc_method_handler( - servicer.QuestionAndAnswerReply, - request_deserializer=modelresponse__pb2.QARequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'FillMaskReply': - grpc.unary_unary_rpc_method_handler( - servicer.FillMaskReply, - request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'TokenClassificationReply': - grpc.unary_unary_rpc_method_handler( - servicer.TokenClassificationReply, - request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'ConversationalReply': - grpc.unary_unary_rpc_method_handler( - servicer.ConversationalReply, - request_deserializer=modelresponse__pb2.ConversationRequest.FromString, - response_serializer=modelresponse__pb2.ConversationReply.SerializeToString, - ), - 'Txt2ImgReply': - grpc.unary_unary_rpc_method_handler( - servicer.Txt2ImgReply, - request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, - response_serializer=modelresponse__pb2.ImageReply.SerializeToString, - ), + 'Terminate': grpc.unary_unary_rpc_method_handler( + servicer.Terminate, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + ), + 'CreateSession': grpc.unary_unary_rpc_method_handler( + servicer.CreateSession, + request_deserializer=modelresponse__pb2.SessionID.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + ), + 'DestroySession': grpc.unary_unary_rpc_method_handler( + servicer.DestroySession, + request_deserializer=modelresponse__pb2.SessionID.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + ), + 'GeneratorReply': grpc.unary_unary_rpc_method_handler( + servicer.GeneratorReply, + request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, + response_serializer=modelresponse__pb2.MultiStringReply.SerializeToString, + ), + 'ClassificationReply': grpc.unary_unary_rpc_method_handler( + servicer.ClassificationReply, + request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'QuestionAndAnswerReply': grpc.unary_unary_rpc_method_handler( + servicer.QuestionAndAnswerReply, + request_deserializer=modelresponse__pb2.QARequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'FillMaskReply': grpc.unary_unary_rpc_method_handler( + servicer.FillMaskReply, + request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'TokenClassificationReply': grpc.unary_unary_rpc_method_handler( + servicer.TokenClassificationReply, + 
request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'ConversationalReply': grpc.unary_unary_rpc_method_handler( + servicer.ConversationalReply, + request_deserializer=modelresponse__pb2.ConversationRequest.FromString, + response_serializer=modelresponse__pb2.ConversationReply.SerializeToString, + ), + 'Txt2ImgReply': grpc.unary_unary_rpc_method_handler( + servicer.Txt2ImgReply, + request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, + response_serializer=modelresponse__pb2.ImageReply.SerializeToString, + ), } - generic_handler = grpc.method_handlers_generic_handler('modelresponse.ModelResponse', - rpc_method_handlers) - server.add_generic_rpc_handlers((generic_handler, )) + generic_handler = grpc.method_handlers_generic_handler( + 'modelresponse.ModelResponse', rpc_method_handlers) + server.add_generic_rpc_handlers((generic_handler,)) -# This class is part of an EXPERIMENTAL API. + # This class is part of an EXPERIMENTAL API. class ModelResponse(object): """Missing associated documentation comment in .proto file.""" + @staticmethod def Terminate(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, target, - '/modelresponse.ModelResponse/Terminate', + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/Terminate', google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, google_dot_protobuf_dot_empty__pb2.Empty.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod def CreateSession(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, target, - '/modelresponse.ModelResponse/CreateSession', + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/CreateSession', modelresponse__pb2.SessionID.SerializeToString, google_dot_protobuf_dot_empty__pb2.Empty.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod def DestroySession(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, target, - '/modelresponse.ModelResponse/DestroySession', + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, 
'/modelresponse.ModelResponse/DestroySession', modelresponse__pb2.SessionID.SerializeToString, google_dot_protobuf_dot_empty__pb2.Empty.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod def GeneratorReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, target, - '/modelresponse.ModelResponse/GeneratorReply', + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/GeneratorReply', modelresponse__pb2.MultiStringRequest.SerializeToString, modelresponse__pb2.MultiStringReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod def ClassificationReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, target, - '/modelresponse.ModelResponse/ClassificationReply', + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/ClassificationReply', modelresponse__pb2.SingleStringRequest.SerializeToString, modelresponse__pb2.SingleStringReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod def QuestionAndAnswerReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, target, - '/modelresponse.ModelResponse/QuestionAndAnswerReply', + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/QuestionAndAnswerReply', modelresponse__pb2.QARequest.SerializeToString, modelresponse__pb2.SingleStringReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod def FillMaskReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, target, - '/modelresponse.ModelResponse/FillMaskReply', + options=(), + channel_credentials=None, + 
call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/FillMaskReply', modelresponse__pb2.SingleStringRequest.SerializeToString, modelresponse__pb2.SingleStringReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod def TokenClassificationReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, target, - '/modelresponse.ModelResponse/TokenClassificationReply', + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/TokenClassificationReply', modelresponse__pb2.SingleStringRequest.SerializeToString, modelresponse__pb2.SingleStringReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod def ConversationalReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, target, - '/modelresponse.ModelResponse/ConversationalReply', + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/ConversationalReply', modelresponse__pb2.ConversationRequest.SerializeToString, modelresponse__pb2.ConversationReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod def Txt2ImgReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, target, - '/modelresponse.ModelResponse/Txt2ImgReply', + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/Txt2ImgReply', modelresponse__pb2.MultiStringRequest.SerializeToString, modelresponse__pb2.ImageReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + + +class DeploymentManagementStub(object): + """Missing associated documentation comment in .proto file.""" + + def __init__(self, channel): + """Constructor. 
+ + Args: + channel: A grpc.Channel. + """ + self.AddDeployment = channel.unary_unary( + '/modelresponse.DeploymentManagement/AddDeployment', + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) + + +class DeploymentManagementServicer(object): + """Missing associated documentation comment in .proto file.""" + + def AddDeployment(self, request, context): + """Missing associated documentation comment in .proto file.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + +def add_DeploymentManagementServicer_to_server(servicer, server): + rpc_method_handlers = { + 'AddDeployment': grpc.unary_unary_rpc_method_handler( + servicer.AddDeployment, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + ), + } + generic_handler = grpc.method_handlers_generic_handler( + 'modelresponse.DeploymentManagement', rpc_method_handlers) + server.add_generic_rpc_handlers((generic_handler,)) + + + # This class is part of an EXPERIMENTAL API. +class DeploymentManagement(object): + """Missing associated documentation comment in .proto file.""" + + @staticmethod + def AddDeployment(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/AddDeployment', + google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + google_dot_protobuf_dot_empty__pb2.Empty.FromString, + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) From c2636b7d36f50d270e21f7f8eb867abb3457c06c Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Fri, 21 Jul 2023 01:02:31 +0000 Subject: [PATCH 45/69] More partial deploy updates --- mii/client.py | 25 +++++++++++++++++---- mii/grpc_related/proto/modelresponse.proto | 10 +++++++++ mii/grpc_related/proto/modelresponse_pb2.py | 12 +++++----- mii/models/score/generate.py | 1 + mii/models/score/score_template.py | 10 +++++++-- mii/server.py | 12 ++++++---- 6 files changed, 55 insertions(+), 15 deletions(-) diff --git a/mii/client.py b/mii/client.py index f76105ed..14dc7273 100644 --- a/mii/client.py +++ b/mii/client.py @@ -43,8 +43,10 @@ def mii_query_handle(deployment_tag): return MIINonPersistentClient(task, deployment_tag) deployments = _get_deployment_configs(deployment_tag) - mii_configs_dict = deployments[0][mii.constants.MII_CONFIGS_KEY] - mii_configs = mii.config.MIIConfig(**mii_configs_dict) + if len(deployments) > 0: + mii_configs_dict = deployments[0][mii.constants.MII_CONFIGS_KEY] + mii_configs = mii.config.MIIConfig(**mii_configs_dict) + return MIIClient(deployments, "localhost", mii_configs.port_number) @@ -160,7 +162,7 @@ def add_models(self, deployed=False) ] - """ + deployment_tag = mii.deployment_tag lb_config = allocate_processes(deployments) if mii.lb_config is not None: @@ -172,10 +174,25 @@ def add_models(self, mii.model_path = MII_MODEL_PATH_DEFAULT elif mii.model_path is None and deployment_type == DeploymentType.AML: model_path = "model" + deps = [] + for deployment in self.deployments: + data = { + 'deployment_name': deployment[mii.constants.DEPLOYMENT_NAME_KEY], + 'task': 
deployment[mii.constants.TASK_NAME_KEY], + 'model': deployment[mii.constants.MODEL_NAME_KEY], + 'enable_deepspeed': deployment[mii.constants.ENABLE_DEEPSPEED_KEY], + 'enable_zero': deployment[mii.constants.ENABLE_DEEPSPEED_ZERO_KEY], + 'GPU_index_map': None, + 'mii_config': deployment[mii.constants.MII_CONFIGS_KEY], + 'ds_config': deployment[mii.constants.DEEPSPEED_CONFIG_KEY], + 'version': 1 + 'deployed' deployment[mii.constants.DEPLOYED_KEY] + } + create_score_file(deployment_tag=deployment_tag, deployment_type=mii.deployment_type, deployments=self.deployments, model_path=mii.model_path, lb_config=mii.lb_config) if mii.deployment_type == DeploymentType.Local: mii.utils.import_score_file(deployment_tag).init() - """ + self.asyncio_loop.run_until_complete(self.add_models_async()) class MIITensorParallelClient(): """ diff --git a/mii/grpc_related/proto/modelresponse.proto b/mii/grpc_related/proto/modelresponse.proto index c622074e..7e4d3520 100644 --- a/mii/grpc_related/proto/modelresponse.proto +++ b/mii/grpc_related/proto/modelresponse.proto @@ -113,3 +113,13 @@ message ImageReply { float time_taken = 6; optional string deployment_name = 7; } + +message AddDeployRequest { + string task = 1; + string deployment_name = 2; + string hostname = 3; + repeated int64 tensor_parallel_ports = 4; + int64 torch_dist_port = 5; + repeated int64 gpu_indices = 6; + +} diff --git a/mii/grpc_related/proto/modelresponse_pb2.py b/mii/grpc_related/proto/modelresponse_pb2.py index 515ebb80..1fc27665 100644 --- a/mii/grpc_related/proto/modelresponse_pb2.py +++ b/mii/grpc_related/proto/modelresponse_pb2.py @@ -14,7 +14,7 @@ from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x42\x0e\n\x0coneof_values\"\x1f\n\tSessionID\x12\x12\n\nsession_id\x18\x01 \x01(\t\"\xed\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\x85\x01\n\x11SingleStringReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x84\x01\n\x10MultiStringReply\x12\x10\n\x08response\x18\x01 \x03(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\tQARequest\x12\x10\n\x08question\x18\x01 \x01(\t\x12\x0f\n\x07\x63ontext\x18\x02 
\x01(\t\x12?\n\x0cquery_kwargs\x18\x03 \x03(\x0b\x32).modelresponse.QARequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xd3\x02\n\x13\x43onversationRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x1c\n\x0f\x63onversation_id\x18\x02 \x01(\x03H\x00\x88\x01\x01\x12\x18\n\x10past_user_inputs\x18\x03 \x03(\t\x12\x1b\n\x13generated_responses\x18\x04 \x03(\t\x12I\n\x0cquery_kwargs\x18\x05 \x03(\x0b\x32\x33.modelresponse.ConversationRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x01\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_conversation_idB\x12\n\x10_deployment_name\"\xc3\x01\n\x11\x43onversationReply\x12\x17\n\x0f\x63onversation_id\x18\x01 \x01(\x03\x12\x18\n\x10past_user_inputs\x18\x02 \x03(\t\x12\x1b\n\x13generated_responses\x18\x03 \x03(\t\x12\x12\n\ntime_taken\x18\x04 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x05 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xaf\x01\n\nImageReply\x12\x0e\n\x06images\x18\x01 \x03(\x0c\x12\x1d\n\x15nsfw_content_detected\x18\x02 \x03(\x08\x12\x0c\n\x04mode\x18\x03 \x01(\t\x12\x0e\n\x06size_w\x18\x04 \x01(\x03\x12\x0e\n\x06size_h\x18\x05 \x01(\x03\x12\x12\n\ntime_taken\x18\x06 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x07 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name2\xd4\x06\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x32Y\n\x14\x44\x65ploymentManagement\x12\x41\n\rAddDeployment\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x62\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x42\x0e\n\x0coneof_values\"\x1f\n\tSessionID\x12\x12\n\nsession_id\x18\x01 \x01(\t\"\xed\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 
\x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\x85\x01\n\x11SingleStringReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x84\x01\n\x10MultiStringReply\x12\x10\n\x08response\x18\x01 \x03(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\tQARequest\x12\x10\n\x08question\x18\x01 \x01(\t\x12\x0f\n\x07\x63ontext\x18\x02 \x01(\t\x12?\n\x0cquery_kwargs\x18\x03 \x03(\x0b\x32).modelresponse.QARequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xd3\x02\n\x13\x43onversationRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x1c\n\x0f\x63onversation_id\x18\x02 \x01(\x03H\x00\x88\x01\x01\x12\x18\n\x10past_user_inputs\x18\x03 \x03(\t\x12\x1b\n\x13generated_responses\x18\x04 \x03(\t\x12I\n\x0cquery_kwargs\x18\x05 \x03(\x0b\x32\x33.modelresponse.ConversationRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x01\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_conversation_idB\x12\n\x10_deployment_name\"\xc3\x01\n\x11\x43onversationReply\x12\x17\n\x0f\x63onversation_id\x18\x01 \x01(\x03\x12\x18\n\x10past_user_inputs\x18\x02 \x03(\t\x12\x1b\n\x13generated_responses\x18\x03 \x03(\t\x12\x12\n\ntime_taken\x18\x04 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x05 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xaf\x01\n\nImageReply\x12\x0e\n\x06images\x18\x01 \x03(\x0c\x12\x1d\n\x15nsfw_content_detected\x18\x02 \x03(\x08\x12\x0c\n\x04mode\x18\x03 \x01(\t\x12\x0e\n\x06size_w\x18\x04 \x01(\x03\x12\x0e\n\x06size_h\x18\x05 \x01(\x03\x12\x12\n\ntime_taken\x18\x06 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x07 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x98\x01\n\x10\x41\x64\x64\x44\x65ployRequest\x12\x0c\n\x04task\x18\x01 \x01(\t\x12\x17\n\x0f\x64\x65ployment_name\x18\x02 \x01(\t\x12\x10\n\x08hostname\x18\x03 \x01(\t\x12\x1d\n\x15tensor_parallel_ports\x18\x04 \x03(\x03\x12\x17\n\x0ftorch_dist_port\x18\x05 \x01(\x03\x12\x13\n\x0bgpu_indices\x18\x06 
\x03(\x03\x32\xd4\x06\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x32Y\n\x14\x44\x65ploymentManagement\x12\x41\n\rAddDeployment\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x62\x06proto3') _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) @@ -58,8 +58,10 @@ _globals['_CONVERSATIONREPLY']._serialized_end=1722 _globals['_IMAGEREPLY']._serialized_start=1725 _globals['_IMAGEREPLY']._serialized_end=1900 - _globals['_MODELRESPONSE']._serialized_start=1903 - _globals['_MODELRESPONSE']._serialized_end=2755 - _globals['_DEPLOYMENTMANAGEMENT']._serialized_start=2757 - _globals['_DEPLOYMENTMANAGEMENT']._serialized_end=2846 + _globals['_ADDDEPLOYREQUEST']._serialized_start=1903 + _globals['_ADDDEPLOYREQUEST']._serialized_end=2055 + _globals['_MODELRESPONSE']._serialized_start=2058 + _globals['_MODELRESPONSE']._serialized_end=2910 + _globals['_DEPLOYMENTMANAGEMENT']._serialized_start=2912 + _globals['_DEPLOYMENTMANAGEMENT']._serialized_end=3001 # @@protoc_insertion_point(module_scope) diff --git a/mii/models/score/generate.py b/mii/models/score/generate.py index dc73fdb9..50f0446f 100644 --- a/mii/models/score/generate.py +++ b/mii/models/score/generate.py @@ -27,6 +27,7 @@ def create_score_file(deployment_tag, mii.constants.MII_CONFIGS_KEY: deployment.mii_config.dict(), mii.constants.ENABLE_DEEPSPEED_ZERO_KEY: deployment.enable_zero, mii.constants.DEEPSPEED_CONFIG_KEY: deployment.ds_config, + mii.constants.DEPLOYED_KEY: deployment.deployed, } config_dict[deployment.deployment_name] = deployment_config diff --git a/mii/models/score/score_template.py b/mii/models/score/score_template.py index 83b46de3..84238511 100644 --- a/mii/models/score/score_template.py +++ b/mii/models/score/score_template.py @@ -18,7 +18,11 @@ def init(): model_path = mii.utils.full_model_path(configs[mii.constants.MODEL_PATH_KEY]) deployment_tag = configs[mii.constants.DEPLOYMENT_TAG_KEY] deployments = [] + lb_enabled = False for deployment in configs.values(): + if isinstance(deployment, dict) and deployment[mii.constants.DEPLOYED_KEY]: + lb_enabled = True + continue if not isinstance(deployment, dict): continue data = { @@ -42,11 +46,13 @@ def init(): assert task_name is not None, "The task name should be set before calling init" """ - mii.MIIServer(deployment_tag, + if len(deployments) > 0: + mii.MIIServer(deployment_tag, deployments, model_path, lb_config=configs.get(mii.constants.LOAD_BALANCER_CONFIG_KEY, - None)) + None), + lb_enabled=lb_enabled) global model 
model = None diff --git a/mii/server.py b/mii/server.py index ceaf2912..201c1a37 100644 --- a/mii/server.py +++ b/mii/server.py @@ -28,10 +28,10 @@ def config_to_b64_str(config): class MIIServer(): '''Initialize the model, setup the server for the model under model_path''' - def __init__(self, deployment_tag, deployments, model_path, lb_config=None): + def __init__(self, deployment_tag, deployments, model_path, lb_config=None, lb_enabled=False): #mii_configs = mii.config.MIIConfig(**mii_configs) - + self.lb_enabled = lb_enabled #self.task = mii.utils.get_task(task_name) self.deployments = deployments for deployment in deployments: @@ -48,7 +48,8 @@ def __init__(self, deployment_tag, deployments, model_path, lb_config=None): processes = self._initialize_service(deployment_tag, deployments, model_path, - lb_config) + lb_config, + name_map) self._wait_until_server_is_live(processes, lb_config.replica_configs) def _wait_until_server_is_live(self, processes, deployment): @@ -282,6 +283,8 @@ def _initialize_service(self, deployment_tag, deployments, model_path, lb_config for dep in deployments: if dep.deployment_name == name: deployment = dep + if deployment is None: + continue hostfile = tempfile.NamedTemporaryFile(delete=False) hostfile.write( f'{repl_config.hostname} slots={max(host_gpus[repl_config.hostname])+1}\n' @@ -306,7 +309,8 @@ def _initialize_service(self, deployment_tag, deployments, model_path, lb_config # we don't use deepspeed launcher for the load balancer because it does not need a GPU. # The deepspeed launcher determines the number of processes to launch based on GPUs available on the host or CUDA_VISIBLE_DEVICES, # and it is expected to assign one GPU to one process. - processes.append(self._launch_load_balancer(model_path, lb_config)) + if not self.lb_enabled: + processes.append(self._launch_load_balancer(model_path, lb_config)) for deployment in self.deployments: if deployment.mii_config.enable_restful_api: From 189e75ce41883b5b9b51b3c712b52c0397b36f75 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Fri, 21 Jul 2023 21:03:04 +0000 Subject: [PATCH 46/69] Partial deploy started --- mii/__init__.py | 5 - mii/client.py | 92 ++++++++++++------- mii/constants.py | 1 + mii/deployment.py | 27 +++--- mii/grpc_related/modelresponse_server.py | 14 ++- mii/grpc_related/proto/modelresponse.proto | 2 +- mii/grpc_related/proto/modelresponse_pb2.py | 4 +- .../proto/modelresponse_pb2_grpc.py | 6 +- mii/models/score/generate.py | 2 + mii/models/score/score_template.py | 2 + mii/server.py | 2 +- 11 files changed, 93 insertions(+), 64 deletions(-) diff --git a/mii/__init__.py b/mii/__init__.py index aba912f3..66748a56 100644 --- a/mii/__init__.py +++ b/mii/__init__.py @@ -15,11 +15,6 @@ __version__ = "0.0.0" non_persistent_models = {} -port_map = {} -deployment_tag = "" -lb_config = None -model_path = None -deployment_type = None try: from .version import __version__ except ImportError: diff --git a/mii/client.py b/mii/client.py index 14dc7273..9da14d4f 100644 --- a/mii/client.py +++ b/mii/client.py @@ -6,6 +6,7 @@ import grpc import requests import mii +import time from mii.utils import get_task from mii.grpc_related.proto import modelresponse_pb2, modelresponse_pb2_grpc from mii.constants import GRPC_MAX_MSG_SIZE, Tasks, DeploymentType @@ -18,11 +19,14 @@ def _get_deployment_configs(deployment_tag): deployments = [] configs = mii.utils.import_score_file(deployment_tag).configs for deployment in configs: - if not isinstance(configs[deployment], dict): + if not isinstance(configs[deployment], 
dict) or deployment == mii.constants.PORT_MAP_KEY: continue configs[deployment][mii.constants.DEPLOYED_KEY] = True deployments.append(configs[deployment]) - return deployments + lb_config = configs[mii.constants.LOAD_BALANCER_CONFIG_KEY] + model_path = configs[mii.constants.MODEL_PATH_KEY] + port_map = configs[mii.constants.PORT_MAP_KEY] + return deployments, lb_config, model_path, port_map def mii_query_handle(deployment_tag): @@ -42,12 +46,12 @@ def mii_query_handle(deployment_tag): inference_pipeline, task = mii.non_persistent_models[deployment_tag] return MIINonPersistentClient(task, deployment_tag) - deployments = _get_deployment_configs(deployment_tag) + deployments, lb_config, model_path, port_map = _get_deployment_configs(deployment_tag) if len(deployments) > 0: mii_configs_dict = deployments[0][mii.constants.MII_CONFIGS_KEY] mii_configs = mii.config.MIIConfig(**mii_configs_dict) - return MIIClient(deployments, "localhost", mii_configs.port_number) + return MIIClient(deployments, "localhost", mii_configs.port_number, lb_config, model_path, port_map, deployment_tag) def create_channel(host, port): @@ -62,12 +66,16 @@ class MIIClient(): """ Client to send queries to a single endpoint. """ - def __init__(self, deployments, host, port): + def __init__(self, deployments, host, port, lb_config=None, model_path=None, port_map=None, deployment_tag=None): self.asyncio_loop = asyncio.get_event_loop() channel = create_channel(host, port) self.stub = modelresponse_pb2_grpc.DeploymentManagementStub(channel) #self.task = get_task(task_name) self.deployments = deployments + self.lb_config = lb_config + self.model_path = model_path + self.port_map = port_map + self.deployment_tag = deployment_tag def _get_deployment_task(self, deployment_name=None): task = None @@ -131,8 +139,8 @@ def destroy_session(self, session_id, deployment_name=None): assert task == Tasks.TEXT_GENERATION, f"Session deletion only available for task '{Tasks.TEXT_GENERATION}'." 
self.asyncio_loop.run_until_complete(self.destroy_session_async(session_id)) - async def add_models_async(self, request=None): - await getattr(self.stub, "AddDeployment")(modelresponse_pb2.google_dot_protobuf_dot_empty__pb2.Empty()) + async def add_models_async(self, proto_request): + await getattr(self.stub, "AddDeployment")(proto_request) def add_models(self, task=None, @@ -161,39 +169,57 @@ def add_models(self, version=version, deployed=False) ] - - deployment_tag = mii.deployment_tag - lb_config = allocate_processes(deployments) - if mii.lb_config is not None: - mii.lb_config.replica_configs.extend(lb_config.replica_configs) + for deployment in deployments: + deployment.task = get_task(deployment.task) + lb_config = allocate_processes(deployments, self.port_map) + if self.lb_config is not None: + self.lb_config.replica_configs.extend(lb_config.replica_configs) else: - mii.lb_config = lb_config + self.lb_config = lb_config self.deployments.extend(deployments) - if mii.model_path is None and deployment_type == DeploymentType.LOCAL: - mii.model_path = MII_MODEL_PATH_DEFAULT - elif mii.model_path is None and deployment_type == DeploymentType.AML: + if self.model_path is None and deployment_type == DeploymentType.LOCAL: + self.model_path = mii.constants.MII_MODEL_PATH_DEFAULT + elif self.model_path is None and deployment_type == DeploymentType.AML: model_path = "model" deps = [] for deployment in self.deployments: - data = { - 'deployment_name': deployment[mii.constants.DEPLOYMENT_NAME_KEY], - 'task': deployment[mii.constants.TASK_NAME_KEY], - 'model': deployment[mii.constants.MODEL_NAME_KEY], - 'enable_deepspeed': deployment[mii.constants.ENABLE_DEEPSPEED_KEY], - 'enable_zero': deployment[mii.constants.ENABLE_DEEPSPEED_ZERO_KEY], - 'GPU_index_map': None, - 'mii_config': deployment[mii.constants.MII_CONFIGS_KEY], - 'ds_config': deployment[mii.constants.DEEPSPEED_CONFIG_KEY], - 'version': 1 - 'deployed' deployment[mii.constants.DEPLOYED_KEY] - } - - create_score_file(deployment_tag=deployment_tag, deployment_type=mii.deployment_type, deployments=self.deployments, model_path=mii.model_path, lb_config=mii.lb_config) - if mii.deployment_type == DeploymentType.Local: - mii.utils.import_score_file(deployment_tag).init() + if isinstance(deployment, dict): + + data = { + 'deployment_name': deployment[mii.constants.DEPLOYMENT_NAME_KEY], + 'task': deployment[mii.constants.TASK_NAME_KEY], + 'model': deployment[mii.constants.MODEL_NAME_KEY], + 'enable_deepspeed': deployment[mii.constants.ENABLE_DEEPSPEED_KEY], + 'enable_zero': deployment[mii.constants.ENABLE_DEEPSPEED_ZERO_KEY], + 'GPU_index_map': None, + 'mii_config': deployment[mii.constants.MII_CONFIGS_KEY], + 'ds_config': deployment[mii.constants.DEEPSPEED_CONFIG_KEY], + 'version': 1, + 'deployed': deployment[mii.constants.DEPLOYED_KEY] + } + deps.append(DeploymentConfig.parse_obj(data)) + else: + deps.append(deployment) + for deployment in deps: + if isinstance(deployment.task, str): + deployment.task = get_task(deployment.task) + print(deps) + time.sleep(5) + create_score_file(deployment_tag=self.deployment_tag, deployment_type=deployment_type, deployments=deps, model_path=self.model_path, port_map=self.port_map, lb_config=lb_config) + if deployment_type == DeploymentType.LOCAL: + mii.utils.import_score_file(self.deployment_tag).init() - self.asyncio_loop.run_until_complete(self.add_models_async()) + for replica in lb_config.replica_configs: + request_proto = modelresponse_pb2.AddDeployRequest(task=replica.task, + 
deployment_name=replica.deployment_name, + hostname=replica.hostname, + tensor_parallel_ports=replica.tensor_parallel_ports, + torch_dist_port=replica.torch_dist_port, + gpu_indices=replica.gpu_indices + ) + + self.asyncio_loop.run_until_complete(self.add_models_async(request_proto)) class MIITensorParallelClient(): """ Client to send queries to multiple endpoints in parallel. diff --git a/mii/constants.py b/mii/constants.py index baffdcf9..61c7c474 100644 --- a/mii/constants.py +++ b/mii/constants.py @@ -89,6 +89,7 @@ class ModelProvider(enum.Enum): TEXT2IMG_NAME: ["query"] } +PORT_MAP_KEY = 'port_map' MODEL_NAME_KEY = 'model_name' TASK_NAME_KEY = 'task_name' DEPLOYMENT_NAME_KEY = 'deployment_name' diff --git a/mii/deployment.py b/mii/deployment.py index ae539b10..0848e89d 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -68,7 +68,6 @@ def deploy(task=None, If deployment_type is `LOCAL`, returns just the name of the deployment that can be used to create a query handle using `mii.mii_query_handle(deployment_name)` """ - mii.deployment_type = deployment_type if not deployments: assert all((model, task, deployment_name)), "model, task, and deployment name must be set to deploy singular model" deployments = [ @@ -86,7 +85,6 @@ def deploy(task=None, deployment_tag = deployment_name else: assert deployment_tag is not None, "deployment_tag must be set to deploy multiple models" - mii.deployment_tag = deployment_tag # parse and validate mii config for deployment in deployments: mii_config = deployment.mii_config @@ -128,10 +126,10 @@ def deploy(task=None, elif model_path is None and deployment_type == DeploymentType.AML: model_path = "model" - mii.model_path = model_path # add fields for replica deployment replica_configs = [] port_offset = 1 + port_map = {} for deployment in deployments: mii_config = deployment.mii_config replica_pool = _allocate_processes(mii_config.hostfile, @@ -141,16 +139,16 @@ def deploy(task=None, for i, (hostname, gpu_indices) in enumerate(replica_pool): # Reserver port for a LB proxy when replication is enabled - if hostname not in mii.port_map: - mii.port_map[hostname] = set() + if hostname not in port_map: + port_map[hostname] = set() base_port = mii_config.port_number + i * mii_config.tensor_parallel + port_offset - if base_port in mii.port_map[hostname]: - base_port = max(mii.port_map[hostname]) + 1 + if base_port in port_map[hostname]: + base_port = max(port_map[hostname]) + 1 tensor_parallel_ports = list( range(base_port, base_port + mii_config.tensor_parallel)) for i in range(base_port, base_port + mii_config.tensor_parallel): - mii.port_map[hostname].add(i) + port_map[hostname].add(i) torch_dist_port = mii_config.torch_dist_port + i replica_configs.append( ReplicaConfig(task=get_task_name(deployment.task), @@ -167,6 +165,7 @@ def deploy(task=None, deployment_type=deployment_type, deployments=deployments, model_path=model_path, + port_map=port_map, lb_config=lb_config) if deployment_type == DeploymentType.AML: @@ -188,7 +187,7 @@ def deploy(task=None, else: raise Exception(f"Unknown deployment type: {deployment_type}") -def allocate_processes(deployments): +def allocate_processes(deployments, port_map): replica_configs = [] port_offset = 1 for deployment in deployments: @@ -200,16 +199,16 @@ def allocate_processes(deployments): for i, (hostname, gpu_indices) in enumerate(replica_pool): # Reserver port for a LB proxy when replication is enabled - if hostname not in mii.port_map: - mii.port_map[hostname] = set() + if hostname not in port_map: + 
port_map[hostname] = set() base_port = mii_config.port_number + i * mii_config.tensor_parallel + port_offset - if base_port in mii.port_map[hostname]: - base_port = max(mii.port_map[hostname]) + 1 + if base_port in port_map[hostname]: + base_port = max(port_map[hostname]) + 1 tensor_parallel_ports = list( range(base_port, base_port + mii_config.tensor_parallel)) for i in range(base_port, base_port + mii_config.tensor_parallel): - mii.port_map[hostname].add(i) + port_map[hostname].add(i) torch_dist_port = mii_config.torch_dist_port + i replica_configs.append( ReplicaConfig(task=get_task_name(deployment.task), diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index ab55ae32..97c6c3cb 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -34,7 +34,7 @@ def get_stop_event(self): class DeploymentManagement(ServiceBase, modelresponse_pb2_grpc.DeploymentManagementServicer): def AddDeployment(self, request, context): - print("TESTING ADD DEPLOYMENT") + print("DEPLOYMENT ADDED") return google_dot_protobuf_dot_empty__pb2.Empty() @@ -203,7 +203,6 @@ def choose_stub(self, call_count): def intercept_service(self, continuation, handler_call_details): next_handler = continuation(handler_call_details) - print(next_handler) assert next_handler.unary_unary is not None #USE KWARGS LIKE THEY ARE USED TO MAKE SESSIONS TO GET THE DEPLOYMENT NAME TO HASH THE COUNTERS/STUBS @@ -211,9 +210,14 @@ def intercept_service(self, continuation, handler_call_details): def invoke_intercept_method(request_proto, context): method_name = _get_grpc_method_name(handler_call_details.method) if method_name == ADD_DEPLOYMENT_METHOD: - for name in self.stubs: - for stub in self.stubs[name]: - stub.invoke(ADD_DEPLOYMENT_METHOD, request_proto) + print(f"REQUEST PROTO -> {request_proto}") + task = str(getattr(request_proto, "task")) + deployment_name = str(getattr(request_proto, "deployment_name")) + hostname = str(getattr(request_proto, "hostname")) + tensor_parallel_ports = list(getattr(request_proto, "tensor_parallel_ports")) + torch_dist_port = int(getattr(request_proto, "torch_dist_port")) + gpu_indices = list(getattr(request_proto, "gpu_indices")) + print(type(gpu_indices[0])) return google_dot_protobuf_dot_empty__pb2.Empty() if method_name == TERMINATE_METHOD: diff --git a/mii/grpc_related/proto/modelresponse.proto b/mii/grpc_related/proto/modelresponse.proto index 7e4d3520..36d8c0e9 100644 --- a/mii/grpc_related/proto/modelresponse.proto +++ b/mii/grpc_related/proto/modelresponse.proto @@ -37,7 +37,7 @@ service ModelResponse { } service DeploymentManagement { - rpc AddDeployment(google.protobuf.Empty) returns (google.protobuf.Empty) {} + rpc AddDeployment(AddDeployRequest) returns (google.protobuf.Empty) {} } message Value { diff --git a/mii/grpc_related/proto/modelresponse_pb2.py b/mii/grpc_related/proto/modelresponse_pb2.py index 1fc27665..452de039 100644 --- a/mii/grpc_related/proto/modelresponse_pb2.py +++ b/mii/grpc_related/proto/modelresponse_pb2.py @@ -14,7 +14,7 @@ from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 
\x01(\x08H\x00\x42\x0e\n\x0coneof_values\"\x1f\n\tSessionID\x12\x12\n\nsession_id\x18\x01 \x01(\t\"\xed\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\x85\x01\n\x11SingleStringReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x84\x01\n\x10MultiStringReply\x12\x10\n\x08response\x18\x01 \x03(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\tQARequest\x12\x10\n\x08question\x18\x01 \x01(\t\x12\x0f\n\x07\x63ontext\x18\x02 \x01(\t\x12?\n\x0cquery_kwargs\x18\x03 \x03(\x0b\x32).modelresponse.QARequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xd3\x02\n\x13\x43onversationRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x1c\n\x0f\x63onversation_id\x18\x02 \x01(\x03H\x00\x88\x01\x01\x12\x18\n\x10past_user_inputs\x18\x03 \x03(\t\x12\x1b\n\x13generated_responses\x18\x04 \x03(\t\x12I\n\x0cquery_kwargs\x18\x05 \x03(\x0b\x32\x33.modelresponse.ConversationRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x01\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_conversation_idB\x12\n\x10_deployment_name\"\xc3\x01\n\x11\x43onversationReply\x12\x17\n\x0f\x63onversation_id\x18\x01 \x01(\x03\x12\x18\n\x10past_user_inputs\x18\x02 \x03(\t\x12\x1b\n\x13generated_responses\x18\x03 \x03(\t\x12\x12\n\ntime_taken\x18\x04 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x05 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xaf\x01\n\nImageReply\x12\x0e\n\x06images\x18\x01 \x03(\x0c\x12\x1d\n\x15nsfw_content_detected\x18\x02 \x03(\x08\x12\x0c\n\x04mode\x18\x03 \x01(\t\x12\x0e\n\x06size_w\x18\x04 \x01(\x03\x12\x0e\n\x06size_h\x18\x05 \x01(\x03\x12\x12\n\ntime_taken\x18\x06 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x07 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x98\x01\n\x10\x41\x64\x64\x44\x65ployRequest\x12\x0c\n\x04task\x18\x01 \x01(\t\x12\x17\n\x0f\x64\x65ployment_name\x18\x02 \x01(\t\x12\x10\n\x08hostname\x18\x03 \x01(\t\x12\x1d\n\x15tensor_parallel_ports\x18\x04 \x03(\x03\x12\x17\n\x0ftorch_dist_port\x18\x05 \x01(\x03\x12\x13\n\x0bgpu_indices\x18\x06 
\x03(\x03\x32\xd4\x06\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x32Y\n\x14\x44\x65ploymentManagement\x12\x41\n\rAddDeployment\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x62\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x42\x0e\n\x0coneof_values\"\x1f\n\tSessionID\x12\x12\n\nsession_id\x18\x01 \x01(\t\"\xed\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\x85\x01\n\x11SingleStringReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x84\x01\n\x10MultiStringReply\x12\x10\n\x08response\x18\x01 \x03(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\tQARequest\x12\x10\n\x08question\x18\x01 \x01(\t\x12\x0f\n\x07\x63ontext\x18\x02 \x01(\t\x12?\n\x0cquery_kwargs\x18\x03 \x03(\x0b\x32).modelresponse.QARequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xd3\x02\n\x13\x43onversationRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x1c\n\x0f\x63onversation_id\x18\x02 
\x01(\x03H\x00\x88\x01\x01\x12\x18\n\x10past_user_inputs\x18\x03 \x03(\t\x12\x1b\n\x13generated_responses\x18\x04 \x03(\t\x12I\n\x0cquery_kwargs\x18\x05 \x03(\x0b\x32\x33.modelresponse.ConversationRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x01\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_conversation_idB\x12\n\x10_deployment_name\"\xc3\x01\n\x11\x43onversationReply\x12\x17\n\x0f\x63onversation_id\x18\x01 \x01(\x03\x12\x18\n\x10past_user_inputs\x18\x02 \x03(\t\x12\x1b\n\x13generated_responses\x18\x03 \x03(\t\x12\x12\n\ntime_taken\x18\x04 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x05 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xaf\x01\n\nImageReply\x12\x0e\n\x06images\x18\x01 \x03(\x0c\x12\x1d\n\x15nsfw_content_detected\x18\x02 \x03(\x08\x12\x0c\n\x04mode\x18\x03 \x01(\t\x12\x0e\n\x06size_w\x18\x04 \x01(\x03\x12\x0e\n\x06size_h\x18\x05 \x01(\x03\x12\x12\n\ntime_taken\x18\x06 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x07 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x98\x01\n\x10\x41\x64\x64\x44\x65ployRequest\x12\x0c\n\x04task\x18\x01 \x01(\t\x12\x17\n\x0f\x64\x65ployment_name\x18\x02 \x01(\t\x12\x10\n\x08hostname\x18\x03 \x01(\t\x12\x1d\n\x15tensor_parallel_ports\x18\x04 \x03(\x03\x12\x17\n\x0ftorch_dist_port\x18\x05 \x01(\x03\x12\x13\n\x0bgpu_indices\x18\x06 \x03(\x03\x32\xd4\x06\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x32\x62\n\x14\x44\x65ploymentManagement\x12J\n\rAddDeployment\x12\x1f.modelresponse.AddDeployRequest\x1a\x16.google.protobuf.Empty\"\x00\x62\x06proto3') _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) @@ -63,5 +63,5 @@ _globals['_MODELRESPONSE']._serialized_start=2058 _globals['_MODELRESPONSE']._serialized_end=2910 _globals['_DEPLOYMENTMANAGEMENT']._serialized_start=2912 - _globals['_DEPLOYMENTMANAGEMENT']._serialized_end=3001 + _globals['_DEPLOYMENTMANAGEMENT']._serialized_end=3010 # @@protoc_insertion_point(module_scope) diff --git a/mii/grpc_related/proto/modelresponse_pb2_grpc.py b/mii/grpc_related/proto/modelresponse_pb2_grpc.py index 438fa0c2..dc91fcfc 100644 --- a/mii/grpc_related/proto/modelresponse_pb2_grpc.py +++ b/mii/grpc_related/proto/modelresponse_pb2_grpc.py @@ -375,7 +375,7 @@ def __init__(self, channel): """ self.AddDeployment = channel.unary_unary( 
'/modelresponse.DeploymentManagement/AddDeployment', - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + request_serializer=modelresponse__pb2.AddDeployRequest.SerializeToString, response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, ) @@ -394,7 +394,7 @@ def add_DeploymentManagementServicer_to_server(servicer, server): rpc_method_handlers = { 'AddDeployment': grpc.unary_unary_rpc_method_handler( servicer.AddDeployment, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + request_deserializer=modelresponse__pb2.AddDeployRequest.FromString, response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, ), } @@ -419,7 +419,7 @@ def AddDeployment(request, timeout=None, metadata=None): return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/AddDeployment', - google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + modelresponse__pb2.AddDeployRequest.SerializeToString, google_dot_protobuf_dot_empty__pb2.Empty.FromString, options, channel_credentials, insecure, call_credentials, compression, wait_for_ready, timeout, metadata) diff --git a/mii/models/score/generate.py b/mii/models/score/generate.py index 50f0446f..d807ab45 100644 --- a/mii/models/score/generate.py +++ b/mii/models/score/generate.py @@ -13,11 +13,13 @@ def create_score_file(deployment_tag, deployment_type, deployments, model_path, + port_map, lb_config): config_dict = {} config_dict[mii.constants.MODEL_PATH_KEY] = model_path config_dict[mii.constants.DEPLOYMENT_TAG_KEY] = deployment_tag + config_dict[mii.constants.PORT_MAP_KEY] = port_map for deployment in deployments: deployment_config = { mii.constants.DEPLOYMENT_NAME_KEY: deployment.deployment_name, diff --git a/mii/models/score/score_template.py b/mii/models/score/score_template.py index 84238511..fd6a7f0f 100644 --- a/mii/models/score/score_template.py +++ b/mii/models/score/score_template.py @@ -19,9 +19,11 @@ def init(): deployment_tag = configs[mii.constants.DEPLOYMENT_TAG_KEY] deployments = [] lb_enabled = False + del configs[mii.constants.PORT_MAP_KEY] for deployment in configs.values(): if isinstance(deployment, dict) and deployment[mii.constants.DEPLOYED_KEY]: lb_enabled = True + print(deployment) continue if not isinstance(deployment, dict): continue diff --git a/mii/server.py b/mii/server.py index 201c1a37..f29e3a28 100644 --- a/mii/server.py +++ b/mii/server.py @@ -49,7 +49,7 @@ def __init__(self, deployment_tag, deployments, model_path, lb_config=None, lb_e deployments, model_path, lb_config, - name_map) + ) self._wait_until_server_is_live(processes, lb_config.replica_configs) def _wait_until_server_is_live(self, processes, deployment): From adee843f3430c5ab02e9e3e7f4193cf91757aa90 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Mon, 24 Jul 2023 20:38:21 +0000 Subject: [PATCH 47/69] fixing add deploy api queries --- mii/client.py | 60 ++-- mii/deployment.py | 2 +- mii/grpc_related/modelresponse_server.py | 68 ++-- mii/grpc_related/proto/modelresponse.proto | 9 + mii/grpc_related/proto/modelresponse_pb2.py | 6 +- .../proto/modelresponse_pb2_grpc.py | 297 ++++++++++++++++++ 6 files changed, 356 insertions(+), 86 deletions(-) diff --git a/mii/client.py b/mii/client.py index 9da14d4f..9facd25c 100644 --- a/mii/client.py +++ b/mii/client.py @@ -6,7 +6,6 @@ import grpc import requests import mii -import time from mii.utils import get_task from mii.grpc_related.proto import modelresponse_pb2, modelresponse_pb2_grpc from mii.constants import 
GRPC_MAX_MSG_SIZE, Tasks, DeploymentType @@ -22,7 +21,19 @@ def _get_deployment_configs(deployment_tag): if not isinstance(configs[deployment], dict) or deployment == mii.constants.PORT_MAP_KEY: continue configs[deployment][mii.constants.DEPLOYED_KEY] = True - deployments.append(configs[deployment]) + data = { + 'deployment_name':configs[deployment][mii.constants.DEPLOYMENT_NAME_KEY], + 'task': configs[deployment][mii.constants.TASK_NAME_KEY], + 'model': configs[deployment][mii.constants.MODEL_NAME_KEY], + 'enable_deepspeed': configs[deployment][mii.constants.ENABLE_DEEPSPEED_KEY], + 'enable_zero': configs[deployment][mii.constants.ENABLE_DEEPSPEED_ZERO_KEY], + 'GPU_index_map': None, + 'mii_config': configs[deployment][mii.constants.MII_CONFIGS_KEY], + 'ds_config': configs[deployment][mii.constants.DEEPSPEED_CONFIG_KEY], + 'version': 1, + 'deployed': configs[deployment][mii.constants.DEPLOYED_KEY] + } + deployments.append(DeploymentConfig.parse_obj(data)) lb_config = configs[mii.constants.LOAD_BALANCER_CONFIG_KEY] model_path = configs[mii.constants.MODEL_PATH_KEY] port_map = configs[mii.constants.PORT_MAP_KEY] @@ -48,10 +59,10 @@ def mii_query_handle(deployment_tag): deployments, lb_config, model_path, port_map = _get_deployment_configs(deployment_tag) if len(deployments) > 0: - mii_configs_dict = deployments[0][mii.constants.MII_CONFIGS_KEY] - mii_configs = mii.config.MIIConfig(**mii_configs_dict) + mii_configs_dict = deployments[0].mii_config + #mii_configs = mii.config.MIIConfig(**mii_configs_dict) - return MIIClient(deployments, "localhost", mii_configs.port_number, lb_config, model_path, port_map, deployment_tag) + return MIIClient(deployments, "localhost", mii_configs_dict.port_number, lb_config, model_path, port_map, deployment_tag) def create_channel(host, port): @@ -70,7 +81,6 @@ def __init__(self, deployments, host, port, lb_config=None, model_path=None, por self.asyncio_loop = asyncio.get_event_loop() channel = create_channel(host, port) self.stub = modelresponse_pb2_grpc.DeploymentManagementStub(channel) - #self.task = get_task(task_name) self.deployments = deployments self.lb_config = lb_config self.model_path = model_path @@ -81,12 +91,13 @@ def _get_deployment_task(self, deployment_name=None): task = None if deployment_name is None: #mii.terminate() or single model assert len(self.deployments) == 1, "Must pass deployment_name to query when using multiple deployments" - deployment_name = self.deployments[0][mii.constants.DEPLOYMENT_NAME_KEY] - task = get_task(self.deployments[0][mii.constants.TASK_NAME_KEY]) + deployment_name = self.deployments[0].deployment_name + task = get_task(self.deployments[0].task) if isinstance(deployment.task, str) else self.deployments[0].task else: for deployment in self.deployments: - if deployment[mii.constants.DEPLOYMENT_NAME_KEY] == deployment_name: - task = get_task(deployment[mii.constants.TASK_NAME_KEY]) + print(deployment.deployment_name) + if deployment.deployment_name == deployment_name: + task = get_task(deployment.task) if isinstance(deployment.task, str) else deployment.task return deployment_name, task assert False, f"{deployment_name} not found in list of deployments" return deployment_name, task @@ -155,6 +166,7 @@ def add_models(self, deployment_type=DeploymentType.LOCAL, model_path=None, version=1): + if not deployments: assert all((model, task, deployment_name)), "model, task, and deployment name must be set to deploy singular model" deployments = [ @@ -172,7 +184,8 @@ def add_models(self, for deployment in deployments: 
deployment.task = get_task(deployment.task) - lb_config = allocate_processes(deployments, self.port_map) + lb_config, self.port_map = allocate_processes(deployments, self.port_map) + if self.lb_config is not None: self.lb_config.replica_configs.extend(lb_config.replica_configs) else: @@ -182,34 +195,13 @@ def add_models(self, self.model_path = mii.constants.MII_MODEL_PATH_DEFAULT elif self.model_path is None and deployment_type == DeploymentType.AML: model_path = "model" - deps = [] for deployment in self.deployments: - if isinstance(deployment, dict): - - data = { - 'deployment_name': deployment[mii.constants.DEPLOYMENT_NAME_KEY], - 'task': deployment[mii.constants.TASK_NAME_KEY], - 'model': deployment[mii.constants.MODEL_NAME_KEY], - 'enable_deepspeed': deployment[mii.constants.ENABLE_DEEPSPEED_KEY], - 'enable_zero': deployment[mii.constants.ENABLE_DEEPSPEED_ZERO_KEY], - 'GPU_index_map': None, - 'mii_config': deployment[mii.constants.MII_CONFIGS_KEY], - 'ds_config': deployment[mii.constants.DEEPSPEED_CONFIG_KEY], - 'version': 1, - 'deployed': deployment[mii.constants.DEPLOYED_KEY] - } - deps.append(DeploymentConfig.parse_obj(data)) - else: - deps.append(deployment) - for deployment in deps: if isinstance(deployment.task, str): deployment.task = get_task(deployment.task) - print(deps) - time.sleep(5) - create_score_file(deployment_tag=self.deployment_tag, deployment_type=deployment_type, deployments=deps, model_path=self.model_path, port_map=self.port_map, lb_config=lb_config) + create_score_file(deployment_tag=self.deployment_tag, deployment_type=deployment_type, deployments=self.deployments, model_path=self.model_path, port_map=self.port_map, lb_config=lb_config) if deployment_type == DeploymentType.LOCAL: mii.utils.import_score_file(self.deployment_tag).init() - + for replica in lb_config.replica_configs: request_proto = modelresponse_pb2.AddDeployRequest(task=replica.task, deployment_name=replica.deployment_name, diff --git a/mii/deployment.py b/mii/deployment.py index 0848e89d..20332fa4 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -219,7 +219,7 @@ def allocate_processes(deployments, port_map): gpu_indices=gpu_indices)) lb_config = LoadBalancerConfig(port=mii_config.port_number, replica_configs=replica_configs) - return lb_config + return lb_config, port_map def _deploy_local(deployment_tag, model_path): mii.utils.import_score_file(deployment_tag).init() diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index 97c6c3cb..aa000e53 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -142,15 +142,16 @@ class ParallelStubInvoker: This class aims to call gRPC methods without conversions between proto and python object. TensorParallelClient can be used for invocation with the conversions. 
""" - def __init__(self, host, ports): + def __init__(self, host, ports, asyncio_loop): # Assumption: target services are all on the same host self.stubs = [] for port in ports: + asyncio.set_event_loop(asyncio_loop) channel = create_channel(host, port) - stub = modelresponse_pb2_grpc.DeploymentManagementStub(channel) + stub = modelresponse_pb2_grpc.ModelResponseStub(channel) self.stubs.append(stub) - self.asyncio_loop = asyncio.get_event_loop() + self.asyncio_loop = asyncio_loop async def _invoke_async(self, method_name, proto_request): responses = [] @@ -184,12 +185,8 @@ def __init__(self, replica_configs): for repl in replica_configs: self.stubs[repl.deployment_name].append( ParallelStubInvoker(repl.hostname, - repl.tensor_parallel_ports)) - """ - self.counter = AtomicCounter() - self.task = get_task(task_name) - self.replica_sessions = {} - """ + repl.tensor_parallel_ports, + self.asyncio_loop)) # Start the asyncio loop in a separate thread def run_asyncio_loop(loop): @@ -210,14 +207,17 @@ def intercept_service(self, continuation, handler_call_details): def invoke_intercept_method(request_proto, context): method_name = _get_grpc_method_name(handler_call_details.method) if method_name == ADD_DEPLOYMENT_METHOD: - print(f"REQUEST PROTO -> {request_proto}") task = str(getattr(request_proto, "task")) deployment_name = str(getattr(request_proto, "deployment_name")) hostname = str(getattr(request_proto, "hostname")) tensor_parallel_ports = list(getattr(request_proto, "tensor_parallel_ports")) torch_dist_port = int(getattr(request_proto, "torch_dist_port")) gpu_indices = list(getattr(request_proto, "gpu_indices")) - print(type(gpu_indices[0])) + if deployment_name not in self.stubs: + self.stubs[deployment_name] = [] + self.counter[deployment_name] = AtomicCounter() + self.tasks[deployment_name] = task + self.stubs[deployment_name].append(ParallelStubInvoker(hostname, tensor_parallel_ports, self.asyncio_loop)) return google_dot_protobuf_dot_empty__pb2.Empty() if method_name == TERMINATE_METHOD: @@ -227,40 +227,8 @@ def invoke_intercept_method(request_proto, context): google_dot_protobuf_dot_empty__pb2.Empty()) self.asyncio_loop.call_soon_threadsafe(self.asyncio_loop.stop) return next_handler.unary_unary(request_proto, context) + deployment_name = getattr(request_proto, 'deployment_name') - """ - kwargs = unpack_proto_query_kwargs(request_proto.query_kwargs) - if method_name != TERMINATE_METHOD: - assert "deployment_name" in kwargs, "Must include deployment_name in kwargs for query" - deployment_name = kwargs.get('deployment_name') - kwargs.pop('deployment_name', None) - task = self.tasks[deployment_name] - assert task is not None, f"task for {deployment_name} not found" - method = GRPC_METHOD_TABLE[get_task(task)] - new_request = None - if method_name == "ConversationalReply": - request_dict = {} - request_dict['text'] = str(request_proto.text) - val = getattr(request_proto, 'conversation_id') - request_dict['conversation_id'] = int(val) if val is not None else None - request_dict['past_user_inputs'] = list(request_proto.past_user_inputs) - request_dict['generated_responses'] = list( - request_proto.generated_responses) - new_request = method.pack_request_to_proto(request_dict, **kwargs) - - elif method_name == "QuestionAndAnswerReply": - request_dict = {} - request_dict['question'] = str(request_proto.question) - request_dict['context'] = str(request_proto.context) - new_request = method.pack_request_to_proto(request_dict, **kwargs) - else: - request_dict = {} - request_dict["query"] = list( 
- request_proto.request - ) if method_name == "GeneratorReply" or method_name == "Txt2ImgReply" else str( - request_proto.request) - new_request = method.pack_request_to_proto(request_dict, **kwargs) - """ call_count = self.counter[deployment_name].get_and_increment() replica_index = call_count % len(self.stubs[deployment_name]) @@ -298,7 +266,7 @@ def invoke_intercept_method(request_proto, context): response_serializer=next_handler.response_serializer) -def _do_serve(service_impl, port, interceptors=[]): +def _do_serve(service_impl, port, interceptors=[], is_lb=False): stop_event = service_impl.get_stop_event() server = grpc.server(futures.ThreadPoolExecutor(max_workers=LB_MAX_WORKER_THREADS), interceptors=interceptors, @@ -306,7 +274,10 @@ def _do_serve(service_impl, port, interceptors=[]): GRPC_MAX_MSG_SIZE), ('grpc.max_receive_message_length', GRPC_MAX_MSG_SIZE)]) - modelresponse_pb2_grpc.add_DeploymentManagementServicer_to_server(service_impl, server) + if is_lb: + modelresponse_pb2_grpc.add_DeploymentManagementServicer_to_server(service_impl, server) + else: + modelresponse_pb2_grpc.add_ModelResponseServicer_to_server(service_impl, server) server.add_insecure_port(f'[::]:{port}') print(f"About to start server") server.start() @@ -316,13 +287,14 @@ def _do_serve(service_impl, port, interceptors=[]): def serve_inference(inference_pipeline, port): - _do_serve(DeploymentManagement(), port) + _do_serve(ModelResponse(inference_pipeline), port) def serve_load_balancing(lb_config): _do_serve(DeploymentManagement(), lb_config.port, - [LoadBalancingInterceptor(lb_config.replica_configs)]) + [LoadBalancingInterceptor(lb_config.replica_configs)], + True) if __name__ == '__main__': diff --git a/mii/grpc_related/proto/modelresponse.proto b/mii/grpc_related/proto/modelresponse.proto index 36d8c0e9..757fa0da 100644 --- a/mii/grpc_related/proto/modelresponse.proto +++ b/mii/grpc_related/proto/modelresponse.proto @@ -38,6 +38,15 @@ service ModelResponse { service DeploymentManagement { rpc AddDeployment(AddDeployRequest) returns (google.protobuf.Empty) {} + rpc CreateSession (SessionID) returns (google.protobuf.Empty) {} + rpc DestroySession (SessionID) returns (google.protobuf.Empty) {} + rpc GeneratorReply (MultiStringRequest) returns (MultiStringReply) {} + rpc ClassificationReply (SingleStringRequest) returns (SingleStringReply) {} + rpc QuestionAndAnswerReply(QARequest) returns (SingleStringReply) {} + rpc FillMaskReply(SingleStringRequest) returns (SingleStringReply) {} + rpc TokenClassificationReply(SingleStringRequest) returns (SingleStringReply) {} + rpc ConversationalReply(ConversationRequest) returns (ConversationReply) {} + rpc Txt2ImgReply(MultiStringRequest) returns (ImageReply) {} } message Value { diff --git a/mii/grpc_related/proto/modelresponse_pb2.py b/mii/grpc_related/proto/modelresponse_pb2.py index 452de039..073083b4 100644 --- a/mii/grpc_related/proto/modelresponse_pb2.py +++ b/mii/grpc_related/proto/modelresponse_pb2.py @@ -14,7 +14,7 @@ from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x42\x0e\n\x0coneof_values\"\x1f\n\tSessionID\x12\x12\n\nsession_id\x18\x01 
\x01(\t\"\xed\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\x85\x01\n\x11SingleStringReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x84\x01\n\x10MultiStringReply\x12\x10\n\x08response\x18\x01 \x03(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\tQARequest\x12\x10\n\x08question\x18\x01 \x01(\t\x12\x0f\n\x07\x63ontext\x18\x02 \x01(\t\x12?\n\x0cquery_kwargs\x18\x03 \x03(\x0b\x32).modelresponse.QARequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xd3\x02\n\x13\x43onversationRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x1c\n\x0f\x63onversation_id\x18\x02 \x01(\x03H\x00\x88\x01\x01\x12\x18\n\x10past_user_inputs\x18\x03 \x03(\t\x12\x1b\n\x13generated_responses\x18\x04 \x03(\t\x12I\n\x0cquery_kwargs\x18\x05 \x03(\x0b\x32\x33.modelresponse.ConversationRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x01\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_conversation_idB\x12\n\x10_deployment_name\"\xc3\x01\n\x11\x43onversationReply\x12\x17\n\x0f\x63onversation_id\x18\x01 \x01(\x03\x12\x18\n\x10past_user_inputs\x18\x02 \x03(\t\x12\x1b\n\x13generated_responses\x18\x03 \x03(\t\x12\x12\n\ntime_taken\x18\x04 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x05 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xaf\x01\n\nImageReply\x12\x0e\n\x06images\x18\x01 \x03(\x0c\x12\x1d\n\x15nsfw_content_detected\x18\x02 \x03(\x08\x12\x0c\n\x04mode\x18\x03 \x01(\t\x12\x0e\n\x06size_w\x18\x04 \x01(\x03\x12\x0e\n\x06size_h\x18\x05 \x01(\x03\x12\x12\n\ntime_taken\x18\x06 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x07 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x98\x01\n\x10\x41\x64\x64\x44\x65ployRequest\x12\x0c\n\x04task\x18\x01 \x01(\t\x12\x17\n\x0f\x64\x65ployment_name\x18\x02 \x01(\t\x12\x10\n\x08hostname\x18\x03 \x01(\t\x12\x1d\n\x15tensor_parallel_ports\x18\x04 \x03(\x03\x12\x17\n\x0ftorch_dist_port\x18\x05 \x01(\x03\x12\x13\n\x0bgpu_indices\x18\x06 
\x03(\x03\x32\xd4\x06\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x32\x62\n\x14\x44\x65ploymentManagement\x12J\n\rAddDeployment\x12\x1f.modelresponse.AddDeployRequest\x1a\x16.google.protobuf.Empty\"\x00\x62\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x42\x0e\n\x0coneof_values\"\x1f\n\tSessionID\x12\x12\n\nsession_id\x18\x01 \x01(\t\"\xed\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\x85\x01\n\x11SingleStringReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x84\x01\n\x10MultiStringReply\x12\x10\n\x08response\x18\x01 \x03(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\tQARequest\x12\x10\n\x08question\x18\x01 \x01(\t\x12\x0f\n\x07\x63ontext\x18\x02 \x01(\t\x12?\n\x0cquery_kwargs\x18\x03 \x03(\x0b\x32).modelresponse.QARequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xd3\x02\n\x13\x43onversationRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x1c\n\x0f\x63onversation_id\x18\x02 
\x01(\x03H\x00\x88\x01\x01\x12\x18\n\x10past_user_inputs\x18\x03 \x03(\t\x12\x1b\n\x13generated_responses\x18\x04 \x03(\t\x12I\n\x0cquery_kwargs\x18\x05 \x03(\x0b\x32\x33.modelresponse.ConversationRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x01\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_conversation_idB\x12\n\x10_deployment_name\"\xc3\x01\n\x11\x43onversationReply\x12\x17\n\x0f\x63onversation_id\x18\x01 \x01(\x03\x12\x18\n\x10past_user_inputs\x18\x02 \x03(\t\x12\x1b\n\x13generated_responses\x18\x03 \x03(\t\x12\x12\n\ntime_taken\x18\x04 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x05 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xaf\x01\n\nImageReply\x12\x0e\n\x06images\x18\x01 \x03(\x0c\x12\x1d\n\x15nsfw_content_detected\x18\x02 \x03(\x08\x12\x0c\n\x04mode\x18\x03 \x01(\t\x12\x0e\n\x06size_w\x18\x04 \x01(\x03\x12\x0e\n\x06size_h\x18\x05 \x01(\x03\x12\x12\n\ntime_taken\x18\x06 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x07 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x98\x01\n\x10\x41\x64\x64\x44\x65ployRequest\x12\x0c\n\x04task\x18\x01 \x01(\t\x12\x17\n\x0f\x64\x65ployment_name\x18\x02 \x01(\t\x12\x10\n\x08hostname\x18\x03 \x01(\t\x12\x1d\n\x15tensor_parallel_ports\x18\x04 \x03(\x03\x12\x17\n\x0ftorch_dist_port\x18\x05 \x01(\x03\x12\x13\n\x0bgpu_indices\x18\x06 \x03(\x03\x32\xd4\x06\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x32\xe8\x06\n\x14\x44\x65ploymentManagement\x12J\n\rAddDeployment\x12\x1f.modelresponse.AddDeployRequest\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a 
.modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x62\x06proto3') _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) @@ -62,6 +62,6 @@ _globals['_ADDDEPLOYREQUEST']._serialized_end=2055 _globals['_MODELRESPONSE']._serialized_start=2058 _globals['_MODELRESPONSE']._serialized_end=2910 - _globals['_DEPLOYMENTMANAGEMENT']._serialized_start=2912 - _globals['_DEPLOYMENTMANAGEMENT']._serialized_end=3010 + _globals['_DEPLOYMENTMANAGEMENT']._serialized_start=2913 + _globals['_DEPLOYMENTMANAGEMENT']._serialized_end=3785 # @@protoc_insertion_point(module_scope) diff --git a/mii/grpc_related/proto/modelresponse_pb2_grpc.py b/mii/grpc_related/proto/modelresponse_pb2_grpc.py index dc91fcfc..d93b85d7 100644 --- a/mii/grpc_related/proto/modelresponse_pb2_grpc.py +++ b/mii/grpc_related/proto/modelresponse_pb2_grpc.py @@ -378,6 +378,51 @@ def __init__(self, channel): request_serializer=modelresponse__pb2.AddDeployRequest.SerializeToString, response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, ) + self.CreateSession = channel.unary_unary( + '/modelresponse.DeploymentManagement/CreateSession', + request_serializer=modelresponse__pb2.SessionID.SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) + self.DestroySession = channel.unary_unary( + '/modelresponse.DeploymentManagement/DestroySession', + request_serializer=modelresponse__pb2.SessionID.SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) + self.GeneratorReply = channel.unary_unary( + '/modelresponse.DeploymentManagement/GeneratorReply', + request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.MultiStringReply.FromString, + ) + self.ClassificationReply = channel.unary_unary( + '/modelresponse.DeploymentManagement/ClassificationReply', + request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) + self.QuestionAndAnswerReply = channel.unary_unary( + '/modelresponse.DeploymentManagement/QuestionAndAnswerReply', + request_serializer=modelresponse__pb2.QARequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) + self.FillMaskReply = channel.unary_unary( + '/modelresponse.DeploymentManagement/FillMaskReply', + request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) + self.TokenClassificationReply = channel.unary_unary( + '/modelresponse.DeploymentManagement/TokenClassificationReply', + request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) + self.ConversationalReply = channel.unary_unary( + '/modelresponse.DeploymentManagement/ConversationalReply', + request_serializer=modelresponse__pb2.ConversationRequest.SerializeToString, + response_deserializer=modelresponse__pb2.ConversationReply.FromString, + ) + self.Txt2ImgReply = channel.unary_unary( + '/modelresponse.DeploymentManagement/Txt2ImgReply', + request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, + 
response_deserializer=modelresponse__pb2.ImageReply.FromString, + ) class DeploymentManagementServicer(object): @@ -389,6 +434,60 @@ def AddDeployment(self, request, context): context.set_details('Method not implemented!') raise NotImplementedError('Method not implemented!') + def CreateSession(self, request, context): + """Missing associated documentation comment in .proto file.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + def DestroySession(self, request, context): + """Missing associated documentation comment in .proto file.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + def GeneratorReply(self, request, context): + """Missing associated documentation comment in .proto file.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + def ClassificationReply(self, request, context): + """Missing associated documentation comment in .proto file.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + def QuestionAndAnswerReply(self, request, context): + """Missing associated documentation comment in .proto file.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + def FillMaskReply(self, request, context): + """Missing associated documentation comment in .proto file.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + def TokenClassificationReply(self, request, context): + """Missing associated documentation comment in .proto file.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + def ConversationalReply(self, request, context): + """Missing associated documentation comment in .proto file.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + def Txt2ImgReply(self, request, context): + """Missing associated documentation comment in .proto file.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + def add_DeploymentManagementServicer_to_server(servicer, server): rpc_method_handlers = { @@ -397,6 +496,51 @@ def add_DeploymentManagementServicer_to_server(servicer, server): request_deserializer=modelresponse__pb2.AddDeployRequest.FromString, response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, ), + 'CreateSession': grpc.unary_unary_rpc_method_handler( + servicer.CreateSession, + request_deserializer=modelresponse__pb2.SessionID.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + ), + 'DestroySession': grpc.unary_unary_rpc_method_handler( + servicer.DestroySession, + request_deserializer=modelresponse__pb2.SessionID.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + ), + 'GeneratorReply': 
grpc.unary_unary_rpc_method_handler( + servicer.GeneratorReply, + request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, + response_serializer=modelresponse__pb2.MultiStringReply.SerializeToString, + ), + 'ClassificationReply': grpc.unary_unary_rpc_method_handler( + servicer.ClassificationReply, + request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'QuestionAndAnswerReply': grpc.unary_unary_rpc_method_handler( + servicer.QuestionAndAnswerReply, + request_deserializer=modelresponse__pb2.QARequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'FillMaskReply': grpc.unary_unary_rpc_method_handler( + servicer.FillMaskReply, + request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'TokenClassificationReply': grpc.unary_unary_rpc_method_handler( + servicer.TokenClassificationReply, + request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'ConversationalReply': grpc.unary_unary_rpc_method_handler( + servicer.ConversationalReply, + request_deserializer=modelresponse__pb2.ConversationRequest.FromString, + response_serializer=modelresponse__pb2.ConversationReply.SerializeToString, + ), + 'Txt2ImgReply': grpc.unary_unary_rpc_method_handler( + servicer.Txt2ImgReply, + request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, + response_serializer=modelresponse__pb2.ImageReply.SerializeToString, + ), } generic_handler = grpc.method_handlers_generic_handler( 'modelresponse.DeploymentManagement', rpc_method_handlers) @@ -423,3 +567,156 @@ def AddDeployment(request, google_dot_protobuf_dot_empty__pb2.Empty.FromString, options, channel_credentials, insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + + @staticmethod + def CreateSession(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/CreateSession', + modelresponse__pb2.SessionID.SerializeToString, + google_dot_protobuf_dot_empty__pb2.Empty.FromString, + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + + @staticmethod + def DestroySession(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/DestroySession', + modelresponse__pb2.SessionID.SerializeToString, + google_dot_protobuf_dot_empty__pb2.Empty.FromString, + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + + @staticmethod + def GeneratorReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/GeneratorReply', + modelresponse__pb2.MultiStringRequest.SerializeToString, + modelresponse__pb2.MultiStringReply.FromString, + 
options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + + @staticmethod + def ClassificationReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/ClassificationReply', + modelresponse__pb2.SingleStringRequest.SerializeToString, + modelresponse__pb2.SingleStringReply.FromString, + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + + @staticmethod + def QuestionAndAnswerReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/QuestionAndAnswerReply', + modelresponse__pb2.QARequest.SerializeToString, + modelresponse__pb2.SingleStringReply.FromString, + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + + @staticmethod + def FillMaskReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/FillMaskReply', + modelresponse__pb2.SingleStringRequest.SerializeToString, + modelresponse__pb2.SingleStringReply.FromString, + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + + @staticmethod + def TokenClassificationReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/TokenClassificationReply', + modelresponse__pb2.SingleStringRequest.SerializeToString, + modelresponse__pb2.SingleStringReply.FromString, + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + + @staticmethod + def ConversationalReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/ConversationalReply', + modelresponse__pb2.ConversationRequest.SerializeToString, + modelresponse__pb2.ConversationReply.FromString, + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + + @staticmethod + def Txt2ImgReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/Txt2ImgReply', + modelresponse__pb2.MultiStringRequest.SerializeToString, + modelresponse__pb2.ImageReply.FromString, + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) From a145be5c20a0c1323b909b88a401452be8dfbc70 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Mon, 24 Jul 
2023 21:33:57 +0000 Subject: [PATCH 48/69] Support for empty deployment 'group' --- mii/client.py | 23 +++++++++++++++-------- mii/models/score/generate.py | 29 ++++++++++++++++------------- 2 files changed, 31 insertions(+), 21 deletions(-) diff --git a/mii/client.py b/mii/client.py index 9facd25c..aa6225be 100644 --- a/mii/client.py +++ b/mii/client.py @@ -34,9 +34,9 @@ def _get_deployment_configs(deployment_tag): 'deployed': configs[deployment][mii.constants.DEPLOYED_KEY] } deployments.append(DeploymentConfig.parse_obj(data)) - lb_config = configs[mii.constants.LOAD_BALANCER_CONFIG_KEY] + lb_config = configs.get(mii.constants.LOAD_BALANCER_CONFIG_KEY) model_path = configs[mii.constants.MODEL_PATH_KEY] - port_map = configs[mii.constants.PORT_MAP_KEY] + port_map = configs.get(mii.constants.PORT_MAP_KEY) return deployments, lb_config, model_path, port_map @@ -58,11 +58,13 @@ def mii_query_handle(deployment_tag): return MIINonPersistentClient(task, deployment_tag) deployments, lb_config, model_path, port_map = _get_deployment_configs(deployment_tag) + mii_configs_dict = None if len(deployments) > 0: mii_configs_dict = deployments[0].mii_config #mii_configs = mii.config.MIIConfig(**mii_configs_dict) + port_number = None if mii_configs_dict == None else mii_configs_dict.port_number - return MIIClient(deployments, "localhost", mii_configs_dict.port_number, lb_config, model_path, port_map, deployment_tag) + return MIIClient(deployments, "localhost", port_number, lb_config, model_path, port_map, deployment_tag) def create_channel(host, port): @@ -79,12 +81,15 @@ class MIIClient(): """ def __init__(self, deployments, host, port, lb_config=None, model_path=None, port_map=None, deployment_tag=None): self.asyncio_loop = asyncio.get_event_loop() - channel = create_channel(host, port) - self.stub = modelresponse_pb2_grpc.DeploymentManagementStub(channel) + self.stub = None + self.host = host + if port is not None: + channel = create_channel(host, port) + self.stub = modelresponse_pb2_grpc.DeploymentManagementStub(channel) self.deployments = deployments self.lb_config = lb_config self.model_path = model_path - self.port_map = port_map + self.port_map = port_map if port_map is not None else {} self.deployment_tag = deployment_tag def _get_deployment_task(self, deployment_name=None): @@ -161,7 +166,6 @@ def add_models(self, enable_zero=False, ds_config=None, mii_config={}, - deployment_tag=None, deployments=[], deployment_type=DeploymentType.LOCAL, model_path=None, @@ -201,7 +205,10 @@ def add_models(self, create_score_file(deployment_tag=self.deployment_tag, deployment_type=deployment_type, deployments=self.deployments, model_path=self.model_path, port_map=self.port_map, lb_config=lb_config) if deployment_type == DeploymentType.LOCAL: mii.utils.import_score_file(self.deployment_tag).init() - + if self.stub is None: + self.port_number = self.deployments[0].mii_config.port_number + channel = create_channel(self.host, self.port_number) + self.stub = modelresponse_pb2_grpc.DeploymentManagementStub(channel) for replica in lb_config.replica_configs: request_proto = modelresponse_pb2.AddDeployRequest(task=replica.task, deployment_name=replica.deployment_name, diff --git a/mii/models/score/generate.py b/mii/models/score/generate.py index d807ab45..303aa7e6 100644 --- a/mii/models/score/generate.py +++ b/mii/models/score/generate.py @@ -19,19 +19,22 @@ def create_score_file(deployment_tag, config_dict = {} config_dict[mii.constants.MODEL_PATH_KEY] = model_path config_dict[mii.constants.DEPLOYMENT_TAG_KEY] = 
deployment_tag - config_dict[mii.constants.PORT_MAP_KEY] = port_map - for deployment in deployments: - deployment_config = { - mii.constants.DEPLOYMENT_NAME_KEY: deployment.deployment_name, - mii.constants.TASK_NAME_KEY: mii.utils.get_task_name(deployment.task), - mii.constants.MODEL_NAME_KEY: deployment.model, - mii.constants.ENABLE_DEEPSPEED_KEY: deployment.enable_deepspeed, - mii.constants.MII_CONFIGS_KEY: deployment.mii_config.dict(), - mii.constants.ENABLE_DEEPSPEED_ZERO_KEY: deployment.enable_zero, - mii.constants.DEEPSPEED_CONFIG_KEY: deployment.ds_config, - mii.constants.DEPLOYED_KEY: deployment.deployed, - } - config_dict[deployment.deployment_name] = deployment_config + if port_map is not None: + config_dict[mii.constants.PORT_MAP_KEY] = port_map + + if deployments is not None: + for deployment in deployments: + deployment_config = { + mii.constants.DEPLOYMENT_NAME_KEY: deployment.deployment_name, + mii.constants.TASK_NAME_KEY: mii.utils.get_task_name(deployment.task), + mii.constants.MODEL_NAME_KEY: deployment.model, + mii.constants.ENABLE_DEEPSPEED_KEY: deployment.enable_deepspeed, + mii.constants.MII_CONFIGS_KEY: deployment.mii_config.dict(), + mii.constants.ENABLE_DEEPSPEED_ZERO_KEY: deployment.enable_zero, + mii.constants.DEEPSPEED_CONFIG_KEY: deployment.ds_config, + mii.constants.DEPLOYED_KEY: deployment.deployed, + } + config_dict[deployment.deployment_name] = deployment_config if lb_config is not None: config_dict[mii.constants.LOAD_BALANCER_CONFIG_KEY] = lb_config From 082c05eecbab57f59288dd95d675bb82a550b0a5 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Mon, 24 Jul 2023 21:34:55 +0000 Subject: [PATCH 49/69] Support for empty deployment 'group' --- mii/deployment.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/mii/deployment.py b/mii/deployment.py index 20332fa4..14edcb99 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -68,7 +68,22 @@ def deploy(task=None, If deployment_type is `LOCAL`, returns just the name of the deployment that can be used to create a query handle using `mii.mii_query_handle(deployment_name)` """ - if not deployments: + if model_path is None and deployment_type == DeploymentType.LOCAL: + model_path = MII_MODEL_PATH_DEFAULT + elif model_path is None and deployment_type == DeploymentType.AML: + model_path = "model" + + if not deployments and not all((model, task, deployment_name)): + assert deployment_tag is not None, "Deployment tag must be set when starting empty deployment" + create_score_file(deployment_tag=deployment_tag, + deployment_type=deployment_type, + deployments=None, + model_path=model_path, + port_map=None, + lb_config=None) + return None + + elif not deployments: assert all((model, task, deployment_name)), "model, task, and deployment name must be set to deploy singular model" deployments = [ DeploymentConfig(deployment_name=deployment_name, @@ -121,10 +136,6 @@ def deploy(task=None, ) # In local deployments use default path if no model path set - if model_path is None and deployment_type == DeploymentType.LOCAL: - model_path = MII_MODEL_PATH_DEFAULT - elif model_path is None and deployment_type == DeploymentType.AML: - model_path = "model" # add fields for replica deployment replica_configs = [] From 3ce77d2efdecde0bafbf9a6dedecf9b3d2a60a4c Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Tue, 25 Jul 2023 17:49:15 +0000 Subject: [PATCH 50/69] Partial Termination --- mii/client.py | 13 ++++ mii/constants.py | 2 +- mii/grpc_related/modelresponse_server.py | 16 ++++- 
mii/grpc_related/proto/modelresponse.proto | 6 ++ mii/grpc_related/proto/modelresponse_pb2.py | 12 ++-- .../proto/modelresponse_pb2_grpc.py | 66 +++++++++++++++++++ 6 files changed, 107 insertions(+), 8 deletions(-) diff --git a/mii/client.py b/mii/client.py index aa6225be..1a7c834b 100644 --- a/mii/client.py +++ b/mii/client.py @@ -155,6 +155,19 @@ def destroy_session(self, session_id, deployment_name=None): assert task == Tasks.TEXT_GENERATION, f"Session deletion only available for task '{Tasks.TEXT_GENERATION}'." self.asyncio_loop.run_until_complete(self.destroy_session_async(session_id)) + async def delete_model_async(self, proto_request): + await getattr(self.stub, "DeleteDeployment")(proto_request) + + def delete_model(self, deployment_name): + for deployment in self.deployments: + if deployment.deployment_name == deployment_name: + request_proto = modelresponse_pb2.DeleteDeployRequest(deployment_name=deployment_name) + self.asyncio_loop.run_until_complete(self.delete_model_async(request_proto)) + return None + assert False, f"Deployment: {deployment_name} not found" + + + async def add_models_async(self, proto_request): await getattr(self.stub, "AddDeployment")(proto_request) diff --git a/mii/constants.py b/mii/constants.py index 61c7c474..138da1e7 100644 --- a/mii/constants.py +++ b/mii/constants.py @@ -120,7 +120,7 @@ class ModelProvider(enum.Enum): CREATE_SESSION_METHOD = "CreateSession" DESTROY_SESSION_METHOD = "DestroySession" ADD_DEPLOYMENT_METHOD = "AddDeployment" - +DELETE_DEPLOYMENT_METHOD = "DeleteDeployment" LB_MAX_WORKER_THREADS = 32 SERVER_SHUTDOWN_TIMEOUT = 10 diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index aa000e53..3026e176 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -13,7 +13,7 @@ import threading import time -from mii.constants import GRPC_MAX_MSG_SIZE, ADD_DEPLOYMENT_METHOD, CREATE_SESSION_METHOD, DESTROY_SESSION_METHOD, TERMINATE_METHOD, LB_MAX_WORKER_THREADS, SERVER_SHUTDOWN_TIMEOUT, Tasks +from mii.constants import GRPC_MAX_MSG_SIZE, ADD_DEPLOYMENT_METHOD, DELETE_DEPLOYMENT_METHOD, CREATE_SESSION_METHOD, DESTROY_SESSION_METHOD, TERMINATE_METHOD, LB_MAX_WORKER_THREADS, SERVER_SHUTDOWN_TIMEOUT, Tasks from mii.method_table import GRPC_METHOD_TABLE from mii.client import create_channel @@ -36,7 +36,9 @@ class DeploymentManagement(ServiceBase, modelresponse_pb2_grpc.DeploymentManagem def AddDeployment(self, request, context): print("DEPLOYMENT ADDED") return google_dot_protobuf_dot_empty__pb2.Empty() - + + def DeleteDeployment(self, request, context): + return google_dot_protobuf_dot_empty__pb2.Empty() class ModelResponse(ServiceBase): """ @@ -228,6 +230,16 @@ def invoke_intercept_method(request_proto, context): self.asyncio_loop.call_soon_threadsafe(self.asyncio_loop.stop) return next_handler.unary_unary(request_proto, context) + if method_name == DELETE_DEPLOYMENT_METHOD: + deployment_name = str(getattr(request_proto, "deployment_name")) + for stub in self.stubs[deployment_name]: + stub.invoke(TERMINATE_METHOD, + google_dot_protobuf_dot_empty__pb2.Empty()) + del self.stubs[deployment_name] + del self.counter[deployment_name] + del self.tasks[deployment_name] + return google_dot_protobuf_dot_empty__pb2.Empty() + deployment_name = getattr(request_proto, 'deployment_name') call_count = self.counter[deployment_name].get_and_increment() replica_index = call_count % len(self.stubs[deployment_name]) diff --git a/mii/grpc_related/proto/modelresponse.proto 
b/mii/grpc_related/proto/modelresponse.proto index 757fa0da..ad626810 100644 --- a/mii/grpc_related/proto/modelresponse.proto +++ b/mii/grpc_related/proto/modelresponse.proto @@ -47,6 +47,8 @@ service DeploymentManagement { rpc TokenClassificationReply(SingleStringRequest) returns (SingleStringReply) {} rpc ConversationalReply(ConversationRequest) returns (ConversationReply) {} rpc Txt2ImgReply(MultiStringRequest) returns (ImageReply) {} + rpc DeleteDeployment(DeleteDeployRequest) returns (google.protobuf.Empty) {} + rpc Terminate (google.protobuf.Empty) returns (google.protobuf.Empty) {} } message Value { @@ -132,3 +134,7 @@ message AddDeployRequest { repeated int64 gpu_indices = 6; } + +message DeleteDeployRequest { + string deployment_name = 1; +} diff --git a/mii/grpc_related/proto/modelresponse_pb2.py b/mii/grpc_related/proto/modelresponse_pb2.py index 073083b4..e7abcc14 100644 --- a/mii/grpc_related/proto/modelresponse_pb2.py +++ b/mii/grpc_related/proto/modelresponse_pb2.py @@ -14,7 +14,7 @@ from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x42\x0e\n\x0coneof_values\"\x1f\n\tSessionID\x12\x12\n\nsession_id\x18\x01 \x01(\t\"\xed\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\x85\x01\n\x11SingleStringReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x84\x01\n\x10MultiStringReply\x12\x10\n\x08response\x18\x01 \x03(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\tQARequest\x12\x10\n\x08question\x18\x01 \x01(\t\x12\x0f\n\x07\x63ontext\x18\x02 \x01(\t\x12?\n\x0cquery_kwargs\x18\x03 \x03(\x0b\x32).modelresponse.QARequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xd3\x02\n\x13\x43onversationRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x1c\n\x0f\x63onversation_id\x18\x02 \x01(\x03H\x00\x88\x01\x01\x12\x18\n\x10past_user_inputs\x18\x03 \x03(\t\x12\x1b\n\x13generated_responses\x18\x04 \x03(\t\x12I\n\x0cquery_kwargs\x18\x05 
\x03(\x0b\x32\x33.modelresponse.ConversationRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x01\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_conversation_idB\x12\n\x10_deployment_name\"\xc3\x01\n\x11\x43onversationReply\x12\x17\n\x0f\x63onversation_id\x18\x01 \x01(\x03\x12\x18\n\x10past_user_inputs\x18\x02 \x03(\t\x12\x1b\n\x13generated_responses\x18\x03 \x03(\t\x12\x12\n\ntime_taken\x18\x04 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x05 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xaf\x01\n\nImageReply\x12\x0e\n\x06images\x18\x01 \x03(\x0c\x12\x1d\n\x15nsfw_content_detected\x18\x02 \x03(\x08\x12\x0c\n\x04mode\x18\x03 \x01(\t\x12\x0e\n\x06size_w\x18\x04 \x01(\x03\x12\x0e\n\x06size_h\x18\x05 \x01(\x03\x12\x12\n\ntime_taken\x18\x06 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x07 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x98\x01\n\x10\x41\x64\x64\x44\x65ployRequest\x12\x0c\n\x04task\x18\x01 \x01(\t\x12\x17\n\x0f\x64\x65ployment_name\x18\x02 \x01(\t\x12\x10\n\x08hostname\x18\x03 \x01(\t\x12\x1d\n\x15tensor_parallel_ports\x18\x04 \x03(\x03\x12\x17\n\x0ftorch_dist_port\x18\x05 \x01(\x03\x12\x13\n\x0bgpu_indices\x18\x06 \x03(\x03\x32\xd4\x06\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x32\xe8\x06\n\x14\x44\x65ploymentManagement\x12J\n\rAddDeployment\x12\x1f.modelresponse.AddDeployRequest\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a 
.modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x62\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x42\x0e\n\x0coneof_values\"\x1f\n\tSessionID\x12\x12\n\nsession_id\x18\x01 \x01(\t\"\xed\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\x85\x01\n\x11SingleStringReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x84\x01\n\x10MultiStringReply\x12\x10\n\x08response\x18\x01 \x03(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\tQARequest\x12\x10\n\x08question\x18\x01 \x01(\t\x12\x0f\n\x07\x63ontext\x18\x02 \x01(\t\x12?\n\x0cquery_kwargs\x18\x03 \x03(\x0b\x32).modelresponse.QARequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xd3\x02\n\x13\x43onversationRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x1c\n\x0f\x63onversation_id\x18\x02 \x01(\x03H\x00\x88\x01\x01\x12\x18\n\x10past_user_inputs\x18\x03 \x03(\t\x12\x1b\n\x13generated_responses\x18\x04 \x03(\t\x12I\n\x0cquery_kwargs\x18\x05 \x03(\x0b\x32\x33.modelresponse.ConversationRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x01\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_conversation_idB\x12\n\x10_deployment_name\"\xc3\x01\n\x11\x43onversationReply\x12\x17\n\x0f\x63onversation_id\x18\x01 \x01(\x03\x12\x18\n\x10past_user_inputs\x18\x02 \x03(\t\x12\x1b\n\x13generated_responses\x18\x03 \x03(\t\x12\x12\n\ntime_taken\x18\x04 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x05 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xaf\x01\n\nImageReply\x12\x0e\n\x06images\x18\x01 \x03(\x0c\x12\x1d\n\x15nsfw_content_detected\x18\x02 \x03(\x08\x12\x0c\n\x04mode\x18\x03 \x01(\t\x12\x0e\n\x06size_w\x18\x04 \x01(\x03\x12\x0e\n\x06size_h\x18\x05 \x01(\x03\x12\x12\n\ntime_taken\x18\x06 
\x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x07 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x98\x01\n\x10\x41\x64\x64\x44\x65ployRequest\x12\x0c\n\x04task\x18\x01 \x01(\t\x12\x17\n\x0f\x64\x65ployment_name\x18\x02 \x01(\t\x12\x10\n\x08hostname\x18\x03 \x01(\t\x12\x1d\n\x15tensor_parallel_ports\x18\x04 \x03(\x03\x12\x17\n\x0ftorch_dist_port\x18\x05 \x01(\x03\x12\x13\n\x0bgpu_indices\x18\x06 \x03(\x03\".\n\x13\x44\x65leteDeployRequest\x12\x17\n\x0f\x64\x65ployment_name\x18\x01 \x01(\t2\xd4\x06\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x32\xf9\x07\n\x14\x44\x65ploymentManagement\x12J\n\rAddDeployment\x12\x1f.modelresponse.AddDeployRequest\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x12P\n\x10\x44\x65leteDeployment\x12\".modelresponse.DeleteDeployRequest\x1a\x16.google.protobuf.Empty\"\x00\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x62\x06proto3') _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) @@ -60,8 +60,10 @@ _globals['_IMAGEREPLY']._serialized_end=1900 _globals['_ADDDEPLOYREQUEST']._serialized_start=1903 _globals['_ADDDEPLOYREQUEST']._serialized_end=2055 - _globals['_MODELRESPONSE']._serialized_start=2058 - _globals['_MODELRESPONSE']._serialized_end=2910 - _globals['_DEPLOYMENTMANAGEMENT']._serialized_start=2913 - _globals['_DEPLOYMENTMANAGEMENT']._serialized_end=3785 + _globals['_DELETEDEPLOYREQUEST']._serialized_start=2057 + _globals['_DELETEDEPLOYREQUEST']._serialized_end=2103 + _globals['_MODELRESPONSE']._serialized_start=2106 + 
_globals['_MODELRESPONSE']._serialized_end=2958 + _globals['_DEPLOYMENTMANAGEMENT']._serialized_start=2961 + _globals['_DEPLOYMENTMANAGEMENT']._serialized_end=3978 # @@protoc_insertion_point(module_scope) diff --git a/mii/grpc_related/proto/modelresponse_pb2_grpc.py b/mii/grpc_related/proto/modelresponse_pb2_grpc.py index d93b85d7..9c3ce85d 100644 --- a/mii/grpc_related/proto/modelresponse_pb2_grpc.py +++ b/mii/grpc_related/proto/modelresponse_pb2_grpc.py @@ -423,6 +423,16 @@ def __init__(self, channel): request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, response_deserializer=modelresponse__pb2.ImageReply.FromString, ) + self.DeleteDeployment = channel.unary_unary( + '/modelresponse.DeploymentManagement/DeleteDeployment', + request_serializer=modelresponse__pb2.DeleteDeployRequest.SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) + self.Terminate = channel.unary_unary( + '/modelresponse.DeploymentManagement/Terminate', + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) class DeploymentManagementServicer(object): @@ -488,6 +498,18 @@ def Txt2ImgReply(self, request, context): context.set_details('Method not implemented!') raise NotImplementedError('Method not implemented!') + def DeleteDeployment(self, request, context): + """Missing associated documentation comment in .proto file.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + def Terminate(self, request, context): + """Missing associated documentation comment in .proto file.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + def add_DeploymentManagementServicer_to_server(servicer, server): rpc_method_handlers = { @@ -541,6 +563,16 @@ def add_DeploymentManagementServicer_to_server(servicer, server): request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, response_serializer=modelresponse__pb2.ImageReply.SerializeToString, ), + 'DeleteDeployment': grpc.unary_unary_rpc_method_handler( + servicer.DeleteDeployment, + request_deserializer=modelresponse__pb2.DeleteDeployRequest.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + ), + 'Terminate': grpc.unary_unary_rpc_method_handler( + servicer.Terminate, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + ), } generic_handler = grpc.method_handlers_generic_handler( 'modelresponse.DeploymentManagement', rpc_method_handlers) @@ -720,3 +752,37 @@ def Txt2ImgReply(request, modelresponse__pb2.ImageReply.FromString, options, channel_credentials, insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + + @staticmethod + def DeleteDeployment(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/DeleteDeployment', + modelresponse__pb2.DeleteDeployRequest.SerializeToString, + google_dot_protobuf_dot_empty__pb2.Empty.FromString, + options, channel_credentials, + insecure, call_credentials, compression, 
wait_for_ready, timeout, metadata) + + @staticmethod + def Terminate(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/Terminate', + google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + google_dot_protobuf_dot_empty__pb2.Empty.FromString, + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) From b40ecbdd0697abd4aeb57c10f50e6bb27020a9fe Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Tue, 25 Jul 2023 21:14:58 +0000 Subject: [PATCH 51/69] Refactoring --- mii/client.py | 72 ++++++++++++------------ mii/constants.py | 1 + mii/deployment.py | 39 ++----------- mii/grpc_related/modelresponse_server.py | 7 ++- mii/models/score/generate.py | 5 +- mii/models/score/score_template.py | 10 +--- mii/server.py | 34 ++++++----- 7 files changed, 71 insertions(+), 97 deletions(-) diff --git a/mii/client.py b/mii/client.py index 1a7c834b..8bcbc39c 100644 --- a/mii/client.py +++ b/mii/client.py @@ -15,25 +15,24 @@ def _get_deployment_configs(deployment_tag): - deployments = [] + deployments = {} configs = mii.utils.import_score_file(deployment_tag).configs - for deployment in configs: - if not isinstance(configs[deployment], dict) or deployment == mii.constants.PORT_MAP_KEY: - continue - configs[deployment][mii.constants.DEPLOYED_KEY] = True + for deployment in configs[mii.constants.DEPLOYMENTS_KEY].values(): + deployment[mii.constants.DEPLOYED_KEY] = True + deployment_name = deployment[mii.constants.DEPLOYMENT_NAME_KEY] data = { - 'deployment_name':configs[deployment][mii.constants.DEPLOYMENT_NAME_KEY], - 'task': configs[deployment][mii.constants.TASK_NAME_KEY], - 'model': configs[deployment][mii.constants.MODEL_NAME_KEY], - 'enable_deepspeed': configs[deployment][mii.constants.ENABLE_DEEPSPEED_KEY], - 'enable_zero': configs[deployment][mii.constants.ENABLE_DEEPSPEED_ZERO_KEY], + 'deployment_name':deployment[mii.constants.DEPLOYMENT_NAME_KEY], + 'task': deployment[mii.constants.TASK_NAME_KEY], + 'model': deployment[mii.constants.MODEL_NAME_KEY], + 'enable_deepspeed': deployment[mii.constants.ENABLE_DEEPSPEED_KEY], + 'enable_zero': deployment[mii.constants.ENABLE_DEEPSPEED_ZERO_KEY], 'GPU_index_map': None, - 'mii_config': configs[deployment][mii.constants.MII_CONFIGS_KEY], - 'ds_config': configs[deployment][mii.constants.DEEPSPEED_CONFIG_KEY], + 'mii_config': deployment[mii.constants.MII_CONFIGS_KEY], + 'ds_config': deployment[mii.constants.DEEPSPEED_CONFIG_KEY], 'version': 1, - 'deployed': configs[deployment][mii.constants.DEPLOYED_KEY] + 'deployed': deployment[mii.constants.DEPLOYED_KEY] } - deployments.append(DeploymentConfig.parse_obj(data)) + deployments[deployment_name] = DeploymentConfig.parse_obj(data) lb_config = configs.get(mii.constants.LOAD_BALANCER_CONFIG_KEY) model_path = configs[mii.constants.MODEL_PATH_KEY] port_map = configs.get(mii.constants.PORT_MAP_KEY) @@ -60,7 +59,7 @@ def mii_query_handle(deployment_tag): deployments, lb_config, model_path, port_map = _get_deployment_configs(deployment_tag) mii_configs_dict = None if len(deployments) > 0: - mii_configs_dict = deployments[0].mii_config + mii_configs_dict = next(iter(deployments.values())).mii_config #mii_configs = mii.config.MIIConfig(**mii_configs_dict) port_number = None if mii_configs_dict == None else mii_configs_dict.port_number @@ -96,14 +95,14 
@@ def _get_deployment_task(self, deployment_name=None): task = None if deployment_name is None: #mii.terminate() or single model assert len(self.deployments) == 1, "Must pass deployment_name to query when using multiple deployments" - deployment_name = self.deployments[0].deployment_name - task = get_task(self.deployments[0].task) if isinstance(deployment.task, str) else self.deployments[0].task + deployment = next(iter(self.deployments.values())) + deployment_name = deployment.deployment_name + task = get_task(deployment.task) if isinstance(deployment.task, str) else deployment.task else: - for deployment in self.deployments: - print(deployment.deployment_name) - if deployment.deployment_name == deployment_name: - task = get_task(deployment.task) if isinstance(deployment.task, str) else deployment.task - return deployment_name, task + if deployment_name in self.deployments: + deployment = self.deployments[deployment_name] + task = get_task(deployment.task) if isinstance(deployment.task, str) else deployment.task + return deployment_name, task assert False, f"{deployment_name} not found in list of deployments" return deployment_name, task @@ -159,11 +158,11 @@ async def delete_model_async(self, proto_request): await getattr(self.stub, "DeleteDeployment")(proto_request) def delete_model(self, deployment_name): - for deployment in self.deployments: - if deployment.deployment_name == deployment_name: - request_proto = modelresponse_pb2.DeleteDeployRequest(deployment_name=deployment_name) - self.asyncio_loop.run_until_complete(self.delete_model_async(request_proto)) - return None + if deployment_name in self.deployments: + request_proto = modelresponse_pb2.DeleteDeployRequest(deployment_name=deployment_name) + self.asyncio_loop.run_until_complete(self.delete_model_async(request_proto)) + del self.deployments[deployment_name] + return None assert False, f"Deployment: {deployment_name} not found" @@ -199,27 +198,30 @@ def add_models(self, deployed=False) ] + deps = {deployment.deployment_name: deployment for deployment in deployments} for deployment in deployments: deployment.task = get_task(deployment.task) - lb_config, self.port_map = allocate_processes(deployments, self.port_map) + lb_config, self.port_map = allocate_processes(deps, self.port_map) if self.lb_config is not None: self.lb_config.replica_configs.extend(lb_config.replica_configs) else: self.lb_config = lb_config - self.deployments.extend(deployments) + for deployment in deployments: + self.deployments[deployment.deployment_name] = deployment + #self.deployments.extend(deployments) if self.model_path is None and deployment_type == DeploymentType.LOCAL: self.model_path = mii.constants.MII_MODEL_PATH_DEFAULT elif self.model_path is None and deployment_type == DeploymentType.AML: model_path = "model" - for deployment in self.deployments: + for deployment in self.deployments.values(): if isinstance(deployment.task, str): deployment.task = get_task(deployment.task) create_score_file(deployment_tag=self.deployment_tag, deployment_type=deployment_type, deployments=self.deployments, model_path=self.model_path, port_map=self.port_map, lb_config=lb_config) if deployment_type == DeploymentType.LOCAL: mii.utils.import_score_file(self.deployment_tag).init() if self.stub is None: - self.port_number = self.deployments[0].mii_config.port_number + self.port_number = next(iter(self.deployments.values())).mii_config.port_number channel = create_channel(self.host, self.port_number) self.stub = modelresponse_pb2_grpc.DeploymentManagementStub(channel) 
for replica in lb_config.replica_configs: @@ -321,9 +323,9 @@ def terminate(self): def terminate_restful_gateway(deployment_tag): - deployments = _get_deployment_configs(deployment_tag) - for deployment in deployments: - mii_configs_dict = deployment[mii.constants.MII_CONFIGS_KEY] - mii_configs = mii.config.MIIConfig(**mii_configs_dict) + deployments, _, _, _ = _get_deployment_configs(deployment_tag) + for deployment in deployments.values(): + mii_configs_dict = deployment.mii_config + #mii_configs = mii.config.MIIConfig(**mii_configs_dict) if mii_configs.enable_restful_api: requests.get(f"http://localhost:{mii_configs.restful_api_port}/terminate") diff --git a/mii/constants.py b/mii/constants.py index 138da1e7..520a3c3d 100644 --- a/mii/constants.py +++ b/mii/constants.py @@ -89,6 +89,7 @@ class ModelProvider(enum.Enum): TEXT2IMG_NAME: ["query"] } +DEPLOYMENTS_KEY = 'deployments' PORT_MAP_KEY = 'port_map' MODEL_NAME_KEY = 'model_name' TASK_NAME_KEY = 'task_name' diff --git a/mii/deployment.py b/mii/deployment.py index 14edcb99..bc71be48 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -126,6 +126,7 @@ def deploy(task=None, deployment.task, deployment.model) + if enable_deepspeed: logger.info( f"************* MII is using DeepSpeed Optimizations to accelerate your model: {deployment.model} *************" @@ -135,46 +136,18 @@ def deploy(task=None, f"************* DeepSpeed Optimizations not enabled. Please use enable_deepspeed to get better performance for: {deployment.model} *************" ) + deps = {deployment.deployment_name: deployment for deployment in deployments} + # In local deployments use default path if no model path set # add fields for replica deployment - replica_configs = [] - port_offset = 1 port_map = {} - for deployment in deployments: - mii_config = deployment.mii_config - replica_pool = _allocate_processes(mii_config.hostfile, - mii_config.tensor_parallel, - mii_config.replica_num, - deployment.GPU_index_map) - - for i, (hostname, gpu_indices) in enumerate(replica_pool): - # Reserver port for a LB proxy when replication is enabled - if hostname not in port_map: - port_map[hostname] = set() - base_port = mii_config.port_number + i * mii_config.tensor_parallel + port_offset - if base_port in port_map[hostname]: - base_port = max(port_map[hostname]) + 1 - tensor_parallel_ports = list( - range(base_port, - base_port + mii_config.tensor_parallel)) - for i in range(base_port, base_port + mii_config.tensor_parallel): - port_map[hostname].add(i) - torch_dist_port = mii_config.torch_dist_port + i - replica_configs.append( - ReplicaConfig(task=get_task_name(deployment.task), - deployment_name=deployment.deployment_name, - hostname=hostname, - tensor_parallel_ports=tensor_parallel_ports, - torch_dist_port=torch_dist_port, - gpu_indices=gpu_indices)) - lb_config = LoadBalancerConfig(port=mii_config.port_number, - replica_configs=replica_configs) + lb_config, port_map = allocate_processes(deps, port_map) if deployment_type != DeploymentType.NON_PERSISTENT: create_score_file(deployment_tag=deployment_tag, deployment_type=deployment_type, - deployments=deployments, + deployments=deps, model_path=model_path, port_map=port_map, lb_config=lb_config) @@ -201,7 +174,7 @@ def deploy(task=None, def allocate_processes(deployments, port_map): replica_configs = [] port_offset = 1 - for deployment in deployments: + for deployment in deployments.values(): mii_config = deployment.mii_config replica_pool = _allocate_processes(mii_config.hostfile, mii_config.tensor_parallel, diff --git 
a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index 3026e176..5c988bd7 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -223,8 +223,9 @@ def invoke_intercept_method(request_proto, context): return google_dot_protobuf_dot_empty__pb2.Empty() if method_name == TERMINATE_METHOD: - for deployment in self.stubs: - for stub in self.stubs[deployment]: + print(self.stubs.keys()) + for deployment_name in self.stubs: + for stub in self.stubs[deployment_name]: stub.invoke(TERMINATE_METHOD, google_dot_protobuf_dot_empty__pb2.Empty()) self.asyncio_loop.call_soon_threadsafe(self.asyncio_loop.stop) @@ -232,6 +233,7 @@ def invoke_intercept_method(request_proto, context): if method_name == DELETE_DEPLOYMENT_METHOD: deployment_name = str(getattr(request_proto, "deployment_name")) + assert deployment_name in self.stubs, f"Deployment: {deployment_name} not found" for stub in self.stubs[deployment_name]: stub.invoke(TERMINATE_METHOD, google_dot_protobuf_dot_empty__pb2.Empty()) @@ -241,6 +243,7 @@ def invoke_intercept_method(request_proto, context): return google_dot_protobuf_dot_empty__pb2.Empty() deployment_name = getattr(request_proto, 'deployment_name') + assert deployment_name in self.stubs, f"Deployment: {deployment_name} not found" call_count = self.counter[deployment_name].get_and_increment() replica_index = call_count % len(self.stubs[deployment_name]) diff --git a/mii/models/score/generate.py b/mii/models/score/generate.py index 303aa7e6..55f63046 100644 --- a/mii/models/score/generate.py +++ b/mii/models/score/generate.py @@ -19,11 +19,12 @@ def create_score_file(deployment_tag, config_dict = {} config_dict[mii.constants.MODEL_PATH_KEY] = model_path config_dict[mii.constants.DEPLOYMENT_TAG_KEY] = deployment_tag + config_dict[mii.constants.DEPLOYMENTS_KEY] = {} if port_map is not None: config_dict[mii.constants.PORT_MAP_KEY] = port_map if deployments is not None: - for deployment in deployments: + for deployment in deployments.values(): deployment_config = { mii.constants.DEPLOYMENT_NAME_KEY: deployment.deployment_name, mii.constants.TASK_NAME_KEY: mii.utils.get_task_name(deployment.task), @@ -34,7 +35,7 @@ def create_score_file(deployment_tag, mii.constants.DEEPSPEED_CONFIG_KEY: deployment.ds_config, mii.constants.DEPLOYED_KEY: deployment.deployed, } - config_dict[deployment.deployment_name] = deployment_config + config_dict[mii.constants.DEPLOYMENTS_KEY][deployment.deployment_name] = deployment_config if lb_config is not None: config_dict[mii.constants.LOAD_BALANCER_CONFIG_KEY] = lb_config diff --git a/mii/models/score/score_template.py b/mii/models/score/score_template.py index fd6a7f0f..117f3866 100644 --- a/mii/models/score/score_template.py +++ b/mii/models/score/score_template.py @@ -19,14 +19,11 @@ def init(): deployment_tag = configs[mii.constants.DEPLOYMENT_TAG_KEY] deployments = [] lb_enabled = False - del configs[mii.constants.PORT_MAP_KEY] - for deployment in configs.values(): - if isinstance(deployment, dict) and deployment[mii.constants.DEPLOYED_KEY]: + for deployment in configs[mii.constants.DEPLOYMENTS_KEY].values(): + if deployment[mii.constants.DEPLOYED_KEY]: lb_enabled = True print(deployment) continue - if not isinstance(deployment, dict): - continue data = { 'deployment_name': deployment[mii.constants.DEPLOYMENT_NAME_KEY], 'task': deployment[mii.constants.TASK_NAME_KEY], @@ -48,8 +45,7 @@ def init(): assert task_name is not None, "The task name should be set before calling init" """ - if 
len(deployments) > 0: - mii.MIIServer(deployment_tag, + mii.MIIServer(deployment_tag, deployments, model_path, lb_config=configs.get(mii.constants.LOAD_BALANCER_CONFIG_KEY, diff --git a/mii/server.py b/mii/server.py index f29e3a28..e7f1360e 100644 --- a/mii/server.py +++ b/mii/server.py @@ -29,28 +29,26 @@ def config_to_b64_str(config): class MIIServer(): '''Initialize the model, setup the server for the model under model_path''' def __init__(self, deployment_tag, deployments, model_path, lb_config=None, lb_enabled=False): - - #mii_configs = mii.config.MIIConfig(**mii_configs) - self.lb_enabled = lb_enabled - #self.task = mii.utils.get_task(task_name) - self.deployments = deployments - for deployment in deployments: - assert get_num_gpus(deployment.mii_config) > 0, f"GPU count for {deployment.deployment_name} must be greater than 0" - mii_configs = deployment.mii_config - deployment.task = mii.utils.get_task(deployment.task) - if mii_configs.hostfile is None: - hostfile = tempfile.NamedTemporaryFile(delete=False) - num_gpu = torch.cuda.device_count() - with open(hostfile, "w") as f: - f.write(f"localhost slots={num_gpu}") - mii.configs.hostfile = hostfile - - processes = self._initialize_service(deployment_tag, + if len(deployments) > 0: + self.lb_enabled = lb_enabled + self.deployments = deployments + for deployment in deployments: + assert get_num_gpus(deployment.mii_config) > 0, f"GPU count for {deployment.deployment_name} must be greater than 0" + mii_configs = deployment.mii_config + deployment.task = mii.utils.get_task(deployment.task) + if mii_configs.hostfile is None: + hostfile = tempfile.NamedTemporaryFile(delete=False) + num_gpu = torch.cuda.device_count() + with open(hostfile, "w") as f: + f.write(f"localhost slots={num_gpu}") + mii.configs.hostfile = hostfile + + processes = self._initialize_service(deployment_tag, deployments, model_path, lb_config, ) - self._wait_until_server_is_live(processes, lb_config.replica_configs) + self._wait_until_server_is_live(processes, lb_config.replica_configs) def _wait_until_server_is_live(self, processes, deployment): for process, repl_config in zip(processes, deployment): From 72dd95c8440978e2b13bb9201f134a1bfcadb935 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Tue, 25 Jul 2023 21:21:04 +0000 Subject: [PATCH 52/69] formatting --- mii/client.py | 102 +- mii/deployment.py | 7 +- mii/grpc_related/modelresponse_server.py | 24 +- mii/grpc_related/proto/modelresponse.proto | 2 +- mii/grpc_related/proto/modelresponse_pb2.py | 101 +- .../proto/modelresponse_pb2_grpc.py | 1135 ++++++++++------- mii/models/score/generate.py | 5 +- mii/server.py | 18 +- 8 files changed, 833 insertions(+), 561 deletions(-) diff --git a/mii/client.py b/mii/client.py index 8bcbc39c..f937d69d 100644 --- a/mii/client.py +++ b/mii/client.py @@ -21,17 +21,17 @@ def _get_deployment_configs(deployment_tag): deployment[mii.constants.DEPLOYED_KEY] = True deployment_name = deployment[mii.constants.DEPLOYMENT_NAME_KEY] data = { - 'deployment_name':deployment[mii.constants.DEPLOYMENT_NAME_KEY], - 'task': deployment[mii.constants.TASK_NAME_KEY], - 'model': deployment[mii.constants.MODEL_NAME_KEY], - 'enable_deepspeed': deployment[mii.constants.ENABLE_DEEPSPEED_KEY], - 'enable_zero': deployment[mii.constants.ENABLE_DEEPSPEED_ZERO_KEY], - 'GPU_index_map': None, - 'mii_config': deployment[mii.constants.MII_CONFIGS_KEY], - 'ds_config': deployment[mii.constants.DEEPSPEED_CONFIG_KEY], - 'version': 1, - 'deployed': deployment[mii.constants.DEPLOYED_KEY] - } + 'deployment_name': 
deployment[mii.constants.DEPLOYMENT_NAME_KEY], + 'task': deployment[mii.constants.TASK_NAME_KEY], + 'model': deployment[mii.constants.MODEL_NAME_KEY], + 'enable_deepspeed': deployment[mii.constants.ENABLE_DEEPSPEED_KEY], + 'enable_zero': deployment[mii.constants.ENABLE_DEEPSPEED_ZERO_KEY], + 'GPU_index_map': None, + 'mii_config': deployment[mii.constants.MII_CONFIGS_KEY], + 'ds_config': deployment[mii.constants.DEEPSPEED_CONFIG_KEY], + 'version': 1, + 'deployed': deployment[mii.constants.DEPLOYED_KEY] + } deployments[deployment_name] = DeploymentConfig.parse_obj(data) lb_config = configs.get(mii.constants.LOAD_BALANCER_CONFIG_KEY) model_path = configs[mii.constants.MODEL_PATH_KEY] @@ -63,7 +63,13 @@ def mii_query_handle(deployment_tag): #mii_configs = mii.config.MIIConfig(**mii_configs_dict) port_number = None if mii_configs_dict == None else mii_configs_dict.port_number - return MIIClient(deployments, "localhost", port_number, lb_config, model_path, port_map, deployment_tag) + return MIIClient(deployments, + "localhost", + port_number, + lb_config, + model_path, + port_map, + deployment_tag) def create_channel(host, port): @@ -78,7 +84,14 @@ class MIIClient(): """ Client to send queries to a single endpoint. """ - def __init__(self, deployments, host, port, lb_config=None, model_path=None, port_map=None, deployment_tag=None): + def __init__(self, + deployments, + host, + port, + lb_config=None, + model_path=None, + port_map=None, + deployment_tag=None): self.asyncio_loop = asyncio.get_event_loop() self.stub = None self.host = host @@ -97,11 +110,13 @@ def _get_deployment_task(self, deployment_name=None): assert len(self.deployments) == 1, "Must pass deployment_name to query when using multiple deployments" deployment = next(iter(self.deployments.values())) deployment_name = deployment.deployment_name - task = get_task(deployment.task) if isinstance(deployment.task, str) else deployment.task + task = get_task(deployment.task) if isinstance(deployment.task, + str) else deployment.task else: if deployment_name in self.deployments: deployment = self.deployments[deployment_name] - task = get_task(deployment.task) if isinstance(deployment.task, str) else deployment.task + task = get_task(deployment.task) if isinstance(deployment.task, + str) else deployment.task return deployment_name, task assert False, f"{deployment_name} not found in list of deployments" return deployment_name, task @@ -159,13 +174,12 @@ async def delete_model_async(self, proto_request): def delete_model(self, deployment_name): if deployment_name in self.deployments: - request_proto = modelresponse_pb2.DeleteDeployRequest(deployment_name=deployment_name) + request_proto = modelresponse_pb2.DeleteDeployRequest( + deployment_name=deployment_name) self.asyncio_loop.run_until_complete(self.delete_model_async(request_proto)) del self.deployments[deployment_name] return None assert False, f"Deployment: {deployment_name} not found" - - async def add_models_async(self, proto_request): await getattr(self.stub, "AddDeployment")(proto_request) @@ -182,27 +196,27 @@ def add_models(self, deployment_type=DeploymentType.LOCAL, model_path=None, version=1): - + if not deployments: assert all((model, task, deployment_name)), "model, task, and deployment name must be set to deploy singular model" deployments = [ DeploymentConfig(deployment_name=deployment_name, - task=task, - model=model, - enable_deepspeed=enable_deepspeed, - enable_zero=enable_zero, - GPU_index_map=None, - mii_config=mii.config.MIIConfig(**mii_config), - ds_config=ds_config, 
- version=version, - deployed=False) + task=task, + model=model, + enable_deepspeed=enable_deepspeed, + enable_zero=enable_zero, + GPU_index_map=None, + mii_config=mii.config.MIIConfig(**mii_config), + ds_config=ds_config, + version=version, + deployed=False) ] - + deps = {deployment.deployment_name: deployment for deployment in deployments} for deployment in deployments: deployment.task = get_task(deployment.task) lb_config, self.port_map = allocate_processes(deps, self.port_map) - + if self.lb_config is not None: self.lb_config.replica_configs.extend(lb_config.replica_configs) else: @@ -217,23 +231,31 @@ def add_models(self, for deployment in self.deployments.values(): if isinstance(deployment.task, str): deployment.task = get_task(deployment.task) - create_score_file(deployment_tag=self.deployment_tag, deployment_type=deployment_type, deployments=self.deployments, model_path=self.model_path, port_map=self.port_map, lb_config=lb_config) + create_score_file(deployment_tag=self.deployment_tag, + deployment_type=deployment_type, + deployments=self.deployments, + model_path=self.model_path, + port_map=self.port_map, + lb_config=lb_config) if deployment_type == DeploymentType.LOCAL: mii.utils.import_score_file(self.deployment_tag).init() if self.stub is None: - self.port_number = next(iter(self.deployments.values())).mii_config.port_number + self.port_number = next(iter( + self.deployments.values())).mii_config.port_number channel = create_channel(self.host, self.port_number) self.stub = modelresponse_pb2_grpc.DeploymentManagementStub(channel) for replica in lb_config.replica_configs: - request_proto = modelresponse_pb2.AddDeployRequest(task=replica.task, - deployment_name=replica.deployment_name, - hostname=replica.hostname, - tensor_parallel_ports=replica.tensor_parallel_ports, - torch_dist_port=replica.torch_dist_port, - gpu_indices=replica.gpu_indices - ) + request_proto = modelresponse_pb2.AddDeployRequest( + task=replica.task, + deployment_name=replica.deployment_name, + hostname=replica.hostname, + tensor_parallel_ports=replica.tensor_parallel_ports, + torch_dist_port=replica.torch_dist_port, + gpu_indices=replica.gpu_indices) self.asyncio_loop.run_until_complete(self.add_models_async(request_proto)) + + class MIITensorParallelClient(): """ Client to send queries to multiple endpoints in parallel. 
@@ -325,7 +347,7 @@ def terminate(self): def terminate_restful_gateway(deployment_tag): deployments, _, _, _ = _get_deployment_configs(deployment_tag) for deployment in deployments.values(): - mii_configs_dict = deployment.mii_config + mii_configs = deployment.mii_config #mii_configs = mii.config.MIIConfig(**mii_configs_dict) if mii_configs.enable_restful_api: requests.get(f"http://localhost:{mii_configs.restful_api_port}/terminate") diff --git a/mii/deployment.py b/mii/deployment.py index bc71be48..54b8abce 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -126,7 +126,6 @@ def deploy(task=None, deployment.task, deployment.model) - if enable_deepspeed: logger.info( f"************* MII is using DeepSpeed Optimizations to accelerate your model: {deployment.model} *************" @@ -139,7 +138,7 @@ def deploy(task=None, deps = {deployment.deployment_name: deployment for deployment in deployments} # In local deployments use default path if no model path set - + # add fields for replica deployment port_map = {} lb_config, port_map = allocate_processes(deps, port_map) @@ -151,7 +150,7 @@ def deploy(task=None, model_path=model_path, port_map=port_map, lb_config=lb_config) - + if deployment_type == DeploymentType.AML: _deploy_aml(deployment_tag=deployment_tag, model_name=model, version=version) elif deployment_type == DeploymentType.LOCAL: @@ -171,6 +170,7 @@ def deploy(task=None, else: raise Exception(f"Unknown deployment type: {deployment_type}") + def allocate_processes(deployments, port_map): replica_configs = [] port_offset = 1 @@ -205,6 +205,7 @@ def allocate_processes(deployments, port_map): replica_configs=replica_configs) return lb_config, port_map + def _deploy_local(deployment_tag, model_path): mii.utils.import_score_file(deployment_tag).init() diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index 5c988bd7..5204779c 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -32,14 +32,17 @@ def Terminate(self, request, context): def get_stop_event(self): return self._stop_event -class DeploymentManagement(ServiceBase, modelresponse_pb2_grpc.DeploymentManagementServicer): + +class DeploymentManagement(ServiceBase, + modelresponse_pb2_grpc.DeploymentManagementServicer): def AddDeployment(self, request, context): print("DEPLOYMENT ADDED") return google_dot_protobuf_dot_empty__pb2.Empty() - + def DeleteDeployment(self, request, context): return google_dot_protobuf_dot_empty__pb2.Empty() + class ModelResponse(ServiceBase): """ Implementation class of an MII inference server @@ -212,14 +215,19 @@ def invoke_intercept_method(request_proto, context): task = str(getattr(request_proto, "task")) deployment_name = str(getattr(request_proto, "deployment_name")) hostname = str(getattr(request_proto, "hostname")) - tensor_parallel_ports = list(getattr(request_proto, "tensor_parallel_ports")) + tensor_parallel_ports = list( + getattr(request_proto, + "tensor_parallel_ports")) torch_dist_port = int(getattr(request_proto, "torch_dist_port")) gpu_indices = list(getattr(request_proto, "gpu_indices")) if deployment_name not in self.stubs: self.stubs[deployment_name] = [] self.counter[deployment_name] = AtomicCounter() self.tasks[deployment_name] = task - self.stubs[deployment_name].append(ParallelStubInvoker(hostname, tensor_parallel_ports, self.asyncio_loop)) + self.stubs[deployment_name].append( + ParallelStubInvoker(hostname, + tensor_parallel_ports, + self.asyncio_loop)) return 
google_dot_protobuf_dot_empty__pb2.Empty() if method_name == TERMINATE_METHOD: @@ -230,13 +238,13 @@ def invoke_intercept_method(request_proto, context): google_dot_protobuf_dot_empty__pb2.Empty()) self.asyncio_loop.call_soon_threadsafe(self.asyncio_loop.stop) return next_handler.unary_unary(request_proto, context) - + if method_name == DELETE_DEPLOYMENT_METHOD: deployment_name = str(getattr(request_proto, "deployment_name")) assert deployment_name in self.stubs, f"Deployment: {deployment_name} not found" for stub in self.stubs[deployment_name]: stub.invoke(TERMINATE_METHOD, - google_dot_protobuf_dot_empty__pb2.Empty()) + google_dot_protobuf_dot_empty__pb2.Empty()) del self.stubs[deployment_name] del self.counter[deployment_name] del self.tasks[deployment_name] @@ -290,7 +298,9 @@ def _do_serve(service_impl, port, interceptors=[], is_lb=False): ('grpc.max_receive_message_length', GRPC_MAX_MSG_SIZE)]) if is_lb: - modelresponse_pb2_grpc.add_DeploymentManagementServicer_to_server(service_impl, server) + modelresponse_pb2_grpc.add_DeploymentManagementServicer_to_server( + service_impl, + server) else: modelresponse_pb2_grpc.add_ModelResponseServicer_to_server(service_impl, server) server.add_insecure_port(f'[::]:{port}') diff --git a/mii/grpc_related/proto/modelresponse.proto b/mii/grpc_related/proto/modelresponse.proto index ad626810..7daf300a 100644 --- a/mii/grpc_related/proto/modelresponse.proto +++ b/mii/grpc_related/proto/modelresponse.proto @@ -132,7 +132,7 @@ message AddDeployRequest { repeated int64 tensor_parallel_ports = 4; int64 torch_dist_port = 5; repeated int64 gpu_indices = 6; - + } message DeleteDeployRequest { diff --git a/mii/grpc_related/proto/modelresponse_pb2.py b/mii/grpc_related/proto/modelresponse_pb2.py index e7abcc14..30c7a340 100644 --- a/mii/grpc_related/proto/modelresponse_pb2.py +++ b/mii/grpc_related/proto/modelresponse_pb2.py @@ -1,4 +1,7 @@ -# -*- coding: utf-8 -*- +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team # Generated by the protocol buffer compiler. DO NOT EDIT! 
# source: modelresponse.proto """Generated protocol buffer code.""" @@ -10,60 +13,60 @@ _sym_db = _symbol_database.Default() - from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 - -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x42\x0e\n\x0coneof_values\"\x1f\n\tSessionID\x12\x12\n\nsession_id\x18\x01 \x01(\t\"\xed\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\x85\x01\n\x11SingleStringReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x84\x01\n\x10MultiStringReply\x12\x10\n\x08response\x18\x01 \x03(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\tQARequest\x12\x10\n\x08question\x18\x01 \x01(\t\x12\x0f\n\x07\x63ontext\x18\x02 \x01(\t\x12?\n\x0cquery_kwargs\x18\x03 \x03(\x0b\x32).modelresponse.QARequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xd3\x02\n\x13\x43onversationRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x1c\n\x0f\x63onversation_id\x18\x02 \x01(\x03H\x00\x88\x01\x01\x12\x18\n\x10past_user_inputs\x18\x03 \x03(\t\x12\x1b\n\x13generated_responses\x18\x04 \x03(\t\x12I\n\x0cquery_kwargs\x18\x05 \x03(\x0b\x32\x33.modelresponse.ConversationRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x01\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_conversation_idB\x12\n\x10_deployment_name\"\xc3\x01\n\x11\x43onversationReply\x12\x17\n\x0f\x63onversation_id\x18\x01 \x01(\x03\x12\x18\n\x10past_user_inputs\x18\x02 \x03(\t\x12\x1b\n\x13generated_responses\x18\x03 \x03(\t\x12\x12\n\ntime_taken\x18\x04 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x05 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xaf\x01\n\nImageReply\x12\x0e\n\x06images\x18\x01 \x03(\x0c\x12\x1d\n\x15nsfw_content_detected\x18\x02 \x03(\x08\x12\x0c\n\x04mode\x18\x03 \x01(\t\x12\x0e\n\x06size_w\x18\x04 \x01(\x03\x12\x0e\n\x06size_h\x18\x05 
\x01(\x03\x12\x12\n\ntime_taken\x18\x06 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x07 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x98\x01\n\x10\x41\x64\x64\x44\x65ployRequest\x12\x0c\n\x04task\x18\x01 \x01(\t\x12\x17\n\x0f\x64\x65ployment_name\x18\x02 \x01(\t\x12\x10\n\x08hostname\x18\x03 \x01(\t\x12\x1d\n\x15tensor_parallel_ports\x18\x04 \x03(\x03\x12\x17\n\x0ftorch_dist_port\x18\x05 \x01(\x03\x12\x13\n\x0bgpu_indices\x18\x06 \x03(\x03\".\n\x13\x44\x65leteDeployRequest\x12\x17\n\x0f\x64\x65ployment_name\x18\x01 \x01(\t2\xd4\x06\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x32\xf9\x07\n\x14\x44\x65ploymentManagement\x12J\n\rAddDeployment\x12\x1f.modelresponse.AddDeployRequest\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x12P\n\x10\x44\x65leteDeployment\x12\".modelresponse.DeleteDeployRequest\x1a\x16.google.protobuf.Empty\"\x00\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x62\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( + b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x42\x0e\n\x0coneof_values\"\x1f\n\tSessionID\x12\x12\n\nsession_id\x18\x01 \x01(\t\"\xed\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 
\x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\x85\x01\n\x11SingleStringReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x84\x01\n\x10MultiStringReply\x12\x10\n\x08response\x18\x01 \x03(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\tQARequest\x12\x10\n\x08question\x18\x01 \x01(\t\x12\x0f\n\x07\x63ontext\x18\x02 \x01(\t\x12?\n\x0cquery_kwargs\x18\x03 \x03(\x0b\x32).modelresponse.QARequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xd3\x02\n\x13\x43onversationRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x1c\n\x0f\x63onversation_id\x18\x02 \x01(\x03H\x00\x88\x01\x01\x12\x18\n\x10past_user_inputs\x18\x03 \x03(\t\x12\x1b\n\x13generated_responses\x18\x04 \x03(\t\x12I\n\x0cquery_kwargs\x18\x05 \x03(\x0b\x32\x33.modelresponse.ConversationRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x01\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_conversation_idB\x12\n\x10_deployment_name\"\xc3\x01\n\x11\x43onversationReply\x12\x17\n\x0f\x63onversation_id\x18\x01 \x01(\x03\x12\x18\n\x10past_user_inputs\x18\x02 \x03(\t\x12\x1b\n\x13generated_responses\x18\x03 \x03(\t\x12\x12\n\ntime_taken\x18\x04 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x05 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xaf\x01\n\nImageReply\x12\x0e\n\x06images\x18\x01 \x03(\x0c\x12\x1d\n\x15nsfw_content_detected\x18\x02 \x03(\x08\x12\x0c\n\x04mode\x18\x03 \x01(\t\x12\x0e\n\x06size_w\x18\x04 \x01(\x03\x12\x0e\n\x06size_h\x18\x05 \x01(\x03\x12\x12\n\ntime_taken\x18\x06 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x07 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x98\x01\n\x10\x41\x64\x64\x44\x65ployRequest\x12\x0c\n\x04task\x18\x01 \x01(\t\x12\x17\n\x0f\x64\x65ployment_name\x18\x02 \x01(\t\x12\x10\n\x08hostname\x18\x03 \x01(\t\x12\x1d\n\x15tensor_parallel_ports\x18\x04 \x03(\x03\x12\x17\n\x0ftorch_dist_port\x18\x05 \x01(\x03\x12\x13\n\x0bgpu_indices\x18\x06 \x03(\x03\".\n\x13\x44\x65leteDeployRequest\x12\x17\n\x0f\x64\x65ployment_name\x18\x01 
\x01(\t2\xd4\x06\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x32\xf9\x07\n\x14\x44\x65ploymentManagement\x12J\n\rAddDeployment\x12\x1f.modelresponse.AddDeployRequest\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x12P\n\x10\x44\x65leteDeployment\x12\".modelresponse.DeleteDeployRequest\x1a\x16.google.protobuf.Empty\"\x00\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x62\x06proto3' +) _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'modelresponse_pb2', _globals) if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None - _SINGLESTRINGREQUEST_QUERYKWARGSENTRY._options = None - _SINGLESTRINGREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' - _MULTISTRINGREQUEST_QUERYKWARGSENTRY._options = None - _MULTISTRINGREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' - _QAREQUEST_QUERYKWARGSENTRY._options = None - _QAREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' - _CONVERSATIONREQUEST_QUERYKWARGSENTRY._options = None - _CONVERSATIONREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' - _globals['_VALUE']._serialized_start=67 - _globals['_VALUE']._serialized_end=162 - _globals['_SESSIONID']._serialized_start=164 - _globals['_SESSIONID']._serialized_end=195 - _globals['_SINGLESTRINGREQUEST']._serialized_start=198 - _globals['_SINGLESTRINGREQUEST']._serialized_end=435 - _globals['_SINGLESTRINGREQUEST_QUERYKWARGSENTRY']._serialized_start=343 - 
_globals['_SINGLESTRINGREQUEST_QUERYKWARGSENTRY']._serialized_end=415 - _globals['_MULTISTRINGREQUEST']._serialized_start=438 - _globals['_MULTISTRINGREQUEST']._serialized_end=673 - _globals['_MULTISTRINGREQUEST_QUERYKWARGSENTRY']._serialized_start=343 - _globals['_MULTISTRINGREQUEST_QUERYKWARGSENTRY']._serialized_end=415 - _globals['_SINGLESTRINGREPLY']._serialized_start=676 - _globals['_SINGLESTRINGREPLY']._serialized_end=809 - _globals['_MULTISTRINGREPLY']._serialized_start=812 - _globals['_MULTISTRINGREPLY']._serialized_end=944 - _globals['_QAREQUEST']._serialized_start=947 - _globals['_QAREQUEST']._serialized_end=1182 - _globals['_QAREQUEST_QUERYKWARGSENTRY']._serialized_start=343 - _globals['_QAREQUEST_QUERYKWARGSENTRY']._serialized_end=415 - _globals['_CONVERSATIONREQUEST']._serialized_start=1185 - _globals['_CONVERSATIONREQUEST']._serialized_end=1524 - _globals['_CONVERSATIONREQUEST_QUERYKWARGSENTRY']._serialized_start=343 - _globals['_CONVERSATIONREQUEST_QUERYKWARGSENTRY']._serialized_end=415 - _globals['_CONVERSATIONREPLY']._serialized_start=1527 - _globals['_CONVERSATIONREPLY']._serialized_end=1722 - _globals['_IMAGEREPLY']._serialized_start=1725 - _globals['_IMAGEREPLY']._serialized_end=1900 - _globals['_ADDDEPLOYREQUEST']._serialized_start=1903 - _globals['_ADDDEPLOYREQUEST']._serialized_end=2055 - _globals['_DELETEDEPLOYREQUEST']._serialized_start=2057 - _globals['_DELETEDEPLOYREQUEST']._serialized_end=2103 - _globals['_MODELRESPONSE']._serialized_start=2106 - _globals['_MODELRESPONSE']._serialized_end=2958 - _globals['_DEPLOYMENTMANAGEMENT']._serialized_start=2961 - _globals['_DEPLOYMENTMANAGEMENT']._serialized_end=3978 + DESCRIPTOR._options = None + _SINGLESTRINGREQUEST_QUERYKWARGSENTRY._options = None + _SINGLESTRINGREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' + _MULTISTRINGREQUEST_QUERYKWARGSENTRY._options = None + _MULTISTRINGREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' + _QAREQUEST_QUERYKWARGSENTRY._options = None + _QAREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' + _CONVERSATIONREQUEST_QUERYKWARGSENTRY._options = None + _CONVERSATIONREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' + _globals['_VALUE']._serialized_start = 67 + _globals['_VALUE']._serialized_end = 162 + _globals['_SESSIONID']._serialized_start = 164 + _globals['_SESSIONID']._serialized_end = 195 + _globals['_SINGLESTRINGREQUEST']._serialized_start = 198 + _globals['_SINGLESTRINGREQUEST']._serialized_end = 435 + _globals['_SINGLESTRINGREQUEST_QUERYKWARGSENTRY']._serialized_start = 343 + _globals['_SINGLESTRINGREQUEST_QUERYKWARGSENTRY']._serialized_end = 415 + _globals['_MULTISTRINGREQUEST']._serialized_start = 438 + _globals['_MULTISTRINGREQUEST']._serialized_end = 673 + _globals['_MULTISTRINGREQUEST_QUERYKWARGSENTRY']._serialized_start = 343 + _globals['_MULTISTRINGREQUEST_QUERYKWARGSENTRY']._serialized_end = 415 + _globals['_SINGLESTRINGREPLY']._serialized_start = 676 + _globals['_SINGLESTRINGREPLY']._serialized_end = 809 + _globals['_MULTISTRINGREPLY']._serialized_start = 812 + _globals['_MULTISTRINGREPLY']._serialized_end = 944 + _globals['_QAREQUEST']._serialized_start = 947 + _globals['_QAREQUEST']._serialized_end = 1182 + _globals['_QAREQUEST_QUERYKWARGSENTRY']._serialized_start = 343 + _globals['_QAREQUEST_QUERYKWARGSENTRY']._serialized_end = 415 + _globals['_CONVERSATIONREQUEST']._serialized_start = 1185 + _globals['_CONVERSATIONREQUEST']._serialized_end = 1524 + _globals['_CONVERSATIONREQUEST_QUERYKWARGSENTRY']._serialized_start = 343 + 
_globals['_CONVERSATIONREQUEST_QUERYKWARGSENTRY']._serialized_end = 415 + _globals['_CONVERSATIONREPLY']._serialized_start = 1527 + _globals['_CONVERSATIONREPLY']._serialized_end = 1722 + _globals['_IMAGEREPLY']._serialized_start = 1725 + _globals['_IMAGEREPLY']._serialized_end = 1900 + _globals['_ADDDEPLOYREQUEST']._serialized_start = 1903 + _globals['_ADDDEPLOYREQUEST']._serialized_end = 2055 + _globals['_DELETEDEPLOYREQUEST']._serialized_start = 2057 + _globals['_DELETEDEPLOYREQUEST']._serialized_end = 2103 + _globals['_MODELRESPONSE']._serialized_start = 2106 + _globals['_MODELRESPONSE']._serialized_end = 2958 + _globals['_DEPLOYMENTMANAGEMENT']._serialized_start = 2961 + _globals['_DEPLOYMENTMANAGEMENT']._serialized_end = 3978 # @@protoc_insertion_point(module_scope) diff --git a/mii/grpc_related/proto/modelresponse_pb2_grpc.py b/mii/grpc_related/proto/modelresponse_pb2_grpc.py index 9c3ce85d..49393660 100644 --- a/mii/grpc_related/proto/modelresponse_pb2_grpc.py +++ b/mii/grpc_related/proto/modelresponse_pb2_grpc.py @@ -1,3 +1,7 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! """Client and server classes corresponding to protobuf-defined services.""" import grpc @@ -8,7 +12,6 @@ class ModelResponseStub(object): """Missing associated documentation comment in .proto file.""" - def __init__(self, channel): """Constructor. @@ -16,60 +19,60 @@ def __init__(self, channel): channel: A grpc.Channel. """ self.Terminate = channel.unary_unary( - '/modelresponse.ModelResponse/Terminate', - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - ) + '/modelresponse.ModelResponse/Terminate', + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty. 
+ SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) self.CreateSession = channel.unary_unary( - '/modelresponse.ModelResponse/CreateSession', - request_serializer=modelresponse__pb2.SessionID.SerializeToString, - response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - ) + '/modelresponse.ModelResponse/CreateSession', + request_serializer=modelresponse__pb2.SessionID.SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) self.DestroySession = channel.unary_unary( - '/modelresponse.ModelResponse/DestroySession', - request_serializer=modelresponse__pb2.SessionID.SerializeToString, - response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - ) + '/modelresponse.ModelResponse/DestroySession', + request_serializer=modelresponse__pb2.SessionID.SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) self.GeneratorReply = channel.unary_unary( - '/modelresponse.ModelResponse/GeneratorReply', - request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.MultiStringReply.FromString, - ) + '/modelresponse.ModelResponse/GeneratorReply', + request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.MultiStringReply.FromString, + ) self.ClassificationReply = channel.unary_unary( - '/modelresponse.ModelResponse/ClassificationReply', - request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) + '/modelresponse.ModelResponse/ClassificationReply', + request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) self.QuestionAndAnswerReply = channel.unary_unary( - '/modelresponse.ModelResponse/QuestionAndAnswerReply', - request_serializer=modelresponse__pb2.QARequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) + '/modelresponse.ModelResponse/QuestionAndAnswerReply', + request_serializer=modelresponse__pb2.QARequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) self.FillMaskReply = channel.unary_unary( - '/modelresponse.ModelResponse/FillMaskReply', - request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) + '/modelresponse.ModelResponse/FillMaskReply', + request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) self.TokenClassificationReply = channel.unary_unary( - '/modelresponse.ModelResponse/TokenClassificationReply', - request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) + '/modelresponse.ModelResponse/TokenClassificationReply', + request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) self.ConversationalReply = channel.unary_unary( - '/modelresponse.ModelResponse/ConversationalReply', - request_serializer=modelresponse__pb2.ConversationRequest.SerializeToString, - response_deserializer=modelresponse__pb2.ConversationReply.FromString, - ) + 
'/modelresponse.ModelResponse/ConversationalReply', + request_serializer=modelresponse__pb2.ConversationRequest.SerializeToString, + response_deserializer=modelresponse__pb2.ConversationReply.FromString, + ) self.Txt2ImgReply = channel.unary_unary( - '/modelresponse.ModelResponse/Txt2ImgReply', - request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.ImageReply.FromString, - ) + '/modelresponse.ModelResponse/Txt2ImgReply', + request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.ImageReply.FromString, + ) class ModelResponseServicer(object): """Missing associated documentation comment in .proto file.""" - def Terminate(self, request, context): """Missing associated documentation comment in .proto file.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) @@ -133,240 +136,341 @@ def Txt2ImgReply(self, request, context): def add_ModelResponseServicer_to_server(servicer, server): rpc_method_handlers = { - 'Terminate': grpc.unary_unary_rpc_method_handler( - servicer.Terminate, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - ), - 'CreateSession': grpc.unary_unary_rpc_method_handler( - servicer.CreateSession, - request_deserializer=modelresponse__pb2.SessionID.FromString, - response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - ), - 'DestroySession': grpc.unary_unary_rpc_method_handler( - servicer.DestroySession, - request_deserializer=modelresponse__pb2.SessionID.FromString, - response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - ), - 'GeneratorReply': grpc.unary_unary_rpc_method_handler( - servicer.GeneratorReply, - request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, - response_serializer=modelresponse__pb2.MultiStringReply.SerializeToString, - ), - 'ClassificationReply': grpc.unary_unary_rpc_method_handler( - servicer.ClassificationReply, - request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'QuestionAndAnswerReply': grpc.unary_unary_rpc_method_handler( - servicer.QuestionAndAnswerReply, - request_deserializer=modelresponse__pb2.QARequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'FillMaskReply': grpc.unary_unary_rpc_method_handler( - servicer.FillMaskReply, - request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'TokenClassificationReply': grpc.unary_unary_rpc_method_handler( - servicer.TokenClassificationReply, - request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'ConversationalReply': grpc.unary_unary_rpc_method_handler( - servicer.ConversationalReply, - request_deserializer=modelresponse__pb2.ConversationRequest.FromString, - response_serializer=modelresponse__pb2.ConversationReply.SerializeToString, - ), - 'Txt2ImgReply': grpc.unary_unary_rpc_method_handler( - servicer.Txt2ImgReply, - request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, - response_serializer=modelresponse__pb2.ImageReply.SerializeToString, - ), + 'Terminate': + grpc.unary_unary_rpc_method_handler( + servicer.Terminate, + 
request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. + SerializeToString, + ), + 'CreateSession': + grpc.unary_unary_rpc_method_handler( + servicer.CreateSession, + request_deserializer=modelresponse__pb2.SessionID.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. + SerializeToString, + ), + 'DestroySession': + grpc.unary_unary_rpc_method_handler( + servicer.DestroySession, + request_deserializer=modelresponse__pb2.SessionID.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. + SerializeToString, + ), + 'GeneratorReply': + grpc.unary_unary_rpc_method_handler( + servicer.GeneratorReply, + request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, + response_serializer=modelresponse__pb2.MultiStringReply.SerializeToString, + ), + 'ClassificationReply': + grpc.unary_unary_rpc_method_handler( + servicer.ClassificationReply, + request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'QuestionAndAnswerReply': + grpc.unary_unary_rpc_method_handler( + servicer.QuestionAndAnswerReply, + request_deserializer=modelresponse__pb2.QARequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'FillMaskReply': + grpc.unary_unary_rpc_method_handler( + servicer.FillMaskReply, + request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'TokenClassificationReply': + grpc.unary_unary_rpc_method_handler( + servicer.TokenClassificationReply, + request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'ConversationalReply': + grpc.unary_unary_rpc_method_handler( + servicer.ConversationalReply, + request_deserializer=modelresponse__pb2.ConversationRequest.FromString, + response_serializer=modelresponse__pb2.ConversationReply.SerializeToString, + ), + 'Txt2ImgReply': + grpc.unary_unary_rpc_method_handler( + servicer.Txt2ImgReply, + request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, + response_serializer=modelresponse__pb2.ImageReply.SerializeToString, + ), } - generic_handler = grpc.method_handlers_generic_handler( - 'modelresponse.ModelResponse', rpc_method_handlers) - server.add_generic_rpc_handlers((generic_handler,)) + generic_handler = grpc.method_handlers_generic_handler('modelresponse.ModelResponse', + rpc_method_handlers) + server.add_generic_rpc_handlers((generic_handler, )) - # This class is part of an EXPERIMENTAL API. +# This class is part of an EXPERIMENTAL API. 
class ModelResponse(object): """Missing associated documentation comment in .proto file.""" - @staticmethod def Terminate(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/Terminate', + '/modelresponse.ModelResponse/Terminate', google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, google_dot_protobuf_dot_empty__pb2.Empty.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def CreateSession(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/CreateSession', + '/modelresponse.ModelResponse/CreateSession', modelresponse__pb2.SessionID.SerializeToString, google_dot_protobuf_dot_empty__pb2.Empty.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def DestroySession(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/DestroySession', + '/modelresponse.ModelResponse/DestroySession', modelresponse__pb2.SessionID.SerializeToString, google_dot_protobuf_dot_empty__pb2.Empty.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def GeneratorReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/GeneratorReply', + '/modelresponse.ModelResponse/GeneratorReply', modelresponse__pb2.MultiStringRequest.SerializeToString, modelresponse__pb2.MultiStringReply.FromString, - options, 
channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def ClassificationReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/ClassificationReply', + '/modelresponse.ModelResponse/ClassificationReply', modelresponse__pb2.SingleStringRequest.SerializeToString, modelresponse__pb2.SingleStringReply.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def QuestionAndAnswerReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/QuestionAndAnswerReply', + '/modelresponse.ModelResponse/QuestionAndAnswerReply', modelresponse__pb2.QARequest.SerializeToString, modelresponse__pb2.SingleStringReply.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def FillMaskReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/FillMaskReply', + '/modelresponse.ModelResponse/FillMaskReply', modelresponse__pb2.SingleStringRequest.SerializeToString, modelresponse__pb2.SingleStringReply.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def TokenClassificationReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, 
'/modelresponse.ModelResponse/TokenClassificationReply', + '/modelresponse.ModelResponse/TokenClassificationReply', modelresponse__pb2.SingleStringRequest.SerializeToString, modelresponse__pb2.SingleStringReply.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def ConversationalReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/ConversationalReply', + '/modelresponse.ModelResponse/ConversationalReply', modelresponse__pb2.ConversationRequest.SerializeToString, modelresponse__pb2.ConversationReply.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def Txt2ImgReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/Txt2ImgReply', + '/modelresponse.ModelResponse/Txt2ImgReply', modelresponse__pb2.MultiStringRequest.SerializeToString, modelresponse__pb2.ImageReply.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) class DeploymentManagementStub(object): """Missing associated documentation comment in .proto file.""" - def __init__(self, channel): """Constructor. @@ -374,70 +478,70 @@ def __init__(self, channel): channel: A grpc.Channel. 
""" self.AddDeployment = channel.unary_unary( - '/modelresponse.DeploymentManagement/AddDeployment', - request_serializer=modelresponse__pb2.AddDeployRequest.SerializeToString, - response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - ) + '/modelresponse.DeploymentManagement/AddDeployment', + request_serializer=modelresponse__pb2.AddDeployRequest.SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) self.CreateSession = channel.unary_unary( - '/modelresponse.DeploymentManagement/CreateSession', - request_serializer=modelresponse__pb2.SessionID.SerializeToString, - response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - ) + '/modelresponse.DeploymentManagement/CreateSession', + request_serializer=modelresponse__pb2.SessionID.SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) self.DestroySession = channel.unary_unary( - '/modelresponse.DeploymentManagement/DestroySession', - request_serializer=modelresponse__pb2.SessionID.SerializeToString, - response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - ) + '/modelresponse.DeploymentManagement/DestroySession', + request_serializer=modelresponse__pb2.SessionID.SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) self.GeneratorReply = channel.unary_unary( - '/modelresponse.DeploymentManagement/GeneratorReply', - request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.MultiStringReply.FromString, - ) + '/modelresponse.DeploymentManagement/GeneratorReply', + request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.MultiStringReply.FromString, + ) self.ClassificationReply = channel.unary_unary( - '/modelresponse.DeploymentManagement/ClassificationReply', - request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) + '/modelresponse.DeploymentManagement/ClassificationReply', + request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) self.QuestionAndAnswerReply = channel.unary_unary( - '/modelresponse.DeploymentManagement/QuestionAndAnswerReply', - request_serializer=modelresponse__pb2.QARequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) + '/modelresponse.DeploymentManagement/QuestionAndAnswerReply', + request_serializer=modelresponse__pb2.QARequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) self.FillMaskReply = channel.unary_unary( - '/modelresponse.DeploymentManagement/FillMaskReply', - request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) + '/modelresponse.DeploymentManagement/FillMaskReply', + request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) self.TokenClassificationReply = channel.unary_unary( - '/modelresponse.DeploymentManagement/TokenClassificationReply', - request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) + 
'/modelresponse.DeploymentManagement/TokenClassificationReply', + request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) self.ConversationalReply = channel.unary_unary( - '/modelresponse.DeploymentManagement/ConversationalReply', - request_serializer=modelresponse__pb2.ConversationRequest.SerializeToString, - response_deserializer=modelresponse__pb2.ConversationReply.FromString, - ) + '/modelresponse.DeploymentManagement/ConversationalReply', + request_serializer=modelresponse__pb2.ConversationRequest.SerializeToString, + response_deserializer=modelresponse__pb2.ConversationReply.FromString, + ) self.Txt2ImgReply = channel.unary_unary( - '/modelresponse.DeploymentManagement/Txt2ImgReply', - request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.ImageReply.FromString, - ) + '/modelresponse.DeploymentManagement/Txt2ImgReply', + request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.ImageReply.FromString, + ) self.DeleteDeployment = channel.unary_unary( - '/modelresponse.DeploymentManagement/DeleteDeployment', - request_serializer=modelresponse__pb2.DeleteDeployRequest.SerializeToString, - response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - ) + '/modelresponse.DeploymentManagement/DeleteDeployment', + request_serializer=modelresponse__pb2.DeleteDeployRequest.SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) self.Terminate = channel.unary_unary( - '/modelresponse.DeploymentManagement/Terminate', - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - ) + '/modelresponse.DeploymentManagement/Terminate', + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty. 
+ SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) class DeploymentManagementServicer(object): """Missing associated documentation comment in .proto file.""" - def AddDeployment(self, request, context): """Missing associated documentation comment in .proto file.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) @@ -513,276 +617,401 @@ def Terminate(self, request, context): def add_DeploymentManagementServicer_to_server(servicer, server): rpc_method_handlers = { - 'AddDeployment': grpc.unary_unary_rpc_method_handler( - servicer.AddDeployment, - request_deserializer=modelresponse__pb2.AddDeployRequest.FromString, - response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - ), - 'CreateSession': grpc.unary_unary_rpc_method_handler( - servicer.CreateSession, - request_deserializer=modelresponse__pb2.SessionID.FromString, - response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - ), - 'DestroySession': grpc.unary_unary_rpc_method_handler( - servicer.DestroySession, - request_deserializer=modelresponse__pb2.SessionID.FromString, - response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - ), - 'GeneratorReply': grpc.unary_unary_rpc_method_handler( - servicer.GeneratorReply, - request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, - response_serializer=modelresponse__pb2.MultiStringReply.SerializeToString, - ), - 'ClassificationReply': grpc.unary_unary_rpc_method_handler( - servicer.ClassificationReply, - request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'QuestionAndAnswerReply': grpc.unary_unary_rpc_method_handler( - servicer.QuestionAndAnswerReply, - request_deserializer=modelresponse__pb2.QARequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'FillMaskReply': grpc.unary_unary_rpc_method_handler( - servicer.FillMaskReply, - request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'TokenClassificationReply': grpc.unary_unary_rpc_method_handler( - servicer.TokenClassificationReply, - request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'ConversationalReply': grpc.unary_unary_rpc_method_handler( - servicer.ConversationalReply, - request_deserializer=modelresponse__pb2.ConversationRequest.FromString, - response_serializer=modelresponse__pb2.ConversationReply.SerializeToString, - ), - 'Txt2ImgReply': grpc.unary_unary_rpc_method_handler( - servicer.Txt2ImgReply, - request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, - response_serializer=modelresponse__pb2.ImageReply.SerializeToString, - ), - 'DeleteDeployment': grpc.unary_unary_rpc_method_handler( - servicer.DeleteDeployment, - request_deserializer=modelresponse__pb2.DeleteDeployRequest.FromString, - response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - ), - 'Terminate': grpc.unary_unary_rpc_method_handler( - servicer.Terminate, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - ), + 'AddDeployment': + grpc.unary_unary_rpc_method_handler( + servicer.AddDeployment, + 
request_deserializer=modelresponse__pb2.AddDeployRequest.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. + SerializeToString, + ), + 'CreateSession': + grpc.unary_unary_rpc_method_handler( + servicer.CreateSession, + request_deserializer=modelresponse__pb2.SessionID.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. + SerializeToString, + ), + 'DestroySession': + grpc.unary_unary_rpc_method_handler( + servicer.DestroySession, + request_deserializer=modelresponse__pb2.SessionID.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. + SerializeToString, + ), + 'GeneratorReply': + grpc.unary_unary_rpc_method_handler( + servicer.GeneratorReply, + request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, + response_serializer=modelresponse__pb2.MultiStringReply.SerializeToString, + ), + 'ClassificationReply': + grpc.unary_unary_rpc_method_handler( + servicer.ClassificationReply, + request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'QuestionAndAnswerReply': + grpc.unary_unary_rpc_method_handler( + servicer.QuestionAndAnswerReply, + request_deserializer=modelresponse__pb2.QARequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'FillMaskReply': + grpc.unary_unary_rpc_method_handler( + servicer.FillMaskReply, + request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'TokenClassificationReply': + grpc.unary_unary_rpc_method_handler( + servicer.TokenClassificationReply, + request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'ConversationalReply': + grpc.unary_unary_rpc_method_handler( + servicer.ConversationalReply, + request_deserializer=modelresponse__pb2.ConversationRequest.FromString, + response_serializer=modelresponse__pb2.ConversationReply.SerializeToString, + ), + 'Txt2ImgReply': + grpc.unary_unary_rpc_method_handler( + servicer.Txt2ImgReply, + request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, + response_serializer=modelresponse__pb2.ImageReply.SerializeToString, + ), + 'DeleteDeployment': + grpc.unary_unary_rpc_method_handler( + servicer.DeleteDeployment, + request_deserializer=modelresponse__pb2.DeleteDeployRequest.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. + SerializeToString, + ), + 'Terminate': + grpc.unary_unary_rpc_method_handler( + servicer.Terminate, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. + SerializeToString, + ), } generic_handler = grpc.method_handlers_generic_handler( - 'modelresponse.DeploymentManagement', rpc_method_handlers) - server.add_generic_rpc_handlers((generic_handler,)) + 'modelresponse.DeploymentManagement', + rpc_method_handlers) + server.add_generic_rpc_handlers((generic_handler, )) - # This class is part of an EXPERIMENTAL API. +# This class is part of an EXPERIMENTAL API. 
class DeploymentManagement(object): """Missing associated documentation comment in .proto file.""" - @staticmethod def AddDeployment(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/AddDeployment', + '/modelresponse.DeploymentManagement/AddDeployment', modelresponse__pb2.AddDeployRequest.SerializeToString, google_dot_protobuf_dot_empty__pb2.Empty.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def CreateSession(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/CreateSession', + '/modelresponse.DeploymentManagement/CreateSession', modelresponse__pb2.SessionID.SerializeToString, google_dot_protobuf_dot_empty__pb2.Empty.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def DestroySession(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/DestroySession', + '/modelresponse.DeploymentManagement/DestroySession', modelresponse__pb2.SessionID.SerializeToString, google_dot_protobuf_dot_empty__pb2.Empty.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def GeneratorReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/GeneratorReply', + '/modelresponse.DeploymentManagement/GeneratorReply', 
modelresponse__pb2.MultiStringRequest.SerializeToString, modelresponse__pb2.MultiStringReply.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def ClassificationReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/ClassificationReply', + '/modelresponse.DeploymentManagement/ClassificationReply', modelresponse__pb2.SingleStringRequest.SerializeToString, modelresponse__pb2.SingleStringReply.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def QuestionAndAnswerReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/QuestionAndAnswerReply', + '/modelresponse.DeploymentManagement/QuestionAndAnswerReply', modelresponse__pb2.QARequest.SerializeToString, modelresponse__pb2.SingleStringReply.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def FillMaskReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/FillMaskReply', + '/modelresponse.DeploymentManagement/FillMaskReply', modelresponse__pb2.SingleStringRequest.SerializeToString, modelresponse__pb2.SingleStringReply.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def TokenClassificationReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - 
compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/TokenClassificationReply', + '/modelresponse.DeploymentManagement/TokenClassificationReply', modelresponse__pb2.SingleStringRequest.SerializeToString, modelresponse__pb2.SingleStringReply.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def ConversationalReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/ConversationalReply', + '/modelresponse.DeploymentManagement/ConversationalReply', modelresponse__pb2.ConversationRequest.SerializeToString, modelresponse__pb2.ConversationReply.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def Txt2ImgReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/Txt2ImgReply', + '/modelresponse.DeploymentManagement/Txt2ImgReply', modelresponse__pb2.MultiStringRequest.SerializeToString, modelresponse__pb2.ImageReply.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def DeleteDeployment(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/DeleteDeployment', + '/modelresponse.DeploymentManagement/DeleteDeployment', modelresponse__pb2.DeleteDeployRequest.SerializeToString, google_dot_protobuf_dot_empty__pb2.Empty.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def Terminate(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + 
insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/Terminate', + '/modelresponse.DeploymentManagement/Terminate', google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, google_dot_protobuf_dot_empty__pb2.Empty.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) diff --git a/mii/models/score/generate.py b/mii/models/score/generate.py index 55f63046..b87218ad 100644 --- a/mii/models/score/generate.py +++ b/mii/models/score/generate.py @@ -22,7 +22,7 @@ def create_score_file(deployment_tag, config_dict[mii.constants.DEPLOYMENTS_KEY] = {} if port_map is not None: config_dict[mii.constants.PORT_MAP_KEY] = port_map - + if deployments is not None: for deployment in deployments.values(): deployment_config = { @@ -35,7 +35,8 @@ def create_score_file(deployment_tag, mii.constants.DEEPSPEED_CONFIG_KEY: deployment.ds_config, mii.constants.DEPLOYED_KEY: deployment.deployed, } - config_dict[mii.constants.DEPLOYMENTS_KEY][deployment.deployment_name] = deployment_config + config_dict[mii.constants.DEPLOYMENTS_KEY][ + deployment.deployment_name] = deployment_config if lb_config is not None: config_dict[mii.constants.LOAD_BALANCER_CONFIG_KEY] = lb_config diff --git a/mii/server.py b/mii/server.py index e7f1360e..5bb1b83a 100644 --- a/mii/server.py +++ b/mii/server.py @@ -28,7 +28,12 @@ def config_to_b64_str(config): class MIIServer(): '''Initialize the model, setup the server for the model under model_path''' - def __init__(self, deployment_tag, deployments, model_path, lb_config=None, lb_enabled=False): + def __init__(self, + deployment_tag, + deployments, + model_path, + lb_config=None, + lb_enabled=False): if len(deployments) > 0: self.lb_enabled = lb_enabled self.deployments = deployments @@ -43,11 +48,12 @@ def __init__(self, deployment_tag, deployments, model_path, lb_config=None, lb_e f.write(f"localhost slots={num_gpu}") mii.configs.hostfile = hostfile - processes = self._initialize_service(deployment_tag, - deployments, - model_path, - lb_config, - ) + processes = self._initialize_service( + deployment_tag, + deployments, + model_path, + lb_config, + ) self._wait_until_server_is_live(processes, lb_config.replica_configs) def _wait_until_server_is_live(self, processes, deployment): From a4e3d56f1d303cfaeb98aa1873ee12b71f6ebf11 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Tue, 25 Jul 2023 22:33:27 +0000 Subject: [PATCH 53/69] fixing bug for partial termination --- mii/grpc_related/modelresponse_server.py | 27 ++++++++++++------------ 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index 5204779c..c4b49a21 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -212,26 +212,25 @@ def intercept_service(self, continuation, handler_call_details): def invoke_intercept_method(request_proto, context): method_name = _get_grpc_method_name(handler_call_details.method) if method_name == ADD_DEPLOYMENT_METHOD: - task = 
str(getattr(request_proto, "task")) deployment_name = str(getattr(request_proto, "deployment_name")) - hostname = str(getattr(request_proto, "hostname")) - tensor_parallel_ports = list( - getattr(request_proto, - "tensor_parallel_ports")) - torch_dist_port = int(getattr(request_proto, "torch_dist_port")) - gpu_indices = list(getattr(request_proto, "gpu_indices")) if deployment_name not in self.stubs: + task = str(getattr(request_proto, "task")) + hostname = str(getattr(request_proto, "hostname")) + tensor_parallel_ports = list( + getattr(request_proto, + "tensor_parallel_ports")) + torch_dist_port = int(getattr(request_proto, "torch_dist_port")) + gpu_indices = list(getattr(request_proto, "gpu_indices")) self.stubs[deployment_name] = [] - self.counter[deployment_name] = AtomicCounter() - self.tasks[deployment_name] = task - self.stubs[deployment_name].append( - ParallelStubInvoker(hostname, - tensor_parallel_ports, - self.asyncio_loop)) + self.counter[deployment_name] = AtomicCounter() + self.tasks[deployment_name] = task + self.stubs[deployment_name].append( + ParallelStubInvoker(hostname, + tensor_parallel_ports, + self.asyncio_loop)) return google_dot_protobuf_dot_empty__pb2.Empty() if method_name == TERMINATE_METHOD: - print(self.stubs.keys()) for deployment_name in self.stubs: for stub in self.stubs[deployment_name]: stub.invoke(TERMINATE_METHOD, From 4b5bb47235cf4cd1a5c9aa0eb6aac6b675f771e8 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Tue, 25 Jul 2023 22:58:18 +0000 Subject: [PATCH 54/69] Removing comments --- mii/client.py | 9 +++------ mii/grpc_related/restful_gateway.py | 2 -- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/mii/client.py b/mii/client.py index f937d69d..83e39681 100644 --- a/mii/client.py +++ b/mii/client.py @@ -57,11 +57,10 @@ def mii_query_handle(deployment_tag): return MIINonPersistentClient(task, deployment_tag) deployments, lb_config, model_path, port_map = _get_deployment_configs(deployment_tag) - mii_configs_dict = None + mii_configs = None if len(deployments) > 0: - mii_configs_dict = next(iter(deployments.values())).mii_config - #mii_configs = mii.config.MIIConfig(**mii_configs_dict) - port_number = None if mii_configs_dict == None else mii_configs_dict.port_number + mii_configs = next(iter(deployments.values())).mii_config + port_number = None if mii_configs == None else mii_configs.port_number return MIIClient(deployments, "localhost", @@ -223,7 +222,6 @@ def add_models(self, self.lb_config = lb_config for deployment in deployments: self.deployments[deployment.deployment_name] = deployment - #self.deployments.extend(deployments) if self.model_path is None and deployment_type == DeploymentType.LOCAL: self.model_path = mii.constants.MII_MODEL_PATH_DEFAULT elif self.model_path is None and deployment_type == DeploymentType.AML: @@ -348,6 +346,5 @@ def terminate_restful_gateway(deployment_tag): deployments, _, _, _ = _get_deployment_configs(deployment_tag) for deployment in deployments.values(): mii_configs = deployment.mii_config - #mii_configs = mii.config.MIIConfig(**mii_configs_dict) if mii_configs.enable_restful_api: requests.get(f"http://localhost:{mii_configs.restful_api_port}/terminate") diff --git a/mii/grpc_related/restful_gateway.py b/mii/grpc_related/restful_gateway.py index d3dc53da..f4302d45 100644 --- a/mii/grpc_related/restful_gateway.py +++ b/mii/grpc_related/restful_gateway.py @@ -21,8 +21,6 @@ def createRestfulGatewayApp(deployment_name, task, mii_config, server_thread): # client must be thread-safe client = 
mii.mii_query_handle(deployment_name) - #client = mii.MIIClient(deployment_name, "localhost", mii_config.port_number) - class RestfulGatewayService(Resource): def __init__(self): super().__init__() From 30d2b03ccf8f83289bce561c0cee0b19f6f091cb Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Wed, 26 Jul 2023 00:41:34 +0000 Subject: [PATCH 55/69] Including GPU index map in score file --- examples/multi_model/query.py | 4 ++++ mii/client.py | 4 ++-- mii/constants.py | 2 +- mii/models/score/generate.py | 1 + mii/models/score/score_template.py | 2 +- 5 files changed, 9 insertions(+), 4 deletions(-) diff --git a/examples/multi_model/query.py b/examples/multi_model/query.py index bf760b49..f506830f 100644 --- a/examples/multi_model/query.py +++ b/examples/multi_model/query.py @@ -17,6 +17,7 @@ max_new_tokens=30, ) results.append(result) +print(result) result = generator.query({ 'query': @@ -25,6 +26,7 @@ "microsoft/DialogRPT-human-vs-rand_deployment" }) results.append(result) +print(result) result = generator.query({ 'text': "DeepSpeed is the greatest", @@ -34,6 +36,7 @@ "deployment_name": "microsoft/DialoGPT-large_deployment" }) results.append(result) +print(result) result = generator.query({ 'question': @@ -44,3 +47,4 @@ "deepset/roberta-large-squad2" + "-qa-deployment" }) results.append(result) +print(result) diff --git a/mii/client.py b/mii/client.py index 83e39681..9a922070 100644 --- a/mii/client.py +++ b/mii/client.py @@ -26,13 +26,13 @@ def _get_deployment_configs(deployment_tag): 'model': deployment[mii.constants.MODEL_NAME_KEY], 'enable_deepspeed': deployment[mii.constants.ENABLE_DEEPSPEED_KEY], 'enable_zero': deployment[mii.constants.ENABLE_DEEPSPEED_ZERO_KEY], - 'GPU_index_map': None, + 'GPU_index_map': deployment[mii.constants.GPU_INDEX_KEY], 'mii_config': deployment[mii.constants.MII_CONFIGS_KEY], 'ds_config': deployment[mii.constants.DEEPSPEED_CONFIG_KEY], 'version': 1, 'deployed': deployment[mii.constants.DEPLOYED_KEY] } - deployments[deployment_name] = DeploymentConfig.parse_obj(data) + deployments[deployment_name] = DeploymentConfig(**data) lb_config = configs.get(mii.constants.LOAD_BALANCER_CONFIG_KEY) model_path = configs[mii.constants.MODEL_PATH_KEY] port_map = configs.get(mii.constants.PORT_MAP_KEY) diff --git a/mii/constants.py b/mii/constants.py index 520a3c3d..9fff96c2 100644 --- a/mii/constants.py +++ b/mii/constants.py @@ -88,7 +88,7 @@ class ModelProvider(enum.Enum): 'generated_responses'], TEXT2IMG_NAME: ["query"] } - +GPU_INDEX_KEY = "index_keys" DEPLOYMENTS_KEY = 'deployments' PORT_MAP_KEY = 'port_map' MODEL_NAME_KEY = 'model_name' diff --git a/mii/models/score/generate.py b/mii/models/score/generate.py index b87218ad..86ceca28 100644 --- a/mii/models/score/generate.py +++ b/mii/models/score/generate.py @@ -34,6 +34,7 @@ def create_score_file(deployment_tag, mii.constants.ENABLE_DEEPSPEED_ZERO_KEY: deployment.enable_zero, mii.constants.DEEPSPEED_CONFIG_KEY: deployment.ds_config, mii.constants.DEPLOYED_KEY: deployment.deployed, + mii.constants.GPU_INDEX_KEY: deployment.GPU_index_map } config_dict[mii.constants.DEPLOYMENTS_KEY][ deployment.deployment_name] = deployment_config diff --git a/mii/models/score/score_template.py b/mii/models/score/score_template.py index 117f3866..a8969ee0 100644 --- a/mii/models/score/score_template.py +++ b/mii/models/score/score_template.py @@ -30,7 +30,7 @@ def init(): 'model': deployment[mii.constants.MODEL_NAME_KEY], 'enable_deepspeed': deployment[mii.constants.ENABLE_DEEPSPEED_KEY], 'enable_zero': 
deployment[mii.constants.ENABLE_DEEPSPEED_ZERO_KEY], - 'GPU_index_map': None, + 'GPU_index_map': deployment[mii.constants.GPU_INDEX_KEY], 'mii_config': deployment[mii.constants.MII_CONFIGS_KEY], 'ds_config': deployment[mii.constants.DEEPSPEED_CONFIG_KEY], 'version': 1 From c5d59963366c5f6141b26e2c98e5a96900d51b2f Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Wed, 26 Jul 2023 22:31:07 +0000 Subject: [PATCH 56/69] Refactoring deployment --- mii/client.py | 51 +++++++------- mii/config.py | 8 ++- mii/constants.py | 2 + mii/deployment.py | 84 +++++++++++++++++------- mii/grpc_related/modelresponse_server.py | 4 +- mii/grpc_related/proto/build_script.sh | 2 +- mii/server.py | 1 - mii/terminate.py | 2 +- 8 files changed, 100 insertions(+), 54 deletions(-) diff --git a/mii/client.py b/mii/client.py index 9a922070..e4e8fe2c 100644 --- a/mii/client.py +++ b/mii/client.py @@ -10,7 +10,7 @@ from mii.grpc_related.proto import modelresponse_pb2, modelresponse_pb2_grpc from mii.constants import GRPC_MAX_MSG_SIZE, Tasks, DeploymentType from mii.method_table import GRPC_METHOD_TABLE -from mii.deployment import allocate_processes, create_score_file +from mii.deployment import allocate_processes, create_score_file, validate_deployment from mii.config import DeploymentConfig @@ -105,7 +105,7 @@ def __init__(self, def _get_deployment_task(self, deployment_name=None): task = None - if deployment_name is None: #mii.terminate() or single model + if deployment_name is None or deployment_name == mii.constants.MII_TERMINATE_DEP_NAME: #mii.terminate() or single model assert len(self.deployments) == 1, "Must pass deployment_name to query when using multiple deployments" deployment = next(iter(self.deployments.values())) deployment_name = deployment.deployment_name @@ -116,8 +116,8 @@ def _get_deployment_task(self, deployment_name=None): deployment = self.deployments[deployment_name] task = get_task(deployment.task) if isinstance(deployment.task, str) else deployment.task - return deployment_name, task - assert False, f"{deployment_name} not found in list of deployments" + else: + assert False, f"{deployment_name} not found in list of deployments" return deployment_name, task async def _request_async_response(self, request_dict, task, **query_kwargs): @@ -130,7 +130,7 @@ async def _request_async_response(self, request_dict, task, **query_kwargs): return task_methods.unpack_response_from_proto(proto_response) def query(self, request_dict, **query_kwargs): - deployment_name = request_dict.get('deployment_name') + deployment_name = request_dict.get(mii.constants.DEPLOYMENT_NAME_KEY) deployment_name, task = self._get_deployment_task(deployment_name) request_dict['deployment_name'] = deployment_name return self.asyncio_loop.run_until_complete( @@ -195,25 +195,26 @@ def add_models(self, deployment_type=DeploymentType.LOCAL, model_path=None, version=1): - - if not deployments: - assert all((model, task, deployment_name)), "model, task, and deployment name must be set to deploy singular model" - deployments = [ - DeploymentConfig(deployment_name=deployment_name, - task=task, - model=model, - enable_deepspeed=enable_deepspeed, - enable_zero=enable_zero, - GPU_index_map=None, - mii_config=mii.config.MIIConfig(**mii_config), - ds_config=ds_config, - version=version, - deployed=False) - ] + + _, deployments = validate_deployment(task=task, + model=model, + deployment_name=deployment_name, + enable_deepspeed=enable_deepspeed, + enable_zero=enable_zero, + ds_config=ds_config, + mii_config=mii_config, + 
deployment_tag=self.deployment_tag, + deployments=deployments, + deployment_type=deployment_type, + model_path=model_path, + version=version) + + if not deployments: #Empty deployment + return None deps = {deployment.deployment_name: deployment for deployment in deployments} - for deployment in deployments: - deployment.task = get_task(deployment.task) + #for deployment in deployments: + # deployment.task = get_task(deployment.task) lb_config, self.port_map = allocate_processes(deps, self.port_map) if self.lb_config is not None: @@ -226,9 +227,9 @@ def add_models(self, self.model_path = mii.constants.MII_MODEL_PATH_DEFAULT elif self.model_path is None and deployment_type == DeploymentType.AML: model_path = "model" - for deployment in self.deployments.values(): - if isinstance(deployment.task, str): - deployment.task = get_task(deployment.task) + #for deployment in self.deployments.values(): + #if isinstance(deployment.task, str): + #deployment.task = get_task(deployment.task) create_score_file(deployment_tag=self.deployment_tag, deployment_type=deployment_type, deployments=self.deployments, diff --git a/mii/config.py b/mii/config.py index ea3fbe43..c28853df 100644 --- a/mii/config.py +++ b/mii/config.py @@ -7,7 +7,7 @@ from enum import Enum from pydantic import BaseModel, validator, root_validator from deepspeed.launcher.runner import DLTS_HOSTFILE - +from mii.utils import get_task class DtypeEnum(Enum): # The torch dtype must always be the first value (so we return torch.dtype) @@ -127,7 +127,7 @@ class Config: validate_all = True -validate_assignment = True + validate_assignment = True class DeploymentConfig(BaseModel): @@ -141,3 +141,7 @@ class DeploymentConfig(BaseModel): ds_config: dict = None version: int = 1 deployed: bool = False + + @validator("task") + def convert_task_str(cls, field_value, values): + return get_task(field_value) diff --git a/mii/constants.py b/mii/constants.py index 9fff96c2..beb5de0c 100644 --- a/mii/constants.py +++ b/mii/constants.py @@ -102,6 +102,8 @@ class ModelProvider(enum.Enum): DEEPSPEED_CONFIG_KEY = 'ds_config' CHECKPOINT_KEY = "checkpoint" DEPLOYED_KEY = "deployed" +MII_TERMINATE_DEP_NAME="__MII_TERMINATE_CALL__" + MII_CACHE_PATH = "MII_CACHE_PATH" MII_CACHE_PATH_DEFAULT = "/tmp/mii_cache" diff --git a/mii/deployment.py b/mii/deployment.py index 54b8abce..fb639fbb 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -73,33 +73,29 @@ def deploy(task=None, elif model_path is None and deployment_type == DeploymentType.AML: model_path = "model" - if not deployments and not all((model, task, deployment_name)): - assert deployment_tag is not None, "Deployment tag must be set when starting empty deployment" + deployment_tag, deployments = validate_deployment(task=task, + model=model, + deployment_name=deployment_name, + enable_deepspeed=enable_deepspeed, + enable_zero=enable_zero, + ds_config=ds_config, + mii_config=mii_config, + deployment_tag=deployment_tag, + deployments=deployments, + deployment_type=deployment_type, + model_path=model_path, + version=version) + + if not deployments: #Empty deployment create_score_file(deployment_tag=deployment_tag, deployment_type=deployment_type, deployments=None, model_path=model_path, port_map=None, lb_config=None) + print(f"Starting empty deployment, deployment_tag -> {deployment_tag}" return None - elif not deployments: - assert all((model, task, deployment_name)), "model, task, and deployment name must be set to deploy singular model" - deployments = [ - DeploymentConfig(deployment_name=deployment_name, - 
task=task, - model=model, - enable_deepspeed=enable_deepspeed, - enable_zero=enable_zero, - GPU_index_map=None, - mii_config=mii.config.MIIConfig(**mii_config), - ds_config=ds_config, - version=version, - deployed=False) - ] - deployment_tag = deployment_name - else: - assert deployment_tag is not None, "deployment_tag must be set to deploy multiple models" # parse and validate mii config for deployment in deployments: mii_config = deployment.mii_config @@ -112,13 +108,11 @@ def deploy(task=None, # aml only allows certain characters for deployment names if deployment_type == DeploymentType.AML: + assert len(deployments == 1), "mii does not currently support empty/multi-model deployment on AML" allowed_chars = set(string.ascii_lowercase + string.ascii_uppercase + string.digits + '-') assert set(deployment_name) <= allowed_chars, "AML deployment names can only contain a-z, A-Z, 0-9, and '-'" - for deployment in deployments: - deployment.task = mii.utils.get_task(deployment.task) - if not mii_config.skip_model_check: mii.utils.check_if_task_and_model_is_valid(deployment.task, deployment.model) if enable_deepspeed: @@ -205,6 +199,52 @@ def allocate_processes(deployments, port_map): replica_configs=replica_configs) return lb_config, port_map +def validate_deployment(task=None, + model=None, + deployment_name=None, + enable_deepspeed=True, + enable_zero=False, + ds_config=None, + mii_config={}, + deployment_tag=None, + deployments=[], + deployment_type=DeploymentType.LOCAL, + model_path=None, + version=1): + + if deployments and any((model, task, deployment_name)): + assert False, "Do not input deployments and model/task/deployment_name at the same time" + + elif deployments: + assert deployment_tag, "deployment_tag must be set to for mulitple models" + return deployment_tag, deployments + + elif not any((model, task, deployment_name)): + assert deployment_tag, "deployment_tag must be set for an empty deployment" + create_score_file(deployment_tag=deployment_tag, + deployment_type=deployment_type, + deployments=None, + model_path=model_path, + port_map=None, + lb_config=None) + return deployment_tag, None + + assert all((model, task, deployment_name)), "model, task, and deployment_name must be set for a single model" + deployments = [ + DeploymentConfig(deployment_name=deployment_name, + task=task, + model=model, + enable_deepspeed=enable_deepspeed, + enable_zero=enable_zero, + GPU_index_map=None, + mii_config=mii.config.MIIConfig(**mii_config), + ds_config=ds_config, + version=version, + deployed=False) + ] + if deployment_tag is None: + deployment_tag = deployment_name + return deployment_tag, deployments def _deploy_local(deployment_tag, model_path): mii.utils.import_score_file(deployment_tag).init() diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index c4b49a21..8cbc536e 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -36,7 +36,6 @@ def get_stop_event(self): class DeploymentManagement(ServiceBase, modelresponse_pb2_grpc.DeploymentManagementServicer): def AddDeployment(self, request, context): - print("DEPLOYMENT ADDED") return google_dot_protobuf_dot_empty__pb2.Empty() def DeleteDeployment(self, request, context): @@ -207,7 +206,6 @@ def intercept_service(self, continuation, handler_call_details): next_handler = continuation(handler_call_details) assert next_handler.unary_unary is not None - #USE KWARGS LIKE THEY ARE USED TO MAKE SESSIONS TO GET THE DEPLOYMENT NAME TO HASH THE 
COUNTERS/STUBS def invoke_intercept_method(request_proto, context): method_name = _get_grpc_method_name(handler_call_details.method) @@ -228,6 +226,8 @@ def invoke_intercept_method(request_proto, context): ParallelStubInvoker(hostname, tensor_parallel_ports, self.asyncio_loop)) + else: + print(f"deployment: {deployment_name} already exists") return google_dot_protobuf_dot_empty__pb2.Empty() if method_name == TERMINATE_METHOD: diff --git a/mii/grpc_related/proto/build_script.sh b/mii/grpc_related/proto/build_script.sh index d8615a85..9aaf3bd2 100644 --- a/mii/grpc_related/proto/build_script.sh +++ b/mii/grpc_related/proto/build_script.sh @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 # DeepSpeed Team -python3 -m grpc_tools.protoc -I./ --python_out=. --grpc_python_out=. ./modelresponse.proto +python -m grpc_tools.protoc -I./ --python_out=. --grpc_python_out=. ./modelresponse.proto # update import to be global wrt mii sed -i 's/modelresponse_pb2/mii.grpc_related.proto.modelresponse_pb2/g' modelresponse_pb2_grpc.py diff --git a/mii/server.py b/mii/server.py index 5bb1b83a..083f2ba3 100644 --- a/mii/server.py +++ b/mii/server.py @@ -40,7 +40,6 @@ def __init__(self, for deployment in deployments: assert get_num_gpus(deployment.mii_config) > 0, f"GPU count for {deployment.deployment_name} must be greater than 0" mii_configs = deployment.mii_config - deployment.task = mii.utils.get_task(deployment.task) if mii_configs.hostfile is None: hostfile = tempfile.NamedTemporaryFile(delete=False) num_gpu = torch.cuda.device_count() diff --git a/mii/terminate.py b/mii/terminate.py index 0a2b82b4..77df55ff 100644 --- a/mii/terminate.py +++ b/mii/terminate.py @@ -14,7 +14,7 @@ def terminate(deployment_tag): generator.terminate() return try: - generator.query({'query': ''}, None) + generator.query({'query': ''}, mii.constants.MII_TERMINATE_DEP_NAME) except grpc.aio._call.AioRpcError as error: if error._code == grpc.StatusCode.UNAVAILABLE: mii.utils.logger.warn(f"Server for {deployment_tag} not found") From 3ae178156a7c3ec2d2aebb36bb72397cd8081ede Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Wed, 26 Jul 2023 23:49:28 +0000 Subject: [PATCH 57/69] Refactoring and formatting --- mii/client.py | 19 +++--- mii/config.py | 3 +- mii/constants.py | 2 +- mii/deployment.py | 84 ++++++++++++------------ mii/grpc_related/modelresponse_server.py | 1 - mii/models/score/generate.py | 5 +- mii/models/score/score_template.py | 6 +- 7 files changed, 60 insertions(+), 60 deletions(-) diff --git a/mii/client.py b/mii/client.py index e4e8fe2c..dee6f78e 100644 --- a/mii/client.py +++ b/mii/client.py @@ -18,7 +18,6 @@ def _get_deployment_configs(deployment_tag): deployments = {} configs = mii.utils.import_score_file(deployment_tag).configs for deployment in configs[mii.constants.DEPLOYMENTS_KEY].values(): - deployment[mii.constants.DEPLOYED_KEY] = True deployment_name = deployment[mii.constants.DEPLOYMENT_NAME_KEY] data = { 'deployment_name': deployment[mii.constants.DEPLOYMENT_NAME_KEY], @@ -30,7 +29,6 @@ def _get_deployment_configs(deployment_tag): 'mii_config': deployment[mii.constants.MII_CONFIGS_KEY], 'ds_config': deployment[mii.constants.DEEPSPEED_CONFIG_KEY], 'version': 1, - 'deployed': deployment[mii.constants.DEPLOYED_KEY] } deployments[deployment_name] = DeploymentConfig(**data) lb_config = configs.get(mii.constants.LOAD_BALANCER_CONFIG_KEY) @@ -61,6 +59,9 @@ def mii_query_handle(deployment_tag): if len(deployments) > 0: mii_configs = next(iter(deployments.values())).mii_config port_number = None if mii_configs 
== None else mii_configs.port_number + if port_number: + for deployment in deployments.values(): + assert deployment.mii_config.port_number == port_number, f"All port numbers is each deployments mii_configs must match" return MIIClient(deployments, "localhost", @@ -195,7 +196,7 @@ def add_models(self, deployment_type=DeploymentType.LOCAL, model_path=None, version=1): - + _, deployments = validate_deployment(task=task, model=model, deployment_name=deployment_name, @@ -209,7 +210,7 @@ def add_models(self, model_path=model_path, version=version) - if not deployments: #Empty deployment + if not deployments: #Empty deployment return None deps = {deployment.deployment_name: deployment for deployment in deployments} @@ -228,14 +229,16 @@ def add_models(self, elif self.model_path is None and deployment_type == DeploymentType.AML: model_path = "model" #for deployment in self.deployments.values(): - #if isinstance(deployment.task, str): - #deployment.task = get_task(deployment.task) + #if isinstance(deployment.task, str): + #deployment.task = get_task(deployment.task) + lb_enabled = True if len(self.deployments) else False create_score_file(deployment_tag=self.deployment_tag, deployment_type=deployment_type, - deployments=self.deployments, + deployments=deps, model_path=self.model_path, port_map=self.port_map, - lb_config=lb_config) + lb_config=lb_config, + deployed=lb_enabled) if deployment_type == DeploymentType.LOCAL: mii.utils.import_score_file(self.deployment_tag).init() if self.stub is None: diff --git a/mii/config.py b/mii/config.py index c28853df..89a4d328 100644 --- a/mii/config.py +++ b/mii/config.py @@ -9,6 +9,7 @@ from deepspeed.launcher.runner import DLTS_HOSTFILE from mii.utils import get_task + class DtypeEnum(Enum): # The torch dtype must always be the first value (so we return torch.dtype) fp16 = torch.float16, "torch.float16", "fp16", "float16", "half" @@ -126,7 +127,6 @@ class LoadBalancerConfig(BaseModel): class Config: validate_all = True - validate_assignment = True @@ -140,7 +140,6 @@ class DeploymentConfig(BaseModel): mii_config: MIIConfig = MIIConfig.parse_obj({}) ds_config: dict = None version: int = 1 - deployed: bool = False @validator("task") def convert_task_str(cls, field_value, values): diff --git a/mii/constants.py b/mii/constants.py index beb5de0c..810b5088 100644 --- a/mii/constants.py +++ b/mii/constants.py @@ -102,7 +102,7 @@ class ModelProvider(enum.Enum): DEEPSPEED_CONFIG_KEY = 'ds_config' CHECKPOINT_KEY = "checkpoint" DEPLOYED_KEY = "deployed" -MII_TERMINATE_DEP_NAME="__MII_TERMINATE_CALL__" +MII_TERMINATE_DEP_NAME = "__MII_TERMINATE_CALL__" MII_CACHE_PATH = "MII_CACHE_PATH" MII_CACHE_PATH_DEFAULT = "/tmp/mii_cache" diff --git a/mii/deployment.py b/mii/deployment.py index fb639fbb..5e9c5737 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -86,14 +86,14 @@ def deploy(task=None, model_path=model_path, version=version) - if not deployments: #Empty deployment + if not deployments: #Empty deployment create_score_file(deployment_tag=deployment_tag, deployment_type=deployment_type, deployments=None, model_path=model_path, port_map=None, lb_config=None) - print(f"Starting empty deployment, deployment_tag -> {deployment_tag}" + print(f"Starting empty deployment, deployment_tag -> {deployment_tag}") return None # parse and validate mii config @@ -199,52 +199,54 @@ def allocate_processes(deployments, port_map): replica_configs=replica_configs) return lb_config, port_map -def validate_deployment(task=None, - model=None, - deployment_name=None, - 
enable_deepspeed=True, - enable_zero=False, - ds_config=None, - mii_config={}, - deployment_tag=None, - deployments=[], - deployment_type=DeploymentType.LOCAL, - model_path=None, - version=1): - if deployments and any((model, task, deployment_name)): - assert False, "Do not input deployments and model/task/deployment_name at the same time" - - elif deployments: - assert deployment_tag, "deployment_tag must be set to for mulitple models" - return deployment_tag, deployments +def validate_deployment(task=None, + model=None, + deployment_name=None, + enable_deepspeed=True, + enable_zero=False, + ds_config=None, + mii_config={}, + deployment_tag=None, + deployments=[], + deployment_type=DeploymentType.LOCAL, + model_path=None, + version=1): + + if deployments and any((model, task, deployment_name)): + assert False, "Do not input deployments and model/task/deployment_name at the same time" + + elif deployments: + assert deployment_tag, "deployment_tag must be set to for multiple models" + return deployment_tag, deployments - elif not any((model, task, deployment_name)): - assert deployment_tag, "deployment_tag must be set for an empty deployment" - create_score_file(deployment_tag=deployment_tag, + elif not any((model, task, deployment_name)): + assert deployment_tag, "deployment_tag must be set for an empty deployment" + create_score_file(deployment_tag=deployment_tag, deployment_type=deployment_type, deployments=None, model_path=model_path, port_map=None, lb_config=None) - return deployment_tag, None - - assert all((model, task, deployment_name)), "model, task, and deployment_name must be set for a single model" - deployments = [ - DeploymentConfig(deployment_name=deployment_name, - task=task, - model=model, - enable_deepspeed=enable_deepspeed, - enable_zero=enable_zero, - GPU_index_map=None, - mii_config=mii.config.MIIConfig(**mii_config), - ds_config=ds_config, - version=version, - deployed=False) - ] - if deployment_tag is None: - deployment_tag = deployment_name - return deployment_tag, deployments + return deployment_tag, None + + assert all((model, task, deployment_name)), "model, task, and deployment_name must be set for a single model" + deployments = [ + DeploymentConfig(deployment_name=deployment_name, + task=task, + model=model, + enable_deepspeed=enable_deepspeed, + enable_zero=enable_zero, + GPU_index_map=None, + mii_config=mii.config.MIIConfig(**mii_config), + ds_config=ds_config, + version=version, + deployed=False) + ] + if deployment_tag is None: + deployment_tag = deployment_name + return deployment_tag, deployments + def _deploy_local(deployment_tag, model_path): mii.utils.import_score_file(deployment_tag).init() diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index 8cbc536e..6b35d56f 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -206,7 +206,6 @@ def intercept_service(self, continuation, handler_call_details): next_handler = continuation(handler_call_details) assert next_handler.unary_unary is not None - def invoke_intercept_method(request_proto, context): method_name = _get_grpc_method_name(handler_call_details.method) if method_name == ADD_DEPLOYMENT_METHOD: diff --git a/mii/models/score/generate.py b/mii/models/score/generate.py index 86ceca28..48e74776 100644 --- a/mii/models/score/generate.py +++ b/mii/models/score/generate.py @@ -14,11 +14,13 @@ def create_score_file(deployment_tag, deployments, model_path, port_map, - lb_config): + lb_config, + deployed=False): 
config_dict = {} config_dict[mii.constants.MODEL_PATH_KEY] = model_path config_dict[mii.constants.DEPLOYMENT_TAG_KEY] = deployment_tag + config_dict[mii.constants.DEPLOYED_KEY] = deployed config_dict[mii.constants.DEPLOYMENTS_KEY] = {} if port_map is not None: config_dict[mii.constants.PORT_MAP_KEY] = port_map @@ -33,7 +35,6 @@ def create_score_file(deployment_tag, mii.constants.MII_CONFIGS_KEY: deployment.mii_config.dict(), mii.constants.ENABLE_DEEPSPEED_ZERO_KEY: deployment.enable_zero, mii.constants.DEEPSPEED_CONFIG_KEY: deployment.ds_config, - mii.constants.DEPLOYED_KEY: deployment.deployed, mii.constants.GPU_INDEX_KEY: deployment.GPU_index_map } config_dict[mii.constants.DEPLOYMENTS_KEY][ diff --git a/mii/models/score/score_template.py b/mii/models/score/score_template.py index a8969ee0..ec6046e2 100644 --- a/mii/models/score/score_template.py +++ b/mii/models/score/score_template.py @@ -18,12 +18,8 @@ def init(): model_path = mii.utils.full_model_path(configs[mii.constants.MODEL_PATH_KEY]) deployment_tag = configs[mii.constants.DEPLOYMENT_TAG_KEY] deployments = [] - lb_enabled = False + lb_enabled = configs[mii.constants.DEPLOYED_KEY] for deployment in configs[mii.constants.DEPLOYMENTS_KEY].values(): - if deployment[mii.constants.DEPLOYED_KEY]: - lb_enabled = True - print(deployment) - continue data = { 'deployment_name': deployment[mii.constants.DEPLOYMENT_NAME_KEY], 'task': deployment[mii.constants.TASK_NAME_KEY], From 4b8f02fa0b12ca1534fddadac6a665a9fe0fefbb Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Fri, 28 Jul 2023 07:35:56 +0000 Subject: [PATCH 58/69] Refactoring --- examples/multi_model/deploy.py | 4 +-- mii/client.py | 38 ++++++++++++++------- mii/config.py | 24 +++++++------ mii/constants.py | 7 ++-- mii/deployment.py | 51 ++++++++++++++++------------ mii/models/score/generate.py | 32 +++++++++++++----- mii/models/score/score_template.py | 4 ++- mii/server.py | 54 +++++++++++++++++++----------- 8 files changed, 136 insertions(+), 78 deletions(-) diff --git a/examples/multi_model/deploy.py b/examples/multi_model/deploy.py index f0408da7..c0b93b56 100644 --- a/examples/multi_model/deploy.py +++ b/examples/multi_model/deploy.py @@ -19,7 +19,7 @@ model=name, deployment_name=name + "_deployment", GPU_index_map=gpu_index_map3, - mii_config=mii.config.MIIConfig(**mii_configs1))) + mii_configs=mii.config.MIIConfig(**mii_configs1))) # gpt2 name = "microsoft/DialogRPT-human-vs-rand" @@ -35,7 +35,7 @@ model=name, deployment_name=name + "_deployment", GPU_index_map=gpu_index_map1, - mii_config=mii.config.MIIConfig(**mii_configs2))) + mii_configs=mii.config.MIIConfig(**mii_configs2))) name = "deepset/roberta-large-squad2" deployments.append( diff --git a/mii/client.py b/mii/client.py index dee6f78e..8867e000 100644 --- a/mii/client.py +++ b/mii/client.py @@ -19,6 +19,7 @@ def _get_deployment_configs(deployment_tag): configs = mii.utils.import_score_file(deployment_tag).configs for deployment in configs[mii.constants.DEPLOYMENTS_KEY].values(): deployment_name = deployment[mii.constants.DEPLOYMENT_NAME_KEY] + """ data = { 'deployment_name': deployment[mii.constants.DEPLOYMENT_NAME_KEY], 'task': deployment[mii.constants.TASK_NAME_KEY], @@ -30,7 +31,8 @@ def _get_deployment_configs(deployment_tag): 'ds_config': deployment[mii.constants.DEEPSPEED_CONFIG_KEY], 'version': 1, } - deployments[deployment_name] = DeploymentConfig(**data) + """ + deployments[deployment_name] = DeploymentConfig(**deployment) lb_config = configs.get(mii.constants.LOAD_BALANCER_CONFIG_KEY) model_path = 
configs[mii.constants.MODEL_PATH_KEY] port_map = configs.get(mii.constants.PORT_MAP_KEY) @@ -57,11 +59,12 @@ def mii_query_handle(deployment_tag): deployments, lb_config, model_path, port_map = _get_deployment_configs(deployment_tag) mii_configs = None if len(deployments) > 0: - mii_configs = next(iter(deployments.values())).mii_config + mii_configs = getattr(next(iter(deployments.values())), + mii.constants.MII_CONFIGS_KEY) port_number = None if mii_configs == None else mii_configs.port_number if port_number: for deployment in deployments.values(): - assert deployment.mii_config.port_number == port_number, f"All port numbers is each deployments mii_configs must match" + assert getattr(deployment, mii.constants.MII_CONFIGS_KEY).port_number == port_number, f"All port numbers is each deployments mii_configs must match" return MIIClient(deployments, "localhost", @@ -109,14 +112,18 @@ def _get_deployment_task(self, deployment_name=None): if deployment_name is None or deployment_name == mii.constants.MII_TERMINATE_DEP_NAME: #mii.terminate() or single model assert len(self.deployments) == 1, "Must pass deployment_name to query when using multiple deployments" deployment = next(iter(self.deployments.values())) - deployment_name = deployment.deployment_name - task = get_task(deployment.task) if isinstance(deployment.task, - str) else deployment.task + deployment_name = getattr(deployment, mii.constants.DEPLOYMENT_NAME_KEY) + #task = get_task(deployment.task) if isinstance(deployment.task, + #str) else deployment.task + task = getattr(deployment, mii.constants.TASK_NAME_KEY) else: if deployment_name in self.deployments: deployment = self.deployments[deployment_name] + """ task = get_task(deployment.task) if isinstance(deployment.task, str) else deployment.task + """ + task = getattr(deployment, mii.constants.TASK_NAME_KEY) else: assert False, f"{deployment_name} not found in list of deployments" return deployment_name, task @@ -213,17 +220,22 @@ def add_models(self, if not deployments: #Empty deployment return None - deps = {deployment.deployment_name: deployment for deployment in deployments} + deps = { + getattr(deployment, + mii.constants.DEPLOYMENT_NAME_KEY): deployment + for deployment in deployments + } #for deployment in deployments: # deployment.task = get_task(deployment.task) lb_config, self.port_map = allocate_processes(deps, self.port_map) - + lb_enabled = True if len(self.deployments) else False if self.lb_config is not None: self.lb_config.replica_configs.extend(lb_config.replica_configs) else: self.lb_config = lb_config for deployment in deployments: - self.deployments[deployment.deployment_name] = deployment + self.deployments[getattr(deployment, + mii.constants.DEPLOYMENT_NAME_KEY)] = deployment if self.model_path is None and deployment_type == DeploymentType.LOCAL: self.model_path = mii.constants.MII_MODEL_PATH_DEFAULT elif self.model_path is None and deployment_type == DeploymentType.AML: @@ -231,7 +243,7 @@ def add_models(self, #for deployment in self.deployments.values(): #if isinstance(deployment.task, str): #deployment.task = get_task(deployment.task) - lb_enabled = True if len(self.deployments) else False + #lb_enabled = True if len(self.deployments) else False create_score_file(deployment_tag=self.deployment_tag, deployment_type=deployment_type, deployments=deps, @@ -242,8 +254,8 @@ def add_models(self, if deployment_type == DeploymentType.LOCAL: mii.utils.import_score_file(self.deployment_tag).init() if self.stub is None: - self.port_number = next(iter( - 
self.deployments.values())).mii_config.port_number + self.port_number = getattr(next(iter(self.deployments.values())), + mii.constants.MII_CONFIGS_KEY).port_number channel = create_channel(self.host, self.port_number) self.stub = modelresponse_pb2_grpc.DeploymentManagementStub(channel) for replica in lb_config.replica_configs: @@ -349,6 +361,6 @@ def terminate(self): def terminate_restful_gateway(deployment_tag): deployments, _, _, _ = _get_deployment_configs(deployment_tag) for deployment in deployments.values(): - mii_configs = deployment.mii_config + mii_configs = getattr(deployment, mii.constants.MII_CONFIGS_KEY) if mii_configs.enable_restful_api: requests.get(f"http://localhost:{mii_configs.restful_api_port}/terminate") diff --git a/mii/config.py b/mii/config.py index 89a4d328..b8325562 100644 --- a/mii/config.py +++ b/mii/config.py @@ -5,7 +5,7 @@ import torch from typing import Union, List from enum import Enum -from pydantic import BaseModel, validator, root_validator +from pydantic import BaseModel, validator, root_validator, Field from deepspeed.launcher.runner import DLTS_HOSTFILE from mii.utils import get_task @@ -131,16 +131,20 @@ class Config: class DeploymentConfig(BaseModel): - deployment_name: str - task: str - model: str - enable_deepspeed: bool = True - enable_zero: bool = False - GPU_index_map: dict = None - mii_config: MIIConfig = MIIConfig.parse_obj({}) - ds_config: dict = None - version: int = 1 + deployment_name: str = Field(alias="DEPLOYMENT_NAME_KEY") + task: str = Field(alias="TASK_NAME_KEY") + model: str = Field(alias="MODEL_NAME_KEY") + ds_optimize: bool = Field(default=True, alias="ENABLE_DEEPSPEED_KEY") + ds_zero: bool = Field(default=False, alias="ENABLE_DEEPSPEED_ZERO_KEY") + GPU_index_map: dict = Field(default=None, alias="GPU_INDEX_KEY") + mii_configs: MIIConfig = Field(default=MIIConfig.parse_obj({}), + alias="MII_CONFIGS_KEY") + ds_config: dict = Field(default=None, alias="DEEPSPEED_CONFIG_KEY") + version: int = Field(default=1, alias="VERSION_KEY") @validator("task") def convert_task_str(cls, field_value, values): return get_task(field_value) + + class Config: + allow_population_by_field_name = True diff --git a/mii/constants.py b/mii/constants.py index 810b5088..f4860cc9 100644 --- a/mii/constants.py +++ b/mii/constants.py @@ -88,11 +88,11 @@ class ModelProvider(enum.Enum): 'generated_responses'], TEXT2IMG_NAME: ["query"] } -GPU_INDEX_KEY = "index_keys" +GPU_INDEX_KEY = "GPU_index_map" DEPLOYMENTS_KEY = 'deployments' PORT_MAP_KEY = 'port_map' -MODEL_NAME_KEY = 'model_name' -TASK_NAME_KEY = 'task_name' +MODEL_NAME_KEY = 'model' +TASK_NAME_KEY = 'task' DEPLOYMENT_NAME_KEY = 'deployment_name' MODEL_PATH_KEY = 'model_path' LOAD_BALANCER_CONFIG_KEY = 'load_balancer_config' @@ -102,6 +102,7 @@ class ModelProvider(enum.Enum): DEEPSPEED_CONFIG_KEY = 'ds_config' CHECKPOINT_KEY = "checkpoint" DEPLOYED_KEY = "deployed" +VERSION_KEY = "version" MII_TERMINATE_DEP_NAME = "__MII_TERMINATE_CALL__" MII_CACHE_PATH = "MII_CACHE_PATH" diff --git a/mii/deployment.py b/mii/deployment.py index 5e9c5737..5744d182 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -98,9 +98,12 @@ def deploy(task=None, # parse and validate mii config for deployment in deployments: - mii_config = deployment.mii_config - if deployment.enable_zero: - if deployment.ds_config.get("fp16", {}).get("enabled", False): + mii_config = getattr(deployment, mii.constants.MII_CONFIGS_KEY) + if getattr(deployment, mii.constants.ENABLE_DEEPSPEED_ZERO_KEY): + if getattr(deployment, + 
mii.constants.DEEPSPEED_CONFIG_KEY).get("fp16", + {}).get("enabled", + False): assert (mii_config.dtype == torch.half), "MII Config Error: MII dtype and ZeRO dtype must match" else: assert (mii_config.dtype == torch.float), "MII Config Error: MII dtype and ZeRO dtype must match" @@ -114,7 +117,11 @@ def deploy(task=None, assert set(deployment_name) <= allowed_chars, "AML deployment names can only contain a-z, A-Z, 0-9, and '-'" if not mii_config.skip_model_check: - mii.utils.check_if_task_and_model_is_valid(deployment.task, deployment.model) + mii.utils.check_if_task_and_model_is_valid( + getattr(deployment, + mii.constants.TASK_NAME_KEY), + getattr(deployment, + mii.constants.MODEL_NAME_KEY)) if enable_deepspeed: mii.utils.check_if_task_and_model_is_supported( deployment.task, @@ -169,7 +176,7 @@ def allocate_processes(deployments, port_map): replica_configs = [] port_offset = 1 for deployment in deployments.values(): - mii_config = deployment.mii_config + mii_config = getattr(deployment, mii.constants.MII_CONFIGS_KEY) replica_pool = _allocate_processes(mii_config.hostfile, mii_config.tensor_parallel, mii_config.replica_num, @@ -189,12 +196,15 @@ def allocate_processes(deployments, port_map): port_map[hostname].add(i) torch_dist_port = mii_config.torch_dist_port + i replica_configs.append( - ReplicaConfig(task=get_task_name(deployment.task), - deployment_name=deployment.deployment_name, - hostname=hostname, - tensor_parallel_ports=tensor_parallel_ports, - torch_dist_port=torch_dist_port, - gpu_indices=gpu_indices)) + ReplicaConfig( + task=get_task_name(getattr(deployment, + mii.constants.TASK_NAME_KEY)), + deployment_name=(getattr(deployment, + mii.constants.DEPLOYMENT_NAME_KEY)), + hostname=hostname, + tensor_parallel_ports=tensor_parallel_ports, + torch_dist_port=torch_dist_port, + gpu_indices=gpu_indices)) lb_config = LoadBalancerConfig(port=mii_config.port_number, replica_configs=replica_configs) return lb_config, port_map @@ -232,16 +242,15 @@ def validate_deployment(task=None, assert all((model, task, deployment_name)), "model, task, and deployment_name must be set for a single model" deployments = [ - DeploymentConfig(deployment_name=deployment_name, - task=task, - model=model, - enable_deepspeed=enable_deepspeed, - enable_zero=enable_zero, - GPU_index_map=None, - mii_config=mii.config.MIIConfig(**mii_config), - ds_config=ds_config, - version=version, - deployed=False) + DeploymentConfig(DEPLOYMENT_NAME_KEY=deployment_name, + TASK_NAME_KEY=task, + MODEL_NAME_KEY=model, + ENABLE_DEEPSPEED_KEY=enable_deepspeed, + ENABLE_DEEPSPEED_ZERO_KEY=enable_zero, + GPU_INDEX_KEY=None, + MII_CONFIGS_KEY=mii.config.MIIConfig(**mii_config), + DEEPSPEED_CONFIG_KEY=ds_config, + VERSION_KEY=version) ] if deployment_tag is None: deployment_tag = deployment_name diff --git a/mii/models/score/generate.py b/mii/models/score/generate.py index 48e74776..2f2bf8b0 100644 --- a/mii/models/score/generate.py +++ b/mii/models/score/generate.py @@ -28,14 +28,30 @@ def create_score_file(deployment_tag, if deployments is not None: for deployment in deployments.values(): deployment_config = { - mii.constants.DEPLOYMENT_NAME_KEY: deployment.deployment_name, - mii.constants.TASK_NAME_KEY: mii.utils.get_task_name(deployment.task), - mii.constants.MODEL_NAME_KEY: deployment.model, - mii.constants.ENABLE_DEEPSPEED_KEY: deployment.enable_deepspeed, - mii.constants.MII_CONFIGS_KEY: deployment.mii_config.dict(), - mii.constants.ENABLE_DEEPSPEED_ZERO_KEY: deployment.enable_zero, - mii.constants.DEEPSPEED_CONFIG_KEY: 
deployment.ds_config, - mii.constants.GPU_INDEX_KEY: deployment.GPU_index_map + mii.constants.DEPLOYMENT_NAME_KEY: + getattr(deployment, + mii.constants.DEPLOYMENT_NAME_KEY), + mii.constants.TASK_NAME_KEY: + mii.utils.get_task_name(getattr(deployment, + mii.constants.TASK_NAME_KEY)), + mii.constants.MODEL_NAME_KEY: + getattr(deployment, + mii.constants.MODEL_NAME_KEY), + mii.constants.ENABLE_DEEPSPEED_KEY: + getattr(deployment, + mii.constants.ENABLE_DEEPSPEED_KEY), + mii.constants.MII_CONFIGS_KEY: + getattr(deployment, + mii.constants.MII_CONFIGS_KEY).dict(), + mii.constants.ENABLE_DEEPSPEED_ZERO_KEY: + getattr(deployment, + mii.constants.ENABLE_DEEPSPEED_ZERO_KEY), + mii.constants.DEEPSPEED_CONFIG_KEY: + getattr(deployment, + mii.constants.DEEPSPEED_CONFIG_KEY), + mii.constants.GPU_INDEX_KEY: + getattr(deployment, + mii.constants.GPU_INDEX_KEY) } config_dict[mii.constants.DEPLOYMENTS_KEY][ deployment.deployment_name] = deployment_config diff --git a/mii/models/score/score_template.py b/mii/models/score/score_template.py index ec6046e2..c4905f3c 100644 --- a/mii/models/score/score_template.py +++ b/mii/models/score/score_template.py @@ -20,6 +20,7 @@ def init(): deployments = [] lb_enabled = configs[mii.constants.DEPLOYED_KEY] for deployment in configs[mii.constants.DEPLOYMENTS_KEY].values(): + """ data = { 'deployment_name': deployment[mii.constants.DEPLOYMENT_NAME_KEY], 'task': deployment[mii.constants.TASK_NAME_KEY], @@ -31,7 +32,8 @@ def init(): 'ds_config': deployment[mii.constants.DEEPSPEED_CONFIG_KEY], 'version': 1 } - deployments.append(mii.DeploymentConfig.parse_obj(data)) + """ + deployments.append(mii.DeploymentConfig(**deployment)) """ deployment_name = configs[mii.constants.DEPLOYMENT_NAME_KEY] model_name = configs[mii.constants.MODEL_NAME_KEY] diff --git a/mii/server.py b/mii/server.py index 083f2ba3..4fef14f4 100644 --- a/mii/server.py +++ b/mii/server.py @@ -38,8 +38,8 @@ def __init__(self, self.lb_enabled = lb_enabled self.deployments = deployments for deployment in deployments: - assert get_num_gpus(deployment.mii_config) > 0, f"GPU count for {deployment.deployment_name} must be greater than 0" - mii_configs = deployment.mii_config + mii_configs = getattr(deployment, mii.constants.MII_CONFIGS_KEY) + assert get_num_gpus(mii_configs) > 0, f"GPU count for {deployment.deployment_name} must be greater than 0" if mii_configs.hostfile is None: hostfile = tempfile.NamedTemporaryFile(delete=False) num_gpu = torch.cuda.device_count() @@ -106,8 +106,8 @@ def _build_server_args(self, task = "" for deployment in self.deployments: - if deployment_name == deployment.deployment_name: - task = deployment.task + if deployment_name == getattr(deployment, mii.constants.DEPLOYMENT_NAME_KEY): + task = getattr(deployment, mii.constants.TASK_NAME_KEY) break server_args_str = f"--deployment-name {deployment_name} --task-name {mii.utils.get_task_name(task)} --model {model_name} --model-path {model_path} --port {port}" server_args_str += " --ds-optimize" if ds_optimize else "" @@ -284,7 +284,7 @@ def _initialize_service(self, deployment_tag, deployments, model_path, lb_config name = repl_config.deployment_name deployment = None for dep in deployments: - if dep.deployment_name == name: + if getattr(dep, mii.constants.DEPLOYMENT_NAME_KEY) == name: deployment = dep if deployment is None: continue @@ -295,16 +295,22 @@ def _initialize_service(self, deployment_tag, deployments, model_path, lb_config processes.append( self._launch_deepspeed( name, - deployment.model, + getattr(deployment, + 
mii.constants.MODEL_NAME_KEY), model_path, - deployment.enable_deepspeed, - deployment.enable_zero, - deployment.ds_config, - deployment.mii_config, + getattr(deployment, + mii.constants.ENABLE_DEEPSPEED_KEY), + getattr(deployment, + mii.constants.ENABLE_DEEPSPEED_ZERO_KEY), + getattr(deployment, + mii.constants.DEEPSPEED_CONFIG_KEY), + getattr(deployment, + mii.constants.MII_CONFIGS_KEY), hostfile.name, repl_config.hostname, repl_config.tensor_parallel_ports[0], - deployment.mii_config.torch_dist_port + (100 * i) + + getattr(deployment, + mii.constants.MII_CONFIGS_KEY).torch_dist_port + (100 * i) + repl_config.gpu_indices[0], repl_config.gpu_indices)) @@ -316,17 +322,25 @@ def _initialize_service(self, deployment_tag, deployments, model_path, lb_config processes.append(self._launch_load_balancer(model_path, lb_config)) for deployment in self.deployments: - if deployment.mii_config.enable_restful_api: + if getattr(deployment, mii.constants.MII_CONFIGS_KEY).enable_restful_api: # start rest api server processes.append( - self._launch_restful_gateway(deployment.deployment_name, - deployment.model, - model_path, - deployment.enable_deepspeed, - deployment.enable_zero, - deployment.ds_config, - deployment.mii_config, - deployment.mii_config.port_number)) + self._launch_restful_gateway( + getattr(deployment, + mii.constants.DEPLOYMENT_NAME_KEY), + getattr(deployment, + mii.constants.MODEL_NAME_KEY), + model_path, + getattr(deployment, + mii.constants.ENABLE_DEEPSPEED_KEY), + getattr(deployment, + mii.constants.ENABLE_DEEPSPEED_ZERO_KEY), + getattr(deployment, + mii.constants.DEEPSPEED_CONFIG_KEY), + getattr(deployment, + mii.constants.MII_CONFIGS_KEY), + getattr(deployment, + mii.constants.MII_CONFIGS_KEY).port_number)) break return processes From c51ce3773f8427b40289464f1fad044eb576ab3d Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Fri, 28 Jul 2023 07:45:29 +0000 Subject: [PATCH 59/69] Fixing Readme --- README.md | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 27e6513b..83eed709 100644 --- a/README.md +++ b/README.md @@ -178,24 +178,6 @@ mii.deploy(... mii_config=mii_configs) ``` -**Non-persistent Deployment** - -You can enable a non-persistent deployment which allows you to make queries without standing up a server. The non-persistent deployment acts as a simplified interface to DeepSpeed-inference for use cases that do not require creating a persistent model server process. Changing the `deployment_type` to `NON_PERSISTENT` in `mii.deploy(...)` will activate this option. - -```python -... -mii.deploy(deployment_name = DEPLOYMENT_NAME, - deployment_type=mii.constants.DeploymentType.NON_PERSISTENT - ... - ) - -generator = mii.mii_query_handle(DEPLOYMENT_NAME) -result = generator.query({"query": ["DeepSpeed is", "Seattle is"]}, do_sample=True, max_new_tokens=30}) - -``` - -You can find a complete example [here]("https://github.com/microsoft/DeepSpeed-MII/tree/main/examples/non_persistent") - Any HTTP client can be used to call the APIs. An example of using curl is: ```bash # Assume deployment_name and restful_api_port are set to bloom560m_deployment and 28080 respectively: @@ -219,6 +201,24 @@ response = requests.post(url, data=json_params, headers={ print(response.json()) ``` +**Non-persistent Deployment** + +You can enable a non-persistent deployment which allows you to make queries without standing up a server. 
The non-persistent deployment acts as a simplified interface to DeepSpeed-inference for use cases that do not require creating a persistent model server process. Changing the `deployment_type` to `NON_PERSISTENT` in `mii.deploy(...)` will activate this option.
+
+```python
+...
+mii.deploy(deployment_name = DEPLOYMENT_NAME,
+           deployment_type=mii.constants.DeploymentType.NON_PERSISTENT
+           ...
+           )
+
+generator = mii.mii_query_handle(DEPLOYMENT_NAME)
+result = generator.query({"query": ["DeepSpeed is", "Seattle is"]}, do_sample=True, max_new_tokens=30)
+
+```
+
+You can find a complete example [here](https://github.com/microsoft/DeepSpeed-MII/tree/main/examples/non_persistent)
+
 ## Deploying with MII-Azure
 
 MII supports deployment on Azure via AML Inference. To enable this, MII generates AML deployment assets for a given model that can be deployed using the Azure-CLI, as shown in the code below. Furthermore, deploying on Azure allows MII to leverage DeepSpeed-Azure as its optimization backend, which offers better latency and cost reduction than DeepSpeed-Public.

From 43479db8357461a40624c9f097cb5e59854f1bf5 Mon Sep 17 00:00:00 2001
From: Tosin Segun
Date: Fri, 28 Jul 2023 21:48:59 +0000
Subject: [PATCH 60/69] Refactoring GRPC

---
 mii/client.py                               |  94 ++--
 mii/grpc_related/modelresponse_server.py    |   3 +
 mii/grpc_related/proto/modelresponse.proto  |  10 -
 mii/grpc_related/proto/modelresponse_pb2.py |   4 +-
 .../proto/modelresponse_pb2_grpc.py         | 434 ------------------
 5 files changed, 58 insertions(+), 487 deletions(-)

diff --git a/mii/client.py b/mii/client.py
index 8867e000..1236876b 100644
--- a/mii/client.py
+++ b/mii/client.py
@@ -66,13 +66,13 @@ def mii_query_handle(deployment_tag):
         for deployment in deployments.values():
             assert getattr(deployment, mii.constants.MII_CONFIGS_KEY).port_number == port_number, f"All port numbers is each deployments mii_configs must match"
 
-    return MIIClient(deployments,
-                     "localhost",
-                     port_number,
-                     lb_config,
-                     model_path,
-                     port_map,
-                     deployment_tag)
+    return LBClient(deployments,
+                    "localhost",
+                    port_number,
+                    lb_config,
+                    model_path,
+                    port_map,
+                    deployment_tag)
 
 
 def create_channel(host, port):
@@ -87,25 +87,15 @@ class MIIClient():
     """
     Client to send queries to a single endpoint.
""" - def __init__(self, - deployments, - host, - port, - lb_config=None, - model_path=None, - port_map=None, - deployment_tag=None): + def __init__(self, deployments, host, port): self.asyncio_loop = asyncio.get_event_loop() - self.stub = None + self.mr_stub = None + self.channel = None self.host = host if port is not None: - channel = create_channel(host, port) - self.stub = modelresponse_pb2_grpc.DeploymentManagementStub(channel) + self.channel = create_channel(host, port) + self.mr_stub = modelresponse_pb2_grpc.ModelResponseStub(self.channel) self.deployments = deployments - self.lb_config = lb_config - self.model_path = model_path - self.port_map = port_map if port_map is not None else {} - self.deployment_tag = deployment_tag def _get_deployment_task(self, deployment_name=None): task = None @@ -134,7 +124,7 @@ async def _request_async_response(self, request_dict, task, **query_kwargs): task_methods = GRPC_METHOD_TABLE[task] proto_request = task_methods.pack_request_to_proto(request_dict, **query_kwargs) - proto_response = await getattr(self.stub, task_methods.method)(proto_request) + proto_response = await getattr(self.mr_stub, task_methods.method)(proto_request) return task_methods.unpack_response_from_proto(proto_response) def query(self, request_dict, **query_kwargs): @@ -147,14 +137,14 @@ def query(self, request_dict, **query_kwargs): **query_kwargs)) async def terminate_async(self): - await self.stub.Terminate( + await self.mr_stub.Terminate( modelresponse_pb2.google_dot_protobuf_dot_empty__pb2.Empty()) def terminate(self): self.asyncio_loop.run_until_complete(self.terminate_async()) async def create_session_async(self, session_id): - return await self.stub.CreateSession( + return await self.mr_stub.CreateSession( modelresponse_pb2.SessionID(session_id=session_id)) def create_session(self, session_id, deployment_name=None): @@ -166,8 +156,8 @@ def create_session(self, session_id, deployment_name=None): self.create_session_async(session_id)) async def destroy_session_async(self, session_id): - await self.stub.DestroySession(modelresponse_pb2.SessionID(session_id=session_id) - ) + await self.mr_stub.DestroySession( + modelresponse_pb2.SessionID(session_id=session_id)) def destroy_session(self, session_id, deployment_name=None): if len(self.deployments > 1): @@ -176,20 +166,28 @@ def destroy_session(self, session_id, deployment_name=None): assert task == Tasks.TEXT_GENERATION, f"Session deletion only available for task '{Tasks.TEXT_GENERATION}'." 
self.asyncio_loop.run_until_complete(self.destroy_session_async(session_id)) - async def delete_model_async(self, proto_request): - await getattr(self.stub, "DeleteDeployment")(proto_request) - def delete_model(self, deployment_name): - if deployment_name in self.deployments: - request_proto = modelresponse_pb2.DeleteDeployRequest( - deployment_name=deployment_name) - self.asyncio_loop.run_until_complete(self.delete_model_async(request_proto)) - del self.deployments[deployment_name] - return None - assert False, f"Deployment: {deployment_name} not found" +class LBClient(MIIClient): + def __init__(self, + deployments, + host, + port, + lb_config=None, + model_path=None, + port_map=None, + deployment_tag=None): + super().__init__(deployments, host, port) + self.lb_stub = None + if port is not None: + channel = create_channel(host, port) if not self.channel else self.channel + self.lb_stub = modelresponse_pb2_grpc.DeploymentManagementStub(channel) + self.lb_config = lb_config + self.model_path = model_path + self.port_map = port_map if port_map is not None else {} + self.deployment_tag = deployment_tag async def add_models_async(self, proto_request): - await getattr(self.stub, "AddDeployment")(proto_request) + await getattr(self.lb_stub, "AddDeployment")(proto_request) def add_models(self, task=None, @@ -253,11 +251,13 @@ def add_models(self, deployed=lb_enabled) if deployment_type == DeploymentType.LOCAL: mii.utils.import_score_file(self.deployment_tag).init() - if self.stub is None: + if self.lb_stub is None: self.port_number = getattr(next(iter(self.deployments.values())), mii.constants.MII_CONFIGS_KEY).port_number - channel = create_channel(self.host, self.port_number) - self.stub = modelresponse_pb2_grpc.DeploymentManagementStub(channel) + self.channel = create_channel(self.host, self.port_number) + self.lb_stub = modelresponse_pb2_grpc.DeploymentManagementStub(self.channel) + if not self.mr_stub: + self.mr_stub = modelresponse_pb2_grpc.ModelResponseStub(self.channel) for replica in lb_config.replica_configs: request_proto = modelresponse_pb2.AddDeployRequest( task=replica.task, @@ -269,6 +269,18 @@ def add_models(self, self.asyncio_loop.run_until_complete(self.add_models_async(request_proto)) + async def delete_model_async(self, proto_request): + await getattr(self.lb_stub, "DeleteDeployment")(proto_request) + + def delete_model(self, deployment_name): + if deployment_name in self.deployments: + request_proto = modelresponse_pb2.DeleteDeployRequest( + deployment_name=deployment_name) + self.asyncio_loop.run_until_complete(self.delete_model_async(request_proto)) + del self.deployments[deployment_name] + return None + assert False, f"Deployment: {deployment_name} not found" + class MIITensorParallelClient(): """ diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index 6b35d56f..026d4268 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -299,6 +299,9 @@ def _do_serve(service_impl, port, interceptors=[], is_lb=False): modelresponse_pb2_grpc.add_DeploymentManagementServicer_to_server( service_impl, server) + modelresponse_pb2_grpc.add_ModelResponseServicer_to_server( + ModelResponse(None), + server) else: modelresponse_pb2_grpc.add_ModelResponseServicer_to_server(service_impl, server) server.add_insecure_port(f'[::]:{port}') diff --git a/mii/grpc_related/proto/modelresponse.proto b/mii/grpc_related/proto/modelresponse.proto index 7daf300a..fc8a108f 100644 --- 
a/mii/grpc_related/proto/modelresponse.proto +++ b/mii/grpc_related/proto/modelresponse.proto @@ -38,17 +38,7 @@ service ModelResponse { service DeploymentManagement { rpc AddDeployment(AddDeployRequest) returns (google.protobuf.Empty) {} - rpc CreateSession (SessionID) returns (google.protobuf.Empty) {} - rpc DestroySession (SessionID) returns (google.protobuf.Empty) {} - rpc GeneratorReply (MultiStringRequest) returns (MultiStringReply) {} - rpc ClassificationReply (SingleStringRequest) returns (SingleStringReply) {} - rpc QuestionAndAnswerReply(QARequest) returns (SingleStringReply) {} - rpc FillMaskReply(SingleStringRequest) returns (SingleStringReply) {} - rpc TokenClassificationReply(SingleStringRequest) returns (SingleStringReply) {} - rpc ConversationalReply(ConversationRequest) returns (ConversationReply) {} - rpc Txt2ImgReply(MultiStringRequest) returns (ImageReply) {} rpc DeleteDeployment(DeleteDeployRequest) returns (google.protobuf.Empty) {} - rpc Terminate (google.protobuf.Empty) returns (google.protobuf.Empty) {} } message Value { diff --git a/mii/grpc_related/proto/modelresponse_pb2.py b/mii/grpc_related/proto/modelresponse_pb2.py index 30c7a340..fe37da18 100644 --- a/mii/grpc_related/proto/modelresponse_pb2.py +++ b/mii/grpc_related/proto/modelresponse_pb2.py @@ -16,7 +16,7 @@ from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x42\x0e\n\x0coneof_values\"\x1f\n\tSessionID\x12\x12\n\nsession_id\x18\x01 \x01(\t\"\xed\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\x85\x01\n\x11SingleStringReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x84\x01\n\x10MultiStringReply\x12\x10\n\x08response\x18\x01 \x03(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\tQARequest\x12\x10\n\x08question\x18\x01 \x01(\t\x12\x0f\n\x07\x63ontext\x18\x02 \x01(\t\x12?\n\x0cquery_kwargs\x18\x03 \x03(\x0b\x32).modelresponse.QARequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 
\x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xd3\x02\n\x13\x43onversationRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x1c\n\x0f\x63onversation_id\x18\x02 \x01(\x03H\x00\x88\x01\x01\x12\x18\n\x10past_user_inputs\x18\x03 \x03(\t\x12\x1b\n\x13generated_responses\x18\x04 \x03(\t\x12I\n\x0cquery_kwargs\x18\x05 \x03(\x0b\x32\x33.modelresponse.ConversationRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x01\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_conversation_idB\x12\n\x10_deployment_name\"\xc3\x01\n\x11\x43onversationReply\x12\x17\n\x0f\x63onversation_id\x18\x01 \x01(\x03\x12\x18\n\x10past_user_inputs\x18\x02 \x03(\t\x12\x1b\n\x13generated_responses\x18\x03 \x03(\t\x12\x12\n\ntime_taken\x18\x04 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x05 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xaf\x01\n\nImageReply\x12\x0e\n\x06images\x18\x01 \x03(\x0c\x12\x1d\n\x15nsfw_content_detected\x18\x02 \x03(\x08\x12\x0c\n\x04mode\x18\x03 \x01(\t\x12\x0e\n\x06size_w\x18\x04 \x01(\x03\x12\x0e\n\x06size_h\x18\x05 \x01(\x03\x12\x12\n\ntime_taken\x18\x06 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x07 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x98\x01\n\x10\x41\x64\x64\x44\x65ployRequest\x12\x0c\n\x04task\x18\x01 \x01(\t\x12\x17\n\x0f\x64\x65ployment_name\x18\x02 \x01(\t\x12\x10\n\x08hostname\x18\x03 \x01(\t\x12\x1d\n\x15tensor_parallel_ports\x18\x04 \x03(\x03\x12\x17\n\x0ftorch_dist_port\x18\x05 \x01(\x03\x12\x13\n\x0bgpu_indices\x18\x06 \x03(\x03\".\n\x13\x44\x65leteDeployRequest\x12\x17\n\x0f\x64\x65ployment_name\x18\x01 \x01(\t2\xd4\x06\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x32\xf9\x07\n\x14\x44\x65ploymentManagement\x12J\n\rAddDeployment\x12\x1f.modelresponse.AddDeployRequest\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a 
.modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x12P\n\x10\x44\x65leteDeployment\x12\".modelresponse.DeleteDeployRequest\x1a\x16.google.protobuf.Empty\"\x00\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x62\x06proto3' + b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x42\x0e\n\x0coneof_values\"\x1f\n\tSessionID\x12\x12\n\nsession_id\x18\x01 \x01(\t\"\xed\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\x85\x01\n\x11SingleStringReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x84\x01\n\x10MultiStringReply\x12\x10\n\x08response\x18\x01 \x03(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\tQARequest\x12\x10\n\x08question\x18\x01 \x01(\t\x12\x0f\n\x07\x63ontext\x18\x02 \x01(\t\x12?\n\x0cquery_kwargs\x18\x03 \x03(\x0b\x32).modelresponse.QARequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xd3\x02\n\x13\x43onversationRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x1c\n\x0f\x63onversation_id\x18\x02 \x01(\x03H\x00\x88\x01\x01\x12\x18\n\x10past_user_inputs\x18\x03 \x03(\t\x12\x1b\n\x13generated_responses\x18\x04 \x03(\t\x12I\n\x0cquery_kwargs\x18\x05 \x03(\x0b\x32\x33.modelresponse.ConversationRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x01\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_conversation_idB\x12\n\x10_deployment_name\"\xc3\x01\n\x11\x43onversationReply\x12\x17\n\x0f\x63onversation_id\x18\x01 \x01(\x03\x12\x18\n\x10past_user_inputs\x18\x02 \x03(\t\x12\x1b\n\x13generated_responses\x18\x03 
\x03(\t\x12\x12\n\ntime_taken\x18\x04 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x05 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xaf\x01\n\nImageReply\x12\x0e\n\x06images\x18\x01 \x03(\x0c\x12\x1d\n\x15nsfw_content_detected\x18\x02 \x03(\x08\x12\x0c\n\x04mode\x18\x03 \x01(\t\x12\x0e\n\x06size_w\x18\x04 \x01(\x03\x12\x0e\n\x06size_h\x18\x05 \x01(\x03\x12\x12\n\ntime_taken\x18\x06 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x07 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x98\x01\n\x10\x41\x64\x64\x44\x65ployRequest\x12\x0c\n\x04task\x18\x01 \x01(\t\x12\x17\n\x0f\x64\x65ployment_name\x18\x02 \x01(\t\x12\x10\n\x08hostname\x18\x03 \x01(\t\x12\x1d\n\x15tensor_parallel_ports\x18\x04 \x03(\x03\x12\x17\n\x0ftorch_dist_port\x18\x05 \x01(\x03\x12\x13\n\x0bgpu_indices\x18\x06 \x03(\x03\".\n\x13\x44\x65leteDeployRequest\x12\x17\n\x0f\x64\x65ployment_name\x18\x01 \x01(\t2\xd4\x06\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x32\xb4\x01\n\x14\x44\x65ploymentManagement\x12J\n\rAddDeployment\x12\x1f.modelresponse.AddDeployRequest\x1a\x16.google.protobuf.Empty\"\x00\x12P\n\x10\x44\x65leteDeployment\x12\".modelresponse.DeleteDeployRequest\x1a\x16.google.protobuf.Empty\"\x00\x62\x06proto3' ) _globals = globals() @@ -68,5 +68,5 @@ _globals['_MODELRESPONSE']._serialized_start = 2106 _globals['_MODELRESPONSE']._serialized_end = 2958 _globals['_DEPLOYMENTMANAGEMENT']._serialized_start = 2961 - _globals['_DEPLOYMENTMANAGEMENT']._serialized_end = 3978 + _globals['_DEPLOYMENTMANAGEMENT']._serialized_end = 3141 # @@protoc_insertion_point(module_scope) diff --git a/mii/grpc_related/proto/modelresponse_pb2_grpc.py b/mii/grpc_related/proto/modelresponse_pb2_grpc.py index 49393660..e90d037d 100644 --- a/mii/grpc_related/proto/modelresponse_pb2_grpc.py +++ b/mii/grpc_related/proto/modelresponse_pb2_grpc.py @@ -482,62 +482,11 @@ def __init__(self, channel): request_serializer=modelresponse__pb2.AddDeployRequest.SerializeToString, response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, ) - self.CreateSession = channel.unary_unary( - '/modelresponse.DeploymentManagement/CreateSession', - request_serializer=modelresponse__pb2.SessionID.SerializeToString, - response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - ) - self.DestroySession = channel.unary_unary( - '/modelresponse.DeploymentManagement/DestroySession', - request_serializer=modelresponse__pb2.SessionID.SerializeToString, - 
response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - ) - self.GeneratorReply = channel.unary_unary( - '/modelresponse.DeploymentManagement/GeneratorReply', - request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.MultiStringReply.FromString, - ) - self.ClassificationReply = channel.unary_unary( - '/modelresponse.DeploymentManagement/ClassificationReply', - request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) - self.QuestionAndAnswerReply = channel.unary_unary( - '/modelresponse.DeploymentManagement/QuestionAndAnswerReply', - request_serializer=modelresponse__pb2.QARequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) - self.FillMaskReply = channel.unary_unary( - '/modelresponse.DeploymentManagement/FillMaskReply', - request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) - self.TokenClassificationReply = channel.unary_unary( - '/modelresponse.DeploymentManagement/TokenClassificationReply', - request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) - self.ConversationalReply = channel.unary_unary( - '/modelresponse.DeploymentManagement/ConversationalReply', - request_serializer=modelresponse__pb2.ConversationRequest.SerializeToString, - response_deserializer=modelresponse__pb2.ConversationReply.FromString, - ) - self.Txt2ImgReply = channel.unary_unary( - '/modelresponse.DeploymentManagement/Txt2ImgReply', - request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.ImageReply.FromString, - ) self.DeleteDeployment = channel.unary_unary( '/modelresponse.DeploymentManagement/DeleteDeployment', request_serializer=modelresponse__pb2.DeleteDeployRequest.SerializeToString, response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, ) - self.Terminate = channel.unary_unary( - '/modelresponse.DeploymentManagement/Terminate', - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty. 
- SerializeToString, - response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - ) class DeploymentManagementServicer(object): @@ -548,72 +497,12 @@ def AddDeployment(self, request, context): context.set_details('Method not implemented!') raise NotImplementedError('Method not implemented!') - def CreateSession(self, request, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') - - def DestroySession(self, request, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') - - def GeneratorReply(self, request, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') - - def ClassificationReply(self, request, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') - - def QuestionAndAnswerReply(self, request, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') - - def FillMaskReply(self, request, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') - - def TokenClassificationReply(self, request, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') - - def ConversationalReply(self, request, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') - - def Txt2ImgReply(self, request, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') - def DeleteDeployment(self, request, context): """Missing associated documentation comment in .proto file.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) context.set_details('Method not implemented!') raise NotImplementedError('Method not implemented!') - def Terminate(self, request, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') - def add_DeploymentManagementServicer_to_server(servicer, server): rpc_method_handlers = { @@ -624,62 +513,6 @@ def add_DeploymentManagementServicer_to_server(servicer, server): response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. 
SerializeToString, ), - 'CreateSession': - grpc.unary_unary_rpc_method_handler( - servicer.CreateSession, - request_deserializer=modelresponse__pb2.SessionID.FromString, - response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. - SerializeToString, - ), - 'DestroySession': - grpc.unary_unary_rpc_method_handler( - servicer.DestroySession, - request_deserializer=modelresponse__pb2.SessionID.FromString, - response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. - SerializeToString, - ), - 'GeneratorReply': - grpc.unary_unary_rpc_method_handler( - servicer.GeneratorReply, - request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, - response_serializer=modelresponse__pb2.MultiStringReply.SerializeToString, - ), - 'ClassificationReply': - grpc.unary_unary_rpc_method_handler( - servicer.ClassificationReply, - request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'QuestionAndAnswerReply': - grpc.unary_unary_rpc_method_handler( - servicer.QuestionAndAnswerReply, - request_deserializer=modelresponse__pb2.QARequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'FillMaskReply': - grpc.unary_unary_rpc_method_handler( - servicer.FillMaskReply, - request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'TokenClassificationReply': - grpc.unary_unary_rpc_method_handler( - servicer.TokenClassificationReply, - request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'ConversationalReply': - grpc.unary_unary_rpc_method_handler( - servicer.ConversationalReply, - request_deserializer=modelresponse__pb2.ConversationRequest.FromString, - response_serializer=modelresponse__pb2.ConversationReply.SerializeToString, - ), - 'Txt2ImgReply': - grpc.unary_unary_rpc_method_handler( - servicer.Txt2ImgReply, - request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, - response_serializer=modelresponse__pb2.ImageReply.SerializeToString, - ), 'DeleteDeployment': grpc.unary_unary_rpc_method_handler( servicer.DeleteDeployment, @@ -687,13 +520,6 @@ def add_DeploymentManagementServicer_to_server(servicer, server): response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. SerializeToString, ), - 'Terminate': - grpc.unary_unary_rpc_method_handler( - servicer.Terminate, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. 
- SerializeToString, - ), } generic_handler = grpc.method_handlers_generic_handler( 'modelresponse.DeploymentManagement', @@ -730,240 +556,6 @@ def AddDeployment(request, timeout, metadata) - @staticmethod - def CreateSession(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, - target, - '/modelresponse.DeploymentManagement/CreateSession', - modelresponse__pb2.SessionID.SerializeToString, - google_dot_protobuf_dot_empty__pb2.Empty.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) - - @staticmethod - def DestroySession(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, - target, - '/modelresponse.DeploymentManagement/DestroySession', - modelresponse__pb2.SessionID.SerializeToString, - google_dot_protobuf_dot_empty__pb2.Empty.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) - - @staticmethod - def GeneratorReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, - target, - '/modelresponse.DeploymentManagement/GeneratorReply', - modelresponse__pb2.MultiStringRequest.SerializeToString, - modelresponse__pb2.MultiStringReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) - - @staticmethod - def ClassificationReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, - target, - '/modelresponse.DeploymentManagement/ClassificationReply', - modelresponse__pb2.SingleStringRequest.SerializeToString, - modelresponse__pb2.SingleStringReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) - - @staticmethod - def QuestionAndAnswerReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, - target, - '/modelresponse.DeploymentManagement/QuestionAndAnswerReply', - modelresponse__pb2.QARequest.SerializeToString, - modelresponse__pb2.SingleStringReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) - - @staticmethod - def FillMaskReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, - target, - '/modelresponse.DeploymentManagement/FillMaskReply', - modelresponse__pb2.SingleStringRequest.SerializeToString, - modelresponse__pb2.SingleStringReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - 
compression, - wait_for_ready, - timeout, - metadata) - - @staticmethod - def TokenClassificationReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, - target, - '/modelresponse.DeploymentManagement/TokenClassificationReply', - modelresponse__pb2.SingleStringRequest.SerializeToString, - modelresponse__pb2.SingleStringReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) - - @staticmethod - def ConversationalReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, - target, - '/modelresponse.DeploymentManagement/ConversationalReply', - modelresponse__pb2.ConversationRequest.SerializeToString, - modelresponse__pb2.ConversationReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) - - @staticmethod - def Txt2ImgReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, - target, - '/modelresponse.DeploymentManagement/Txt2ImgReply', - modelresponse__pb2.MultiStringRequest.SerializeToString, - modelresponse__pb2.ImageReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) - @staticmethod def DeleteDeployment(request, target, @@ -989,29 +581,3 @@ def DeleteDeployment(request, wait_for_ready, timeout, metadata) - - @staticmethod - def Terminate(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, - target, - '/modelresponse.DeploymentManagement/Terminate', - google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - google_dot_protobuf_dot_empty__pb2.Empty.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) From e1b6d230967e6727b2535bbd885f2ee1d9424ed3 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Fri, 28 Jul 2023 23:09:05 +0000 Subject: [PATCH 61/69] Fixing LB process not terminating --- mii/client.py | 36 +++++---------- mii/grpc_related/modelresponse_server.py | 3 ++ mii/grpc_related/proto/build_script.sh | 2 +- mii/grpc_related/proto/modelresponse.proto | 1 + mii/grpc_related/proto/modelresponse_pb2.py | 5 ++- .../proto/modelresponse_pb2_grpc.py | 45 +++++++++++++++++++ mii/models/score/score_template.py | 21 --------- mii/server.py | 13 ------ 8 files changed, 63 insertions(+), 63 deletions(-) diff --git a/mii/client.py b/mii/client.py index 1236876b..c1edd93f 100644 --- a/mii/client.py +++ b/mii/client.py @@ -19,19 +19,6 @@ def _get_deployment_configs(deployment_tag): configs = mii.utils.import_score_file(deployment_tag).configs for deployment in configs[mii.constants.DEPLOYMENTS_KEY].values(): deployment_name = deployment[mii.constants.DEPLOYMENT_NAME_KEY] - """ - data = { - 'deployment_name': deployment[mii.constants.DEPLOYMENT_NAME_KEY], - 
'task': deployment[mii.constants.TASK_NAME_KEY], - 'model': deployment[mii.constants.MODEL_NAME_KEY], - 'enable_deepspeed': deployment[mii.constants.ENABLE_DEEPSPEED_KEY], - 'enable_zero': deployment[mii.constants.ENABLE_DEEPSPEED_ZERO_KEY], - 'GPU_index_map': deployment[mii.constants.GPU_INDEX_KEY], - 'mii_config': deployment[mii.constants.MII_CONFIGS_KEY], - 'ds_config': deployment[mii.constants.DEEPSPEED_CONFIG_KEY], - 'version': 1, - } - """ deployments[deployment_name] = DeploymentConfig(**deployment) lb_config = configs.get(mii.constants.LOAD_BALANCER_CONFIG_KEY) model_path = configs[mii.constants.MODEL_PATH_KEY] @@ -103,16 +90,10 @@ def _get_deployment_task(self, deployment_name=None): assert len(self.deployments) == 1, "Must pass deployment_name to query when using multiple deployments" deployment = next(iter(self.deployments.values())) deployment_name = getattr(deployment, mii.constants.DEPLOYMENT_NAME_KEY) - #task = get_task(deployment.task) if isinstance(deployment.task, - #str) else deployment.task task = getattr(deployment, mii.constants.TASK_NAME_KEY) else: if deployment_name in self.deployments: deployment = self.deployments[deployment_name] - """ - task = get_task(deployment.task) if isinstance(deployment.task, - str) else deployment.task - """ task = getattr(deployment, mii.constants.TASK_NAME_KEY) else: assert False, f"{deployment_name} not found in list of deployments" @@ -137,7 +118,7 @@ def query(self, request_dict, **query_kwargs): **query_kwargs)) async def terminate_async(self): - await self.mr_stub.Terminate( + await self.lb_stub.Terminate( modelresponse_pb2.google_dot_protobuf_dot_empty__pb2.Empty()) def terminate(self): @@ -186,6 +167,15 @@ def __init__(self, self.port_map = port_map if port_map is not None else {} self.deployment_tag = deployment_tag + """ + async def terminate_async(self): + await self.lb_stub.Terminate( + modelresponse_pb2.google_dot_protobuf_dot_empty__pb2.Empty()) + + def terminate(self): + self.asyncio_loop.run_until_complete(self.terminate_async()) + """ + async def add_models_async(self, proto_request): await getattr(self.lb_stub, "AddDeployment")(proto_request) @@ -223,8 +213,6 @@ def add_models(self, mii.constants.DEPLOYMENT_NAME_KEY): deployment for deployment in deployments } - #for deployment in deployments: - # deployment.task = get_task(deployment.task) lb_config, self.port_map = allocate_processes(deps, self.port_map) lb_enabled = True if len(self.deployments) else False if self.lb_config is not None: @@ -238,10 +226,6 @@ def add_models(self, self.model_path = mii.constants.MII_MODEL_PATH_DEFAULT elif self.model_path is None and deployment_type == DeploymentType.AML: model_path = "model" - #for deployment in self.deployments.values(): - #if isinstance(deployment.task, str): - #deployment.task = get_task(deployment.task) - #lb_enabled = True if len(self.deployments) else False create_score_file(deployment_tag=self.deployment_tag, deployment_type=deployment_type, deployments=deps, diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index 026d4268..4aa485dc 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -35,6 +35,9 @@ def get_stop_event(self): class DeploymentManagement(ServiceBase, modelresponse_pb2_grpc.DeploymentManagementServicer): + def __init__(self): + ServiceBase.__init__(self) + def AddDeployment(self, request, context): return google_dot_protobuf_dot_empty__pb2.Empty() diff --git a/mii/grpc_related/proto/build_script.sh 
b/mii/grpc_related/proto/build_script.sh index 9aaf3bd2..d8615a85 100644 --- a/mii/grpc_related/proto/build_script.sh +++ b/mii/grpc_related/proto/build_script.sh @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 # DeepSpeed Team -python -m grpc_tools.protoc -I./ --python_out=. --grpc_python_out=. ./modelresponse.proto +python3 -m grpc_tools.protoc -I./ --python_out=. --grpc_python_out=. ./modelresponse.proto # update import to be global wrt mii sed -i 's/modelresponse_pb2/mii.grpc_related.proto.modelresponse_pb2/g' modelresponse_pb2_grpc.py diff --git a/mii/grpc_related/proto/modelresponse.proto b/mii/grpc_related/proto/modelresponse.proto index fc8a108f..146e1f30 100644 --- a/mii/grpc_related/proto/modelresponse.proto +++ b/mii/grpc_related/proto/modelresponse.proto @@ -39,6 +39,7 @@ service ModelResponse { service DeploymentManagement { rpc AddDeployment(AddDeployRequest) returns (google.protobuf.Empty) {} rpc DeleteDeployment(DeleteDeployRequest) returns (google.protobuf.Empty) {} + rpc Terminate (google.protobuf.Empty) returns (google.protobuf.Empty) {} } message Value { diff --git a/mii/grpc_related/proto/modelresponse_pb2.py b/mii/grpc_related/proto/modelresponse_pb2.py index fe37da18..72c33ed8 100644 --- a/mii/grpc_related/proto/modelresponse_pb2.py +++ b/mii/grpc_related/proto/modelresponse_pb2.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 # DeepSpeed Team +# -*- coding: utf-8 -*- # Generated by the protocol buffer compiler. DO NOT EDIT! # source: modelresponse.proto """Generated protocol buffer code.""" @@ -16,7 +17,7 @@ from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x42\x0e\n\x0coneof_values\"\x1f\n\tSessionID\x12\x12\n\nsession_id\x18\x01 \x01(\t\"\xed\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\x85\x01\n\x11SingleStringReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x84\x01\n\x10MultiStringReply\x12\x10\n\x08response\x18\x01 \x03(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\tQARequest\x12\x10\n\x08question\x18\x01 \x01(\t\x12\x0f\n\x07\x63ontext\x18\x02 \x01(\t\x12?\n\x0cquery_kwargs\x18\x03 
\x03(\x0b\x32).modelresponse.QARequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xd3\x02\n\x13\x43onversationRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x1c\n\x0f\x63onversation_id\x18\x02 \x01(\x03H\x00\x88\x01\x01\x12\x18\n\x10past_user_inputs\x18\x03 \x03(\t\x12\x1b\n\x13generated_responses\x18\x04 \x03(\t\x12I\n\x0cquery_kwargs\x18\x05 \x03(\x0b\x32\x33.modelresponse.ConversationRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x01\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_conversation_idB\x12\n\x10_deployment_name\"\xc3\x01\n\x11\x43onversationReply\x12\x17\n\x0f\x63onversation_id\x18\x01 \x01(\x03\x12\x18\n\x10past_user_inputs\x18\x02 \x03(\t\x12\x1b\n\x13generated_responses\x18\x03 \x03(\t\x12\x12\n\ntime_taken\x18\x04 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x05 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xaf\x01\n\nImageReply\x12\x0e\n\x06images\x18\x01 \x03(\x0c\x12\x1d\n\x15nsfw_content_detected\x18\x02 \x03(\x08\x12\x0c\n\x04mode\x18\x03 \x01(\t\x12\x0e\n\x06size_w\x18\x04 \x01(\x03\x12\x0e\n\x06size_h\x18\x05 \x01(\x03\x12\x12\n\ntime_taken\x18\x06 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x07 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x98\x01\n\x10\x41\x64\x64\x44\x65ployRequest\x12\x0c\n\x04task\x18\x01 \x01(\t\x12\x17\n\x0f\x64\x65ployment_name\x18\x02 \x01(\t\x12\x10\n\x08hostname\x18\x03 \x01(\t\x12\x1d\n\x15tensor_parallel_ports\x18\x04 \x03(\x03\x12\x17\n\x0ftorch_dist_port\x18\x05 \x01(\x03\x12\x13\n\x0bgpu_indices\x18\x06 \x03(\x03\".\n\x13\x44\x65leteDeployRequest\x12\x17\n\x0f\x64\x65ployment_name\x18\x01 \x01(\t2\xd4\x06\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x32\xb4\x01\n\x14\x44\x65ploymentManagement\x12J\n\rAddDeployment\x12\x1f.modelresponse.AddDeployRequest\x1a\x16.google.protobuf.Empty\"\x00\x12P\n\x10\x44\x65leteDeployment\x12\".modelresponse.DeleteDeployRequest\x1a\x16.google.protobuf.Empty\"\x00\x62\x06proto3' + b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 
\x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x42\x0e\n\x0coneof_values\"\x1f\n\tSessionID\x12\x12\n\nsession_id\x18\x01 \x01(\t\"\xed\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\x85\x01\n\x11SingleStringReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x84\x01\n\x10MultiStringReply\x12\x10\n\x08response\x18\x01 \x03(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\tQARequest\x12\x10\n\x08question\x18\x01 \x01(\t\x12\x0f\n\x07\x63ontext\x18\x02 \x01(\t\x12?\n\x0cquery_kwargs\x18\x03 \x03(\x0b\x32).modelresponse.QARequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xd3\x02\n\x13\x43onversationRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x1c\n\x0f\x63onversation_id\x18\x02 \x01(\x03H\x00\x88\x01\x01\x12\x18\n\x10past_user_inputs\x18\x03 \x03(\t\x12\x1b\n\x13generated_responses\x18\x04 \x03(\t\x12I\n\x0cquery_kwargs\x18\x05 \x03(\x0b\x32\x33.modelresponse.ConversationRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x01\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_conversation_idB\x12\n\x10_deployment_name\"\xc3\x01\n\x11\x43onversationReply\x12\x17\n\x0f\x63onversation_id\x18\x01 \x01(\x03\x12\x18\n\x10past_user_inputs\x18\x02 \x03(\t\x12\x1b\n\x13generated_responses\x18\x03 \x03(\t\x12\x12\n\ntime_taken\x18\x04 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x05 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xaf\x01\n\nImageReply\x12\x0e\n\x06images\x18\x01 \x03(\x0c\x12\x1d\n\x15nsfw_content_detected\x18\x02 \x03(\x08\x12\x0c\n\x04mode\x18\x03 \x01(\t\x12\x0e\n\x06size_w\x18\x04 \x01(\x03\x12\x0e\n\x06size_h\x18\x05 \x01(\x03\x12\x12\n\ntime_taken\x18\x06 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x07 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x98\x01\n\x10\x41\x64\x64\x44\x65ployRequest\x12\x0c\n\x04task\x18\x01 \x01(\t\x12\x17\n\x0f\x64\x65ployment_name\x18\x02 \x01(\t\x12\x10\n\x08hostname\x18\x03 \x01(\t\x12\x1d\n\x15tensor_parallel_ports\x18\x04 \x03(\x03\x12\x17\n\x0ftorch_dist_port\x18\x05 
\x01(\x03\x12\x13\n\x0bgpu_indices\x18\x06 \x03(\x03\".\n\x13\x44\x65leteDeployRequest\x12\x17\n\x0f\x64\x65ployment_name\x18\x01 \x01(\t2\xd4\x06\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x32\xf3\x01\n\x14\x44\x65ploymentManagement\x12J\n\rAddDeployment\x12\x1f.modelresponse.AddDeployRequest\x1a\x16.google.protobuf.Empty\"\x00\x12P\n\x10\x44\x65leteDeployment\x12\".modelresponse.DeleteDeployRequest\x1a\x16.google.protobuf.Empty\"\x00\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x62\x06proto3' ) _globals = globals() @@ -68,5 +69,5 @@ _globals['_MODELRESPONSE']._serialized_start = 2106 _globals['_MODELRESPONSE']._serialized_end = 2958 _globals['_DEPLOYMENTMANAGEMENT']._serialized_start = 2961 - _globals['_DEPLOYMENTMANAGEMENT']._serialized_end = 3141 + _globals['_DEPLOYMENTMANAGEMENT']._serialized_end = 3204 # @@protoc_insertion_point(module_scope) diff --git a/mii/grpc_related/proto/modelresponse_pb2_grpc.py b/mii/grpc_related/proto/modelresponse_pb2_grpc.py index e90d037d..5334f127 100644 --- a/mii/grpc_related/proto/modelresponse_pb2_grpc.py +++ b/mii/grpc_related/proto/modelresponse_pb2_grpc.py @@ -487,6 +487,12 @@ def __init__(self, channel): request_serializer=modelresponse__pb2.DeleteDeployRequest.SerializeToString, response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, ) + self.Terminate = channel.unary_unary( + '/modelresponse.DeploymentManagement/Terminate', + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty. + SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) class DeploymentManagementServicer(object): @@ -503,6 +509,12 @@ def DeleteDeployment(self, request, context): context.set_details('Method not implemented!') raise NotImplementedError('Method not implemented!') + def Terminate(self, request, context): + """Missing associated documentation comment in .proto file.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + def add_DeploymentManagementServicer_to_server(servicer, server): rpc_method_handlers = { @@ -520,6 +532,13 @@ def add_DeploymentManagementServicer_to_server(servicer, server): response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. 
SerializeToString, ), + 'Terminate': + grpc.unary_unary_rpc_method_handler( + servicer.Terminate, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. + SerializeToString, + ), } generic_handler = grpc.method_handlers_generic_handler( 'modelresponse.DeploymentManagement', @@ -581,3 +600,29 @@ def DeleteDeployment(request, wait_for_ready, timeout, metadata) + + @staticmethod + def Terminate(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, + target, + '/modelresponse.DeploymentManagement/Terminate', + google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + google_dot_protobuf_dot_empty__pb2.Empty.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) diff --git a/mii/models/score/score_template.py b/mii/models/score/score_template.py index c4905f3c..7c8208b8 100644 --- a/mii/models/score/score_template.py +++ b/mii/models/score/score_template.py @@ -20,28 +20,7 @@ def init(): deployments = [] lb_enabled = configs[mii.constants.DEPLOYED_KEY] for deployment in configs[mii.constants.DEPLOYMENTS_KEY].values(): - """ - data = { - 'deployment_name': deployment[mii.constants.DEPLOYMENT_NAME_KEY], - 'task': deployment[mii.constants.TASK_NAME_KEY], - 'model': deployment[mii.constants.MODEL_NAME_KEY], - 'enable_deepspeed': deployment[mii.constants.ENABLE_DEEPSPEED_KEY], - 'enable_zero': deployment[mii.constants.ENABLE_DEEPSPEED_ZERO_KEY], - 'GPU_index_map': deployment[mii.constants.GPU_INDEX_KEY], - 'mii_config': deployment[mii.constants.MII_CONFIGS_KEY], - 'ds_config': deployment[mii.constants.DEEPSPEED_CONFIG_KEY], - 'version': 1 - } - """ deployments.append(mii.DeploymentConfig(**deployment)) - """ - deployment_name = configs[mii.constants.DEPLOYMENT_NAME_KEY] - model_name = configs[mii.constants.MODEL_NAME_KEY] - task_name = configs[mii.constants.TASK_NAME_KEY] - - assert model_name is not None, "The model name should be set before calling init" - assert task_name is not None, "The task name should be set before calling init" - """ mii.MIIServer(deployment_tag, deployments, diff --git a/mii/server.py b/mii/server.py index 4fef14f4..30d69363 100644 --- a/mii/server.py +++ b/mii/server.py @@ -166,19 +166,6 @@ def _launch_load_balancer(self, model_path, lb_config): mii_env["TRANSFORMERS_CACHE"] = model_path logger.info(f"load balancer server launch: {cmd}") return subprocess.Popen(cmd, env=mii_env) - """ - return self._launch_server_process( - deployment_name, - model_name, - model_path, - ds_optimize, - ds_zero, - ds_config, - mii_configs, - mii_configs.port_number, - "load balancer", - ex_server_args=[f"--load-balancer {b64_config_str}"]) - """ def _launch_restful_gateway(self, deployment_name, From 1675bd8474f25ad96ff8ed7fa19a2860d203f8b3 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Mon, 31 Jul 2023 20:41:09 +0000 Subject: [PATCH 62/69] Adding multi_deployment and partial deploy/terminate unit tests --- tests/conftest.py | 99 ++++++++++++++++++++++++++++++++++ tests/test_multi_deployment.py | 35 ++++++++++++ 2 files changed, 134 insertions(+) create mode 100644 tests/test_multi_deployment.py diff --git a/tests/conftest.py b/tests/conftest.py index cb812069..ed224bee 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -108,6 +108,55 @@ def 
ds_config(request): return request.param +@pytest.fixture(scope="function", params=["Multi_Model_Tag"]) +def deployment_tag(request): + return request.param + + +@pytest.fixture(scope="function", params=[[]]) +def deployments(request): + ret = {} + gpu_index_map1 = {'master': [0]} + gpu_index_map2 = {'master': [1]} + gpu_index_map3 = {'master': [0, 1]} + + deployments = [] + + mii_configs1 = {"tensor_parallel": 2, "dtype": "fp16"} + mii_configs2 = {"tensor_parallel": 1} + + name = "bigscience/bloom-560m" + deployments.append( + mii.DeploymentConfig(task='text-generation', + model=name, + deployment_name=name + "_deployment", + GPU_index_map=gpu_index_map3, + mii_configs=mii.config.MIIConfig(**mii_configs1))) + + name = "microsoft/DialogRPT-human-vs-rand" + deployments.append( + mii.DeploymentConfig(task='text-classification', + model=name, + deployment_name=name + "_deployment", + GPU_index_map=gpu_index_map2)) + + name = "microsoft/DialoGPT-large" + deployments.append( + mii.DeploymentConfig(task='conversational', + model=name, + deployment_name=name + "_deployment", + GPU_index_map=gpu_index_map1, + mii_configs=mii.config.MIIConfig(**mii_configs2))) + + name = "deepset/roberta-large-squad2" + deployments.append( + mii.DeploymentConfig(task="question-answering", + model=name, + deployment_name=name + "-qa-deployment", + GPU_index_map=gpu_index_map2)) + return deployments + + @pytest.fixture(scope="function") def deployment_config(task_name: str, model_name: str, @@ -130,6 +179,19 @@ def deployment_config(task_name: str, return config +@pytest.fixture(scope="function") +def multi_deployment_config(deployments: list, + deployment_tag: str, + deployment_type: str): + config = SimpleNamespace(deployments=deployments, + deployment_type=deployment_type, + deployment_tag=deployment_tag, + model_path=os.getenv("TRANSFORMERS_CACHE", + None)) + validate_config(config) + return config + + @pytest.fixture(scope="function", params=[None]) def expected_failure(request): return request.param @@ -147,6 +209,43 @@ def deployment(deployment_config, expected_failure): mii.terminate(deployment_config.deployment_name) +@pytest.fixture(scope="function") +def multi_deployment(deployment_tag, multi_deployment_config): + mii.deploy(**multi_deployment_config.__dict__) + yield multi_deployment_config + mii.terminate(deployment_tag) + + @pytest.fixture(scope="function", params=[{"query": "DeepSpeed is the greatest"}]) def query(request): return request.param + + +@pytest.fixture(scope="function") +def multi_query(request): + queries = [] + queries.append({ + "query": ["DeepSpeed is", + "Seattle is"], + "deployment_name": "bloom-560m_deployment" + }) + + queries.append({ + 'query': "DeepSpeed is the greatest", + "deployment_name": "microsoft/DialogRPT-human-vs-rand_deployment" + }) + + queries.append({ + 'text': "DeepSpeed is the greatest", + 'conversation_id': 3, + 'past_user_inputs': [], + 'generated_responses': [], + "deployment_name": "microsoft/DialoGPT-large_deployment" + }) + + queries.append({ + 'question': "What is the greatest?", + 'context': "DeepSpeed is the greatest", + "deployment_name": "deepset/roberta-large-squad2" + "-qa-deployment" + }) + return queries diff --git a/tests/test_multi_deployment.py b/tests/test_multi_deployment.py new file mode 100644 index 00000000..9caa9828 --- /dev/null +++ b/tests/test_multi_deployment.py @@ -0,0 +1,35 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +import pytest +import mii + + +def test_multi_deploy(deployment_tag, multi_deployment, multi_query): + generator = mii.mii_query_handle(deployment_tag) + for query in multi_query: + result = generator.query(query) + assert result + + +@pytest.mark.parametrize( + "task_name, model_name, query", + [ + ( + "text-generation", + "bigscience/bloom-560m", + { + "query": ["DeepSpeed is the greatest", + 'Seattle is'] + }, + ), + ], +) +def test_partial_deploy(deployment_tag, multi_deployment, deployment_config, query): + generator = mii.mii_query_handle(deployment_tag) + generator.add_models(**deployment_config.__dict__) + query["deployment_name"] = deployment_config.deployment_name + result = generator.query(query) + generator.delete_model(deployment_config.deployment_name) + assert result From 8684a61ae65d166d1e25fa363941489313bc65ab Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Mon, 31 Jul 2023 21:01:49 +0000 Subject: [PATCH 63/69] Removing comments --- mii/client.py | 9 --------- mii/constants.py | 2 +- mii/terminate.py | 2 +- 3 files changed, 2 insertions(+), 11 deletions(-) diff --git a/mii/client.py b/mii/client.py index c1edd93f..e2eaaa38 100644 --- a/mii/client.py +++ b/mii/client.py @@ -167,15 +167,6 @@ def __init__(self, self.port_map = port_map if port_map is not None else {} self.deployment_tag = deployment_tag - """ - async def terminate_async(self): - await self.lb_stub.Terminate( - modelresponse_pb2.google_dot_protobuf_dot_empty__pb2.Empty()) - - def terminate(self): - self.asyncio_loop.run_until_complete(self.terminate_async()) - """ - async def add_models_async(self, proto_request): await getattr(self.lb_stub, "AddDeployment")(proto_request) diff --git a/mii/constants.py b/mii/constants.py index f4860cc9..f2dced4e 100644 --- a/mii/constants.py +++ b/mii/constants.py @@ -103,7 +103,7 @@ class ModelProvider(enum.Enum): CHECKPOINT_KEY = "checkpoint" DEPLOYED_KEY = "deployed" VERSION_KEY = "version" -MII_TERMINATE_DEP_NAME = "__MII_TERMINATE_CALL__" +MII_TERMINATE_DEP_KEY = "__MII_TERMINATE_CALL__" MII_CACHE_PATH = "MII_CACHE_PATH" MII_CACHE_PATH_DEFAULT = "/tmp/mii_cache" diff --git a/mii/terminate.py b/mii/terminate.py index 77df55ff..5585832b 100644 --- a/mii/terminate.py +++ b/mii/terminate.py @@ -14,7 +14,7 @@ def terminate(deployment_tag): generator.terminate() return try: - generator.query({'query': ''}, mii.constants.MII_TERMINATE_DEP_NAME) + generator.query({'query': ''}, mii.constants.MII_TERMINATE_DEP_KEY) except grpc.aio._call.AioRpcError as error: if error._code == grpc.StatusCode.UNAVAILABLE: mii.utils.logger.warn(f"Server for {deployment_tag} not found") From 56a7fcede7bda7eef2a86e6917cca49f1252bdba Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Tue, 1 Aug 2023 18:31:26 +0000 Subject: [PATCH 64/69] Fixing spelling issues --- mii/client.py | 2 +- tests/conftest.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mii/client.py b/mii/client.py index e2eaaa38..2e60149c 100644 --- a/mii/client.py +++ b/mii/client.py @@ -86,7 +86,7 @@ def __init__(self, deployments, host, port): def _get_deployment_task(self, deployment_name=None): task = None - if deployment_name is None or deployment_name == mii.constants.MII_TERMINATE_DEP_NAME: #mii.terminate() or single model + if deployment_name is None or deployment_name == mii.constants.MII_TERMINATE_DEP_KEY: #mii.terminate() or single model assert len(self.deployments) == 1, "Must pass deployment_name to query when using multiple deployments" 
deployment = next(iter(self.deployments.values())) deployment_name = getattr(deployment, mii.constants.DEPLOYMENT_NAME_KEY) diff --git a/tests/conftest.py b/tests/conftest.py index ed224bee..29be37be 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -227,7 +227,7 @@ def multi_query(request): queries.append({ "query": ["DeepSpeed is", "Seattle is"], - "deployment_name": "bloom-560m_deployment" + "deployment_name": "bigscience/bloom-560m_deployment" }) queries.append({ From fb70c3db2bd6bba91c42fe8defb0c8450d5abff0 Mon Sep 17 00:00:00 2001 From: TosinSeg <90005810+TosinSeg@users.noreply.github.com> Date: Tue, 1 Aug 2023 14:41:58 -0700 Subject: [PATCH 65/69] Update mii/client.py Co-authored-by: Michael Wyatt --- mii/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mii/client.py b/mii/client.py index 2e60149c..c795fef2 100644 --- a/mii/client.py +++ b/mii/client.py @@ -17,7 +17,7 @@ def _get_deployment_configs(deployment_tag): deployments = {} configs = mii.utils.import_score_file(deployment_tag).configs - for deployment in configs[mii.constants.DEPLOYMENTS_KEY].values(): + for deployment in configs.get(mii.constants.DEPLOYMENTS_KEY).values(): deployment_name = deployment[mii.constants.DEPLOYMENT_NAME_KEY] deployments[deployment_name] = DeploymentConfig(**deployment) lb_config = configs.get(mii.constants.LOAD_BALANCER_CONFIG_KEY) From e2cfe8a60e1528d3db4a4c4cfdb916d97d2b3275 Mon Sep 17 00:00:00 2001 From: TosinSeg <90005810+TosinSeg@users.noreply.github.com> Date: Tue, 1 Aug 2023 14:42:12 -0700 Subject: [PATCH 66/69] Update mii/client.py Co-authored-by: Michael Wyatt --- mii/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mii/client.py b/mii/client.py index c795fef2..a4f717d0 100644 --- a/mii/client.py +++ b/mii/client.py @@ -21,7 +21,7 @@ def _get_deployment_configs(deployment_tag): deployment_name = deployment[mii.constants.DEPLOYMENT_NAME_KEY] deployments[deployment_name] = DeploymentConfig(**deployment) lb_config = configs.get(mii.constants.LOAD_BALANCER_CONFIG_KEY) - model_path = configs[mii.constants.MODEL_PATH_KEY] + model_path = configs.get(mii.constants.MODEL_PATH_KEY) port_map = configs.get(mii.constants.PORT_MAP_KEY) return deployments, lb_config, model_path, port_map From 1312738637fa00665cd53cd421cfb351f8ffd4af Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Tue, 1 Aug 2023 22:00:55 +0000 Subject: [PATCH 67/69] Removing AML from addDeploy --- mii/client.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mii/client.py b/mii/client.py index a4f717d0..ab459cfa 100644 --- a/mii/client.py +++ b/mii/client.py @@ -87,7 +87,8 @@ def __init__(self, deployments, host, port): def _get_deployment_task(self, deployment_name=None): task = None if deployment_name is None or deployment_name == mii.constants.MII_TERMINATE_DEP_KEY: #mii.terminate() or single model - assert len(self.deployments) == 1, "Must pass deployment_name to query when using multiple deployments" + if deployment_name is None: + assert len(self.deployments) == 1, "Must pass deployment_name to query when using multiple deployments" deployment = next(iter(self.deployments.values())) deployment_name = getattr(deployment, mii.constants.DEPLOYMENT_NAME_KEY) task = getattr(deployment, mii.constants.TASK_NAME_KEY) @@ -182,7 +183,7 @@ def add_models(self, deployment_type=DeploymentType.LOCAL, model_path=None, version=1): - + assert deployment_type != DeploymentType.AML, "Cannot currently add models to AML deployment" _, deployments = 
validate_deployment(task=task, model=model, deployment_name=deployment_name, @@ -215,8 +216,7 @@ def add_models(self, mii.constants.DEPLOYMENT_NAME_KEY)] = deployment if self.model_path is None and deployment_type == DeploymentType.LOCAL: self.model_path = mii.constants.MII_MODEL_PATH_DEFAULT - elif self.model_path is None and deployment_type == DeploymentType.AML: - model_path = "model" + create_score_file(deployment_tag=self.deployment_tag, deployment_type=deployment_type, deployments=deps, From b0f0da4f2f35648edab60a5120a5620ebac93941 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Wed, 2 Aug 2023 19:35:20 +0000 Subject: [PATCH 68/69] Refactoring MIIConfig and DeploymentConfig --- examples/multi_model/deploy.py | 14 +++-- mii/client.py | 72 ++++++++++++---------- mii/config.py | 94 ++++++++++++++++++++++++----- mii/constants.py | 1 + mii/deployment.py | 38 +++++++----- mii/launch/multi_gpu_server.py | 10 +++- mii/models/score/generate.py | 44 ++++++++++++-- mii/models/score/score_template.py | 8 ++- mii/server.py | 96 +++++++++++++++++------------- 9 files changed, 256 insertions(+), 121 deletions(-) diff --git a/examples/multi_model/deploy.py b/examples/multi_model/deploy.py index c0b93b56..525b2da3 100644 --- a/examples/multi_model/deploy.py +++ b/examples/multi_model/deploy.py @@ -19,7 +19,8 @@ model=name, deployment_name=name + "_deployment", GPU_index_map=gpu_index_map3, - mii_configs=mii.config.MIIConfig(**mii_configs1))) + tensor_parallel=2, + dtype="fp16")) # gpt2 name = "microsoft/DialogRPT-human-vs-rand" @@ -31,11 +32,12 @@ name = "microsoft/DialoGPT-large" deployments.append( - mii.DeploymentConfig(task='conversational', - model=name, - deployment_name=name + "_deployment", - GPU_index_map=gpu_index_map1, - mii_configs=mii.config.MIIConfig(**mii_configs2))) + mii.DeploymentConfig( + task='conversational', + model=name, + deployment_name=name + "_deployment", + GPU_index_map=gpu_index_map1, + )) name = "deepset/roberta-large-squad2" deployments.append( diff --git a/mii/client.py b/mii/client.py index ab459cfa..d478f7f2 100644 --- a/mii/client.py +++ b/mii/client.py @@ -10,8 +10,8 @@ from mii.grpc_related.proto import modelresponse_pb2, modelresponse_pb2_grpc from mii.constants import GRPC_MAX_MSG_SIZE, Tasks, DeploymentType from mii.method_table import GRPC_METHOD_TABLE -from mii.deployment import allocate_processes, create_score_file, validate_deployment -from mii.config import DeploymentConfig +from mii.deployment import allocate_processes +from mii.config import DeploymentConfig, MIIConfig def _get_deployment_configs(deployment_tag): @@ -23,7 +23,9 @@ def _get_deployment_configs(deployment_tag): lb_config = configs.get(mii.constants.LOAD_BALANCER_CONFIG_KEY) model_path = configs.get(mii.constants.MODEL_PATH_KEY) port_map = configs.get(mii.constants.PORT_MAP_KEY) - return deployments, lb_config, model_path, port_map + deployment_type = configs.get(mii.constants.DEPLOYMENT_TYPE_KEY) + mii_configs = MIIConfig(**configs.get(mii.constants.MII_CONFIGS_KEY)) + return deployments, lb_config, model_path, port_map, deployment_type, mii_configs def mii_query_handle(deployment_tag): @@ -43,15 +45,17 @@ def mii_query_handle(deployment_tag): inference_pipeline, task = mii.non_persistent_models[deployment_tag] return MIINonPersistentClient(task, deployment_tag) - deployments, lb_config, model_path, port_map = _get_deployment_configs(deployment_tag) - mii_configs = None + deployments, lb_config, model_path, port_map, deployment_type, mii_configs = _get_deployment_configs(deployment_tag) + 
"""mii_configs = None if len(deployments) > 0: mii_configs = getattr(next(iter(deployments.values())), mii.constants.MII_CONFIGS_KEY) + """ port_number = None if mii_configs == None else mii_configs.port_number - if port_number: + """if port_number: for deployment in deployments.values(): assert getattr(deployment, mii.constants.MII_CONFIGS_KEY).port_number == port_number, f"All port numbers is each deployments mii_configs must match" + """ return LBClient(deployments, "localhost", @@ -59,7 +63,9 @@ def mii_query_handle(deployment_tag): lb_config, model_path, port_map, - deployment_tag) + deployment_tag, + deployment_type, + mii_configs) def create_channel(host, port): @@ -157,7 +163,9 @@ def __init__(self, lb_config=None, model_path=None, port_map=None, - deployment_tag=None): + deployment_tag=None, + deployment_type=DeploymentType.LOCAL, + mii_configs={}): super().__init__(deployments, host, port) self.lb_stub = None if port is not None: @@ -167,24 +175,15 @@ def __init__(self, self.model_path = model_path self.port_map = port_map if port_map is not None else {} self.deployment_tag = deployment_tag + self.deployment_type = deployment_type + self.mii_configs = mii_configs async def add_models_async(self, proto_request): await getattr(self.lb_stub, "AddDeployment")(proto_request) - def add_models(self, - task=None, - model=None, - deployment_name=None, - enable_deepspeed=True, - enable_zero=False, - ds_config=None, - mii_config={}, - deployments=[], - deployment_type=DeploymentType.LOCAL, - model_path=None, - version=1): - assert deployment_type != DeploymentType.AML, "Cannot currently add models to AML deployment" - _, deployments = validate_deployment(task=task, + def add_models(self, deployments=[], model_path=None, version=1): + assert self.deployment_type != DeploymentType.AML, "Cannot currently add models to AML deployment" + """_, deployments = validate_deployment(task=task, model=model, deployment_name=deployment_name, enable_deepspeed=enable_deepspeed, @@ -196,7 +195,7 @@ def add_models(self, deployment_type=deployment_type, model_path=model_path, version=version) - + """ if not deployments: #Empty deployment return None @@ -205,7 +204,7 @@ def add_models(self, mii.constants.DEPLOYMENT_NAME_KEY): deployment for deployment in deployments } - lb_config, self.port_map = allocate_processes(deps, self.port_map) + lb_config, self.port_map = allocate_processes(deps, self.port_map, self.mii_configs) lb_enabled = True if len(self.deployments) else False if self.lb_config is not None: self.lb_config.replica_configs.extend(lb_config.replica_configs) @@ -214,21 +213,30 @@ def add_models(self, for deployment in deployments: self.deployments[getattr(deployment, mii.constants.DEPLOYMENT_NAME_KEY)] = deployment - if self.model_path is None and deployment_type == DeploymentType.LOCAL: + if self.model_path is None and self.deployment_type == DeploymentType.LOCAL: self.model_path = mii.constants.MII_MODEL_PATH_DEFAULT - - create_score_file(deployment_tag=self.deployment_tag, + """create_score_file(deployment_tag=self.deployment_tag, deployment_type=deployment_type, deployments=deps, model_path=self.model_path, port_map=self.port_map, lb_config=lb_config, deployed=lb_enabled) + if deployment_type == DeploymentType.LOCAL: mii.utils.import_score_file(self.deployment_tag).init() + """ + if not self.mii_configs: + self.mii_configs = mii.configs.MIIConfigs(**{}) + mii.MIIServer(self.deployment_tag, + deps.values(), + self.model_path, + lb_config=lb_config, + lb_enabled=lb_enabled, + 
mii_configs=self.mii_configs) + if self.lb_stub is None: - self.port_number = getattr(next(iter(self.deployments.values())), - mii.constants.MII_CONFIGS_KEY).port_number + self.port_number = self.mii_configs.port_number self.channel = create_channel(self.host, self.port_number) self.lb_stub = modelresponse_pb2_grpc.DeploymentManagementStub(self.channel) if not self.mr_stub: @@ -346,8 +354,8 @@ def terminate(self): def terminate_restful_gateway(deployment_tag): - deployments, _, _, _ = _get_deployment_configs(deployment_tag) + deployments, _, _, _, _, mii_configs = _get_deployment_configs(deployment_tag) for deployment in deployments.values(): - mii_configs = getattr(deployment, mii.constants.MII_CONFIGS_KEY) - if mii_configs.enable_restful_api: + #mii_configs = getattr(deployment, mii.constants.MII_CONFIGS_KEY) + if deployment.enable_restful_api: requests.get(f"http://localhost:{mii_configs.restful_api_port}/terminate") diff --git a/mii/config.py b/mii/config.py index b8325562..695054c3 100644 --- a/mii/config.py +++ b/mii/config.py @@ -8,6 +8,7 @@ from pydantic import BaseModel, validator, root_validator, Field from deepspeed.launcher.runner import DLTS_HOSTFILE from mii.utils import get_task +from mii.constants import DEPLOYMENT_NAME_KEY, TASK_NAME_KEY, MODEL_NAME_KEY, ENABLE_DEEPSPEED_KEY, ENABLE_DEEPSPEED_ZERO_KEY, GPU_INDEX_KEY, DEEPSPEED_CONFIG_KEY, VERSION_KEY class DtypeEnum(Enum): @@ -57,7 +58,7 @@ class MIIConfig(BaseModel): replica_num: int = 1 hostfile: str = DLTS_HOSTFILE trust_remote_code: bool = False - + """ @validator("deploy_rank") def deploy_valid(cls, field_value, values): if "tensor_parallel" not in values: @@ -76,8 +77,9 @@ def deploy_valid(cls, field_value, values): # number of ranks provided must be equal to TP size, DP is handled outside MII currently assert values["tensor_parallel"] == len(field_value), \ f"{len(field_value)} rank(s) provided in 'deploy_rank' does not align with tensor_parallel size of {values['tensor_parallel']}" - return field_value - + return field_value + """ + """ @validator('checkpoint_dict') def checkpoint_dict_valid(cls, value): if value is None: @@ -90,7 +92,8 @@ def checkpoint_dict_valid(cls, value): if not value.get(k, ''): raise ValueError(f"Missing key={k} in checkpoint_dict") return value - + """ + """ @root_validator def meta_tensor_or_sys_mem(cls, values): if values.get("meta_tensor") and values.get("load_with_sys_mem"): @@ -98,7 +101,7 @@ def meta_tensor_or_sys_mem(cls, values): "`meta_tensor` and `load_with_sys_mem` cannot be active at the same time." 
) return values - + """ class Config: validate_all = True validate_assignment = True @@ -131,16 +134,72 @@ class Config: class DeploymentConfig(BaseModel): - deployment_name: str = Field(alias="DEPLOYMENT_NAME_KEY") - task: str = Field(alias="TASK_NAME_KEY") - model: str = Field(alias="MODEL_NAME_KEY") - ds_optimize: bool = Field(default=True, alias="ENABLE_DEEPSPEED_KEY") - ds_zero: bool = Field(default=False, alias="ENABLE_DEEPSPEED_ZERO_KEY") - GPU_index_map: dict = Field(default=None, alias="GPU_INDEX_KEY") - mii_configs: MIIConfig = Field(default=MIIConfig.parse_obj({}), - alias="MII_CONFIGS_KEY") - ds_config: dict = Field(default=None, alias="DEEPSPEED_CONFIG_KEY") - version: int = Field(default=1, alias="VERSION_KEY") + deployment_name: str = Field(alias=DEPLOYMENT_NAME_KEY) + task: str = Field(alias=TASK_NAME_KEY) + model: str = Field(alias=MODEL_NAME_KEY) + ds_optimize: bool = Field(default=True, alias=ENABLE_DEEPSPEED_KEY) + ds_zero: bool = Field(default=False, alias=ENABLE_DEEPSPEED_ZERO_KEY) + GPU_index_map: dict = Field(default=None, alias=GPU_INDEX_KEY) + #mii_configs: MIIConfig = Field(default={}, alias=MII_CONFIGS_KEY) + ds_config: dict = Field(default=None, alias=DEEPSPEED_CONFIG_KEY) + version: int = Field(default=1, alias=VERSION_KEY) + tensor_parallel: int = 1 + dtype: DtypeEnum = torch.float32 + meta_tensor: bool = False + load_with_sys_mem: bool = False + replace_with_kernel_inject: bool = True + profile_model_time: bool = False + skip_model_check: bool = False + max_tokens: int = 1024 + enable_restful_api: bool = False + replica_num: int = 1 + hostfile: str = DLTS_HOSTFILE + deploy_rank: Union[int, List[int]] = -1 + enable_cuda_graph: bool = False + checkpoint_dict: Union[dict, None] = None + hf_auth_token: str = None + trust_remote_code: bool = False + + @validator('checkpoint_dict') + def checkpoint_dict_valid(cls, value): + if value is None: + return value + if value.get('base_dir', ''): + raise ValueError( + "please unset 'base_dir' it will be set w.r.t. the deployment 'model_path'" + ) + for k in ['checkpoints', 'parallelization', 'version', 'type']: + if not value.get(k, ''): + raise ValueError(f"Missing key={k} in checkpoint_dict") + return value + + @validator("deploy_rank") + def deploy_valid(cls, field_value, values): + if "tensor_parallel" not in values: + raise ValueError( + "'tensor_parallel' must be defined in the pydantic model before 'deploy_rank'" + ) + + # if deploy rank is not given, default to align with TP value + if field_value == -1: + field_value = list(range(values["tensor_parallel"])) + + # ensure deploy rank type is always list for easier consumption later + if not isinstance(field_value, list): + field_value = [field_value] + + # number of ranks provided must be equal to TP size, DP is handled outside MII currently + assert values["tensor_parallel"] == len(field_value), \ + f"{len(field_value)} rank(s) provided in 'deploy_rank' does not align with tensor_parallel size of {values['tensor_parallel']}" + return field_value + + @root_validator + def meta_tensor_or_sys_mem(cls, values): + if values.get("meta_tensor") and values.get("load_with_sys_mem"): + raise ValueError( + "`meta_tensor` and `load_with_sys_mem` cannot be active at the same time." 
+        )
+        return values
 
     @validator("task")
     def convert_task_str(cls, field_value, values):
@@ -148,3 +207,8 @@ def convert_task_str(cls, field_value, values):
 
     class Config:
         allow_population_by_field_name = True
+        validate_all = True
+        validate_assignment = True
+        use_enum_values = True
+        extra = 'forbid'
+        json_encoders = {torch.dtype: lambda x: str(x)}
diff --git a/mii/constants.py b/mii/constants.py
index f2dced4e..3d674efe 100644
--- a/mii/constants.py
+++ b/mii/constants.py
@@ -104,6 +104,7 @@ class ModelProvider(enum.Enum):
 DEPLOYED_KEY = "deployed"
 VERSION_KEY = "version"
 MII_TERMINATE_DEP_KEY = "__MII_TERMINATE_CALL__"
+DEPLOYMENT_TYPE_KEY = "deployment_type"
 
 MII_CACHE_PATH = "MII_CACHE_PATH"
 MII_CACHE_PATH_DEFAULT = "/tmp/mii_cache"
diff --git a/mii/deployment.py b/mii/deployment.py
index 5744d182..1976ca4a 100644
--- a/mii/deployment.py
+++ b/mii/deployment.py
@@ -68,6 +68,9 @@ def deploy(task=None,
         If deployment_type is `LOCAL`, returns just the name of the deployment
         that can be used to create a query handle using `mii.mii_query_handle(deployment_name)`
     """
+    if not mii_config:
+        mii_config = mii.config.MIIConfig(**{})
+
     if model_path is None and deployment_type == DeploymentType.LOCAL:
         model_path = MII_MODEL_PATH_DEFAULT
     elif model_path is None and deployment_type == DeploymentType.AML:
@@ -98,15 +101,15 @@ def deploy(task=None,
 
     # parse and validate mii config
     for deployment in deployments:
-        mii_config = getattr(deployment, mii.constants.MII_CONFIGS_KEY)
+        #mii_config = getattr(deployment, mii.constants.MII_CONFIGS_KEY)
        if getattr(deployment, mii.constants.ENABLE_DEEPSPEED_ZERO_KEY):
             if getattr(deployment,
                        mii.constants.DEEPSPEED_CONFIG_KEY).get("fp16",
                                                                {}).get("enabled",
                                                                        False):
-                assert (mii_config.dtype == torch.half), "MII Config Error: MII dtype and ZeRO dtype must match"
+                assert (deployment.dtype == torch.half), "MII Config Error: MII dtype and ZeRO dtype must match"
             else:
-                assert (mii_config.dtype == torch.float), "MII Config Error: MII dtype and ZeRO dtype must match"
+                assert (deployment.dtype == torch.float), "MII Config Error: MII dtype and ZeRO dtype must match"
         assert not (enable_deepspeed and enable_zero), "MII Config Error: DeepSpeed and ZeRO cannot both be enabled, select only one"
 
     # aml only allows certain characters for deployment names
@@ -137,12 +140,11 @@ def deploy(task=None,
             )
 
     deps = {deployment.deployment_name: deployment for deployment in deployments}
-    # In local deployments use default path if no model path set
 
     # add fields for replica deployment
     port_map = {}
-    lb_config, port_map = allocate_processes(deps, port_map)
+    lb_config, port_map = allocate_processes(deps, port_map, mii_config)
 
     if deployment_type != DeploymentType.NON_PERSISTENT:
         create_score_file(deployment_tag=deployment_tag,
@@ -150,7 +152,8 @@ def deploy(task=None,
                           deployments=deps,
                           model_path=model_path,
                           port_map=port_map,
-                          lb_config=lb_config)
+                          lb_config=lb_config,
+                          mii_configs=mii_config)
 
     if deployment_type == DeploymentType.AML:
         _deploy_aml(deployment_tag=deployment_tag, model_name=model, version=version)
@@ -166,33 +169,35 @@ def deploy(task=None,
                                  enable_deepspeed,
                                  enable_zero,
                                  provider,
-                                 mii_config),
+                                 deployment),
             task)
     else:
         raise Exception(f"Unknown deployment type: {deployment_type}")
 
 
-def allocate_processes(deployments, port_map):
+def allocate_processes(deployments, port_map, mii_config):
     replica_configs = []
     port_offset = 1
     for deployment in deployments.values():
-        mii_config = getattr(deployment, mii.constants.MII_CONFIGS_KEY)
-        replica_pool = _allocate_processes(mii_config.hostfile,
-                                           mii_config.tensor_parallel,
-                                           mii_config.replica_num,
-                                           deployment.GPU_index_map)
+        #mii_config = getattr(deployment, mii.constants.MII_CONFIGS_KEY)
+        replica_pool = _allocate_processes(
+            deployment.hostfile,
+            deployment.tensor_parallel,
+            deployment.replica_num,
+            getattr(deployment,
+                    mii.constants.GPU_INDEX_KEY))
         for i, (hostname, gpu_indices) in enumerate(replica_pool):
             # Reserver port for a LB proxy when replication is enabled
             if hostname not in port_map:
                 port_map[hostname] = set()
-            base_port = mii_config.port_number + i * mii_config.tensor_parallel + port_offset
+            base_port = mii_config.port_number + i * deployment.tensor_parallel + port_offset
             if base_port in port_map[hostname]:
                 base_port = max(port_map[hostname]) + 1
             tensor_parallel_ports = list(
                 range(base_port,
-                      base_port + mii_config.tensor_parallel))
-            for i in range(base_port, base_port + mii_config.tensor_parallel):
+                      base_port + deployment.tensor_parallel))
+            for i in range(base_port, base_port + deployment.tensor_parallel):
                 port_map[hostname].add(i)
             torch_dist_port = mii_config.torch_dist_port + i
             replica_configs.append(
@@ -236,6 +241,7 @@ def validate_deployment(task=None,
                                  deployment_type=deployment_type,
                                  deployments=None,
                                  model_path=model_path,
+                                 mii_configs={},
                                  port_map=None,
                                  lb_config=None)
         return deployment_tag, None
diff --git a/mii/launch/multi_gpu_server.py b/mii/launch/multi_gpu_server.py
index 1f7fc00a..194cc4a9 100644
--- a/mii/launch/multi_gpu_server.py
+++ b/mii/launch/multi_gpu_server.py
@@ -6,8 +6,8 @@
 import argparse
 import mii
 
-from mii import MIIConfig, LoadBalancerConfig
-
+from mii import MIIConfig, LoadBalancerConfig, DeploymentConfig
+from mii.utils import get_task_name
 from mii.models.load_models import load_models
 from mii.grpc_related.modelresponse_server import serve_inference, serve_load_balancing
 from mii.grpc_related.restful_gateway import RestfulGatewayThread
@@ -45,6 +45,7 @@ def main():
                         "--restful-gateway",
                         action='store_true',
                         help="launch restful api gateway")
+    parser.add_argument("-f", "--deployment", type=str, help="base64 encoded deployment")
 
     args = parser.parse_args()
 
@@ -53,6 +54,9 @@ def main():
 
     # convert dict -> mii config
     mii_config = MIIConfig(**config_dict)
+    deployment_dict = decode_config_from_str(args.deployment)
+    deployment_dict['task'] = get_task_name(mii.constants.Tasks(deployment_dict['task']))
+    deployment = DeploymentConfig(**deployment_dict)
 
     if args.restful_gateway:
         print(f"Starting RESTful API gateway on port: {mii_config.restful_api_port}")
         gateway_thread = RestfulGatewayThread(args.deployment_name,
@@ -77,7 +81,7 @@ def main():
                                          ds_zero=args.ds_zero,
                                          ds_config_path=args.ds_config,
                                          provider=provider,
-                                         mii_config=mii_config)
+                                         mii_config=deployment)
 
         print(f"Starting server on port: {port}")
         serve_inference(inference_pipeline, port)
diff --git a/mii/models/score/generate.py b/mii/models/score/generate.py
index 2f2bf8b0..6d608fc8 100644
--- a/mii/models/score/generate.py
+++ b/mii/models/score/generate.py
@@ -15,9 +15,13 @@ def create_score_file(deployment_tag,
                       model_path,
                       port_map,
                       lb_config,
+                      mii_configs={},
                       deployed=False):
     config_dict = {}
+    config_dict[
+        mii.constants.MII_CONFIGS_KEY] = mii_configs.dict() if mii_configs else {}
+    config_dict[mii.constants.DEPLOYMENT_TYPE_KEY] = deployment_type.value
     config_dict[mii.constants.MODEL_PATH_KEY] = model_path
     config_dict[mii.constants.DEPLOYMENT_TAG_KEY] = deployment_tag
     config_dict[mii.constants.DEPLOYED_KEY] = deployed
@@ -40,9 +44,9 @@ def create_score_file(deployment_tag,
             mii.constants.ENABLE_DEEPSPEED_KEY:
             getattr(deployment,
                     mii.constants.ENABLE_DEEPSPEED_KEY),
-            mii.constants.MII_CONFIGS_KEY:
-            getattr(deployment,
-                    mii.constants.MII_CONFIGS_KEY).dict(),
+            #mii.constants.MII_CONFIGS_KEY:
+            #getattr(deployment,
+            #        mii.constants.MII_CONFIGS_KEY).dict(),
             mii.constants.ENABLE_DEEPSPEED_ZERO_KEY:
             getattr(deployment,
                     mii.constants.ENABLE_DEEPSPEED_ZERO_KEY),
@@ -51,7 +55,39 @@ def create_score_file(deployment_tag,
                     mii.constants.DEEPSPEED_CONFIG_KEY),
             mii.constants.GPU_INDEX_KEY:
             getattr(deployment,
-                    mii.constants.GPU_INDEX_KEY)
+                    mii.constants.GPU_INDEX_KEY),
+            'tensor_parallel':
+            deployment.tensor_parallel,
+            'dtype':
+            deployment.dtype,
+            'meta_tensor':
+            deployment.meta_tensor,
+            'load_with_sys_mem':
+            deployment.load_with_sys_mem,
+            'replace_with_kernel_inject':
+            deployment.replace_with_kernel_inject,
+            'profile_model_time':
+            deployment.profile_model_time,
+            'skip_model_check':
+            deployment.skip_model_check,
+            'max_tokens':
+            deployment.max_tokens,
+            'enable_restful_api':
+            deployment.enable_restful_api,
+            'replica_num':
+            deployment.replica_num,
+            'hostfile':
+            deployment.hostfile,
+            'deploy_rank':
+            deployment.deploy_rank,
+            'enable_cuda_graph':
+            deployment.enable_cuda_graph,
+            'checkpoint_dict':
+            deployment.checkpoint_dict,
+            'hf_auth_token':
+            deployment.hf_auth_token,
+            'trust_remote_code':
+            deployment.trust_remote_code
         }
         config_dict[mii.constants.DEPLOYMENTS_KEY][
             deployment.deployment_name] = deployment_config
diff --git a/mii/models/score/score_template.py b/mii/models/score/score_template.py
index 7c8208b8..df4d94d0 100644
--- a/mii/models/score/score_template.py
+++ b/mii/models/score/score_template.py
@@ -8,7 +8,7 @@ import json
 import torch
 import mii
 
-from mii.config import LoadBalancerConfig, ReplicaConfig
+from mii.config import LoadBalancerConfig, ReplicaConfig, MIIConfig
 import time
 
 model = None
@@ -16,18 +16,20 @@ def init():
     model_path = mii.utils.full_model_path(configs[mii.constants.MODEL_PATH_KEY])
 
+    mii_configs = configs[mii.constants.MII_CONFIGS_KEY]
     deployment_tag = configs[mii.constants.DEPLOYMENT_TAG_KEY]
     deployments = []
     lb_enabled = configs[mii.constants.DEPLOYED_KEY]
     for deployment in configs[mii.constants.DEPLOYMENTS_KEY].values():
         deployments.append(mii.DeploymentConfig(**deployment))
-
+    mii_configs = MIIConfig(**mii_configs)
     mii.MIIServer(deployment_tag,
                   deployments,
                   model_path,
                   lb_config=configs.get(mii.constants.LOAD_BALANCER_CONFIG_KEY,
                                         None),
-                  lb_enabled=lb_enabled)
+                  lb_enabled=lb_enabled,
+                  mii_configs=mii_configs)
 
     global model
     model = None
diff --git a/mii/server.py b/mii/server.py
index 30d69363..1aeac364 100644
--- a/mii/server.py
+++ b/mii/server.py
@@ -33,26 +33,26 @@ def __init__(self,
                  deployments,
                  model_path,
                  lb_config=None,
-                 lb_enabled=False):
+                 lb_enabled=False,
+                 mii_configs={}):
         if len(deployments) > 0:
             self.lb_enabled = lb_enabled
             self.deployments = deployments
             for deployment in deployments:
-                mii_configs = getattr(deployment, mii.constants.MII_CONFIGS_KEY)
-                assert get_num_gpus(mii_configs) > 0, f"GPU count for {deployment.deployment_name} must be greater than 0"
-                if mii_configs.hostfile is None:
+                #mii_configs = getattr(deployment, mii.constants.MII_CONFIGS_KEY)
+                assert get_num_gpus(deployment) > 0, f"GPU count for {deployment.deployment_name} must be greater than 0"
+                if deployment.hostfile is None:
                     hostfile = tempfile.NamedTemporaryFile(delete=False)
                     num_gpu = torch.cuda.device_count()
                     with open(hostfile, "w") as f:
                         f.write(f"localhost slots={num_gpu}")
-                    mii.configs.hostfile = hostfile
-
-        processes = self._initialize_service(
-            deployment_tag,
-            deployments,
-            model_path,
-            lb_config,
-        )
+                    deployment.hostfile = hostfile
+        deps = {dep.deployment_name: dep for dep in deployments}
+        processes = self._initialize_service(deployment_tag,
+                                              deps,
+                                              model_path,
+                                              lb_config,
+                                              mii_configs)
         self._wait_until_server_is_live(processes, lb_config.replica_configs)
 
     def _wait_until_server_is_live(self, processes, deployment):
@@ -100,10 +100,11 @@ def _build_server_args(self,
                            ds_zero,
                            ds_config,
                            mii_configs,
-                           port):
+                           port,
+                           deployment):
         # serialize mii config
         b64_config_str = config_to_b64_str(mii_configs)
-
+        b64_deployment = config_to_b64_str(deployment)
         task = ""
         for deployment in self.deployments:
             if deployment_name == getattr(deployment, mii.constants.DEPLOYMENT_NAME_KEY):
@@ -117,6 +118,7 @@ def _build_server_args(self,
 
         server_args_str += f" --provider {provider}"
         server_args_str += f" --config {b64_config_str}"
+        server_args_str += f" -f {b64_deployment}"
         server_args_str += " --ds-zero" if ds_zero else ""
         if ds_zero and ds_config is not None:
             if isinstance(ds_config, dict):
@@ -175,17 +177,21 @@ def _launch_restful_gateway(self,
                                 ds_zero,
                                 ds_config,
                                 mii_configs,
-                                port):
-        return self._launch_server_process(deployment_name,
-                                           model_name,
-                                           model_path,
-                                           ds_optimize,
-                                           ds_zero,
-                                           ds_config,
-                                           mii_configs,
-                                           port,
-                                           "restful api gateway",
-                                           ex_server_args=["--restful-gateway"])
+                                port,
+                                deployment):
+        return self._launch_server_process(
+            deployment_name,
+            model_name,
+            model_path,
+            ds_optimize,
+            ds_zero,
+            ds_config,
+            mii_configs,
+            port,
+            "restful api gateway",
+            deployment,
+            ex_server_args=["--restful-gateway"],
+        )
 
     def _launch_server_process(self,
                                deployment_name,
@@ -197,6 +203,7 @@ def _launch_server_process(self,
                                mii_configs,
                                port,
                                msg_server_type,
+                               deployment,
                                ds_launch_str=None,
                                ex_server_args=[]):
         launch_str = f"{sys.executable} -m mii.launch.multi_gpu_server"
@@ -207,7 +214,8 @@ def _launch_server_process(self,
                                                  ds_zero,
                                                  ds_config,
                                                  mii_configs,
-                                                 port)
+                                                 port,
+                                                 deployment)
         server_args_str += f" " + \
             " ".join(ex_server_args) if ex_server_args else ""
@@ -233,7 +241,8 @@ def _launch_deepspeed(self,
                           host,
                           port,
                           master_port,
-                          deploy_ranks):
+                          deploy_ranks,
+                          deployment):
         # use different hostfiles for replica instances
         # pass /dev/null when no replica is used
         worker_str = f"-H {hostfile} "
@@ -256,12 +265,17 @@ def _launch_deepspeed(self,
             mii_configs,
             port,
             "MII server",
+            deployment,
             ds_launch_str=ds_launch_str)
 
-    def _initialize_service(self, deployment_tag, deployments, model_path, lb_config):
+    def _initialize_service(self,
+                            deployment_tag,
+                            deployments,
+                            model_path,
+                            lb_config,
+                            mii_configs):
         processes = []
-
         host_gpus = defaultdict(list)
         for repl_config in lb_config.replica_configs:
             host_gpus[repl_config.hostname].extend(repl_config.gpu_indices)
@@ -269,10 +283,11 @@ def _initialize_service(self, deployment_tag, deployments, model_path, lb_config
         # Start replica instances
         for i, repl_config in enumerate(lb_config.replica_configs):
             name = repl_config.deployment_name
-            deployment = None
-            for dep in deployments:
+            deployment = None if name not in deployments else deployments[name]
+            """for dep in deployments:
                 if getattr(dep, mii.constants.DEPLOYMENT_NAME_KEY) == name:
                     deployment = dep
+            """
             if deployment is None:
                 continue
             hostfile = tempfile.NamedTemporaryFile(delete=False)
@@ -291,15 +306,13 @@ def _initialize_service(self, deployment_tag, deployments, model_path, lb_config
                             mii.constants.ENABLE_DEEPSPEED_ZERO_KEY),
                     getattr(deployment,
                             mii.constants.DEEPSPEED_CONFIG_KEY),
-                    getattr(deployment,
-                            mii.constants.MII_CONFIGS_KEY),
+                    mii_configs,
                     hostfile.name,
                     repl_config.hostname,
                     repl_config.tensor_parallel_ports[0],
-                    getattr(deployment,
-                            mii.constants.MII_CONFIGS_KEY).torch_dist_port + (100 * i) +
-                    repl_config.gpu_indices[0],
-                    repl_config.gpu_indices))
+                    mii_configs.torch_dist_port + (100 * i) + repl_config.gpu_indices[0],
+                    repl_config.gpu_indices,
+                    deployment))
 
         # start load balancer here.
         # we don't use deepspeed launcher for the load balancer because it does not need a GPU.
@@ -309,7 +322,7 @@ def _initialize_service(self, deployment_tag, deployments, model_path, lb_config
         processes.append(self._launch_load_balancer(model_path, lb_config))
 
         for deployment in self.deployments:
-            if getattr(deployment, mii.constants.MII_CONFIGS_KEY).enable_restful_api:
+            if deployment.enable_restful_api:
                 # start rest api server
                 processes.append(
                     self._launch_restful_gateway(
@@ -324,10 +337,9 @@ def _initialize_service(self, deployment_tag, deployments, model_path, lb_config
                             mii.constants.ENABLE_DEEPSPEED_ZERO_KEY),
                         getattr(deployment,
                                 mii.constants.DEEPSPEED_CONFIG_KEY),
-                        getattr(deployment,
-                                mii.constants.MII_CONFIGS_KEY),
-                        getattr(deployment,
-                                mii.constants.MII_CONFIGS_KEY).port_number))
+                        mii_configs,
+                        mii_configs.port_number),
+                    deployment)
                 break
 
         return processes

From b78068ecb987f6932b27ae7f48832e52aa58f5ec Mon Sep 17 00:00:00 2001
From: Tosin Segun
Date: Fri, 11 Aug 2023 17:33:18 +0000
Subject: [PATCH 69/69] Partial deploy/termination example

---
 examples/multi_model/add_delete_models.py | 32 +++++++++++++++++++++
 1 file changed, 32 insertions(+)
 create mode 100644 examples/multi_model/add_delete_models.py

diff --git a/examples/multi_model/add_delete_models.py b/examples/multi_model/add_delete_models.py
new file mode 100644
index 00000000..2a85b0f3
--- /dev/null
+++ b/examples/multi_model/add_delete_models.py
@@ -0,0 +1,32 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+import mii
+
+deployments = []
+results = []
+name = 'bigscience/bloom-560m'
+mii_configs1 = {"tensor_parallel": 1, "dtype": "fp16"}
+deployments.append(
+    mii.DeploymentConfig(task='text-generation',
+                         model=name,
+                         deployment_name=name + "_deployment5",
+                         mii_configs=mii.config.MIIConfig(**mii_configs1)
+                         ))
+
+generator = mii.mii_query_handle("multi_models")
+generator.add_models(deployments=deployments)
+
+result = generator.query(
+    {
+        "query": ["DeepSpeed is",
+                  "Seattle is"],
+        "deployment_name": "bigscience/bloom-560m_deployment5"
+    },
+    do_sample=True,
+    max_new_tokens=30,
+)
+print(result)
+generator.delete_model("bigscience/bloom-560m_deployment5")
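Editor's note (not part of the patch series): the add_delete_models.py example above attaches to a deployment group tagged "multi_models", so a multi-model server carrying that tag presumably has to be deployed before the script is run. The sketch below shows one way the same handle could be exercised further, re-adding the model under a different deployment name and then removing it again. It is a minimal sketch that relies only on the APIs used in the example itself (DeploymentConfig, mii_query_handle, add_models, query, delete_model); the name "bigscience/bloom-560m_deployment6" is a hypothetical label, not something defined by the patches.

    # Sketch only, assuming the "multi_models" server from the example above is still running.
    import mii

    # Hypothetical second deployment of the same model under a new name.
    extra = mii.DeploymentConfig(
        task="text-generation",
        model="bigscience/bloom-560m",
        deployment_name="bigscience/bloom-560m_deployment6",  # hypothetical name
        mii_configs=mii.config.MIIConfig(tensor_parallel=1, dtype="fp16"))

    # Attach to the running multi-model server and register the new deployment.
    generator = mii.mii_query_handle("multi_models")
    generator.add_models(deployments=[extra])

    # Route a query to the newly added deployment by name.
    result = generator.query(
        {
            "query": ["DeepSpeed is"],
            "deployment_name": "bigscience/bloom-560m_deployment6"
        },
        max_new_tokens=30)
    print(result)

    # Partial termination: remove only this deployment, leaving the server up.
    generator.delete_model("bigscience/bloom-560m_deployment6")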