From 4eac006d89538d675bf8bac8f6d365aac34a1337 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Mon, 19 Jun 2023 23:59:09 +0000 Subject: [PATCH 01/69] Removing load balancing config --- mii/client.py | 6 +-- mii/config.py | 8 ---- mii/deployment.py | 40 ++++++++-------- mii/server.py | 119 ++++++++++++++++++---------------------------- 4 files changed, 70 insertions(+), 103 deletions(-) diff --git a/mii/client.py b/mii/client.py index 5542d93a..b2970a5e 100644 --- a/mii/client.py +++ b/mii/client.py @@ -35,14 +35,14 @@ def mii_query_handle(deployment_name): query_handle: A query handle with a single method `.query(request_dictionary)` using which queries can be sent to the model. """ task_name, mii_configs = _get_deployment_info(deployment_name) - if mii_configs.enable_load_balancing: - return MIIClient(task_name, "localhost", mii_configs.port_number) + return MIIClient(task_name, "localhost", mii_configs.port_number) + """ else: return MIITensorParallelClient( task_name, "localhost", [mii_configs.port_number + i for i in range(mii_configs.tensor_parallel)]) - + """ def create_channel(host, port): return grpc.aio.insecure_channel(f'{host}:{port}', diff --git a/mii/config.py b/mii/config.py index 0f7d24b3..34e93ddd 100644 --- a/mii/config.py +++ b/mii/config.py @@ -55,7 +55,6 @@ class MIIConfig(BaseModel): max_tokens: int = 1024 enable_restful_api: bool = False restful_api_port: int = 51080 - enable_load_balancing: bool = False replica_num: int = 1 hostfile: str = DLTS_HOSTFILE @@ -92,13 +91,6 @@ def checkpoint_dict_valid(cls, value): raise ValueError(f"Missing key={k} in checkpoint_dict") return value - @root_validator - def auto_enable_load_balancing(cls, values): - if values["enable_restful_api"] and not values["enable_load_balancing"]: - logger.warn("Restful API is enabled, enabling Load Balancing") - values["enable_load_balancing"] = True - return values - class Config: validate_all = True validate_assignment = True diff --git a/mii/deployment.py b/mii/deployment.py index d7ec3226..254ac7b5 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -104,26 +104,25 @@ def deploy(task, # add fields for replica deployment lb_config = None - if mii_config.enable_load_balancing: - replica_pool = _allocate_processes(mii_config.hostfile, - mii_config.tensor_parallel, - mii_config.replica_num) - replica_configs = [] - for i, (hostname, gpu_indices) in enumerate(replica_pool): - # Reserver port for a LB proxy when replication is enabled - port_offset = 1 if mii_config.enable_load_balancing else 0 - base_port = mii_config.port_number + i * mii_config.tensor_parallel + port_offset - tensor_parallel_ports = list( - range(base_port, - base_port + mii_config.tensor_parallel)) - torch_dist_port = mii_config.torch_dist_port + i - replica_configs.append( - ReplicaConfig(hostname=hostname, - tensor_parallel_ports=tensor_parallel_ports, - torch_dist_port=torch_dist_port, - gpu_indices=gpu_indices)) - lb_config = LoadBalancerConfig(port=mii_config.port_number, - replica_configs=replica_configs) + replica_pool = _allocate_processes(mii_config.hostfile, + mii_config.tensor_parallel, + mii_config.replica_num) + replica_configs = [] + for i, (hostname, gpu_indices) in enumerate(replica_pool): + # Reserver port for a LB proxy when replication is enabled + port_offset = 1 + base_port = mii_config.port_number + i * mii_config.tensor_parallel + port_offset + tensor_parallel_ports = list( + range(base_port, + base_port + mii_config.tensor_parallel)) + torch_dist_port = mii_config.torch_dist_port + i + 
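+        # mii_config.port_number itself stays reserved for the load balancer
+        # proxy (hence port_offset = 1); replica i then takes the next
+        # tensor_parallel consecutive ports for its shards and a torch_dist_port
+        # offset by the replica index, so replicas never collide on ports.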
replica_configs.append( + ReplicaConfig(hostname=hostname, + tensor_parallel_ports=tensor_parallel_ports, + torch_dist_port=torch_dist_port, + gpu_indices=gpu_indices)) + lb_config = LoadBalancerConfig(port=mii_config.port_number, + replica_configs=replica_configs) create_score_file(deployment_name=deployment_name, deployment_type=deployment_type, @@ -165,6 +164,7 @@ def _allocate_processes(hostfile_path, tensor_parallel, num_replicas): assert resource_pool is not None and len( resource_pool) > 0, f'No hosts found in {hostfile_path}' + print(resource_pool) replica_pool = [] allocated_num = 0 for host, slots in resource_pool.items(): diff --git a/mii/server.py b/mii/server.py index 158b7a5c..626a2522 100644 --- a/mii/server.py +++ b/mii/server.py @@ -48,7 +48,7 @@ def __init__(self, self.port_number = mii_configs.port_number - if mii_configs.enable_load_balancing and mii_configs.hostfile is None: + if mii_configs.hostfile is None: raise ValueError( "hostfile must be provided if enable_load_balancing == True") @@ -60,11 +60,7 @@ def __init__(self, ds_config, mii_configs, lb_config) - deployment = lb_config.replica_configs if mii_configs.enable_load_balancing else [ - ReplicaConfig(hostname='localhost', - tensor_parallel_ports=[mii_configs.port_number], - torch_dist_port=mii_configs.torch_dist_port) - ] + deployment = lb_config.replica_configs self._wait_until_server_is_live(processes, deployment) def _wait_until_server_is_live(self, processes, deployment): @@ -297,78 +293,57 @@ def _initialize_service(self, lb_config): processes = [] - if mii_configs.enable_load_balancing: - - host_gpus = defaultdict(list) - for repl_config in lb_config.replica_configs: - host_gpus[repl_config.hostname].extend(repl_config.gpu_indices) - - # Start replica instances - for i, repl_config in enumerate(lb_config.replica_configs): - hostfile = tempfile.NamedTemporaryFile(delete=False) - hostfile.write( - f'{repl_config.hostname} slots={max(host_gpus[repl_config.hostname])+1}\n' - .encode()) - processes.append( + + host_gpus = defaultdict(list) + for repl_config in lb_config.replica_configs: + host_gpus[repl_config.hostname].extend(repl_config.gpu_indices) + + # Start replica instances + for i, repl_config in enumerate(lb_config.replica_configs): + hostfile = tempfile.NamedTemporaryFile(delete=False) + hostfile.write( + f'{repl_config.hostname} slots={max(host_gpus[repl_config.hostname])+1}\n' + .encode()) + processes.append( self._launch_deepspeed( - deployment_name, - model_name, - model_path, - ds_optimize, - ds_zero, - ds_config, - mii_configs, - hostfile.name, - repl_config.hostname, - repl_config.tensor_parallel_ports[0], - mii_configs.torch_dist_port + (100 * i) + - repl_config.gpu_indices[0], - repl_config.gpu_indices)) + deployment_name, + model_name, + model_path, + ds_optimize, + ds_zero, + ds_config, + mii_configs, + hostfile.name, + repl_config.hostname, + repl_config.tensor_parallel_ports[0], + mii_configs.torch_dist_port + (100 * i) + + repl_config.gpu_indices[0], + repl_config.gpu_indices)) # start load balancer here. # we don't use deepspeed launcher for the load balancer because it does not need a GPU. # The deepspeed launcher determines the number of processes to launch based on GPUs available on the host or CUDA_VISIBLE_DEVICES, # and it is expected to assign one GPU to one process. 
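        # The balancer is a plain subprocess on this host; it listens on
        # mii_configs.port_number and forwards requests to the per-replica
        # tensor-parallel ports recorded in lb_config.replica_configs.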
+ processes.append( + self._launch_load_balancer(deployment_name, + model_name, + model_path, + ds_optimize, + ds_zero, + ds_config, + mii_configs, + lb_config)) + + if mii_configs.enable_restful_api: + # start rest api server processes.append( - self._launch_load_balancer(deployment_name, - model_name, - model_path, - ds_optimize, - ds_zero, - ds_config, - mii_configs, - lb_config)) - - if mii_configs.enable_restful_api: - # start rest api server - processes.append( - self._launch_restful_gateway(deployment_name, - model_name, - model_path, - ds_optimize, - ds_zero, - ds_config, - mii_configs, - mii_configs.port_number)) - - return processes - else: - if self._is_socket_open("localhost", self.port_number): - raise RuntimeError( - f"Server is already running on port {self.port_number}, please shutdown or use different port." - ) + self._launch_restful_gateway(deployment_name, + model_name, + model_path, + ds_optimize, + ds_zero, + ds_config, + mii_configs, + mii_configs.port_number)) - processes.append( - self._launch_deepspeed(deployment_name, - model_name, - model_path, - ds_optimize, - ds_zero, - ds_config, - mii_configs, - '/dev/null', - 'localhost', - mii_configs.port_number, - mii_configs.torch_dist_port, - mii_configs.deploy_rank)) return processes From c68e999000b0669a2e85d64865c762257aa6284c Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Tue, 20 Jun 2023 17:24:13 +0000 Subject: [PATCH 02/69] Reformatting tests --- tests/test_local_deployment.py | 37 +++------------------------------- 1 file changed, 3 insertions(+), 34 deletions(-) diff --git a/tests/test_local_deployment.py b/tests/test_local_deployment.py index d28af701..b7ae8da5 100644 --- a/tests/test_local_deployment.py +++ b/tests/test_local_deployment.py @@ -43,11 +43,6 @@ def load_with_sys_mem(request): return request.param -@pytest.fixture(scope="function", params=[False]) -def enable_load_balancing(request): - return request.param - - @pytest.fixture(scope="function", params=[False]) def enable_restful_api(request): return request.param @@ -83,7 +78,6 @@ def mii_configs( tensor_parallel: int, port_number: int, load_with_sys_mem: bool, - enable_load_balancing: bool, enable_restful_api: bool, restful_api_port: int, ): @@ -91,18 +85,15 @@ def mii_configs( # Create a hostfile for DeepSpeed launcher when load_balancing is enabled hostfile = os.path.join(tmpdir, "hostfile") num_gpu = torch.cuda.device_count() - enable_load_balancing = enable_load_balancing or enable_restful_api - if enable_load_balancing: - with open(hostfile, "w") as f: - f.write(f"localhost slots={num_gpu}") + with open(hostfile, "w") as f: + f.write(f"localhost slots={num_gpu}") return { 'dtype': dtype, 'tensor_parallel': tensor_parallel, 'port_number': port_number, 'load_with_sys_mem': load_with_sys_mem, - 'enable_load_balancing': enable_load_balancing, - 'replica_num': num_gpu * enable_load_balancing // tensor_parallel, + 'replica_num': num_gpu * 1 // tensor_parallel, 'hostfile': hostfile, 'enable_restful_api': enable_restful_api, 'restful_api_port': restful_api_port, @@ -215,28 +206,6 @@ def test_single_GPU(local_deployment, query): assert result -@pytest.mark.local -@pytest.mark.parametrize("enable_load_balancing", [True]) -@pytest.mark.parametrize("tensor_parallel", [1, 2]) -@pytest.mark.parametrize( - "task_name, model_name, query", - [ - ( - "text-generation", - "bigscience/bloom-560m", - { - "query": ["DeepSpeed is the greatest"] - }, - ), - ], -) -def test_load_balancing(local_deployment, query): - generator = 
mii.mii_query_handle(local_deployment.deployment_name) - for _ in range(10): - result = generator.query(query) - assert result - - @pytest.mark.local @pytest.mark.parametrize("enable_restful_api", [True]) @pytest.mark.parametrize("restful_api_port", [28080]) From 5ce1a922fec29ce3d555b9847838ae7cb3ff18ee Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Tue, 20 Jun 2023 18:05:57 +0000 Subject: [PATCH 03/69] Fixed the formatting --- mii/client.py | 1 + mii/config.py | 4 +--- mii/server.py | 20 +++++++++----------- 3 files changed, 11 insertions(+), 14 deletions(-) diff --git a/mii/client.py b/mii/client.py index b2970a5e..964e9624 100644 --- a/mii/client.py +++ b/mii/client.py @@ -44,6 +44,7 @@ def mii_query_handle(deployment_name): [mii_configs.port_number + i for i in range(mii_configs.tensor_parallel)]) """ + def create_channel(host, port): return grpc.aio.insecure_channel(f'{host}:{port}', options=[('grpc.max_send_message_length', diff --git a/mii/config.py b/mii/config.py index 34e93ddd..d9e5aeb7 100644 --- a/mii/config.py +++ b/mii/config.py @@ -5,12 +5,10 @@ import torch from typing import Union, List from enum import Enum -from pydantic import BaseModel, validator, root_validator +from pydantic import BaseModel, validator from deepspeed.launcher.runner import DLTS_HOSTFILE -from .utils import logger - class DtypeEnum(Enum): # The torch dtype must always be the first value (so we return torch.dtype) diff --git a/mii/server.py b/mii/server.py index 626a2522..d62da132 100644 --- a/mii/server.py +++ b/mii/server.py @@ -14,7 +14,6 @@ import mii from mii.utils import get_num_gpus, logger -from mii.config import ReplicaConfig def config_to_b64_str(config): @@ -305,7 +304,7 @@ def _initialize_service(self, f'{repl_config.hostname} slots={max(host_gpus[repl_config.hostname])+1}\n' .encode()) processes.append( - self._launch_deepspeed( + self._launch_deepspeed( deployment_name, model_name, model_path, @@ -316,8 +315,7 @@ def _initialize_service(self, hostfile.name, repl_config.hostname, repl_config.tensor_parallel_ports[0], - mii_configs.torch_dist_port + (100 * i) + - repl_config.gpu_indices[0], + mii_configs.torch_dist_port + (100 * i) + repl_config.gpu_indices[0], repl_config.gpu_indices)) # start load balancer here. @@ -326,13 +324,13 @@ def _initialize_service(self, # and it is expected to assign one GPU to one process. 
processes.append( self._launch_load_balancer(deployment_name, - model_name, - model_path, - ds_optimize, - ds_zero, - ds_config, - mii_configs, - lb_config)) + model_name, + model_path, + ds_optimize, + ds_zero, + ds_config, + mii_configs, + lb_config)) if mii_configs.enable_restful_api: # start rest api server From fa10e19f9612b9452f6d02c0d85e33519e28052c Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Tue, 20 Jun 2023 18:12:01 +0000 Subject: [PATCH 04/69] Removed print statement --- mii/deployment.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mii/deployment.py b/mii/deployment.py index 254ac7b5..e4c36ae6 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -164,7 +164,6 @@ def _allocate_processes(hostfile_path, tensor_parallel, num_replicas): assert resource_pool is not None and len( resource_pool) > 0, f'No hosts found in {hostfile_path}' - print(resource_pool) replica_pool = [] allocated_num = 0 for host, slots in resource_pool.items(): From 8970f4e69a9dd368574f084a6bb246f1ed0e926e Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Mon, 26 Jun 2023 16:54:34 +0000 Subject: [PATCH 05/69] Removing unused import --- mii/server.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mii/server.py b/mii/server.py index f0bdad0b..35c1745d 100644 --- a/mii/server.py +++ b/mii/server.py @@ -14,7 +14,6 @@ import mii from mii.utils import get_num_gpus, logger, get_provider_name -from mii.config import ReplicaConfig def config_to_b64_str(config): From 517bea8c6df19195c088c463d343a6831303afdf Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Mon, 26 Jun 2023 17:39:52 +0000 Subject: [PATCH 06/69] Fixing tests --- tests/test_non_persistent_deployment.py | 30 ++----------------------- 1 file changed, 2 insertions(+), 28 deletions(-) diff --git a/tests/test_non_persistent_deployment.py b/tests/test_non_persistent_deployment.py index 3fd5825b..c2347581 100644 --- a/tests/test_non_persistent_deployment.py +++ b/tests/test_non_persistent_deployment.py @@ -16,22 +16,18 @@ def mii_configs( dtype: str, tensor_parallel: int, load_with_sys_mem: bool, - enable_load_balancing: bool, ): # Create a hostfile for DeepSpeed launcher when load_balancing is enabled hostfile = os.path.join(tmpdir, "hostfile") num_gpu = torch.cuda.device_count() - enable_load_balancing = enable_load_balancing - if enable_load_balancing: - with open(hostfile, "w") as f: - f.write(f"localhost slots={num_gpu}") + with open(hostfile, "w") as f: + f.write(f"localhost slots={num_gpu}") return { 'dtype': dtype, 'tensor_parallel': tensor_parallel, 'load_with_sys_mem': load_with_sys_mem, - 'enable_load_balancing': enable_load_balancing, } @@ -134,25 +130,3 @@ def test_single_GPU(non_persistent_deployment, query): generator = mii.mii_query_handle(non_persistent_deployment.deployment_name) result = generator.query(query) assert result - - -@pytest.mark.local -@pytest.mark.parametrize("enable_load_balancing", [True]) -@pytest.mark.parametrize("expected_failure", [AssertionError]) -@pytest.mark.parametrize("tensor_parallel", [1, 2]) -@pytest.mark.parametrize( - "task_name, model_name, query", - [ - ( - "text-generation", - "bigscience/bloom-560m", - { - "query": ["DeepSpeed is the greatest"] - }, - ), - ], -) -def test_load_balancing(non_persistent_deployment, query): - print(f"TESTING NON_PERSISTENT_DEPLOYMENT: {non_persistent_deployment}") - assert "Cannot use Load Balancing with Non persistent deployment" in str( - non_persistent_deployment.value) From 58dd2b2eb504ddb5170b26f7521cbcdda85bb63f Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: 
Mon, 26 Jun 2023 17:44:13 +0000 Subject: [PATCH 07/69] Fixing merge issue --- mii/deployment.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mii/deployment.py b/mii/deployment.py index 040142e4..fd5a11c9 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -143,7 +143,6 @@ def deploy(task, elif deployment_type == DeploymentType.LOCAL: return _deploy_local(deployment_name, model_path=model_path) elif deployment_type == DeploymentType.NON_PERSISTENT: - assert not mii_config.enable_load_balancing, "Cannot use Load Balancing with Non persistent deployment" assert int(os.getenv('WORLD_SIZE', '1')) == mii_config.tensor_parallel, "World Size does not equal number of tensors. When using non-persistent deployment type, please launch with `deepspeed --num_gpus `" provider = MODEL_PROVIDER_MAP[get_provider_name(model, task)] mii.non_persistent_models[deployment_name] = (load_models( From bb0d5518f11ff122ac17119837e849dc42c31dc6 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Mon, 26 Jun 2023 18:36:06 +0000 Subject: [PATCH 08/69] Creating hostfile when one is not provided --- mii/client.py | 7 ------- mii/server.py | 8 ++++++-- tests/test_local_deployment.py | 2 +- tests/utils.py | 5 ----- 4 files changed, 7 insertions(+), 15 deletions(-) diff --git a/mii/client.py b/mii/client.py index f2fb887e..535b55c8 100644 --- a/mii/client.py +++ b/mii/client.py @@ -41,13 +41,6 @@ def mii_query_handle(deployment_name): task_name, mii_configs = _get_deployment_info(deployment_name) return MIIClient(task_name, "localhost", mii_configs.port_number) - """ - else: - return MIITensorParallelClient( - task_name, - "localhost", - [mii_configs.port_number + i for i in range(mii_configs.tensor_parallel)]) - """ def create_channel(host, port): diff --git a/mii/server.py b/mii/server.py index 35c1745d..c734beba 100644 --- a/mii/server.py +++ b/mii/server.py @@ -9,6 +9,7 @@ import sys import tempfile import time +import torch from pathlib import Path from collections import defaultdict @@ -48,8 +49,11 @@ def __init__(self, self.port_number = mii_configs.port_number if mii_configs.hostfile is None: - raise ValueError( - "hostfile must be provided if enable_load_balancing == True") + hostfile = tempfile.NamedTemporaryFile() + num_gpu = torch.cuda.device_count() + with open(hostfile, "w") as f: + f.write(f"localhost slots={num_gpu}") + mii.configs.hostfile = hostfile processes = self._initialize_service(deployment_name, model_name, diff --git a/tests/test_local_deployment.py b/tests/test_local_deployment.py index ac5befd6..b3cd8218 100644 --- a/tests/test_local_deployment.py +++ b/tests/test_local_deployment.py @@ -53,7 +53,7 @@ def mii_configs( 'tensor_parallel': tensor_parallel, 'port_number': port_number, 'load_with_sys_mem': load_with_sys_mem, - 'replica_num': num_gpu * 1 // tensor_parallel, + 'replica_num': num_gpu // tensor_parallel, 'hostfile': hostfile, 'enable_restful_api': enable_restful_api, 'restful_api_port': restful_api_port, diff --git a/tests/utils.py b/tests/utils.py index 3fd2b950..babec323 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -49,8 +49,3 @@ def ds_config(request): @pytest.fixture(scope="function", params=[None]) def expected_failure(request): return request.param - - -@pytest.fixture(scope="function", params=[False]) -def enable_load_balancing(request): - return request.param From 3823534e07d03c43f91a19c9e531bdb07c2302fd Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Mon, 26 Jun 2023 20:24:00 +0000 Subject: [PATCH 09/69] Fixing import statements removed by merge --- mii/config.py | 
4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mii/config.py b/mii/config.py index 30604e74..2d5d520a 100644 --- a/mii/config.py +++ b/mii/config.py @@ -5,10 +5,12 @@ import torch from typing import Union, List from enum import Enum -from pydantic import BaseModel, validator +from pydantic import BaseModel, validator, root_validator from deepspeed.launcher.runner import DLTS_HOSTFILE +from .utils import logger + class DtypeEnum(Enum): # The torch dtype must always be the first value (so we return torch.dtype) From 6f9b4ad9c81e2ea10102193b8433a89c532f1fe6 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Mon, 26 Jun 2023 20:59:45 +0000 Subject: [PATCH 10/69] Removing load_balancing check --- mii/config.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/mii/config.py b/mii/config.py index 2d5d520a..6a8bac16 100644 --- a/mii/config.py +++ b/mii/config.py @@ -9,8 +9,6 @@ from deepspeed.launcher.runner import DLTS_HOSTFILE -from .utils import logger - class DtypeEnum(Enum): # The torch dtype must always be the first value (so we return torch.dtype) @@ -92,13 +90,6 @@ def checkpoint_dict_valid(cls, value): raise ValueError(f"Missing key={k} in checkpoint_dict") return value - @root_validator - def auto_enable_load_balancing(cls, values): - if values["enable_restful_api"] and not values["enable_load_balancing"]: - logger.warn("Restful API is enabled, enabling Load Balancing") - values["enable_load_balancing"] = True - return values - @root_validator def meta_tensor_or_sys_mem(cls, values): if values.get("meta_tensor") and values.get("load_with_sys_mem"): From 499b9ad83a192a24b47252d6e00ad4f7bf02f0ee Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Mon, 26 Jun 2023 21:39:56 +0000 Subject: [PATCH 11/69] Removing redudant definitions --- mii/deployment.py | 1 - mii/server.py | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/mii/deployment.py b/mii/deployment.py index fd5a11c9..3cadd994 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -105,7 +105,6 @@ def deploy(task, model_path = "model" # add fields for replica deployment - lb_config = None replica_pool = _allocate_processes(mii_config.hostfile, mii_config.tensor_parallel, mii_config.replica_num) diff --git a/mii/server.py b/mii/server.py index c734beba..77e50e26 100644 --- a/mii/server.py +++ b/mii/server.py @@ -63,8 +63,7 @@ def __init__(self, ds_config, mii_configs, lb_config) - deployment = lb_config.replica_configs - self._wait_until_server_is_live(processes, deployment) + self._wait_until_server_is_live(processes, lb_config.replica_configs) def _wait_until_server_is_live(self, processes, deployment): for process, repl_config in zip(processes, deployment): From 5419ef6f649b1ec2bfeb68385786b442cac56fa1 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Mon, 26 Jun 2023 22:23:20 +0000 Subject: [PATCH 12/69] Removing hostfile from test --- tests/test_local_deployment.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/test_local_deployment.py b/tests/test_local_deployment.py index 1acf91c3..3d1ad214 100644 --- a/tests/test_local_deployment.py +++ b/tests/test_local_deployment.py @@ -43,12 +43,7 @@ def mii_configs( restful_api_port: int, ): - # Create a hostfile for DeepSpeed launcher when load_balancing is enabled - hostfile = os.path.join(tmpdir, "hostfile") num_gpu = torch.cuda.device_count() - with open(hostfile, "w") as f: - f.write(f"localhost slots={num_gpu}") - return { 'dtype': dtype, 'tensor_parallel': tensor_parallel, @@ -56,7 +51,6 @@ def mii_configs( 'meta_tensor': 
meta_tensor, 'load_with_sys_mem': load_with_sys_mem, 'replica_num': num_gpu // tensor_parallel, - 'hostfile': hostfile, 'enable_restful_api': enable_restful_api, 'restful_api_port': restful_api_port, } From a70b6de25a835cefd969d01dd917290db892cbfa Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Mon, 26 Jun 2023 23:10:33 +0000 Subject: [PATCH 13/69] Removing hostfile from non-persistent test --- tests/test_non_persistent_deployment.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tests/test_non_persistent_deployment.py b/tests/test_non_persistent_deployment.py index c2347581..50861493 100644 --- a/tests/test_non_persistent_deployment.py +++ b/tests/test_non_persistent_deployment.py @@ -4,7 +4,6 @@ # DeepSpeed Team import pytest import os -import torch from types import SimpleNamespace from .utils import * # noqa: F401 import mii @@ -18,12 +17,6 @@ def mii_configs( load_with_sys_mem: bool, ): - # Create a hostfile for DeepSpeed launcher when load_balancing is enabled - hostfile = os.path.join(tmpdir, "hostfile") - num_gpu = torch.cuda.device_count() - with open(hostfile, "w") as f: - f.write(f"localhost slots={num_gpu}") - return { 'dtype': dtype, 'tensor_parallel': tensor_parallel, From eea658beb7e1ffff5b6b6e778880bd7d70f7e914 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Tue, 27 Jun 2023 21:33:42 +0000 Subject: [PATCH 14/69] initial changes --- mii/__init__.py | 1 + mii/config.py | 14 ++++++- mii/constants.py | 2 +- mii/deployment.py | 80 +++++++++++++++++------------------- mii/models/score/generate.py | 59 +++++++++++++------------- mii/server.py | 9 +--- 6 files changed, 83 insertions(+), 82 deletions(-) diff --git a/mii/__init__.py b/mii/__init__.py index ab409d4c..6b9740e7 100644 --- a/mii/__init__.py +++ b/mii/__init__.py @@ -15,6 +15,7 @@ __version__ = "0.0.0" non_persistent_models = {} +multi_model_deployments = {} try: from .version import __version__ except ImportError: diff --git a/mii/config.py b/mii/config.py index 6a8bac16..19889740 100644 --- a/mii/config.py +++ b/mii/config.py @@ -123,4 +123,16 @@ class LoadBalancerConfig(BaseModel): class Config: validate_all = True - validate_assignment = True +validate_assignment = True + + +class Deployment(BaseModel): + deployment_name: str + task: str + model: str + enable_deepspeed: bool = True + enable_zero: bool = True + GPU_index_map: dict = None + mii_config: dict = None + ds_config: dict = None + version: int = 1 diff --git a/mii/constants.py b/mii/constants.py index ba4cfa2f..29493433 100644 --- a/mii/constants.py +++ b/mii/constants.py @@ -94,7 +94,7 @@ class ModelProvider(enum.Enum): DEPLOYMENT_NAME_KEY = 'deployment_name' MODEL_PATH_KEY = 'model_path' LOAD_BALANCER_CONFIG_KEY = 'load_balancer_config' - +DEPLOYMENT_TAG_KEY = 'deployment_tag' ENABLE_DEEPSPEED_KEY = 'ds_optimize' ENABLE_DEEPSPEED_ZERO_KEY = 'ds_zero' DEEPSPEED_CONFIG_KEY = 'ds_config' diff --git a/mii/deployment.py b/mii/deployment.py index 3cadd994..afb5abf9 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -16,16 +16,10 @@ from .config import ReplicaConfig, LoadBalancerConfig -def deploy(task, - model, - deployment_name, +def deploy(deployment_tag, + deployments, deployment_type=DeploymentType.LOCAL, - model_path=None, - enable_deepspeed=True, - enable_zero=False, - ds_config=None, - mii_config={}, - version=1): + model_path=None): """Deploy a task using specified model. 
For usage examples see: mii/examples/local/text-generation-example.py @@ -66,15 +60,19 @@ def deploy(task, If deployment_type is `LOCAL`, returns just the name of the deployment that can be used to create a query handle using `mii.mii_query_handle(deployment_name)` """ - + mii.multi_model_deployments[deployment_tag] = deployments + ports = set() # parse and validate mii config - mii_config = mii.config.MIIConfig(**mii_config) - if enable_zero: - if ds_config.get("fp16", {}).get("enabled", False): - assert (mii_config.dtype == torch.half), "MII Config Error: MII dtype and ZeRO dtype must match" - else: - assert (mii_config.dtype == torch.float), "MII Config Error: MII dtype and ZeRO dtype must match" - assert not (enable_deepspeed and enable_zero), "MII Config Error: DeepSpeed and ZeRO cannot both be enabled, select only one" + for deployment in deployments: + mii_config = mii.config.MIIConfig(**deployment.mii_config) + assert mii_config.port_number not in ports, f"duplicate port numbers not allowed - {mii.config.port_number}" + ports.add(mii_config.port_number) + if deployment.enable_zero: + if deployment.ds_config.get("fp16", {}).get("enabled", False): + assert (mii_config.dtype == torch.half), "MII Config Error: MII dtype and ZeRO dtype must match" + else: + assert (mii_config.dtype == torch.float), "MII Config Error: MII dtype and ZeRO dtype must match" + assert not (enable_deepspeed and enable_zero), "MII Config Error: DeepSpeed and ZeRO cannot both be enabled, select only one" # aml only allows certain characters for deployment names if deployment_type == DeploymentType.AML: @@ -82,21 +80,22 @@ def deploy(task, string.digits + '-') assert set(deployment_name) <= allowed_chars, "AML deployment names can only contain a-z, A-Z, 0-9, and '-'" - task = mii.utils.get_task(task) + for deployment in deployments: + deployment.task = mii.utils.get_task(deployment.task) - if not mii_config.skip_model_check: - mii.utils.check_if_task_and_model_is_valid(task, model) - if enable_deepspeed: - mii.utils.check_if_task_and_model_is_supported(task, model) + if not mii_config.skip_model_check: + mii.utils.check_if_task_and_model_is_valid(deployment.task, deployment.model) + if enable_deepspeed: + mii.utils.check_if_task_and_model_is_supported(deployment.task, deployment.model) - if enable_deepspeed: - logger.info( - f"************* MII is using DeepSpeed Optimizations to accelerate your model *************" - ) - else: - logger.info( - f"************* DeepSpeed Optimizations not enabled. Please use enable_deepspeed to get better performance *************" - ) + if enable_deepspeed: + logger.info( + f"************* MII is using DeepSpeed Optimizations to accelerate your model: {deployment.model} *************" + ) + else: + logger.info( + f"************* DeepSpeed Optimizations not enabled. 
Please use enable_deepspeed to get better performance for: {deployment.model} *************" + ) # In local deployments use default path if no model path set if model_path is None and deployment_type == DeploymentType.LOCAL: @@ -126,21 +125,16 @@ def deploy(task, replica_configs=replica_configs) if deployment_type != DeploymentType.NON_PERSISTENT: - create_score_file(deployment_name=deployment_name, + create_score_file(deployment_tag=deployment_tag, + deployments=deployments, deployment_type=deployment_type, - task=task, - model_name=model, - ds_optimize=enable_deepspeed, - ds_zero=enable_zero, - ds_config=ds_config, - mii_config=mii_config, model_path=model_path, lb_config=lb_config) if deployment_type == DeploymentType.AML: - _deploy_aml(deployment_name=deployment_name, model_name=model, version=version) + _deploy_aml(deployment_tag=deployment_tag, model_name=model, version=version) elif deployment_type == DeploymentType.LOCAL: - return _deploy_local(deployment_name, model_path=model_path) + return _deploy_local(deployment_tag, model_path=model_path) elif deployment_type == DeploymentType.NON_PERSISTENT: assert int(os.getenv('WORLD_SIZE', '1')) == mii_config.tensor_parallel, "World Size does not equal number of tensors. When using non-persistent deployment type, please launch with `deepspeed --num_gpus `" provider = MODEL_PROVIDER_MAP[get_provider_name(model, task)] @@ -157,14 +151,14 @@ def deploy(task, raise Exception(f"Unknown deployment type: {deployment_type}") -def _deploy_local(deployment_name, model_path): - mii.utils.import_score_file(deployment_name).init() +def _deploy_local(deployment_tag, model_path): + mii.utils.import_score_file(deployment_tag).init() -def _deploy_aml(deployment_name, model_name, version): +def _deploy_aml(deployment_tag, model_name, version): acr_name = mii.aml_related.utils.get_acr_name() mii.aml_related.utils.generate_aml_scripts(acr_name=acr_name, - deployment_name=deployment_name, + deployment_name=deployment_tag, model_name=model_name, version=version) print( diff --git a/mii/models/score/generate.py b/mii/models/score/generate.py index 1184d70e..6da8f3d9 100644 --- a/mii/models/score/generate.py +++ b/mii/models/score/generate.py @@ -9,53 +9,52 @@ from mii.constants import DeploymentType -def create_score_file(deployment_name, +def create_score_file(deployment_tag, deployment_type, - task, - model_name, - ds_optimize, - ds_zero, - ds_config, - mii_config, + deployments, model_path, - lb_config): + lb_config) + config_dict = {} - config_dict[mii.constants.DEPLOYMENT_NAME_KEY] = deployment_name - config_dict[mii.constants.TASK_NAME_KEY] = mii.utils.get_task_name(task) - config_dict[mii.constants.MODEL_NAME_KEY] = model_name - config_dict[mii.constants.ENABLE_DEEPSPEED_KEY] = ds_optimize - config_dict[mii.constants.MII_CONFIGS_KEY] = mii_config.dict() - config_dict[mii.constants.ENABLE_DEEPSPEED_ZERO_KEY] = ds_zero - config_dict[mii.constants.DEEPSPEED_CONFIG_KEY] = ds_config config_dict[mii.constants.MODEL_PATH_KEY] = model_path - - if lb_config is not None: - config_dict[mii.constants.LOAD_BALANCER_CONFIG_KEY] = lb_config - - if len(mii.__path__) > 1: - logger.warning( - f"Detected mii path as multiple sources: {mii.__path__}, might cause unknown behavior" - ) + config_dict[mii.constants.DEPLOYMENT_TAG_KEY] = deployment_tag + for deployment in deployments: + config_dict[deployment.deployment_name] = {} + config_dict[deployment.deployment_name][mii.constants.DEPLOYMENT_NAME_KEY] = deployment_name + 
config_dict[deployment.deployment_name][mii.constants.TASK_NAME_KEY] = mii.utils.get_task_name(task) + config_dict[deployment.deployment_name][mii.constants.MODEL_NAME_KEY] = model_name + config_dict[deployment.deployment_name][mii.constants.ENABLE_DEEPSPEED_KEY] = ds_optimize + config_dict[deployment.deployment_name][mii.constants.MII_CONFIGS_KEY] = mii_config.dict() + config_dict[deployment.deployment_name][mii.constants.ENABLE_DEEPSPEED_ZERO_KEY] = ds_zero + config_dict[deployment.deployment_name][mii.constants.DEEPSPEED_CONFIG_KEY] = ds_config + + if lb_config is not None: + config_dict[deployment.deployment_name][mii.constants.LOAD_BALANCER_CONFIG_KEY] = lb_config + + if len(mii.__path__) > 1: + logger.warning( + f"Detected mii path as multiple sources: {mii.__path__}, might cause unknown behavior" + ) with open(os.path.join(mii.__path__[0], - "models/score/score_template.py"), - "r") as fd: + "models/score/score_template.py"), + "r") as fd: score_src = fd.read() # update score file w. global config dict source_with_config = f"{score_src}\n" source_with_config += f"configs = {pprint.pformat(config_dict, indent=4)}" - with open(generated_score_path(deployment_name, deployment_type), "w") as fd: - fd.write(source_with_config) + with open(generated_score_path(deployment_tag, deployment_type), "w") as fd: + fd.write(source_with_config): fd.write("\n") -def generated_score_path(deployment_name, deployment_type): +def generated_score_path(deployment_tag, deployment_type): if deployment_type == DeploymentType.LOCAL: - score_path = os.path.join(mii.utils.mii_cache_path(), deployment_name) + score_path = os.path.join(mii.utils.mii_cache_path(), deployment_tag) elif deployment_type == DeploymentType.AML: - score_path = os.path.join(mii.aml_related.utils.aml_output_path(deployment_name), + score_path = os.path.join(mii.aml_related.utils.aml_output_path(deployment_tag), "code") if not os.path.isdir(score_path): os.makedirs(score_path) diff --git a/mii/server.py b/mii/server.py index 77e50e26..61b5f9cb 100644 --- a/mii/server.py +++ b/mii/server.py @@ -29,14 +29,9 @@ def config_to_b64_str(config): class MIIServer(): '''Initialize the model, setup the server for the model under model_path''' def __init__(self, - deployment_name, - task_name, - model_name, + deployment_tag, + deployments, model_path, - ds_optimize=True, - ds_zero=False, - ds_config=None, - mii_configs={}, lb_config=None): mii_configs = mii.config.MIIConfig(**mii_configs) From c21c31bf87f9c3dbbc0550ac09076a764f736d36 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Wed, 28 Jun 2023 18:02:17 +0000 Subject: [PATCH 15/69] Maintaining current behavior --- mii/config.py | 1 + mii/deployment.py | 18 ++++++++++++++++-- mii/models/score/generate.py | 4 ++-- mii/models/score/score_template.py | 2 ++ mii/server.py | 29 +++++++++++------------------ 5 files changed, 32 insertions(+), 22 deletions(-) diff --git a/mii/config.py b/mii/config.py index 19889740..1e74df85 100644 --- a/mii/config.py +++ b/mii/config.py @@ -107,6 +107,7 @@ class Config: class ReplicaConfig(BaseModel): + deployment_name: str = "" hostname: str = "" tensor_parallel_ports: List[int] = [] torch_dist_port: int = None diff --git a/mii/deployment.py b/mii/deployment.py index afb5abf9..c0175c09 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -16,8 +16,15 @@ from .config import ReplicaConfig, LoadBalancerConfig -def deploy(deployment_tag, - deployments, +def deploy(task=None, + model=None, + deployment_name=None, + enable_deepspeed=True, + enable_zero=False, + 
ds_config=None, + mii_config={}, + deployment_tag=None, + deployments=[], deployment_type=DeploymentType.LOCAL, model_path=None): """Deploy a task using specified model. For usage examples see: @@ -60,6 +67,13 @@ def deploy(deployment_tag, If deployment_type is `LOCAL`, returns just the name of the deployment that can be used to create a query handle using `mii.mii_query_handle(deployment_name)` """ + if len(deployments == 0): + assert model is not None and task is not None and deployment_name is not None, "model, task, and deployment name must be set to deploy sigular model" + deployments = [Deployment(deployment_name, task, model, enable_deepspeed, enable_zero, None, mii_config, ds_config, version)] + deployment_tag = deployment_name + "_tag" + else: + assert deployment_tag is not None, "deployment_tag must be set to deploy multiple models" + mii.multi_model_deployments[deployment_tag] = deployments ports = set() # parse and validate mii config diff --git a/mii/models/score/generate.py b/mii/models/score/generate.py index 6da8f3d9..68036960 100644 --- a/mii/models/score/generate.py +++ b/mii/models/score/generate.py @@ -13,7 +13,7 @@ def create_score_file(deployment_tag, deployment_type, deployments, model_path, - lb_config) + lb_config): config_dict = {} config_dict[mii.constants.MODEL_PATH_KEY] = model_path @@ -46,7 +46,7 @@ def create_score_file(deployment_tag, source_with_config += f"configs = {pprint.pformat(config_dict, indent=4)}" with open(generated_score_path(deployment_tag, deployment_type), "w") as fd: - fd.write(source_with_config): + fd.write(source_with_config) fd.write("\n") diff --git a/mii/models/score/score_template.py b/mii/models/score/score_template.py index 04e47fae..80c220df 100644 --- a/mii/models/score/score_template.py +++ b/mii/models/score/score_template.py @@ -16,6 +16,8 @@ def init(): model_path = mii.utils.full_model_path(configs[mii.constants.MODEL_PATH_KEY]) + deployment_tag = configs[mii.constants.DEPLOYMENT_TAG_KEY] + deployments = mii.multi_model_deployments[deployment_tag] deployment_name = configs[mii.constants.DEPLOYMENT_NAME_KEY] model_name = configs[mii.constants.MODEL_NAME_KEY] diff --git a/mii/server.py b/mii/server.py index 7c2e58dd..c7aff4de 100644 --- a/mii/server.py +++ b/mii/server.py @@ -50,13 +50,9 @@ def __init__(self, f.write(f"localhost slots={num_gpu}") mii.configs.hostfile = hostfile - processes = self._initialize_service(deployment_name, - model_name, + processes = self._initialize_service(deployment_tag, + deployments, model_path, - ds_optimize, - ds_zero, - ds_config, - mii_configs, lb_config) self._wait_until_server_is_live(processes, lb_config.replica_configs) @@ -273,13 +269,9 @@ def _launch_deepspeed(self, ds_launch_str=ds_launch_str) def _initialize_service(self, - deployment_name, - model_name, + deployment_tag, + deployments, model_path, - ds_optimize, - ds_zero, - ds_config, - mii_configs, lb_config): processes = [] @@ -290,19 +282,20 @@ def _initialize_service(self, # Start replica instances for i, repl_config in enumerate(lb_config.replica_configs): + name = repl_config.deployment_name hostfile = tempfile.NamedTemporaryFile(delete=False) hostfile.write( f'{repl_config.hostname} slots={max(host_gpus[repl_config.hostname])+1}\n' .encode()) processes.append( self._launch_deepspeed( - deployment_name, - model_name, + name, + deployments[name].model, model_path, - ds_optimize, - ds_zero, - ds_config, - mii_configs, + deployments[name].enable_deepspeed, + deployments[name].enable_zero, + deployments[name].ds_config, + 
deployments[name].mii_configs, hostfile.name, repl_config.hostname, repl_config.tensor_parallel_ports[0], From f5253298654fce56156c2750c6184f7b967ddfe8 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Wed, 28 Jun 2023 19:04:12 +0000 Subject: [PATCH 16/69] Reading from score file --- mii/config.py | 2 +- mii/deployment.py | 4 ++-- mii/models/score/score_template.py | 4 +++- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/mii/config.py b/mii/config.py index 1e74df85..954cecd7 100644 --- a/mii/config.py +++ b/mii/config.py @@ -134,6 +134,6 @@ class Deployment(BaseModel): enable_deepspeed: bool = True enable_zero: bool = True GPU_index_map: dict = None - mii_config: dict = None + mii_config: MIIConfig = None ds_config: dict = None version: int = 1 diff --git a/mii/deployment.py b/mii/deployment.py index c0175c09..a8998a9e 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -67,8 +67,8 @@ def deploy(task=None, If deployment_type is `LOCAL`, returns just the name of the deployment that can be used to create a query handle using `mii.mii_query_handle(deployment_name)` """ - if len(deployments == 0): - assert model is not None and task is not None and deployment_name is not None, "model, task, and deployment name must be set to deploy sigular model" + if not deployments: + assert all((model, task, deployment_name)), "model, task, and deployment name must be set to deploy singular model" deployments = [Deployment(deployment_name, task, model, enable_deepspeed, enable_zero, None, mii_config, ds_config, version)] deployment_tag = deployment_name + "_tag" else: diff --git a/mii/models/score/score_template.py b/mii/models/score/score_template.py index 80c220df..0681ac2f 100644 --- a/mii/models/score/score_template.py +++ b/mii/models/score/score_template.py @@ -17,7 +17,9 @@ def init(): model_path = mii.utils.full_model_path(configs[mii.constants.MODEL_PATH_KEY]) deployment_tag = configs[mii.constants.DEPLOYMENT_TAG_KEY] - deployments = mii.multi_model_deployments[deployment_tag] + deployments = [] + for deployment in configs.values(): + deployments.append(Deployment(deployment[mii.constants.DEPLOYMENT_NAME_KEY], deployment[mii.constants.TASK_NAME_KEY], deployment[mii.constants.DEPLOYMENT_MODEL_NAME_KEY], deloyment[mii.constants.ENABLE_DEEPSPEED_KEY], deployment[mii.constants.ENABLE_DEEPSPEED_ZERO_KEY], None, deployment[mii.constants.MII_CONFIGS_KEY], deployment[mii.constants.DS_CONFIG_KEY], 1)) deployment_name = configs[mii.constants.DEPLOYMENT_NAME_KEY] model_name = configs[mii.constants.MODEL_NAME_KEY] From 3c0937f2a07bb8b71d1864f5a299bf9b52124211 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Wed, 28 Jun 2023 21:36:41 +0000 Subject: [PATCH 17/69] fixing syntax errors --- mii/__init__.py | 2 +- mii/config.py | 4 ++-- mii/deployment.py | 7 ++++--- mii/grpc_related/modelresponse_server.py | 1 + 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/mii/__init__.py b/mii/__init__.py index 6b9740e7..5c84d1dc 100644 --- a/mii/__init__.py +++ b/mii/__init__.py @@ -10,7 +10,7 @@ from .constants import DeploymentType, Tasks from .aml_related.utils import aml_output_path -from .config import MIIConfig, LoadBalancerConfig +from .config import MIIConfig, LoadBalancerConfig, Deployment from .grpc_related.proto import modelresponse_pb2_grpc __version__ = "0.0.0" diff --git a/mii/config.py b/mii/config.py index 954cecd7..531ee800 100644 --- a/mii/config.py +++ b/mii/config.py @@ -132,8 +132,8 @@ class Deployment(BaseModel): task: str model: str enable_deepspeed: bool = True - 
enable_zero: bool = True + enable_zero: bool = False GPU_index_map: dict = None - mii_config: MIIConfig = None + mii_config: MIIConfig = MIIConfig.parse_obj({}) ds_config: dict = None version: int = 1 diff --git a/mii/deployment.py b/mii/deployment.py index a8998a9e..01dbd71b 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -78,9 +78,10 @@ def deploy(task=None, ports = set() # parse and validate mii config for deployment in deployments: - mii_config = mii.config.MIIConfig(**deployment.mii_config) - assert mii_config.port_number not in ports, f"duplicate port numbers not allowed - {mii.config.port_number}" - ports.add(mii_config.port_number) + mii_config = deployment.mii_config + print(mii_config) + assert mii_config.port_number not in ports, f"duplicate port numbers not allowed - {mii_config.port_number}" + #ports.add(mii_config.port_number) if deployment.enable_zero: if deployment.ds_config.get("fp16", {}).get("enabled", False): assert (mii_config.dtype == torch.half), "MII Config Error: MII dtype and ZeRO dtype must match" diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index 4a0a5d00..47325f6c 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -173,6 +173,7 @@ def __init__(self, task_name, replica_configs): replica.tensor_parallel_ports) for replica in replica_configs ] + print(self.stubs) self.counter = AtomicCounter() self.task = get_task(task_name) self.replica_sessions = {} From 156ac8391f54fb95a6f204d88366b287c930ca1d Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Wed, 28 Jun 2023 22:58:26 +0000 Subject: [PATCH 18/69] Fixing more syntax errors --- mii/deployment.py | 37 ++++++++++++++++-------------- mii/models/score/generate.py | 14 +++++------ mii/models/score/score_template.py | 20 ++++++++-------- mii/server.py | 23 +++++++++++-------- 4 files changed, 52 insertions(+), 42 deletions(-) diff --git a/mii/deployment.py b/mii/deployment.py index 01dbd71b..ecdb95c2 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -119,30 +119,33 @@ def deploy(task=None, model_path = "model" # add fields for replica deployment - replica_pool = _allocate_processes(mii_config.hostfile, - mii_config.tensor_parallel, - mii_config.replica_num) replica_configs = [] - for i, (hostname, gpu_indices) in enumerate(replica_pool): - # Reserver port for a LB proxy when replication is enabled - port_offset = 1 - base_port = mii_config.port_number + i * mii_config.tensor_parallel + port_offset - tensor_parallel_ports = list( - range(base_port, - base_port + mii_config.tensor_parallel)) - torch_dist_port = mii_config.torch_dist_port + i - replica_configs.append( - ReplicaConfig(hostname=hostname, - tensor_parallel_ports=tensor_parallel_ports, - torch_dist_port=torch_dist_port, - gpu_indices=gpu_indices)) + for deployment in deployments: + mii_config = deployment.mii_config + replica_pool = _allocate_processes(mii_config.hostfile, + mii_config.tensor_parallel, + mii_config.replica_num) + + for i, (hostname, gpu_indices) in enumerate(replica_pool): + # Reserver port for a LB proxy when replication is enabled + port_offset = 1 + base_port = mii_config.port_number + i * mii_config.tensor_parallel + port_offset + tensor_parallel_ports = list( + range(base_port, + base_port + mii_config.tensor_parallel)) + torch_dist_port = mii_config.torch_dist_port + i + replica_configs.append( + ReplicaConfig(hostname=hostname, + tensor_parallel_ports=tensor_parallel_ports, + torch_dist_port=torch_dist_port, + 
gpu_indices=gpu_indices)) lb_config = LoadBalancerConfig(port=mii_config.port_number, replica_configs=replica_configs) if deployment_type != DeploymentType.NON_PERSISTENT: create_score_file(deployment_tag=deployment_tag, - deployments=deployments, deployment_type=deployment_type, + deployments=deployments, model_path=model_path, lb_config=lb_config) diff --git a/mii/models/score/generate.py b/mii/models/score/generate.py index 68036960..27716bd6 100644 --- a/mii/models/score/generate.py +++ b/mii/models/score/generate.py @@ -20,13 +20,13 @@ def create_score_file(deployment_tag, config_dict[mii.constants.DEPLOYMENT_TAG_KEY] = deployment_tag for deployment in deployments: config_dict[deployment.deployment_name] = {} - config_dict[deployment.deployment_name][mii.constants.DEPLOYMENT_NAME_KEY] = deployment_name - config_dict[deployment.deployment_name][mii.constants.TASK_NAME_KEY] = mii.utils.get_task_name(task) - config_dict[deployment.deployment_name][mii.constants.MODEL_NAME_KEY] = model_name - config_dict[deployment.deployment_name][mii.constants.ENABLE_DEEPSPEED_KEY] = ds_optimize - config_dict[deployment.deployment_name][mii.constants.MII_CONFIGS_KEY] = mii_config.dict() - config_dict[deployment.deployment_name][mii.constants.ENABLE_DEEPSPEED_ZERO_KEY] = ds_zero - config_dict[deployment.deployment_name][mii.constants.DEEPSPEED_CONFIG_KEY] = ds_config + config_dict[deployment.deployment_name][mii.constants.DEPLOYMENT_NAME_KEY] = deployment.deployment_name + config_dict[deployment.deployment_name][mii.constants.TASK_NAME_KEY] = mii.utils.get_task_name(deployment.task) + config_dict[deployment.deployment_name][mii.constants.MODEL_NAME_KEY] = deployment.model + config_dict[deployment.deployment_name][mii.constants.ENABLE_DEEPSPEED_KEY] = deployment.enable_deepspeed + config_dict[deployment.deployment_name][mii.constants.MII_CONFIGS_KEY] = deployment.mii_config.dict() + config_dict[deployment.deployment_name][mii.constants.ENABLE_DEEPSPEED_ZERO_KEY] = deployment.enable_zero + config_dict[deployment.deployment_name][mii.constants.DEEPSPEED_CONFIG_KEY] = deployment.ds_config if lb_config is not None: config_dict[deployment.deployment_name][mii.constants.LOAD_BALANCER_CONFIG_KEY] = lb_config diff --git a/mii/models/score/score_template.py b/mii/models/score/score_template.py index 0681ac2f..7127a8ee 100644 --- a/mii/models/score/score_template.py +++ b/mii/models/score/score_template.py @@ -19,23 +19,25 @@ def init(): deployment_tag = configs[mii.constants.DEPLOYMENT_TAG_KEY] deployments = [] for deployment in configs.values(): - deployments.append(Deployment(deployment[mii.constants.DEPLOYMENT_NAME_KEY], deployment[mii.constants.TASK_NAME_KEY], deployment[mii.constants.DEPLOYMENT_MODEL_NAME_KEY], deloyment[mii.constants.ENABLE_DEEPSPEED_KEY], deployment[mii.constants.ENABLE_DEEPSPEED_ZERO_KEY], None, deployment[mii.constants.MII_CONFIGS_KEY], deployment[mii.constants.DS_CONFIG_KEY], 1)) - + if not isinstance(deployment, dict): + continue + print(f"\nDEPLOYMENT ->{configs.values()}") + data = {'deployment_name': deployment[mii.constants.DEPLOYMENT_NAME_KEY], 'task': deployment[mii.constants.TASK_NAME_KEY], 'model': deployment[mii.constants.MODEL_NAME_KEY], 'enable_deepspeed': deployment[mii.constants.ENABLE_DEEPSPEED_KEY], 'enable_zero': deployment[mii.constants.ENABLE_DEEPSPEED_ZERO_KEY], 'GPU_index_map': None, 'mii_config': deployment[mii.constants.MII_CONFIGS_KEY], 'ds_config': deployment[mii.constants.DEEPSPEED_CONFIG_KEY], 'version': 1} + deployments.append(mii.Deployment.parse_obj(data)) + 
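+    # deployments now holds one mii.Deployment rebuilt from each per-model
+    # entry that was serialized into the score file's config dict.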
+ print(f"WITHIN INIT {deployments}") + """ deployment_name = configs[mii.constants.DEPLOYMENT_NAME_KEY] model_name = configs[mii.constants.MODEL_NAME_KEY] task_name = configs[mii.constants.TASK_NAME_KEY] assert model_name is not None, "The model name should be set before calling init" assert task_name is not None, "The task name should be set before calling init" + """ - mii.MIIServer(deployment_name, - task_name, - model_name, + mii.MIIServer(deployment_tag, + deployments, model_path, - ds_optimize=configs[mii.constants.ENABLE_DEEPSPEED_KEY], - ds_zero=configs[mii.constants.ENABLE_DEEPSPEED_ZERO_KEY], - ds_config=configs[mii.constants.DEEPSPEED_CONFIG_KEY], - mii_configs=configs[mii.constants.MII_CONFIGS_KEY], lb_config=configs.get(mii.constants.LOAD_BALANCER_CONFIG_KEY, None)) diff --git a/mii/server.py b/mii/server.py index c7aff4de..496ecd01 100644 --- a/mii/server.py +++ b/mii/server.py @@ -34,14 +34,14 @@ def __init__(self, model_path, lb_config=None): - mii_configs = mii.config.MIIConfig(**mii_configs) + #mii_configs = mii.config.MIIConfig(**mii_configs) - self.task = mii.utils.get_task(task_name) + #self.task = mii.utils.get_task(task_name) - self.num_gpus = get_num_gpus(mii_configs) - assert self.num_gpus > 0, "GPU count must be greater than 0" + for deployment in deployments: + assert get_num_gpus(deployment.mii_configs) > 0, f"GPU count for {deployment.deployment_name} must be greater than 0" - self.port_number = mii_configs.port_number + #self.port_number = mii_configs.port_number if mii_configs.hostfile is None: hostfile = tempfile.NamedTemporaryFile(delete=False) @@ -104,12 +104,17 @@ def _build_server_args(self, port): # serialize mii config b64_config_str = config_to_b64_str(mii_configs) - - server_args_str = f"--deployment-name {deployment_name} --task-name {mii.utils.get_task_name(self.task)} --model {model_name} --model-path {model_path} --port {port}" + + task = "" + for deployment in deployments: + if deployment_name == deployment.deployment_name: + task = deployment.task + break + server_args_str = f"--deployment-name {deployment_name} --task-name {mii.utils.get_task_name(task)} --model {model_name} --model-path {model_path} --port {port}" server_args_str += " --ds-optimize" if ds_optimize else "" # XXX: fetch model provider based on model name in a more general way - provider = get_provider_name(model_name, self.task) + provider = get_provider_name(model_name, task) server_args_str += f" --provider {provider}" server_args_str += f" --config {b64_config_str}" @@ -134,7 +139,7 @@ def create_config_from_dict(tmpdir, config_dict): f"Expected a string path to an existing deepspeed config, or a dictionary. 
Received: {ds_config}" ) server_args_str += f" --ds-config {ds_config_path}" - printable_config = f"task-name {mii.utils.get_task_name(self.task)} model {model_name} model-path {model_path} port {self.port_number} provider {provider}" + printable_config = f"task-name task model {model_name} model-path {model_path} port 50050 provider {provider}" logger.info(f"MII using multi-gpu deepspeed launcher:\n" + self.print_helper(printable_config)) return server_args_str From 38e270ec2a8e70f330ff5d802f95cb429ce5eb94 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Thu, 29 Jun 2023 18:19:11 +0000 Subject: [PATCH 19/69] Fixing more syntax issues --- mii/__init__.py | 1 - mii/deployment.py | 4 ++-- mii/models/score/generate.py | 4 ++-- mii/server.py | 43 ++++++++++++++++++++---------------- 4 files changed, 28 insertions(+), 24 deletions(-) diff --git a/mii/__init__.py b/mii/__init__.py index 5c84d1dc..b0008c06 100644 --- a/mii/__init__.py +++ b/mii/__init__.py @@ -15,7 +15,6 @@ __version__ = "0.0.0" non_persistent_models = {} -multi_model_deployments = {} try: from .version import __version__ except ImportError: diff --git a/mii/deployment.py b/mii/deployment.py index ecdb95c2..60a4c8cf 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -74,7 +74,6 @@ def deploy(task=None, else: assert deployment_tag is not None, "deployment_tag must be set to deploy multiple models" - mii.multi_model_deployments[deployment_tag] = deployments ports = set() # parse and validate mii config for deployment in deployments: @@ -135,7 +134,8 @@ def deploy(task=None, base_port + mii_config.tensor_parallel)) torch_dist_port = mii_config.torch_dist_port + i replica_configs.append( - ReplicaConfig(hostname=hostname, + ReplicaConfig(deployment_name = deployment.deployment_name, + hostname=hostname, tensor_parallel_ports=tensor_parallel_ports, torch_dist_port=torch_dist_port, gpu_indices=gpu_indices)) diff --git a/mii/models/score/generate.py b/mii/models/score/generate.py index 27716bd6..ecbfeea7 100644 --- a/mii/models/score/generate.py +++ b/mii/models/score/generate.py @@ -28,8 +28,8 @@ def create_score_file(deployment_tag, config_dict[deployment.deployment_name][mii.constants.ENABLE_DEEPSPEED_ZERO_KEY] = deployment.enable_zero config_dict[deployment.deployment_name][mii.constants.DEEPSPEED_CONFIG_KEY] = deployment.ds_config - if lb_config is not None: - config_dict[deployment.deployment_name][mii.constants.LOAD_BALANCER_CONFIG_KEY] = lb_config + if lb_config is not None: + config_dict[mii.constants.LOAD_BALANCER_CONFIG_KEY] = lb_config if len(mii.__path__) > 1: logger.warning( diff --git a/mii/server.py b/mii/server.py index 496ecd01..3325bc48 100644 --- a/mii/server.py +++ b/mii/server.py @@ -37,18 +37,17 @@ def __init__(self, #mii_configs = mii.config.MIIConfig(**mii_configs) #self.task = mii.utils.get_task(task_name) - + self.deployments = deployments for deployment in deployments: - assert get_num_gpus(deployment.mii_configs) > 0, f"GPU count for {deployment.deployment_name} must be greater than 0" - - #self.port_number = mii_configs.port_number - - if mii_configs.hostfile is None: - hostfile = tempfile.NamedTemporaryFile(delete=False) - num_gpu = torch.cuda.device_count() - with open(hostfile, "w") as f: - f.write(f"localhost slots={num_gpu}") - mii.configs.hostfile = hostfile + assert get_num_gpus(deployment.mii_config) > 0, f"GPU count for {deployment.deployment_name} must be greater than 0" + mii_configs = deployment.mii_config + deployment.task = mii.utils.get_task(deployment.task) + if mii_configs.hostfile 
is None: + hostfile = tempfile.NamedTemporaryFile(delete=False) + num_gpu = torch.cuda.device_count() + with open(hostfile, "w") as f: + f.write(f"localhost slots={num_gpu}") + mii.configs.hostfile = hostfile processes = self._initialize_service(deployment_tag, deployments, @@ -106,7 +105,7 @@ def _build_server_args(self, b64_config_str = config_to_b64_str(mii_configs) task = "" - for deployment in deployments: + for deployment in self.deployments: if deployment_name == deployment.deployment_name: task = deployment.task break @@ -139,7 +138,7 @@ def create_config_from_dict(tmpdir, config_dict): f"Expected a string path to an existing deepspeed config, or a dictionary. Received: {ds_config}" ) server_args_str += f" --ds-config {ds_config_path}" - printable_config = f"task-name task model {model_name} model-path {model_path} port 50050 provider {provider}" + printable_config = f"task-name {task} model {model_name} model-path {model_path} port {port} provider {provider}" logger.info(f"MII using multi-gpu deepspeed launcher:\n" + self.print_helper(printable_config)) return server_args_str @@ -288,6 +287,12 @@ def _initialize_service(self, # Start replica instances for i, repl_config in enumerate(lb_config.replica_configs): name = repl_config.deployment_name + deployment = None + print (f"IN SERVER NAME -> {name}") + for dep in deployments: + print(f"\nDEPLOYMENT_NAME {dep.deployment_name}") + if dep.deployment_name == name: + deployment = dep hostfile = tempfile.NamedTemporaryFile(delete=False) hostfile.write( f'{repl_config.hostname} slots={max(host_gpus[repl_config.hostname])+1}\n' @@ -295,16 +300,16 @@ def _initialize_service(self, processes.append( self._launch_deepspeed( name, - deployments[name].model, + deployment.model, model_path, - deployments[name].enable_deepspeed, - deployments[name].enable_zero, - deployments[name].ds_config, - deployments[name].mii_configs, + deployment.enable_deepspeed, + deployment.enable_zero, + deployment.ds_config, + deployment.mii_config, hostfile.name, repl_config.hostname, repl_config.tensor_parallel_ports[0], - mii_configs.torch_dist_port + (100 * i) + repl_config.gpu_indices[0], + deployment.mii_config.torch_dist_port + (100 * i) + repl_config.gpu_indices[0], repl_config.gpu_indices)) # start load balancer here. 
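
A note on the per-deployment lookups in the patch above: both _build_server_args and _initialize_service resolve a deployment by scanning self.deployments for a matching deployment_name. The snippet below is an illustrative sketch only, not code from this series; it assumes Deployment-like objects that expose a deployment_name attribute, and the DeploymentIndex name is hypothetical. A name-keyed dict gives the same result without repeating the linear scan.

from typing import Dict, List


class DeploymentIndex:
    """Maps deployment_name -> deployment object for O(1) lookups."""
    def __init__(self, deployments: List):
        # assumes each deployment exposes .deployment_name, as the
        # Deployment objects used in the patches above do
        self._by_name: Dict[str, object] = {
            d.deployment_name: d for d in deployments
        }

    def get(self, deployment_name: str):
        # raises KeyError if a replica config names an unknown deployment
        return self._by_name[deployment_name]


# usage sketch inside _initialize_service:
#   index = DeploymentIndex(deployments)
#   for i, repl_config in enumerate(lb_config.replica_configs):
#       deployment = index.get(repl_config.deployment_name)
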
From 4d4e0d8795a4db46d39246960e6cc4284f3072b6 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Thu, 29 Jun 2023 18:43:09 +0000 Subject: [PATCH 20/69] initial lb changes --- mii/grpc_related/modelresponse_server.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index 47325f6c..a92dcb2a 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -168,11 +168,12 @@ def __init__(self, task_name, replica_configs): super().__init__() self.asyncio_loop = asyncio.get_event_loop() - self.stubs = [ - ParallelStubInvoker(replica.hostname, - replica.tensor_parallel_ports) - for replica in replica_configs - ] + self.stubs = {} + for repl in replica_configs: + stubs[repl.deployment_name] = [ParallelStubInvoker(replica.hostname, + replica.tensor_parallel_ports) + for replica in replica_configs + ] print(self.stubs) self.counter = AtomicCounter() self.task = get_task(task_name) From f801b360e5dcd4fe1a49129d65f634a208325013 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Thu, 29 Jun 2023 20:59:25 +0000 Subject: [PATCH 21/69] More load balancing changes --- mii/grpc_related/modelresponse_server.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index a92dcb2a..dc5772b5 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -169,16 +169,24 @@ def __init__(self, task_name, replica_configs): self.asyncio_loop = asyncio.get_event_loop() self.stubs = {} + self.counter = {} for repl in replica_configs: - stubs[repl.deployment_name] = [ParallelStubInvoker(replica.hostname, + stubs[repl.deployment_name] = [] + self.counter[repl.deployment_name] = AtomicCounter() + + + for repl in replica_configs: + stubs[repl.deployment_name].extend(ParallelStubInvoker(replica.hostname, replica.tensor_parallel_ports) for replica in replica_configs - ] + ) print(self.stubs) + """ self.counter = AtomicCounter() self.task = get_task(task_name) self.replica_sessions = {} - + """ + # Start the asyncio loop in a separate thread def run_asyncio_loop(loop): asyncio.set_event_loop(loop) From fd4e2ed030d817a5df12d8b979c46c9be23aa28b Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Fri, 30 Jun 2023 18:18:43 +0000 Subject: [PATCH 22/69] LB changes and syntax --- mii/grpc_related/modelresponse_server.py | 29 +++++++++++-------- mii/server.py | 36 +++++++++++++----------- 2 files changed, 36 insertions(+), 29 deletions(-) diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index dc5772b5..bbf10857 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -171,16 +171,15 @@ def __init__(self, task_name, replica_configs): self.stubs = {} self.counter = {} for repl in replica_configs: - stubs[repl.deployment_name] = [] + self.stubs[repl.deployment_name] = [] self.counter[repl.deployment_name] = AtomicCounter() for repl in replica_configs: - stubs[repl.deployment_name].extend(ParallelStubInvoker(replica.hostname, + self.stubs[repl.deployment_name].extend(ParallelStubInvoker(replica.hostname, replica.tensor_parallel_ports) - for replica in replica_configs - ) - print(self.stubs) + for replica in replica_configs if replica.deployment_name == repl.deployment_name) + print(f"\nSTUBS-> {self.stubs}\nCOUNTERS-> {self.counter}") """ self.counter 
= AtomicCounter() self.task = get_task(task_name) @@ -200,7 +199,14 @@ def choose_stub(self, call_count): def intercept_service(self, continuation, handler_call_details): next_handler = continuation(handler_call_details) assert next_handler.unary_unary is not None - + deployment_name = "" + #USE KWARGS LIKE THEY ARE USED TO MAKE SESSIONS TO GET THE DEPLOYMENT NAME TO HASH THE COUNTERS/STUBS + kwargs = unpack_proto_query_kwargs(request_proto.query_kwargs) + assert "deployment_name" in kwargs, "Must include deployment_name in kwargs for query" + deployment_name = kwargs['deployment_name'] + + print(f"\nDEPLOYMENT NAME WITHIN INTERCEPTOR -> {deployment_name}") + def invoke_intercept_method(request_proto, context): method_name = _get_grpc_method_name(handler_call_details.method) @@ -211,30 +217,29 @@ def invoke_intercept_method(request_proto, context): self.asyncio_loop.call_soon_threadsafe(self.asyncio_loop.stop) return next_handler.unary_unary(request_proto, context) - call_count = self.counter.get_and_increment() - replica_index = call_count % len(self.stubs) + call_count = self.counter[deployment_name].get_and_increment() + replica_index = call_count % len(self.stubs[deployment_name]) if method_name == CREATE_SESSION_METHOD: if request_proto.session_id in self.sessions: raise ValueError( f"session {request_proto.session_id} already exists") self.replica_sessions[request_proto.session_id] = replica_index - self.stubs[replica_index].invoke(CREATE_SESSION_METHOD, request_proto) + self.stubs[deployment_name][replica_index].invoke(CREATE_SESSION_METHOD, request_proto) return google_dot_protobuf_dot_empty__pb2.Empty() if method_name == DESTROY_SESSION_METHOD: replica_index = self.replica_sessions.pop(request_proto.session_id) - self.stubs[replica_index].invoke(DESTROY_SESSION_METHOD, request_proto) + self.stubs[deployment_name][replica_index].invoke(DESTROY_SESSION_METHOD, request_proto) return google_dot_protobuf_dot_empty__pb2.Empty() - kwargs = unpack_proto_query_kwargs(request_proto.query_kwargs) if "session_id" in kwargs: session_id = kwargs["session_id"] if session_id not in self.replica_sessions: raise ValueError(f"session not found") replica_index = self.replica_sessions[session_id] - ret = self.stubs[replica_index].invoke(method_name, request_proto) + ret = self.stubs[deployment_name][replica_index].invoke(method_name, request_proto) return ret return grpc.unary_unary_rpc_method_handler( diff --git a/mii/server.py b/mii/server.py index 3325bc48..dc0768b8 100644 --- a/mii/server.py +++ b/mii/server.py @@ -317,25 +317,27 @@ def _initialize_service(self, # The deepspeed launcher determines the number of processes to launch based on GPUs available on the host or CUDA_VISIBLE_DEVICES, # and it is expected to assign one GPU to one process. 
processes.append( - self._launch_load_balancer(deployment_name, - model_name, + self._launch_load_balancer(self.deployments[0].deployment_name, + self.deployments[0].model, model_path, - ds_optimize, - ds_zero, - ds_config, - mii_configs, + self.deployments[0].enable_deepspeed, + self.deployments[0].enable_zero, + self.deployments[0].ds_config, + self.deployments[0].mii_config, lb_config)) - if mii_configs.enable_restful_api: - # start rest api server - processes.append( - self._launch_restful_gateway(deployment_name, - model_name, - model_path, - ds_optimize, - ds_zero, - ds_config, - mii_configs, - mii_configs.port_number)) + for deployment in self.deployments: + if deployment.mii_config.enable_restful_api: + # start rest api server + processes.append( + self._launch_restful_gateway(deployment.deployment_name, + deployment.model, + model_path, + deployment.enable_deepspeed, + deployment.enable_zero, + deployment.ds_config, + deployment.mii_config, + deployment.mii_config.port_number)) + break return processes From 0a3b7e5cab714a1466dc7264432e68b6101dc289 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Fri, 30 Jun 2023 21:13:25 +0000 Subject: [PATCH 23/69] Refactor client, and unpack request in load balancer --- mii/client.py | 15 +++++----- mii/grpc_related/modelresponse_server.py | 36 +++++++++++++++++++----- 2 files changed, 37 insertions(+), 14 deletions(-) diff --git a/mii/client.py b/mii/client.py index 535b55c8..31216f47 100644 --- a/mii/client.py +++ b/mii/client.py @@ -12,17 +12,17 @@ from mii.method_table import GRPC_METHOD_TABLE -def _get_deployment_info(deployment_name): - configs = mii.utils.import_score_file(deployment_name).configs - task = configs[mii.constants.TASK_NAME_KEY] - mii_configs_dict = configs[mii.constants.MII_CONFIGS_KEY] +def _get_deployment_info(deployment_tag, deployment_name): + configs = mii.utils.import_score_file(deployment_tag).configs + task = configs[deployment_name][mii.constants.TASK_NAME_KEY] + mii_configs_dict = configs[deployment_name][mii.constants.MII_CONFIGS_KEY] mii_configs = mii.config.MIIConfig(**mii_configs_dict) assert task is not None, "The task name should be set before calling init" return task, mii_configs -def mii_query_handle(deployment_name): +def mii_query_handle(deployment_tag, deployment_name): """Get a query handle for a local deployment: mii/examples/local/gpt2-query-example.py @@ -39,7 +39,7 @@ def mii_query_handle(deployment_name): inference_pipeline, task = mii.non_persistent_models[deployment_name] return MIINonPersistentClient(task, deployment_name) - task_name, mii_configs = _get_deployment_info(deployment_name) + task_name, mii_configs = _get_deployment_info(deployment_tag, deployment_name) return MIIClient(task_name, "localhost", mii_configs.port_number) @@ -60,7 +60,8 @@ def __init__(self, task_name, host, port): channel = create_channel(host, port) self.stub = modelresponse_pb2_grpc.ModelResponseStub(channel) self.task = get_task(task_name) - + + print(f"IN CLEINT TASK -> {self.task}\n STUB -> {self.stub}") async def _request_async_response(self, request_dict, **query_kwargs): if self.task not in GRPC_METHOD_TABLE: raise ValueError(f"unknown task: {self.task}") diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index bbf10857..6792af63 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -16,7 +16,7 @@ from mii.constants import GRPC_MAX_MSG_SIZE, CREATE_SESSION_METHOD, DESTROY_SESSION_METHOD, TERMINATE_METHOD, 
LB_MAX_WORKER_THREADS, SERVER_SHUTDOWN_TIMEOUT, Tasks from mii.method_table import GRPC_METHOD_TABLE from mii.client import create_channel -from mii.utils import get_task, unpack_proto_query_kwargs +from mii.utils import get_task, unpack_proto_query_kwargs, kwarg_dict_to_proto class ServiceBase(modelresponse_pb2_grpc.ModelResponseServicer): @@ -199,15 +199,37 @@ def choose_stub(self, call_count): def intercept_service(self, continuation, handler_call_details): next_handler = continuation(handler_call_details) assert next_handler.unary_unary is not None - deployment_name = "" #USE KWARGS LIKE THEY ARE USED TO MAKE SESSIONS TO GET THE DEPLOYMENT NAME TO HASH THE COUNTERS/STUBS - kwargs = unpack_proto_query_kwargs(request_proto.query_kwargs) - assert "deployment_name" in kwargs, "Must include deployment_name in kwargs for query" - deployment_name = kwargs['deployment_name'] - print(f"\nDEPLOYMENT NAME WITHIN INTERCEPTOR -> {deployment_name}") - def invoke_intercept_method(request_proto, context): + kwargs = unpack_proto_query_kwargs(request_proto.query_kwargs) + assert "deployment_name" in kwargs, "Must include deployment_name in kwargs for query" + deployment_name = kwargs.get('deployment_name') + del kwargs['deployment_name'] + kwargs = kwarg_dict_to_proto(**kwargs) + task = None + for repl in replica_configs: + if repl.deployment_name == deployment_name: + task = repl.task + break + method = GRPC_METHOD_TABLE[get_task(task)] + if method_name == "ConversationalReply": + request_dict = {} + request_dict['text'] = request_proto.text + request_dict['conversation_id'] = getattr(request_proto, 'conversation_id') + request_dict['past_user_inputs'] = request_proto.past_user_inputs + request_dict['generated_responses'] = request_proto.generated_responses + request_proto = method.pack_request_to_proto(request_dict, kwargs) + + elif method_name == "QuestionAndAnswerReply": + request_dict = {} + request_dict['question'] = request_proto.question + request_dict['context'] = requet_proto.context + request_proto = method.pack_request_to_proto(request_dict, kwargs) + else + request_proto = method.pack_request_to_proto(request_proto.query, kwargs) + + print(f"\nDEPLOYMENT NAME WITHIN INTERCEPTOR -> {deployment_name}") method_name = _get_grpc_method_name(handler_call_details.method) if method_name == TERMINATE_METHOD: From 6523c0477c204ce7a02a81d837fd1af4ba0587e3 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Mon, 3 Jul 2023 17:08:15 +0000 Subject: [PATCH 24/69] First working queries --- examples/multi_model/query.py | 11 ++++++ examples/multi_model/shutdown.py | 2 ++ .../text-generation-bloom560m-example.py | 18 ++++++++++ mii/config.py | 3 +- mii/deployment.py | 3 +- mii/grpc_related/modelresponse_server.py | 35 ++++++++++++------- mii/method_table.py | 3 ++ 7 files changed, 60 insertions(+), 15 deletions(-) create mode 100644 examples/multi_model/query.py create mode 100644 examples/multi_model/shutdown.py create mode 100644 examples/multi_model/text-generation-bloom560m-example.py diff --git a/examples/multi_model/query.py b/examples/multi_model/query.py new file mode 100644 index 00000000..052c293d --- /dev/null +++ b/examples/multi_model/query.py @@ -0,0 +1,11 @@ +import mii +import time +generator = mii.mii_query_handle("first_test", "bloom560m_deployment") +result = generator.query({"query": ["DeepSpeed is", "Seattle is"]}, do_sample=True, max_new_tokens = 30, deployment_name = "bloom560m_deployment") +print(result) + +time.sleep(5) +generator2 = mii.mii_query_handle("first_test", 
"microsoft/DialogRPT-human-vs-rand_deployment") +result = generator2.query({'query': "DeepSpeed is the greatest"}, deployment_name = "microsoft/DialogRPT-human-vs-rand_deployment") +print(result) + diff --git a/examples/multi_model/shutdown.py b/examples/multi_model/shutdown.py new file mode 100644 index 00000000..5f082f2f --- /dev/null +++ b/examples/multi_model/shutdown.py @@ -0,0 +1,2 @@ +import mii +mii.terminate("bloom560m_deployment") diff --git a/examples/multi_model/text-generation-bloom560m-example.py b/examples/multi_model/text-generation-bloom560m-example.py new file mode 100644 index 00000000..6b5d25fe --- /dev/null +++ b/examples/multi_model/text-generation-bloom560m-example.py @@ -0,0 +1,18 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +import mii + +deployments = [] +mii_configs1 = {"tensor_parallel": 1, "dtype": "fp16"} +deployments.append(mii.Deployment(task='text-generation', + model="bigscience/bloom-560m", + deployment_name="bloom560m_deployment", + mii_config=mii.config.MIIConfig(**mii_configs1))) + +# gpt2 +name = "microsoft/DialogRPT-human-vs-rand" +deployments.append(mii.Deployment(task='text-classification', model=name, deployment_name=name + "_deployment")) + +mii.deploy(deployment_tag="first_test", deployments=deployments) diff --git a/mii/config.py b/mii/config.py index 531ee800..e425a2e6 100644 --- a/mii/config.py +++ b/mii/config.py @@ -6,7 +6,7 @@ from typing import Union, List from enum import Enum from pydantic import BaseModel, validator, root_validator - +from .constants import Tasks from deepspeed.launcher.runner import DLTS_HOSTFILE @@ -107,6 +107,7 @@ class Config: class ReplicaConfig(BaseModel): + task: str = "" deployment_name: str = "" hostname: str = "" tensor_parallel_ports: List[int] = [] diff --git a/mii/deployment.py b/mii/deployment.py index 60a4c8cf..7f35c264 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -134,7 +134,8 @@ def deploy(task=None, base_port + mii_config.tensor_parallel)) torch_dist_port = mii_config.torch_dist_port + i replica_configs.append( - ReplicaConfig(deployment_name = deployment.deployment_name, + ReplicaConfig(task=get_task_name(deployment.task), + deployment_name = deployment.deployment_name, hostname=hostname, tensor_parallel_ports=tensor_parallel_ports, torch_dist_port=torch_dist_port, diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index 6792af63..1db6f77f 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -170,6 +170,7 @@ def __init__(self, task_name, replica_configs): self.stubs = {} self.counter = {} + self.replica_configs = replica_configs for repl in replica_configs: self.stubs[repl.deployment_name] = [] self.counter[repl.deployment_name] = AtomicCounter() @@ -202,42 +203,48 @@ def intercept_service(self, continuation, handler_call_details): #USE KWARGS LIKE THEY ARE USED TO MAKE SESSIONS TO GET THE DEPLOYMENT NAME TO HASH THE COUNTERS/STUBS def invoke_intercept_method(request_proto, context): + method_name = _get_grpc_method_name(handler_call_details.method) kwargs = unpack_proto_query_kwargs(request_proto.query_kwargs) assert "deployment_name" in kwargs, "Must include deployment_name in kwargs for query" deployment_name = kwargs.get('deployment_name') - del kwargs['deployment_name'] - kwargs = kwarg_dict_to_proto(**kwargs) + kwargs.pop('deployment_name', None) task = None - for repl in replica_configs: + for repl in self.replica_configs: if 
repl.deployment_name == deployment_name: task = repl.task break + print(f"\nTASK ->{task}") method = GRPC_METHOD_TABLE[get_task(task)] + new_request = None if method_name == "ConversationalReply": request_dict = {} request_dict['text'] = request_proto.text - request_dict['conversation_id'] = getattr(request_proto, 'conversation_id') + val = getattr(request_proto, 'conversation_id') + request_dict['conversation_id'] = int(val) if val is not None else None request_dict['past_user_inputs'] = request_proto.past_user_inputs request_dict['generated_responses'] = request_proto.generated_responses - request_proto = method.pack_request_to_proto(request_dict, kwargs) + new_request = method.pack_request_to_proto(request_dict, **kwargs) elif method_name == "QuestionAndAnswerReply": request_dict = {} request_dict['question'] = request_proto.question request_dict['context'] = requet_proto.context - request_proto = method.pack_request_to_proto(request_dict, kwargs) - else - request_proto = method.pack_request_to_proto(request_proto.query, kwargs) + new_request = method.pack_request_to_proto(request_dict, **kwargs) + else: + request_dict = {} + request_dict["query"] = list(request_proto.request) if method_name == "GeneratorReply" or method_name == "Txt2ImgReply" else str(request_proto.request) + print(f"HERE request_dict -> {request_dict}\nKWARGS-> {kwargs}") + new_request = method.pack_request_to_proto(request_dict, **kwargs) + print("done?") print(f"\nDEPLOYMENT NAME WITHIN INTERCEPTOR -> {deployment_name}") - method_name = _get_grpc_method_name(handler_call_details.method) if method_name == TERMINATE_METHOD: for stub in self.stubs: stub.invoke(TERMINATE_METHOD, google_dot_protobuf_dot_empty__pb2.Empty()) self.asyncio_loop.call_soon_threadsafe(self.asyncio_loop.stop) - return next_handler.unary_unary(request_proto, context) + return next_handler.unary_unary(new_request, context) call_count = self.counter[deployment_name].get_and_increment() replica_index = call_count % len(self.stubs[deployment_name]) @@ -247,12 +254,12 @@ def invoke_intercept_method(request_proto, context): raise ValueError( f"session {request_proto.session_id} already exists") self.replica_sessions[request_proto.session_id] = replica_index - self.stubs[deployment_name][replica_index].invoke(CREATE_SESSION_METHOD, request_proto) + self.stubs[deployment_name][replica_index].invoke(CREATE_SESSION_METHOD, new_request) return google_dot_protobuf_dot_empty__pb2.Empty() if method_name == DESTROY_SESSION_METHOD: replica_index = self.replica_sessions.pop(request_proto.session_id) - self.stubs[deployment_name][replica_index].invoke(DESTROY_SESSION_METHOD, request_proto) + self.stubs[deployment_name][replica_index].invoke(DESTROY_SESSION_METHOD, new_request) return google_dot_protobuf_dot_empty__pb2.Empty() if "session_id" in kwargs: @@ -261,7 +268,9 @@ def invoke_intercept_method(request_proto, context): raise ValueError(f"session not found") replica_index = self.replica_sessions[session_id] - ret = self.stubs[deployment_name][replica_index].invoke(method_name, request_proto) + assert new_request is not None, "test" + print("ASSERT DONE") + ret = self.stubs[deployment_name][replica_index].invoke(method_name, new_request) return ret return grpc.unary_unary_rpc_method_handler( diff --git a/mii/method_table.py b/mii/method_table.py index c412f446..8dfea390 100644 --- a/mii/method_table.py +++ b/mii/method_table.py @@ -23,6 +23,9 @@ def single_string_response_to_proto(self, response, time_taken, model_time_taken def 
multi_string_request_to_proto(self, request_dict, **query_kwargs): + temp = kwarg_dict_to_proto(query_kwargs) + print(f"FINE {temp}\nrd->{request_dict}") + print(isinstance(request_dict['query'], list)) return modelresponse_pb2.MultiStringRequest( request=request_dict['query'] if isinstance(request_dict['query'], list) else [request_dict['query']], From 06b40f5ee3c469261610b52d6dfac4153f28000b Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Mon, 3 Jul 2023 21:53:37 +0000 Subject: [PATCH 25/69] Fixing conversational and q&a args --- mii/grpc_related/modelresponse_server.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index 1db6f77f..a77d000b 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -213,22 +213,22 @@ def invoke_intercept_method(request_proto, context): if repl.deployment_name == deployment_name: task = repl.task break - print(f"\nTASK ->{task}") + print(f"\nTASK ->{task}\nMETHOD NAME-> {method_name}") method = GRPC_METHOD_TABLE[get_task(task)] new_request = None if method_name == "ConversationalReply": request_dict = {} - request_dict['text'] = request_proto.text + request_dict['text'] = str(request_proto.text) val = getattr(request_proto, 'conversation_id') request_dict['conversation_id'] = int(val) if val is not None else None - request_dict['past_user_inputs'] = request_proto.past_user_inputs - request_dict['generated_responses'] = request_proto.generated_responses + request_dict['past_user_inputs'] = list(request_proto.past_user_inputs) + request_dict['generated_responses'] = list(request_proto.generated_responses) new_request = method.pack_request_to_proto(request_dict, **kwargs) elif method_name == "QuestionAndAnswerReply": request_dict = {} - request_dict['question'] = request_proto.question - request_dict['context'] = requet_proto.context + request_dict['question'] = str(request_proto.question) + request_dict['context'] = str(requet_proto.context) new_request = method.pack_request_to_proto(request_dict, **kwargs) else: request_dict = {} @@ -270,6 +270,7 @@ def invoke_intercept_method(request_proto, context): assert new_request is not None, "test" print("ASSERT DONE") + print(new_request.query_kwargs) ret = self.stubs[deployment_name][replica_index].invoke(method_name, new_request) return ret From 96d0dcb8d332153ad28df639c999e70dc6baff3f Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Wed, 5 Jul 2023 18:33:46 +0000 Subject: [PATCH 26/69] Updates to _allocate_processes and fixing example --- examples/multi_model/query.py | 11 ++++++++ .../text-generation-bloom560m-example.py | 17 ++++++++++-- mii/client.py | 1 - mii/deployment.py | 26 ++++++++++++++----- mii/grpc_related/modelresponse_server.py | 2 +- 5 files changed, 46 insertions(+), 11 deletions(-) diff --git a/examples/multi_model/query.py b/examples/multi_model/query.py index 052c293d..e4bfd8d9 100644 --- a/examples/multi_model/query.py +++ b/examples/multi_model/query.py @@ -1,5 +1,6 @@ import mii import time + generator = mii.mii_query_handle("first_test", "bloom560m_deployment") result = generator.query({"query": ["DeepSpeed is", "Seattle is"]}, do_sample=True, max_new_tokens = 30, deployment_name = "bloom560m_deployment") print(result) @@ -9,3 +10,13 @@ result = generator2.query({'query': "DeepSpeed is the greatest"}, deployment_name = "microsoft/DialogRPT-human-vs-rand_deployment") print(result) +time.sleep(5) + +generator3 = 
mii.mii_query_handle("first_test", "microsoft/DialoGPT-large_deployment") +result = generator3.query({'text': "DeepSpeed is the greatest", + 'conversation_id': 3, + 'past_user_inputs': [], + 'generated_responses': [] + }, deployment_name= "microsoft/DialoGPT-large_deployment") +print(result) + diff --git a/examples/multi_model/text-generation-bloom560m-example.py b/examples/multi_model/text-generation-bloom560m-example.py index 6b5d25fe..b1d0d6b6 100644 --- a/examples/multi_model/text-generation-bloom560m-example.py +++ b/examples/multi_model/text-generation-bloom560m-example.py @@ -4,15 +4,28 @@ # DeepSpeed Team import mii +gpu_index_map1 = {'master': [0]} +gpu_index_map2 = {'master': [1]} +gpu_index_map3 = {'master': [0, 1]} + deployments = [] -mii_configs1 = {"tensor_parallel": 1, "dtype": "fp16"} +mii_configs1 = {"tensor_parallel": 2, "dtype": "fp16"} deployments.append(mii.Deployment(task='text-generation', model="bigscience/bloom-560m", deployment_name="bloom560m_deployment", + GPU_index_map=gpu_index_map3, mii_config=mii.config.MIIConfig(**mii_configs1))) # gpt2 name = "microsoft/DialogRPT-human-vs-rand" -deployments.append(mii.Deployment(task='text-classification', model=name, deployment_name=name + "_deployment")) +deployments.append(mii.Deployment(task='text-classification', model=name, deployment_name=name + "_deployment", GPU_index_map=gpu_index_map2)) + +mii_configs2 = {"tensor_parallel": 1} + + +name = "microsoft/DialoGPT-large" + +deployments.append(mii.Deployment(task='conversational', model=name, deployment_name=name + "_deployment", GPU_index_map=gpu_index_map1, mii_config=mii.config.MIIConfig(**mii_configs2))) + mii.deploy(deployment_tag="first_test", deployments=deployments) diff --git a/mii/client.py b/mii/client.py index 31216f47..fe884ed1 100644 --- a/mii/client.py +++ b/mii/client.py @@ -61,7 +61,6 @@ def __init__(self, task_name, host, port): self.stub = modelresponse_pb2_grpc.ModelResponseStub(channel) self.task = get_task(task_name) - print(f"IN CLEINT TASK -> {self.task}\n STUB -> {self.stub}") async def _request_async_response(self, request_dict, **query_kwargs): if self.task not in GRPC_METHOD_TABLE: raise ValueError(f"unknown task: {self.task}") diff --git a/mii/deployment.py b/mii/deployment.py index 7f35c264..330acd51 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -74,13 +74,9 @@ def deploy(task=None, else: assert deployment_tag is not None, "deployment_tag must be set to deploy multiple models" - ports = set() # parse and validate mii config for deployment in deployments: mii_config = deployment.mii_config - print(mii_config) - assert mii_config.port_number not in ports, f"duplicate port numbers not allowed - {mii_config.port_number}" - #ports.add(mii_config.port_number) if deployment.enable_zero: if deployment.ds_config.get("fp16", {}).get("enabled", False): assert (mii_config.dtype == torch.half), "MII Config Error: MII dtype and ZeRO dtype must match" @@ -119,19 +115,25 @@ def deploy(task=None, # add fields for replica deployment replica_configs = [] + ports = set() for deployment in deployments: mii_config = deployment.mii_config replica_pool = _allocate_processes(mii_config.hostfile, mii_config.tensor_parallel, - mii_config.replica_num) + mii_config.replica_num, + deployment.GPU_index_map) for i, (hostname, gpu_indices) in enumerate(replica_pool): # Reserver port for a LB proxy when replication is enabled port_offset = 1 base_port = mii_config.port_number + i * mii_config.tensor_parallel + port_offset + if base_port in ports: + base_port = 
max(ports) + 1 tensor_parallel_ports = list( range(base_port, base_port + mii_config.tensor_parallel)) + for i in range(base_port, base_port + mii_config.tensor_parallel): + ports.add(i) torch_dist_port = mii_config.torch_dist_port + i replica_configs.append( ReplicaConfig(task=get_task_name(deployment.task), @@ -186,12 +188,22 @@ def _deploy_aml(deployment_tag, model_name, version): print("Please run 'deploy.sh' to bring your deployment online") -def _allocate_processes(hostfile_path, tensor_parallel, num_replicas): +def _allocate_processes(hostfile_path, tensor_parallel, num_replicas, gpu_index_map=None): resource_pool = fetch_hostfile(hostfile_path) assert resource_pool is not None and len( resource_pool) > 0, f'No hosts found in {hostfile_path}' - + replica_pool = [] + + if gpu_index_map is not None: + assert len(gpu_index_map) == num_replicas, "Number of Hosts must match number of replicas" + for host in gpu_index_map: + assert host in resource_pool, f"Host: {host} was not found" + assert resource_pool[host] >= tensor_parallel, f"Host {host} has {slots} slot(s), but {tensor_parallel} slot(s) are required" + for host in gpu_index_map: + replica_pool.append((host, gpu_index_map[host])) + return replica_pool + allocated_num = 0 for host, slots in resource_pool.items(): available_on_host = slots diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index a77d000b..7ca739aa 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -175,7 +175,7 @@ def __init__(self, task_name, replica_configs): self.stubs[repl.deployment_name] = [] self.counter[repl.deployment_name] = AtomicCounter() - + print(replica_configs) for repl in replica_configs: self.stubs[repl.deployment_name].extend(ParallelStubInvoker(replica.hostname, replica.tensor_parallel_ports) From ab41d24b6445a22e1e171f544c9c4ab16ee333d7 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Wed, 5 Jul 2023 21:36:10 +0000 Subject: [PATCH 27/69] Adding host map for allocating processes and formatting --- examples/multi_model/query.py | 31 +++++++---- examples/multi_model/shutdown.py | 5 ++ .../text-generation-bloom560m-example.py | 26 ++++++---- mii/client.py | 2 +- mii/config.py | 3 +- mii/deployment.py | 51 ++++++++++++------- mii/grpc_related/modelresponse_server.py | 37 +++++++++----- mii/models/score/generate.py | 27 ++++++---- mii/models/score/score_template.py | 12 ++++- mii/server.py | 19 +++---- 10 files changed, 140 insertions(+), 73 deletions(-) diff --git a/examples/multi_model/query.py b/examples/multi_model/query.py index e4bfd8d9..37c55476 100644 --- a/examples/multi_model/query.py +++ b/examples/multi_model/query.py @@ -1,22 +1,35 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import mii import time generator = mii.mii_query_handle("first_test", "bloom560m_deployment") -result = generator.query({"query": ["DeepSpeed is", "Seattle is"]}, do_sample=True, max_new_tokens = 30, deployment_name = "bloom560m_deployment") +result = generator.query({"query": ["DeepSpeed is", + "Seattle is"]}, + do_sample=True, + max_new_tokens=30, + deployment_name="bloom560m_deployment") print(result) time.sleep(5) -generator2 = mii.mii_query_handle("first_test", "microsoft/DialogRPT-human-vs-rand_deployment") -result = generator2.query({'query': "DeepSpeed is the greatest"}, deployment_name = "microsoft/DialogRPT-human-vs-rand_deployment") +generator2 = mii.mii_query_handle("first_test", + "microsoft/DialogRPT-human-vs-rand_deployment") +result = generator2.query({'query': "DeepSpeed is the greatest"}, + deployment_name="microsoft/DialogRPT-human-vs-rand_deployment") print(result) time.sleep(5) generator3 = mii.mii_query_handle("first_test", "microsoft/DialoGPT-large_deployment") -result = generator3.query({'text': "DeepSpeed is the greatest", - 'conversation_id': 3, - 'past_user_inputs': [], - 'generated_responses': [] - }, deployment_name= "microsoft/DialoGPT-large_deployment") +result = generator3.query( + { + 'text': "DeepSpeed is the greatest", + 'conversation_id': 3, + 'past_user_inputs': [], + 'generated_responses': [] + }, + deployment_name="microsoft/DialoGPT-large_deployment") print(result) - diff --git a/examples/multi_model/shutdown.py b/examples/multi_model/shutdown.py index 5f082f2f..11e0b4b9 100644 --- a/examples/multi_model/shutdown.py +++ b/examples/multi_model/shutdown.py @@ -1,2 +1,7 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import mii + mii.terminate("bloom560m_deployment") diff --git a/examples/multi_model/text-generation-bloom560m-example.py b/examples/multi_model/text-generation-bloom560m-example.py index b1d0d6b6..a5dc202c 100644 --- a/examples/multi_model/text-generation-bloom560m-example.py +++ b/examples/multi_model/text-generation-bloom560m-example.py @@ -10,22 +10,30 @@ deployments = [] mii_configs1 = {"tensor_parallel": 2, "dtype": "fp16"} -deployments.append(mii.Deployment(task='text-generation', - model="bigscience/bloom-560m", - deployment_name="bloom560m_deployment", - GPU_index_map=gpu_index_map3, - mii_config=mii.config.MIIConfig(**mii_configs1))) +deployments.append( + mii.Deployment(task='text-generation', + model="bigscience/bloom-560m", + deployment_name="bloom560m_deployment", + GPU_index_map=gpu_index_map3, + mii_config=mii.config.MIIConfig(**mii_configs1))) # gpt2 name = "microsoft/DialogRPT-human-vs-rand" -deployments.append(mii.Deployment(task='text-classification', model=name, deployment_name=name + "_deployment", GPU_index_map=gpu_index_map2)) +deployments.append( + mii.Deployment(task='text-classification', + model=name, + deployment_name=name + "_deployment", + GPU_index_map=gpu_index_map2)) mii_configs2 = {"tensor_parallel": 1} - name = "microsoft/DialoGPT-large" -deployments.append(mii.Deployment(task='conversational', model=name, deployment_name=name + "_deployment", GPU_index_map=gpu_index_map1, mii_config=mii.config.MIIConfig(**mii_configs2))) - +deployments.append( + mii.Deployment(task='conversational', + model=name, + deployment_name=name + "_deployment", + GPU_index_map=gpu_index_map1, + mii_config=mii.config.MIIConfig(**mii_configs2))) mii.deploy(deployment_tag="first_test", deployments=deployments) diff 
--git a/mii/client.py b/mii/client.py index fe884ed1..3a314384 100644 --- a/mii/client.py +++ b/mii/client.py @@ -60,7 +60,7 @@ def __init__(self, task_name, host, port): channel = create_channel(host, port) self.stub = modelresponse_pb2_grpc.ModelResponseStub(channel) self.task = get_task(task_name) - + async def _request_async_response(self, request_dict, **query_kwargs): if self.task not in GRPC_METHOD_TABLE: raise ValueError(f"unknown task: {self.task}") diff --git a/mii/config.py b/mii/config.py index e425a2e6..4eb6b597 100644 --- a/mii/config.py +++ b/mii/config.py @@ -6,7 +6,6 @@ from typing import Union, List from enum import Enum from pydantic import BaseModel, validator, root_validator -from .constants import Tasks from deepspeed.launcher.runner import DLTS_HOSTFILE @@ -125,6 +124,8 @@ class LoadBalancerConfig(BaseModel): class Config: validate_all = True + + validate_assignment = True diff --git a/mii/deployment.py b/mii/deployment.py index 330acd51..95be4c01 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -13,7 +13,7 @@ from .utils import logger, get_task_name, get_provider_name from .models.score import create_score_file from .models import load_models -from .config import ReplicaConfig, LoadBalancerConfig +from .config import ReplicaConfig, LoadBalancerConfig, Deployment def deploy(task=None, @@ -26,7 +26,8 @@ def deploy(task=None, deployment_tag=None, deployments=[], deployment_type=DeploymentType.LOCAL, - model_path=None): + model_path=None, + version=1): """Deploy a task using specified model. For usage examples see: mii/examples/local/text-generation-example.py @@ -69,7 +70,17 @@ def deploy(task=None, """ if not deployments: assert all((model, task, deployment_name)), "model, task, and deployment name must be set to deploy singular model" - deployments = [Deployment(deployment_name, task, model, enable_deepspeed, enable_zero, None, mii_config, ds_config, version)] + deployments = [ + Deployment(deployment_name, + task, + model, + enable_deepspeed, + enable_zero, + None, + mii_config, + ds_config, + version) + ] deployment_tag = deployment_name + "_tag" else: assert deployment_tag is not None, "deployment_tag must be set to deploy multiple models" @@ -96,15 +107,17 @@ def deploy(task=None, if not mii_config.skip_model_check: mii.utils.check_if_task_and_model_is_valid(deployment.task, deployment.model) if enable_deepspeed: - mii.utils.check_if_task_and_model_is_supported(deployment.task, deployment.model) + mii.utils.check_if_task_and_model_is_supported( + deployment.task, + deployment.model) if enable_deepspeed: logger.info( - f"************* MII is using DeepSpeed Optimizations to accelerate your model: {deployment.model} *************" + f"************* MII is using DeepSpeed Optimizations to accelerate your model: {deployment.model} *************" ) else: logger.info( - f"************* DeepSpeed Optimizations not enabled. Please use enable_deepspeed to get better performance for: {deployment.model} *************" + f"************* DeepSpeed Optimizations not enabled. 
Please use enable_deepspeed to get better performance for: {deployment.model} *************" ) # In local deployments use default path if no model path set @@ -115,7 +128,7 @@ def deploy(task=None, # add fields for replica deployment replica_configs = [] - ports = set() + port_map = {} for deployment in deployments: mii_config = deployment.mii_config replica_pool = _allocate_processes(mii_config.hostfile, @@ -125,19 +138,21 @@ def deploy(task=None, for i, (hostname, gpu_indices) in enumerate(replica_pool): # Reserver port for a LB proxy when replication is enabled + if hostname not in port_map: + port_map[hostname] = set() port_offset = 1 base_port = mii_config.port_number + i * mii_config.tensor_parallel + port_offset - if base_port in ports: - base_port = max(ports) + 1 + if base_port in port_map[hostname]: + base_port = max(port_map[hostname]) + 1 tensor_parallel_ports = list( range(base_port, - base_port + mii_config.tensor_parallel)) + base_port + mii_config.tensor_parallel)) for i in range(base_port, base_port + mii_config.tensor_parallel): - ports.add(i) + port_map[hostname].add(i) torch_dist_port = mii_config.torch_dist_port + i replica_configs.append( ReplicaConfig(task=get_task_name(deployment.task), - deployment_name = deployment.deployment_name, + deployment_name=deployment.deployment_name, hostname=hostname, tensor_parallel_ports=tensor_parallel_ports, torch_dist_port=torch_dist_port, @@ -183,23 +198,25 @@ def _deploy_aml(deployment_tag, model_name, version): model_name=model_name, version=version) print( - f"AML deployment assets at {mii.aml_related.utils.aml_output_path(deployment_name)}" + f"AML deployment assets at {mii.aml_related.utils.aml_output_path(deployment_tag)}" ) print("Please run 'deploy.sh' to bring your deployment online") -def _allocate_processes(hostfile_path, tensor_parallel, num_replicas, gpu_index_map=None): +def _allocate_processes(hostfile_path, + tensor_parallel, + num_replicas, + gpu_index_map=None): resource_pool = fetch_hostfile(hostfile_path) assert resource_pool is not None and len( resource_pool) > 0, f'No hosts found in {hostfile_path}' - + replica_pool = [] if gpu_index_map is not None: - assert len(gpu_index_map) == num_replicas, "Number of Hosts must match number of replicas" for host in gpu_index_map: assert host in resource_pool, f"Host: {host} was not found" - assert resource_pool[host] >= tensor_parallel, f"Host {host} has {slots} slot(s), but {tensor_parallel} slot(s) are required" + assert resource_pool[host] >= tensor_parallel, f"Host {host} has {resource_pool[host]} slot(s), but {tensor_parallel} slot(s) are required" for host in gpu_index_map: replica_pool.append((host, gpu_index_map[host])) return replica_pool diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index 7ca739aa..6e97c085 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -16,7 +16,7 @@ from mii.constants import GRPC_MAX_MSG_SIZE, CREATE_SESSION_METHOD, DESTROY_SESSION_METHOD, TERMINATE_METHOD, LB_MAX_WORKER_THREADS, SERVER_SHUTDOWN_TIMEOUT, Tasks from mii.method_table import GRPC_METHOD_TABLE from mii.client import create_channel -from mii.utils import get_task, unpack_proto_query_kwargs, kwarg_dict_to_proto +from mii.utils import get_task, unpack_proto_query_kwargs class ServiceBase(modelresponse_pb2_grpc.ModelResponseServicer): @@ -177,16 +177,18 @@ def __init__(self, task_name, replica_configs): print(replica_configs) for repl in replica_configs: - 
self.stubs[repl.deployment_name].extend(ParallelStubInvoker(replica.hostname, - replica.tensor_parallel_ports) - for replica in replica_configs if replica.deployment_name == repl.deployment_name) + self.stubs[repl.deployment_name].extend( + ParallelStubInvoker(replica.hostname, + replica.tensor_parallel_ports) + for replica in replica_configs + if replica.deployment_name == repl.deployment_name) print(f"\nSTUBS-> {self.stubs}\nCOUNTERS-> {self.counter}") """ self.counter = AtomicCounter() self.task = get_task(task_name) self.replica_sessions = {} """ - + # Start the asyncio loop in a separate thread def run_asyncio_loop(loop): asyncio.set_event_loop(loop) @@ -200,6 +202,7 @@ def choose_stub(self, call_count): def intercept_service(self, continuation, handler_call_details): next_handler = continuation(handler_call_details) assert next_handler.unary_unary is not None + #USE KWARGS LIKE THEY ARE USED TO MAKE SESSIONS TO GET THE DEPLOYMENT NAME TO HASH THE COUNTERS/STUBS def invoke_intercept_method(request_proto, context): @@ -219,20 +222,24 @@ def invoke_intercept_method(request_proto, context): if method_name == "ConversationalReply": request_dict = {} request_dict['text'] = str(request_proto.text) - val = getattr(request_proto, 'conversation_id') + val = getattr(request_proto, 'conversation_id') request_dict['conversation_id'] = int(val) if val is not None else None request_dict['past_user_inputs'] = list(request_proto.past_user_inputs) - request_dict['generated_responses'] = list(request_proto.generated_responses) + request_dict['generated_responses'] = list( + request_proto.generated_responses) new_request = method.pack_request_to_proto(request_dict, **kwargs) elif method_name == "QuestionAndAnswerReply": request_dict = {} request_dict['question'] = str(request_proto.question) - request_dict['context'] = str(requet_proto.context) + request_dict['context'] = str(request_proto.context) new_request = method.pack_request_to_proto(request_dict, **kwargs) else: request_dict = {} - request_dict["query"] = list(request_proto.request) if method_name == "GeneratorReply" or method_name == "Txt2ImgReply" else str(request_proto.request) + request_dict["query"] = list( + request_proto.request + ) if method_name == "GeneratorReply" or method_name == "Txt2ImgReply" else str( + request_proto.request) print(f"HERE request_dict -> {request_dict}\nKWARGS-> {kwargs}") new_request = method.pack_request_to_proto(request_dict, **kwargs) print("done?") @@ -254,12 +261,16 @@ def invoke_intercept_method(request_proto, context): raise ValueError( f"session {request_proto.session_id} already exists") self.replica_sessions[request_proto.session_id] = replica_index - self.stubs[deployment_name][replica_index].invoke(CREATE_SESSION_METHOD, new_request) + self.stubs[deployment_name][replica_index].invoke( + CREATE_SESSION_METHOD, + new_request) return google_dot_protobuf_dot_empty__pb2.Empty() if method_name == DESTROY_SESSION_METHOD: replica_index = self.replica_sessions.pop(request_proto.session_id) - self.stubs[deployment_name][replica_index].invoke(DESTROY_SESSION_METHOD, new_request) + self.stubs[deployment_name][replica_index].invoke( + DESTROY_SESSION_METHOD, + new_request) return google_dot_protobuf_dot_empty__pb2.Empty() if "session_id" in kwargs: @@ -271,7 +282,9 @@ def invoke_intercept_method(request_proto, context): assert new_request is not None, "test" print("ASSERT DONE") print(new_request.query_kwargs) - ret = self.stubs[deployment_name][replica_index].invoke(method_name, new_request) + ret = 
self.stubs[deployment_name][replica_index].invoke( + method_name, + new_request) return ret return grpc.unary_unary_rpc_method_handler( diff --git a/mii/models/score/generate.py b/mii/models/score/generate.py index ecbfeea7..ecd15ffe 100644 --- a/mii/models/score/generate.py +++ b/mii/models/score/generate.py @@ -14,19 +14,26 @@ def create_score_file(deployment_tag, deployments, model_path, lb_config): - + config_dict = {} config_dict[mii.constants.MODEL_PATH_KEY] = model_path config_dict[mii.constants.DEPLOYMENT_TAG_KEY] = deployment_tag for deployment in deployments: config_dict[deployment.deployment_name] = {} - config_dict[deployment.deployment_name][mii.constants.DEPLOYMENT_NAME_KEY] = deployment.deployment_name - config_dict[deployment.deployment_name][mii.constants.TASK_NAME_KEY] = mii.utils.get_task_name(deployment.task) - config_dict[deployment.deployment_name][mii.constants.MODEL_NAME_KEY] = deployment.model - config_dict[deployment.deployment_name][mii.constants.ENABLE_DEEPSPEED_KEY] = deployment.enable_deepspeed - config_dict[deployment.deployment_name][mii.constants.MII_CONFIGS_KEY] = deployment.mii_config.dict() - config_dict[deployment.deployment_name][mii.constants.ENABLE_DEEPSPEED_ZERO_KEY] = deployment.enable_zero - config_dict[deployment.deployment_name][mii.constants.DEEPSPEED_CONFIG_KEY] = deployment.ds_config + config_dict[deployment.deployment_name][ + mii.constants.DEPLOYMENT_NAME_KEY] = deployment.deployment_name + config_dict[deployment.deployment_name][ + mii.constants.TASK_NAME_KEY] = mii.utils.get_task_name(deployment.task) + config_dict[deployment.deployment_name][ + mii.constants.MODEL_NAME_KEY] = deployment.model + config_dict[deployment.deployment_name][ + mii.constants.ENABLE_DEEPSPEED_KEY] = deployment.enable_deepspeed + config_dict[deployment.deployment_name][ + mii.constants.MII_CONFIGS_KEY] = deployment.mii_config.dict() + config_dict[deployment.deployment_name][ + mii.constants.ENABLE_DEEPSPEED_ZERO_KEY] = deployment.enable_zero + config_dict[deployment.deployment_name][ + mii.constants.DEEPSPEED_CONFIG_KEY] = deployment.ds_config if lb_config is not None: config_dict[mii.constants.LOAD_BALANCER_CONFIG_KEY] = lb_config @@ -37,8 +44,8 @@ def create_score_file(deployment_tag, ) with open(os.path.join(mii.__path__[0], - "models/score/score_template.py"), - "r") as fd: + "models/score/score_template.py"), + "r") as fd: score_src = fd.read() # update score file w. 
global config dict diff --git a/mii/models/score/score_template.py b/mii/models/score/score_template.py index 7127a8ee..2faa1ebc 100644 --- a/mii/models/score/score_template.py +++ b/mii/models/score/score_template.py @@ -22,7 +22,17 @@ def init(): if not isinstance(deployment, dict): continue print(f"\nDEPLOYMENT ->{configs.values()}") - data = {'deployment_name': deployment[mii.constants.DEPLOYMENT_NAME_KEY], 'task': deployment[mii.constants.TASK_NAME_KEY], 'model': deployment[mii.constants.MODEL_NAME_KEY], 'enable_deepspeed': deployment[mii.constants.ENABLE_DEEPSPEED_KEY], 'enable_zero': deployment[mii.constants.ENABLE_DEEPSPEED_ZERO_KEY], 'GPU_index_map': None, 'mii_config': deployment[mii.constants.MII_CONFIGS_KEY], 'ds_config': deployment[mii.constants.DEEPSPEED_CONFIG_KEY], 'version': 1} + data = { + 'deployment_name': deployment[mii.constants.DEPLOYMENT_NAME_KEY], + 'task': deployment[mii.constants.TASK_NAME_KEY], + 'model': deployment[mii.constants.MODEL_NAME_KEY], + 'enable_deepspeed': deployment[mii.constants.ENABLE_DEEPSPEED_KEY], + 'enable_zero': deployment[mii.constants.ENABLE_DEEPSPEED_ZERO_KEY], + 'GPU_index_map': None, + 'mii_config': deployment[mii.constants.MII_CONFIGS_KEY], + 'ds_config': deployment[mii.constants.DEEPSPEED_CONFIG_KEY], + 'version': 1 + } deployments.append(mii.Deployment.parse_obj(data)) print(f"WITHIN INIT {deployments}") diff --git a/mii/server.py b/mii/server.py index dc0768b8..76ad9443 100644 --- a/mii/server.py +++ b/mii/server.py @@ -28,11 +28,7 @@ def config_to_b64_str(config): class MIIServer(): '''Initialize the model, setup the server for the model under model_path''' - def __init__(self, - deployment_tag, - deployments, - model_path, - lb_config=None): + def __init__(self, deployment_tag, deployments, model_path, lb_config=None): #mii_configs = mii.config.MIIConfig(**mii_configs) @@ -103,7 +99,7 @@ def _build_server_args(self, port): # serialize mii config b64_config_str = config_to_b64_str(mii_configs) - + task = "" for deployment in self.deployments: if deployment_name == deployment.deployment_name: @@ -272,11 +268,7 @@ def _launch_deepspeed(self, "MII server", ds_launch_str=ds_launch_str) - def _initialize_service(self, - deployment_tag, - deployments, - model_path, - lb_config): + def _initialize_service(self, deployment_tag, deployments, model_path, lb_config): processes = [] @@ -288,7 +280,7 @@ def _initialize_service(self, for i, repl_config in enumerate(lb_config.replica_configs): name = repl_config.deployment_name deployment = None - print (f"IN SERVER NAME -> {name}") + print(f"IN SERVER NAME -> {name}") for dep in deployments: print(f"\nDEPLOYMENT_NAME {dep.deployment_name}") if dep.deployment_name == name: @@ -309,7 +301,8 @@ def _initialize_service(self, hostfile.name, repl_config.hostname, repl_config.tensor_parallel_ports[0], - deployment.mii_config.torch_dist_port + (100 * i) + repl_config.gpu_indices[0], + deployment.mii_config.torch_dist_port + (100 * i) + + repl_config.gpu_indices[0], repl_config.gpu_indices)) # start load balancer here. 
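
A note on the routing added to the load balancer in the patch above: each deployment now keeps its own stub list and its own atomic counter, and requests are spread round-robin over that deployment's replicas. The snippet below is a minimal standalone sketch of that idea, not code from this series; it assumes ReplicaConfig-like objects with deployment_name, hostname and tensor_parallel_ports fields, and the RoundRobinRouter name is hypothetical.

import itertools
import threading


class RoundRobinRouter:
    def __init__(self, replica_configs):
        # group replica endpoints by deployment_name, mirroring the
        # per-deployment stubs built in the interceptor above
        self._replicas = {}
        for repl in replica_configs:
            self._replicas.setdefault(repl.deployment_name, []).append(
                (repl.hostname, repl.tensor_parallel_ports))
        # one counter per deployment so deployments rotate independently
        self._counters = {name: itertools.count() for name in self._replicas}
        self._lock = threading.Lock()

    def choose(self, deployment_name):
        replicas = self._replicas[deployment_name]
        with self._lock:
            call_count = next(self._counters[deployment_name])
        return replicas[call_count % len(replicas)]


# usage sketch inside invoke_intercept_method:
#   host, ports = router.choose(deployment_name)
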
From 8673a9adc7e22b17154a1f6ebb857bc304d41420 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Wed, 5 Jul 2023 22:48:33 +0000 Subject: [PATCH 28/69] Fixing terminate functionality --- examples/multi_model/shutdown.py | 2 +- mii/client.py | 4 ++-- mii/grpc_related/modelresponse_server.py | 17 +++++++++-------- mii/terminate.py | 6 +++--- 4 files changed, 15 insertions(+), 14 deletions(-) diff --git a/examples/multi_model/shutdown.py b/examples/multi_model/shutdown.py index 11e0b4b9..4de143e6 100644 --- a/examples/multi_model/shutdown.py +++ b/examples/multi_model/shutdown.py @@ -4,4 +4,4 @@ # DeepSpeed Team import mii -mii.terminate("bloom560m_deployment") +mii.terminate("first_test", "bloom560m_deployment") diff --git a/mii/client.py b/mii/client.py index 3a314384..b58049c7 100644 --- a/mii/client.py +++ b/mii/client.py @@ -188,7 +188,7 @@ def terminate(self): del mii.non_persistent_models[self.deployment_name] -def terminate_restful_gateway(deployment_name): - _, mii_configs = _get_deployment_info(deployment_name) +def terminate_restful_gateway(deployment_tag, deployment_name): + _, mii_configs = _get_deployment_info(deployment_tag, deployment_name) if mii_configs.enable_restful_api: requests.get(f"http://localhost:{mii_configs.restful_api_port}/terminate") diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index 6e97c085..441faffd 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -207,8 +207,16 @@ def intercept_service(self, continuation, handler_call_details): def invoke_intercept_method(request_proto, context): method_name = _get_grpc_method_name(handler_call_details.method) + if method_name == TERMINATE_METHOD: + for deployment in self.stubs: + for stub in self.stubs[deployment]: + stub.invoke(TERMINATE_METHOD, + google_dot_protobuf_dot_empty__pb2.Empty()) + self.asyncio_loop.call_soon_threadsafe(self.asyncio_loop.stop) + return next_handler.unary_unary(request_proto, context) kwargs = unpack_proto_query_kwargs(request_proto.query_kwargs) - assert "deployment_name" in kwargs, "Must include deployment_name in kwargs for query" + if method_name != TERMINATE_METHOD: + assert "deployment_name" in kwargs, "Must include deployment_name in kwargs for query" deployment_name = kwargs.get('deployment_name') kwargs.pop('deployment_name', None) task = None @@ -246,13 +254,6 @@ def invoke_intercept_method(request_proto, context): print(f"\nDEPLOYMENT NAME WITHIN INTERCEPTOR -> {deployment_name}") - if method_name == TERMINATE_METHOD: - for stub in self.stubs: - stub.invoke(TERMINATE_METHOD, - google_dot_protobuf_dot_empty__pb2.Empty()) - self.asyncio_loop.call_soon_threadsafe(self.asyncio_loop.stop) - return next_handler.unary_unary(new_request, context) - call_count = self.counter[deployment_name].get_and_increment() replica_index = call_count % len(self.stubs[deployment_name]) diff --git a/mii/terminate.py b/mii/terminate.py index 167c5a5a..94fa7a77 100644 --- a/mii/terminate.py +++ b/mii/terminate.py @@ -7,9 +7,9 @@ import mii -def terminate(deployment_name): +def terminate(deployment_tag, deployment_name): mii.utils.logger.info(f"Terminating server for {deployment_name}") - generator = mii.mii_query_handle(deployment_name) + generator = mii.mii_query_handle(deployment_tag, deployment_name) if (deployment_name in mii.non_persistent_models): generator.terminate() return @@ -24,4 +24,4 @@ def terminate(deployment_name): pass generator.terminate() - 
mii.client.terminate_restful_gateway(deployment_name) + mii.client.terminate_restful_gateway(deployment_tag, deployment_name) From 8d09b3757ba1327247979f1f8bb414bfa3ee1aa1 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Thu, 6 Jul 2023 18:27:32 +0000 Subject: [PATCH 29/69] Refactored client --- examples/multi_model/query.py | 25 +++++++------- mii/client.py | 64 ++++++++++++++++++++++++++--------- 2 files changed, 60 insertions(+), 29 deletions(-) diff --git a/examples/multi_model/query.py b/examples/multi_model/query.py index 37c55476..2e06f159 100644 --- a/examples/multi_model/query.py +++ b/examples/multi_model/query.py @@ -6,30 +6,29 @@ import mii import time -generator = mii.mii_query_handle("first_test", "bloom560m_deployment") -result = generator.query({"query": ["DeepSpeed is", - "Seattle is"]}, - do_sample=True, - max_new_tokens=30, - deployment_name="bloom560m_deployment") +generator = mii.mii_query_handle("first_test") +result = generator.query( + {"query": ["DeepSpeed is", + "Seattle is"]}, + "bloom560m_deployment", + do_sample=True, + max_new_tokens=30, +) print(result) time.sleep(5) -generator2 = mii.mii_query_handle("first_test", - "microsoft/DialogRPT-human-vs-rand_deployment") -result = generator2.query({'query': "DeepSpeed is the greatest"}, - deployment_name="microsoft/DialogRPT-human-vs-rand_deployment") +result = generator.query({'query': "DeepSpeed is the greatest"}, + "microsoft/DialogRPT-human-vs-rand_deployment") print(result) time.sleep(5) -generator3 = mii.mii_query_handle("first_test", "microsoft/DialoGPT-large_deployment") -result = generator3.query( +result = generator.query( { 'text': "DeepSpeed is the greatest", 'conversation_id': 3, 'past_user_inputs': [], 'generated_responses': [] }, - deployment_name="microsoft/DialoGPT-large_deployment") + "microsoft/DialoGPT-large_deployment") print(result) diff --git a/mii/client.py b/mii/client.py index b58049c7..e4e7ed50 100644 --- a/mii/client.py +++ b/mii/client.py @@ -12,17 +12,28 @@ from mii.method_table import GRPC_METHOD_TABLE -def _get_deployment_info(deployment_tag, deployment_name): +def _get_deployment_info(deployment_tag): + deployments = [] configs = mii.utils.import_score_file(deployment_tag).configs + for deployment in configs: + if not isinstance(configs[deployment], dict): + continue + print("here") + deployments.append(configs[deployment]) + mii_configs_dict = configs[deployment][mii.constants.MII_CONFIGS_KEY] + mii_configs = mii.config.MIIConfig(**mii_configs_dict) + return deployments + """ task = configs[deployment_name][mii.constants.TASK_NAME_KEY] mii_configs_dict = configs[deployment_name][mii.constants.MII_CONFIGS_KEY] mii_configs = mii.config.MIIConfig(**mii_configs_dict) assert task is not None, "The task name should be set before calling init" return task, mii_configs + """ -def mii_query_handle(deployment_tag, deployment_name): +def mii_query_handle(deployment_tag, deployment_name=None): """Get a query handle for a local deployment: mii/examples/local/gpt2-query-example.py @@ -35,12 +46,15 @@ def mii_query_handle(deployment_tag, deployment_name): query_handle: A query handle with a single method `.query(request_dictionary)` using which queries can be sent to the model. 
""" - if deployment_name in mii.non_persistent_models: + if deployment_name is not None and deployment_name in mii.non_persistent_models: inference_pipeline, task = mii.non_persistent_models[deployment_name] return MIINonPersistentClient(task, deployment_name) - task_name, mii_configs = _get_deployment_info(deployment_tag, deployment_name) - return MIIClient(task_name, "localhost", mii_configs.port_number) + deployments = _get_deployment_info(deployment_tag) + print(deployments) + mii_configs_dict = deployments[0][mii.constants.MII_CONFIGS_KEY] + mii_configs = mii.config.MIIConfig(**mii_configs_dict) + return MIIClient(deployments, "localhost", mii_configs.port_number) def create_channel(host, port): @@ -55,24 +69,32 @@ class MIIClient(): """ Client to send queries to a single endpoint. """ - def __init__(self, task_name, host, port): + def __init__(self, deployments, host, port): self.asyncio_loop = asyncio.get_event_loop() channel = create_channel(host, port) self.stub = modelresponse_pb2_grpc.ModelResponseStub(channel) - self.task = get_task(task_name) + #self.task = get_task(task_name) + self.deployments = deployments - async def _request_async_response(self, request_dict, **query_kwargs): - if self.task not in GRPC_METHOD_TABLE: - raise ValueError(f"unknown task: {self.task}") + async def _request_async_response(self, request_dict, task, **query_kwargs): + if task not in GRPC_METHOD_TABLE: + raise ValueError(f"unknown task: {task}") - task_methods = GRPC_METHOD_TABLE[self.task] + task_methods = GRPC_METHOD_TABLE[task] proto_request = task_methods.pack_request_to_proto(request_dict, **query_kwargs) proto_response = await getattr(self.stub, task_methods.method)(proto_request) return task_methods.unpack_response_from_proto(proto_response) - def query(self, request_dict, **query_kwargs): + def query(self, request_dict, deployment_name, **query_kwargs): + task = None + for deployment in self.deployments: + if deployment[mii.constants.DEPLOYMENT_NAME_KEY] == deployment_name: + task = get_task(deployment[mii.constants.TASK_NAME_KEY]) + break + query_kwargs['deployment_name'] = deployment_name return self.asyncio_loop.run_until_complete( self._request_async_response(request_dict, + task, **query_kwargs)) async def terminate_async(self): @@ -86,8 +108,13 @@ async def create_session_async(self, session_id): return await self.stub.CreateSession( modelresponse_pb2.SessionID(session_id=session_id)) - def create_session(self, session_id): - assert self.task == Tasks.TEXT_GENERATION, f"Session creation only available for task '{Tasks.TEXT_GENERATION}'." + def create_session(self, session_id, deployment_name): + task = None + for deployment in self.deployments: + if deployment[mii.constants.DEPLOYMENT_NAME_KEY] == deployment_name: + task = get_task(deployment[mii.constants.TASK_NAME_KEY]) + break + assert task == Tasks.TEXT_GENERATION, f"Session creation only available for task '{Tasks.TEXT_GENERATION}'." return self.asyncio_loop.run_until_complete( self.create_session_async(session_id)) @@ -95,8 +122,13 @@ async def destroy_session_async(self, session_id): await self.stub.DestroySession(modelresponse_pb2.SessionID(session_id=session_id) ) - def destroy_session(self, session_id): - assert self.task == Tasks.TEXT_GENERATION, f"Session deletion only available for task '{Tasks.TEXT_GENERATION}'." 
+ def destroy_session(self, session_id, deployment_name): + task = None + for deployment in self.deployments: + if deployment[mii.constants.DEPLOYMENT_NAME_KEY] == deployment_name: + task = get_task(deployment[mii.constants.TASK_NAME_KEY]) + break + assert task == Tasks.TEXT_GENERATION, f"Session deletion only available for task '{Tasks.TEXT_GENERATION}'." self.asyncio_loop.run_until_complete(self.destroy_session_async(session_id)) From 7a136d6a2d88390690323b5874fa719e32649f48 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Thu, 6 Jul 2023 21:11:09 +0000 Subject: [PATCH 30/69] More Refactoring and q/a example --- examples/multi_model/query.py | 8 ++++++++ examples/multi_model/shutdown.py | 2 +- .../text-generation-bloom560m-example.py | 7 +++++++ mii/client.py | 19 +++++++++++-------- mii/deployment.py | 2 +- mii/terminate.py | 14 +++++++------- 6 files changed, 35 insertions(+), 17 deletions(-) diff --git a/examples/multi_model/query.py b/examples/multi_model/query.py index 2e06f159..377dcf1f 100644 --- a/examples/multi_model/query.py +++ b/examples/multi_model/query.py @@ -32,3 +32,11 @@ }, "microsoft/DialoGPT-large_deployment") print(result) + +results = generator.query( + { + 'question': "What is the greatest?", + 'context': "DeepSpeed is the greatest" + }, + "deepset/roberta-large-squad2" + "-qa-deployment") +print(results) diff --git a/examples/multi_model/shutdown.py b/examples/multi_model/shutdown.py index 4de143e6..281389c4 100644 --- a/examples/multi_model/shutdown.py +++ b/examples/multi_model/shutdown.py @@ -4,4 +4,4 @@ # DeepSpeed Team import mii -mii.terminate("first_test", "bloom560m_deployment") +mii.terminate("first_test") diff --git a/examples/multi_model/text-generation-bloom560m-example.py b/examples/multi_model/text-generation-bloom560m-example.py index a5dc202c..f070195e 100644 --- a/examples/multi_model/text-generation-bloom560m-example.py +++ b/examples/multi_model/text-generation-bloom560m-example.py @@ -36,4 +36,11 @@ GPU_index_map=gpu_index_map1, mii_config=mii.config.MIIConfig(**mii_configs2))) +name = "deepset/roberta-large-squad2" +deployments.append( + mii.Deployment(task="question-answering", + model=name, + deployment_name=name + "-qa-deployment", + GPU_index_map=gpu_index_map2)) + mii.deploy(deployment_tag="first_test", deployments=deployments) diff --git a/mii/client.py b/mii/client.py index e4e7ed50..2a9e2f1a 100644 --- a/mii/client.py +++ b/mii/client.py @@ -18,7 +18,6 @@ def _get_deployment_info(deployment_tag): for deployment in configs: if not isinstance(configs[deployment], dict): continue - print("here") deployments.append(configs[deployment]) mii_configs_dict = configs[deployment][mii.constants.MII_CONFIGS_KEY] mii_configs = mii.config.MIIConfig(**mii_configs_dict) @@ -33,7 +32,7 @@ def _get_deployment_info(deployment_tag): """ -def mii_query_handle(deployment_tag, deployment_name=None): +def mii_query_handle(deployment_tag): """Get a query handle for a local deployment: mii/examples/local/gpt2-query-example.py @@ -46,12 +45,11 @@ def mii_query_handle(deployment_tag, deployment_name=None): query_handle: A query handle with a single method `.query(request_dictionary)` using which queries can be sent to the model. 
""" - if deployment_name is not None and deployment_name in mii.non_persistent_models: + if deployment_tag in mii.non_persistent_models: inference_pipeline, task = mii.non_persistent_models[deployment_name] return MIINonPersistentClient(task, deployment_name) deployments = _get_deployment_info(deployment_tag) - print(deployments) mii_configs_dict = deployments[0][mii.constants.MII_CONFIGS_KEY] mii_configs = mii.config.MIIConfig(**mii_configs_dict) return MIIClient(deployments, "localhost", mii_configs.port_number) @@ -86,6 +84,8 @@ async def _request_async_response(self, request_dict, task, **query_kwargs): return task_methods.unpack_response_from_proto(proto_response) def query(self, request_dict, deployment_name, **query_kwargs): + if deployment_name is None: #mii.terminate() + return len(self.deployments) task = None for deployment in self.deployments: if deployment[mii.constants.DEPLOYMENT_NAME_KEY] == deployment_name: @@ -220,7 +220,10 @@ def terminate(self): del mii.non_persistent_models[self.deployment_name] -def terminate_restful_gateway(deployment_tag, deployment_name): - _, mii_configs = _get_deployment_info(deployment_tag, deployment_name) - if mii_configs.enable_restful_api: - requests.get(f"http://localhost:{mii_configs.restful_api_port}/terminate") +def terminate_restful_gateway(deployment_tag): + deployments = _get_deployment_info(deployment_tag) + for deployment in deployments: + mii_configs_dict = deployment[mii.constants.MII_CONFIGS_KEY] + mii_configs = mii.config.MIIConfig(**mii_configs_dict) + if mii_configs.enable_restful_api: + requests.get(f"http://localhost:{mii_configs.restful_api_port}/terminate") diff --git a/mii/deployment.py b/mii/deployment.py index 95be4c01..4d638537 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -81,7 +81,7 @@ def deploy(task=None, ds_config, version) ] - deployment_tag = deployment_name + "_tag" + deployment_tag = deployment_name else: assert deployment_tag is not None, "deployment_tag must be set to deploy multiple models" diff --git a/mii/terminate.py b/mii/terminate.py index 94fa7a77..0a2b82b4 100644 --- a/mii/terminate.py +++ b/mii/terminate.py @@ -7,21 +7,21 @@ import mii -def terminate(deployment_tag, deployment_name): - mii.utils.logger.info(f"Terminating server for {deployment_name}") - generator = mii.mii_query_handle(deployment_tag, deployment_name) - if (deployment_name in mii.non_persistent_models): +def terminate(deployment_tag): + mii.utils.logger.info(f"Terminating server for {deployment_tag}") + generator = mii.mii_query_handle(deployment_tag) + if (deployment_tag in mii.non_persistent_models): generator.terminate() return try: - generator.query({'query': ''}) + generator.query({'query': ''}, None) except grpc.aio._call.AioRpcError as error: if error._code == grpc.StatusCode.UNAVAILABLE: - mii.utils.logger.warn(f"Server for {deployment_name} not found") + mii.utils.logger.warn(f"Server for {deployment_tag} not found") else: pass except (KeyError, TypeError) as error: pass generator.terminate() - mii.client.terminate_restful_gateway(deployment_tag, deployment_name) + mii.client.terminate_restful_gateway(deployment_tag) From 2c6ec08299262a9d0954541f56e4b1d3b9020e6a Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Thu, 6 Jul 2023 21:55:48 +0000 Subject: [PATCH 31/69] Reformatting to maintain previous syntax --- examples/multi_model/query.py | 2 +- mii/client.py | 48 +++++++++++++++++++++-------------- mii/deployment.py | 18 ++++++------- 3 files changed, 39 insertions(+), 29 deletions(-) diff --git 
a/examples/multi_model/query.py b/examples/multi_model/query.py index 377dcf1f..2be15c8b 100644 --- a/examples/multi_model/query.py +++ b/examples/multi_model/query.py @@ -37,6 +37,6 @@ { 'question': "What is the greatest?", 'context': "DeepSpeed is the greatest" - }, + }, "deepset/roberta-large-squad2" + "-qa-deployment") print(results) diff --git a/mii/client.py b/mii/client.py index 2a9e2f1a..2282fc27 100644 --- a/mii/client.py +++ b/mii/client.py @@ -46,8 +46,8 @@ def mii_query_handle(deployment_tag): """ if deployment_tag in mii.non_persistent_models: - inference_pipeline, task = mii.non_persistent_models[deployment_name] - return MIINonPersistentClient(task, deployment_name) + inference_pipeline, task = mii.non_persistent_models[deployment_tag] + return MIINonPersistentClient(task, deployment_tag) deployments = _get_deployment_info(deployment_tag) mii_configs_dict = deployments[0][mii.constants.MII_CONFIGS_KEY] @@ -83,14 +83,16 @@ async def _request_async_response(self, request_dict, task, **query_kwargs): proto_response = await getattr(self.stub, task_methods.method)(proto_request) return task_methods.unpack_response_from_proto(proto_response) - def query(self, request_dict, deployment_name, **query_kwargs): - if deployment_name is None: #mii.terminate() - return len(self.deployments) + def query(self, request_dict, deployment_name=None, **query_kwargs): task = None - for deployment in self.deployments: - if deployment[mii.constants.DEPLOYMENT_NAME_KEY] == deployment_name: - task = get_task(deployment[mii.constants.TASK_NAME_KEY]) - break + if deployment_name is None: #mii.terminate() or single model + deployment_name = self.deployments[0][mii.constants.DEPLOYMENT_NAME_KEY] + task = get_task(self.deployments[0][mii.constants.TASK_NAME_KEY]) + else: + for deployment in self.deployments: + if deployment[mii.constants.DEPLOYMENT_NAME_KEY] == deployment_name: + task = get_task(deployment[mii.constants.TASK_NAME_KEY]) + break query_kwargs['deployment_name'] = deployment_name return self.asyncio_loop.run_until_complete( self._request_async_response(request_dict, @@ -108,12 +110,16 @@ async def create_session_async(self, session_id): return await self.stub.CreateSession( modelresponse_pb2.SessionID(session_id=session_id)) - def create_session(self, session_id, deployment_name): + def create_session(self, session_id, deployment_name=None): task = None - for deployment in self.deployments: - if deployment[mii.constants.DEPLOYMENT_NAME_KEY] == deployment_name: - task = get_task(deployment[mii.constants.TASK_NAME_KEY]) - break + if deployment_name is None: #mii.terminate() or single model + deployment_name = self.deployments[0][mii.constants.DEPLOYMENT_NAME_KEY] + task = get_task(self.deployments[0][mii.constants.TASK_NAME_KEY]) + else: + for deployment in self.deployments: + if deployment[mii.constants.DEPLOYMENT_NAME_KEY] == deployment_name: + task = get_task(deployment[mii.constants.TASK_NAME_KEY]) + break assert task == Tasks.TEXT_GENERATION, f"Session creation only available for task '{Tasks.TEXT_GENERATION}'." 
return self.asyncio_loop.run_until_complete( self.create_session_async(session_id)) @@ -122,12 +128,16 @@ async def destroy_session_async(self, session_id): await self.stub.DestroySession(modelresponse_pb2.SessionID(session_id=session_id) ) - def destroy_session(self, session_id, deployment_name): + def destroy_session(self, session_id, deployment_name=None): task = None - for deployment in self.deployments: - if deployment[mii.constants.DEPLOYMENT_NAME_KEY] == deployment_name: - task = get_task(deployment[mii.constants.TASK_NAME_KEY]) - break + if deployment_name is None: #mii.terminate() or single model + deployment_name = self.deployments[0][mii.constants.DEPLOYMENT_NAME_KEY] + task = get_task(self.deployments[0][mii.constants.TASK_NAME_KEY]) + else: + for deployment in self.deployments: + if deployment[mii.constants.DEPLOYMENT_NAME_KEY] == deployment_name: + task = get_task(deployment[mii.constants.TASK_NAME_KEY]) + break assert task == Tasks.TEXT_GENERATION, f"Session deletion only available for task '{Tasks.TEXT_GENERATION}'." self.asyncio_loop.run_until_complete(self.destroy_session_async(session_id)) diff --git a/mii/deployment.py b/mii/deployment.py index 4d638537..986186f3 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -71,15 +71,15 @@ def deploy(task=None, if not deployments: assert all((model, task, deployment_name)), "model, task, and deployment name must be set to deploy singular model" deployments = [ - Deployment(deployment_name, - task, - model, - enable_deepspeed, - enable_zero, - None, - mii_config, - ds_config, - version) + Deployment(deployment_name=deployment_name, + task=task, + model=model, + enable_deepspeed=enable_deepspeed, + enable_zero=enable_zero, + GPU_index_map=None, + mii_config=mii.config.MIIConfig(**mii_config), + ds_config=ds_config, + version=version) ] deployment_tag = deployment_name else: From 0cb88a9f2696d4a4aae0231af72eb4df1503b969 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Thu, 6 Jul 2023 22:01:06 +0000 Subject: [PATCH 32/69] Removing print/debug statements --- mii/grpc_related/modelresponse_server.py | 9 --------- mii/method_table.py | 3 --- mii/models/score/score_template.py | 3 --- mii/server.py | 2 -- 4 files changed, 17 deletions(-) diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index 441faffd..0531f68a 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -175,14 +175,12 @@ def __init__(self, task_name, replica_configs): self.stubs[repl.deployment_name] = [] self.counter[repl.deployment_name] = AtomicCounter() - print(replica_configs) for repl in replica_configs: self.stubs[repl.deployment_name].extend( ParallelStubInvoker(replica.hostname, replica.tensor_parallel_ports) for replica in replica_configs if replica.deployment_name == repl.deployment_name) - print(f"\nSTUBS-> {self.stubs}\nCOUNTERS-> {self.counter}") """ self.counter = AtomicCounter() self.task = get_task(task_name) @@ -224,7 +222,6 @@ def invoke_intercept_method(request_proto, context): if repl.deployment_name == deployment_name: task = repl.task break - print(f"\nTASK ->{task}\nMETHOD NAME-> {method_name}") method = GRPC_METHOD_TABLE[get_task(task)] new_request = None if method_name == "ConversationalReply": @@ -248,11 +245,7 @@ def invoke_intercept_method(request_proto, context): request_proto.request ) if method_name == "GeneratorReply" or method_name == "Txt2ImgReply" else str( request_proto.request) - print(f"HERE request_dict -> {request_dict}\nKWARGS-> 
{kwargs}") new_request = method.pack_request_to_proto(request_dict, **kwargs) - print("done?") - - print(f"\nDEPLOYMENT NAME WITHIN INTERCEPTOR -> {deployment_name}") call_count = self.counter[deployment_name].get_and_increment() replica_index = call_count % len(self.stubs[deployment_name]) @@ -281,8 +274,6 @@ def invoke_intercept_method(request_proto, context): replica_index = self.replica_sessions[session_id] assert new_request is not None, "test" - print("ASSERT DONE") - print(new_request.query_kwargs) ret = self.stubs[deployment_name][replica_index].invoke( method_name, new_request) diff --git a/mii/method_table.py b/mii/method_table.py index 8dfea390..c412f446 100644 --- a/mii/method_table.py +++ b/mii/method_table.py @@ -23,9 +23,6 @@ def single_string_response_to_proto(self, response, time_taken, model_time_taken def multi_string_request_to_proto(self, request_dict, **query_kwargs): - temp = kwarg_dict_to_proto(query_kwargs) - print(f"FINE {temp}\nrd->{request_dict}") - print(isinstance(request_dict['query'], list)) return modelresponse_pb2.MultiStringRequest( request=request_dict['query'] if isinstance(request_dict['query'], list) else [request_dict['query']], diff --git a/mii/models/score/score_template.py b/mii/models/score/score_template.py index 2faa1ebc..94eb6ca8 100644 --- a/mii/models/score/score_template.py +++ b/mii/models/score/score_template.py @@ -21,7 +21,6 @@ def init(): for deployment in configs.values(): if not isinstance(deployment, dict): continue - print(f"\nDEPLOYMENT ->{configs.values()}") data = { 'deployment_name': deployment[mii.constants.DEPLOYMENT_NAME_KEY], 'task': deployment[mii.constants.TASK_NAME_KEY], @@ -34,8 +33,6 @@ def init(): 'version': 1 } deployments.append(mii.Deployment.parse_obj(data)) - - print(f"WITHIN INIT {deployments}") """ deployment_name = configs[mii.constants.DEPLOYMENT_NAME_KEY] model_name = configs[mii.constants.MODEL_NAME_KEY] diff --git a/mii/server.py b/mii/server.py index 76ad9443..3ac9ee08 100644 --- a/mii/server.py +++ b/mii/server.py @@ -280,9 +280,7 @@ def _initialize_service(self, deployment_tag, deployments, model_path, lb_config for i, repl_config in enumerate(lb_config.replica_configs): name = repl_config.deployment_name deployment = None - print(f"IN SERVER NAME -> {name}") for dep in deployments: - print(f"\nDEPLOYMENT_NAME {dep.deployment_name}") if dep.deployment_name == name: deployment = dep hostfile = tempfile.NamedTemporaryFile(delete=False) From 7c0ee125f94856227d9719ee2f1023a116fd6e1e Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Thu, 6 Jul 2023 22:44:18 +0000 Subject: [PATCH 33/69] Fixing non-persistent deloyments --- mii/deployment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mii/deployment.py b/mii/deployment.py index 986186f3..2fe30830 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -175,7 +175,7 @@ def deploy(task=None, assert int(os.getenv('WORLD_SIZE', '1')) == mii_config.tensor_parallel, "World Size does not equal number of tensors. 
When using non-persistent deployment type, please launch with `deepspeed --num_gpus `" provider = MODEL_PROVIDER_MAP[get_provider_name(model, task)] mii.non_persistent_models[deployment_name] = (load_models( - get_task_name(task), + task, model, model_path, enable_deepspeed, From 7a956d5260a38537f4dc7c9dc2449dd231fe787b Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Fri, 7 Jul 2023 22:34:16 +0000 Subject: [PATCH 34/69] Refactoring Load balancer launch --- mii/client.py | 1 + mii/grpc_related/modelresponse_server.py | 7 +++--- mii/launch/load_balance_server.py | 31 ++++++++++++++++++++++++ mii/launch/multi_gpu_server.py | 12 +-------- mii/launch/utils.py | 15 ++++++++++++ mii/server.py | 29 ++++++++-------------- 6 files changed, 61 insertions(+), 34 deletions(-) create mode 100644 mii/launch/load_balance_server.py create mode 100644 mii/launch/utils.py diff --git a/mii/client.py b/mii/client.py index 2282fc27..cc25b77b 100644 --- a/mii/client.py +++ b/mii/client.py @@ -86,6 +86,7 @@ async def _request_async_response(self, request_dict, task, **query_kwargs): def query(self, request_dict, deployment_name=None, **query_kwargs): task = None if deployment_name is None: #mii.terminate() or single model + #assert len(self.deployments) == 1, "Must pass deployment_name to query when using multiple deployments" deployment_name = self.deployments[0][mii.constants.DEPLOYMENT_NAME_KEY] task = get_task(self.deployments[0][mii.constants.TASK_NAME_KEY]) else: diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index 0531f68a..01bc1310 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -164,7 +164,7 @@ def invoke(self, method_name, proto_request): class LoadBalancingInterceptor(grpc.ServerInterceptor): - def __init__(self, task_name, replica_configs): + def __init__(self, replica_configs): super().__init__() self.asyncio_loop = asyncio.get_event_loop() @@ -306,11 +306,10 @@ def serve_inference(inference_pipeline, port): _do_serve(ModelResponse(inference_pipeline), port) -def serve_load_balancing(task_name, lb_config): +def serve_load_balancing(lb_config): _do_serve(ServiceBase(), lb_config.port, - [LoadBalancingInterceptor(task_name, - lb_config.replica_configs)]) + [LoadBalancingInterceptor(lb_config.replica_configs)]) if __name__ == '__main__': diff --git a/mii/launch/load_balance_server.py b/mii/launch/load_balance_server.py new file mode 100644 index 00000000..01de3822 --- /dev/null +++ b/mii/launch/load_balance_server.py @@ -0,0 +1,31 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +import argparse + +from mii import LoadBalancerConfig + +from mii.grpc_related.modelresponse_server import serve_load_balancing +from .utils import decode_config_from_str + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--load-balancer", + type=str, + default=None, + help="base64 encoded load balancer config") + + args = parser.parse_args() + assert args.load_balancer is not None, "lb_config required to use load balancer" + lb_config_dict = decode_config_from_str(args.load_balancer) + lb_config = LoadBalancerConfig(**lb_config_dict) + + print(f"Starting load balancer on port: {lb_config.port}") + serve_load_balancing(lb_config) + + +if __name__ == "__main__": + # python -m mii.launch.load_balance_server + main() diff --git a/mii/launch/multi_gpu_server.py b/mii/launch/multi_gpu_server.py index 27878725..1f7fc00a 100644 --- a/mii/launch/multi_gpu_server.py +++ b/mii/launch/multi_gpu_server.py @@ -5,23 +5,13 @@ import os import argparse import mii -import base64 -import json from mii import MIIConfig, LoadBalancerConfig from mii.models.load_models import load_models from mii.grpc_related.modelresponse_server import serve_inference, serve_load_balancing from mii.grpc_related.restful_gateway import RestfulGatewayThread - - -def decode_config_from_str(config_str): - # str -> bytes - b64_bytes = config_str.encode() - # decode b64 bytes -> json bytes - config_bytes = base64.urlsafe_b64decode(b64_bytes) - # convert json bytes -> str -> dict - return json.loads(config_bytes.decode()) +from .utils import decode_config_from_str def main(): diff --git a/mii/launch/utils.py b/mii/launch/utils.py new file mode 100644 index 00000000..9e039409 --- /dev/null +++ b/mii/launch/utils.py @@ -0,0 +1,15 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +import base64 +import json + + +def decode_config_from_str(config_str): + # str -> bytes + b64_bytes = config_str.encode() + # decode b64 bytes -> json bytes + config_bytes = base64.urlsafe_b64decode(b64_bytes) + # convert json bytes -> str -> dict + return json.loads(config_bytes.decode()) diff --git a/mii/server.py b/mii/server.py index 3ac9ee08..ceaf2912 100644 --- a/mii/server.py +++ b/mii/server.py @@ -152,19 +152,17 @@ def print_helper(self, args): printable_string += " " + "-" * 60 return printable_string - def _launch_load_balancer(self, - deployment_name, - model_name, - model_path, - ds_optimize, - ds_zero, - ds_config, - mii_configs, - lb_config): + def _launch_load_balancer(self, model_path, lb_config): # serialize mii config b64_config_str = config_to_b64_str(lb_config) - + launch_str = f"{sys.executable} -m mii.launch.load_balance_server --load-balancer {b64_config_str}" + cmd = launch_str.split(" ") + mii_env = os.environ.copy() + mii_env["TRANSFORMERS_CACHE"] = model_path + logger.info(f"load balancer server launch: {cmd}") + return subprocess.Popen(cmd, env=mii_env) + """ return self._launch_server_process( deployment_name, model_name, @@ -176,6 +174,7 @@ def _launch_load_balancer(self, mii_configs.port_number, "load balancer", ex_server_args=[f"--load-balancer {b64_config_str}"]) + """ def _launch_restful_gateway(self, deployment_name, @@ -307,15 +306,7 @@ def _initialize_service(self, deployment_tag, deployments, model_path, lb_config # we don't use deepspeed launcher for the load balancer because it does not need a GPU. 
# The deepspeed launcher determines the number of processes to launch based on GPUs available on the host or CUDA_VISIBLE_DEVICES, # and it is expected to assign one GPU to one process. - processes.append( - self._launch_load_balancer(self.deployments[0].deployment_name, - self.deployments[0].model, - model_path, - self.deployments[0].enable_deepspeed, - self.deployments[0].enable_zero, - self.deployments[0].ds_config, - self.deployments[0].mii_config, - lb_config)) + processes.append(self._launch_load_balancer(model_path, lb_config)) for deployment in self.deployments: if deployment.mii_config.enable_restful_api: From f8cfe28f0a0edf8f23b6b2b17f0b01ea9e5c8a0d Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Mon, 10 Jul 2023 20:52:35 +0000 Subject: [PATCH 35/69] Fixing restful gateway client --- mii/client.py | 4 ++-- mii/grpc_related/restful_gateway.py | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/mii/client.py b/mii/client.py index cc25b77b..a33b71ba 100644 --- a/mii/client.py +++ b/mii/client.py @@ -87,8 +87,8 @@ def query(self, request_dict, deployment_name=None, **query_kwargs): task = None if deployment_name is None: #mii.terminate() or single model #assert len(self.deployments) == 1, "Must pass deployment_name to query when using multiple deployments" - deployment_name = self.deployments[0][mii.constants.DEPLOYMENT_NAME_KEY] - task = get_task(self.deployments[0][mii.constants.TASK_NAME_KEY]) + deployment_name = self.deployments[0]['deployment_name'] + task = get_task(self.deployments[0]['task_name']) else: for deployment in self.deployments: if deployment[mii.constants.DEPLOYMENT_NAME_KEY] == deployment_name: diff --git a/mii/grpc_related/restful_gateway.py b/mii/grpc_related/restful_gateway.py index e8cfa934..d3dc53da 100644 --- a/mii/grpc_related/restful_gateway.py +++ b/mii/grpc_related/restful_gateway.py @@ -19,7 +19,9 @@ def shutdown(thread): def createRestfulGatewayApp(deployment_name, task, mii_config, server_thread): # client must be thread-safe - client = mii.MIIClient(task, "localhost", mii_config.port_number) + client = mii.mii_query_handle(deployment_name) + + #client = mii.MIIClient(deployment_name, "localhost", mii_config.port_number) class RestfulGatewayService(Resource): def __init__(self): From 079807d8255f27119d3007f1fb62861645ec9c41 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Mon, 10 Jul 2023 22:23:53 +0000 Subject: [PATCH 36/69] Fixing replica issue --- mii/grpc_related/modelresponse_server.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index 01bc1310..9fb4608f 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -176,11 +176,9 @@ def __init__(self, replica_configs): self.counter[repl.deployment_name] = AtomicCounter() for repl in replica_configs: - self.stubs[repl.deployment_name].extend( - ParallelStubInvoker(replica.hostname, - replica.tensor_parallel_ports) - for replica in replica_configs - if replica.deployment_name == repl.deployment_name) + self.stubs[repl.deployment_name].append( + ParallelStubInvoker(repl.hostname, + repl.tensor_parallel_ports)) """ self.counter = AtomicCounter() self.task = get_task(task_name) From ea1e47e2bf2a3b03964e76e3cb309044a6e9dc87 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Mon, 10 Jul 2023 22:42:35 +0000 Subject: [PATCH 37/69] Fixing non persistent client --- mii/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/mii/client.py b/mii/client.py index a33b71ba..60c6ae44 100644 --- a/mii/client.py +++ b/mii/client.py @@ -200,7 +200,7 @@ def destroy_session(self, session_id): class MIINonPersistentClient(): def __init__(self, task, deployment_name): - self.task = task + self.task = get_task(task) self.deployment_name = deployment_name def query(self, request_dict, **query_kwargs): From 98b61290c7bc5c55251adb5d7b2834bcfebc9545 Mon Sep 17 00:00:00 2001 From: Mahesh Sinha <31616939+msinha251@users.noreply.github.com> Date: Wed, 12 Jul 2023 00:43:04 +0200 Subject: [PATCH 38/69] Adding trust_remote_code support (#203) - [Enhancement](https://github.com/microsoft/DeepSpeed-MII/issues/181) Co-authored-by: Mahesh Sinha Co-authored-by: Michael Wyatt --- mii/config.py | 1 + mii/models/providers/huggingface.py | 1 + 2 files changed, 2 insertions(+) diff --git a/mii/config.py b/mii/config.py index 6a8bac16..2714cb40 100644 --- a/mii/config.py +++ b/mii/config.py @@ -56,6 +56,7 @@ class MIIConfig(BaseModel): restful_api_port: int = 51080 replica_num: int = 1 hostfile: str = DLTS_HOSTFILE + trust_remote_code: bool = False @validator("deploy_rank") def deploy_valid(cls, field_value, values): diff --git a/mii/models/providers/huggingface.py b/mii/models/providers/huggingface.py index c04a6829..27f456aa 100644 --- a/mii/models/providers/huggingface.py +++ b/mii/models/providers/huggingface.py @@ -194,5 +194,6 @@ def hf_provider(model_path, model_name, task_name, mii_config): framework="pt", use_auth_token=mii_config.hf_auth_token, torch_dtype=mii_config.dtype, + trust_remote_code=mii_config.trust_remote_code, ) return inference_pipeline From daab5e68a9494e43c8bb70cfc17d5b9f6d77c6e7 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Wed, 12 Jul 2023 22:58:47 +0000 Subject: [PATCH 39/69] Refactoring --- ...eration-bloom560m-example.py => deploy.py} | 13 ++-- examples/multi_model/query.py | 17 +++--- examples/multi_model/shutdown.py | 2 +- mii/__init__.py | 2 +- mii/client.py | 61 ++++++------------- mii/config.py | 2 +- mii/deployment.py | 22 +++---- mii/grpc_related/modelresponse_server.py | 10 ++- mii/models/score/generate.py | 8 +-- mii/models/score/score_template.py | 2 +- 10 files changed, 57 insertions(+), 82 deletions(-) rename examples/multi_model/{text-generation-bloom560m-example.py => deploy.py} (87%) diff --git a/examples/multi_model/text-generation-bloom560m-example.py b/examples/multi_model/deploy.py similarity index 87% rename from examples/multi_model/text-generation-bloom560m-example.py rename to examples/multi_model/deploy.py index f070195e..455ed498 100644 --- a/examples/multi_model/text-generation-bloom560m-example.py +++ b/examples/multi_model/deploy.py @@ -9,11 +9,15 @@ gpu_index_map3 = {'master': [0, 1]} deployments = [] + mii_configs1 = {"tensor_parallel": 2, "dtype": "fp16"} +mii_configs2 = {"tensor_parallel": 1} + +name = "bigscience/bloom-560m" deployments.append( mii.Deployment(task='text-generation', - model="bigscience/bloom-560m", - deployment_name="bloom560m_deployment", + model=name, + deployment_name=name + "_deployment", GPU_index_map=gpu_index_map3, mii_config=mii.config.MIIConfig(**mii_configs1))) @@ -25,10 +29,7 @@ deployment_name=name + "_deployment", GPU_index_map=gpu_index_map2)) -mii_configs2 = {"tensor_parallel": 1} - name = "microsoft/DialoGPT-large" - deployments.append( mii.Deployment(task='conversational', model=name, @@ -43,4 +44,4 @@ deployment_name=name + "-qa-deployment", GPU_index_map=gpu_index_map2)) -mii.deploy(deployment_tag="first_test", 
deployments=deployments) +mii.deploy(deployment_tag="multi_models", deployments=deployments) diff --git a/examples/multi_model/query.py b/examples/multi_model/query.py index 2be15c8b..519953a1 100644 --- a/examples/multi_model/query.py +++ b/examples/multi_model/query.py @@ -4,9 +4,9 @@ # DeepSpeed Team import mii -import time -generator = mii.mii_query_handle("first_test") +results = [] +generator = mii.mii_query_handle("multi_models") result = generator.query( {"query": ["DeepSpeed is", "Seattle is"]}, @@ -14,14 +14,11 @@ do_sample=True, max_new_tokens=30, ) -print(result) +results.append(result) -time.sleep(5) result = generator.query({'query': "DeepSpeed is the greatest"}, "microsoft/DialogRPT-human-vs-rand_deployment") -print(result) - -time.sleep(5) +results.append(result) result = generator.query( { @@ -31,12 +28,12 @@ 'generated_responses': [] }, "microsoft/DialoGPT-large_deployment") -print(result) +results.append(result) -results = generator.query( +result = generator.query( { 'question': "What is the greatest?", 'context': "DeepSpeed is the greatest" }, "deepset/roberta-large-squad2" + "-qa-deployment") -print(results) +results.append(result) diff --git a/examples/multi_model/shutdown.py b/examples/multi_model/shutdown.py index 281389c4..6b718a4d 100644 --- a/examples/multi_model/shutdown.py +++ b/examples/multi_model/shutdown.py @@ -4,4 +4,4 @@ # DeepSpeed Team import mii -mii.terminate("first_test") +mii.terminate("multi_models") diff --git a/mii/__init__.py b/mii/__init__.py index b0008c06..66748a56 100644 --- a/mii/__init__.py +++ b/mii/__init__.py @@ -10,7 +10,7 @@ from .constants import DeploymentType, Tasks from .aml_related.utils import aml_output_path -from .config import MIIConfig, LoadBalancerConfig, Deployment +from .config import MIIConfig, LoadBalancerConfig, DeploymentConfig from .grpc_related.proto import modelresponse_pb2_grpc __version__ = "0.0.0" diff --git a/mii/client.py b/mii/client.py index 60c6ae44..13131cbb 100644 --- a/mii/client.py +++ b/mii/client.py @@ -12,24 +12,14 @@ from mii.method_table import GRPC_METHOD_TABLE -def _get_deployment_info(deployment_tag): +def _get_deployment_configs(deployment_tag): deployments = [] configs = mii.utils.import_score_file(deployment_tag).configs for deployment in configs: if not isinstance(configs[deployment], dict): continue deployments.append(configs[deployment]) - mii_configs_dict = configs[deployment][mii.constants.MII_CONFIGS_KEY] - mii_configs = mii.config.MIIConfig(**mii_configs_dict) return deployments - """ - task = configs[deployment_name][mii.constants.TASK_NAME_KEY] - mii_configs_dict = configs[deployment_name][mii.constants.MII_CONFIGS_KEY] - mii_configs = mii.config.MIIConfig(**mii_configs_dict) - - assert task is not None, "The task name should be set before calling init" - return task, mii_configs - """ def mii_query_handle(deployment_tag): @@ -49,7 +39,7 @@ def mii_query_handle(deployment_tag): inference_pipeline, task = mii.non_persistent_models[deployment_tag] return MIINonPersistentClient(task, deployment_tag) - deployments = _get_deployment_info(deployment_tag) + deployments = _get_deployment_configs(deployment_tag) mii_configs_dict = deployments[0][mii.constants.MII_CONFIGS_KEY] mii_configs = mii.config.MIIConfig(**mii_configs_dict) return MIIClient(deployments, "localhost", mii_configs.port_number) @@ -74,6 +64,20 @@ def __init__(self, deployments, host, port): #self.task = get_task(task_name) self.deployments = deployments + def _get_deployment_task(self, deployment_name=None): + task = None 
+ if deployment_name is None: #mii.terminate() or single model + assert len(self.deployments) == 1, "Must pass deployment_name to query when using multiple deployments" + deployment_name = self.deployments[0][mii.constants.DEPLOYMENT_NAME_KEY] + task = get_task(self.deployments[0][mii.constants.TASK_NAME_KEY]) + else: + for deployment in self.deployments: + if deployment[mii.constants.DEPLOYMENT_NAME_KEY] == deployment_name: + task = get_task(deployment[mii.constants.TASK_NAME_KEY]) + break + assert False, f"{deployment_name} not found in list of deployments" + return deployment_name, task + async def _request_async_response(self, request_dict, task, **query_kwargs): if task not in GRPC_METHOD_TABLE: raise ValueError(f"unknown task: {task}") @@ -84,16 +88,7 @@ async def _request_async_response(self, request_dict, task, **query_kwargs): return task_methods.unpack_response_from_proto(proto_response) def query(self, request_dict, deployment_name=None, **query_kwargs): - task = None - if deployment_name is None: #mii.terminate() or single model - #assert len(self.deployments) == 1, "Must pass deployment_name to query when using multiple deployments" - deployment_name = self.deployments[0]['deployment_name'] - task = get_task(self.deployments[0]['task_name']) - else: - for deployment in self.deployments: - if deployment[mii.constants.DEPLOYMENT_NAME_KEY] == deployment_name: - task = get_task(deployment[mii.constants.TASK_NAME_KEY]) - break + deployment_name, task = self._get_deployment_task(deployment_name) query_kwargs['deployment_name'] = deployment_name return self.asyncio_loop.run_until_complete( self._request_async_response(request_dict, @@ -112,15 +107,7 @@ async def create_session_async(self, session_id): modelresponse_pb2.SessionID(session_id=session_id)) def create_session(self, session_id, deployment_name=None): - task = None - if deployment_name is None: #mii.terminate() or single model - deployment_name = self.deployments[0][mii.constants.DEPLOYMENT_NAME_KEY] - task = get_task(self.deployments[0][mii.constants.TASK_NAME_KEY]) - else: - for deployment in self.deployments: - if deployment[mii.constants.DEPLOYMENT_NAME_KEY] == deployment_name: - task = get_task(deployment[mii.constants.TASK_NAME_KEY]) - break + deployment_name, task = self._get_deployment_task(deployment_name) assert task == Tasks.TEXT_GENERATION, f"Session creation only available for task '{Tasks.TEXT_GENERATION}'." return self.asyncio_loop.run_until_complete( self.create_session_async(session_id)) @@ -130,15 +117,7 @@ async def destroy_session_async(self, session_id): ) def destroy_session(self, session_id, deployment_name=None): - task = None - if deployment_name is None: #mii.terminate() or single model - deployment_name = self.deployments[0][mii.constants.DEPLOYMENT_NAME_KEY] - task = get_task(self.deployments[0][mii.constants.TASK_NAME_KEY]) - else: - for deployment in self.deployments: - if deployment[mii.constants.DEPLOYMENT_NAME_KEY] == deployment_name: - task = get_task(deployment[mii.constants.TASK_NAME_KEY]) - break + deployment_name, task = self._get_deployment_task(deployment_name) assert task == Tasks.TEXT_GENERATION, f"Session deletion only available for task '{Tasks.TEXT_GENERATION}'." 
self.asyncio_loop.run_until_complete(self.destroy_session_async(session_id)) @@ -232,7 +211,7 @@ def terminate(self): def terminate_restful_gateway(deployment_tag): - deployments = _get_deployment_info(deployment_tag) + deployments = _get_deployment_configs(deployment_tag) for deployment in deployments: mii_configs_dict = deployment[mii.constants.MII_CONFIGS_KEY] mii_configs = mii.config.MIIConfig(**mii_configs_dict) diff --git a/mii/config.py b/mii/config.py index 4eb6b597..6d7be86e 100644 --- a/mii/config.py +++ b/mii/config.py @@ -129,7 +129,7 @@ class Config: validate_assignment = True -class Deployment(BaseModel): +class DeploymentConfig(BaseModel): deployment_name: str task: str model: str diff --git a/mii/deployment.py b/mii/deployment.py index 2fe30830..dc970035 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -13,7 +13,7 @@ from .utils import logger, get_task_name, get_provider_name from .models.score import create_score_file from .models import load_models -from .config import ReplicaConfig, LoadBalancerConfig, Deployment +from .config import ReplicaConfig, LoadBalancerConfig, DeploymentConfig def deploy(task=None, @@ -71,15 +71,15 @@ def deploy(task=None, if not deployments: assert all((model, task, deployment_name)), "model, task, and deployment name must be set to deploy singular model" deployments = [ - Deployment(deployment_name=deployment_name, - task=task, - model=model, - enable_deepspeed=enable_deepspeed, - enable_zero=enable_zero, - GPU_index_map=None, - mii_config=mii.config.MIIConfig(**mii_config), - ds_config=ds_config, - version=version) + DeploymentConfig(deployment_name=deployment_name, + task=task, + model=model, + enable_deepspeed=enable_deepspeed, + enable_zero=enable_zero, + GPU_index_map=None, + mii_config=mii.config.MIIConfig(**mii_config), + ds_config=ds_config, + version=version) ] deployment_tag = deployment_name else: @@ -129,6 +129,7 @@ def deploy(task=None, # add fields for replica deployment replica_configs = [] port_map = {} + port_offset = 1 for deployment in deployments: mii_config = deployment.mii_config replica_pool = _allocate_processes(mii_config.hostfile, @@ -140,7 +141,6 @@ def deploy(task=None, # Reserver port for a LB proxy when replication is enabled if hostname not in port_map: port_map[hostname] = set() - port_offset = 1 base_port = mii_config.port_number + i * mii_config.tensor_parallel + port_offset if base_port in port_map[hostname]: base_port = max(port_map[hostname]) + 1 diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index 9fb4608f..20007b91 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -171,9 +171,11 @@ def __init__(self, replica_configs): self.stubs = {} self.counter = {} self.replica_configs = replica_configs + self.tasks = {} for repl in replica_configs: self.stubs[repl.deployment_name] = [] self.counter[repl.deployment_name] = AtomicCounter() + self.tasks[repl.deployment_name] = repl.task for repl in replica_configs: self.stubs[repl.deployment_name].append( @@ -215,11 +217,8 @@ def invoke_intercept_method(request_proto, context): assert "deployment_name" in kwargs, "Must include deployment_name in kwargs for query" deployment_name = kwargs.get('deployment_name') kwargs.pop('deployment_name', None) - task = None - for repl in self.replica_configs: - if repl.deployment_name == deployment_name: - task = repl.task - break + task = self.tasks[deployment_name] + assert task is not None, f"task for {deployment_name} not 
found" method = GRPC_METHOD_TABLE[get_task(task)] new_request = None if method_name == "ConversationalReply": @@ -271,7 +270,6 @@ def invoke_intercept_method(request_proto, context): raise ValueError(f"session not found") replica_index = self.replica_sessions[session_id] - assert new_request is not None, "test" ret = self.stubs[deployment_name][replica_index].invoke( method_name, new_request) diff --git a/mii/models/score/generate.py b/mii/models/score/generate.py index ecd15ffe..7a3ae021 100644 --- a/mii/models/score/generate.py +++ b/mii/models/score/generate.py @@ -38,10 +38,10 @@ def create_score_file(deployment_tag, if lb_config is not None: config_dict[mii.constants.LOAD_BALANCER_CONFIG_KEY] = lb_config - if len(mii.__path__) > 1: - logger.warning( - f"Detected mii path as multiple sources: {mii.__path__}, might cause unknown behavior" - ) + if len(mii.__path__) > 1: + logger.warning( + f"Detected mii path as multiple sources: {mii.__path__}, might cause unknown behavior" + ) with open(os.path.join(mii.__path__[0], "models/score/score_template.py"), diff --git a/mii/models/score/score_template.py b/mii/models/score/score_template.py index 94eb6ca8..83b46de3 100644 --- a/mii/models/score/score_template.py +++ b/mii/models/score/score_template.py @@ -32,7 +32,7 @@ def init(): 'ds_config': deployment[mii.constants.DEEPSPEED_CONFIG_KEY], 'version': 1 } - deployments.append(mii.Deployment.parse_obj(data)) + deployments.append(mii.DeploymentConfig.parse_obj(data)) """ deployment_name = configs[mii.constants.DEPLOYMENT_NAME_KEY] model_name = configs[mii.constants.MODEL_NAME_KEY] From 84073f9f60086c1f335cbc983bbcf3d0c3c29f87 Mon Sep 17 00:00:00 2001 From: TosinSeg <90005810+TosinSeg@users.noreply.github.com> Date: Wed, 12 Jul 2023 15:59:43 -0700 Subject: [PATCH 40/69] Update mii/models/score/generate.py Co-authored-by: Michael Wyatt --- mii/models/score/generate.py | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/mii/models/score/generate.py b/mii/models/score/generate.py index 7a3ae021..dc73fdb9 100644 --- a/mii/models/score/generate.py +++ b/mii/models/score/generate.py @@ -19,21 +19,16 @@ def create_score_file(deployment_tag, config_dict[mii.constants.MODEL_PATH_KEY] = model_path config_dict[mii.constants.DEPLOYMENT_TAG_KEY] = deployment_tag for deployment in deployments: - config_dict[deployment.deployment_name] = {} - config_dict[deployment.deployment_name][ - mii.constants.DEPLOYMENT_NAME_KEY] = deployment.deployment_name - config_dict[deployment.deployment_name][ - mii.constants.TASK_NAME_KEY] = mii.utils.get_task_name(deployment.task) - config_dict[deployment.deployment_name][ - mii.constants.MODEL_NAME_KEY] = deployment.model - config_dict[deployment.deployment_name][ - mii.constants.ENABLE_DEEPSPEED_KEY] = deployment.enable_deepspeed - config_dict[deployment.deployment_name][ - mii.constants.MII_CONFIGS_KEY] = deployment.mii_config.dict() - config_dict[deployment.deployment_name][ - mii.constants.ENABLE_DEEPSPEED_ZERO_KEY] = deployment.enable_zero - config_dict[deployment.deployment_name][ - mii.constants.DEEPSPEED_CONFIG_KEY] = deployment.ds_config + deployment_config = { + mii.constants.DEPLOYMENT_NAME_KEY: deployment.deployment_name, + mii.constants.TASK_NAME_KEY: mii.utils.get_task_name(deployment.task), + mii.constants.MODEL_NAME_KEY: deployment.model, + mii.constants.ENABLE_DEEPSPEED_KEY: deployment.enable_deepspeed, + mii.constants.MII_CONFIGS_KEY: deployment.mii_config.dict(), + mii.constants.ENABLE_DEEPSPEED_ZERO_KEY: 
deployment.enable_zero, + mii.constants.DEEPSPEED_CONFIG_KEY: deployment.ds_config, + } + config_dict[deployment.deployment_name] = deployment_config if lb_config is not None: config_dict[mii.constants.LOAD_BALANCER_CONFIG_KEY] = lb_config From b4edc2bd8617fee80286b00e1deb223533837db3 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Thu, 13 Jul 2023 23:29:32 +0000 Subject: [PATCH 41/69] Refactoring Load Balancer and request_proto --- examples/multi_model/deploy.py | 8 +- examples/multi_model/query.py | 21 +- mii/client.py | 10 +- mii/grpc_related/modelresponse_server.py | 16 +- mii/grpc_related/proto/modelresponse.proto | 8 + mii/grpc_related/proto/modelresponse_pb2.py | 97 ++-- .../proto/modelresponse_pb2_grpc.py | 520 +++++++----------- mii/method_table.py | 12 +- 8 files changed, 301 insertions(+), 391 deletions(-) diff --git a/examples/multi_model/deploy.py b/examples/multi_model/deploy.py index 455ed498..03d2a3a0 100644 --- a/examples/multi_model/deploy.py +++ b/examples/multi_model/deploy.py @@ -15,7 +15,7 @@ name = "bigscience/bloom-560m" deployments.append( - mii.Deployment(task='text-generation', + mii.DeploymentConfig(task='text-generation', model=name, deployment_name=name + "_deployment", GPU_index_map=gpu_index_map3, @@ -24,14 +24,14 @@ # gpt2 name = "microsoft/DialogRPT-human-vs-rand" deployments.append( - mii.Deployment(task='text-classification', + mii.DeploymentConfig(task='text-classification', model=name, deployment_name=name + "_deployment", GPU_index_map=gpu_index_map2)) name = "microsoft/DialoGPT-large" deployments.append( - mii.Deployment(task='conversational', + mii.DeploymentConfig(task='conversational', model=name, deployment_name=name + "_deployment", GPU_index_map=gpu_index_map1, @@ -39,7 +39,7 @@ name = "deepset/roberta-large-squad2" deployments.append( - mii.Deployment(task="question-answering", + mii.DeploymentConfig(task="question-answering", model=name, deployment_name=name + "-qa-deployment", GPU_index_map=gpu_index_map2)) diff --git a/examples/multi_model/query.py b/examples/multi_model/query.py index 519953a1..caf85934 100644 --- a/examples/multi_model/query.py +++ b/examples/multi_model/query.py @@ -9,15 +9,16 @@ generator = mii.mii_query_handle("multi_models") result = generator.query( {"query": ["DeepSpeed is", - "Seattle is"]}, - "bloom560m_deployment", + "Seattle is"], + + "deployment_name": "bigscience/bloom-560m_deployment" + }, do_sample=True, max_new_tokens=30, ) results.append(result) -result = generator.query({'query': "DeepSpeed is the greatest"}, - "microsoft/DialogRPT-human-vs-rand_deployment") +result = generator.query({'query': "DeepSpeed is the greatest", "deployment_name": "microsoft/DialogRPT-human-vs-rand_deployment"}) results.append(result) result = generator.query( @@ -25,15 +26,15 @@ 'text': "DeepSpeed is the greatest", 'conversation_id': 3, 'past_user_inputs': [], - 'generated_responses': [] - }, - "microsoft/DialoGPT-large_deployment") + 'generated_responses': [], + "deployment_name": "microsoft/DialoGPT-large_deployment" + }) results.append(result) result = generator.query( { 'question': "What is the greatest?", - 'context': "DeepSpeed is the greatest" - }, - "deepset/roberta-large-squad2" + "-qa-deployment") + 'context': "DeepSpeed is the greatest", + "deployment_name": "deepset/roberta-large-squad2" + "-qa-deployment" + }) results.append(result) diff --git a/mii/client.py b/mii/client.py index 13131cbb..d71dce89 100644 --- a/mii/client.py +++ b/mii/client.py @@ -74,7 +74,7 @@ def _get_deployment_task(self, 
deployment_name=None):
             for deployment in self.deployments:
                 if deployment[mii.constants.DEPLOYMENT_NAME_KEY] == deployment_name:
                     task = get_task(deployment[mii.constants.TASK_NAME_KEY])
-                    break
+                    return deployment_name, task
             assert False, f"{deployment_name} not found in list of deployments"
         return deployment_name, task
 
@@ -87,9 +87,9 @@ async def _request_async_response(self, request_dict, task, **query_kwargs):
         proto_response = await getattr(self.stub, task_methods.method)(proto_request)
         return task_methods.unpack_response_from_proto(proto_response)
 
-    def query(self, request_dict, deployment_name=None, **query_kwargs):
+    def query(self, request_dict, **query_kwargs):
+        deployment_name = request_dict.get('deployment_name')
         deployment_name, task = self._get_deployment_task(deployment_name)
-        query_kwargs['deployment_name'] = deployment_name
         return self.asyncio_loop.run_until_complete(
             self._request_async_response(request_dict,
                                          task,
@@ -107,6 +107,8 @@ async def create_session_async(self, session_id):
             modelresponse_pb2.SessionID(session_id=session_id))
 
     def create_session(self, session_id, deployment_name=None):
+        if len(self.deployments) > 1:
+            assert deployment_name is not None, "Deployment name must be passed in to create session when there are multiple models"
         deployment_name, task = self._get_deployment_task(deployment_name)
         assert task == Tasks.TEXT_GENERATION, f"Session creation only available for task '{Tasks.TEXT_GENERATION}'."
         return self.asyncio_loop.run_until_complete(
@@ -117,6 +119,8 @@ async def destroy_session_async(self, session_id):
                                        )
 
     def destroy_session(self, session_id, deployment_name=None):
+        if len(self.deployments) > 1:
+            assert deployment_name is not None, "Deployment name must be passed in to destroy session when there are multiple models"
         deployment_name, task = self._get_deployment_task(deployment_name)
         assert task == Tasks.TEXT_GENERATION, f"Session deletion only available for task '{Tasks.TEXT_GENERATION}'."
self.asyncio_loop.run_until_complete(self.destroy_session_async(session_id)) diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index 20007b91..6f123a7f 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -212,6 +212,8 @@ def invoke_intercept_method(request_proto, context): google_dot_protobuf_dot_empty__pb2.Empty()) self.asyncio_loop.call_soon_threadsafe(self.asyncio_loop.stop) return next_handler.unary_unary(request_proto, context) + deployment_name = getattr(request_proto, 'deployment_name') + """ kwargs = unpack_proto_query_kwargs(request_proto.query_kwargs) if method_name != TERMINATE_METHOD: assert "deployment_name" in kwargs, "Must include deployment_name in kwargs for query" @@ -243,7 +245,7 @@ def invoke_intercept_method(request_proto, context): ) if method_name == "GeneratorReply" or method_name == "Txt2ImgReply" else str( request_proto.request) new_request = method.pack_request_to_proto(request_dict, **kwargs) - + """ call_count = self.counter[deployment_name].get_and_increment() replica_index = call_count % len(self.stubs[deployment_name]) @@ -254,25 +256,25 @@ def invoke_intercept_method(request_proto, context): self.replica_sessions[request_proto.session_id] = replica_index self.stubs[deployment_name][replica_index].invoke( CREATE_SESSION_METHOD, - new_request) + request_proto) return google_dot_protobuf_dot_empty__pb2.Empty() if method_name == DESTROY_SESSION_METHOD: replica_index = self.replica_sessions.pop(request_proto.session_id) self.stubs[deployment_name][replica_index].invoke( DESTROY_SESSION_METHOD, - new_request) + request_proto) return google_dot_protobuf_dot_empty__pb2.Empty() - - if "session_id" in kwargs: - session_id = kwargs["session_id"] + + if "session_id" in request_proto.query_kwargs: + session_id = request_proto.query_kwargs["session_id"] if session_id not in self.replica_sessions: raise ValueError(f"session not found") replica_index = self.replica_sessions[session_id] ret = self.stubs[deployment_name][replica_index].invoke( method_name, - new_request) + request_proto) return ret return grpc.unary_unary_rpc_method_handler( diff --git a/mii/grpc_related/proto/modelresponse.proto b/mii/grpc_related/proto/modelresponse.proto index a0698899..ce55522b 100644 --- a/mii/grpc_related/proto/modelresponse.proto +++ b/mii/grpc_related/proto/modelresponse.proto @@ -52,29 +52,34 @@ message SessionID { message SingleStringRequest { string request = 1; map query_kwargs = 2; + optional string deployment_name = 3; } message MultiStringRequest { repeated string request = 1; map query_kwargs = 2; + optional string deployment_name = 3; } message SingleStringReply { string response = 1; float time_taken = 2; float model_time_taken = 3; + optional string deployment_name = 4; } message MultiStringReply { repeated string response = 1; float time_taken = 2; float model_time_taken = 3; + optional string deployment_name = 4; } message QARequest { string question = 1; string context = 2; map query_kwargs = 3; + optional string deployment_name = 4; } message ConversationRequest { @@ -83,6 +88,7 @@ message ConversationRequest { repeated string past_user_inputs = 3; repeated string generated_responses = 4; map query_kwargs = 5; + optional string deployment_name = 6; } message ConversationReply { @@ -91,6 +97,7 @@ message ConversationReply { repeated string generated_responses = 3; float time_taken = 4; float model_time_taken = 5; + optional string deployment_name = 6; } message ImageReply 
{ @@ -100,4 +107,5 @@ message ImageReply { int64 size_w = 4; int64 size_h = 5; float time_taken = 6; + optional string deployment_name = 7; } diff --git a/mii/grpc_related/proto/modelresponse_pb2.py b/mii/grpc_related/proto/modelresponse_pb2.py index 76b1f994..53305ca5 100644 --- a/mii/grpc_related/proto/modelresponse_pb2.py +++ b/mii/grpc_related/proto/modelresponse_pb2.py @@ -1,66 +1,63 @@ -# Copyright (c) Microsoft Corporation. -# SPDX-License-Identifier: Apache-2.0 - -# DeepSpeed Team - +# -*- coding: utf-8 -*- # Generated by the protocol buffer compiler. DO NOT EDIT! # source: modelresponse.proto """Generated protocol buffer code.""" -from google.protobuf.internal import builder as _builder from google.protobuf import descriptor as _descriptor from google.protobuf import descriptor_pool as _descriptor_pool from google.protobuf import symbol_database as _symbol_database +from google.protobuf.internal import builder as _builder # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() + from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x42\x0e\n\x0coneof_values\"\x1f\n\tSessionID\x12\x12\n\nsession_id\x18\x01 \x01(\t\"\xbb\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\"\xb9\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\"S\n\x11SingleStringReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\"R\n\x10MultiStringReply\x12\x10\n\x08response\x18\x01 \x03(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\"\xb9\x01\n\tQARequest\x12\x10\n\x08question\x18\x01 \x01(\t\x12\x0f\n\x07\x63ontext\x18\x02 \x01(\t\x12?\n\x0cquery_kwargs\x18\x03 \x03(\x0b\x32).modelresponse.QARequest.QueryKwargsEntry\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\"\xa1\x02\n\x13\x43onversationRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x1c\n\x0f\x63onversation_id\x18\x02 \x01(\x03H\x00\x88\x01\x01\x12\x18\n\x10past_user_inputs\x18\x03 \x03(\t\x12\x1b\n\x13generated_responses\x18\x04 \x03(\t\x12I\n\x0cquery_kwargs\x18\x05 \x03(\x0b\x32\x33.modelresponse.ConversationRequest.QueryKwargsEntry\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_conversation_id\"\x91\x01\n\x11\x43onversationReply\x12\x17\n\x0f\x63onversation_id\x18\x01 \x01(\x03\x12\x18\n\x10past_user_inputs\x18\x02 \x03(\t\x12\x1b\n\x13generated_responses\x18\x03 \x03(\t\x12\x12\n\ntime_taken\x18\x04 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x05 
\x01(\x02\"}\n\nImageReply\x12\x0e\n\x06images\x18\x01 \x03(\x0c\x12\x1d\n\x15nsfw_content_detected\x18\x02 \x03(\x08\x12\x0c\n\x04mode\x18\x03 \x01(\t\x12\x0e\n\x06size_w\x18\x04 \x01(\x03\x12\x0e\n\x06size_h\x18\x05 \x01(\x03\x12\x12\n\ntime_taken\x18\x06 \x01(\x02\x32\xd4\x06\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x62\x06proto3' -) -_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) -_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'modelresponse_pb2', globals()) +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x42\x0e\n\x0coneof_values\"\x1f\n\tSessionID\x12\x12\n\nsession_id\x18\x01 \x01(\t\"\xed\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\x85\x01\n\x11SingleStringReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x84\x01\n\x10MultiStringReply\x12\x10\n\x08response\x18\x01 \x03(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\tQARequest\x12\x10\n\x08question\x18\x01 \x01(\t\x12\x0f\n\x07\x63ontext\x18\x02 \x01(\t\x12?\n\x0cquery_kwargs\x18\x03 \x03(\x0b\x32).modelresponse.QARequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 
\x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xd3\x02\n\x13\x43onversationRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x1c\n\x0f\x63onversation_id\x18\x02 \x01(\x03H\x00\x88\x01\x01\x12\x18\n\x10past_user_inputs\x18\x03 \x03(\t\x12\x1b\n\x13generated_responses\x18\x04 \x03(\t\x12I\n\x0cquery_kwargs\x18\x05 \x03(\x0b\x32\x33.modelresponse.ConversationRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x01\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_conversation_idB\x12\n\x10_deployment_name\"\xc3\x01\n\x11\x43onversationReply\x12\x17\n\x0f\x63onversation_id\x18\x01 \x01(\x03\x12\x18\n\x10past_user_inputs\x18\x02 \x03(\t\x12\x1b\n\x13generated_responses\x18\x03 \x03(\t\x12\x12\n\ntime_taken\x18\x04 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x05 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xaf\x01\n\nImageReply\x12\x0e\n\x06images\x18\x01 \x03(\x0c\x12\x1d\n\x15nsfw_content_detected\x18\x02 \x03(\x08\x12\x0c\n\x04mode\x18\x03 \x01(\t\x12\x0e\n\x06size_w\x18\x04 \x01(\x03\x12\x0e\n\x06size_h\x18\x05 \x01(\x03\x12\x12\n\ntime_taken\x18\x06 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x07 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name2\xd4\x06\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x62\x06proto3') + +_globals = globals() +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'modelresponse_pb2', _globals) if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None - _SINGLESTRINGREQUEST_QUERYKWARGSENTRY._options = None - _SINGLESTRINGREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' - _MULTISTRINGREQUEST_QUERYKWARGSENTRY._options = None - _MULTISTRINGREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' - _QAREQUEST_QUERYKWARGSENTRY._options = None - _QAREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' - _CONVERSATIONREQUEST_QUERYKWARGSENTRY._options = None - _CONVERSATIONREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' - _VALUE._serialized_start = 67 - _VALUE._serialized_end = 162 - _SESSIONID._serialized_start = 164 - _SESSIONID._serialized_end = 195 - _SINGLESTRINGREQUEST._serialized_start = 198 - 
_SINGLESTRINGREQUEST._serialized_end = 385 - _SINGLESTRINGREQUEST_QUERYKWARGSENTRY._serialized_start = 313 - _SINGLESTRINGREQUEST_QUERYKWARGSENTRY._serialized_end = 385 - _MULTISTRINGREQUEST._serialized_start = 388 - _MULTISTRINGREQUEST._serialized_end = 573 - _MULTISTRINGREQUEST_QUERYKWARGSENTRY._serialized_start = 313 - _MULTISTRINGREQUEST_QUERYKWARGSENTRY._serialized_end = 385 - _SINGLESTRINGREPLY._serialized_start = 575 - _SINGLESTRINGREPLY._serialized_end = 658 - _MULTISTRINGREPLY._serialized_start = 660 - _MULTISTRINGREPLY._serialized_end = 742 - _QAREQUEST._serialized_start = 745 - _QAREQUEST._serialized_end = 930 - _QAREQUEST_QUERYKWARGSENTRY._serialized_start = 313 - _QAREQUEST_QUERYKWARGSENTRY._serialized_end = 385 - _CONVERSATIONREQUEST._serialized_start = 933 - _CONVERSATIONREQUEST._serialized_end = 1222 - _CONVERSATIONREQUEST_QUERYKWARGSENTRY._serialized_start = 313 - _CONVERSATIONREQUEST_QUERYKWARGSENTRY._serialized_end = 385 - _CONVERSATIONREPLY._serialized_start = 1225 - _CONVERSATIONREPLY._serialized_end = 1370 - _IMAGEREPLY._serialized_start = 1372 - _IMAGEREPLY._serialized_end = 1497 - _MODELRESPONSE._serialized_start = 1500 - _MODELRESPONSE._serialized_end = 2352 + DESCRIPTOR._options = None + _SINGLESTRINGREQUEST_QUERYKWARGSENTRY._options = None + _SINGLESTRINGREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' + _MULTISTRINGREQUEST_QUERYKWARGSENTRY._options = None + _MULTISTRINGREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' + _QAREQUEST_QUERYKWARGSENTRY._options = None + _QAREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' + _CONVERSATIONREQUEST_QUERYKWARGSENTRY._options = None + _CONVERSATIONREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' + _globals['_VALUE']._serialized_start=67 + _globals['_VALUE']._serialized_end=162 + _globals['_SESSIONID']._serialized_start=164 + _globals['_SESSIONID']._serialized_end=195 + _globals['_SINGLESTRINGREQUEST']._serialized_start=198 + _globals['_SINGLESTRINGREQUEST']._serialized_end=435 + _globals['_SINGLESTRINGREQUEST_QUERYKWARGSENTRY']._serialized_start=343 + _globals['_SINGLESTRINGREQUEST_QUERYKWARGSENTRY']._serialized_end=415 + _globals['_MULTISTRINGREQUEST']._serialized_start=438 + _globals['_MULTISTRINGREQUEST']._serialized_end=673 + _globals['_MULTISTRINGREQUEST_QUERYKWARGSENTRY']._serialized_start=343 + _globals['_MULTISTRINGREQUEST_QUERYKWARGSENTRY']._serialized_end=415 + _globals['_SINGLESTRINGREPLY']._serialized_start=676 + _globals['_SINGLESTRINGREPLY']._serialized_end=809 + _globals['_MULTISTRINGREPLY']._serialized_start=812 + _globals['_MULTISTRINGREPLY']._serialized_end=944 + _globals['_QAREQUEST']._serialized_start=947 + _globals['_QAREQUEST']._serialized_end=1182 + _globals['_QAREQUEST_QUERYKWARGSENTRY']._serialized_start=343 + _globals['_QAREQUEST_QUERYKWARGSENTRY']._serialized_end=415 + _globals['_CONVERSATIONREQUEST']._serialized_start=1185 + _globals['_CONVERSATIONREQUEST']._serialized_end=1524 + _globals['_CONVERSATIONREQUEST_QUERYKWARGSENTRY']._serialized_start=343 + _globals['_CONVERSATIONREQUEST_QUERYKWARGSENTRY']._serialized_end=415 + _globals['_CONVERSATIONREPLY']._serialized_start=1527 + _globals['_CONVERSATIONREPLY']._serialized_end=1722 + _globals['_IMAGEREPLY']._serialized_start=1725 + _globals['_IMAGEREPLY']._serialized_end=1900 + _globals['_MODELRESPONSE']._serialized_start=1903 + _globals['_MODELRESPONSE']._serialized_end=2755 # @@protoc_insertion_point(module_scope) diff --git a/mii/grpc_related/proto/modelresponse_pb2_grpc.py 
b/mii/grpc_related/proto/modelresponse_pb2_grpc.py index 95cfa825..683e4962 100644 --- a/mii/grpc_related/proto/modelresponse_pb2_grpc.py +++ b/mii/grpc_related/proto/modelresponse_pb2_grpc.py @@ -1,8 +1,3 @@ -# Copyright (c) Microsoft Corporation. -# SPDX-License-Identifier: Apache-2.0 - -# DeepSpeed Team - # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! """Client and server classes corresponding to protobuf-defined services.""" import grpc @@ -13,6 +8,7 @@ class ModelResponseStub(object): """Missing associated documentation comment in .proto file.""" + def __init__(self, channel): """Constructor. @@ -20,60 +16,60 @@ def __init__(self, channel): channel: A grpc.Channel. """ self.Terminate = channel.unary_unary( - '/modelresponse.ModelResponse/Terminate', - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty. - SerializeToString, - response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - ) + '/modelresponse.ModelResponse/Terminate', + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) self.CreateSession = channel.unary_unary( - '/modelresponse.ModelResponse/CreateSession', - request_serializer=modelresponse__pb2.SessionID.SerializeToString, - response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - ) + '/modelresponse.ModelResponse/CreateSession', + request_serializer=modelresponse__pb2.SessionID.SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) self.DestroySession = channel.unary_unary( - '/modelresponse.ModelResponse/DestroySession', - request_serializer=modelresponse__pb2.SessionID.SerializeToString, - response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - ) + '/modelresponse.ModelResponse/DestroySession', + request_serializer=modelresponse__pb2.SessionID.SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) self.GeneratorReply = channel.unary_unary( - '/modelresponse.ModelResponse/GeneratorReply', - request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.MultiStringReply.FromString, - ) + '/modelresponse.ModelResponse/GeneratorReply', + request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.MultiStringReply.FromString, + ) self.ClassificationReply = channel.unary_unary( - '/modelresponse.ModelResponse/ClassificationReply', - request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) + '/modelresponse.ModelResponse/ClassificationReply', + request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) self.QuestionAndAnswerReply = channel.unary_unary( - '/modelresponse.ModelResponse/QuestionAndAnswerReply', - request_serializer=modelresponse__pb2.QARequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) + '/modelresponse.ModelResponse/QuestionAndAnswerReply', + request_serializer=modelresponse__pb2.QARequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) self.FillMaskReply = channel.unary_unary( - '/modelresponse.ModelResponse/FillMaskReply', - 
request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) + '/modelresponse.ModelResponse/FillMaskReply', + request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) self.TokenClassificationReply = channel.unary_unary( - '/modelresponse.ModelResponse/TokenClassificationReply', - request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) + '/modelresponse.ModelResponse/TokenClassificationReply', + request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) self.ConversationalReply = channel.unary_unary( - '/modelresponse.ModelResponse/ConversationalReply', - request_serializer=modelresponse__pb2.ConversationRequest.SerializeToString, - response_deserializer=modelresponse__pb2.ConversationReply.FromString, - ) + '/modelresponse.ModelResponse/ConversationalReply', + request_serializer=modelresponse__pb2.ConversationRequest.SerializeToString, + response_deserializer=modelresponse__pb2.ConversationReply.FromString, + ) self.Txt2ImgReply = channel.unary_unary( - '/modelresponse.ModelResponse/Txt2ImgReply', - request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.ImageReply.FromString, - ) + '/modelresponse.ModelResponse/Txt2ImgReply', + request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.ImageReply.FromString, + ) class ModelResponseServicer(object): """Missing associated documentation comment in .proto file.""" + def Terminate(self, request, context): """Missing associated documentation comment in .proto file.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) @@ -137,334 +133,232 @@ def Txt2ImgReply(self, request, context): def add_ModelResponseServicer_to_server(servicer, server): rpc_method_handlers = { - 'Terminate': - grpc.unary_unary_rpc_method_handler( - servicer.Terminate, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. - SerializeToString, - ), - 'CreateSession': - grpc.unary_unary_rpc_method_handler( - servicer.CreateSession, - request_deserializer=modelresponse__pb2.SessionID.FromString, - response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. - SerializeToString, - ), - 'DestroySession': - grpc.unary_unary_rpc_method_handler( - servicer.DestroySession, - request_deserializer=modelresponse__pb2.SessionID.FromString, - response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. 
- SerializeToString, - ), - 'GeneratorReply': - grpc.unary_unary_rpc_method_handler( - servicer.GeneratorReply, - request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, - response_serializer=modelresponse__pb2.MultiStringReply.SerializeToString, - ), - 'ClassificationReply': - grpc.unary_unary_rpc_method_handler( - servicer.ClassificationReply, - request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'QuestionAndAnswerReply': - grpc.unary_unary_rpc_method_handler( - servicer.QuestionAndAnswerReply, - request_deserializer=modelresponse__pb2.QARequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'FillMaskReply': - grpc.unary_unary_rpc_method_handler( - servicer.FillMaskReply, - request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'TokenClassificationReply': - grpc.unary_unary_rpc_method_handler( - servicer.TokenClassificationReply, - request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'ConversationalReply': - grpc.unary_unary_rpc_method_handler( - servicer.ConversationalReply, - request_deserializer=modelresponse__pb2.ConversationRequest.FromString, - response_serializer=modelresponse__pb2.ConversationReply.SerializeToString, - ), - 'Txt2ImgReply': - grpc.unary_unary_rpc_method_handler( - servicer.Txt2ImgReply, - request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, - response_serializer=modelresponse__pb2.ImageReply.SerializeToString, - ), + 'Terminate': grpc.unary_unary_rpc_method_handler( + servicer.Terminate, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + ), + 'CreateSession': grpc.unary_unary_rpc_method_handler( + servicer.CreateSession, + request_deserializer=modelresponse__pb2.SessionID.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + ), + 'DestroySession': grpc.unary_unary_rpc_method_handler( + servicer.DestroySession, + request_deserializer=modelresponse__pb2.SessionID.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + ), + 'GeneratorReply': grpc.unary_unary_rpc_method_handler( + servicer.GeneratorReply, + request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, + response_serializer=modelresponse__pb2.MultiStringReply.SerializeToString, + ), + 'ClassificationReply': grpc.unary_unary_rpc_method_handler( + servicer.ClassificationReply, + request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'QuestionAndAnswerReply': grpc.unary_unary_rpc_method_handler( + servicer.QuestionAndAnswerReply, + request_deserializer=modelresponse__pb2.QARequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'FillMaskReply': grpc.unary_unary_rpc_method_handler( + servicer.FillMaskReply, + request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'TokenClassificationReply': grpc.unary_unary_rpc_method_handler( + servicer.TokenClassificationReply, + 
request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'ConversationalReply': grpc.unary_unary_rpc_method_handler( + servicer.ConversationalReply, + request_deserializer=modelresponse__pb2.ConversationRequest.FromString, + response_serializer=modelresponse__pb2.ConversationReply.SerializeToString, + ), + 'Txt2ImgReply': grpc.unary_unary_rpc_method_handler( + servicer.Txt2ImgReply, + request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, + response_serializer=modelresponse__pb2.ImageReply.SerializeToString, + ), } - generic_handler = grpc.method_handlers_generic_handler('modelresponse.ModelResponse', - rpc_method_handlers) - server.add_generic_rpc_handlers((generic_handler, )) + generic_handler = grpc.method_handlers_generic_handler( + 'modelresponse.ModelResponse', rpc_method_handlers) + server.add_generic_rpc_handlers((generic_handler,)) -# This class is part of an EXPERIMENTAL API. + # This class is part of an EXPERIMENTAL API. class ModelResponse(object): """Missing associated documentation comment in .proto file.""" + @staticmethod def Terminate(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, target, - '/modelresponse.ModelResponse/Terminate', + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/Terminate', google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, google_dot_protobuf_dot_empty__pb2.Empty.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod def CreateSession(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, target, - '/modelresponse.ModelResponse/CreateSession', + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/CreateSession', modelresponse__pb2.SessionID.SerializeToString, google_dot_protobuf_dot_empty__pb2.Empty.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod def DestroySession(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, target, - '/modelresponse.ModelResponse/DestroySession', + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, 
'/modelresponse.ModelResponse/DestroySession', modelresponse__pb2.SessionID.SerializeToString, google_dot_protobuf_dot_empty__pb2.Empty.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod def GeneratorReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, target, - '/modelresponse.ModelResponse/GeneratorReply', + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/GeneratorReply', modelresponse__pb2.MultiStringRequest.SerializeToString, modelresponse__pb2.MultiStringReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod def ClassificationReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, target, - '/modelresponse.ModelResponse/ClassificationReply', + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/ClassificationReply', modelresponse__pb2.SingleStringRequest.SerializeToString, modelresponse__pb2.SingleStringReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod def QuestionAndAnswerReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, target, - '/modelresponse.ModelResponse/QuestionAndAnswerReply', + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/QuestionAndAnswerReply', modelresponse__pb2.QARequest.SerializeToString, modelresponse__pb2.SingleStringReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod def FillMaskReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, target, - '/modelresponse.ModelResponse/FillMaskReply', + options=(), + channel_credentials=None, + 
call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/FillMaskReply', modelresponse__pb2.SingleStringRequest.SerializeToString, modelresponse__pb2.SingleStringReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod def TokenClassificationReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, target, - '/modelresponse.ModelResponse/TokenClassificationReply', + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/TokenClassificationReply', modelresponse__pb2.SingleStringRequest.SerializeToString, modelresponse__pb2.SingleStringReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod def ConversationalReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, target, - '/modelresponse.ModelResponse/ConversationalReply', + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/ConversationalReply', modelresponse__pb2.ConversationRequest.SerializeToString, modelresponse__pb2.ConversationReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod def Txt2ImgReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, target, - '/modelresponse.ModelResponse/Txt2ImgReply', + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/Txt2ImgReply', modelresponse__pb2.MultiStringRequest.SerializeToString, modelresponse__pb2.ImageReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) diff --git a/mii/method_table.py b/mii/method_table.py index c412f446..f7f87d28 100644 --- a/mii/method_table.py +++ b/mii/method_table.py @@ -13,7 +13,8 @@ def 
single_string_request_to_proto(self, request_dict, **query_kwargs): return modelresponse_pb2.SingleStringRequest( request=request_dict['query'], - query_kwargs=kwarg_dict_to_proto(query_kwargs)) + query_kwargs=kwarg_dict_to_proto(query_kwargs), + deployment_name=request_dict.get('deployment_name')) def single_string_response_to_proto(self, response, time_taken, model_time_taken): @@ -26,7 +27,8 @@ def multi_string_request_to_proto(self, request_dict, **query_kwargs): return modelresponse_pb2.MultiStringRequest( request=request_dict['query'] if isinstance(request_dict['query'], list) else [request_dict['query']], - query_kwargs=kwarg_dict_to_proto(query_kwargs)) + query_kwargs=kwarg_dict_to_proto(query_kwargs), + deployment_name=request_dict.get('deployment_name')) def proto_request_to_single_input(self, request): @@ -143,7 +145,8 @@ def pack_request_to_proto(self, request_dict, **query_kwargs): return modelresponse_pb2.QARequest( question=request_dict['question'], context=request_dict['context'], - query_kwargs=kwarg_dict_to_proto(query_kwargs)) + query_kwargs=kwarg_dict_to_proto(query_kwargs), + deployment_name=request_dict.get('deployment_name')) def unpack_request_from_proto(self, request): kwargs = unpack_proto_query_kwargs(request.query_kwargs) @@ -222,7 +225,8 @@ def pack_request_to_proto(self, request_dict, **query_kwargs): if 'conversation_id' in request_dict else None, past_user_inputs=request_dict['past_user_inputs'], generated_responses=request_dict['generated_responses'], - query_kwargs=kwarg_dict_to_proto(query_kwargs)) + query_kwargs=kwarg_dict_to_proto(query_kwargs), + deployment_name=request_dict.get('deployment_name')) class Text2ImgMethods(TaskMethods): From 63461949e0e44be45958dba53da9f24f7983e793 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Thu, 13 Jul 2023 23:37:38 +0000 Subject: [PATCH 42/69] Formatting --- examples/multi_model/deploy.py | 28 +- examples/multi_model/query.py | 44 +- mii/grpc_related/modelresponse_server.py | 3 +- mii/grpc_related/proto/modelresponse_pb2.py | 90 +-- .../proto/modelresponse_pb2_grpc.py | 520 +++++++++++------- 5 files changed, 400 insertions(+), 285 deletions(-) diff --git a/examples/multi_model/deploy.py b/examples/multi_model/deploy.py index 03d2a3a0..f0408da7 100644 --- a/examples/multi_model/deploy.py +++ b/examples/multi_model/deploy.py @@ -16,32 +16,32 @@ name = "bigscience/bloom-560m" deployments.append( mii.DeploymentConfig(task='text-generation', - model=name, - deployment_name=name + "_deployment", - GPU_index_map=gpu_index_map3, - mii_config=mii.config.MIIConfig(**mii_configs1))) + model=name, + deployment_name=name + "_deployment", + GPU_index_map=gpu_index_map3, + mii_config=mii.config.MIIConfig(**mii_configs1))) # gpt2 name = "microsoft/DialogRPT-human-vs-rand" deployments.append( mii.DeploymentConfig(task='text-classification', - model=name, - deployment_name=name + "_deployment", - GPU_index_map=gpu_index_map2)) + model=name, + deployment_name=name + "_deployment", + GPU_index_map=gpu_index_map2)) name = "microsoft/DialoGPT-large" deployments.append( mii.DeploymentConfig(task='conversational', - model=name, - deployment_name=name + "_deployment", - GPU_index_map=gpu_index_map1, - mii_config=mii.config.MIIConfig(**mii_configs2))) + model=name, + deployment_name=name + "_deployment", + GPU_index_map=gpu_index_map1, + mii_config=mii.config.MIIConfig(**mii_configs2))) name = "deepset/roberta-large-squad2" deployments.append( mii.DeploymentConfig(task="question-answering", - model=name, - deployment_name=name + 
"-qa-deployment", - GPU_index_map=gpu_index_map2)) + model=name, + deployment_name=name + "-qa-deployment", + GPU_index_map=gpu_index_map2)) mii.deploy(deployment_tag="multi_models", deployments=deployments) diff --git a/examples/multi_model/query.py b/examples/multi_model/query.py index caf85934..bf760b49 100644 --- a/examples/multi_model/query.py +++ b/examples/multi_model/query.py @@ -8,33 +8,39 @@ results = [] generator = mii.mii_query_handle("multi_models") result = generator.query( - {"query": ["DeepSpeed is", - "Seattle is"], - - "deployment_name": "bigscience/bloom-560m_deployment" + { + "query": ["DeepSpeed is", + "Seattle is"], + "deployment_name": "bigscience/bloom-560m_deployment" }, do_sample=True, max_new_tokens=30, ) results.append(result) -result = generator.query({'query': "DeepSpeed is the greatest", "deployment_name": "microsoft/DialogRPT-human-vs-rand_deployment"}) +result = generator.query({ + 'query': + "DeepSpeed is the greatest", + "deployment_name": + "microsoft/DialogRPT-human-vs-rand_deployment" +}) results.append(result) -result = generator.query( - { - 'text': "DeepSpeed is the greatest", - 'conversation_id': 3, - 'past_user_inputs': [], - 'generated_responses': [], - "deployment_name": "microsoft/DialoGPT-large_deployment" - }) +result = generator.query({ + 'text': "DeepSpeed is the greatest", + 'conversation_id': 3, + 'past_user_inputs': [], + 'generated_responses': [], + "deployment_name": "microsoft/DialoGPT-large_deployment" +}) results.append(result) -result = generator.query( - { - 'question': "What is the greatest?", - 'context': "DeepSpeed is the greatest", - "deployment_name": "deepset/roberta-large-squad2" + "-qa-deployment" - }) +result = generator.query({ + 'question': + "What is the greatest?", + 'context': + "DeepSpeed is the greatest", + "deployment_name": + "deepset/roberta-large-squad2" + "-qa-deployment" +}) results.append(result) diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index 6f123a7f..94eec2d2 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -16,7 +16,6 @@ from mii.constants import GRPC_MAX_MSG_SIZE, CREATE_SESSION_METHOD, DESTROY_SESSION_METHOD, TERMINATE_METHOD, LB_MAX_WORKER_THREADS, SERVER_SHUTDOWN_TIMEOUT, Tasks from mii.method_table import GRPC_METHOD_TABLE from mii.client import create_channel -from mii.utils import get_task, unpack_proto_query_kwargs class ServiceBase(modelresponse_pb2_grpc.ModelResponseServicer): @@ -265,7 +264,7 @@ def invoke_intercept_method(request_proto, context): DESTROY_SESSION_METHOD, request_proto) return google_dot_protobuf_dot_empty__pb2.Empty() - + if "session_id" in request_proto.query_kwargs: session_id = request_proto.query_kwargs["session_id"] if session_id not in self.replica_sessions: diff --git a/mii/grpc_related/proto/modelresponse_pb2.py b/mii/grpc_related/proto/modelresponse_pb2.py index 53305ca5..7802b7f2 100644 --- a/mii/grpc_related/proto/modelresponse_pb2.py +++ b/mii/grpc_related/proto/modelresponse_pb2.py @@ -1,4 +1,8 @@ -# -*- coding: utf-8 -*- +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + # Generated by the protocol buffer compiler. DO NOT EDIT! 
# source: modelresponse.proto """Generated protocol buffer code.""" @@ -10,54 +14,54 @@ _sym_db = _symbol_database.Default() - from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 - -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x42\x0e\n\x0coneof_values\"\x1f\n\tSessionID\x12\x12\n\nsession_id\x18\x01 \x01(\t\"\xed\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\x85\x01\n\x11SingleStringReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x84\x01\n\x10MultiStringReply\x12\x10\n\x08response\x18\x01 \x03(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\tQARequest\x12\x10\n\x08question\x18\x01 \x01(\t\x12\x0f\n\x07\x63ontext\x18\x02 \x01(\t\x12?\n\x0cquery_kwargs\x18\x03 \x03(\x0b\x32).modelresponse.QARequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xd3\x02\n\x13\x43onversationRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x1c\n\x0f\x63onversation_id\x18\x02 \x01(\x03H\x00\x88\x01\x01\x12\x18\n\x10past_user_inputs\x18\x03 \x03(\t\x12\x1b\n\x13generated_responses\x18\x04 \x03(\t\x12I\n\x0cquery_kwargs\x18\x05 \x03(\x0b\x32\x33.modelresponse.ConversationRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x01\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_conversation_idB\x12\n\x10_deployment_name\"\xc3\x01\n\x11\x43onversationReply\x12\x17\n\x0f\x63onversation_id\x18\x01 \x01(\x03\x12\x18\n\x10past_user_inputs\x18\x02 \x03(\t\x12\x1b\n\x13generated_responses\x18\x03 \x03(\t\x12\x12\n\ntime_taken\x18\x04 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x05 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xaf\x01\n\nImageReply\x12\x0e\n\x06images\x18\x01 \x03(\x0c\x12\x1d\n\x15nsfw_content_detected\x18\x02 \x03(\x08\x12\x0c\n\x04mode\x18\x03 \x01(\t\x12\x0e\n\x06size_w\x18\x04 \x01(\x03\x12\x0e\n\x06size_h\x18\x05 
\x01(\x03\x12\x12\n\ntime_taken\x18\x06 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x07 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name2\xd4\x06\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x62\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( + b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x42\x0e\n\x0coneof_values\"\x1f\n\tSessionID\x12\x12\n\nsession_id\x18\x01 \x01(\t\"\xed\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\x85\x01\n\x11SingleStringReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x84\x01\n\x10MultiStringReply\x12\x10\n\x08response\x18\x01 \x03(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\tQARequest\x12\x10\n\x08question\x18\x01 \x01(\t\x12\x0f\n\x07\x63ontext\x18\x02 \x01(\t\x12?\n\x0cquery_kwargs\x18\x03 \x03(\x0b\x32).modelresponse.QARequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xd3\x02\n\x13\x43onversationRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x1c\n\x0f\x63onversation_id\x18\x02 
\x01(\x03H\x00\x88\x01\x01\x12\x18\n\x10past_user_inputs\x18\x03 \x03(\t\x12\x1b\n\x13generated_responses\x18\x04 \x03(\t\x12I\n\x0cquery_kwargs\x18\x05 \x03(\x0b\x32\x33.modelresponse.ConversationRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x01\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_conversation_idB\x12\n\x10_deployment_name\"\xc3\x01\n\x11\x43onversationReply\x12\x17\n\x0f\x63onversation_id\x18\x01 \x01(\x03\x12\x18\n\x10past_user_inputs\x18\x02 \x03(\t\x12\x1b\n\x13generated_responses\x18\x03 \x03(\t\x12\x12\n\ntime_taken\x18\x04 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x05 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xaf\x01\n\nImageReply\x12\x0e\n\x06images\x18\x01 \x03(\x0c\x12\x1d\n\x15nsfw_content_detected\x18\x02 \x03(\x08\x12\x0c\n\x04mode\x18\x03 \x01(\t\x12\x0e\n\x06size_w\x18\x04 \x01(\x03\x12\x0e\n\x06size_h\x18\x05 \x01(\x03\x12\x12\n\ntime_taken\x18\x06 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x07 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name2\xd4\x06\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x62\x06proto3' +) _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'modelresponse_pb2', _globals) if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None - _SINGLESTRINGREQUEST_QUERYKWARGSENTRY._options = None - _SINGLESTRINGREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' - _MULTISTRINGREQUEST_QUERYKWARGSENTRY._options = None - _MULTISTRINGREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' - _QAREQUEST_QUERYKWARGSENTRY._options = None - _QAREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' - _CONVERSATIONREQUEST_QUERYKWARGSENTRY._options = None - _CONVERSATIONREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' - _globals['_VALUE']._serialized_start=67 - _globals['_VALUE']._serialized_end=162 - _globals['_SESSIONID']._serialized_start=164 - _globals['_SESSIONID']._serialized_end=195 - _globals['_SINGLESTRINGREQUEST']._serialized_start=198 - _globals['_SINGLESTRINGREQUEST']._serialized_end=435 - _globals['_SINGLESTRINGREQUEST_QUERYKWARGSENTRY']._serialized_start=343 - _globals['_SINGLESTRINGREQUEST_QUERYKWARGSENTRY']._serialized_end=415 - _globals['_MULTISTRINGREQUEST']._serialized_start=438 - 
_globals['_MULTISTRINGREQUEST']._serialized_end=673 - _globals['_MULTISTRINGREQUEST_QUERYKWARGSENTRY']._serialized_start=343 - _globals['_MULTISTRINGREQUEST_QUERYKWARGSENTRY']._serialized_end=415 - _globals['_SINGLESTRINGREPLY']._serialized_start=676 - _globals['_SINGLESTRINGREPLY']._serialized_end=809 - _globals['_MULTISTRINGREPLY']._serialized_start=812 - _globals['_MULTISTRINGREPLY']._serialized_end=944 - _globals['_QAREQUEST']._serialized_start=947 - _globals['_QAREQUEST']._serialized_end=1182 - _globals['_QAREQUEST_QUERYKWARGSENTRY']._serialized_start=343 - _globals['_QAREQUEST_QUERYKWARGSENTRY']._serialized_end=415 - _globals['_CONVERSATIONREQUEST']._serialized_start=1185 - _globals['_CONVERSATIONREQUEST']._serialized_end=1524 - _globals['_CONVERSATIONREQUEST_QUERYKWARGSENTRY']._serialized_start=343 - _globals['_CONVERSATIONREQUEST_QUERYKWARGSENTRY']._serialized_end=415 - _globals['_CONVERSATIONREPLY']._serialized_start=1527 - _globals['_CONVERSATIONREPLY']._serialized_end=1722 - _globals['_IMAGEREPLY']._serialized_start=1725 - _globals['_IMAGEREPLY']._serialized_end=1900 - _globals['_MODELRESPONSE']._serialized_start=1903 - _globals['_MODELRESPONSE']._serialized_end=2755 + DESCRIPTOR._options = None + _SINGLESTRINGREQUEST_QUERYKWARGSENTRY._options = None + _SINGLESTRINGREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' + _MULTISTRINGREQUEST_QUERYKWARGSENTRY._options = None + _MULTISTRINGREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' + _QAREQUEST_QUERYKWARGSENTRY._options = None + _QAREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' + _CONVERSATIONREQUEST_QUERYKWARGSENTRY._options = None + _CONVERSATIONREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' + _globals['_VALUE']._serialized_start = 67 + _globals['_VALUE']._serialized_end = 162 + _globals['_SESSIONID']._serialized_start = 164 + _globals['_SESSIONID']._serialized_end = 195 + _globals['_SINGLESTRINGREQUEST']._serialized_start = 198 + _globals['_SINGLESTRINGREQUEST']._serialized_end = 435 + _globals['_SINGLESTRINGREQUEST_QUERYKWARGSENTRY']._serialized_start = 343 + _globals['_SINGLESTRINGREQUEST_QUERYKWARGSENTRY']._serialized_end = 415 + _globals['_MULTISTRINGREQUEST']._serialized_start = 438 + _globals['_MULTISTRINGREQUEST']._serialized_end = 673 + _globals['_MULTISTRINGREQUEST_QUERYKWARGSENTRY']._serialized_start = 343 + _globals['_MULTISTRINGREQUEST_QUERYKWARGSENTRY']._serialized_end = 415 + _globals['_SINGLESTRINGREPLY']._serialized_start = 676 + _globals['_SINGLESTRINGREPLY']._serialized_end = 809 + _globals['_MULTISTRINGREPLY']._serialized_start = 812 + _globals['_MULTISTRINGREPLY']._serialized_end = 944 + _globals['_QAREQUEST']._serialized_start = 947 + _globals['_QAREQUEST']._serialized_end = 1182 + _globals['_QAREQUEST_QUERYKWARGSENTRY']._serialized_start = 343 + _globals['_QAREQUEST_QUERYKWARGSENTRY']._serialized_end = 415 + _globals['_CONVERSATIONREQUEST']._serialized_start = 1185 + _globals['_CONVERSATIONREQUEST']._serialized_end = 1524 + _globals['_CONVERSATIONREQUEST_QUERYKWARGSENTRY']._serialized_start = 343 + _globals['_CONVERSATIONREQUEST_QUERYKWARGSENTRY']._serialized_end = 415 + _globals['_CONVERSATIONREPLY']._serialized_start = 1527 + _globals['_CONVERSATIONREPLY']._serialized_end = 1722 + _globals['_IMAGEREPLY']._serialized_start = 1725 + _globals['_IMAGEREPLY']._serialized_end = 1900 + _globals['_MODELRESPONSE']._serialized_start = 1903 + _globals['_MODELRESPONSE']._serialized_end = 2755 # @@protoc_insertion_point(module_scope) diff --git 
a/mii/grpc_related/proto/modelresponse_pb2_grpc.py b/mii/grpc_related/proto/modelresponse_pb2_grpc.py index 683e4962..95cfa825 100644 --- a/mii/grpc_related/proto/modelresponse_pb2_grpc.py +++ b/mii/grpc_related/proto/modelresponse_pb2_grpc.py @@ -1,3 +1,8 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! """Client and server classes corresponding to protobuf-defined services.""" import grpc @@ -8,7 +13,6 @@ class ModelResponseStub(object): """Missing associated documentation comment in .proto file.""" - def __init__(self, channel): """Constructor. @@ -16,60 +20,60 @@ def __init__(self, channel): channel: A grpc.Channel. """ self.Terminate = channel.unary_unary( - '/modelresponse.ModelResponse/Terminate', - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - ) + '/modelresponse.ModelResponse/Terminate', + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty. + SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) self.CreateSession = channel.unary_unary( - '/modelresponse.ModelResponse/CreateSession', - request_serializer=modelresponse__pb2.SessionID.SerializeToString, - response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - ) + '/modelresponse.ModelResponse/CreateSession', + request_serializer=modelresponse__pb2.SessionID.SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) self.DestroySession = channel.unary_unary( - '/modelresponse.ModelResponse/DestroySession', - request_serializer=modelresponse__pb2.SessionID.SerializeToString, - response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - ) + '/modelresponse.ModelResponse/DestroySession', + request_serializer=modelresponse__pb2.SessionID.SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) self.GeneratorReply = channel.unary_unary( - '/modelresponse.ModelResponse/GeneratorReply', - request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.MultiStringReply.FromString, - ) + '/modelresponse.ModelResponse/GeneratorReply', + request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.MultiStringReply.FromString, + ) self.ClassificationReply = channel.unary_unary( - '/modelresponse.ModelResponse/ClassificationReply', - request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) + '/modelresponse.ModelResponse/ClassificationReply', + request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) self.QuestionAndAnswerReply = channel.unary_unary( - '/modelresponse.ModelResponse/QuestionAndAnswerReply', - request_serializer=modelresponse__pb2.QARequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) + '/modelresponse.ModelResponse/QuestionAndAnswerReply', + request_serializer=modelresponse__pb2.QARequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) self.FillMaskReply = channel.unary_unary( - '/modelresponse.ModelResponse/FillMaskReply', - 
request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) + '/modelresponse.ModelResponse/FillMaskReply', + request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) self.TokenClassificationReply = channel.unary_unary( - '/modelresponse.ModelResponse/TokenClassificationReply', - request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) + '/modelresponse.ModelResponse/TokenClassificationReply', + request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) self.ConversationalReply = channel.unary_unary( - '/modelresponse.ModelResponse/ConversationalReply', - request_serializer=modelresponse__pb2.ConversationRequest.SerializeToString, - response_deserializer=modelresponse__pb2.ConversationReply.FromString, - ) + '/modelresponse.ModelResponse/ConversationalReply', + request_serializer=modelresponse__pb2.ConversationRequest.SerializeToString, + response_deserializer=modelresponse__pb2.ConversationReply.FromString, + ) self.Txt2ImgReply = channel.unary_unary( - '/modelresponse.ModelResponse/Txt2ImgReply', - request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.ImageReply.FromString, - ) + '/modelresponse.ModelResponse/Txt2ImgReply', + request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.ImageReply.FromString, + ) class ModelResponseServicer(object): """Missing associated documentation comment in .proto file.""" - def Terminate(self, request, context): """Missing associated documentation comment in .proto file.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) @@ -133,232 +137,334 @@ def Txt2ImgReply(self, request, context): def add_ModelResponseServicer_to_server(servicer, server): rpc_method_handlers = { - 'Terminate': grpc.unary_unary_rpc_method_handler( - servicer.Terminate, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - ), - 'CreateSession': grpc.unary_unary_rpc_method_handler( - servicer.CreateSession, - request_deserializer=modelresponse__pb2.SessionID.FromString, - response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - ), - 'DestroySession': grpc.unary_unary_rpc_method_handler( - servicer.DestroySession, - request_deserializer=modelresponse__pb2.SessionID.FromString, - response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - ), - 'GeneratorReply': grpc.unary_unary_rpc_method_handler( - servicer.GeneratorReply, - request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, - response_serializer=modelresponse__pb2.MultiStringReply.SerializeToString, - ), - 'ClassificationReply': grpc.unary_unary_rpc_method_handler( - servicer.ClassificationReply, - request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'QuestionAndAnswerReply': grpc.unary_unary_rpc_method_handler( - servicer.QuestionAndAnswerReply, - request_deserializer=modelresponse__pb2.QARequest.FromString, - 
response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'FillMaskReply': grpc.unary_unary_rpc_method_handler( - servicer.FillMaskReply, - request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'TokenClassificationReply': grpc.unary_unary_rpc_method_handler( - servicer.TokenClassificationReply, - request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'ConversationalReply': grpc.unary_unary_rpc_method_handler( - servicer.ConversationalReply, - request_deserializer=modelresponse__pb2.ConversationRequest.FromString, - response_serializer=modelresponse__pb2.ConversationReply.SerializeToString, - ), - 'Txt2ImgReply': grpc.unary_unary_rpc_method_handler( - servicer.Txt2ImgReply, - request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, - response_serializer=modelresponse__pb2.ImageReply.SerializeToString, - ), + 'Terminate': + grpc.unary_unary_rpc_method_handler( + servicer.Terminate, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. + SerializeToString, + ), + 'CreateSession': + grpc.unary_unary_rpc_method_handler( + servicer.CreateSession, + request_deserializer=modelresponse__pb2.SessionID.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. + SerializeToString, + ), + 'DestroySession': + grpc.unary_unary_rpc_method_handler( + servicer.DestroySession, + request_deserializer=modelresponse__pb2.SessionID.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. + SerializeToString, + ), + 'GeneratorReply': + grpc.unary_unary_rpc_method_handler( + servicer.GeneratorReply, + request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, + response_serializer=modelresponse__pb2.MultiStringReply.SerializeToString, + ), + 'ClassificationReply': + grpc.unary_unary_rpc_method_handler( + servicer.ClassificationReply, + request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'QuestionAndAnswerReply': + grpc.unary_unary_rpc_method_handler( + servicer.QuestionAndAnswerReply, + request_deserializer=modelresponse__pb2.QARequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'FillMaskReply': + grpc.unary_unary_rpc_method_handler( + servicer.FillMaskReply, + request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'TokenClassificationReply': + grpc.unary_unary_rpc_method_handler( + servicer.TokenClassificationReply, + request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'ConversationalReply': + grpc.unary_unary_rpc_method_handler( + servicer.ConversationalReply, + request_deserializer=modelresponse__pb2.ConversationRequest.FromString, + response_serializer=modelresponse__pb2.ConversationReply.SerializeToString, + ), + 'Txt2ImgReply': + grpc.unary_unary_rpc_method_handler( + servicer.Txt2ImgReply, + request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, + response_serializer=modelresponse__pb2.ImageReply.SerializeToString, + ), } - generic_handler = 
grpc.method_handlers_generic_handler( - 'modelresponse.ModelResponse', rpc_method_handlers) - server.add_generic_rpc_handlers((generic_handler,)) + generic_handler = grpc.method_handlers_generic_handler('modelresponse.ModelResponse', + rpc_method_handlers) + server.add_generic_rpc_handlers((generic_handler, )) - # This class is part of an EXPERIMENTAL API. +# This class is part of an EXPERIMENTAL API. class ModelResponse(object): """Missing associated documentation comment in .proto file.""" - @staticmethod def Terminate(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/Terminate', + '/modelresponse.ModelResponse/Terminate', google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, google_dot_protobuf_dot_empty__pb2.Empty.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def CreateSession(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/CreateSession', + '/modelresponse.ModelResponse/CreateSession', modelresponse__pb2.SessionID.SerializeToString, google_dot_protobuf_dot_empty__pb2.Empty.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def DestroySession(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/DestroySession', + '/modelresponse.ModelResponse/DestroySession', modelresponse__pb2.SessionID.SerializeToString, google_dot_protobuf_dot_empty__pb2.Empty.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def GeneratorReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - 
channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/GeneratorReply', + '/modelresponse.ModelResponse/GeneratorReply', modelresponse__pb2.MultiStringRequest.SerializeToString, modelresponse__pb2.MultiStringReply.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def ClassificationReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/ClassificationReply', + '/modelresponse.ModelResponse/ClassificationReply', modelresponse__pb2.SingleStringRequest.SerializeToString, modelresponse__pb2.SingleStringReply.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def QuestionAndAnswerReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/QuestionAndAnswerReply', + '/modelresponse.ModelResponse/QuestionAndAnswerReply', modelresponse__pb2.QARequest.SerializeToString, modelresponse__pb2.SingleStringReply.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def FillMaskReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/FillMaskReply', + '/modelresponse.ModelResponse/FillMaskReply', modelresponse__pb2.SingleStringRequest.SerializeToString, modelresponse__pb2.SingleStringReply.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def TokenClassificationReply(request, + target, + options=(), + channel_credentials=None, + 
call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/TokenClassificationReply', + '/modelresponse.ModelResponse/TokenClassificationReply', modelresponse__pb2.SingleStringRequest.SerializeToString, modelresponse__pb2.SingleStringReply.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def ConversationalReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/ConversationalReply', + '/modelresponse.ModelResponse/ConversationalReply', modelresponse__pb2.ConversationRequest.SerializeToString, modelresponse__pb2.ConversationReply.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def Txt2ImgReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/Txt2ImgReply', + '/modelresponse.ModelResponse/Txt2ImgReply', modelresponse__pb2.MultiStringRequest.SerializeToString, modelresponse__pb2.ImageReply.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) From 94b66993702d54223f377cdcd6a5af19f524236b Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Fri, 14 Jul 2023 16:51:58 +0000 Subject: [PATCH 43/69] Fixing the client --- mii/client.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mii/client.py b/mii/client.py index d71dce89..5d18b81c 100644 --- a/mii/client.py +++ b/mii/client.py @@ -90,6 +90,7 @@ async def _request_async_response(self, request_dict, task, **query_kwargs): def query(self, request_dict, **query_kwargs): deployment_name = request_dict.get('deployment_name') deployment_name, task = self._get_deployment_task(deployment_name) + request_dict['deployment_name'] = deployment_name return self.asyncio_loop.run_until_complete( self._request_async_response(request_dict, task, From 710c20b901634dff1602a1e53f59010022d52b2c Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Fri, 21 
Jul 2023 00:02:42 +0000 Subject: [PATCH 44/69] Initial partial deployment commit --- mii/__init__.py | 5 + mii/client.py | 54 +- mii/config.py | 1 + mii/constants.py | 3 +- mii/deployment.py | 55 +- mii/grpc_related/modelresponse_server.py | 22 +- mii/grpc_related/proto/modelresponse.proto | 4 + mii/grpc_related/proto/modelresponse_pb2.py | 92 ++- .../proto/modelresponse_pb2_grpc.py | 581 ++++++++---------- 9 files changed, 439 insertions(+), 378 deletions(-) diff --git a/mii/__init__.py b/mii/__init__.py index 66748a56..aba912f3 100644 --- a/mii/__init__.py +++ b/mii/__init__.py @@ -15,6 +15,11 @@ __version__ = "0.0.0" non_persistent_models = {} +port_map = {} +deployment_tag = "" +lb_config = None +model_path = None +deployment_type = None try: from .version import __version__ except ImportError: diff --git a/mii/client.py b/mii/client.py index 5d18b81c..f76105ed 100644 --- a/mii/client.py +++ b/mii/client.py @@ -8,8 +8,10 @@ import mii from mii.utils import get_task from mii.grpc_related.proto import modelresponse_pb2, modelresponse_pb2_grpc -from mii.constants import GRPC_MAX_MSG_SIZE, Tasks +from mii.constants import GRPC_MAX_MSG_SIZE, Tasks, DeploymentType from mii.method_table import GRPC_METHOD_TABLE +from mii.deployment import allocate_processes, create_score_file +from mii.config import DeploymentConfig def _get_deployment_configs(deployment_tag): @@ -18,6 +20,7 @@ def _get_deployment_configs(deployment_tag): for deployment in configs: if not isinstance(configs[deployment], dict): continue + configs[deployment][mii.constants.DEPLOYED_KEY] = True deployments.append(configs[deployment]) return deployments @@ -60,7 +63,7 @@ class MIIClient(): def __init__(self, deployments, host, port): self.asyncio_loop = asyncio.get_event_loop() channel = create_channel(host, port) - self.stub = modelresponse_pb2_grpc.ModelResponseStub(channel) + self.stub = modelresponse_pb2_grpc.DeploymentManagementStub(channel) #self.task = get_task(task_name) self.deployments = deployments @@ -126,7 +129,54 @@ def destroy_session(self, session_id, deployment_name=None): assert task == Tasks.TEXT_GENERATION, f"Session deletion only available for task '{Tasks.TEXT_GENERATION}'." 
self.asyncio_loop.run_until_complete(self.destroy_session_async(session_id)) + async def add_models_async(self, request=None): + await getattr(self.stub, "AddDeployment")(modelresponse_pb2.google_dot_protobuf_dot_empty__pb2.Empty()) + + def add_models(self, + task=None, + model=None, + deployment_name=None, + enable_deepspeed=True, + enable_zero=False, + ds_config=None, + mii_config={}, + deployment_tag=None, + deployments=[], + deployment_type=DeploymentType.LOCAL, + model_path=None, + version=1): + if not deployments: + assert all((model, task, deployment_name)), "model, task, and deployment name must be set to deploy singular model" + deployments = [ + DeploymentConfig(deployment_name=deployment_name, + task=task, + model=model, + enable_deepspeed=enable_deepspeed, + enable_zero=enable_zero, + GPU_index_map=None, + mii_config=mii.config.MIIConfig(**mii_config), + ds_config=ds_config, + version=version, + deployed=False) + ] + """ + deployment_tag = mii.deployment_tag + lb_config = allocate_processes(deployments) + if mii.lb_config is not None: + mii.lb_config.replica_configs.extend(lb_config.replica_configs) + else: + mii.lb_config = lb_config + self.deployments.extend(deployments) + if mii.model_path is None and deployment_type == DeploymentType.LOCAL: + mii.model_path = MII_MODEL_PATH_DEFAULT + elif mii.model_path is None and deployment_type == DeploymentType.AML: + model_path = "model" + create_score_file(deployment_tag=deployment_tag, deployment_type=mii.deployment_type, deployments=self.deployments, model_path=mii.model_path, lb_config=mii.lb_config) + if mii.deployment_type == DeploymentType.Local: + mii.utils.import_score_file(deployment_tag).init() + """ + self.asyncio_loop.run_until_complete(self.add_models_async()) class MIITensorParallelClient(): """ Client to send queries to multiple endpoints in parallel. 
diff --git a/mii/config.py b/mii/config.py index d7a246c0..ea3fbe43 100644 --- a/mii/config.py +++ b/mii/config.py @@ -140,3 +140,4 @@ class DeploymentConfig(BaseModel): mii_config: MIIConfig = MIIConfig.parse_obj({}) ds_config: dict = None version: int = 1 + deployed: bool = False diff --git a/mii/constants.py b/mii/constants.py index 29493433..baffdcf9 100644 --- a/mii/constants.py +++ b/mii/constants.py @@ -99,7 +99,7 @@ class ModelProvider(enum.Enum): ENABLE_DEEPSPEED_ZERO_KEY = 'ds_zero' DEEPSPEED_CONFIG_KEY = 'ds_config' CHECKPOINT_KEY = "checkpoint" - +DEPLOYED_KEY = "deployed" MII_CACHE_PATH = "MII_CACHE_PATH" MII_CACHE_PATH_DEFAULT = "/tmp/mii_cache" @@ -118,6 +118,7 @@ class ModelProvider(enum.Enum): TERMINATE_METHOD = "Terminate" CREATE_SESSION_METHOD = "CreateSession" DESTROY_SESSION_METHOD = "DestroySession" +ADD_DEPLOYMENT_METHOD = "AddDeployment" LB_MAX_WORKER_THREADS = 32 diff --git a/mii/deployment.py b/mii/deployment.py index dc970035..ae539b10 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -68,6 +68,7 @@ def deploy(task=None, If deployment_type is `LOCAL`, returns just the name of the deployment that can be used to create a query handle using `mii.mii_query_handle(deployment_name)` """ + mii.deployment_type = deployment_type if not deployments: assert all((model, task, deployment_name)), "model, task, and deployment name must be set to deploy singular model" deployments = [ @@ -79,12 +80,13 @@ def deploy(task=None, GPU_index_map=None, mii_config=mii.config.MIIConfig(**mii_config), ds_config=ds_config, - version=version) + version=version, + deployed=False) ] deployment_tag = deployment_name else: assert deployment_tag is not None, "deployment_tag must be set to deploy multiple models" - + mii.deployment_tag = deployment_tag # parse and validate mii config for deployment in deployments: mii_config = deployment.mii_config @@ -125,10 +127,10 @@ def deploy(task=None, model_path = MII_MODEL_PATH_DEFAULT elif model_path is None and deployment_type == DeploymentType.AML: model_path = "model" - + + mii.model_path = model_path # add fields for replica deployment replica_configs = [] - port_map = {} port_offset = 1 for deployment in deployments: mii_config = deployment.mii_config @@ -139,16 +141,16 @@ def deploy(task=None, for i, (hostname, gpu_indices) in enumerate(replica_pool): # Reserver port for a LB proxy when replication is enabled - if hostname not in port_map: - port_map[hostname] = set() + if hostname not in mii.port_map: + mii.port_map[hostname] = set() base_port = mii_config.port_number + i * mii_config.tensor_parallel + port_offset - if base_port in port_map[hostname]: - base_port = max(port_map[hostname]) + 1 + if base_port in mii.port_map[hostname]: + base_port = max(mii.port_map[hostname]) + 1 tensor_parallel_ports = list( range(base_port, base_port + mii_config.tensor_parallel)) for i in range(base_port, base_port + mii_config.tensor_parallel): - port_map[hostname].add(i) + mii.port_map[hostname].add(i) torch_dist_port = mii_config.torch_dist_port + i replica_configs.append( ReplicaConfig(task=get_task_name(deployment.task), @@ -166,7 +168,7 @@ def deploy(task=None, deployments=deployments, model_path=model_path, lb_config=lb_config) - + if deployment_type == DeploymentType.AML: _deploy_aml(deployment_tag=deployment_tag, model_name=model, version=version) elif deployment_type == DeploymentType.LOCAL: @@ -186,6 +188,39 @@ def deploy(task=None, else: raise Exception(f"Unknown deployment type: {deployment_type}") +def allocate_processes(deployments): + 
replica_configs = [] + port_offset = 1 + for deployment in deployments: + mii_config = deployment.mii_config + replica_pool = _allocate_processes(mii_config.hostfile, + mii_config.tensor_parallel, + mii_config.replica_num, + deployment.GPU_index_map) + + for i, (hostname, gpu_indices) in enumerate(replica_pool): + # Reserver port for a LB proxy when replication is enabled + if hostname not in mii.port_map: + mii.port_map[hostname] = set() + base_port = mii_config.port_number + i * mii_config.tensor_parallel + port_offset + if base_port in mii.port_map[hostname]: + base_port = max(mii.port_map[hostname]) + 1 + tensor_parallel_ports = list( + range(base_port, + base_port + mii_config.tensor_parallel)) + for i in range(base_port, base_port + mii_config.tensor_parallel): + mii.port_map[hostname].add(i) + torch_dist_port = mii_config.torch_dist_port + i + replica_configs.append( + ReplicaConfig(task=get_task_name(deployment.task), + deployment_name=deployment.deployment_name, + hostname=hostname, + tensor_parallel_ports=tensor_parallel_ports, + torch_dist_port=torch_dist_port, + gpu_indices=gpu_indices)) + lb_config = LoadBalancerConfig(port=mii_config.port_number, + replica_configs=replica_configs) + return lb_config def _deploy_local(deployment_tag, model_path): mii.utils.import_score_file(deployment_tag).init() diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index 94eec2d2..ab55ae32 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -13,7 +13,7 @@ import threading import time -from mii.constants import GRPC_MAX_MSG_SIZE, CREATE_SESSION_METHOD, DESTROY_SESSION_METHOD, TERMINATE_METHOD, LB_MAX_WORKER_THREADS, SERVER_SHUTDOWN_TIMEOUT, Tasks +from mii.constants import GRPC_MAX_MSG_SIZE, ADD_DEPLOYMENT_METHOD, CREATE_SESSION_METHOD, DESTROY_SESSION_METHOD, TERMINATE_METHOD, LB_MAX_WORKER_THREADS, SERVER_SHUTDOWN_TIMEOUT, Tasks from mii.method_table import GRPC_METHOD_TABLE from mii.client import create_channel @@ -32,6 +32,11 @@ def Terminate(self, request, context): def get_stop_event(self): return self._stop_event +class DeploymentManagement(ServiceBase, modelresponse_pb2_grpc.DeploymentManagementServicer): + def AddDeployment(self, request, context): + print("TESTING ADD DEPLOYMENT") + return google_dot_protobuf_dot_empty__pb2.Empty() + class ModelResponse(ServiceBase): """ @@ -142,7 +147,7 @@ def __init__(self, host, ports): self.stubs = [] for port in ports: channel = create_channel(host, port) - stub = modelresponse_pb2_grpc.ModelResponseStub(channel) + stub = modelresponse_pb2_grpc.DeploymentManagementStub(channel) self.stubs.append(stub) self.asyncio_loop = asyncio.get_event_loop() @@ -198,12 +203,19 @@ def choose_stub(self, call_count): def intercept_service(self, continuation, handler_call_details): next_handler = continuation(handler_call_details) + print(next_handler) assert next_handler.unary_unary is not None #USE KWARGS LIKE THEY ARE USED TO MAKE SESSIONS TO GET THE DEPLOYMENT NAME TO HASH THE COUNTERS/STUBS def invoke_intercept_method(request_proto, context): method_name = _get_grpc_method_name(handler_call_details.method) + if method_name == ADD_DEPLOYMENT_METHOD: + for name in self.stubs: + for stub in self.stubs[name]: + stub.invoke(ADD_DEPLOYMENT_METHOD, request_proto) + return google_dot_protobuf_dot_empty__pb2.Empty() + if method_name == TERMINATE_METHOD: for deployment in self.stubs: for stub in self.stubs[deployment]: @@ -290,7 +302,7 @@ def _do_serve(service_impl, port, 
interceptors=[]): GRPC_MAX_MSG_SIZE), ('grpc.max_receive_message_length', GRPC_MAX_MSG_SIZE)]) - modelresponse_pb2_grpc.add_ModelResponseServicer_to_server(service_impl, server) + modelresponse_pb2_grpc.add_DeploymentManagementServicer_to_server(service_impl, server) server.add_insecure_port(f'[::]:{port}') print(f"About to start server") server.start() @@ -300,11 +312,11 @@ def _do_serve(service_impl, port, interceptors=[]): def serve_inference(inference_pipeline, port): - _do_serve(ModelResponse(inference_pipeline), port) + _do_serve(DeploymentManagement(), port) def serve_load_balancing(lb_config): - _do_serve(ServiceBase(), + _do_serve(DeploymentManagement(), lb_config.port, [LoadBalancingInterceptor(lb_config.replica_configs)]) diff --git a/mii/grpc_related/proto/modelresponse.proto b/mii/grpc_related/proto/modelresponse.proto index ce55522b..c622074e 100644 --- a/mii/grpc_related/proto/modelresponse.proto +++ b/mii/grpc_related/proto/modelresponse.proto @@ -36,6 +36,10 @@ service ModelResponse { rpc Txt2ImgReply(MultiStringRequest) returns (ImageReply) {} } +service DeploymentManagement { + rpc AddDeployment(google.protobuf.Empty) returns (google.protobuf.Empty) {} +} + message Value { oneof oneof_values { string svalue = 1; diff --git a/mii/grpc_related/proto/modelresponse_pb2.py b/mii/grpc_related/proto/modelresponse_pb2.py index 7802b7f2..515ebb80 100644 --- a/mii/grpc_related/proto/modelresponse_pb2.py +++ b/mii/grpc_related/proto/modelresponse_pb2.py @@ -1,8 +1,4 @@ -# Copyright (c) Microsoft Corporation. -# SPDX-License-Identifier: Apache-2.0 - -# DeepSpeed Team - +# -*- coding: utf-8 -*- # Generated by the protocol buffer compiler. DO NOT EDIT! # source: modelresponse.proto """Generated protocol buffer code.""" @@ -14,54 +10,56 @@ _sym_db = _symbol_database.Default() + from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x42\x0e\n\x0coneof_values\"\x1f\n\tSessionID\x12\x12\n\nsession_id\x18\x01 \x01(\t\"\xed\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\x85\x01\n\x11SingleStringReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x84\x01\n\x10MultiStringReply\x12\x10\n\x08response\x18\x01 \x03(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 
\x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\tQARequest\x12\x10\n\x08question\x18\x01 \x01(\t\x12\x0f\n\x07\x63ontext\x18\x02 \x01(\t\x12?\n\x0cquery_kwargs\x18\x03 \x03(\x0b\x32).modelresponse.QARequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xd3\x02\n\x13\x43onversationRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x1c\n\x0f\x63onversation_id\x18\x02 \x01(\x03H\x00\x88\x01\x01\x12\x18\n\x10past_user_inputs\x18\x03 \x03(\t\x12\x1b\n\x13generated_responses\x18\x04 \x03(\t\x12I\n\x0cquery_kwargs\x18\x05 \x03(\x0b\x32\x33.modelresponse.ConversationRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x01\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_conversation_idB\x12\n\x10_deployment_name\"\xc3\x01\n\x11\x43onversationReply\x12\x17\n\x0f\x63onversation_id\x18\x01 \x01(\x03\x12\x18\n\x10past_user_inputs\x18\x02 \x03(\t\x12\x1b\n\x13generated_responses\x18\x03 \x03(\t\x12\x12\n\ntime_taken\x18\x04 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x05 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xaf\x01\n\nImageReply\x12\x0e\n\x06images\x18\x01 \x03(\x0c\x12\x1d\n\x15nsfw_content_detected\x18\x02 \x03(\x08\x12\x0c\n\x04mode\x18\x03 \x01(\t\x12\x0e\n\x06size_w\x18\x04 \x01(\x03\x12\x0e\n\x06size_h\x18\x05 \x01(\x03\x12\x12\n\ntime_taken\x18\x06 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x07 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name2\xd4\x06\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x62\x06proto3' -) + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x42\x0e\n\x0coneof_values\"\x1f\n\tSessionID\x12\x12\n\nsession_id\x18\x01 \x01(\t\"\xed\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 
\x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\x85\x01\n\x11SingleStringReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x84\x01\n\x10MultiStringReply\x12\x10\n\x08response\x18\x01 \x03(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\tQARequest\x12\x10\n\x08question\x18\x01 \x01(\t\x12\x0f\n\x07\x63ontext\x18\x02 \x01(\t\x12?\n\x0cquery_kwargs\x18\x03 \x03(\x0b\x32).modelresponse.QARequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xd3\x02\n\x13\x43onversationRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x1c\n\x0f\x63onversation_id\x18\x02 \x01(\x03H\x00\x88\x01\x01\x12\x18\n\x10past_user_inputs\x18\x03 \x03(\t\x12\x1b\n\x13generated_responses\x18\x04 \x03(\t\x12I\n\x0cquery_kwargs\x18\x05 \x03(\x0b\x32\x33.modelresponse.ConversationRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x01\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_conversation_idB\x12\n\x10_deployment_name\"\xc3\x01\n\x11\x43onversationReply\x12\x17\n\x0f\x63onversation_id\x18\x01 \x01(\x03\x12\x18\n\x10past_user_inputs\x18\x02 \x03(\t\x12\x1b\n\x13generated_responses\x18\x03 \x03(\t\x12\x12\n\ntime_taken\x18\x04 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x05 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xaf\x01\n\nImageReply\x12\x0e\n\x06images\x18\x01 \x03(\x0c\x12\x1d\n\x15nsfw_content_detected\x18\x02 \x03(\x08\x12\x0c\n\x04mode\x18\x03 \x01(\t\x12\x0e\n\x06size_w\x18\x04 \x01(\x03\x12\x0e\n\x06size_h\x18\x05 \x01(\x03\x12\x12\n\ntime_taken\x18\x06 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x07 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name2\xd4\x06\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a 
.modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x32Y\n\x14\x44\x65ploymentManagement\x12\x41\n\rAddDeployment\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x62\x06proto3') _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'modelresponse_pb2', _globals) if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None - _SINGLESTRINGREQUEST_QUERYKWARGSENTRY._options = None - _SINGLESTRINGREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' - _MULTISTRINGREQUEST_QUERYKWARGSENTRY._options = None - _MULTISTRINGREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' - _QAREQUEST_QUERYKWARGSENTRY._options = None - _QAREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' - _CONVERSATIONREQUEST_QUERYKWARGSENTRY._options = None - _CONVERSATIONREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' - _globals['_VALUE']._serialized_start = 67 - _globals['_VALUE']._serialized_end = 162 - _globals['_SESSIONID']._serialized_start = 164 - _globals['_SESSIONID']._serialized_end = 195 - _globals['_SINGLESTRINGREQUEST']._serialized_start = 198 - _globals['_SINGLESTRINGREQUEST']._serialized_end = 435 - _globals['_SINGLESTRINGREQUEST_QUERYKWARGSENTRY']._serialized_start = 343 - _globals['_SINGLESTRINGREQUEST_QUERYKWARGSENTRY']._serialized_end = 415 - _globals['_MULTISTRINGREQUEST']._serialized_start = 438 - _globals['_MULTISTRINGREQUEST']._serialized_end = 673 - _globals['_MULTISTRINGREQUEST_QUERYKWARGSENTRY']._serialized_start = 343 - _globals['_MULTISTRINGREQUEST_QUERYKWARGSENTRY']._serialized_end = 415 - _globals['_SINGLESTRINGREPLY']._serialized_start = 676 - _globals['_SINGLESTRINGREPLY']._serialized_end = 809 - _globals['_MULTISTRINGREPLY']._serialized_start = 812 - _globals['_MULTISTRINGREPLY']._serialized_end = 944 - _globals['_QAREQUEST']._serialized_start = 947 - _globals['_QAREQUEST']._serialized_end = 1182 - _globals['_QAREQUEST_QUERYKWARGSENTRY']._serialized_start = 343 - _globals['_QAREQUEST_QUERYKWARGSENTRY']._serialized_end = 415 - _globals['_CONVERSATIONREQUEST']._serialized_start = 1185 - _globals['_CONVERSATIONREQUEST']._serialized_end = 1524 - _globals['_CONVERSATIONREQUEST_QUERYKWARGSENTRY']._serialized_start = 343 - _globals['_CONVERSATIONREQUEST_QUERYKWARGSENTRY']._serialized_end = 415 - _globals['_CONVERSATIONREPLY']._serialized_start = 1527 - _globals['_CONVERSATIONREPLY']._serialized_end = 1722 - _globals['_IMAGEREPLY']._serialized_start = 1725 - _globals['_IMAGEREPLY']._serialized_end = 1900 - _globals['_MODELRESPONSE']._serialized_start = 1903 - _globals['_MODELRESPONSE']._serialized_end = 2755 + DESCRIPTOR._options = None + _SINGLESTRINGREQUEST_QUERYKWARGSENTRY._options = None + _SINGLESTRINGREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' + _MULTISTRINGREQUEST_QUERYKWARGSENTRY._options = None + _MULTISTRINGREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' + _QAREQUEST_QUERYKWARGSENTRY._options = None + 
_QAREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' + _CONVERSATIONREQUEST_QUERYKWARGSENTRY._options = None + _CONVERSATIONREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' + _globals['_VALUE']._serialized_start=67 + _globals['_VALUE']._serialized_end=162 + _globals['_SESSIONID']._serialized_start=164 + _globals['_SESSIONID']._serialized_end=195 + _globals['_SINGLESTRINGREQUEST']._serialized_start=198 + _globals['_SINGLESTRINGREQUEST']._serialized_end=435 + _globals['_SINGLESTRINGREQUEST_QUERYKWARGSENTRY']._serialized_start=343 + _globals['_SINGLESTRINGREQUEST_QUERYKWARGSENTRY']._serialized_end=415 + _globals['_MULTISTRINGREQUEST']._serialized_start=438 + _globals['_MULTISTRINGREQUEST']._serialized_end=673 + _globals['_MULTISTRINGREQUEST_QUERYKWARGSENTRY']._serialized_start=343 + _globals['_MULTISTRINGREQUEST_QUERYKWARGSENTRY']._serialized_end=415 + _globals['_SINGLESTRINGREPLY']._serialized_start=676 + _globals['_SINGLESTRINGREPLY']._serialized_end=809 + _globals['_MULTISTRINGREPLY']._serialized_start=812 + _globals['_MULTISTRINGREPLY']._serialized_end=944 + _globals['_QAREQUEST']._serialized_start=947 + _globals['_QAREQUEST']._serialized_end=1182 + _globals['_QAREQUEST_QUERYKWARGSENTRY']._serialized_start=343 + _globals['_QAREQUEST_QUERYKWARGSENTRY']._serialized_end=415 + _globals['_CONVERSATIONREQUEST']._serialized_start=1185 + _globals['_CONVERSATIONREQUEST']._serialized_end=1524 + _globals['_CONVERSATIONREQUEST_QUERYKWARGSENTRY']._serialized_start=343 + _globals['_CONVERSATIONREQUEST_QUERYKWARGSENTRY']._serialized_end=415 + _globals['_CONVERSATIONREPLY']._serialized_start=1527 + _globals['_CONVERSATIONREPLY']._serialized_end=1722 + _globals['_IMAGEREPLY']._serialized_start=1725 + _globals['_IMAGEREPLY']._serialized_end=1900 + _globals['_MODELRESPONSE']._serialized_start=1903 + _globals['_MODELRESPONSE']._serialized_end=2755 + _globals['_DEPLOYMENTMANAGEMENT']._serialized_start=2757 + _globals['_DEPLOYMENTMANAGEMENT']._serialized_end=2846 # @@protoc_insertion_point(module_scope) diff --git a/mii/grpc_related/proto/modelresponse_pb2_grpc.py b/mii/grpc_related/proto/modelresponse_pb2_grpc.py index 95cfa825..438fa0c2 100644 --- a/mii/grpc_related/proto/modelresponse_pb2_grpc.py +++ b/mii/grpc_related/proto/modelresponse_pb2_grpc.py @@ -1,8 +1,3 @@ -# Copyright (c) Microsoft Corporation. -# SPDX-License-Identifier: Apache-2.0 - -# DeepSpeed Team - # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! """Client and server classes corresponding to protobuf-defined services.""" import grpc @@ -13,6 +8,7 @@ class ModelResponseStub(object): """Missing associated documentation comment in .proto file.""" + def __init__(self, channel): """Constructor. @@ -20,60 +16,60 @@ def __init__(self, channel): channel: A grpc.Channel. """ self.Terminate = channel.unary_unary( - '/modelresponse.ModelResponse/Terminate', - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty. 
- SerializeToString, - response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - ) + '/modelresponse.ModelResponse/Terminate', + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) self.CreateSession = channel.unary_unary( - '/modelresponse.ModelResponse/CreateSession', - request_serializer=modelresponse__pb2.SessionID.SerializeToString, - response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - ) + '/modelresponse.ModelResponse/CreateSession', + request_serializer=modelresponse__pb2.SessionID.SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) self.DestroySession = channel.unary_unary( - '/modelresponse.ModelResponse/DestroySession', - request_serializer=modelresponse__pb2.SessionID.SerializeToString, - response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - ) + '/modelresponse.ModelResponse/DestroySession', + request_serializer=modelresponse__pb2.SessionID.SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) self.GeneratorReply = channel.unary_unary( - '/modelresponse.ModelResponse/GeneratorReply', - request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.MultiStringReply.FromString, - ) + '/modelresponse.ModelResponse/GeneratorReply', + request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.MultiStringReply.FromString, + ) self.ClassificationReply = channel.unary_unary( - '/modelresponse.ModelResponse/ClassificationReply', - request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) + '/modelresponse.ModelResponse/ClassificationReply', + request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) self.QuestionAndAnswerReply = channel.unary_unary( - '/modelresponse.ModelResponse/QuestionAndAnswerReply', - request_serializer=modelresponse__pb2.QARequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) + '/modelresponse.ModelResponse/QuestionAndAnswerReply', + request_serializer=modelresponse__pb2.QARequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) self.FillMaskReply = channel.unary_unary( - '/modelresponse.ModelResponse/FillMaskReply', - request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) + '/modelresponse.ModelResponse/FillMaskReply', + request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) self.TokenClassificationReply = channel.unary_unary( - '/modelresponse.ModelResponse/TokenClassificationReply', - request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) + '/modelresponse.ModelResponse/TokenClassificationReply', + request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) self.ConversationalReply = channel.unary_unary( - 
'/modelresponse.ModelResponse/ConversationalReply', - request_serializer=modelresponse__pb2.ConversationRequest.SerializeToString, - response_deserializer=modelresponse__pb2.ConversationReply.FromString, - ) + '/modelresponse.ModelResponse/ConversationalReply', + request_serializer=modelresponse__pb2.ConversationRequest.SerializeToString, + response_deserializer=modelresponse__pb2.ConversationReply.FromString, + ) self.Txt2ImgReply = channel.unary_unary( - '/modelresponse.ModelResponse/Txt2ImgReply', - request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.ImageReply.FromString, - ) + '/modelresponse.ModelResponse/Txt2ImgReply', + request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.ImageReply.FromString, + ) class ModelResponseServicer(object): """Missing associated documentation comment in .proto file.""" + def Terminate(self, request, context): """Missing associated documentation comment in .proto file.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) @@ -137,334 +133,293 @@ def Txt2ImgReply(self, request, context): def add_ModelResponseServicer_to_server(servicer, server): rpc_method_handlers = { - 'Terminate': - grpc.unary_unary_rpc_method_handler( - servicer.Terminate, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. - SerializeToString, - ), - 'CreateSession': - grpc.unary_unary_rpc_method_handler( - servicer.CreateSession, - request_deserializer=modelresponse__pb2.SessionID.FromString, - response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. - SerializeToString, - ), - 'DestroySession': - grpc.unary_unary_rpc_method_handler( - servicer.DestroySession, - request_deserializer=modelresponse__pb2.SessionID.FromString, - response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. 
- SerializeToString, - ), - 'GeneratorReply': - grpc.unary_unary_rpc_method_handler( - servicer.GeneratorReply, - request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, - response_serializer=modelresponse__pb2.MultiStringReply.SerializeToString, - ), - 'ClassificationReply': - grpc.unary_unary_rpc_method_handler( - servicer.ClassificationReply, - request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'QuestionAndAnswerReply': - grpc.unary_unary_rpc_method_handler( - servicer.QuestionAndAnswerReply, - request_deserializer=modelresponse__pb2.QARequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'FillMaskReply': - grpc.unary_unary_rpc_method_handler( - servicer.FillMaskReply, - request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'TokenClassificationReply': - grpc.unary_unary_rpc_method_handler( - servicer.TokenClassificationReply, - request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'ConversationalReply': - grpc.unary_unary_rpc_method_handler( - servicer.ConversationalReply, - request_deserializer=modelresponse__pb2.ConversationRequest.FromString, - response_serializer=modelresponse__pb2.ConversationReply.SerializeToString, - ), - 'Txt2ImgReply': - grpc.unary_unary_rpc_method_handler( - servicer.Txt2ImgReply, - request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, - response_serializer=modelresponse__pb2.ImageReply.SerializeToString, - ), + 'Terminate': grpc.unary_unary_rpc_method_handler( + servicer.Terminate, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + ), + 'CreateSession': grpc.unary_unary_rpc_method_handler( + servicer.CreateSession, + request_deserializer=modelresponse__pb2.SessionID.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + ), + 'DestroySession': grpc.unary_unary_rpc_method_handler( + servicer.DestroySession, + request_deserializer=modelresponse__pb2.SessionID.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + ), + 'GeneratorReply': grpc.unary_unary_rpc_method_handler( + servicer.GeneratorReply, + request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, + response_serializer=modelresponse__pb2.MultiStringReply.SerializeToString, + ), + 'ClassificationReply': grpc.unary_unary_rpc_method_handler( + servicer.ClassificationReply, + request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'QuestionAndAnswerReply': grpc.unary_unary_rpc_method_handler( + servicer.QuestionAndAnswerReply, + request_deserializer=modelresponse__pb2.QARequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'FillMaskReply': grpc.unary_unary_rpc_method_handler( + servicer.FillMaskReply, + request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'TokenClassificationReply': grpc.unary_unary_rpc_method_handler( + servicer.TokenClassificationReply, + 
request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'ConversationalReply': grpc.unary_unary_rpc_method_handler( + servicer.ConversationalReply, + request_deserializer=modelresponse__pb2.ConversationRequest.FromString, + response_serializer=modelresponse__pb2.ConversationReply.SerializeToString, + ), + 'Txt2ImgReply': grpc.unary_unary_rpc_method_handler( + servicer.Txt2ImgReply, + request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, + response_serializer=modelresponse__pb2.ImageReply.SerializeToString, + ), } - generic_handler = grpc.method_handlers_generic_handler('modelresponse.ModelResponse', - rpc_method_handlers) - server.add_generic_rpc_handlers((generic_handler, )) + generic_handler = grpc.method_handlers_generic_handler( + 'modelresponse.ModelResponse', rpc_method_handlers) + server.add_generic_rpc_handlers((generic_handler,)) -# This class is part of an EXPERIMENTAL API. + # This class is part of an EXPERIMENTAL API. class ModelResponse(object): """Missing associated documentation comment in .proto file.""" + @staticmethod def Terminate(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, target, - '/modelresponse.ModelResponse/Terminate', + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/Terminate', google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, google_dot_protobuf_dot_empty__pb2.Empty.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod def CreateSession(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, target, - '/modelresponse.ModelResponse/CreateSession', + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/CreateSession', modelresponse__pb2.SessionID.SerializeToString, google_dot_protobuf_dot_empty__pb2.Empty.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod def DestroySession(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, target, - '/modelresponse.ModelResponse/DestroySession', + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, 
'/modelresponse.ModelResponse/DestroySession', modelresponse__pb2.SessionID.SerializeToString, google_dot_protobuf_dot_empty__pb2.Empty.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod def GeneratorReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, target, - '/modelresponse.ModelResponse/GeneratorReply', + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/GeneratorReply', modelresponse__pb2.MultiStringRequest.SerializeToString, modelresponse__pb2.MultiStringReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod def ClassificationReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, target, - '/modelresponse.ModelResponse/ClassificationReply', + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/ClassificationReply', modelresponse__pb2.SingleStringRequest.SerializeToString, modelresponse__pb2.SingleStringReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod def QuestionAndAnswerReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, target, - '/modelresponse.ModelResponse/QuestionAndAnswerReply', + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/QuestionAndAnswerReply', modelresponse__pb2.QARequest.SerializeToString, modelresponse__pb2.SingleStringReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod def FillMaskReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, target, - '/modelresponse.ModelResponse/FillMaskReply', + options=(), + channel_credentials=None, + 
call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/FillMaskReply', modelresponse__pb2.SingleStringRequest.SerializeToString, modelresponse__pb2.SingleStringReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod def TokenClassificationReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, target, - '/modelresponse.ModelResponse/TokenClassificationReply', + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/TokenClassificationReply', modelresponse__pb2.SingleStringRequest.SerializeToString, modelresponse__pb2.SingleStringReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod def ConversationalReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, target, - '/modelresponse.ModelResponse/ConversationalReply', + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/ConversationalReply', modelresponse__pb2.ConversationRequest.SerializeToString, modelresponse__pb2.ConversationReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod def Txt2ImgReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, target, - '/modelresponse.ModelResponse/Txt2ImgReply', + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/Txt2ImgReply', modelresponse__pb2.MultiStringRequest.SerializeToString, modelresponse__pb2.ImageReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + + +class DeploymentManagementStub(object): + """Missing associated documentation comment in .proto file.""" + + def __init__(self, channel): + """Constructor. 
+ + Args: + channel: A grpc.Channel. + """ + self.AddDeployment = channel.unary_unary( + '/modelresponse.DeploymentManagement/AddDeployment', + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) + + +class DeploymentManagementServicer(object): + """Missing associated documentation comment in .proto file.""" + + def AddDeployment(self, request, context): + """Missing associated documentation comment in .proto file.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + +def add_DeploymentManagementServicer_to_server(servicer, server): + rpc_method_handlers = { + 'AddDeployment': grpc.unary_unary_rpc_method_handler( + servicer.AddDeployment, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + ), + } + generic_handler = grpc.method_handlers_generic_handler( + 'modelresponse.DeploymentManagement', rpc_method_handlers) + server.add_generic_rpc_handlers((generic_handler,)) + + + # This class is part of an EXPERIMENTAL API. +class DeploymentManagement(object): + """Missing associated documentation comment in .proto file.""" + + @staticmethod + def AddDeployment(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/AddDeployment', + google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + google_dot_protobuf_dot_empty__pb2.Empty.FromString, + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) From c2636b7d36f50d270e21f7f8eb867abb3457c06c Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Fri, 21 Jul 2023 01:02:31 +0000 Subject: [PATCH 45/69] More partial deploy updates --- mii/client.py | 25 +++++++++++++++++---- mii/grpc_related/proto/modelresponse.proto | 10 +++++++++ mii/grpc_related/proto/modelresponse_pb2.py | 12 +++++----- mii/models/score/generate.py | 1 + mii/models/score/score_template.py | 10 +++++++-- mii/server.py | 12 ++++++---- 6 files changed, 55 insertions(+), 15 deletions(-) diff --git a/mii/client.py b/mii/client.py index f76105ed..14dc7273 100644 --- a/mii/client.py +++ b/mii/client.py @@ -43,8 +43,10 @@ def mii_query_handle(deployment_tag): return MIINonPersistentClient(task, deployment_tag) deployments = _get_deployment_configs(deployment_tag) - mii_configs_dict = deployments[0][mii.constants.MII_CONFIGS_KEY] - mii_configs = mii.config.MIIConfig(**mii_configs_dict) + if len(deployments) > 0: + mii_configs_dict = deployments[0][mii.constants.MII_CONFIGS_KEY] + mii_configs = mii.config.MIIConfig(**mii_configs_dict) + return MIIClient(deployments, "localhost", mii_configs.port_number) @@ -160,7 +162,7 @@ def add_models(self, deployed=False) ] - """ + deployment_tag = mii.deployment_tag lb_config = allocate_processes(deployments) if mii.lb_config is not None: @@ -172,10 +174,25 @@ def add_models(self, mii.model_path = MII_MODEL_PATH_DEFAULT elif mii.model_path is None and deployment_type == DeploymentType.AML: model_path = "model" + deps = [] + for deployment in self.deployments: + data = { + 'deployment_name': deployment[mii.constants.DEPLOYMENT_NAME_KEY], + 'task': 
deployment[mii.constants.TASK_NAME_KEY], + 'model': deployment[mii.constants.MODEL_NAME_KEY], + 'enable_deepspeed': deployment[mii.constants.ENABLE_DEEPSPEED_KEY], + 'enable_zero': deployment[mii.constants.ENABLE_DEEPSPEED_ZERO_KEY], + 'GPU_index_map': None, + 'mii_config': deployment[mii.constants.MII_CONFIGS_KEY], + 'ds_config': deployment[mii.constants.DEEPSPEED_CONFIG_KEY], + 'version': 1 + 'deployed' deployment[mii.constants.DEPLOYED_KEY] + } + create_score_file(deployment_tag=deployment_tag, deployment_type=mii.deployment_type, deployments=self.deployments, model_path=mii.model_path, lb_config=mii.lb_config) if mii.deployment_type == DeploymentType.Local: mii.utils.import_score_file(deployment_tag).init() - """ + self.asyncio_loop.run_until_complete(self.add_models_async()) class MIITensorParallelClient(): """ diff --git a/mii/grpc_related/proto/modelresponse.proto b/mii/grpc_related/proto/modelresponse.proto index c622074e..7e4d3520 100644 --- a/mii/grpc_related/proto/modelresponse.proto +++ b/mii/grpc_related/proto/modelresponse.proto @@ -113,3 +113,13 @@ message ImageReply { float time_taken = 6; optional string deployment_name = 7; } + +message AddDeployRequest { + string task = 1; + string deployment_name = 2; + string hostname = 3; + repeated int64 tensor_parallel_ports = 4; + int64 torch_dist_port = 5; + repeated int64 gpu_indices = 6; + +} diff --git a/mii/grpc_related/proto/modelresponse_pb2.py b/mii/grpc_related/proto/modelresponse_pb2.py index 515ebb80..1fc27665 100644 --- a/mii/grpc_related/proto/modelresponse_pb2.py +++ b/mii/grpc_related/proto/modelresponse_pb2.py @@ -14,7 +14,7 @@ from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x42\x0e\n\x0coneof_values\"\x1f\n\tSessionID\x12\x12\n\nsession_id\x18\x01 \x01(\t\"\xed\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\x85\x01\n\x11SingleStringReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x84\x01\n\x10MultiStringReply\x12\x10\n\x08response\x18\x01 \x03(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\tQARequest\x12\x10\n\x08question\x18\x01 \x01(\t\x12\x0f\n\x07\x63ontext\x18\x02 
\x01(\t\x12?\n\x0cquery_kwargs\x18\x03 \x03(\x0b\x32).modelresponse.QARequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xd3\x02\n\x13\x43onversationRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x1c\n\x0f\x63onversation_id\x18\x02 \x01(\x03H\x00\x88\x01\x01\x12\x18\n\x10past_user_inputs\x18\x03 \x03(\t\x12\x1b\n\x13generated_responses\x18\x04 \x03(\t\x12I\n\x0cquery_kwargs\x18\x05 \x03(\x0b\x32\x33.modelresponse.ConversationRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x01\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_conversation_idB\x12\n\x10_deployment_name\"\xc3\x01\n\x11\x43onversationReply\x12\x17\n\x0f\x63onversation_id\x18\x01 \x01(\x03\x12\x18\n\x10past_user_inputs\x18\x02 \x03(\t\x12\x1b\n\x13generated_responses\x18\x03 \x03(\t\x12\x12\n\ntime_taken\x18\x04 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x05 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xaf\x01\n\nImageReply\x12\x0e\n\x06images\x18\x01 \x03(\x0c\x12\x1d\n\x15nsfw_content_detected\x18\x02 \x03(\x08\x12\x0c\n\x04mode\x18\x03 \x01(\t\x12\x0e\n\x06size_w\x18\x04 \x01(\x03\x12\x0e\n\x06size_h\x18\x05 \x01(\x03\x12\x12\n\ntime_taken\x18\x06 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x07 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name2\xd4\x06\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x32Y\n\x14\x44\x65ploymentManagement\x12\x41\n\rAddDeployment\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x62\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x42\x0e\n\x0coneof_values\"\x1f\n\tSessionID\x12\x12\n\nsession_id\x18\x01 \x01(\t\"\xed\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 
\x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\x85\x01\n\x11SingleStringReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x84\x01\n\x10MultiStringReply\x12\x10\n\x08response\x18\x01 \x03(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\tQARequest\x12\x10\n\x08question\x18\x01 \x01(\t\x12\x0f\n\x07\x63ontext\x18\x02 \x01(\t\x12?\n\x0cquery_kwargs\x18\x03 \x03(\x0b\x32).modelresponse.QARequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xd3\x02\n\x13\x43onversationRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x1c\n\x0f\x63onversation_id\x18\x02 \x01(\x03H\x00\x88\x01\x01\x12\x18\n\x10past_user_inputs\x18\x03 \x03(\t\x12\x1b\n\x13generated_responses\x18\x04 \x03(\t\x12I\n\x0cquery_kwargs\x18\x05 \x03(\x0b\x32\x33.modelresponse.ConversationRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x01\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_conversation_idB\x12\n\x10_deployment_name\"\xc3\x01\n\x11\x43onversationReply\x12\x17\n\x0f\x63onversation_id\x18\x01 \x01(\x03\x12\x18\n\x10past_user_inputs\x18\x02 \x03(\t\x12\x1b\n\x13generated_responses\x18\x03 \x03(\t\x12\x12\n\ntime_taken\x18\x04 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x05 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xaf\x01\n\nImageReply\x12\x0e\n\x06images\x18\x01 \x03(\x0c\x12\x1d\n\x15nsfw_content_detected\x18\x02 \x03(\x08\x12\x0c\n\x04mode\x18\x03 \x01(\t\x12\x0e\n\x06size_w\x18\x04 \x01(\x03\x12\x0e\n\x06size_h\x18\x05 \x01(\x03\x12\x12\n\ntime_taken\x18\x06 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x07 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x98\x01\n\x10\x41\x64\x64\x44\x65ployRequest\x12\x0c\n\x04task\x18\x01 \x01(\t\x12\x17\n\x0f\x64\x65ployment_name\x18\x02 \x01(\t\x12\x10\n\x08hostname\x18\x03 \x01(\t\x12\x1d\n\x15tensor_parallel_ports\x18\x04 \x03(\x03\x12\x17\n\x0ftorch_dist_port\x18\x05 \x01(\x03\x12\x13\n\x0bgpu_indices\x18\x06 
\x03(\x03\x32\xd4\x06\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x32Y\n\x14\x44\x65ploymentManagement\x12\x41\n\rAddDeployment\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x62\x06proto3') _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) @@ -58,8 +58,10 @@ _globals['_CONVERSATIONREPLY']._serialized_end=1722 _globals['_IMAGEREPLY']._serialized_start=1725 _globals['_IMAGEREPLY']._serialized_end=1900 - _globals['_MODELRESPONSE']._serialized_start=1903 - _globals['_MODELRESPONSE']._serialized_end=2755 - _globals['_DEPLOYMENTMANAGEMENT']._serialized_start=2757 - _globals['_DEPLOYMENTMANAGEMENT']._serialized_end=2846 + _globals['_ADDDEPLOYREQUEST']._serialized_start=1903 + _globals['_ADDDEPLOYREQUEST']._serialized_end=2055 + _globals['_MODELRESPONSE']._serialized_start=2058 + _globals['_MODELRESPONSE']._serialized_end=2910 + _globals['_DEPLOYMENTMANAGEMENT']._serialized_start=2912 + _globals['_DEPLOYMENTMANAGEMENT']._serialized_end=3001 # @@protoc_insertion_point(module_scope) diff --git a/mii/models/score/generate.py b/mii/models/score/generate.py index dc73fdb9..50f0446f 100644 --- a/mii/models/score/generate.py +++ b/mii/models/score/generate.py @@ -27,6 +27,7 @@ def create_score_file(deployment_tag, mii.constants.MII_CONFIGS_KEY: deployment.mii_config.dict(), mii.constants.ENABLE_DEEPSPEED_ZERO_KEY: deployment.enable_zero, mii.constants.DEEPSPEED_CONFIG_KEY: deployment.ds_config, + mii.constants.DEPLOYED_KEY: deployment.deployed, } config_dict[deployment.deployment_name] = deployment_config diff --git a/mii/models/score/score_template.py b/mii/models/score/score_template.py index 83b46de3..84238511 100644 --- a/mii/models/score/score_template.py +++ b/mii/models/score/score_template.py @@ -18,7 +18,11 @@ def init(): model_path = mii.utils.full_model_path(configs[mii.constants.MODEL_PATH_KEY]) deployment_tag = configs[mii.constants.DEPLOYMENT_TAG_KEY] deployments = [] + lb_enabled = False for deployment in configs.values(): + if isinstance(deployment, dict) and deployment[mii.constants.DEPLOYED_KEY]: + lb_enabled = True + continue if not isinstance(deployment, dict): continue data = { @@ -42,11 +46,13 @@ def init(): assert task_name is not None, "The task name should be set before calling init" """ - mii.MIIServer(deployment_tag, + if len(deployments) > 0: + mii.MIIServer(deployment_tag, deployments, model_path, lb_config=configs.get(mii.constants.LOAD_BALANCER_CONFIG_KEY, - None)) + None), + lb_enabled=lb_enabled) global model 
model = None diff --git a/mii/server.py b/mii/server.py index ceaf2912..201c1a37 100644 --- a/mii/server.py +++ b/mii/server.py @@ -28,10 +28,10 @@ def config_to_b64_str(config): class MIIServer(): '''Initialize the model, setup the server for the model under model_path''' - def __init__(self, deployment_tag, deployments, model_path, lb_config=None): + def __init__(self, deployment_tag, deployments, model_path, lb_config=None, lb_enabled=False): #mii_configs = mii.config.MIIConfig(**mii_configs) - + self.lb_enabled = lb_enabled #self.task = mii.utils.get_task(task_name) self.deployments = deployments for deployment in deployments: @@ -48,7 +48,8 @@ def __init__(self, deployment_tag, deployments, model_path, lb_config=None): processes = self._initialize_service(deployment_tag, deployments, model_path, - lb_config) + lb_config, + name_map) self._wait_until_server_is_live(processes, lb_config.replica_configs) def _wait_until_server_is_live(self, processes, deployment): @@ -282,6 +283,8 @@ def _initialize_service(self, deployment_tag, deployments, model_path, lb_config for dep in deployments: if dep.deployment_name == name: deployment = dep + if deployment is None: + continue hostfile = tempfile.NamedTemporaryFile(delete=False) hostfile.write( f'{repl_config.hostname} slots={max(host_gpus[repl_config.hostname])+1}\n' @@ -306,7 +309,8 @@ def _initialize_service(self, deployment_tag, deployments, model_path, lb_config # we don't use deepspeed launcher for the load balancer because it does not need a GPU. # The deepspeed launcher determines the number of processes to launch based on GPUs available on the host or CUDA_VISIBLE_DEVICES, # and it is expected to assign one GPU to one process. - processes.append(self._launch_load_balancer(model_path, lb_config)) + if not self.lb_enabled: + processes.append(self._launch_load_balancer(model_path, lb_config)) for deployment in self.deployments: if deployment.mii_config.enable_restful_api: From 189e75ce41883b5b9b51b3c712b52c0397b36f75 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Fri, 21 Jul 2023 21:03:04 +0000 Subject: [PATCH 46/69] Partial deploy started --- mii/__init__.py | 5 - mii/client.py | 92 ++++++++++++------- mii/constants.py | 1 + mii/deployment.py | 27 +++--- mii/grpc_related/modelresponse_server.py | 14 ++- mii/grpc_related/proto/modelresponse.proto | 2 +- mii/grpc_related/proto/modelresponse_pb2.py | 4 +- .../proto/modelresponse_pb2_grpc.py | 6 +- mii/models/score/generate.py | 2 + mii/models/score/score_template.py | 2 + mii/server.py | 2 +- 11 files changed, 93 insertions(+), 64 deletions(-) diff --git a/mii/__init__.py b/mii/__init__.py index aba912f3..66748a56 100644 --- a/mii/__init__.py +++ b/mii/__init__.py @@ -15,11 +15,6 @@ __version__ = "0.0.0" non_persistent_models = {} -port_map = {} -deployment_tag = "" -lb_config = None -model_path = None -deployment_type = None try: from .version import __version__ except ImportError: diff --git a/mii/client.py b/mii/client.py index 14dc7273..9da14d4f 100644 --- a/mii/client.py +++ b/mii/client.py @@ -6,6 +6,7 @@ import grpc import requests import mii +import time from mii.utils import get_task from mii.grpc_related.proto import modelresponse_pb2, modelresponse_pb2_grpc from mii.constants import GRPC_MAX_MSG_SIZE, Tasks, DeploymentType @@ -18,11 +19,14 @@ def _get_deployment_configs(deployment_tag): deployments = [] configs = mii.utils.import_score_file(deployment_tag).configs for deployment in configs: - if not isinstance(configs[deployment], dict): + if not isinstance(configs[deployment], 
dict) or deployment == mii.constants.PORT_MAP_KEY: continue configs[deployment][mii.constants.DEPLOYED_KEY] = True deployments.append(configs[deployment]) - return deployments + lb_config = configs[mii.constants.LOAD_BALANCER_CONFIG_KEY] + model_path = configs[mii.constants.MODEL_PATH_KEY] + port_map = configs[mii.constants.PORT_MAP_KEY] + return deployments, lb_config, model_path, port_map def mii_query_handle(deployment_tag): @@ -42,12 +46,12 @@ def mii_query_handle(deployment_tag): inference_pipeline, task = mii.non_persistent_models[deployment_tag] return MIINonPersistentClient(task, deployment_tag) - deployments = _get_deployment_configs(deployment_tag) + deployments, lb_config, model_path, port_map = _get_deployment_configs(deployment_tag) if len(deployments) > 0: mii_configs_dict = deployments[0][mii.constants.MII_CONFIGS_KEY] mii_configs = mii.config.MIIConfig(**mii_configs_dict) - return MIIClient(deployments, "localhost", mii_configs.port_number) + return MIIClient(deployments, "localhost", mii_configs.port_number, lb_config, model_path, port_map, deployment_tag) def create_channel(host, port): @@ -62,12 +66,16 @@ class MIIClient(): """ Client to send queries to a single endpoint. """ - def __init__(self, deployments, host, port): + def __init__(self, deployments, host, port, lb_config=None, model_path=None, port_map=None, deployment_tag=None): self.asyncio_loop = asyncio.get_event_loop() channel = create_channel(host, port) self.stub = modelresponse_pb2_grpc.DeploymentManagementStub(channel) #self.task = get_task(task_name) self.deployments = deployments + self.lb_config = lb_config + self.model_path = model_path + self.port_map = port_map + self.deployment_tag = deployment_tag def _get_deployment_task(self, deployment_name=None): task = None @@ -131,8 +139,8 @@ def destroy_session(self, session_id, deployment_name=None): assert task == Tasks.TEXT_GENERATION, f"Session deletion only available for task '{Tasks.TEXT_GENERATION}'." 
self.asyncio_loop.run_until_complete(self.destroy_session_async(session_id)) - async def add_models_async(self, request=None): - await getattr(self.stub, "AddDeployment")(modelresponse_pb2.google_dot_protobuf_dot_empty__pb2.Empty()) + async def add_models_async(self, proto_request): + await getattr(self.stub, "AddDeployment")(proto_request) def add_models(self, task=None, @@ -161,39 +169,57 @@ def add_models(self, version=version, deployed=False) ] - - deployment_tag = mii.deployment_tag - lb_config = allocate_processes(deployments) - if mii.lb_config is not None: - mii.lb_config.replica_configs.extend(lb_config.replica_configs) + for deployment in deployments: + deployment.task = get_task(deployment.task) + lb_config = allocate_processes(deployments, self.port_map) + if self.lb_config is not None: + self.lb_config.replica_configs.extend(lb_config.replica_configs) else: - mii.lb_config = lb_config + self.lb_config = lb_config self.deployments.extend(deployments) - if mii.model_path is None and deployment_type == DeploymentType.LOCAL: - mii.model_path = MII_MODEL_PATH_DEFAULT - elif mii.model_path is None and deployment_type == DeploymentType.AML: + if self.model_path is None and deployment_type == DeploymentType.LOCAL: + self.model_path = mii.constants.MII_MODEL_PATH_DEFAULT + elif self.model_path is None and deployment_type == DeploymentType.AML: model_path = "model" deps = [] for deployment in self.deployments: - data = { - 'deployment_name': deployment[mii.constants.DEPLOYMENT_NAME_KEY], - 'task': deployment[mii.constants.TASK_NAME_KEY], - 'model': deployment[mii.constants.MODEL_NAME_KEY], - 'enable_deepspeed': deployment[mii.constants.ENABLE_DEEPSPEED_KEY], - 'enable_zero': deployment[mii.constants.ENABLE_DEEPSPEED_ZERO_KEY], - 'GPU_index_map': None, - 'mii_config': deployment[mii.constants.MII_CONFIGS_KEY], - 'ds_config': deployment[mii.constants.DEEPSPEED_CONFIG_KEY], - 'version': 1 - 'deployed' deployment[mii.constants.DEPLOYED_KEY] - } - - create_score_file(deployment_tag=deployment_tag, deployment_type=mii.deployment_type, deployments=self.deployments, model_path=mii.model_path, lb_config=mii.lb_config) - if mii.deployment_type == DeploymentType.Local: - mii.utils.import_score_file(deployment_tag).init() + if isinstance(deployment, dict): + + data = { + 'deployment_name': deployment[mii.constants.DEPLOYMENT_NAME_KEY], + 'task': deployment[mii.constants.TASK_NAME_KEY], + 'model': deployment[mii.constants.MODEL_NAME_KEY], + 'enable_deepspeed': deployment[mii.constants.ENABLE_DEEPSPEED_KEY], + 'enable_zero': deployment[mii.constants.ENABLE_DEEPSPEED_ZERO_KEY], + 'GPU_index_map': None, + 'mii_config': deployment[mii.constants.MII_CONFIGS_KEY], + 'ds_config': deployment[mii.constants.DEEPSPEED_CONFIG_KEY], + 'version': 1, + 'deployed': deployment[mii.constants.DEPLOYED_KEY] + } + deps.append(DeploymentConfig.parse_obj(data)) + else: + deps.append(deployment) + for deployment in deps: + if isinstance(deployment.task, str): + deployment.task = get_task(deployment.task) + print(deps) + time.sleep(5) + create_score_file(deployment_tag=self.deployment_tag, deployment_type=deployment_type, deployments=deps, model_path=self.model_path, port_map=self.port_map, lb_config=lb_config) + if deployment_type == DeploymentType.LOCAL: + mii.utils.import_score_file(self.deployment_tag).init() - self.asyncio_loop.run_until_complete(self.add_models_async()) + for replica in lb_config.replica_configs: + request_proto = modelresponse_pb2.AddDeployRequest(task=replica.task, + 
deployment_name=replica.deployment_name, + hostname=replica.hostname, + tensor_parallel_ports=replica.tensor_parallel_ports, + torch_dist_port=replica.torch_dist_port, + gpu_indices=replica.gpu_indices + ) + + self.asyncio_loop.run_until_complete(self.add_models_async(request_proto)) class MIITensorParallelClient(): """ Client to send queries to multiple endpoints in parallel. diff --git a/mii/constants.py b/mii/constants.py index baffdcf9..61c7c474 100644 --- a/mii/constants.py +++ b/mii/constants.py @@ -89,6 +89,7 @@ class ModelProvider(enum.Enum): TEXT2IMG_NAME: ["query"] } +PORT_MAP_KEY = 'port_map' MODEL_NAME_KEY = 'model_name' TASK_NAME_KEY = 'task_name' DEPLOYMENT_NAME_KEY = 'deployment_name' diff --git a/mii/deployment.py b/mii/deployment.py index ae539b10..0848e89d 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -68,7 +68,6 @@ def deploy(task=None, If deployment_type is `LOCAL`, returns just the name of the deployment that can be used to create a query handle using `mii.mii_query_handle(deployment_name)` """ - mii.deployment_type = deployment_type if not deployments: assert all((model, task, deployment_name)), "model, task, and deployment name must be set to deploy singular model" deployments = [ @@ -86,7 +85,6 @@ def deploy(task=None, deployment_tag = deployment_name else: assert deployment_tag is not None, "deployment_tag must be set to deploy multiple models" - mii.deployment_tag = deployment_tag # parse and validate mii config for deployment in deployments: mii_config = deployment.mii_config @@ -128,10 +126,10 @@ def deploy(task=None, elif model_path is None and deployment_type == DeploymentType.AML: model_path = "model" - mii.model_path = model_path # add fields for replica deployment replica_configs = [] port_offset = 1 + port_map = {} for deployment in deployments: mii_config = deployment.mii_config replica_pool = _allocate_processes(mii_config.hostfile, @@ -141,16 +139,16 @@ def deploy(task=None, for i, (hostname, gpu_indices) in enumerate(replica_pool): # Reserver port for a LB proxy when replication is enabled - if hostname not in mii.port_map: - mii.port_map[hostname] = set() + if hostname not in port_map: + port_map[hostname] = set() base_port = mii_config.port_number + i * mii_config.tensor_parallel + port_offset - if base_port in mii.port_map[hostname]: - base_port = max(mii.port_map[hostname]) + 1 + if base_port in port_map[hostname]: + base_port = max(port_map[hostname]) + 1 tensor_parallel_ports = list( range(base_port, base_port + mii_config.tensor_parallel)) for i in range(base_port, base_port + mii_config.tensor_parallel): - mii.port_map[hostname].add(i) + port_map[hostname].add(i) torch_dist_port = mii_config.torch_dist_port + i replica_configs.append( ReplicaConfig(task=get_task_name(deployment.task), @@ -167,6 +165,7 @@ def deploy(task=None, deployment_type=deployment_type, deployments=deployments, model_path=model_path, + port_map=port_map, lb_config=lb_config) if deployment_type == DeploymentType.AML: @@ -188,7 +187,7 @@ def deploy(task=None, else: raise Exception(f"Unknown deployment type: {deployment_type}") -def allocate_processes(deployments): +def allocate_processes(deployments, port_map): replica_configs = [] port_offset = 1 for deployment in deployments: @@ -200,16 +199,16 @@ def allocate_processes(deployments): for i, (hostname, gpu_indices) in enumerate(replica_pool): # Reserver port for a LB proxy when replication is enabled - if hostname not in mii.port_map: - mii.port_map[hostname] = set() + if hostname not in port_map: + 
port_map[hostname] = set() base_port = mii_config.port_number + i * mii_config.tensor_parallel + port_offset - if base_port in mii.port_map[hostname]: - base_port = max(mii.port_map[hostname]) + 1 + if base_port in port_map[hostname]: + base_port = max(port_map[hostname]) + 1 tensor_parallel_ports = list( range(base_port, base_port + mii_config.tensor_parallel)) for i in range(base_port, base_port + mii_config.tensor_parallel): - mii.port_map[hostname].add(i) + port_map[hostname].add(i) torch_dist_port = mii_config.torch_dist_port + i replica_configs.append( ReplicaConfig(task=get_task_name(deployment.task), diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index ab55ae32..97c6c3cb 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -34,7 +34,7 @@ def get_stop_event(self): class DeploymentManagement(ServiceBase, modelresponse_pb2_grpc.DeploymentManagementServicer): def AddDeployment(self, request, context): - print("TESTING ADD DEPLOYMENT") + print("DEPLOYMENT ADDED") return google_dot_protobuf_dot_empty__pb2.Empty() @@ -203,7 +203,6 @@ def choose_stub(self, call_count): def intercept_service(self, continuation, handler_call_details): next_handler = continuation(handler_call_details) - print(next_handler) assert next_handler.unary_unary is not None #USE KWARGS LIKE THEY ARE USED TO MAKE SESSIONS TO GET THE DEPLOYMENT NAME TO HASH THE COUNTERS/STUBS @@ -211,9 +210,14 @@ def intercept_service(self, continuation, handler_call_details): def invoke_intercept_method(request_proto, context): method_name = _get_grpc_method_name(handler_call_details.method) if method_name == ADD_DEPLOYMENT_METHOD: - for name in self.stubs: - for stub in self.stubs[name]: - stub.invoke(ADD_DEPLOYMENT_METHOD, request_proto) + print(f"REQUEST PROTO -> {request_proto}") + task = str(getattr(request_proto, "task")) + deployment_name = str(getattr(request_proto, "deployment_name")) + hostname = str(getattr(request_proto, "hostname")) + tensor_parallel_ports = list(getattr(request_proto, "tensor_parallel_ports")) + torch_dist_port = int(getattr(request_proto, "torch_dist_port")) + gpu_indices = list(getattr(request_proto, "gpu_indices")) + print(type(gpu_indices[0])) return google_dot_protobuf_dot_empty__pb2.Empty() if method_name == TERMINATE_METHOD: diff --git a/mii/grpc_related/proto/modelresponse.proto b/mii/grpc_related/proto/modelresponse.proto index 7e4d3520..36d8c0e9 100644 --- a/mii/grpc_related/proto/modelresponse.proto +++ b/mii/grpc_related/proto/modelresponse.proto @@ -37,7 +37,7 @@ service ModelResponse { } service DeploymentManagement { - rpc AddDeployment(google.protobuf.Empty) returns (google.protobuf.Empty) {} + rpc AddDeployment(AddDeployRequest) returns (google.protobuf.Empty) {} } message Value { diff --git a/mii/grpc_related/proto/modelresponse_pb2.py b/mii/grpc_related/proto/modelresponse_pb2.py index 1fc27665..452de039 100644 --- a/mii/grpc_related/proto/modelresponse_pb2.py +++ b/mii/grpc_related/proto/modelresponse_pb2.py @@ -14,7 +14,7 @@ from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 
\x01(\x08H\x00\x42\x0e\n\x0coneof_values\"\x1f\n\tSessionID\x12\x12\n\nsession_id\x18\x01 \x01(\t\"\xed\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\x85\x01\n\x11SingleStringReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x84\x01\n\x10MultiStringReply\x12\x10\n\x08response\x18\x01 \x03(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\tQARequest\x12\x10\n\x08question\x18\x01 \x01(\t\x12\x0f\n\x07\x63ontext\x18\x02 \x01(\t\x12?\n\x0cquery_kwargs\x18\x03 \x03(\x0b\x32).modelresponse.QARequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xd3\x02\n\x13\x43onversationRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x1c\n\x0f\x63onversation_id\x18\x02 \x01(\x03H\x00\x88\x01\x01\x12\x18\n\x10past_user_inputs\x18\x03 \x03(\t\x12\x1b\n\x13generated_responses\x18\x04 \x03(\t\x12I\n\x0cquery_kwargs\x18\x05 \x03(\x0b\x32\x33.modelresponse.ConversationRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x01\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_conversation_idB\x12\n\x10_deployment_name\"\xc3\x01\n\x11\x43onversationReply\x12\x17\n\x0f\x63onversation_id\x18\x01 \x01(\x03\x12\x18\n\x10past_user_inputs\x18\x02 \x03(\t\x12\x1b\n\x13generated_responses\x18\x03 \x03(\t\x12\x12\n\ntime_taken\x18\x04 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x05 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xaf\x01\n\nImageReply\x12\x0e\n\x06images\x18\x01 \x03(\x0c\x12\x1d\n\x15nsfw_content_detected\x18\x02 \x03(\x08\x12\x0c\n\x04mode\x18\x03 \x01(\t\x12\x0e\n\x06size_w\x18\x04 \x01(\x03\x12\x0e\n\x06size_h\x18\x05 \x01(\x03\x12\x12\n\ntime_taken\x18\x06 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x07 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x98\x01\n\x10\x41\x64\x64\x44\x65ployRequest\x12\x0c\n\x04task\x18\x01 \x01(\t\x12\x17\n\x0f\x64\x65ployment_name\x18\x02 \x01(\t\x12\x10\n\x08hostname\x18\x03 \x01(\t\x12\x1d\n\x15tensor_parallel_ports\x18\x04 \x03(\x03\x12\x17\n\x0ftorch_dist_port\x18\x05 \x01(\x03\x12\x13\n\x0bgpu_indices\x18\x06 
\x03(\x03\x32\xd4\x06\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x32Y\n\x14\x44\x65ploymentManagement\x12\x41\n\rAddDeployment\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x62\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x42\x0e\n\x0coneof_values\"\x1f\n\tSessionID\x12\x12\n\nsession_id\x18\x01 \x01(\t\"\xed\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\x85\x01\n\x11SingleStringReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x84\x01\n\x10MultiStringReply\x12\x10\n\x08response\x18\x01 \x03(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\tQARequest\x12\x10\n\x08question\x18\x01 \x01(\t\x12\x0f\n\x07\x63ontext\x18\x02 \x01(\t\x12?\n\x0cquery_kwargs\x18\x03 \x03(\x0b\x32).modelresponse.QARequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xd3\x02\n\x13\x43onversationRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x1c\n\x0f\x63onversation_id\x18\x02 
\x01(\x03H\x00\x88\x01\x01\x12\x18\n\x10past_user_inputs\x18\x03 \x03(\t\x12\x1b\n\x13generated_responses\x18\x04 \x03(\t\x12I\n\x0cquery_kwargs\x18\x05 \x03(\x0b\x32\x33.modelresponse.ConversationRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x01\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_conversation_idB\x12\n\x10_deployment_name\"\xc3\x01\n\x11\x43onversationReply\x12\x17\n\x0f\x63onversation_id\x18\x01 \x01(\x03\x12\x18\n\x10past_user_inputs\x18\x02 \x03(\t\x12\x1b\n\x13generated_responses\x18\x03 \x03(\t\x12\x12\n\ntime_taken\x18\x04 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x05 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xaf\x01\n\nImageReply\x12\x0e\n\x06images\x18\x01 \x03(\x0c\x12\x1d\n\x15nsfw_content_detected\x18\x02 \x03(\x08\x12\x0c\n\x04mode\x18\x03 \x01(\t\x12\x0e\n\x06size_w\x18\x04 \x01(\x03\x12\x0e\n\x06size_h\x18\x05 \x01(\x03\x12\x12\n\ntime_taken\x18\x06 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x07 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x98\x01\n\x10\x41\x64\x64\x44\x65ployRequest\x12\x0c\n\x04task\x18\x01 \x01(\t\x12\x17\n\x0f\x64\x65ployment_name\x18\x02 \x01(\t\x12\x10\n\x08hostname\x18\x03 \x01(\t\x12\x1d\n\x15tensor_parallel_ports\x18\x04 \x03(\x03\x12\x17\n\x0ftorch_dist_port\x18\x05 \x01(\x03\x12\x13\n\x0bgpu_indices\x18\x06 \x03(\x03\x32\xd4\x06\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x32\x62\n\x14\x44\x65ploymentManagement\x12J\n\rAddDeployment\x12\x1f.modelresponse.AddDeployRequest\x1a\x16.google.protobuf.Empty\"\x00\x62\x06proto3') _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) @@ -63,5 +63,5 @@ _globals['_MODELRESPONSE']._serialized_start=2058 _globals['_MODELRESPONSE']._serialized_end=2910 _globals['_DEPLOYMENTMANAGEMENT']._serialized_start=2912 - _globals['_DEPLOYMENTMANAGEMENT']._serialized_end=3001 + _globals['_DEPLOYMENTMANAGEMENT']._serialized_end=3010 # @@protoc_insertion_point(module_scope) diff --git a/mii/grpc_related/proto/modelresponse_pb2_grpc.py b/mii/grpc_related/proto/modelresponse_pb2_grpc.py index 438fa0c2..dc91fcfc 100644 --- a/mii/grpc_related/proto/modelresponse_pb2_grpc.py +++ b/mii/grpc_related/proto/modelresponse_pb2_grpc.py @@ -375,7 +375,7 @@ def __init__(self, channel): """ self.AddDeployment = channel.unary_unary( 
'/modelresponse.DeploymentManagement/AddDeployment', - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + request_serializer=modelresponse__pb2.AddDeployRequest.SerializeToString, response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, ) @@ -394,7 +394,7 @@ def add_DeploymentManagementServicer_to_server(servicer, server): rpc_method_handlers = { 'AddDeployment': grpc.unary_unary_rpc_method_handler( servicer.AddDeployment, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + request_deserializer=modelresponse__pb2.AddDeployRequest.FromString, response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, ), } @@ -419,7 +419,7 @@ def AddDeployment(request, timeout=None, metadata=None): return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/AddDeployment', - google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + modelresponse__pb2.AddDeployRequest.SerializeToString, google_dot_protobuf_dot_empty__pb2.Empty.FromString, options, channel_credentials, insecure, call_credentials, compression, wait_for_ready, timeout, metadata) diff --git a/mii/models/score/generate.py b/mii/models/score/generate.py index 50f0446f..d807ab45 100644 --- a/mii/models/score/generate.py +++ b/mii/models/score/generate.py @@ -13,11 +13,13 @@ def create_score_file(deployment_tag, deployment_type, deployments, model_path, + port_map, lb_config): config_dict = {} config_dict[mii.constants.MODEL_PATH_KEY] = model_path config_dict[mii.constants.DEPLOYMENT_TAG_KEY] = deployment_tag + config_dict[mii.constants.PORT_MAP_KEY] = port_map for deployment in deployments: deployment_config = { mii.constants.DEPLOYMENT_NAME_KEY: deployment.deployment_name, diff --git a/mii/models/score/score_template.py b/mii/models/score/score_template.py index 84238511..fd6a7f0f 100644 --- a/mii/models/score/score_template.py +++ b/mii/models/score/score_template.py @@ -19,9 +19,11 @@ def init(): deployment_tag = configs[mii.constants.DEPLOYMENT_TAG_KEY] deployments = [] lb_enabled = False + del configs[mii.constants.PORT_MAP_KEY] for deployment in configs.values(): if isinstance(deployment, dict) and deployment[mii.constants.DEPLOYED_KEY]: lb_enabled = True + print(deployment) continue if not isinstance(deployment, dict): continue diff --git a/mii/server.py b/mii/server.py index 201c1a37..f29e3a28 100644 --- a/mii/server.py +++ b/mii/server.py @@ -49,7 +49,7 @@ def __init__(self, deployment_tag, deployments, model_path, lb_config=None, lb_e deployments, model_path, lb_config, - name_map) + ) self._wait_until_server_is_live(processes, lb_config.replica_configs) def _wait_until_server_is_live(self, processes, deployment): From adee843f3430c5ab02e9e3e7f4193cf91757aa90 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Mon, 24 Jul 2023 20:38:21 +0000 Subject: [PATCH 47/69] fixing add deploy api queries --- mii/client.py | 60 ++-- mii/deployment.py | 2 +- mii/grpc_related/modelresponse_server.py | 68 ++-- mii/grpc_related/proto/modelresponse.proto | 9 + mii/grpc_related/proto/modelresponse_pb2.py | 6 +- .../proto/modelresponse_pb2_grpc.py | 297 ++++++++++++++++++ 6 files changed, 356 insertions(+), 86 deletions(-) diff --git a/mii/client.py b/mii/client.py index 9da14d4f..9facd25c 100644 --- a/mii/client.py +++ b/mii/client.py @@ -6,7 +6,6 @@ import grpc import requests import mii -import time from mii.utils import get_task from mii.grpc_related.proto import modelresponse_pb2, modelresponse_pb2_grpc from mii.constants import 
GRPC_MAX_MSG_SIZE, Tasks, DeploymentType @@ -22,7 +21,19 @@ def _get_deployment_configs(deployment_tag): if not isinstance(configs[deployment], dict) or deployment == mii.constants.PORT_MAP_KEY: continue configs[deployment][mii.constants.DEPLOYED_KEY] = True - deployments.append(configs[deployment]) + data = { + 'deployment_name':configs[deployment][mii.constants.DEPLOYMENT_NAME_KEY], + 'task': configs[deployment][mii.constants.TASK_NAME_KEY], + 'model': configs[deployment][mii.constants.MODEL_NAME_KEY], + 'enable_deepspeed': configs[deployment][mii.constants.ENABLE_DEEPSPEED_KEY], + 'enable_zero': configs[deployment][mii.constants.ENABLE_DEEPSPEED_ZERO_KEY], + 'GPU_index_map': None, + 'mii_config': configs[deployment][mii.constants.MII_CONFIGS_KEY], + 'ds_config': configs[deployment][mii.constants.DEEPSPEED_CONFIG_KEY], + 'version': 1, + 'deployed': configs[deployment][mii.constants.DEPLOYED_KEY] + } + deployments.append(DeploymentConfig.parse_obj(data)) lb_config = configs[mii.constants.LOAD_BALANCER_CONFIG_KEY] model_path = configs[mii.constants.MODEL_PATH_KEY] port_map = configs[mii.constants.PORT_MAP_KEY] @@ -48,10 +59,10 @@ def mii_query_handle(deployment_tag): deployments, lb_config, model_path, port_map = _get_deployment_configs(deployment_tag) if len(deployments) > 0: - mii_configs_dict = deployments[0][mii.constants.MII_CONFIGS_KEY] - mii_configs = mii.config.MIIConfig(**mii_configs_dict) + mii_configs_dict = deployments[0].mii_config + #mii_configs = mii.config.MIIConfig(**mii_configs_dict) - return MIIClient(deployments, "localhost", mii_configs.port_number, lb_config, model_path, port_map, deployment_tag) + return MIIClient(deployments, "localhost", mii_configs_dict.port_number, lb_config, model_path, port_map, deployment_tag) def create_channel(host, port): @@ -70,7 +81,6 @@ def __init__(self, deployments, host, port, lb_config=None, model_path=None, por self.asyncio_loop = asyncio.get_event_loop() channel = create_channel(host, port) self.stub = modelresponse_pb2_grpc.DeploymentManagementStub(channel) - #self.task = get_task(task_name) self.deployments = deployments self.lb_config = lb_config self.model_path = model_path @@ -81,12 +91,13 @@ def _get_deployment_task(self, deployment_name=None): task = None if deployment_name is None: #mii.terminate() or single model assert len(self.deployments) == 1, "Must pass deployment_name to query when using multiple deployments" - deployment_name = self.deployments[0][mii.constants.DEPLOYMENT_NAME_KEY] - task = get_task(self.deployments[0][mii.constants.TASK_NAME_KEY]) + deployment_name = self.deployments[0].deployment_name + task = get_task(self.deployments[0].task) if isinstance(deployment.task, str) else self.deployments[0].task else: for deployment in self.deployments: - if deployment[mii.constants.DEPLOYMENT_NAME_KEY] == deployment_name: - task = get_task(deployment[mii.constants.TASK_NAME_KEY]) + print(deployment.deployment_name) + if deployment.deployment_name == deployment_name: + task = get_task(deployment.task) if isinstance(deployment.task, str) else deployment.task return deployment_name, task assert False, f"{deployment_name} not found in list of deployments" return deployment_name, task @@ -155,6 +166,7 @@ def add_models(self, deployment_type=DeploymentType.LOCAL, model_path=None, version=1): + if not deployments: assert all((model, task, deployment_name)), "model, task, and deployment name must be set to deploy singular model" deployments = [ @@ -172,7 +184,8 @@ def add_models(self, for deployment in deployments: 
deployment.task = get_task(deployment.task) - lb_config = allocate_processes(deployments, self.port_map) + lb_config, self.port_map = allocate_processes(deployments, self.port_map) + if self.lb_config is not None: self.lb_config.replica_configs.extend(lb_config.replica_configs) else: @@ -182,34 +195,13 @@ def add_models(self, self.model_path = mii.constants.MII_MODEL_PATH_DEFAULT elif self.model_path is None and deployment_type == DeploymentType.AML: model_path = "model" - deps = [] for deployment in self.deployments: - if isinstance(deployment, dict): - - data = { - 'deployment_name': deployment[mii.constants.DEPLOYMENT_NAME_KEY], - 'task': deployment[mii.constants.TASK_NAME_KEY], - 'model': deployment[mii.constants.MODEL_NAME_KEY], - 'enable_deepspeed': deployment[mii.constants.ENABLE_DEEPSPEED_KEY], - 'enable_zero': deployment[mii.constants.ENABLE_DEEPSPEED_ZERO_KEY], - 'GPU_index_map': None, - 'mii_config': deployment[mii.constants.MII_CONFIGS_KEY], - 'ds_config': deployment[mii.constants.DEEPSPEED_CONFIG_KEY], - 'version': 1, - 'deployed': deployment[mii.constants.DEPLOYED_KEY] - } - deps.append(DeploymentConfig.parse_obj(data)) - else: - deps.append(deployment) - for deployment in deps: if isinstance(deployment.task, str): deployment.task = get_task(deployment.task) - print(deps) - time.sleep(5) - create_score_file(deployment_tag=self.deployment_tag, deployment_type=deployment_type, deployments=deps, model_path=self.model_path, port_map=self.port_map, lb_config=lb_config) + create_score_file(deployment_tag=self.deployment_tag, deployment_type=deployment_type, deployments=self.deployments, model_path=self.model_path, port_map=self.port_map, lb_config=lb_config) if deployment_type == DeploymentType.LOCAL: mii.utils.import_score_file(self.deployment_tag).init() - + for replica in lb_config.replica_configs: request_proto = modelresponse_pb2.AddDeployRequest(task=replica.task, deployment_name=replica.deployment_name, diff --git a/mii/deployment.py b/mii/deployment.py index 0848e89d..20332fa4 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -219,7 +219,7 @@ def allocate_processes(deployments, port_map): gpu_indices=gpu_indices)) lb_config = LoadBalancerConfig(port=mii_config.port_number, replica_configs=replica_configs) - return lb_config + return lb_config, port_map def _deploy_local(deployment_tag, model_path): mii.utils.import_score_file(deployment_tag).init() diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index 97c6c3cb..aa000e53 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -142,15 +142,16 @@ class ParallelStubInvoker: This class aims to call gRPC methods without conversions between proto and python object. TensorParallelClient can be used for invocation with the conversions. 
""" - def __init__(self, host, ports): + def __init__(self, host, ports, asyncio_loop): # Assumption: target services are all on the same host self.stubs = [] for port in ports: + asyncio.set_event_loop(asyncio_loop) channel = create_channel(host, port) - stub = modelresponse_pb2_grpc.DeploymentManagementStub(channel) + stub = modelresponse_pb2_grpc.ModelResponseStub(channel) self.stubs.append(stub) - self.asyncio_loop = asyncio.get_event_loop() + self.asyncio_loop = asyncio_loop async def _invoke_async(self, method_name, proto_request): responses = [] @@ -184,12 +185,8 @@ def __init__(self, replica_configs): for repl in replica_configs: self.stubs[repl.deployment_name].append( ParallelStubInvoker(repl.hostname, - repl.tensor_parallel_ports)) - """ - self.counter = AtomicCounter() - self.task = get_task(task_name) - self.replica_sessions = {} - """ + repl.tensor_parallel_ports, + self.asyncio_loop)) # Start the asyncio loop in a separate thread def run_asyncio_loop(loop): @@ -210,14 +207,17 @@ def intercept_service(self, continuation, handler_call_details): def invoke_intercept_method(request_proto, context): method_name = _get_grpc_method_name(handler_call_details.method) if method_name == ADD_DEPLOYMENT_METHOD: - print(f"REQUEST PROTO -> {request_proto}") task = str(getattr(request_proto, "task")) deployment_name = str(getattr(request_proto, "deployment_name")) hostname = str(getattr(request_proto, "hostname")) tensor_parallel_ports = list(getattr(request_proto, "tensor_parallel_ports")) torch_dist_port = int(getattr(request_proto, "torch_dist_port")) gpu_indices = list(getattr(request_proto, "gpu_indices")) - print(type(gpu_indices[0])) + if deployment_name not in self.stubs: + self.stubs[deployment_name] = [] + self.counter[deployment_name] = AtomicCounter() + self.tasks[deployment_name] = task + self.stubs[deployment_name].append(ParallelStubInvoker(hostname, tensor_parallel_ports, self.asyncio_loop)) return google_dot_protobuf_dot_empty__pb2.Empty() if method_name == TERMINATE_METHOD: @@ -227,40 +227,8 @@ def invoke_intercept_method(request_proto, context): google_dot_protobuf_dot_empty__pb2.Empty()) self.asyncio_loop.call_soon_threadsafe(self.asyncio_loop.stop) return next_handler.unary_unary(request_proto, context) + deployment_name = getattr(request_proto, 'deployment_name') - """ - kwargs = unpack_proto_query_kwargs(request_proto.query_kwargs) - if method_name != TERMINATE_METHOD: - assert "deployment_name" in kwargs, "Must include deployment_name in kwargs for query" - deployment_name = kwargs.get('deployment_name') - kwargs.pop('deployment_name', None) - task = self.tasks[deployment_name] - assert task is not None, f"task for {deployment_name} not found" - method = GRPC_METHOD_TABLE[get_task(task)] - new_request = None - if method_name == "ConversationalReply": - request_dict = {} - request_dict['text'] = str(request_proto.text) - val = getattr(request_proto, 'conversation_id') - request_dict['conversation_id'] = int(val) if val is not None else None - request_dict['past_user_inputs'] = list(request_proto.past_user_inputs) - request_dict['generated_responses'] = list( - request_proto.generated_responses) - new_request = method.pack_request_to_proto(request_dict, **kwargs) - - elif method_name == "QuestionAndAnswerReply": - request_dict = {} - request_dict['question'] = str(request_proto.question) - request_dict['context'] = str(request_proto.context) - new_request = method.pack_request_to_proto(request_dict, **kwargs) - else: - request_dict = {} - request_dict["query"] = list( 
- request_proto.request - ) if method_name == "GeneratorReply" or method_name == "Txt2ImgReply" else str( - request_proto.request) - new_request = method.pack_request_to_proto(request_dict, **kwargs) - """ call_count = self.counter[deployment_name].get_and_increment() replica_index = call_count % len(self.stubs[deployment_name]) @@ -298,7 +266,7 @@ def invoke_intercept_method(request_proto, context): response_serializer=next_handler.response_serializer) -def _do_serve(service_impl, port, interceptors=[]): +def _do_serve(service_impl, port, interceptors=[], is_lb=False): stop_event = service_impl.get_stop_event() server = grpc.server(futures.ThreadPoolExecutor(max_workers=LB_MAX_WORKER_THREADS), interceptors=interceptors, @@ -306,7 +274,10 @@ def _do_serve(service_impl, port, interceptors=[]): GRPC_MAX_MSG_SIZE), ('grpc.max_receive_message_length', GRPC_MAX_MSG_SIZE)]) - modelresponse_pb2_grpc.add_DeploymentManagementServicer_to_server(service_impl, server) + if is_lb: + modelresponse_pb2_grpc.add_DeploymentManagementServicer_to_server(service_impl, server) + else: + modelresponse_pb2_grpc.add_ModelResponseServicer_to_server(service_impl, server) server.add_insecure_port(f'[::]:{port}') print(f"About to start server") server.start() @@ -316,13 +287,14 @@ def _do_serve(service_impl, port, interceptors=[]): def serve_inference(inference_pipeline, port): - _do_serve(DeploymentManagement(), port) + _do_serve(ModelResponse(inference_pipeline), port) def serve_load_balancing(lb_config): _do_serve(DeploymentManagement(), lb_config.port, - [LoadBalancingInterceptor(lb_config.replica_configs)]) + [LoadBalancingInterceptor(lb_config.replica_configs)], + True) if __name__ == '__main__': diff --git a/mii/grpc_related/proto/modelresponse.proto b/mii/grpc_related/proto/modelresponse.proto index 36d8c0e9..757fa0da 100644 --- a/mii/grpc_related/proto/modelresponse.proto +++ b/mii/grpc_related/proto/modelresponse.proto @@ -38,6 +38,15 @@ service ModelResponse { service DeploymentManagement { rpc AddDeployment(AddDeployRequest) returns (google.protobuf.Empty) {} + rpc CreateSession (SessionID) returns (google.protobuf.Empty) {} + rpc DestroySession (SessionID) returns (google.protobuf.Empty) {} + rpc GeneratorReply (MultiStringRequest) returns (MultiStringReply) {} + rpc ClassificationReply (SingleStringRequest) returns (SingleStringReply) {} + rpc QuestionAndAnswerReply(QARequest) returns (SingleStringReply) {} + rpc FillMaskReply(SingleStringRequest) returns (SingleStringReply) {} + rpc TokenClassificationReply(SingleStringRequest) returns (SingleStringReply) {} + rpc ConversationalReply(ConversationRequest) returns (ConversationReply) {} + rpc Txt2ImgReply(MultiStringRequest) returns (ImageReply) {} } message Value { diff --git a/mii/grpc_related/proto/modelresponse_pb2.py b/mii/grpc_related/proto/modelresponse_pb2.py index 452de039..073083b4 100644 --- a/mii/grpc_related/proto/modelresponse_pb2.py +++ b/mii/grpc_related/proto/modelresponse_pb2.py @@ -14,7 +14,7 @@ from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x42\x0e\n\x0coneof_values\"\x1f\n\tSessionID\x12\x12\n\nsession_id\x18\x01 
\x01(\t\"\xed\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\x85\x01\n\x11SingleStringReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x84\x01\n\x10MultiStringReply\x12\x10\n\x08response\x18\x01 \x03(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\tQARequest\x12\x10\n\x08question\x18\x01 \x01(\t\x12\x0f\n\x07\x63ontext\x18\x02 \x01(\t\x12?\n\x0cquery_kwargs\x18\x03 \x03(\x0b\x32).modelresponse.QARequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xd3\x02\n\x13\x43onversationRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x1c\n\x0f\x63onversation_id\x18\x02 \x01(\x03H\x00\x88\x01\x01\x12\x18\n\x10past_user_inputs\x18\x03 \x03(\t\x12\x1b\n\x13generated_responses\x18\x04 \x03(\t\x12I\n\x0cquery_kwargs\x18\x05 \x03(\x0b\x32\x33.modelresponse.ConversationRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x01\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_conversation_idB\x12\n\x10_deployment_name\"\xc3\x01\n\x11\x43onversationReply\x12\x17\n\x0f\x63onversation_id\x18\x01 \x01(\x03\x12\x18\n\x10past_user_inputs\x18\x02 \x03(\t\x12\x1b\n\x13generated_responses\x18\x03 \x03(\t\x12\x12\n\ntime_taken\x18\x04 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x05 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xaf\x01\n\nImageReply\x12\x0e\n\x06images\x18\x01 \x03(\x0c\x12\x1d\n\x15nsfw_content_detected\x18\x02 \x03(\x08\x12\x0c\n\x04mode\x18\x03 \x01(\t\x12\x0e\n\x06size_w\x18\x04 \x01(\x03\x12\x0e\n\x06size_h\x18\x05 \x01(\x03\x12\x12\n\ntime_taken\x18\x06 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x07 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x98\x01\n\x10\x41\x64\x64\x44\x65ployRequest\x12\x0c\n\x04task\x18\x01 \x01(\t\x12\x17\n\x0f\x64\x65ployment_name\x18\x02 \x01(\t\x12\x10\n\x08hostname\x18\x03 \x01(\t\x12\x1d\n\x15tensor_parallel_ports\x18\x04 \x03(\x03\x12\x17\n\x0ftorch_dist_port\x18\x05 \x01(\x03\x12\x13\n\x0bgpu_indices\x18\x06 
\x03(\x03\x32\xd4\x06\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x32\x62\n\x14\x44\x65ploymentManagement\x12J\n\rAddDeployment\x12\x1f.modelresponse.AddDeployRequest\x1a\x16.google.protobuf.Empty\"\x00\x62\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x42\x0e\n\x0coneof_values\"\x1f\n\tSessionID\x12\x12\n\nsession_id\x18\x01 \x01(\t\"\xed\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\x85\x01\n\x11SingleStringReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x84\x01\n\x10MultiStringReply\x12\x10\n\x08response\x18\x01 \x03(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\tQARequest\x12\x10\n\x08question\x18\x01 \x01(\t\x12\x0f\n\x07\x63ontext\x18\x02 \x01(\t\x12?\n\x0cquery_kwargs\x18\x03 \x03(\x0b\x32).modelresponse.QARequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xd3\x02\n\x13\x43onversationRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x1c\n\x0f\x63onversation_id\x18\x02 
\x01(\x03H\x00\x88\x01\x01\x12\x18\n\x10past_user_inputs\x18\x03 \x03(\t\x12\x1b\n\x13generated_responses\x18\x04 \x03(\t\x12I\n\x0cquery_kwargs\x18\x05 \x03(\x0b\x32\x33.modelresponse.ConversationRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x01\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_conversation_idB\x12\n\x10_deployment_name\"\xc3\x01\n\x11\x43onversationReply\x12\x17\n\x0f\x63onversation_id\x18\x01 \x01(\x03\x12\x18\n\x10past_user_inputs\x18\x02 \x03(\t\x12\x1b\n\x13generated_responses\x18\x03 \x03(\t\x12\x12\n\ntime_taken\x18\x04 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x05 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xaf\x01\n\nImageReply\x12\x0e\n\x06images\x18\x01 \x03(\x0c\x12\x1d\n\x15nsfw_content_detected\x18\x02 \x03(\x08\x12\x0c\n\x04mode\x18\x03 \x01(\t\x12\x0e\n\x06size_w\x18\x04 \x01(\x03\x12\x0e\n\x06size_h\x18\x05 \x01(\x03\x12\x12\n\ntime_taken\x18\x06 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x07 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x98\x01\n\x10\x41\x64\x64\x44\x65ployRequest\x12\x0c\n\x04task\x18\x01 \x01(\t\x12\x17\n\x0f\x64\x65ployment_name\x18\x02 \x01(\t\x12\x10\n\x08hostname\x18\x03 \x01(\t\x12\x1d\n\x15tensor_parallel_ports\x18\x04 \x03(\x03\x12\x17\n\x0ftorch_dist_port\x18\x05 \x01(\x03\x12\x13\n\x0bgpu_indices\x18\x06 \x03(\x03\x32\xd4\x06\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x32\xe8\x06\n\x14\x44\x65ploymentManagement\x12J\n\rAddDeployment\x12\x1f.modelresponse.AddDeployRequest\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a 
.modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x62\x06proto3') _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) @@ -62,6 +62,6 @@ _globals['_ADDDEPLOYREQUEST']._serialized_end=2055 _globals['_MODELRESPONSE']._serialized_start=2058 _globals['_MODELRESPONSE']._serialized_end=2910 - _globals['_DEPLOYMENTMANAGEMENT']._serialized_start=2912 - _globals['_DEPLOYMENTMANAGEMENT']._serialized_end=3010 + _globals['_DEPLOYMENTMANAGEMENT']._serialized_start=2913 + _globals['_DEPLOYMENTMANAGEMENT']._serialized_end=3785 # @@protoc_insertion_point(module_scope) diff --git a/mii/grpc_related/proto/modelresponse_pb2_grpc.py b/mii/grpc_related/proto/modelresponse_pb2_grpc.py index dc91fcfc..d93b85d7 100644 --- a/mii/grpc_related/proto/modelresponse_pb2_grpc.py +++ b/mii/grpc_related/proto/modelresponse_pb2_grpc.py @@ -378,6 +378,51 @@ def __init__(self, channel): request_serializer=modelresponse__pb2.AddDeployRequest.SerializeToString, response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, ) + self.CreateSession = channel.unary_unary( + '/modelresponse.DeploymentManagement/CreateSession', + request_serializer=modelresponse__pb2.SessionID.SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) + self.DestroySession = channel.unary_unary( + '/modelresponse.DeploymentManagement/DestroySession', + request_serializer=modelresponse__pb2.SessionID.SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) + self.GeneratorReply = channel.unary_unary( + '/modelresponse.DeploymentManagement/GeneratorReply', + request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.MultiStringReply.FromString, + ) + self.ClassificationReply = channel.unary_unary( + '/modelresponse.DeploymentManagement/ClassificationReply', + request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) + self.QuestionAndAnswerReply = channel.unary_unary( + '/modelresponse.DeploymentManagement/QuestionAndAnswerReply', + request_serializer=modelresponse__pb2.QARequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) + self.FillMaskReply = channel.unary_unary( + '/modelresponse.DeploymentManagement/FillMaskReply', + request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) + self.TokenClassificationReply = channel.unary_unary( + '/modelresponse.DeploymentManagement/TokenClassificationReply', + request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) + self.ConversationalReply = channel.unary_unary( + '/modelresponse.DeploymentManagement/ConversationalReply', + request_serializer=modelresponse__pb2.ConversationRequest.SerializeToString, + response_deserializer=modelresponse__pb2.ConversationReply.FromString, + ) + self.Txt2ImgReply = channel.unary_unary( + '/modelresponse.DeploymentManagement/Txt2ImgReply', + request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, + 
response_deserializer=modelresponse__pb2.ImageReply.FromString, + ) class DeploymentManagementServicer(object): @@ -389,6 +434,60 @@ def AddDeployment(self, request, context): context.set_details('Method not implemented!') raise NotImplementedError('Method not implemented!') + def CreateSession(self, request, context): + """Missing associated documentation comment in .proto file.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + def DestroySession(self, request, context): + """Missing associated documentation comment in .proto file.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + def GeneratorReply(self, request, context): + """Missing associated documentation comment in .proto file.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + def ClassificationReply(self, request, context): + """Missing associated documentation comment in .proto file.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + def QuestionAndAnswerReply(self, request, context): + """Missing associated documentation comment in .proto file.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + def FillMaskReply(self, request, context): + """Missing associated documentation comment in .proto file.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + def TokenClassificationReply(self, request, context): + """Missing associated documentation comment in .proto file.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + def ConversationalReply(self, request, context): + """Missing associated documentation comment in .proto file.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + def Txt2ImgReply(self, request, context): + """Missing associated documentation comment in .proto file.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + def add_DeploymentManagementServicer_to_server(servicer, server): rpc_method_handlers = { @@ -397,6 +496,51 @@ def add_DeploymentManagementServicer_to_server(servicer, server): request_deserializer=modelresponse__pb2.AddDeployRequest.FromString, response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, ), + 'CreateSession': grpc.unary_unary_rpc_method_handler( + servicer.CreateSession, + request_deserializer=modelresponse__pb2.SessionID.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + ), + 'DestroySession': grpc.unary_unary_rpc_method_handler( + servicer.DestroySession, + request_deserializer=modelresponse__pb2.SessionID.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + ), + 'GeneratorReply': 
grpc.unary_unary_rpc_method_handler( + servicer.GeneratorReply, + request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, + response_serializer=modelresponse__pb2.MultiStringReply.SerializeToString, + ), + 'ClassificationReply': grpc.unary_unary_rpc_method_handler( + servicer.ClassificationReply, + request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'QuestionAndAnswerReply': grpc.unary_unary_rpc_method_handler( + servicer.QuestionAndAnswerReply, + request_deserializer=modelresponse__pb2.QARequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'FillMaskReply': grpc.unary_unary_rpc_method_handler( + servicer.FillMaskReply, + request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'TokenClassificationReply': grpc.unary_unary_rpc_method_handler( + servicer.TokenClassificationReply, + request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'ConversationalReply': grpc.unary_unary_rpc_method_handler( + servicer.ConversationalReply, + request_deserializer=modelresponse__pb2.ConversationRequest.FromString, + response_serializer=modelresponse__pb2.ConversationReply.SerializeToString, + ), + 'Txt2ImgReply': grpc.unary_unary_rpc_method_handler( + servicer.Txt2ImgReply, + request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, + response_serializer=modelresponse__pb2.ImageReply.SerializeToString, + ), } generic_handler = grpc.method_handlers_generic_handler( 'modelresponse.DeploymentManagement', rpc_method_handlers) @@ -423,3 +567,156 @@ def AddDeployment(request, google_dot_protobuf_dot_empty__pb2.Empty.FromString, options, channel_credentials, insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + + @staticmethod + def CreateSession(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/CreateSession', + modelresponse__pb2.SessionID.SerializeToString, + google_dot_protobuf_dot_empty__pb2.Empty.FromString, + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + + @staticmethod + def DestroySession(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/DestroySession', + modelresponse__pb2.SessionID.SerializeToString, + google_dot_protobuf_dot_empty__pb2.Empty.FromString, + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + + @staticmethod + def GeneratorReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/GeneratorReply', + modelresponse__pb2.MultiStringRequest.SerializeToString, + modelresponse__pb2.MultiStringReply.FromString, + 
options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + + @staticmethod + def ClassificationReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/ClassificationReply', + modelresponse__pb2.SingleStringRequest.SerializeToString, + modelresponse__pb2.SingleStringReply.FromString, + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + + @staticmethod + def QuestionAndAnswerReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/QuestionAndAnswerReply', + modelresponse__pb2.QARequest.SerializeToString, + modelresponse__pb2.SingleStringReply.FromString, + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + + @staticmethod + def FillMaskReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/FillMaskReply', + modelresponse__pb2.SingleStringRequest.SerializeToString, + modelresponse__pb2.SingleStringReply.FromString, + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + + @staticmethod + def TokenClassificationReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/TokenClassificationReply', + modelresponse__pb2.SingleStringRequest.SerializeToString, + modelresponse__pb2.SingleStringReply.FromString, + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + + @staticmethod + def ConversationalReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/ConversationalReply', + modelresponse__pb2.ConversationRequest.SerializeToString, + modelresponse__pb2.ConversationReply.FromString, + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + + @staticmethod + def Txt2ImgReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/Txt2ImgReply', + modelresponse__pb2.MultiStringRequest.SerializeToString, + modelresponse__pb2.ImageReply.FromString, + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) From a145be5c20a0c1323b909b88a401452be8dfbc70 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Mon, 24 Jul 
2023 21:33:57 +0000 Subject: [PATCH 48/69] Support for empty deployment 'group' --- mii/client.py | 23 +++++++++++++++-------- mii/models/score/generate.py | 29 ++++++++++++++++------------- 2 files changed, 31 insertions(+), 21 deletions(-) diff --git a/mii/client.py b/mii/client.py index 9facd25c..aa6225be 100644 --- a/mii/client.py +++ b/mii/client.py @@ -34,9 +34,9 @@ def _get_deployment_configs(deployment_tag): 'deployed': configs[deployment][mii.constants.DEPLOYED_KEY] } deployments.append(DeploymentConfig.parse_obj(data)) - lb_config = configs[mii.constants.LOAD_BALANCER_CONFIG_KEY] + lb_config = configs.get(mii.constants.LOAD_BALANCER_CONFIG_KEY) model_path = configs[mii.constants.MODEL_PATH_KEY] - port_map = configs[mii.constants.PORT_MAP_KEY] + port_map = configs.get(mii.constants.PORT_MAP_KEY) return deployments, lb_config, model_path, port_map @@ -58,11 +58,13 @@ def mii_query_handle(deployment_tag): return MIINonPersistentClient(task, deployment_tag) deployments, lb_config, model_path, port_map = _get_deployment_configs(deployment_tag) + mii_configs_dict = None if len(deployments) > 0: mii_configs_dict = deployments[0].mii_config #mii_configs = mii.config.MIIConfig(**mii_configs_dict) + port_number = None if mii_configs_dict == None else mii_configs_dict.port_number - return MIIClient(deployments, "localhost", mii_configs_dict.port_number, lb_config, model_path, port_map, deployment_tag) + return MIIClient(deployments, "localhost", port_number, lb_config, model_path, port_map, deployment_tag) def create_channel(host, port): @@ -79,12 +81,15 @@ class MIIClient(): """ def __init__(self, deployments, host, port, lb_config=None, model_path=None, port_map=None, deployment_tag=None): self.asyncio_loop = asyncio.get_event_loop() - channel = create_channel(host, port) - self.stub = modelresponse_pb2_grpc.DeploymentManagementStub(channel) + self.stub = None + self.host = host + if port is not None: + channel = create_channel(host, port) + self.stub = modelresponse_pb2_grpc.DeploymentManagementStub(channel) self.deployments = deployments self.lb_config = lb_config self.model_path = model_path - self.port_map = port_map + self.port_map = port_map if port_map is not None else {} self.deployment_tag = deployment_tag def _get_deployment_task(self, deployment_name=None): @@ -161,7 +166,6 @@ def add_models(self, enable_zero=False, ds_config=None, mii_config={}, - deployment_tag=None, deployments=[], deployment_type=DeploymentType.LOCAL, model_path=None, @@ -201,7 +205,10 @@ def add_models(self, create_score_file(deployment_tag=self.deployment_tag, deployment_type=deployment_type, deployments=self.deployments, model_path=self.model_path, port_map=self.port_map, lb_config=lb_config) if deployment_type == DeploymentType.LOCAL: mii.utils.import_score_file(self.deployment_tag).init() - + if self.stub is None: + self.port_number = self.deployments[0].mii_config.port_number + channel = create_channel(self.host, self.port_number) + self.stub = modelresponse_pb2_grpc.DeploymentManagementStub(channel) for replica in lb_config.replica_configs: request_proto = modelresponse_pb2.AddDeployRequest(task=replica.task, deployment_name=replica.deployment_name, diff --git a/mii/models/score/generate.py b/mii/models/score/generate.py index d807ab45..303aa7e6 100644 --- a/mii/models/score/generate.py +++ b/mii/models/score/generate.py @@ -19,19 +19,22 @@ def create_score_file(deployment_tag, config_dict = {} config_dict[mii.constants.MODEL_PATH_KEY] = model_path config_dict[mii.constants.DEPLOYMENT_TAG_KEY] = 
deployment_tag - config_dict[mii.constants.PORT_MAP_KEY] = port_map - for deployment in deployments: - deployment_config = { - mii.constants.DEPLOYMENT_NAME_KEY: deployment.deployment_name, - mii.constants.TASK_NAME_KEY: mii.utils.get_task_name(deployment.task), - mii.constants.MODEL_NAME_KEY: deployment.model, - mii.constants.ENABLE_DEEPSPEED_KEY: deployment.enable_deepspeed, - mii.constants.MII_CONFIGS_KEY: deployment.mii_config.dict(), - mii.constants.ENABLE_DEEPSPEED_ZERO_KEY: deployment.enable_zero, - mii.constants.DEEPSPEED_CONFIG_KEY: deployment.ds_config, - mii.constants.DEPLOYED_KEY: deployment.deployed, - } - config_dict[deployment.deployment_name] = deployment_config + if port_map is not None: + config_dict[mii.constants.PORT_MAP_KEY] = port_map + + if deployments is not None: + for deployment in deployments: + deployment_config = { + mii.constants.DEPLOYMENT_NAME_KEY: deployment.deployment_name, + mii.constants.TASK_NAME_KEY: mii.utils.get_task_name(deployment.task), + mii.constants.MODEL_NAME_KEY: deployment.model, + mii.constants.ENABLE_DEEPSPEED_KEY: deployment.enable_deepspeed, + mii.constants.MII_CONFIGS_KEY: deployment.mii_config.dict(), + mii.constants.ENABLE_DEEPSPEED_ZERO_KEY: deployment.enable_zero, + mii.constants.DEEPSPEED_CONFIG_KEY: deployment.ds_config, + mii.constants.DEPLOYED_KEY: deployment.deployed, + } + config_dict[deployment.deployment_name] = deployment_config if lb_config is not None: config_dict[mii.constants.LOAD_BALANCER_CONFIG_KEY] = lb_config From 082c05eecbab57f59288dd95d675bb82a550b0a5 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Mon, 24 Jul 2023 21:34:55 +0000 Subject: [PATCH 49/69] Support for empty deployment 'group' --- mii/deployment.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/mii/deployment.py b/mii/deployment.py index 20332fa4..14edcb99 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -68,7 +68,22 @@ def deploy(task=None, If deployment_type is `LOCAL`, returns just the name of the deployment that can be used to create a query handle using `mii.mii_query_handle(deployment_name)` """ - if not deployments: + if model_path is None and deployment_type == DeploymentType.LOCAL: + model_path = MII_MODEL_PATH_DEFAULT + elif model_path is None and deployment_type == DeploymentType.AML: + model_path = "model" + + if not deployments and not all((model, task, deployment_name)): + assert deployment_tag is not None, "Deployment tag must be set when starting empty deployment" + create_score_file(deployment_tag=deployment_tag, + deployment_type=deployment_type, + deployments=None, + model_path=model_path, + port_map=None, + lb_config=None) + return None + + elif not deployments: assert all((model, task, deployment_name)), "model, task, and deployment name must be set to deploy singular model" deployments = [ DeploymentConfig(deployment_name=deployment_name, @@ -121,10 +136,6 @@ def deploy(task=None, ) # In local deployments use default path if no model path set - if model_path is None and deployment_type == DeploymentType.LOCAL: - model_path = MII_MODEL_PATH_DEFAULT - elif model_path is None and deployment_type == DeploymentType.AML: - model_path = "model" # add fields for replica deployment replica_configs = [] From 3ce77d2efdecde0bafbf9a6dedecf9b3d2a60a4c Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Tue, 25 Jul 2023 17:49:15 +0000 Subject: [PATCH 50/69] Partial Termination --- mii/client.py | 13 ++++ mii/constants.py | 2 +- mii/grpc_related/modelresponse_server.py | 16 ++++- 
mii/grpc_related/proto/modelresponse.proto | 6 ++ mii/grpc_related/proto/modelresponse_pb2.py | 12 ++-- .../proto/modelresponse_pb2_grpc.py | 66 +++++++++++++++++++ 6 files changed, 107 insertions(+), 8 deletions(-) diff --git a/mii/client.py b/mii/client.py index aa6225be..1a7c834b 100644 --- a/mii/client.py +++ b/mii/client.py @@ -155,6 +155,19 @@ def destroy_session(self, session_id, deployment_name=None): assert task == Tasks.TEXT_GENERATION, f"Session deletion only available for task '{Tasks.TEXT_GENERATION}'." self.asyncio_loop.run_until_complete(self.destroy_session_async(session_id)) + async def delete_model_async(self, proto_request): + await getattr(self.stub, "DeleteDeployment")(proto_request) + + def delete_model(self, deployment_name): + for deployment in self.deployments: + if deployment.deployment_name == deployment_name: + request_proto = modelresponse_pb2.DeleteDeployRequest(deployment_name=deployment_name) + self.asyncio_loop.run_until_complete(self.delete_model_async(request_proto)) + return None + assert False, f"Deployment: {deployment_name} not found" + + + async def add_models_async(self, proto_request): await getattr(self.stub, "AddDeployment")(proto_request) diff --git a/mii/constants.py b/mii/constants.py index 61c7c474..138da1e7 100644 --- a/mii/constants.py +++ b/mii/constants.py @@ -120,7 +120,7 @@ class ModelProvider(enum.Enum): CREATE_SESSION_METHOD = "CreateSession" DESTROY_SESSION_METHOD = "DestroySession" ADD_DEPLOYMENT_METHOD = "AddDeployment" - +DELETE_DEPLOYMENT_METHOD = "DeleteDeployment" LB_MAX_WORKER_THREADS = 32 SERVER_SHUTDOWN_TIMEOUT = 10 diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index aa000e53..3026e176 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -13,7 +13,7 @@ import threading import time -from mii.constants import GRPC_MAX_MSG_SIZE, ADD_DEPLOYMENT_METHOD, CREATE_SESSION_METHOD, DESTROY_SESSION_METHOD, TERMINATE_METHOD, LB_MAX_WORKER_THREADS, SERVER_SHUTDOWN_TIMEOUT, Tasks +from mii.constants import GRPC_MAX_MSG_SIZE, ADD_DEPLOYMENT_METHOD, DELETE_DEPLOYMENT_METHOD, CREATE_SESSION_METHOD, DESTROY_SESSION_METHOD, TERMINATE_METHOD, LB_MAX_WORKER_THREADS, SERVER_SHUTDOWN_TIMEOUT, Tasks from mii.method_table import GRPC_METHOD_TABLE from mii.client import create_channel @@ -36,7 +36,9 @@ class DeploymentManagement(ServiceBase, modelresponse_pb2_grpc.DeploymentManagem def AddDeployment(self, request, context): print("DEPLOYMENT ADDED") return google_dot_protobuf_dot_empty__pb2.Empty() - + + def DeleteDeployment(self, request, context): + return google_dot_protobuf_dot_empty__pb2.Empty() class ModelResponse(ServiceBase): """ @@ -228,6 +230,16 @@ def invoke_intercept_method(request_proto, context): self.asyncio_loop.call_soon_threadsafe(self.asyncio_loop.stop) return next_handler.unary_unary(request_proto, context) + if method_name == DELETE_DEPLOYMENT_METHOD: + deployment_name = str(getattr(request_proto, "deployment_name")) + for stub in self.stubs[deployment_name]: + stub.invoke(TERMINATE_METHOD, + google_dot_protobuf_dot_empty__pb2.Empty()) + del self.stubs[deployment_name] + del self.counter[deployment_name] + del self.tasks[deployment_name] + return google_dot_protobuf_dot_empty__pb2.Empty() + deployment_name = getattr(request_proto, 'deployment_name') call_count = self.counter[deployment_name].get_and_increment() replica_index = call_count % len(self.stubs[deployment_name]) diff --git a/mii/grpc_related/proto/modelresponse.proto 
b/mii/grpc_related/proto/modelresponse.proto index 757fa0da..ad626810 100644 --- a/mii/grpc_related/proto/modelresponse.proto +++ b/mii/grpc_related/proto/modelresponse.proto @@ -47,6 +47,8 @@ service DeploymentManagement { rpc TokenClassificationReply(SingleStringRequest) returns (SingleStringReply) {} rpc ConversationalReply(ConversationRequest) returns (ConversationReply) {} rpc Txt2ImgReply(MultiStringRequest) returns (ImageReply) {} + rpc DeleteDeployment(DeleteDeployRequest) returns (google.protobuf.Empty) {} + rpc Terminate (google.protobuf.Empty) returns (google.protobuf.Empty) {} } message Value { @@ -132,3 +134,7 @@ message AddDeployRequest { repeated int64 gpu_indices = 6; } + +message DeleteDeployRequest { + string deployment_name = 1; +} diff --git a/mii/grpc_related/proto/modelresponse_pb2.py b/mii/grpc_related/proto/modelresponse_pb2.py index 073083b4..e7abcc14 100644 --- a/mii/grpc_related/proto/modelresponse_pb2.py +++ b/mii/grpc_related/proto/modelresponse_pb2.py @@ -14,7 +14,7 @@ from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x42\x0e\n\x0coneof_values\"\x1f\n\tSessionID\x12\x12\n\nsession_id\x18\x01 \x01(\t\"\xed\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\x85\x01\n\x11SingleStringReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x84\x01\n\x10MultiStringReply\x12\x10\n\x08response\x18\x01 \x03(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\tQARequest\x12\x10\n\x08question\x18\x01 \x01(\t\x12\x0f\n\x07\x63ontext\x18\x02 \x01(\t\x12?\n\x0cquery_kwargs\x18\x03 \x03(\x0b\x32).modelresponse.QARequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xd3\x02\n\x13\x43onversationRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x1c\n\x0f\x63onversation_id\x18\x02 \x01(\x03H\x00\x88\x01\x01\x12\x18\n\x10past_user_inputs\x18\x03 \x03(\t\x12\x1b\n\x13generated_responses\x18\x04 \x03(\t\x12I\n\x0cquery_kwargs\x18\x05 
\x03(\x0b\x32\x33.modelresponse.ConversationRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x01\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_conversation_idB\x12\n\x10_deployment_name\"\xc3\x01\n\x11\x43onversationReply\x12\x17\n\x0f\x63onversation_id\x18\x01 \x01(\x03\x12\x18\n\x10past_user_inputs\x18\x02 \x03(\t\x12\x1b\n\x13generated_responses\x18\x03 \x03(\t\x12\x12\n\ntime_taken\x18\x04 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x05 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xaf\x01\n\nImageReply\x12\x0e\n\x06images\x18\x01 \x03(\x0c\x12\x1d\n\x15nsfw_content_detected\x18\x02 \x03(\x08\x12\x0c\n\x04mode\x18\x03 \x01(\t\x12\x0e\n\x06size_w\x18\x04 \x01(\x03\x12\x0e\n\x06size_h\x18\x05 \x01(\x03\x12\x12\n\ntime_taken\x18\x06 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x07 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x98\x01\n\x10\x41\x64\x64\x44\x65ployRequest\x12\x0c\n\x04task\x18\x01 \x01(\t\x12\x17\n\x0f\x64\x65ployment_name\x18\x02 \x01(\t\x12\x10\n\x08hostname\x18\x03 \x01(\t\x12\x1d\n\x15tensor_parallel_ports\x18\x04 \x03(\x03\x12\x17\n\x0ftorch_dist_port\x18\x05 \x01(\x03\x12\x13\n\x0bgpu_indices\x18\x06 \x03(\x03\x32\xd4\x06\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x32\xe8\x06\n\x14\x44\x65ploymentManagement\x12J\n\rAddDeployment\x12\x1f.modelresponse.AddDeployRequest\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a 
.modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x62\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x42\x0e\n\x0coneof_values\"\x1f\n\tSessionID\x12\x12\n\nsession_id\x18\x01 \x01(\t\"\xed\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\x85\x01\n\x11SingleStringReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x84\x01\n\x10MultiStringReply\x12\x10\n\x08response\x18\x01 \x03(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\tQARequest\x12\x10\n\x08question\x18\x01 \x01(\t\x12\x0f\n\x07\x63ontext\x18\x02 \x01(\t\x12?\n\x0cquery_kwargs\x18\x03 \x03(\x0b\x32).modelresponse.QARequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xd3\x02\n\x13\x43onversationRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x1c\n\x0f\x63onversation_id\x18\x02 \x01(\x03H\x00\x88\x01\x01\x12\x18\n\x10past_user_inputs\x18\x03 \x03(\t\x12\x1b\n\x13generated_responses\x18\x04 \x03(\t\x12I\n\x0cquery_kwargs\x18\x05 \x03(\x0b\x32\x33.modelresponse.ConversationRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x01\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_conversation_idB\x12\n\x10_deployment_name\"\xc3\x01\n\x11\x43onversationReply\x12\x17\n\x0f\x63onversation_id\x18\x01 \x01(\x03\x12\x18\n\x10past_user_inputs\x18\x02 \x03(\t\x12\x1b\n\x13generated_responses\x18\x03 \x03(\t\x12\x12\n\ntime_taken\x18\x04 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x05 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xaf\x01\n\nImageReply\x12\x0e\n\x06images\x18\x01 \x03(\x0c\x12\x1d\n\x15nsfw_content_detected\x18\x02 \x03(\x08\x12\x0c\n\x04mode\x18\x03 \x01(\t\x12\x0e\n\x06size_w\x18\x04 \x01(\x03\x12\x0e\n\x06size_h\x18\x05 \x01(\x03\x12\x12\n\ntime_taken\x18\x06 
\x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x07 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x98\x01\n\x10\x41\x64\x64\x44\x65ployRequest\x12\x0c\n\x04task\x18\x01 \x01(\t\x12\x17\n\x0f\x64\x65ployment_name\x18\x02 \x01(\t\x12\x10\n\x08hostname\x18\x03 \x01(\t\x12\x1d\n\x15tensor_parallel_ports\x18\x04 \x03(\x03\x12\x17\n\x0ftorch_dist_port\x18\x05 \x01(\x03\x12\x13\n\x0bgpu_indices\x18\x06 \x03(\x03\".\n\x13\x44\x65leteDeployRequest\x12\x17\n\x0f\x64\x65ployment_name\x18\x01 \x01(\t2\xd4\x06\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x32\xf9\x07\n\x14\x44\x65ploymentManagement\x12J\n\rAddDeployment\x12\x1f.modelresponse.AddDeployRequest\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x12P\n\x10\x44\x65leteDeployment\x12\".modelresponse.DeleteDeployRequest\x1a\x16.google.protobuf.Empty\"\x00\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x62\x06proto3') _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) @@ -60,8 +60,10 @@ _globals['_IMAGEREPLY']._serialized_end=1900 _globals['_ADDDEPLOYREQUEST']._serialized_start=1903 _globals['_ADDDEPLOYREQUEST']._serialized_end=2055 - _globals['_MODELRESPONSE']._serialized_start=2058 - _globals['_MODELRESPONSE']._serialized_end=2910 - _globals['_DEPLOYMENTMANAGEMENT']._serialized_start=2913 - _globals['_DEPLOYMENTMANAGEMENT']._serialized_end=3785 + _globals['_DELETEDEPLOYREQUEST']._serialized_start=2057 + _globals['_DELETEDEPLOYREQUEST']._serialized_end=2103 + _globals['_MODELRESPONSE']._serialized_start=2106 + 
_globals['_MODELRESPONSE']._serialized_end=2958 + _globals['_DEPLOYMENTMANAGEMENT']._serialized_start=2961 + _globals['_DEPLOYMENTMANAGEMENT']._serialized_end=3978 # @@protoc_insertion_point(module_scope) diff --git a/mii/grpc_related/proto/modelresponse_pb2_grpc.py b/mii/grpc_related/proto/modelresponse_pb2_grpc.py index d93b85d7..9c3ce85d 100644 --- a/mii/grpc_related/proto/modelresponse_pb2_grpc.py +++ b/mii/grpc_related/proto/modelresponse_pb2_grpc.py @@ -423,6 +423,16 @@ def __init__(self, channel): request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, response_deserializer=modelresponse__pb2.ImageReply.FromString, ) + self.DeleteDeployment = channel.unary_unary( + '/modelresponse.DeploymentManagement/DeleteDeployment', + request_serializer=modelresponse__pb2.DeleteDeployRequest.SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) + self.Terminate = channel.unary_unary( + '/modelresponse.DeploymentManagement/Terminate', + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) class DeploymentManagementServicer(object): @@ -488,6 +498,18 @@ def Txt2ImgReply(self, request, context): context.set_details('Method not implemented!') raise NotImplementedError('Method not implemented!') + def DeleteDeployment(self, request, context): + """Missing associated documentation comment in .proto file.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + def Terminate(self, request, context): + """Missing associated documentation comment in .proto file.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + def add_DeploymentManagementServicer_to_server(servicer, server): rpc_method_handlers = { @@ -541,6 +563,16 @@ def add_DeploymentManagementServicer_to_server(servicer, server): request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, response_serializer=modelresponse__pb2.ImageReply.SerializeToString, ), + 'DeleteDeployment': grpc.unary_unary_rpc_method_handler( + servicer.DeleteDeployment, + request_deserializer=modelresponse__pb2.DeleteDeployRequest.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + ), + 'Terminate': grpc.unary_unary_rpc_method_handler( + servicer.Terminate, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + ), } generic_handler = grpc.method_handlers_generic_handler( 'modelresponse.DeploymentManagement', rpc_method_handlers) @@ -720,3 +752,37 @@ def Txt2ImgReply(request, modelresponse__pb2.ImageReply.FromString, options, channel_credentials, insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + + @staticmethod + def DeleteDeployment(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/DeleteDeployment', + modelresponse__pb2.DeleteDeployRequest.SerializeToString, + google_dot_protobuf_dot_empty__pb2.Empty.FromString, + options, channel_credentials, + insecure, call_credentials, compression, 
wait_for_ready, timeout, metadata) + + @staticmethod + def Terminate(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/Terminate', + google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + google_dot_protobuf_dot_empty__pb2.Empty.FromString, + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) From b40ecbdd0697abd4aeb57c10f50e6bb27020a9fe Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Tue, 25 Jul 2023 21:14:58 +0000 Subject: [PATCH 51/69] Refactoring --- mii/client.py | 72 ++++++++++++------------ mii/constants.py | 1 + mii/deployment.py | 39 ++----------- mii/grpc_related/modelresponse_server.py | 7 ++- mii/models/score/generate.py | 5 +- mii/models/score/score_template.py | 10 +--- mii/server.py | 34 ++++++----- 7 files changed, 71 insertions(+), 97 deletions(-) diff --git a/mii/client.py b/mii/client.py index 1a7c834b..8bcbc39c 100644 --- a/mii/client.py +++ b/mii/client.py @@ -15,25 +15,24 @@ def _get_deployment_configs(deployment_tag): - deployments = [] + deployments = {} configs = mii.utils.import_score_file(deployment_tag).configs - for deployment in configs: - if not isinstance(configs[deployment], dict) or deployment == mii.constants.PORT_MAP_KEY: - continue - configs[deployment][mii.constants.DEPLOYED_KEY] = True + for deployment in configs[mii.constants.DEPLOYMENTS_KEY].values(): + deployment[mii.constants.DEPLOYED_KEY] = True + deployment_name = deployment[mii.constants.DEPLOYMENT_NAME_KEY] data = { - 'deployment_name':configs[deployment][mii.constants.DEPLOYMENT_NAME_KEY], - 'task': configs[deployment][mii.constants.TASK_NAME_KEY], - 'model': configs[deployment][mii.constants.MODEL_NAME_KEY], - 'enable_deepspeed': configs[deployment][mii.constants.ENABLE_DEEPSPEED_KEY], - 'enable_zero': configs[deployment][mii.constants.ENABLE_DEEPSPEED_ZERO_KEY], + 'deployment_name':deployment[mii.constants.DEPLOYMENT_NAME_KEY], + 'task': deployment[mii.constants.TASK_NAME_KEY], + 'model': deployment[mii.constants.MODEL_NAME_KEY], + 'enable_deepspeed': deployment[mii.constants.ENABLE_DEEPSPEED_KEY], + 'enable_zero': deployment[mii.constants.ENABLE_DEEPSPEED_ZERO_KEY], 'GPU_index_map': None, - 'mii_config': configs[deployment][mii.constants.MII_CONFIGS_KEY], - 'ds_config': configs[deployment][mii.constants.DEEPSPEED_CONFIG_KEY], + 'mii_config': deployment[mii.constants.MII_CONFIGS_KEY], + 'ds_config': deployment[mii.constants.DEEPSPEED_CONFIG_KEY], 'version': 1, - 'deployed': configs[deployment][mii.constants.DEPLOYED_KEY] + 'deployed': deployment[mii.constants.DEPLOYED_KEY] } - deployments.append(DeploymentConfig.parse_obj(data)) + deployments[deployment_name] = DeploymentConfig.parse_obj(data) lb_config = configs.get(mii.constants.LOAD_BALANCER_CONFIG_KEY) model_path = configs[mii.constants.MODEL_PATH_KEY] port_map = configs.get(mii.constants.PORT_MAP_KEY) @@ -60,7 +59,7 @@ def mii_query_handle(deployment_tag): deployments, lb_config, model_path, port_map = _get_deployment_configs(deployment_tag) mii_configs_dict = None if len(deployments) > 0: - mii_configs_dict = deployments[0].mii_config + mii_configs_dict = next(iter(deployments.values())).mii_config #mii_configs = mii.config.MIIConfig(**mii_configs_dict) port_number = None if mii_configs_dict == None else mii_configs_dict.port_number @@ -96,14 +95,14 
@@ def _get_deployment_task(self, deployment_name=None): task = None if deployment_name is None: #mii.terminate() or single model assert len(self.deployments) == 1, "Must pass deployment_name to query when using multiple deployments" - deployment_name = self.deployments[0].deployment_name - task = get_task(self.deployments[0].task) if isinstance(deployment.task, str) else self.deployments[0].task + deployment = next(iter(self.deployments.values())) + deployment_name = deployment.deployment_name + task = get_task(deployment.task) if isinstance(deployment.task, str) else deployment.task else: - for deployment in self.deployments: - print(deployment.deployment_name) - if deployment.deployment_name == deployment_name: - task = get_task(deployment.task) if isinstance(deployment.task, str) else deployment.task - return deployment_name, task + if deployment_name in self.deployments: + deployment = self.deployments[deployment_name] + task = get_task(deployment.task) if isinstance(deployment.task, str) else deployment.task + return deployment_name, task assert False, f"{deployment_name} not found in list of deployments" return deployment_name, task @@ -159,11 +158,11 @@ async def delete_model_async(self, proto_request): await getattr(self.stub, "DeleteDeployment")(proto_request) def delete_model(self, deployment_name): - for deployment in self.deployments: - if deployment.deployment_name == deployment_name: - request_proto = modelresponse_pb2.DeleteDeployRequest(deployment_name=deployment_name) - self.asyncio_loop.run_until_complete(self.delete_model_async(request_proto)) - return None + if deployment_name in self.deployments: + request_proto = modelresponse_pb2.DeleteDeployRequest(deployment_name=deployment_name) + self.asyncio_loop.run_until_complete(self.delete_model_async(request_proto)) + del self.deployments[deployment_name] + return None assert False, f"Deployment: {deployment_name} not found" @@ -199,27 +198,30 @@ def add_models(self, deployed=False) ] + deps = {deployment.deployment_name: deployment for deployment in deployments} for deployment in deployments: deployment.task = get_task(deployment.task) - lb_config, self.port_map = allocate_processes(deployments, self.port_map) + lb_config, self.port_map = allocate_processes(deps, self.port_map) if self.lb_config is not None: self.lb_config.replica_configs.extend(lb_config.replica_configs) else: self.lb_config = lb_config - self.deployments.extend(deployments) + for deployment in deployments: + self.deployments[deployment.deployment_name] = deployment + #self.deployments.extend(deployments) if self.model_path is None and deployment_type == DeploymentType.LOCAL: self.model_path = mii.constants.MII_MODEL_PATH_DEFAULT elif self.model_path is None and deployment_type == DeploymentType.AML: model_path = "model" - for deployment in self.deployments: + for deployment in self.deployments.values(): if isinstance(deployment.task, str): deployment.task = get_task(deployment.task) create_score_file(deployment_tag=self.deployment_tag, deployment_type=deployment_type, deployments=self.deployments, model_path=self.model_path, port_map=self.port_map, lb_config=lb_config) if deployment_type == DeploymentType.LOCAL: mii.utils.import_score_file(self.deployment_tag).init() if self.stub is None: - self.port_number = self.deployments[0].mii_config.port_number + self.port_number = next(iter(self.deployments.values())).mii_config.port_number channel = create_channel(self.host, self.port_number) self.stub = modelresponse_pb2_grpc.DeploymentManagementStub(channel) 
for replica in lb_config.replica_configs: @@ -321,9 +323,9 @@ def terminate(self): def terminate_restful_gateway(deployment_tag): - deployments = _get_deployment_configs(deployment_tag) - for deployment in deployments: - mii_configs_dict = deployment[mii.constants.MII_CONFIGS_KEY] - mii_configs = mii.config.MIIConfig(**mii_configs_dict) + deployments, _, _, _ = _get_deployment_configs(deployment_tag) + for deployment in deployments.values(): + mii_configs_dict = deployment.mii_config + #mii_configs = mii.config.MIIConfig(**mii_configs_dict) if mii_configs.enable_restful_api: requests.get(f"http://localhost:{mii_configs.restful_api_port}/terminate") diff --git a/mii/constants.py b/mii/constants.py index 138da1e7..520a3c3d 100644 --- a/mii/constants.py +++ b/mii/constants.py @@ -89,6 +89,7 @@ class ModelProvider(enum.Enum): TEXT2IMG_NAME: ["query"] } +DEPLOYMENTS_KEY = 'deployments' PORT_MAP_KEY = 'port_map' MODEL_NAME_KEY = 'model_name' TASK_NAME_KEY = 'task_name' diff --git a/mii/deployment.py b/mii/deployment.py index 14edcb99..bc71be48 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -126,6 +126,7 @@ def deploy(task=None, deployment.task, deployment.model) + if enable_deepspeed: logger.info( f"************* MII is using DeepSpeed Optimizations to accelerate your model: {deployment.model} *************" @@ -135,46 +136,18 @@ def deploy(task=None, f"************* DeepSpeed Optimizations not enabled. Please use enable_deepspeed to get better performance for: {deployment.model} *************" ) + deps = {deployment.deployment_name: deployment for deployment in deployments} + # In local deployments use default path if no model path set # add fields for replica deployment - replica_configs = [] - port_offset = 1 port_map = {} - for deployment in deployments: - mii_config = deployment.mii_config - replica_pool = _allocate_processes(mii_config.hostfile, - mii_config.tensor_parallel, - mii_config.replica_num, - deployment.GPU_index_map) - - for i, (hostname, gpu_indices) in enumerate(replica_pool): - # Reserver port for a LB proxy when replication is enabled - if hostname not in port_map: - port_map[hostname] = set() - base_port = mii_config.port_number + i * mii_config.tensor_parallel + port_offset - if base_port in port_map[hostname]: - base_port = max(port_map[hostname]) + 1 - tensor_parallel_ports = list( - range(base_port, - base_port + mii_config.tensor_parallel)) - for i in range(base_port, base_port + mii_config.tensor_parallel): - port_map[hostname].add(i) - torch_dist_port = mii_config.torch_dist_port + i - replica_configs.append( - ReplicaConfig(task=get_task_name(deployment.task), - deployment_name=deployment.deployment_name, - hostname=hostname, - tensor_parallel_ports=tensor_parallel_ports, - torch_dist_port=torch_dist_port, - gpu_indices=gpu_indices)) - lb_config = LoadBalancerConfig(port=mii_config.port_number, - replica_configs=replica_configs) + lb_config, port_map = allocate_processes(deps, port_map) if deployment_type != DeploymentType.NON_PERSISTENT: create_score_file(deployment_tag=deployment_tag, deployment_type=deployment_type, - deployments=deployments, + deployments=deps, model_path=model_path, port_map=port_map, lb_config=lb_config) @@ -201,7 +174,7 @@ def deploy(task=None, def allocate_processes(deployments, port_map): replica_configs = [] port_offset = 1 - for deployment in deployments: + for deployment in deployments.values(): mii_config = deployment.mii_config replica_pool = _allocate_processes(mii_config.hostfile, mii_config.tensor_parallel, diff --git 
a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index 3026e176..5c988bd7 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -223,8 +223,9 @@ def invoke_intercept_method(request_proto, context): return google_dot_protobuf_dot_empty__pb2.Empty() if method_name == TERMINATE_METHOD: - for deployment in self.stubs: - for stub in self.stubs[deployment]: + print(self.stubs.keys()) + for deployment_name in self.stubs: + for stub in self.stubs[deployment_name]: stub.invoke(TERMINATE_METHOD, google_dot_protobuf_dot_empty__pb2.Empty()) self.asyncio_loop.call_soon_threadsafe(self.asyncio_loop.stop) @@ -232,6 +233,7 @@ def invoke_intercept_method(request_proto, context): if method_name == DELETE_DEPLOYMENT_METHOD: deployment_name = str(getattr(request_proto, "deployment_name")) + assert deployment_name in self.stubs, f"Deployment: {deployment_name} not found" for stub in self.stubs[deployment_name]: stub.invoke(TERMINATE_METHOD, google_dot_protobuf_dot_empty__pb2.Empty()) @@ -241,6 +243,7 @@ def invoke_intercept_method(request_proto, context): return google_dot_protobuf_dot_empty__pb2.Empty() deployment_name = getattr(request_proto, 'deployment_name') + assert deployment_name in self.stubs, f"Deployment: {deployment_name} not found" call_count = self.counter[deployment_name].get_and_increment() replica_index = call_count % len(self.stubs[deployment_name]) diff --git a/mii/models/score/generate.py b/mii/models/score/generate.py index 303aa7e6..55f63046 100644 --- a/mii/models/score/generate.py +++ b/mii/models/score/generate.py @@ -19,11 +19,12 @@ def create_score_file(deployment_tag, config_dict = {} config_dict[mii.constants.MODEL_PATH_KEY] = model_path config_dict[mii.constants.DEPLOYMENT_TAG_KEY] = deployment_tag + config_dict[mii.constants.DEPLOYMENTS_KEY] = {} if port_map is not None: config_dict[mii.constants.PORT_MAP_KEY] = port_map if deployments is not None: - for deployment in deployments: + for deployment in deployments.values(): deployment_config = { mii.constants.DEPLOYMENT_NAME_KEY: deployment.deployment_name, mii.constants.TASK_NAME_KEY: mii.utils.get_task_name(deployment.task), @@ -34,7 +35,7 @@ def create_score_file(deployment_tag, mii.constants.DEEPSPEED_CONFIG_KEY: deployment.ds_config, mii.constants.DEPLOYED_KEY: deployment.deployed, } - config_dict[deployment.deployment_name] = deployment_config + config_dict[mii.constants.DEPLOYMENTS_KEY][deployment.deployment_name] = deployment_config if lb_config is not None: config_dict[mii.constants.LOAD_BALANCER_CONFIG_KEY] = lb_config diff --git a/mii/models/score/score_template.py b/mii/models/score/score_template.py index fd6a7f0f..117f3866 100644 --- a/mii/models/score/score_template.py +++ b/mii/models/score/score_template.py @@ -19,14 +19,11 @@ def init(): deployment_tag = configs[mii.constants.DEPLOYMENT_TAG_KEY] deployments = [] lb_enabled = False - del configs[mii.constants.PORT_MAP_KEY] - for deployment in configs.values(): - if isinstance(deployment, dict) and deployment[mii.constants.DEPLOYED_KEY]: + for deployment in configs[mii.constants.DEPLOYMENTS_KEY].values(): + if deployment[mii.constants.DEPLOYED_KEY]: lb_enabled = True print(deployment) continue - if not isinstance(deployment, dict): - continue data = { 'deployment_name': deployment[mii.constants.DEPLOYMENT_NAME_KEY], 'task': deployment[mii.constants.TASK_NAME_KEY], @@ -48,8 +45,7 @@ def init(): assert task_name is not None, "The task name should be set before calling init" """ - if 
len(deployments) > 0: - mii.MIIServer(deployment_tag, + mii.MIIServer(deployment_tag, deployments, model_path, lb_config=configs.get(mii.constants.LOAD_BALANCER_CONFIG_KEY, diff --git a/mii/server.py b/mii/server.py index f29e3a28..e7f1360e 100644 --- a/mii/server.py +++ b/mii/server.py @@ -29,28 +29,26 @@ def config_to_b64_str(config): class MIIServer(): '''Initialize the model, setup the server for the model under model_path''' def __init__(self, deployment_tag, deployments, model_path, lb_config=None, lb_enabled=False): - - #mii_configs = mii.config.MIIConfig(**mii_configs) - self.lb_enabled = lb_enabled - #self.task = mii.utils.get_task(task_name) - self.deployments = deployments - for deployment in deployments: - assert get_num_gpus(deployment.mii_config) > 0, f"GPU count for {deployment.deployment_name} must be greater than 0" - mii_configs = deployment.mii_config - deployment.task = mii.utils.get_task(deployment.task) - if mii_configs.hostfile is None: - hostfile = tempfile.NamedTemporaryFile(delete=False) - num_gpu = torch.cuda.device_count() - with open(hostfile, "w") as f: - f.write(f"localhost slots={num_gpu}") - mii.configs.hostfile = hostfile - - processes = self._initialize_service(deployment_tag, + if len(deployments) > 0: + self.lb_enabled = lb_enabled + self.deployments = deployments + for deployment in deployments: + assert get_num_gpus(deployment.mii_config) > 0, f"GPU count for {deployment.deployment_name} must be greater than 0" + mii_configs = deployment.mii_config + deployment.task = mii.utils.get_task(deployment.task) + if mii_configs.hostfile is None: + hostfile = tempfile.NamedTemporaryFile(delete=False) + num_gpu = torch.cuda.device_count() + with open(hostfile, "w") as f: + f.write(f"localhost slots={num_gpu}") + mii.configs.hostfile = hostfile + + processes = self._initialize_service(deployment_tag, deployments, model_path, lb_config, ) - self._wait_until_server_is_live(processes, lb_config.replica_configs) + self._wait_until_server_is_live(processes, lb_config.replica_configs) def _wait_until_server_is_live(self, processes, deployment): for process, repl_config in zip(processes, deployment): From 72dd95c8440978e2b13bb9201f134a1bfcadb935 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Tue, 25 Jul 2023 21:21:04 +0000 Subject: [PATCH 52/69] formatting --- mii/client.py | 102 +- mii/deployment.py | 7 +- mii/grpc_related/modelresponse_server.py | 24 +- mii/grpc_related/proto/modelresponse.proto | 2 +- mii/grpc_related/proto/modelresponse_pb2.py | 101 +- .../proto/modelresponse_pb2_grpc.py | 1135 ++++++++++------- mii/models/score/generate.py | 5 +- mii/server.py | 18 +- 8 files changed, 833 insertions(+), 561 deletions(-) diff --git a/mii/client.py b/mii/client.py index 8bcbc39c..f937d69d 100644 --- a/mii/client.py +++ b/mii/client.py @@ -21,17 +21,17 @@ def _get_deployment_configs(deployment_tag): deployment[mii.constants.DEPLOYED_KEY] = True deployment_name = deployment[mii.constants.DEPLOYMENT_NAME_KEY] data = { - 'deployment_name':deployment[mii.constants.DEPLOYMENT_NAME_KEY], - 'task': deployment[mii.constants.TASK_NAME_KEY], - 'model': deployment[mii.constants.MODEL_NAME_KEY], - 'enable_deepspeed': deployment[mii.constants.ENABLE_DEEPSPEED_KEY], - 'enable_zero': deployment[mii.constants.ENABLE_DEEPSPEED_ZERO_KEY], - 'GPU_index_map': None, - 'mii_config': deployment[mii.constants.MII_CONFIGS_KEY], - 'ds_config': deployment[mii.constants.DEEPSPEED_CONFIG_KEY], - 'version': 1, - 'deployed': deployment[mii.constants.DEPLOYED_KEY] - } + 'deployment_name': 
deployment[mii.constants.DEPLOYMENT_NAME_KEY], + 'task': deployment[mii.constants.TASK_NAME_KEY], + 'model': deployment[mii.constants.MODEL_NAME_KEY], + 'enable_deepspeed': deployment[mii.constants.ENABLE_DEEPSPEED_KEY], + 'enable_zero': deployment[mii.constants.ENABLE_DEEPSPEED_ZERO_KEY], + 'GPU_index_map': None, + 'mii_config': deployment[mii.constants.MII_CONFIGS_KEY], + 'ds_config': deployment[mii.constants.DEEPSPEED_CONFIG_KEY], + 'version': 1, + 'deployed': deployment[mii.constants.DEPLOYED_KEY] + } deployments[deployment_name] = DeploymentConfig.parse_obj(data) lb_config = configs.get(mii.constants.LOAD_BALANCER_CONFIG_KEY) model_path = configs[mii.constants.MODEL_PATH_KEY] @@ -63,7 +63,13 @@ def mii_query_handle(deployment_tag): #mii_configs = mii.config.MIIConfig(**mii_configs_dict) port_number = None if mii_configs_dict == None else mii_configs_dict.port_number - return MIIClient(deployments, "localhost", port_number, lb_config, model_path, port_map, deployment_tag) + return MIIClient(deployments, + "localhost", + port_number, + lb_config, + model_path, + port_map, + deployment_tag) def create_channel(host, port): @@ -78,7 +84,14 @@ class MIIClient(): """ Client to send queries to a single endpoint. """ - def __init__(self, deployments, host, port, lb_config=None, model_path=None, port_map=None, deployment_tag=None): + def __init__(self, + deployments, + host, + port, + lb_config=None, + model_path=None, + port_map=None, + deployment_tag=None): self.asyncio_loop = asyncio.get_event_loop() self.stub = None self.host = host @@ -97,11 +110,13 @@ def _get_deployment_task(self, deployment_name=None): assert len(self.deployments) == 1, "Must pass deployment_name to query when using multiple deployments" deployment = next(iter(self.deployments.values())) deployment_name = deployment.deployment_name - task = get_task(deployment.task) if isinstance(deployment.task, str) else deployment.task + task = get_task(deployment.task) if isinstance(deployment.task, + str) else deployment.task else: if deployment_name in self.deployments: deployment = self.deployments[deployment_name] - task = get_task(deployment.task) if isinstance(deployment.task, str) else deployment.task + task = get_task(deployment.task) if isinstance(deployment.task, + str) else deployment.task return deployment_name, task assert False, f"{deployment_name} not found in list of deployments" return deployment_name, task @@ -159,13 +174,12 @@ async def delete_model_async(self, proto_request): def delete_model(self, deployment_name): if deployment_name in self.deployments: - request_proto = modelresponse_pb2.DeleteDeployRequest(deployment_name=deployment_name) + request_proto = modelresponse_pb2.DeleteDeployRequest( + deployment_name=deployment_name) self.asyncio_loop.run_until_complete(self.delete_model_async(request_proto)) del self.deployments[deployment_name] return None assert False, f"Deployment: {deployment_name} not found" - - async def add_models_async(self, proto_request): await getattr(self.stub, "AddDeployment")(proto_request) @@ -182,27 +196,27 @@ def add_models(self, deployment_type=DeploymentType.LOCAL, model_path=None, version=1): - + if not deployments: assert all((model, task, deployment_name)), "model, task, and deployment name must be set to deploy singular model" deployments = [ DeploymentConfig(deployment_name=deployment_name, - task=task, - model=model, - enable_deepspeed=enable_deepspeed, - enable_zero=enable_zero, - GPU_index_map=None, - mii_config=mii.config.MIIConfig(**mii_config), - ds_config=ds_config, 
- version=version, - deployed=False) + task=task, + model=model, + enable_deepspeed=enable_deepspeed, + enable_zero=enable_zero, + GPU_index_map=None, + mii_config=mii.config.MIIConfig(**mii_config), + ds_config=ds_config, + version=version, + deployed=False) ] - + deps = {deployment.deployment_name: deployment for deployment in deployments} for deployment in deployments: deployment.task = get_task(deployment.task) lb_config, self.port_map = allocate_processes(deps, self.port_map) - + if self.lb_config is not None: self.lb_config.replica_configs.extend(lb_config.replica_configs) else: @@ -217,23 +231,31 @@ def add_models(self, for deployment in self.deployments.values(): if isinstance(deployment.task, str): deployment.task = get_task(deployment.task) - create_score_file(deployment_tag=self.deployment_tag, deployment_type=deployment_type, deployments=self.deployments, model_path=self.model_path, port_map=self.port_map, lb_config=lb_config) + create_score_file(deployment_tag=self.deployment_tag, + deployment_type=deployment_type, + deployments=self.deployments, + model_path=self.model_path, + port_map=self.port_map, + lb_config=lb_config) if deployment_type == DeploymentType.LOCAL: mii.utils.import_score_file(self.deployment_tag).init() if self.stub is None: - self.port_number = next(iter(self.deployments.values())).mii_config.port_number + self.port_number = next(iter( + self.deployments.values())).mii_config.port_number channel = create_channel(self.host, self.port_number) self.stub = modelresponse_pb2_grpc.DeploymentManagementStub(channel) for replica in lb_config.replica_configs: - request_proto = modelresponse_pb2.AddDeployRequest(task=replica.task, - deployment_name=replica.deployment_name, - hostname=replica.hostname, - tensor_parallel_ports=replica.tensor_parallel_ports, - torch_dist_port=replica.torch_dist_port, - gpu_indices=replica.gpu_indices - ) + request_proto = modelresponse_pb2.AddDeployRequest( + task=replica.task, + deployment_name=replica.deployment_name, + hostname=replica.hostname, + tensor_parallel_ports=replica.tensor_parallel_ports, + torch_dist_port=replica.torch_dist_port, + gpu_indices=replica.gpu_indices) self.asyncio_loop.run_until_complete(self.add_models_async(request_proto)) + + class MIITensorParallelClient(): """ Client to send queries to multiple endpoints in parallel. 
@@ -325,7 +347,7 @@ def terminate(self): def terminate_restful_gateway(deployment_tag): deployments, _, _, _ = _get_deployment_configs(deployment_tag) for deployment in deployments.values(): - mii_configs_dict = deployment.mii_config + mii_configs = deployment.mii_config #mii_configs = mii.config.MIIConfig(**mii_configs_dict) if mii_configs.enable_restful_api: requests.get(f"http://localhost:{mii_configs.restful_api_port}/terminate") diff --git a/mii/deployment.py b/mii/deployment.py index bc71be48..54b8abce 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -126,7 +126,6 @@ def deploy(task=None, deployment.task, deployment.model) - if enable_deepspeed: logger.info( f"************* MII is using DeepSpeed Optimizations to accelerate your model: {deployment.model} *************" @@ -139,7 +138,7 @@ def deploy(task=None, deps = {deployment.deployment_name: deployment for deployment in deployments} # In local deployments use default path if no model path set - + # add fields for replica deployment port_map = {} lb_config, port_map = allocate_processes(deps, port_map) @@ -151,7 +150,7 @@ def deploy(task=None, model_path=model_path, port_map=port_map, lb_config=lb_config) - + if deployment_type == DeploymentType.AML: _deploy_aml(deployment_tag=deployment_tag, model_name=model, version=version) elif deployment_type == DeploymentType.LOCAL: @@ -171,6 +170,7 @@ def deploy(task=None, else: raise Exception(f"Unknown deployment type: {deployment_type}") + def allocate_processes(deployments, port_map): replica_configs = [] port_offset = 1 @@ -205,6 +205,7 @@ def allocate_processes(deployments, port_map): replica_configs=replica_configs) return lb_config, port_map + def _deploy_local(deployment_tag, model_path): mii.utils.import_score_file(deployment_tag).init() diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index 5c988bd7..5204779c 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -32,14 +32,17 @@ def Terminate(self, request, context): def get_stop_event(self): return self._stop_event -class DeploymentManagement(ServiceBase, modelresponse_pb2_grpc.DeploymentManagementServicer): + +class DeploymentManagement(ServiceBase, + modelresponse_pb2_grpc.DeploymentManagementServicer): def AddDeployment(self, request, context): print("DEPLOYMENT ADDED") return google_dot_protobuf_dot_empty__pb2.Empty() - + def DeleteDeployment(self, request, context): return google_dot_protobuf_dot_empty__pb2.Empty() + class ModelResponse(ServiceBase): """ Implementation class of an MII inference server @@ -212,14 +215,19 @@ def invoke_intercept_method(request_proto, context): task = str(getattr(request_proto, "task")) deployment_name = str(getattr(request_proto, "deployment_name")) hostname = str(getattr(request_proto, "hostname")) - tensor_parallel_ports = list(getattr(request_proto, "tensor_parallel_ports")) + tensor_parallel_ports = list( + getattr(request_proto, + "tensor_parallel_ports")) torch_dist_port = int(getattr(request_proto, "torch_dist_port")) gpu_indices = list(getattr(request_proto, "gpu_indices")) if deployment_name not in self.stubs: self.stubs[deployment_name] = [] self.counter[deployment_name] = AtomicCounter() self.tasks[deployment_name] = task - self.stubs[deployment_name].append(ParallelStubInvoker(hostname, tensor_parallel_ports, self.asyncio_loop)) + self.stubs[deployment_name].append( + ParallelStubInvoker(hostname, + tensor_parallel_ports, + self.asyncio_loop)) return 
google_dot_protobuf_dot_empty__pb2.Empty() if method_name == TERMINATE_METHOD: @@ -230,13 +238,13 @@ def invoke_intercept_method(request_proto, context): google_dot_protobuf_dot_empty__pb2.Empty()) self.asyncio_loop.call_soon_threadsafe(self.asyncio_loop.stop) return next_handler.unary_unary(request_proto, context) - + if method_name == DELETE_DEPLOYMENT_METHOD: deployment_name = str(getattr(request_proto, "deployment_name")) assert deployment_name in self.stubs, f"Deployment: {deployment_name} not found" for stub in self.stubs[deployment_name]: stub.invoke(TERMINATE_METHOD, - google_dot_protobuf_dot_empty__pb2.Empty()) + google_dot_protobuf_dot_empty__pb2.Empty()) del self.stubs[deployment_name] del self.counter[deployment_name] del self.tasks[deployment_name] @@ -290,7 +298,9 @@ def _do_serve(service_impl, port, interceptors=[], is_lb=False): ('grpc.max_receive_message_length', GRPC_MAX_MSG_SIZE)]) if is_lb: - modelresponse_pb2_grpc.add_DeploymentManagementServicer_to_server(service_impl, server) + modelresponse_pb2_grpc.add_DeploymentManagementServicer_to_server( + service_impl, + server) else: modelresponse_pb2_grpc.add_ModelResponseServicer_to_server(service_impl, server) server.add_insecure_port(f'[::]:{port}') diff --git a/mii/grpc_related/proto/modelresponse.proto b/mii/grpc_related/proto/modelresponse.proto index ad626810..7daf300a 100644 --- a/mii/grpc_related/proto/modelresponse.proto +++ b/mii/grpc_related/proto/modelresponse.proto @@ -132,7 +132,7 @@ message AddDeployRequest { repeated int64 tensor_parallel_ports = 4; int64 torch_dist_port = 5; repeated int64 gpu_indices = 6; - + } message DeleteDeployRequest { diff --git a/mii/grpc_related/proto/modelresponse_pb2.py b/mii/grpc_related/proto/modelresponse_pb2.py index e7abcc14..30c7a340 100644 --- a/mii/grpc_related/proto/modelresponse_pb2.py +++ b/mii/grpc_related/proto/modelresponse_pb2.py @@ -1,4 +1,7 @@ -# -*- coding: utf-8 -*- +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team # Generated by the protocol buffer compiler. DO NOT EDIT! 
# source: modelresponse.proto """Generated protocol buffer code.""" @@ -10,60 +13,60 @@ _sym_db = _symbol_database.Default() - from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 - -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x42\x0e\n\x0coneof_values\"\x1f\n\tSessionID\x12\x12\n\nsession_id\x18\x01 \x01(\t\"\xed\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\x85\x01\n\x11SingleStringReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x84\x01\n\x10MultiStringReply\x12\x10\n\x08response\x18\x01 \x03(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\tQARequest\x12\x10\n\x08question\x18\x01 \x01(\t\x12\x0f\n\x07\x63ontext\x18\x02 \x01(\t\x12?\n\x0cquery_kwargs\x18\x03 \x03(\x0b\x32).modelresponse.QARequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xd3\x02\n\x13\x43onversationRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x1c\n\x0f\x63onversation_id\x18\x02 \x01(\x03H\x00\x88\x01\x01\x12\x18\n\x10past_user_inputs\x18\x03 \x03(\t\x12\x1b\n\x13generated_responses\x18\x04 \x03(\t\x12I\n\x0cquery_kwargs\x18\x05 \x03(\x0b\x32\x33.modelresponse.ConversationRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x01\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_conversation_idB\x12\n\x10_deployment_name\"\xc3\x01\n\x11\x43onversationReply\x12\x17\n\x0f\x63onversation_id\x18\x01 \x01(\x03\x12\x18\n\x10past_user_inputs\x18\x02 \x03(\t\x12\x1b\n\x13generated_responses\x18\x03 \x03(\t\x12\x12\n\ntime_taken\x18\x04 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x05 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xaf\x01\n\nImageReply\x12\x0e\n\x06images\x18\x01 \x03(\x0c\x12\x1d\n\x15nsfw_content_detected\x18\x02 \x03(\x08\x12\x0c\n\x04mode\x18\x03 \x01(\t\x12\x0e\n\x06size_w\x18\x04 \x01(\x03\x12\x0e\n\x06size_h\x18\x05 
\x01(\x03\x12\x12\n\ntime_taken\x18\x06 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x07 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x98\x01\n\x10\x41\x64\x64\x44\x65ployRequest\x12\x0c\n\x04task\x18\x01 \x01(\t\x12\x17\n\x0f\x64\x65ployment_name\x18\x02 \x01(\t\x12\x10\n\x08hostname\x18\x03 \x01(\t\x12\x1d\n\x15tensor_parallel_ports\x18\x04 \x03(\x03\x12\x17\n\x0ftorch_dist_port\x18\x05 \x01(\x03\x12\x13\n\x0bgpu_indices\x18\x06 \x03(\x03\".\n\x13\x44\x65leteDeployRequest\x12\x17\n\x0f\x64\x65ployment_name\x18\x01 \x01(\t2\xd4\x06\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x32\xf9\x07\n\x14\x44\x65ploymentManagement\x12J\n\rAddDeployment\x12\x1f.modelresponse.AddDeployRequest\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x12P\n\x10\x44\x65leteDeployment\x12\".modelresponse.DeleteDeployRequest\x1a\x16.google.protobuf.Empty\"\x00\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x62\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( + b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x42\x0e\n\x0coneof_values\"\x1f\n\tSessionID\x12\x12\n\nsession_id\x18\x01 \x01(\t\"\xed\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 
\x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\x85\x01\n\x11SingleStringReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x84\x01\n\x10MultiStringReply\x12\x10\n\x08response\x18\x01 \x03(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\tQARequest\x12\x10\n\x08question\x18\x01 \x01(\t\x12\x0f\n\x07\x63ontext\x18\x02 \x01(\t\x12?\n\x0cquery_kwargs\x18\x03 \x03(\x0b\x32).modelresponse.QARequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xd3\x02\n\x13\x43onversationRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x1c\n\x0f\x63onversation_id\x18\x02 \x01(\x03H\x00\x88\x01\x01\x12\x18\n\x10past_user_inputs\x18\x03 \x03(\t\x12\x1b\n\x13generated_responses\x18\x04 \x03(\t\x12I\n\x0cquery_kwargs\x18\x05 \x03(\x0b\x32\x33.modelresponse.ConversationRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x01\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_conversation_idB\x12\n\x10_deployment_name\"\xc3\x01\n\x11\x43onversationReply\x12\x17\n\x0f\x63onversation_id\x18\x01 \x01(\x03\x12\x18\n\x10past_user_inputs\x18\x02 \x03(\t\x12\x1b\n\x13generated_responses\x18\x03 \x03(\t\x12\x12\n\ntime_taken\x18\x04 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x05 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xaf\x01\n\nImageReply\x12\x0e\n\x06images\x18\x01 \x03(\x0c\x12\x1d\n\x15nsfw_content_detected\x18\x02 \x03(\x08\x12\x0c\n\x04mode\x18\x03 \x01(\t\x12\x0e\n\x06size_w\x18\x04 \x01(\x03\x12\x0e\n\x06size_h\x18\x05 \x01(\x03\x12\x12\n\ntime_taken\x18\x06 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x07 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x98\x01\n\x10\x41\x64\x64\x44\x65ployRequest\x12\x0c\n\x04task\x18\x01 \x01(\t\x12\x17\n\x0f\x64\x65ployment_name\x18\x02 \x01(\t\x12\x10\n\x08hostname\x18\x03 \x01(\t\x12\x1d\n\x15tensor_parallel_ports\x18\x04 \x03(\x03\x12\x17\n\x0ftorch_dist_port\x18\x05 \x01(\x03\x12\x13\n\x0bgpu_indices\x18\x06 \x03(\x03\".\n\x13\x44\x65leteDeployRequest\x12\x17\n\x0f\x64\x65ployment_name\x18\x01 
\x01(\t2\xd4\x06\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x32\xf9\x07\n\x14\x44\x65ploymentManagement\x12J\n\rAddDeployment\x12\x1f.modelresponse.AddDeployRequest\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x12P\n\x10\x44\x65leteDeployment\x12\".modelresponse.DeleteDeployRequest\x1a\x16.google.protobuf.Empty\"\x00\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x62\x06proto3' +) _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'modelresponse_pb2', _globals) if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None - _SINGLESTRINGREQUEST_QUERYKWARGSENTRY._options = None - _SINGLESTRINGREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' - _MULTISTRINGREQUEST_QUERYKWARGSENTRY._options = None - _MULTISTRINGREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' - _QAREQUEST_QUERYKWARGSENTRY._options = None - _QAREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' - _CONVERSATIONREQUEST_QUERYKWARGSENTRY._options = None - _CONVERSATIONREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' - _globals['_VALUE']._serialized_start=67 - _globals['_VALUE']._serialized_end=162 - _globals['_SESSIONID']._serialized_start=164 - _globals['_SESSIONID']._serialized_end=195 - _globals['_SINGLESTRINGREQUEST']._serialized_start=198 - _globals['_SINGLESTRINGREQUEST']._serialized_end=435 - _globals['_SINGLESTRINGREQUEST_QUERYKWARGSENTRY']._serialized_start=343 - 
_globals['_SINGLESTRINGREQUEST_QUERYKWARGSENTRY']._serialized_end=415 - _globals['_MULTISTRINGREQUEST']._serialized_start=438 - _globals['_MULTISTRINGREQUEST']._serialized_end=673 - _globals['_MULTISTRINGREQUEST_QUERYKWARGSENTRY']._serialized_start=343 - _globals['_MULTISTRINGREQUEST_QUERYKWARGSENTRY']._serialized_end=415 - _globals['_SINGLESTRINGREPLY']._serialized_start=676 - _globals['_SINGLESTRINGREPLY']._serialized_end=809 - _globals['_MULTISTRINGREPLY']._serialized_start=812 - _globals['_MULTISTRINGREPLY']._serialized_end=944 - _globals['_QAREQUEST']._serialized_start=947 - _globals['_QAREQUEST']._serialized_end=1182 - _globals['_QAREQUEST_QUERYKWARGSENTRY']._serialized_start=343 - _globals['_QAREQUEST_QUERYKWARGSENTRY']._serialized_end=415 - _globals['_CONVERSATIONREQUEST']._serialized_start=1185 - _globals['_CONVERSATIONREQUEST']._serialized_end=1524 - _globals['_CONVERSATIONREQUEST_QUERYKWARGSENTRY']._serialized_start=343 - _globals['_CONVERSATIONREQUEST_QUERYKWARGSENTRY']._serialized_end=415 - _globals['_CONVERSATIONREPLY']._serialized_start=1527 - _globals['_CONVERSATIONREPLY']._serialized_end=1722 - _globals['_IMAGEREPLY']._serialized_start=1725 - _globals['_IMAGEREPLY']._serialized_end=1900 - _globals['_ADDDEPLOYREQUEST']._serialized_start=1903 - _globals['_ADDDEPLOYREQUEST']._serialized_end=2055 - _globals['_DELETEDEPLOYREQUEST']._serialized_start=2057 - _globals['_DELETEDEPLOYREQUEST']._serialized_end=2103 - _globals['_MODELRESPONSE']._serialized_start=2106 - _globals['_MODELRESPONSE']._serialized_end=2958 - _globals['_DEPLOYMENTMANAGEMENT']._serialized_start=2961 - _globals['_DEPLOYMENTMANAGEMENT']._serialized_end=3978 + DESCRIPTOR._options = None + _SINGLESTRINGREQUEST_QUERYKWARGSENTRY._options = None + _SINGLESTRINGREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' + _MULTISTRINGREQUEST_QUERYKWARGSENTRY._options = None + _MULTISTRINGREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' + _QAREQUEST_QUERYKWARGSENTRY._options = None + _QAREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' + _CONVERSATIONREQUEST_QUERYKWARGSENTRY._options = None + _CONVERSATIONREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' + _globals['_VALUE']._serialized_start = 67 + _globals['_VALUE']._serialized_end = 162 + _globals['_SESSIONID']._serialized_start = 164 + _globals['_SESSIONID']._serialized_end = 195 + _globals['_SINGLESTRINGREQUEST']._serialized_start = 198 + _globals['_SINGLESTRINGREQUEST']._serialized_end = 435 + _globals['_SINGLESTRINGREQUEST_QUERYKWARGSENTRY']._serialized_start = 343 + _globals['_SINGLESTRINGREQUEST_QUERYKWARGSENTRY']._serialized_end = 415 + _globals['_MULTISTRINGREQUEST']._serialized_start = 438 + _globals['_MULTISTRINGREQUEST']._serialized_end = 673 + _globals['_MULTISTRINGREQUEST_QUERYKWARGSENTRY']._serialized_start = 343 + _globals['_MULTISTRINGREQUEST_QUERYKWARGSENTRY']._serialized_end = 415 + _globals['_SINGLESTRINGREPLY']._serialized_start = 676 + _globals['_SINGLESTRINGREPLY']._serialized_end = 809 + _globals['_MULTISTRINGREPLY']._serialized_start = 812 + _globals['_MULTISTRINGREPLY']._serialized_end = 944 + _globals['_QAREQUEST']._serialized_start = 947 + _globals['_QAREQUEST']._serialized_end = 1182 + _globals['_QAREQUEST_QUERYKWARGSENTRY']._serialized_start = 343 + _globals['_QAREQUEST_QUERYKWARGSENTRY']._serialized_end = 415 + _globals['_CONVERSATIONREQUEST']._serialized_start = 1185 + _globals['_CONVERSATIONREQUEST']._serialized_end = 1524 + _globals['_CONVERSATIONREQUEST_QUERYKWARGSENTRY']._serialized_start = 343 + 
_globals['_CONVERSATIONREQUEST_QUERYKWARGSENTRY']._serialized_end = 415 + _globals['_CONVERSATIONREPLY']._serialized_start = 1527 + _globals['_CONVERSATIONREPLY']._serialized_end = 1722 + _globals['_IMAGEREPLY']._serialized_start = 1725 + _globals['_IMAGEREPLY']._serialized_end = 1900 + _globals['_ADDDEPLOYREQUEST']._serialized_start = 1903 + _globals['_ADDDEPLOYREQUEST']._serialized_end = 2055 + _globals['_DELETEDEPLOYREQUEST']._serialized_start = 2057 + _globals['_DELETEDEPLOYREQUEST']._serialized_end = 2103 + _globals['_MODELRESPONSE']._serialized_start = 2106 + _globals['_MODELRESPONSE']._serialized_end = 2958 + _globals['_DEPLOYMENTMANAGEMENT']._serialized_start = 2961 + _globals['_DEPLOYMENTMANAGEMENT']._serialized_end = 3978 # @@protoc_insertion_point(module_scope) diff --git a/mii/grpc_related/proto/modelresponse_pb2_grpc.py b/mii/grpc_related/proto/modelresponse_pb2_grpc.py index 9c3ce85d..49393660 100644 --- a/mii/grpc_related/proto/modelresponse_pb2_grpc.py +++ b/mii/grpc_related/proto/modelresponse_pb2_grpc.py @@ -1,3 +1,7 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! """Client and server classes corresponding to protobuf-defined services.""" import grpc @@ -8,7 +12,6 @@ class ModelResponseStub(object): """Missing associated documentation comment in .proto file.""" - def __init__(self, channel): """Constructor. @@ -16,60 +19,60 @@ def __init__(self, channel): channel: A grpc.Channel. """ self.Terminate = channel.unary_unary( - '/modelresponse.ModelResponse/Terminate', - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - ) + '/modelresponse.ModelResponse/Terminate', + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty. 
+ SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) self.CreateSession = channel.unary_unary( - '/modelresponse.ModelResponse/CreateSession', - request_serializer=modelresponse__pb2.SessionID.SerializeToString, - response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - ) + '/modelresponse.ModelResponse/CreateSession', + request_serializer=modelresponse__pb2.SessionID.SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) self.DestroySession = channel.unary_unary( - '/modelresponse.ModelResponse/DestroySession', - request_serializer=modelresponse__pb2.SessionID.SerializeToString, - response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - ) + '/modelresponse.ModelResponse/DestroySession', + request_serializer=modelresponse__pb2.SessionID.SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) self.GeneratorReply = channel.unary_unary( - '/modelresponse.ModelResponse/GeneratorReply', - request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.MultiStringReply.FromString, - ) + '/modelresponse.ModelResponse/GeneratorReply', + request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.MultiStringReply.FromString, + ) self.ClassificationReply = channel.unary_unary( - '/modelresponse.ModelResponse/ClassificationReply', - request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) + '/modelresponse.ModelResponse/ClassificationReply', + request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) self.QuestionAndAnswerReply = channel.unary_unary( - '/modelresponse.ModelResponse/QuestionAndAnswerReply', - request_serializer=modelresponse__pb2.QARequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) + '/modelresponse.ModelResponse/QuestionAndAnswerReply', + request_serializer=modelresponse__pb2.QARequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) self.FillMaskReply = channel.unary_unary( - '/modelresponse.ModelResponse/FillMaskReply', - request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) + '/modelresponse.ModelResponse/FillMaskReply', + request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) self.TokenClassificationReply = channel.unary_unary( - '/modelresponse.ModelResponse/TokenClassificationReply', - request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) + '/modelresponse.ModelResponse/TokenClassificationReply', + request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) self.ConversationalReply = channel.unary_unary( - '/modelresponse.ModelResponse/ConversationalReply', - request_serializer=modelresponse__pb2.ConversationRequest.SerializeToString, - response_deserializer=modelresponse__pb2.ConversationReply.FromString, - ) + 
'/modelresponse.ModelResponse/ConversationalReply', + request_serializer=modelresponse__pb2.ConversationRequest.SerializeToString, + response_deserializer=modelresponse__pb2.ConversationReply.FromString, + ) self.Txt2ImgReply = channel.unary_unary( - '/modelresponse.ModelResponse/Txt2ImgReply', - request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.ImageReply.FromString, - ) + '/modelresponse.ModelResponse/Txt2ImgReply', + request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.ImageReply.FromString, + ) class ModelResponseServicer(object): """Missing associated documentation comment in .proto file.""" - def Terminate(self, request, context): """Missing associated documentation comment in .proto file.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) @@ -133,240 +136,341 @@ def Txt2ImgReply(self, request, context): def add_ModelResponseServicer_to_server(servicer, server): rpc_method_handlers = { - 'Terminate': grpc.unary_unary_rpc_method_handler( - servicer.Terminate, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - ), - 'CreateSession': grpc.unary_unary_rpc_method_handler( - servicer.CreateSession, - request_deserializer=modelresponse__pb2.SessionID.FromString, - response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - ), - 'DestroySession': grpc.unary_unary_rpc_method_handler( - servicer.DestroySession, - request_deserializer=modelresponse__pb2.SessionID.FromString, - response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - ), - 'GeneratorReply': grpc.unary_unary_rpc_method_handler( - servicer.GeneratorReply, - request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, - response_serializer=modelresponse__pb2.MultiStringReply.SerializeToString, - ), - 'ClassificationReply': grpc.unary_unary_rpc_method_handler( - servicer.ClassificationReply, - request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'QuestionAndAnswerReply': grpc.unary_unary_rpc_method_handler( - servicer.QuestionAndAnswerReply, - request_deserializer=modelresponse__pb2.QARequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'FillMaskReply': grpc.unary_unary_rpc_method_handler( - servicer.FillMaskReply, - request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'TokenClassificationReply': grpc.unary_unary_rpc_method_handler( - servicer.TokenClassificationReply, - request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'ConversationalReply': grpc.unary_unary_rpc_method_handler( - servicer.ConversationalReply, - request_deserializer=modelresponse__pb2.ConversationRequest.FromString, - response_serializer=modelresponse__pb2.ConversationReply.SerializeToString, - ), - 'Txt2ImgReply': grpc.unary_unary_rpc_method_handler( - servicer.Txt2ImgReply, - request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, - response_serializer=modelresponse__pb2.ImageReply.SerializeToString, - ), + 'Terminate': + grpc.unary_unary_rpc_method_handler( + servicer.Terminate, + 
request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. + SerializeToString, + ), + 'CreateSession': + grpc.unary_unary_rpc_method_handler( + servicer.CreateSession, + request_deserializer=modelresponse__pb2.SessionID.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. + SerializeToString, + ), + 'DestroySession': + grpc.unary_unary_rpc_method_handler( + servicer.DestroySession, + request_deserializer=modelresponse__pb2.SessionID.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. + SerializeToString, + ), + 'GeneratorReply': + grpc.unary_unary_rpc_method_handler( + servicer.GeneratorReply, + request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, + response_serializer=modelresponse__pb2.MultiStringReply.SerializeToString, + ), + 'ClassificationReply': + grpc.unary_unary_rpc_method_handler( + servicer.ClassificationReply, + request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'QuestionAndAnswerReply': + grpc.unary_unary_rpc_method_handler( + servicer.QuestionAndAnswerReply, + request_deserializer=modelresponse__pb2.QARequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'FillMaskReply': + grpc.unary_unary_rpc_method_handler( + servicer.FillMaskReply, + request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'TokenClassificationReply': + grpc.unary_unary_rpc_method_handler( + servicer.TokenClassificationReply, + request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'ConversationalReply': + grpc.unary_unary_rpc_method_handler( + servicer.ConversationalReply, + request_deserializer=modelresponse__pb2.ConversationRequest.FromString, + response_serializer=modelresponse__pb2.ConversationReply.SerializeToString, + ), + 'Txt2ImgReply': + grpc.unary_unary_rpc_method_handler( + servicer.Txt2ImgReply, + request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, + response_serializer=modelresponse__pb2.ImageReply.SerializeToString, + ), } - generic_handler = grpc.method_handlers_generic_handler( - 'modelresponse.ModelResponse', rpc_method_handlers) - server.add_generic_rpc_handlers((generic_handler,)) + generic_handler = grpc.method_handlers_generic_handler('modelresponse.ModelResponse', + rpc_method_handlers) + server.add_generic_rpc_handlers((generic_handler, )) - # This class is part of an EXPERIMENTAL API. +# This class is part of an EXPERIMENTAL API. 
class ModelResponse(object): """Missing associated documentation comment in .proto file.""" - @staticmethod def Terminate(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/Terminate', + '/modelresponse.ModelResponse/Terminate', google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, google_dot_protobuf_dot_empty__pb2.Empty.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def CreateSession(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/CreateSession', + '/modelresponse.ModelResponse/CreateSession', modelresponse__pb2.SessionID.SerializeToString, google_dot_protobuf_dot_empty__pb2.Empty.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def DestroySession(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/DestroySession', + '/modelresponse.ModelResponse/DestroySession', modelresponse__pb2.SessionID.SerializeToString, google_dot_protobuf_dot_empty__pb2.Empty.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def GeneratorReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/GeneratorReply', + '/modelresponse.ModelResponse/GeneratorReply', modelresponse__pb2.MultiStringRequest.SerializeToString, modelresponse__pb2.MultiStringReply.FromString, - options, 
channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def ClassificationReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/ClassificationReply', + '/modelresponse.ModelResponse/ClassificationReply', modelresponse__pb2.SingleStringRequest.SerializeToString, modelresponse__pb2.SingleStringReply.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def QuestionAndAnswerReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/QuestionAndAnswerReply', + '/modelresponse.ModelResponse/QuestionAndAnswerReply', modelresponse__pb2.QARequest.SerializeToString, modelresponse__pb2.SingleStringReply.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def FillMaskReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/FillMaskReply', + '/modelresponse.ModelResponse/FillMaskReply', modelresponse__pb2.SingleStringRequest.SerializeToString, modelresponse__pb2.SingleStringReply.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def TokenClassificationReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, 
'/modelresponse.ModelResponse/TokenClassificationReply', + '/modelresponse.ModelResponse/TokenClassificationReply', modelresponse__pb2.SingleStringRequest.SerializeToString, modelresponse__pb2.SingleStringReply.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def ConversationalReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/ConversationalReply', + '/modelresponse.ModelResponse/ConversationalReply', modelresponse__pb2.ConversationRequest.SerializeToString, modelresponse__pb2.ConversationReply.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def Txt2ImgReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.ModelResponse/Txt2ImgReply', + '/modelresponse.ModelResponse/Txt2ImgReply', modelresponse__pb2.MultiStringRequest.SerializeToString, modelresponse__pb2.ImageReply.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) class DeploymentManagementStub(object): """Missing associated documentation comment in .proto file.""" - def __init__(self, channel): """Constructor. @@ -374,70 +478,70 @@ def __init__(self, channel): channel: A grpc.Channel. 
""" self.AddDeployment = channel.unary_unary( - '/modelresponse.DeploymentManagement/AddDeployment', - request_serializer=modelresponse__pb2.AddDeployRequest.SerializeToString, - response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - ) + '/modelresponse.DeploymentManagement/AddDeployment', + request_serializer=modelresponse__pb2.AddDeployRequest.SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) self.CreateSession = channel.unary_unary( - '/modelresponse.DeploymentManagement/CreateSession', - request_serializer=modelresponse__pb2.SessionID.SerializeToString, - response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - ) + '/modelresponse.DeploymentManagement/CreateSession', + request_serializer=modelresponse__pb2.SessionID.SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) self.DestroySession = channel.unary_unary( - '/modelresponse.DeploymentManagement/DestroySession', - request_serializer=modelresponse__pb2.SessionID.SerializeToString, - response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - ) + '/modelresponse.DeploymentManagement/DestroySession', + request_serializer=modelresponse__pb2.SessionID.SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) self.GeneratorReply = channel.unary_unary( - '/modelresponse.DeploymentManagement/GeneratorReply', - request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.MultiStringReply.FromString, - ) + '/modelresponse.DeploymentManagement/GeneratorReply', + request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.MultiStringReply.FromString, + ) self.ClassificationReply = channel.unary_unary( - '/modelresponse.DeploymentManagement/ClassificationReply', - request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) + '/modelresponse.DeploymentManagement/ClassificationReply', + request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) self.QuestionAndAnswerReply = channel.unary_unary( - '/modelresponse.DeploymentManagement/QuestionAndAnswerReply', - request_serializer=modelresponse__pb2.QARequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) + '/modelresponse.DeploymentManagement/QuestionAndAnswerReply', + request_serializer=modelresponse__pb2.QARequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) self.FillMaskReply = channel.unary_unary( - '/modelresponse.DeploymentManagement/FillMaskReply', - request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) + '/modelresponse.DeploymentManagement/FillMaskReply', + request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) self.TokenClassificationReply = channel.unary_unary( - '/modelresponse.DeploymentManagement/TokenClassificationReply', - request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) + 
'/modelresponse.DeploymentManagement/TokenClassificationReply', + request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.SingleStringReply.FromString, + ) self.ConversationalReply = channel.unary_unary( - '/modelresponse.DeploymentManagement/ConversationalReply', - request_serializer=modelresponse__pb2.ConversationRequest.SerializeToString, - response_deserializer=modelresponse__pb2.ConversationReply.FromString, - ) + '/modelresponse.DeploymentManagement/ConversationalReply', + request_serializer=modelresponse__pb2.ConversationRequest.SerializeToString, + response_deserializer=modelresponse__pb2.ConversationReply.FromString, + ) self.Txt2ImgReply = channel.unary_unary( - '/modelresponse.DeploymentManagement/Txt2ImgReply', - request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.ImageReply.FromString, - ) + '/modelresponse.DeploymentManagement/Txt2ImgReply', + request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, + response_deserializer=modelresponse__pb2.ImageReply.FromString, + ) self.DeleteDeployment = channel.unary_unary( - '/modelresponse.DeploymentManagement/DeleteDeployment', - request_serializer=modelresponse__pb2.DeleteDeployRequest.SerializeToString, - response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - ) + '/modelresponse.DeploymentManagement/DeleteDeployment', + request_serializer=modelresponse__pb2.DeleteDeployRequest.SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) self.Terminate = channel.unary_unary( - '/modelresponse.DeploymentManagement/Terminate', - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - ) + '/modelresponse.DeploymentManagement/Terminate', + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty. 
+ SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) class DeploymentManagementServicer(object): """Missing associated documentation comment in .proto file.""" - def AddDeployment(self, request, context): """Missing associated documentation comment in .proto file.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) @@ -513,276 +617,401 @@ def Terminate(self, request, context): def add_DeploymentManagementServicer_to_server(servicer, server): rpc_method_handlers = { - 'AddDeployment': grpc.unary_unary_rpc_method_handler( - servicer.AddDeployment, - request_deserializer=modelresponse__pb2.AddDeployRequest.FromString, - response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - ), - 'CreateSession': grpc.unary_unary_rpc_method_handler( - servicer.CreateSession, - request_deserializer=modelresponse__pb2.SessionID.FromString, - response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - ), - 'DestroySession': grpc.unary_unary_rpc_method_handler( - servicer.DestroySession, - request_deserializer=modelresponse__pb2.SessionID.FromString, - response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - ), - 'GeneratorReply': grpc.unary_unary_rpc_method_handler( - servicer.GeneratorReply, - request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, - response_serializer=modelresponse__pb2.MultiStringReply.SerializeToString, - ), - 'ClassificationReply': grpc.unary_unary_rpc_method_handler( - servicer.ClassificationReply, - request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'QuestionAndAnswerReply': grpc.unary_unary_rpc_method_handler( - servicer.QuestionAndAnswerReply, - request_deserializer=modelresponse__pb2.QARequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'FillMaskReply': grpc.unary_unary_rpc_method_handler( - servicer.FillMaskReply, - request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'TokenClassificationReply': grpc.unary_unary_rpc_method_handler( - servicer.TokenClassificationReply, - request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'ConversationalReply': grpc.unary_unary_rpc_method_handler( - servicer.ConversationalReply, - request_deserializer=modelresponse__pb2.ConversationRequest.FromString, - response_serializer=modelresponse__pb2.ConversationReply.SerializeToString, - ), - 'Txt2ImgReply': grpc.unary_unary_rpc_method_handler( - servicer.Txt2ImgReply, - request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, - response_serializer=modelresponse__pb2.ImageReply.SerializeToString, - ), - 'DeleteDeployment': grpc.unary_unary_rpc_method_handler( - servicer.DeleteDeployment, - request_deserializer=modelresponse__pb2.DeleteDeployRequest.FromString, - response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - ), - 'Terminate': grpc.unary_unary_rpc_method_handler( - servicer.Terminate, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - response_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - ), + 'AddDeployment': + grpc.unary_unary_rpc_method_handler( + servicer.AddDeployment, + 
request_deserializer=modelresponse__pb2.AddDeployRequest.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. + SerializeToString, + ), + 'CreateSession': + grpc.unary_unary_rpc_method_handler( + servicer.CreateSession, + request_deserializer=modelresponse__pb2.SessionID.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. + SerializeToString, + ), + 'DestroySession': + grpc.unary_unary_rpc_method_handler( + servicer.DestroySession, + request_deserializer=modelresponse__pb2.SessionID.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. + SerializeToString, + ), + 'GeneratorReply': + grpc.unary_unary_rpc_method_handler( + servicer.GeneratorReply, + request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, + response_serializer=modelresponse__pb2.MultiStringReply.SerializeToString, + ), + 'ClassificationReply': + grpc.unary_unary_rpc_method_handler( + servicer.ClassificationReply, + request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'QuestionAndAnswerReply': + grpc.unary_unary_rpc_method_handler( + servicer.QuestionAndAnswerReply, + request_deserializer=modelresponse__pb2.QARequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'FillMaskReply': + grpc.unary_unary_rpc_method_handler( + servicer.FillMaskReply, + request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'TokenClassificationReply': + grpc.unary_unary_rpc_method_handler( + servicer.TokenClassificationReply, + request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, + response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, + ), + 'ConversationalReply': + grpc.unary_unary_rpc_method_handler( + servicer.ConversationalReply, + request_deserializer=modelresponse__pb2.ConversationRequest.FromString, + response_serializer=modelresponse__pb2.ConversationReply.SerializeToString, + ), + 'Txt2ImgReply': + grpc.unary_unary_rpc_method_handler( + servicer.Txt2ImgReply, + request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, + response_serializer=modelresponse__pb2.ImageReply.SerializeToString, + ), + 'DeleteDeployment': + grpc.unary_unary_rpc_method_handler( + servicer.DeleteDeployment, + request_deserializer=modelresponse__pb2.DeleteDeployRequest.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. + SerializeToString, + ), + 'Terminate': + grpc.unary_unary_rpc_method_handler( + servicer.Terminate, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. + SerializeToString, + ), } generic_handler = grpc.method_handlers_generic_handler( - 'modelresponse.DeploymentManagement', rpc_method_handlers) - server.add_generic_rpc_handlers((generic_handler,)) + 'modelresponse.DeploymentManagement', + rpc_method_handlers) + server.add_generic_rpc_handlers((generic_handler, )) - # This class is part of an EXPERIMENTAL API. +# This class is part of an EXPERIMENTAL API. 
class DeploymentManagement(object): """Missing associated documentation comment in .proto file.""" - @staticmethod def AddDeployment(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/AddDeployment', + '/modelresponse.DeploymentManagement/AddDeployment', modelresponse__pb2.AddDeployRequest.SerializeToString, google_dot_protobuf_dot_empty__pb2.Empty.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def CreateSession(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/CreateSession', + '/modelresponse.DeploymentManagement/CreateSession', modelresponse__pb2.SessionID.SerializeToString, google_dot_protobuf_dot_empty__pb2.Empty.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def DestroySession(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/DestroySession', + '/modelresponse.DeploymentManagement/DestroySession', modelresponse__pb2.SessionID.SerializeToString, google_dot_protobuf_dot_empty__pb2.Empty.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def GeneratorReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/GeneratorReply', + '/modelresponse.DeploymentManagement/GeneratorReply', 
modelresponse__pb2.MultiStringRequest.SerializeToString, modelresponse__pb2.MultiStringReply.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def ClassificationReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/ClassificationReply', + '/modelresponse.DeploymentManagement/ClassificationReply', modelresponse__pb2.SingleStringRequest.SerializeToString, modelresponse__pb2.SingleStringReply.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def QuestionAndAnswerReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/QuestionAndAnswerReply', + '/modelresponse.DeploymentManagement/QuestionAndAnswerReply', modelresponse__pb2.QARequest.SerializeToString, modelresponse__pb2.SingleStringReply.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def FillMaskReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/FillMaskReply', + '/modelresponse.DeploymentManagement/FillMaskReply', modelresponse__pb2.SingleStringRequest.SerializeToString, modelresponse__pb2.SingleStringReply.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def TokenClassificationReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - 
compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/TokenClassificationReply', + '/modelresponse.DeploymentManagement/TokenClassificationReply', modelresponse__pb2.SingleStringRequest.SerializeToString, modelresponse__pb2.SingleStringReply.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def ConversationalReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/ConversationalReply', + '/modelresponse.DeploymentManagement/ConversationalReply', modelresponse__pb2.ConversationRequest.SerializeToString, modelresponse__pb2.ConversationReply.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def Txt2ImgReply(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/Txt2ImgReply', + '/modelresponse.DeploymentManagement/Txt2ImgReply', modelresponse__pb2.MultiStringRequest.SerializeToString, modelresponse__pb2.ImageReply.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def DeleteDeployment(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/DeleteDeployment', + '/modelresponse.DeploymentManagement/DeleteDeployment', modelresponse__pb2.DeleteDeployRequest.SerializeToString, google_dot_protobuf_dot_empty__pb2.Empty.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) @staticmethod def Terminate(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + 
insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/modelresponse.DeploymentManagement/Terminate', + '/modelresponse.DeploymentManagement/Terminate', google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, google_dot_protobuf_dot_empty__pb2.Empty.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) diff --git a/mii/models/score/generate.py b/mii/models/score/generate.py index 55f63046..b87218ad 100644 --- a/mii/models/score/generate.py +++ b/mii/models/score/generate.py @@ -22,7 +22,7 @@ def create_score_file(deployment_tag, config_dict[mii.constants.DEPLOYMENTS_KEY] = {} if port_map is not None: config_dict[mii.constants.PORT_MAP_KEY] = port_map - + if deployments is not None: for deployment in deployments.values(): deployment_config = { @@ -35,7 +35,8 @@ def create_score_file(deployment_tag, mii.constants.DEEPSPEED_CONFIG_KEY: deployment.ds_config, mii.constants.DEPLOYED_KEY: deployment.deployed, } - config_dict[mii.constants.DEPLOYMENTS_KEY][deployment.deployment_name] = deployment_config + config_dict[mii.constants.DEPLOYMENTS_KEY][ + deployment.deployment_name] = deployment_config if lb_config is not None: config_dict[mii.constants.LOAD_BALANCER_CONFIG_KEY] = lb_config diff --git a/mii/server.py b/mii/server.py index e7f1360e..5bb1b83a 100644 --- a/mii/server.py +++ b/mii/server.py @@ -28,7 +28,12 @@ def config_to_b64_str(config): class MIIServer(): '''Initialize the model, setup the server for the model under model_path''' - def __init__(self, deployment_tag, deployments, model_path, lb_config=None, lb_enabled=False): + def __init__(self, + deployment_tag, + deployments, + model_path, + lb_config=None, + lb_enabled=False): if len(deployments) > 0: self.lb_enabled = lb_enabled self.deployments = deployments @@ -43,11 +48,12 @@ def __init__(self, deployment_tag, deployments, model_path, lb_config=None, lb_e f.write(f"localhost slots={num_gpu}") mii.configs.hostfile = hostfile - processes = self._initialize_service(deployment_tag, - deployments, - model_path, - lb_config, - ) + processes = self._initialize_service( + deployment_tag, + deployments, + model_path, + lb_config, + ) self._wait_until_server_is_live(processes, lb_config.replica_configs) def _wait_until_server_is_live(self, processes, deployment): From a4e3d56f1d303cfaeb98aa1873ee12b71f6ebf11 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Tue, 25 Jul 2023 22:33:27 +0000 Subject: [PATCH 53/69] fixing bug for partial termination --- mii/grpc_related/modelresponse_server.py | 27 ++++++++++++------------ 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index 5204779c..c4b49a21 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -212,26 +212,25 @@ def intercept_service(self, continuation, handler_call_details): def invoke_intercept_method(request_proto, context): method_name = _get_grpc_method_name(handler_call_details.method) if method_name == ADD_DEPLOYMENT_METHOD: - task = 
str(getattr(request_proto, "task")) deployment_name = str(getattr(request_proto, "deployment_name")) - hostname = str(getattr(request_proto, "hostname")) - tensor_parallel_ports = list( - getattr(request_proto, - "tensor_parallel_ports")) - torch_dist_port = int(getattr(request_proto, "torch_dist_port")) - gpu_indices = list(getattr(request_proto, "gpu_indices")) if deployment_name not in self.stubs: + task = str(getattr(request_proto, "task")) + hostname = str(getattr(request_proto, "hostname")) + tensor_parallel_ports = list( + getattr(request_proto, + "tensor_parallel_ports")) + torch_dist_port = int(getattr(request_proto, "torch_dist_port")) + gpu_indices = list(getattr(request_proto, "gpu_indices")) self.stubs[deployment_name] = [] - self.counter[deployment_name] = AtomicCounter() - self.tasks[deployment_name] = task - self.stubs[deployment_name].append( - ParallelStubInvoker(hostname, - tensor_parallel_ports, - self.asyncio_loop)) + self.counter[deployment_name] = AtomicCounter() + self.tasks[deployment_name] = task + self.stubs[deployment_name].append( + ParallelStubInvoker(hostname, + tensor_parallel_ports, + self.asyncio_loop)) return google_dot_protobuf_dot_empty__pb2.Empty() if method_name == TERMINATE_METHOD: - print(self.stubs.keys()) for deployment_name in self.stubs: for stub in self.stubs[deployment_name]: stub.invoke(TERMINATE_METHOD, From 4b5bb47235cf4cd1a5c9aa0eb6aac6b675f771e8 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Tue, 25 Jul 2023 22:58:18 +0000 Subject: [PATCH 54/69] Removing comments --- mii/client.py | 9 +++------ mii/grpc_related/restful_gateway.py | 2 -- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/mii/client.py b/mii/client.py index f937d69d..83e39681 100644 --- a/mii/client.py +++ b/mii/client.py @@ -57,11 +57,10 @@ def mii_query_handle(deployment_tag): return MIINonPersistentClient(task, deployment_tag) deployments, lb_config, model_path, port_map = _get_deployment_configs(deployment_tag) - mii_configs_dict = None + mii_configs = None if len(deployments) > 0: - mii_configs_dict = next(iter(deployments.values())).mii_config - #mii_configs = mii.config.MIIConfig(**mii_configs_dict) - port_number = None if mii_configs_dict == None else mii_configs_dict.port_number + mii_configs = next(iter(deployments.values())).mii_config + port_number = None if mii_configs == None else mii_configs.port_number return MIIClient(deployments, "localhost", @@ -223,7 +222,6 @@ def add_models(self, self.lb_config = lb_config for deployment in deployments: self.deployments[deployment.deployment_name] = deployment - #self.deployments.extend(deployments) if self.model_path is None and deployment_type == DeploymentType.LOCAL: self.model_path = mii.constants.MII_MODEL_PATH_DEFAULT elif self.model_path is None and deployment_type == DeploymentType.AML: @@ -348,6 +346,5 @@ def terminate_restful_gateway(deployment_tag): deployments, _, _, _ = _get_deployment_configs(deployment_tag) for deployment in deployments.values(): mii_configs = deployment.mii_config - #mii_configs = mii.config.MIIConfig(**mii_configs_dict) if mii_configs.enable_restful_api: requests.get(f"http://localhost:{mii_configs.restful_api_port}/terminate") diff --git a/mii/grpc_related/restful_gateway.py b/mii/grpc_related/restful_gateway.py index d3dc53da..f4302d45 100644 --- a/mii/grpc_related/restful_gateway.py +++ b/mii/grpc_related/restful_gateway.py @@ -21,8 +21,6 @@ def createRestfulGatewayApp(deployment_name, task, mii_config, server_thread): # client must be thread-safe client = 
mii.mii_query_handle(deployment_name) - #client = mii.MIIClient(deployment_name, "localhost", mii_config.port_number) - class RestfulGatewayService(Resource): def __init__(self): super().__init__() From 30d2b03ccf8f83289bce561c0cee0b19f6f091cb Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Wed, 26 Jul 2023 00:41:34 +0000 Subject: [PATCH 55/69] Including GPU index map in score file --- examples/multi_model/query.py | 4 ++++ mii/client.py | 4 ++-- mii/constants.py | 2 +- mii/models/score/generate.py | 1 + mii/models/score/score_template.py | 2 +- 5 files changed, 9 insertions(+), 4 deletions(-) diff --git a/examples/multi_model/query.py b/examples/multi_model/query.py index bf760b49..f506830f 100644 --- a/examples/multi_model/query.py +++ b/examples/multi_model/query.py @@ -17,6 +17,7 @@ max_new_tokens=30, ) results.append(result) +print(result) result = generator.query({ 'query': @@ -25,6 +26,7 @@ "microsoft/DialogRPT-human-vs-rand_deployment" }) results.append(result) +print(result) result = generator.query({ 'text': "DeepSpeed is the greatest", @@ -34,6 +36,7 @@ "deployment_name": "microsoft/DialoGPT-large_deployment" }) results.append(result) +print(result) result = generator.query({ 'question': @@ -44,3 +47,4 @@ "deepset/roberta-large-squad2" + "-qa-deployment" }) results.append(result) +print(result) diff --git a/mii/client.py b/mii/client.py index 83e39681..9a922070 100644 --- a/mii/client.py +++ b/mii/client.py @@ -26,13 +26,13 @@ def _get_deployment_configs(deployment_tag): 'model': deployment[mii.constants.MODEL_NAME_KEY], 'enable_deepspeed': deployment[mii.constants.ENABLE_DEEPSPEED_KEY], 'enable_zero': deployment[mii.constants.ENABLE_DEEPSPEED_ZERO_KEY], - 'GPU_index_map': None, + 'GPU_index_map': deployment[mii.constants.GPU_INDEX_KEY], 'mii_config': deployment[mii.constants.MII_CONFIGS_KEY], 'ds_config': deployment[mii.constants.DEEPSPEED_CONFIG_KEY], 'version': 1, 'deployed': deployment[mii.constants.DEPLOYED_KEY] } - deployments[deployment_name] = DeploymentConfig.parse_obj(data) + deployments[deployment_name] = DeploymentConfig(**data) lb_config = configs.get(mii.constants.LOAD_BALANCER_CONFIG_KEY) model_path = configs[mii.constants.MODEL_PATH_KEY] port_map = configs.get(mii.constants.PORT_MAP_KEY) diff --git a/mii/constants.py b/mii/constants.py index 520a3c3d..9fff96c2 100644 --- a/mii/constants.py +++ b/mii/constants.py @@ -88,7 +88,7 @@ class ModelProvider(enum.Enum): 'generated_responses'], TEXT2IMG_NAME: ["query"] } - +GPU_INDEX_KEY = "index_keys" DEPLOYMENTS_KEY = 'deployments' PORT_MAP_KEY = 'port_map' MODEL_NAME_KEY = 'model_name' diff --git a/mii/models/score/generate.py b/mii/models/score/generate.py index b87218ad..86ceca28 100644 --- a/mii/models/score/generate.py +++ b/mii/models/score/generate.py @@ -34,6 +34,7 @@ def create_score_file(deployment_tag, mii.constants.ENABLE_DEEPSPEED_ZERO_KEY: deployment.enable_zero, mii.constants.DEEPSPEED_CONFIG_KEY: deployment.ds_config, mii.constants.DEPLOYED_KEY: deployment.deployed, + mii.constants.GPU_INDEX_KEY: deployment.GPU_index_map } config_dict[mii.constants.DEPLOYMENTS_KEY][ deployment.deployment_name] = deployment_config diff --git a/mii/models/score/score_template.py b/mii/models/score/score_template.py index 117f3866..a8969ee0 100644 --- a/mii/models/score/score_template.py +++ b/mii/models/score/score_template.py @@ -30,7 +30,7 @@ def init(): 'model': deployment[mii.constants.MODEL_NAME_KEY], 'enable_deepspeed': deployment[mii.constants.ENABLE_DEEPSPEED_KEY], 'enable_zero': 
deployment[mii.constants.ENABLE_DEEPSPEED_ZERO_KEY], - 'GPU_index_map': None, + 'GPU_index_map': deployment[mii.constants.GPU_INDEX_KEY], 'mii_config': deployment[mii.constants.MII_CONFIGS_KEY], 'ds_config': deployment[mii.constants.DEEPSPEED_CONFIG_KEY], 'version': 1 From c5d59963366c5f6141b26e2c98e5a96900d51b2f Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Wed, 26 Jul 2023 22:31:07 +0000 Subject: [PATCH 56/69] Refactoring deployment --- mii/client.py | 51 +++++++------- mii/config.py | 8 ++- mii/constants.py | 2 + mii/deployment.py | 84 +++++++++++++++++------- mii/grpc_related/modelresponse_server.py | 4 +- mii/grpc_related/proto/build_script.sh | 2 +- mii/server.py | 1 - mii/terminate.py | 2 +- 8 files changed, 100 insertions(+), 54 deletions(-) diff --git a/mii/client.py b/mii/client.py index 9a922070..e4e8fe2c 100644 --- a/mii/client.py +++ b/mii/client.py @@ -10,7 +10,7 @@ from mii.grpc_related.proto import modelresponse_pb2, modelresponse_pb2_grpc from mii.constants import GRPC_MAX_MSG_SIZE, Tasks, DeploymentType from mii.method_table import GRPC_METHOD_TABLE -from mii.deployment import allocate_processes, create_score_file +from mii.deployment import allocate_processes, create_score_file, validate_deployment from mii.config import DeploymentConfig @@ -105,7 +105,7 @@ def __init__(self, def _get_deployment_task(self, deployment_name=None): task = None - if deployment_name is None: #mii.terminate() or single model + if deployment_name is None or deployment_name == mii.constants.MII_TERMINATE_DEP_NAME: #mii.terminate() or single model assert len(self.deployments) == 1, "Must pass deployment_name to query when using multiple deployments" deployment = next(iter(self.deployments.values())) deployment_name = deployment.deployment_name @@ -116,8 +116,8 @@ def _get_deployment_task(self, deployment_name=None): deployment = self.deployments[deployment_name] task = get_task(deployment.task) if isinstance(deployment.task, str) else deployment.task - return deployment_name, task - assert False, f"{deployment_name} not found in list of deployments" + else: + assert False, f"{deployment_name} not found in list of deployments" return deployment_name, task async def _request_async_response(self, request_dict, task, **query_kwargs): @@ -130,7 +130,7 @@ async def _request_async_response(self, request_dict, task, **query_kwargs): return task_methods.unpack_response_from_proto(proto_response) def query(self, request_dict, **query_kwargs): - deployment_name = request_dict.get('deployment_name') + deployment_name = request_dict.get(mii.constants.DEPLOYMENT_NAME_KEY) deployment_name, task = self._get_deployment_task(deployment_name) request_dict['deployment_name'] = deployment_name return self.asyncio_loop.run_until_complete( @@ -195,25 +195,26 @@ def add_models(self, deployment_type=DeploymentType.LOCAL, model_path=None, version=1): - - if not deployments: - assert all((model, task, deployment_name)), "model, task, and deployment name must be set to deploy singular model" - deployments = [ - DeploymentConfig(deployment_name=deployment_name, - task=task, - model=model, - enable_deepspeed=enable_deepspeed, - enable_zero=enable_zero, - GPU_index_map=None, - mii_config=mii.config.MIIConfig(**mii_config), - ds_config=ds_config, - version=version, - deployed=False) - ] + + _, deployments = validate_deployment(task=task, + model=model, + deployment_name=deployment_name, + enable_deepspeed=enable_deepspeed, + enable_zero=enable_zero, + ds_config=ds_config, + mii_config=mii_config, + 
deployment_tag=self.deployment_tag, + deployments=deployments, + deployment_type=deployment_type, + model_path=model_path, + version=version) + + if not deployments: #Empty deployment + return None deps = {deployment.deployment_name: deployment for deployment in deployments} - for deployment in deployments: - deployment.task = get_task(deployment.task) + #for deployment in deployments: + # deployment.task = get_task(deployment.task) lb_config, self.port_map = allocate_processes(deps, self.port_map) if self.lb_config is not None: @@ -226,9 +227,9 @@ def add_models(self, self.model_path = mii.constants.MII_MODEL_PATH_DEFAULT elif self.model_path is None and deployment_type == DeploymentType.AML: model_path = "model" - for deployment in self.deployments.values(): - if isinstance(deployment.task, str): - deployment.task = get_task(deployment.task) + #for deployment in self.deployments.values(): + #if isinstance(deployment.task, str): + #deployment.task = get_task(deployment.task) create_score_file(deployment_tag=self.deployment_tag, deployment_type=deployment_type, deployments=self.deployments, diff --git a/mii/config.py b/mii/config.py index ea3fbe43..c28853df 100644 --- a/mii/config.py +++ b/mii/config.py @@ -7,7 +7,7 @@ from enum import Enum from pydantic import BaseModel, validator, root_validator from deepspeed.launcher.runner import DLTS_HOSTFILE - +from mii.utils import get_task class DtypeEnum(Enum): # The torch dtype must always be the first value (so we return torch.dtype) @@ -127,7 +127,7 @@ class Config: validate_all = True -validate_assignment = True + validate_assignment = True class DeploymentConfig(BaseModel): @@ -141,3 +141,7 @@ class DeploymentConfig(BaseModel): ds_config: dict = None version: int = 1 deployed: bool = False + + @validator("task") + def convert_task_str(cls, field_value, values): + return get_task(field_value) diff --git a/mii/constants.py b/mii/constants.py index 9fff96c2..beb5de0c 100644 --- a/mii/constants.py +++ b/mii/constants.py @@ -102,6 +102,8 @@ class ModelProvider(enum.Enum): DEEPSPEED_CONFIG_KEY = 'ds_config' CHECKPOINT_KEY = "checkpoint" DEPLOYED_KEY = "deployed" +MII_TERMINATE_DEP_NAME="__MII_TERMINATE_CALL__" + MII_CACHE_PATH = "MII_CACHE_PATH" MII_CACHE_PATH_DEFAULT = "/tmp/mii_cache" diff --git a/mii/deployment.py b/mii/deployment.py index 54b8abce..fb639fbb 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -73,33 +73,29 @@ def deploy(task=None, elif model_path is None and deployment_type == DeploymentType.AML: model_path = "model" - if not deployments and not all((model, task, deployment_name)): - assert deployment_tag is not None, "Deployment tag must be set when starting empty deployment" + deployment_tag, deployments = validate_deployment(task=task, + model=model, + deployment_name=deployment_name, + enable_deepspeed=enable_deepspeed, + enable_zero=enable_zero, + ds_config=ds_config, + mii_config=mii_config, + deployment_tag=deployment_tag, + deployments=deployments, + deployment_type=deployment_type, + model_path=model_path, + version=version) + + if not deployments: #Empty deployment create_score_file(deployment_tag=deployment_tag, deployment_type=deployment_type, deployments=None, model_path=model_path, port_map=None, lb_config=None) + print(f"Starting empty deployment, deployment_tag -> {deployment_tag}" return None - elif not deployments: - assert all((model, task, deployment_name)), "model, task, and deployment name must be set to deploy singular model" - deployments = [ - DeploymentConfig(deployment_name=deployment_name, - 
task=task, - model=model, - enable_deepspeed=enable_deepspeed, - enable_zero=enable_zero, - GPU_index_map=None, - mii_config=mii.config.MIIConfig(**mii_config), - ds_config=ds_config, - version=version, - deployed=False) - ] - deployment_tag = deployment_name - else: - assert deployment_tag is not None, "deployment_tag must be set to deploy multiple models" # parse and validate mii config for deployment in deployments: mii_config = deployment.mii_config @@ -112,13 +108,11 @@ def deploy(task=None, # aml only allows certain characters for deployment names if deployment_type == DeploymentType.AML: + assert len(deployments == 1), "mii does not currently support empty/multi-model deployment on AML" allowed_chars = set(string.ascii_lowercase + string.ascii_uppercase + string.digits + '-') assert set(deployment_name) <= allowed_chars, "AML deployment names can only contain a-z, A-Z, 0-9, and '-'" - for deployment in deployments: - deployment.task = mii.utils.get_task(deployment.task) - if not mii_config.skip_model_check: mii.utils.check_if_task_and_model_is_valid(deployment.task, deployment.model) if enable_deepspeed: @@ -205,6 +199,52 @@ def allocate_processes(deployments, port_map): replica_configs=replica_configs) return lb_config, port_map +def validate_deployment(task=None, + model=None, + deployment_name=None, + enable_deepspeed=True, + enable_zero=False, + ds_config=None, + mii_config={}, + deployment_tag=None, + deployments=[], + deployment_type=DeploymentType.LOCAL, + model_path=None, + version=1): + + if deployments and any((model, task, deployment_name)): + assert False, "Do not input deployments and model/task/deployment_name at the same time" + + elif deployments: + assert deployment_tag, "deployment_tag must be set to for mulitple models" + return deployment_tag, deployments + + elif not any((model, task, deployment_name)): + assert deployment_tag, "deployment_tag must be set for an empty deployment" + create_score_file(deployment_tag=deployment_tag, + deployment_type=deployment_type, + deployments=None, + model_path=model_path, + port_map=None, + lb_config=None) + return deployment_tag, None + + assert all((model, task, deployment_name)), "model, task, and deployment_name must be set for a single model" + deployments = [ + DeploymentConfig(deployment_name=deployment_name, + task=task, + model=model, + enable_deepspeed=enable_deepspeed, + enable_zero=enable_zero, + GPU_index_map=None, + mii_config=mii.config.MIIConfig(**mii_config), + ds_config=ds_config, + version=version, + deployed=False) + ] + if deployment_tag is None: + deployment_tag = deployment_name + return deployment_tag, deployments def _deploy_local(deployment_tag, model_path): mii.utils.import_score_file(deployment_tag).init() diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index c4b49a21..8cbc536e 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -36,7 +36,6 @@ def get_stop_event(self): class DeploymentManagement(ServiceBase, modelresponse_pb2_grpc.DeploymentManagementServicer): def AddDeployment(self, request, context): - print("DEPLOYMENT ADDED") return google_dot_protobuf_dot_empty__pb2.Empty() def DeleteDeployment(self, request, context): @@ -207,7 +206,6 @@ def intercept_service(self, continuation, handler_call_details): next_handler = continuation(handler_call_details) assert next_handler.unary_unary is not None - #USE KWARGS LIKE THEY ARE USED TO MAKE SESSIONS TO GET THE DEPLOYMENT NAME TO HASH THE 
COUNTERS/STUBS def invoke_intercept_method(request_proto, context): method_name = _get_grpc_method_name(handler_call_details.method) @@ -228,6 +226,8 @@ def invoke_intercept_method(request_proto, context): ParallelStubInvoker(hostname, tensor_parallel_ports, self.asyncio_loop)) + else: + print(f"deployment: {deployment_name} already exists") return google_dot_protobuf_dot_empty__pb2.Empty() if method_name == TERMINATE_METHOD: diff --git a/mii/grpc_related/proto/build_script.sh b/mii/grpc_related/proto/build_script.sh index d8615a85..9aaf3bd2 100644 --- a/mii/grpc_related/proto/build_script.sh +++ b/mii/grpc_related/proto/build_script.sh @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 # DeepSpeed Team -python3 -m grpc_tools.protoc -I./ --python_out=. --grpc_python_out=. ./modelresponse.proto +python -m grpc_tools.protoc -I./ --python_out=. --grpc_python_out=. ./modelresponse.proto # update import to be global wrt mii sed -i 's/modelresponse_pb2/mii.grpc_related.proto.modelresponse_pb2/g' modelresponse_pb2_grpc.py diff --git a/mii/server.py b/mii/server.py index 5bb1b83a..083f2ba3 100644 --- a/mii/server.py +++ b/mii/server.py @@ -40,7 +40,6 @@ def __init__(self, for deployment in deployments: assert get_num_gpus(deployment.mii_config) > 0, f"GPU count for {deployment.deployment_name} must be greater than 0" mii_configs = deployment.mii_config - deployment.task = mii.utils.get_task(deployment.task) if mii_configs.hostfile is None: hostfile = tempfile.NamedTemporaryFile(delete=False) num_gpu = torch.cuda.device_count() diff --git a/mii/terminate.py b/mii/terminate.py index 0a2b82b4..77df55ff 100644 --- a/mii/terminate.py +++ b/mii/terminate.py @@ -14,7 +14,7 @@ def terminate(deployment_tag): generator.terminate() return try: - generator.query({'query': ''}, None) + generator.query({'query': ''}, mii.constants.MII_TERMINATE_DEP_NAME) except grpc.aio._call.AioRpcError as error: if error._code == grpc.StatusCode.UNAVAILABLE: mii.utils.logger.warn(f"Server for {deployment_tag} not found") From 3ae178156a7c3ec2d2aebb36bb72397cd8081ede Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Wed, 26 Jul 2023 23:49:28 +0000 Subject: [PATCH 57/69] Refactoring and formatting --- mii/client.py | 19 +++--- mii/config.py | 3 +- mii/constants.py | 2 +- mii/deployment.py | 84 ++++++++++++------------ mii/grpc_related/modelresponse_server.py | 1 - mii/models/score/generate.py | 5 +- mii/models/score/score_template.py | 6 +- 7 files changed, 60 insertions(+), 60 deletions(-) diff --git a/mii/client.py b/mii/client.py index e4e8fe2c..dee6f78e 100644 --- a/mii/client.py +++ b/mii/client.py @@ -18,7 +18,6 @@ def _get_deployment_configs(deployment_tag): deployments = {} configs = mii.utils.import_score_file(deployment_tag).configs for deployment in configs[mii.constants.DEPLOYMENTS_KEY].values(): - deployment[mii.constants.DEPLOYED_KEY] = True deployment_name = deployment[mii.constants.DEPLOYMENT_NAME_KEY] data = { 'deployment_name': deployment[mii.constants.DEPLOYMENT_NAME_KEY], @@ -30,7 +29,6 @@ def _get_deployment_configs(deployment_tag): 'mii_config': deployment[mii.constants.MII_CONFIGS_KEY], 'ds_config': deployment[mii.constants.DEEPSPEED_CONFIG_KEY], 'version': 1, - 'deployed': deployment[mii.constants.DEPLOYED_KEY] } deployments[deployment_name] = DeploymentConfig(**data) lb_config = configs.get(mii.constants.LOAD_BALANCER_CONFIG_KEY) @@ -61,6 +59,9 @@ def mii_query_handle(deployment_tag): if len(deployments) > 0: mii_configs = next(iter(deployments.values())).mii_config port_number = None if mii_configs 
== None else mii_configs.port_number + if port_number: + for deployment in deployments.values(): + assert deployment.mii_config.port_number == port_number, f"All port numbers is each deployments mii_configs must match" return MIIClient(deployments, "localhost", @@ -195,7 +196,7 @@ def add_models(self, deployment_type=DeploymentType.LOCAL, model_path=None, version=1): - + _, deployments = validate_deployment(task=task, model=model, deployment_name=deployment_name, @@ -209,7 +210,7 @@ def add_models(self, model_path=model_path, version=version) - if not deployments: #Empty deployment + if not deployments: #Empty deployment return None deps = {deployment.deployment_name: deployment for deployment in deployments} @@ -228,14 +229,16 @@ def add_models(self, elif self.model_path is None and deployment_type == DeploymentType.AML: model_path = "model" #for deployment in self.deployments.values(): - #if isinstance(deployment.task, str): - #deployment.task = get_task(deployment.task) + #if isinstance(deployment.task, str): + #deployment.task = get_task(deployment.task) + lb_enabled = True if len(self.deployments) else False create_score_file(deployment_tag=self.deployment_tag, deployment_type=deployment_type, - deployments=self.deployments, + deployments=deps, model_path=self.model_path, port_map=self.port_map, - lb_config=lb_config) + lb_config=lb_config, + deployed=lb_enabled) if deployment_type == DeploymentType.LOCAL: mii.utils.import_score_file(self.deployment_tag).init() if self.stub is None: diff --git a/mii/config.py b/mii/config.py index c28853df..89a4d328 100644 --- a/mii/config.py +++ b/mii/config.py @@ -9,6 +9,7 @@ from deepspeed.launcher.runner import DLTS_HOSTFILE from mii.utils import get_task + class DtypeEnum(Enum): # The torch dtype must always be the first value (so we return torch.dtype) fp16 = torch.float16, "torch.float16", "fp16", "float16", "half" @@ -126,7 +127,6 @@ class LoadBalancerConfig(BaseModel): class Config: validate_all = True - validate_assignment = True @@ -140,7 +140,6 @@ class DeploymentConfig(BaseModel): mii_config: MIIConfig = MIIConfig.parse_obj({}) ds_config: dict = None version: int = 1 - deployed: bool = False @validator("task") def convert_task_str(cls, field_value, values): diff --git a/mii/constants.py b/mii/constants.py index beb5de0c..810b5088 100644 --- a/mii/constants.py +++ b/mii/constants.py @@ -102,7 +102,7 @@ class ModelProvider(enum.Enum): DEEPSPEED_CONFIG_KEY = 'ds_config' CHECKPOINT_KEY = "checkpoint" DEPLOYED_KEY = "deployed" -MII_TERMINATE_DEP_NAME="__MII_TERMINATE_CALL__" +MII_TERMINATE_DEP_NAME = "__MII_TERMINATE_CALL__" MII_CACHE_PATH = "MII_CACHE_PATH" MII_CACHE_PATH_DEFAULT = "/tmp/mii_cache" diff --git a/mii/deployment.py b/mii/deployment.py index fb639fbb..5e9c5737 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -86,14 +86,14 @@ def deploy(task=None, model_path=model_path, version=version) - if not deployments: #Empty deployment + if not deployments: #Empty deployment create_score_file(deployment_tag=deployment_tag, deployment_type=deployment_type, deployments=None, model_path=model_path, port_map=None, lb_config=None) - print(f"Starting empty deployment, deployment_tag -> {deployment_tag}" + print(f"Starting empty deployment, deployment_tag -> {deployment_tag}") return None # parse and validate mii config @@ -199,52 +199,54 @@ def allocate_processes(deployments, port_map): replica_configs=replica_configs) return lb_config, port_map -def validate_deployment(task=None, - model=None, - deployment_name=None, - 
enable_deepspeed=True, - enable_zero=False, - ds_config=None, - mii_config={}, - deployment_tag=None, - deployments=[], - deployment_type=DeploymentType.LOCAL, - model_path=None, - version=1): - if deployments and any((model, task, deployment_name)): - assert False, "Do not input deployments and model/task/deployment_name at the same time" - - elif deployments: - assert deployment_tag, "deployment_tag must be set to for mulitple models" - return deployment_tag, deployments +def validate_deployment(task=None, + model=None, + deployment_name=None, + enable_deepspeed=True, + enable_zero=False, + ds_config=None, + mii_config={}, + deployment_tag=None, + deployments=[], + deployment_type=DeploymentType.LOCAL, + model_path=None, + version=1): + + if deployments and any((model, task, deployment_name)): + assert False, "Do not input deployments and model/task/deployment_name at the same time" + + elif deployments: + assert deployment_tag, "deployment_tag must be set to for multiple models" + return deployment_tag, deployments - elif not any((model, task, deployment_name)): - assert deployment_tag, "deployment_tag must be set for an empty deployment" - create_score_file(deployment_tag=deployment_tag, + elif not any((model, task, deployment_name)): + assert deployment_tag, "deployment_tag must be set for an empty deployment" + create_score_file(deployment_tag=deployment_tag, deployment_type=deployment_type, deployments=None, model_path=model_path, port_map=None, lb_config=None) - return deployment_tag, None - - assert all((model, task, deployment_name)), "model, task, and deployment_name must be set for a single model" - deployments = [ - DeploymentConfig(deployment_name=deployment_name, - task=task, - model=model, - enable_deepspeed=enable_deepspeed, - enable_zero=enable_zero, - GPU_index_map=None, - mii_config=mii.config.MIIConfig(**mii_config), - ds_config=ds_config, - version=version, - deployed=False) - ] - if deployment_tag is None: - deployment_tag = deployment_name - return deployment_tag, deployments + return deployment_tag, None + + assert all((model, task, deployment_name)), "model, task, and deployment_name must be set for a single model" + deployments = [ + DeploymentConfig(deployment_name=deployment_name, + task=task, + model=model, + enable_deepspeed=enable_deepspeed, + enable_zero=enable_zero, + GPU_index_map=None, + mii_config=mii.config.MIIConfig(**mii_config), + ds_config=ds_config, + version=version, + deployed=False) + ] + if deployment_tag is None: + deployment_tag = deployment_name + return deployment_tag, deployments + def _deploy_local(deployment_tag, model_path): mii.utils.import_score_file(deployment_tag).init() diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index 8cbc536e..6b35d56f 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -206,7 +206,6 @@ def intercept_service(self, continuation, handler_call_details): next_handler = continuation(handler_call_details) assert next_handler.unary_unary is not None - def invoke_intercept_method(request_proto, context): method_name = _get_grpc_method_name(handler_call_details.method) if method_name == ADD_DEPLOYMENT_METHOD: diff --git a/mii/models/score/generate.py b/mii/models/score/generate.py index 86ceca28..48e74776 100644 --- a/mii/models/score/generate.py +++ b/mii/models/score/generate.py @@ -14,11 +14,13 @@ def create_score_file(deployment_tag, deployments, model_path, port_map, - lb_config): + lb_config, + deployed=False): 
config_dict = {} config_dict[mii.constants.MODEL_PATH_KEY] = model_path config_dict[mii.constants.DEPLOYMENT_TAG_KEY] = deployment_tag + config_dict[mii.constants.DEPLOYED_KEY] = deployed config_dict[mii.constants.DEPLOYMENTS_KEY] = {} if port_map is not None: config_dict[mii.constants.PORT_MAP_KEY] = port_map @@ -33,7 +35,6 @@ def create_score_file(deployment_tag, mii.constants.MII_CONFIGS_KEY: deployment.mii_config.dict(), mii.constants.ENABLE_DEEPSPEED_ZERO_KEY: deployment.enable_zero, mii.constants.DEEPSPEED_CONFIG_KEY: deployment.ds_config, - mii.constants.DEPLOYED_KEY: deployment.deployed, mii.constants.GPU_INDEX_KEY: deployment.GPU_index_map } config_dict[mii.constants.DEPLOYMENTS_KEY][ diff --git a/mii/models/score/score_template.py b/mii/models/score/score_template.py index a8969ee0..ec6046e2 100644 --- a/mii/models/score/score_template.py +++ b/mii/models/score/score_template.py @@ -18,12 +18,8 @@ def init(): model_path = mii.utils.full_model_path(configs[mii.constants.MODEL_PATH_KEY]) deployment_tag = configs[mii.constants.DEPLOYMENT_TAG_KEY] deployments = [] - lb_enabled = False + lb_enabled = configs[mii.constants.DEPLOYED_KEY] for deployment in configs[mii.constants.DEPLOYMENTS_KEY].values(): - if deployment[mii.constants.DEPLOYED_KEY]: - lb_enabled = True - print(deployment) - continue data = { 'deployment_name': deployment[mii.constants.DEPLOYMENT_NAME_KEY], 'task': deployment[mii.constants.TASK_NAME_KEY], From 4b8f02fa0b12ca1534fddadac6a665a9fe0fefbb Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Fri, 28 Jul 2023 07:35:56 +0000 Subject: [PATCH 58/69] Refactoring --- examples/multi_model/deploy.py | 4 +-- mii/client.py | 38 ++++++++++++++------- mii/config.py | 24 +++++++------ mii/constants.py | 7 ++-- mii/deployment.py | 51 ++++++++++++++++------------ mii/models/score/generate.py | 32 +++++++++++++----- mii/models/score/score_template.py | 4 ++- mii/server.py | 54 +++++++++++++++++++----------- 8 files changed, 136 insertions(+), 78 deletions(-) diff --git a/examples/multi_model/deploy.py b/examples/multi_model/deploy.py index f0408da7..c0b93b56 100644 --- a/examples/multi_model/deploy.py +++ b/examples/multi_model/deploy.py @@ -19,7 +19,7 @@ model=name, deployment_name=name + "_deployment", GPU_index_map=gpu_index_map3, - mii_config=mii.config.MIIConfig(**mii_configs1))) + mii_configs=mii.config.MIIConfig(**mii_configs1))) # gpt2 name = "microsoft/DialogRPT-human-vs-rand" @@ -35,7 +35,7 @@ model=name, deployment_name=name + "_deployment", GPU_index_map=gpu_index_map1, - mii_config=mii.config.MIIConfig(**mii_configs2))) + mii_configs=mii.config.MIIConfig(**mii_configs2))) name = "deepset/roberta-large-squad2" deployments.append( diff --git a/mii/client.py b/mii/client.py index dee6f78e..8867e000 100644 --- a/mii/client.py +++ b/mii/client.py @@ -19,6 +19,7 @@ def _get_deployment_configs(deployment_tag): configs = mii.utils.import_score_file(deployment_tag).configs for deployment in configs[mii.constants.DEPLOYMENTS_KEY].values(): deployment_name = deployment[mii.constants.DEPLOYMENT_NAME_KEY] + """ data = { 'deployment_name': deployment[mii.constants.DEPLOYMENT_NAME_KEY], 'task': deployment[mii.constants.TASK_NAME_KEY], @@ -30,7 +31,8 @@ def _get_deployment_configs(deployment_tag): 'ds_config': deployment[mii.constants.DEEPSPEED_CONFIG_KEY], 'version': 1, } - deployments[deployment_name] = DeploymentConfig(**data) + """ + deployments[deployment_name] = DeploymentConfig(**deployment) lb_config = configs.get(mii.constants.LOAD_BALANCER_CONFIG_KEY) model_path = 
configs[mii.constants.MODEL_PATH_KEY] port_map = configs.get(mii.constants.PORT_MAP_KEY) @@ -57,11 +59,12 @@ def mii_query_handle(deployment_tag): deployments, lb_config, model_path, port_map = _get_deployment_configs(deployment_tag) mii_configs = None if len(deployments) > 0: - mii_configs = next(iter(deployments.values())).mii_config + mii_configs = getattr(next(iter(deployments.values())), + mii.constants.MII_CONFIGS_KEY) port_number = None if mii_configs == None else mii_configs.port_number if port_number: for deployment in deployments.values(): - assert deployment.mii_config.port_number == port_number, f"All port numbers is each deployments mii_configs must match" + assert getattr(deployment, mii.constants.MII_CONFIGS_KEY).port_number == port_number, f"All port numbers is each deployments mii_configs must match" return MIIClient(deployments, "localhost", @@ -109,14 +112,18 @@ def _get_deployment_task(self, deployment_name=None): if deployment_name is None or deployment_name == mii.constants.MII_TERMINATE_DEP_NAME: #mii.terminate() or single model assert len(self.deployments) == 1, "Must pass deployment_name to query when using multiple deployments" deployment = next(iter(self.deployments.values())) - deployment_name = deployment.deployment_name - task = get_task(deployment.task) if isinstance(deployment.task, - str) else deployment.task + deployment_name = getattr(deployment, mii.constants.DEPLOYMENT_NAME_KEY) + #task = get_task(deployment.task) if isinstance(deployment.task, + #str) else deployment.task + task = getattr(deployment, mii.constants.TASK_NAME_KEY) else: if deployment_name in self.deployments: deployment = self.deployments[deployment_name] + """ task = get_task(deployment.task) if isinstance(deployment.task, str) else deployment.task + """ + task = getattr(deployment, mii.constants.TASK_NAME_KEY) else: assert False, f"{deployment_name} not found in list of deployments" return deployment_name, task @@ -213,17 +220,22 @@ def add_models(self, if not deployments: #Empty deployment return None - deps = {deployment.deployment_name: deployment for deployment in deployments} + deps = { + getattr(deployment, + mii.constants.DEPLOYMENT_NAME_KEY): deployment + for deployment in deployments + } #for deployment in deployments: # deployment.task = get_task(deployment.task) lb_config, self.port_map = allocate_processes(deps, self.port_map) - + lb_enabled = True if len(self.deployments) else False if self.lb_config is not None: self.lb_config.replica_configs.extend(lb_config.replica_configs) else: self.lb_config = lb_config for deployment in deployments: - self.deployments[deployment.deployment_name] = deployment + self.deployments[getattr(deployment, + mii.constants.DEPLOYMENT_NAME_KEY)] = deployment if self.model_path is None and deployment_type == DeploymentType.LOCAL: self.model_path = mii.constants.MII_MODEL_PATH_DEFAULT elif self.model_path is None and deployment_type == DeploymentType.AML: @@ -231,7 +243,7 @@ def add_models(self, #for deployment in self.deployments.values(): #if isinstance(deployment.task, str): #deployment.task = get_task(deployment.task) - lb_enabled = True if len(self.deployments) else False + #lb_enabled = True if len(self.deployments) else False create_score_file(deployment_tag=self.deployment_tag, deployment_type=deployment_type, deployments=deps, @@ -242,8 +254,8 @@ def add_models(self, if deployment_type == DeploymentType.LOCAL: mii.utils.import_score_file(self.deployment_tag).init() if self.stub is None: - self.port_number = next(iter( - 
self.deployments.values())).mii_config.port_number + self.port_number = getattr(next(iter(self.deployments.values())), + mii.constants.MII_CONFIGS_KEY).port_number channel = create_channel(self.host, self.port_number) self.stub = modelresponse_pb2_grpc.DeploymentManagementStub(channel) for replica in lb_config.replica_configs: @@ -349,6 +361,6 @@ def terminate(self): def terminate_restful_gateway(deployment_tag): deployments, _, _, _ = _get_deployment_configs(deployment_tag) for deployment in deployments.values(): - mii_configs = deployment.mii_config + mii_configs = getattr(deployment, mii.constants.MII_CONFIGS_KEY) if mii_configs.enable_restful_api: requests.get(f"http://localhost:{mii_configs.restful_api_port}/terminate") diff --git a/mii/config.py b/mii/config.py index 89a4d328..b8325562 100644 --- a/mii/config.py +++ b/mii/config.py @@ -5,7 +5,7 @@ import torch from typing import Union, List from enum import Enum -from pydantic import BaseModel, validator, root_validator +from pydantic import BaseModel, validator, root_validator, Field from deepspeed.launcher.runner import DLTS_HOSTFILE from mii.utils import get_task @@ -131,16 +131,20 @@ class Config: class DeploymentConfig(BaseModel): - deployment_name: str - task: str - model: str - enable_deepspeed: bool = True - enable_zero: bool = False - GPU_index_map: dict = None - mii_config: MIIConfig = MIIConfig.parse_obj({}) - ds_config: dict = None - version: int = 1 + deployment_name: str = Field(alias="DEPLOYMENT_NAME_KEY") + task: str = Field(alias="TASK_NAME_KEY") + model: str = Field(alias="MODEL_NAME_KEY") + ds_optimize: bool = Field(default=True, alias="ENABLE_DEEPSPEED_KEY") + ds_zero: bool = Field(default=False, alias="ENABLE_DEEPSPEED_ZERO_KEY") + GPU_index_map: dict = Field(default=None, alias="GPU_INDEX_KEY") + mii_configs: MIIConfig = Field(default=MIIConfig.parse_obj({}), + alias="MII_CONFIGS_KEY") + ds_config: dict = Field(default=None, alias="DEEPSPEED_CONFIG_KEY") + version: int = Field(default=1, alias="VERSION_KEY") @validator("task") def convert_task_str(cls, field_value, values): return get_task(field_value) + + class Config: + allow_population_by_field_name = True diff --git a/mii/constants.py b/mii/constants.py index 810b5088..f4860cc9 100644 --- a/mii/constants.py +++ b/mii/constants.py @@ -88,11 +88,11 @@ class ModelProvider(enum.Enum): 'generated_responses'], TEXT2IMG_NAME: ["query"] } -GPU_INDEX_KEY = "index_keys" +GPU_INDEX_KEY = "GPU_index_map" DEPLOYMENTS_KEY = 'deployments' PORT_MAP_KEY = 'port_map' -MODEL_NAME_KEY = 'model_name' -TASK_NAME_KEY = 'task_name' +MODEL_NAME_KEY = 'model' +TASK_NAME_KEY = 'task' DEPLOYMENT_NAME_KEY = 'deployment_name' MODEL_PATH_KEY = 'model_path' LOAD_BALANCER_CONFIG_KEY = 'load_balancer_config' @@ -102,6 +102,7 @@ class ModelProvider(enum.Enum): DEEPSPEED_CONFIG_KEY = 'ds_config' CHECKPOINT_KEY = "checkpoint" DEPLOYED_KEY = "deployed" +VERSION_KEY = "version" MII_TERMINATE_DEP_NAME = "__MII_TERMINATE_CALL__" MII_CACHE_PATH = "MII_CACHE_PATH" diff --git a/mii/deployment.py b/mii/deployment.py index 5e9c5737..5744d182 100644 --- a/mii/deployment.py +++ b/mii/deployment.py @@ -98,9 +98,12 @@ def deploy(task=None, # parse and validate mii config for deployment in deployments: - mii_config = deployment.mii_config - if deployment.enable_zero: - if deployment.ds_config.get("fp16", {}).get("enabled", False): + mii_config = getattr(deployment, mii.constants.MII_CONFIGS_KEY) + if getattr(deployment, mii.constants.ENABLE_DEEPSPEED_ZERO_KEY): + if getattr(deployment, + 
mii.constants.DEEPSPEED_CONFIG_KEY).get("fp16", + {}).get("enabled", + False): assert (mii_config.dtype == torch.half), "MII Config Error: MII dtype and ZeRO dtype must match" else: assert (mii_config.dtype == torch.float), "MII Config Error: MII dtype and ZeRO dtype must match" @@ -114,7 +117,11 @@ def deploy(task=None, assert set(deployment_name) <= allowed_chars, "AML deployment names can only contain a-z, A-Z, 0-9, and '-'" if not mii_config.skip_model_check: - mii.utils.check_if_task_and_model_is_valid(deployment.task, deployment.model) + mii.utils.check_if_task_and_model_is_valid( + getattr(deployment, + mii.constants.TASK_NAME_KEY), + getattr(deployment, + mii.constants.MODEL_NAME_KEY)) if enable_deepspeed: mii.utils.check_if_task_and_model_is_supported( deployment.task, @@ -169,7 +176,7 @@ def allocate_processes(deployments, port_map): replica_configs = [] port_offset = 1 for deployment in deployments.values(): - mii_config = deployment.mii_config + mii_config = getattr(deployment, mii.constants.MII_CONFIGS_KEY) replica_pool = _allocate_processes(mii_config.hostfile, mii_config.tensor_parallel, mii_config.replica_num, @@ -189,12 +196,15 @@ def allocate_processes(deployments, port_map): port_map[hostname].add(i) torch_dist_port = mii_config.torch_dist_port + i replica_configs.append( - ReplicaConfig(task=get_task_name(deployment.task), - deployment_name=deployment.deployment_name, - hostname=hostname, - tensor_parallel_ports=tensor_parallel_ports, - torch_dist_port=torch_dist_port, - gpu_indices=gpu_indices)) + ReplicaConfig( + task=get_task_name(getattr(deployment, + mii.constants.TASK_NAME_KEY)), + deployment_name=(getattr(deployment, + mii.constants.DEPLOYMENT_NAME_KEY)), + hostname=hostname, + tensor_parallel_ports=tensor_parallel_ports, + torch_dist_port=torch_dist_port, + gpu_indices=gpu_indices)) lb_config = LoadBalancerConfig(port=mii_config.port_number, replica_configs=replica_configs) return lb_config, port_map @@ -232,16 +242,15 @@ def validate_deployment(task=None, assert all((model, task, deployment_name)), "model, task, and deployment_name must be set for a single model" deployments = [ - DeploymentConfig(deployment_name=deployment_name, - task=task, - model=model, - enable_deepspeed=enable_deepspeed, - enable_zero=enable_zero, - GPU_index_map=None, - mii_config=mii.config.MIIConfig(**mii_config), - ds_config=ds_config, - version=version, - deployed=False) + DeploymentConfig(DEPLOYMENT_NAME_KEY=deployment_name, + TASK_NAME_KEY=task, + MODEL_NAME_KEY=model, + ENABLE_DEEPSPEED_KEY=enable_deepspeed, + ENABLE_DEEPSPEED_ZERO_KEY=enable_zero, + GPU_INDEX_KEY=None, + MII_CONFIGS_KEY=mii.config.MIIConfig(**mii_config), + DEEPSPEED_CONFIG_KEY=ds_config, + VERSION_KEY=version) ] if deployment_tag is None: deployment_tag = deployment_name diff --git a/mii/models/score/generate.py b/mii/models/score/generate.py index 48e74776..2f2bf8b0 100644 --- a/mii/models/score/generate.py +++ b/mii/models/score/generate.py @@ -28,14 +28,30 @@ def create_score_file(deployment_tag, if deployments is not None: for deployment in deployments.values(): deployment_config = { - mii.constants.DEPLOYMENT_NAME_KEY: deployment.deployment_name, - mii.constants.TASK_NAME_KEY: mii.utils.get_task_name(deployment.task), - mii.constants.MODEL_NAME_KEY: deployment.model, - mii.constants.ENABLE_DEEPSPEED_KEY: deployment.enable_deepspeed, - mii.constants.MII_CONFIGS_KEY: deployment.mii_config.dict(), - mii.constants.ENABLE_DEEPSPEED_ZERO_KEY: deployment.enable_zero, - mii.constants.DEEPSPEED_CONFIG_KEY: 
deployment.ds_config, - mii.constants.GPU_INDEX_KEY: deployment.GPU_index_map + mii.constants.DEPLOYMENT_NAME_KEY: + getattr(deployment, + mii.constants.DEPLOYMENT_NAME_KEY), + mii.constants.TASK_NAME_KEY: + mii.utils.get_task_name(getattr(deployment, + mii.constants.TASK_NAME_KEY)), + mii.constants.MODEL_NAME_KEY: + getattr(deployment, + mii.constants.MODEL_NAME_KEY), + mii.constants.ENABLE_DEEPSPEED_KEY: + getattr(deployment, + mii.constants.ENABLE_DEEPSPEED_KEY), + mii.constants.MII_CONFIGS_KEY: + getattr(deployment, + mii.constants.MII_CONFIGS_KEY).dict(), + mii.constants.ENABLE_DEEPSPEED_ZERO_KEY: + getattr(deployment, + mii.constants.ENABLE_DEEPSPEED_ZERO_KEY), + mii.constants.DEEPSPEED_CONFIG_KEY: + getattr(deployment, + mii.constants.DEEPSPEED_CONFIG_KEY), + mii.constants.GPU_INDEX_KEY: + getattr(deployment, + mii.constants.GPU_INDEX_KEY) } config_dict[mii.constants.DEPLOYMENTS_KEY][ deployment.deployment_name] = deployment_config diff --git a/mii/models/score/score_template.py b/mii/models/score/score_template.py index ec6046e2..c4905f3c 100644 --- a/mii/models/score/score_template.py +++ b/mii/models/score/score_template.py @@ -20,6 +20,7 @@ def init(): deployments = [] lb_enabled = configs[mii.constants.DEPLOYED_KEY] for deployment in configs[mii.constants.DEPLOYMENTS_KEY].values(): + """ data = { 'deployment_name': deployment[mii.constants.DEPLOYMENT_NAME_KEY], 'task': deployment[mii.constants.TASK_NAME_KEY], @@ -31,7 +32,8 @@ def init(): 'ds_config': deployment[mii.constants.DEEPSPEED_CONFIG_KEY], 'version': 1 } - deployments.append(mii.DeploymentConfig.parse_obj(data)) + """ + deployments.append(mii.DeploymentConfig(**deployment)) """ deployment_name = configs[mii.constants.DEPLOYMENT_NAME_KEY] model_name = configs[mii.constants.MODEL_NAME_KEY] diff --git a/mii/server.py b/mii/server.py index 083f2ba3..4fef14f4 100644 --- a/mii/server.py +++ b/mii/server.py @@ -38,8 +38,8 @@ def __init__(self, self.lb_enabled = lb_enabled self.deployments = deployments for deployment in deployments: - assert get_num_gpus(deployment.mii_config) > 0, f"GPU count for {deployment.deployment_name} must be greater than 0" - mii_configs = deployment.mii_config + mii_configs = getattr(deployment, mii.constants.MII_CONFIGS_KEY) + assert get_num_gpus(mii_configs) > 0, f"GPU count for {deployment.deployment_name} must be greater than 0" if mii_configs.hostfile is None: hostfile = tempfile.NamedTemporaryFile(delete=False) num_gpu = torch.cuda.device_count() @@ -106,8 +106,8 @@ def _build_server_args(self, task = "" for deployment in self.deployments: - if deployment_name == deployment.deployment_name: - task = deployment.task + if deployment_name == getattr(deployment, mii.constants.DEPLOYMENT_NAME_KEY): + task = getattr(deployment, mii.constants.TASK_NAME_KEY) break server_args_str = f"--deployment-name {deployment_name} --task-name {mii.utils.get_task_name(task)} --model {model_name} --model-path {model_path} --port {port}" server_args_str += " --ds-optimize" if ds_optimize else "" @@ -284,7 +284,7 @@ def _initialize_service(self, deployment_tag, deployments, model_path, lb_config name = repl_config.deployment_name deployment = None for dep in deployments: - if dep.deployment_name == name: + if getattr(dep, mii.constants.DEPLOYMENT_NAME_KEY) == name: deployment = dep if deployment is None: continue @@ -295,16 +295,22 @@ def _initialize_service(self, deployment_tag, deployments, model_path, lb_config processes.append( self._launch_deepspeed( name, - deployment.model, + getattr(deployment, + 
mii.constants.MODEL_NAME_KEY), model_path, - deployment.enable_deepspeed, - deployment.enable_zero, - deployment.ds_config, - deployment.mii_config, + getattr(deployment, + mii.constants.ENABLE_DEEPSPEED_KEY), + getattr(deployment, + mii.constants.ENABLE_DEEPSPEED_ZERO_KEY), + getattr(deployment, + mii.constants.DEEPSPEED_CONFIG_KEY), + getattr(deployment, + mii.constants.MII_CONFIGS_KEY), hostfile.name, repl_config.hostname, repl_config.tensor_parallel_ports[0], - deployment.mii_config.torch_dist_port + (100 * i) + + getattr(deployment, + mii.constants.MII_CONFIGS_KEY).torch_dist_port + (100 * i) + repl_config.gpu_indices[0], repl_config.gpu_indices)) @@ -316,17 +322,25 @@ def _initialize_service(self, deployment_tag, deployments, model_path, lb_config processes.append(self._launch_load_balancer(model_path, lb_config)) for deployment in self.deployments: - if deployment.mii_config.enable_restful_api: + if getattr(deployment, mii.constants.MII_CONFIGS_KEY).enable_restful_api: # start rest api server processes.append( - self._launch_restful_gateway(deployment.deployment_name, - deployment.model, - model_path, - deployment.enable_deepspeed, - deployment.enable_zero, - deployment.ds_config, - deployment.mii_config, - deployment.mii_config.port_number)) + self._launch_restful_gateway( + getattr(deployment, + mii.constants.DEPLOYMENT_NAME_KEY), + getattr(deployment, + mii.constants.MODEL_NAME_KEY), + model_path, + getattr(deployment, + mii.constants.ENABLE_DEEPSPEED_KEY), + getattr(deployment, + mii.constants.ENABLE_DEEPSPEED_ZERO_KEY), + getattr(deployment, + mii.constants.DEEPSPEED_CONFIG_KEY), + getattr(deployment, + mii.constants.MII_CONFIGS_KEY), + getattr(deployment, + mii.constants.MII_CONFIGS_KEY).port_number)) break return processes From c51ce3773f8427b40289464f1fad044eb576ab3d Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Fri, 28 Jul 2023 07:45:29 +0000 Subject: [PATCH 59/69] Fixing Readme --- README.md | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 27e6513b..83eed709 100644 --- a/README.md +++ b/README.md @@ -178,24 +178,6 @@ mii.deploy(... mii_config=mii_configs) ``` -**Non-persistent Deployment** - -You can enable a non-persistent deployment which allows you to make queries without standing up a server. The non-persistent deployment acts as a simplified interface to DeepSpeed-inference for use cases that do not require creating a persistent model server process. Changing the `deployment_type` to `NON_PERSISTENT` in `mii.deploy(...)` will activate this option. - -```python -... -mii.deploy(deployment_name = DEPLOYMENT_NAME, - deployment_type=mii.constants.DeploymentType.NON_PERSISTENT - ... - ) - -generator = mii.mii_query_handle(DEPLOYMENT_NAME) -result = generator.query({"query": ["DeepSpeed is", "Seattle is"]}, do_sample=True, max_new_tokens=30}) - -``` - -You can find a complete example [here]("https://github.com/microsoft/DeepSpeed-MII/tree/main/examples/non_persistent") - Any HTTP client can be used to call the APIs. An example of using curl is: ```bash # Assume deployment_name and restful_api_port are set to bloom560m_deployment and 28080 respectively: @@ -219,6 +201,24 @@ response = requests.post(url, data=json_params, headers={ print(response.json()) ``` +**Non-persistent Deployment** + +You can enable a non-persistent deployment which allows you to make queries without standing up a server. 
The non-persistent deployment acts as a simplified interface to DeepSpeed-inference for use cases that do not require creating a persistent model server process. Changing the `deployment_type` to `NON_PERSISTENT` in `mii.deploy(...)` will activate this option.
+
+```python
+...
+mii.deploy(deployment_name = DEPLOYMENT_NAME,
+           deployment_type=mii.constants.DeploymentType.NON_PERSISTENT
+           ...
+           )
+
+generator = mii.mii_query_handle(DEPLOYMENT_NAME)
+result = generator.query({"query": ["DeepSpeed is", "Seattle is"]}, do_sample=True, max_new_tokens=30)
+
+```
+
+You can find a complete example [here](https://github.com/microsoft/DeepSpeed-MII/tree/main/examples/non_persistent)
+
 ## Deploying with MII-Azure
 
 MII supports deployment on Azure via AML Inference. To enable this, MII generates AML deployment assets for a given model that can be deployed using the Azure-CLI, as shown in the code below. Furthermore, deploying on Azure allows MII to leverage DeepSpeed-Azure as its optimization backend, which offers better latency and cost reduction than DeepSpeed-Public.

From 43479db8357461a40624c9f097cb5e59854f1bf5 Mon Sep 17 00:00:00 2001
From: Tosin Segun
Date: Fri, 28 Jul 2023 21:48:59 +0000
Subject: [PATCH 60/69] Refactoring GRPC

---
 mii/client.py                               |  94 ++--
 mii/grpc_related/modelresponse_server.py    |   3 +
 mii/grpc_related/proto/modelresponse.proto  |  10 -
 mii/grpc_related/proto/modelresponse_pb2.py |   4 +-
 .../proto/modelresponse_pb2_grpc.py         | 434 ------------------
 5 files changed, 58 insertions(+), 487 deletions(-)

diff --git a/mii/client.py b/mii/client.py
index 8867e000..1236876b 100644
--- a/mii/client.py
+++ b/mii/client.py
@@ -66,13 +66,13 @@ def mii_query_handle(deployment_tag):
         for deployment in deployments.values():
             assert getattr(deployment, mii.constants.MII_CONFIGS_KEY).port_number == port_number, f"All port numbers is each deployments mii_configs must match"
 
-    return MIIClient(deployments,
-                     "localhost",
-                     port_number,
-                     lb_config,
-                     model_path,
-                     port_map,
-                     deployment_tag)
+    return LBClient(deployments,
+                    "localhost",
+                    port_number,
+                    lb_config,
+                    model_path,
+                    port_map,
+                    deployment_tag)
 
 
 def create_channel(host, port):
@@ -87,25 +87,15 @@ class MIIClient():
     """
     Client to send queries to a single endpoint.
""" - def __init__(self, - deployments, - host, - port, - lb_config=None, - model_path=None, - port_map=None, - deployment_tag=None): + def __init__(self, deployments, host, port): self.asyncio_loop = asyncio.get_event_loop() - self.stub = None + self.mr_stub = None + self.channel = None self.host = host if port is not None: - channel = create_channel(host, port) - self.stub = modelresponse_pb2_grpc.DeploymentManagementStub(channel) + self.channel = create_channel(host, port) + self.mr_stub = modelresponse_pb2_grpc.ModelResponseStub(self.channel) self.deployments = deployments - self.lb_config = lb_config - self.model_path = model_path - self.port_map = port_map if port_map is not None else {} - self.deployment_tag = deployment_tag def _get_deployment_task(self, deployment_name=None): task = None @@ -134,7 +124,7 @@ async def _request_async_response(self, request_dict, task, **query_kwargs): task_methods = GRPC_METHOD_TABLE[task] proto_request = task_methods.pack_request_to_proto(request_dict, **query_kwargs) - proto_response = await getattr(self.stub, task_methods.method)(proto_request) + proto_response = await getattr(self.mr_stub, task_methods.method)(proto_request) return task_methods.unpack_response_from_proto(proto_response) def query(self, request_dict, **query_kwargs): @@ -147,14 +137,14 @@ def query(self, request_dict, **query_kwargs): **query_kwargs)) async def terminate_async(self): - await self.stub.Terminate( + await self.mr_stub.Terminate( modelresponse_pb2.google_dot_protobuf_dot_empty__pb2.Empty()) def terminate(self): self.asyncio_loop.run_until_complete(self.terminate_async()) async def create_session_async(self, session_id): - return await self.stub.CreateSession( + return await self.mr_stub.CreateSession( modelresponse_pb2.SessionID(session_id=session_id)) def create_session(self, session_id, deployment_name=None): @@ -166,8 +156,8 @@ def create_session(self, session_id, deployment_name=None): self.create_session_async(session_id)) async def destroy_session_async(self, session_id): - await self.stub.DestroySession(modelresponse_pb2.SessionID(session_id=session_id) - ) + await self.mr_stub.DestroySession( + modelresponse_pb2.SessionID(session_id=session_id)) def destroy_session(self, session_id, deployment_name=None): if len(self.deployments > 1): @@ -176,20 +166,28 @@ def destroy_session(self, session_id, deployment_name=None): assert task == Tasks.TEXT_GENERATION, f"Session deletion only available for task '{Tasks.TEXT_GENERATION}'." 
self.asyncio_loop.run_until_complete(self.destroy_session_async(session_id)) - async def delete_model_async(self, proto_request): - await getattr(self.stub, "DeleteDeployment")(proto_request) - def delete_model(self, deployment_name): - if deployment_name in self.deployments: - request_proto = modelresponse_pb2.DeleteDeployRequest( - deployment_name=deployment_name) - self.asyncio_loop.run_until_complete(self.delete_model_async(request_proto)) - del self.deployments[deployment_name] - return None - assert False, f"Deployment: {deployment_name} not found" +class LBClient(MIIClient): + def __init__(self, + deployments, + host, + port, + lb_config=None, + model_path=None, + port_map=None, + deployment_tag=None): + super().__init__(deployments, host, port) + self.lb_stub = None + if port is not None: + channel = create_channel(host, port) if not self.channel else self.channel + self.lb_stub = modelresponse_pb2_grpc.DeploymentManagementStub(channel) + self.lb_config = lb_config + self.model_path = model_path + self.port_map = port_map if port_map is not None else {} + self.deployment_tag = deployment_tag async def add_models_async(self, proto_request): - await getattr(self.stub, "AddDeployment")(proto_request) + await getattr(self.lb_stub, "AddDeployment")(proto_request) def add_models(self, task=None, @@ -253,11 +251,13 @@ def add_models(self, deployed=lb_enabled) if deployment_type == DeploymentType.LOCAL: mii.utils.import_score_file(self.deployment_tag).init() - if self.stub is None: + if self.lb_stub is None: self.port_number = getattr(next(iter(self.deployments.values())), mii.constants.MII_CONFIGS_KEY).port_number - channel = create_channel(self.host, self.port_number) - self.stub = modelresponse_pb2_grpc.DeploymentManagementStub(channel) + self.channel = create_channel(self.host, self.port_number) + self.lb_stub = modelresponse_pb2_grpc.DeploymentManagementStub(self.channel) + if not self.mr_stub: + self.mr_stub = modelresponse_pb2_grpc.ModelResponseStub(self.channel) for replica in lb_config.replica_configs: request_proto = modelresponse_pb2.AddDeployRequest( task=replica.task, @@ -269,6 +269,18 @@ def add_models(self, self.asyncio_loop.run_until_complete(self.add_models_async(request_proto)) + async def delete_model_async(self, proto_request): + await getattr(self.lb_stub, "DeleteDeployment")(proto_request) + + def delete_model(self, deployment_name): + if deployment_name in self.deployments: + request_proto = modelresponse_pb2.DeleteDeployRequest( + deployment_name=deployment_name) + self.asyncio_loop.run_until_complete(self.delete_model_async(request_proto)) + del self.deployments[deployment_name] + return None + assert False, f"Deployment: {deployment_name} not found" + class MIITensorParallelClient(): """ diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index 6b35d56f..026d4268 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -299,6 +299,9 @@ def _do_serve(service_impl, port, interceptors=[], is_lb=False): modelresponse_pb2_grpc.add_DeploymentManagementServicer_to_server( service_impl, server) + modelresponse_pb2_grpc.add_ModelResponseServicer_to_server( + ModelResponse(None), + server) else: modelresponse_pb2_grpc.add_ModelResponseServicer_to_server(service_impl, server) server.add_insecure_port(f'[::]:{port}') diff --git a/mii/grpc_related/proto/modelresponse.proto b/mii/grpc_related/proto/modelresponse.proto index 7daf300a..fc8a108f 100644 --- 
a/mii/grpc_related/proto/modelresponse.proto +++ b/mii/grpc_related/proto/modelresponse.proto @@ -38,17 +38,7 @@ service ModelResponse { service DeploymentManagement { rpc AddDeployment(AddDeployRequest) returns (google.protobuf.Empty) {} - rpc CreateSession (SessionID) returns (google.protobuf.Empty) {} - rpc DestroySession (SessionID) returns (google.protobuf.Empty) {} - rpc GeneratorReply (MultiStringRequest) returns (MultiStringReply) {} - rpc ClassificationReply (SingleStringRequest) returns (SingleStringReply) {} - rpc QuestionAndAnswerReply(QARequest) returns (SingleStringReply) {} - rpc FillMaskReply(SingleStringRequest) returns (SingleStringReply) {} - rpc TokenClassificationReply(SingleStringRequest) returns (SingleStringReply) {} - rpc ConversationalReply(ConversationRequest) returns (ConversationReply) {} - rpc Txt2ImgReply(MultiStringRequest) returns (ImageReply) {} rpc DeleteDeployment(DeleteDeployRequest) returns (google.protobuf.Empty) {} - rpc Terminate (google.protobuf.Empty) returns (google.protobuf.Empty) {} } message Value { diff --git a/mii/grpc_related/proto/modelresponse_pb2.py b/mii/grpc_related/proto/modelresponse_pb2.py index 30c7a340..fe37da18 100644 --- a/mii/grpc_related/proto/modelresponse_pb2.py +++ b/mii/grpc_related/proto/modelresponse_pb2.py @@ -16,7 +16,7 @@ from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x42\x0e\n\x0coneof_values\"\x1f\n\tSessionID\x12\x12\n\nsession_id\x18\x01 \x01(\t\"\xed\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\x85\x01\n\x11SingleStringReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x84\x01\n\x10MultiStringReply\x12\x10\n\x08response\x18\x01 \x03(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\tQARequest\x12\x10\n\x08question\x18\x01 \x01(\t\x12\x0f\n\x07\x63ontext\x18\x02 \x01(\t\x12?\n\x0cquery_kwargs\x18\x03 \x03(\x0b\x32).modelresponse.QARequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 
\x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xd3\x02\n\x13\x43onversationRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x1c\n\x0f\x63onversation_id\x18\x02 \x01(\x03H\x00\x88\x01\x01\x12\x18\n\x10past_user_inputs\x18\x03 \x03(\t\x12\x1b\n\x13generated_responses\x18\x04 \x03(\t\x12I\n\x0cquery_kwargs\x18\x05 \x03(\x0b\x32\x33.modelresponse.ConversationRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x01\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_conversation_idB\x12\n\x10_deployment_name\"\xc3\x01\n\x11\x43onversationReply\x12\x17\n\x0f\x63onversation_id\x18\x01 \x01(\x03\x12\x18\n\x10past_user_inputs\x18\x02 \x03(\t\x12\x1b\n\x13generated_responses\x18\x03 \x03(\t\x12\x12\n\ntime_taken\x18\x04 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x05 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xaf\x01\n\nImageReply\x12\x0e\n\x06images\x18\x01 \x03(\x0c\x12\x1d\n\x15nsfw_content_detected\x18\x02 \x03(\x08\x12\x0c\n\x04mode\x18\x03 \x01(\t\x12\x0e\n\x06size_w\x18\x04 \x01(\x03\x12\x0e\n\x06size_h\x18\x05 \x01(\x03\x12\x12\n\ntime_taken\x18\x06 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x07 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x98\x01\n\x10\x41\x64\x64\x44\x65ployRequest\x12\x0c\n\x04task\x18\x01 \x01(\t\x12\x17\n\x0f\x64\x65ployment_name\x18\x02 \x01(\t\x12\x10\n\x08hostname\x18\x03 \x01(\t\x12\x1d\n\x15tensor_parallel_ports\x18\x04 \x03(\x03\x12\x17\n\x0ftorch_dist_port\x18\x05 \x01(\x03\x12\x13\n\x0bgpu_indices\x18\x06 \x03(\x03\".\n\x13\x44\x65leteDeployRequest\x12\x17\n\x0f\x64\x65ployment_name\x18\x01 \x01(\t2\xd4\x06\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x32\xf9\x07\n\x14\x44\x65ploymentManagement\x12J\n\rAddDeployment\x12\x1f.modelresponse.AddDeployRequest\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a 
.modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x12P\n\x10\x44\x65leteDeployment\x12\".modelresponse.DeleteDeployRequest\x1a\x16.google.protobuf.Empty\"\x00\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x62\x06proto3' + b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x42\x0e\n\x0coneof_values\"\x1f\n\tSessionID\x12\x12\n\nsession_id\x18\x01 \x01(\t\"\xed\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\x85\x01\n\x11SingleStringReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x84\x01\n\x10MultiStringReply\x12\x10\n\x08response\x18\x01 \x03(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\tQARequest\x12\x10\n\x08question\x18\x01 \x01(\t\x12\x0f\n\x07\x63ontext\x18\x02 \x01(\t\x12?\n\x0cquery_kwargs\x18\x03 \x03(\x0b\x32).modelresponse.QARequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xd3\x02\n\x13\x43onversationRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x1c\n\x0f\x63onversation_id\x18\x02 \x01(\x03H\x00\x88\x01\x01\x12\x18\n\x10past_user_inputs\x18\x03 \x03(\t\x12\x1b\n\x13generated_responses\x18\x04 \x03(\t\x12I\n\x0cquery_kwargs\x18\x05 \x03(\x0b\x32\x33.modelresponse.ConversationRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x01\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_conversation_idB\x12\n\x10_deployment_name\"\xc3\x01\n\x11\x43onversationReply\x12\x17\n\x0f\x63onversation_id\x18\x01 \x01(\x03\x12\x18\n\x10past_user_inputs\x18\x02 \x03(\t\x12\x1b\n\x13generated_responses\x18\x03 
\x03(\t\x12\x12\n\ntime_taken\x18\x04 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x05 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xaf\x01\n\nImageReply\x12\x0e\n\x06images\x18\x01 \x03(\x0c\x12\x1d\n\x15nsfw_content_detected\x18\x02 \x03(\x08\x12\x0c\n\x04mode\x18\x03 \x01(\t\x12\x0e\n\x06size_w\x18\x04 \x01(\x03\x12\x0e\n\x06size_h\x18\x05 \x01(\x03\x12\x12\n\ntime_taken\x18\x06 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x07 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x98\x01\n\x10\x41\x64\x64\x44\x65ployRequest\x12\x0c\n\x04task\x18\x01 \x01(\t\x12\x17\n\x0f\x64\x65ployment_name\x18\x02 \x01(\t\x12\x10\n\x08hostname\x18\x03 \x01(\t\x12\x1d\n\x15tensor_parallel_ports\x18\x04 \x03(\x03\x12\x17\n\x0ftorch_dist_port\x18\x05 \x01(\x03\x12\x13\n\x0bgpu_indices\x18\x06 \x03(\x03\".\n\x13\x44\x65leteDeployRequest\x12\x17\n\x0f\x64\x65ployment_name\x18\x01 \x01(\t2\xd4\x06\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x32\xb4\x01\n\x14\x44\x65ploymentManagement\x12J\n\rAddDeployment\x12\x1f.modelresponse.AddDeployRequest\x1a\x16.google.protobuf.Empty\"\x00\x12P\n\x10\x44\x65leteDeployment\x12\".modelresponse.DeleteDeployRequest\x1a\x16.google.protobuf.Empty\"\x00\x62\x06proto3' ) _globals = globals() @@ -68,5 +68,5 @@ _globals['_MODELRESPONSE']._serialized_start = 2106 _globals['_MODELRESPONSE']._serialized_end = 2958 _globals['_DEPLOYMENTMANAGEMENT']._serialized_start = 2961 - _globals['_DEPLOYMENTMANAGEMENT']._serialized_end = 3978 + _globals['_DEPLOYMENTMANAGEMENT']._serialized_end = 3141 # @@protoc_insertion_point(module_scope) diff --git a/mii/grpc_related/proto/modelresponse_pb2_grpc.py b/mii/grpc_related/proto/modelresponse_pb2_grpc.py index 49393660..e90d037d 100644 --- a/mii/grpc_related/proto/modelresponse_pb2_grpc.py +++ b/mii/grpc_related/proto/modelresponse_pb2_grpc.py @@ -482,62 +482,11 @@ def __init__(self, channel): request_serializer=modelresponse__pb2.AddDeployRequest.SerializeToString, response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, ) - self.CreateSession = channel.unary_unary( - '/modelresponse.DeploymentManagement/CreateSession', - request_serializer=modelresponse__pb2.SessionID.SerializeToString, - response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - ) - self.DestroySession = channel.unary_unary( - '/modelresponse.DeploymentManagement/DestroySession', - request_serializer=modelresponse__pb2.SessionID.SerializeToString, - 
response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - ) - self.GeneratorReply = channel.unary_unary( - '/modelresponse.DeploymentManagement/GeneratorReply', - request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.MultiStringReply.FromString, - ) - self.ClassificationReply = channel.unary_unary( - '/modelresponse.DeploymentManagement/ClassificationReply', - request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) - self.QuestionAndAnswerReply = channel.unary_unary( - '/modelresponse.DeploymentManagement/QuestionAndAnswerReply', - request_serializer=modelresponse__pb2.QARequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) - self.FillMaskReply = channel.unary_unary( - '/modelresponse.DeploymentManagement/FillMaskReply', - request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) - self.TokenClassificationReply = channel.unary_unary( - '/modelresponse.DeploymentManagement/TokenClassificationReply', - request_serializer=modelresponse__pb2.SingleStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.SingleStringReply.FromString, - ) - self.ConversationalReply = channel.unary_unary( - '/modelresponse.DeploymentManagement/ConversationalReply', - request_serializer=modelresponse__pb2.ConversationRequest.SerializeToString, - response_deserializer=modelresponse__pb2.ConversationReply.FromString, - ) - self.Txt2ImgReply = channel.unary_unary( - '/modelresponse.DeploymentManagement/Txt2ImgReply', - request_serializer=modelresponse__pb2.MultiStringRequest.SerializeToString, - response_deserializer=modelresponse__pb2.ImageReply.FromString, - ) self.DeleteDeployment = channel.unary_unary( '/modelresponse.DeploymentManagement/DeleteDeployment', request_serializer=modelresponse__pb2.DeleteDeployRequest.SerializeToString, response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, ) - self.Terminate = channel.unary_unary( - '/modelresponse.DeploymentManagement/Terminate', - request_serializer=google_dot_protobuf_dot_empty__pb2.Empty. 
- SerializeToString, - response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - ) class DeploymentManagementServicer(object): @@ -548,72 +497,12 @@ def AddDeployment(self, request, context): context.set_details('Method not implemented!') raise NotImplementedError('Method not implemented!') - def CreateSession(self, request, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') - - def DestroySession(self, request, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') - - def GeneratorReply(self, request, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') - - def ClassificationReply(self, request, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') - - def QuestionAndAnswerReply(self, request, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') - - def FillMaskReply(self, request, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') - - def TokenClassificationReply(self, request, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') - - def ConversationalReply(self, request, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') - - def Txt2ImgReply(self, request, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') - def DeleteDeployment(self, request, context): """Missing associated documentation comment in .proto file.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) context.set_details('Method not implemented!') raise NotImplementedError('Method not implemented!') - def Terminate(self, request, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') - def add_DeploymentManagementServicer_to_server(servicer, server): rpc_method_handlers = { @@ -624,62 +513,6 @@ def add_DeploymentManagementServicer_to_server(servicer, server): response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. 
SerializeToString, ), - 'CreateSession': - grpc.unary_unary_rpc_method_handler( - servicer.CreateSession, - request_deserializer=modelresponse__pb2.SessionID.FromString, - response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. - SerializeToString, - ), - 'DestroySession': - grpc.unary_unary_rpc_method_handler( - servicer.DestroySession, - request_deserializer=modelresponse__pb2.SessionID.FromString, - response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. - SerializeToString, - ), - 'GeneratorReply': - grpc.unary_unary_rpc_method_handler( - servicer.GeneratorReply, - request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, - response_serializer=modelresponse__pb2.MultiStringReply.SerializeToString, - ), - 'ClassificationReply': - grpc.unary_unary_rpc_method_handler( - servicer.ClassificationReply, - request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'QuestionAndAnswerReply': - grpc.unary_unary_rpc_method_handler( - servicer.QuestionAndAnswerReply, - request_deserializer=modelresponse__pb2.QARequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'FillMaskReply': - grpc.unary_unary_rpc_method_handler( - servicer.FillMaskReply, - request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'TokenClassificationReply': - grpc.unary_unary_rpc_method_handler( - servicer.TokenClassificationReply, - request_deserializer=modelresponse__pb2.SingleStringRequest.FromString, - response_serializer=modelresponse__pb2.SingleStringReply.SerializeToString, - ), - 'ConversationalReply': - grpc.unary_unary_rpc_method_handler( - servicer.ConversationalReply, - request_deserializer=modelresponse__pb2.ConversationRequest.FromString, - response_serializer=modelresponse__pb2.ConversationReply.SerializeToString, - ), - 'Txt2ImgReply': - grpc.unary_unary_rpc_method_handler( - servicer.Txt2ImgReply, - request_deserializer=modelresponse__pb2.MultiStringRequest.FromString, - response_serializer=modelresponse__pb2.ImageReply.SerializeToString, - ), 'DeleteDeployment': grpc.unary_unary_rpc_method_handler( servicer.DeleteDeployment, @@ -687,13 +520,6 @@ def add_DeploymentManagementServicer_to_server(servicer, server): response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. SerializeToString, ), - 'Terminate': - grpc.unary_unary_rpc_method_handler( - servicer.Terminate, - request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, - response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. 
- SerializeToString, - ), } generic_handler = grpc.method_handlers_generic_handler( 'modelresponse.DeploymentManagement', @@ -730,240 +556,6 @@ def AddDeployment(request, timeout, metadata) - @staticmethod - def CreateSession(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, - target, - '/modelresponse.DeploymentManagement/CreateSession', - modelresponse__pb2.SessionID.SerializeToString, - google_dot_protobuf_dot_empty__pb2.Empty.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) - - @staticmethod - def DestroySession(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, - target, - '/modelresponse.DeploymentManagement/DestroySession', - modelresponse__pb2.SessionID.SerializeToString, - google_dot_protobuf_dot_empty__pb2.Empty.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) - - @staticmethod - def GeneratorReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, - target, - '/modelresponse.DeploymentManagement/GeneratorReply', - modelresponse__pb2.MultiStringRequest.SerializeToString, - modelresponse__pb2.MultiStringReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) - - @staticmethod - def ClassificationReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, - target, - '/modelresponse.DeploymentManagement/ClassificationReply', - modelresponse__pb2.SingleStringRequest.SerializeToString, - modelresponse__pb2.SingleStringReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) - - @staticmethod - def QuestionAndAnswerReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, - target, - '/modelresponse.DeploymentManagement/QuestionAndAnswerReply', - modelresponse__pb2.QARequest.SerializeToString, - modelresponse__pb2.SingleStringReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) - - @staticmethod - def FillMaskReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, - target, - '/modelresponse.DeploymentManagement/FillMaskReply', - modelresponse__pb2.SingleStringRequest.SerializeToString, - modelresponse__pb2.SingleStringReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - 
compression, - wait_for_ready, - timeout, - metadata) - - @staticmethod - def TokenClassificationReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, - target, - '/modelresponse.DeploymentManagement/TokenClassificationReply', - modelresponse__pb2.SingleStringRequest.SerializeToString, - modelresponse__pb2.SingleStringReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) - - @staticmethod - def ConversationalReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, - target, - '/modelresponse.DeploymentManagement/ConversationalReply', - modelresponse__pb2.ConversationRequest.SerializeToString, - modelresponse__pb2.ConversationReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) - - @staticmethod - def Txt2ImgReply(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, - target, - '/modelresponse.DeploymentManagement/Txt2ImgReply', - modelresponse__pb2.MultiStringRequest.SerializeToString, - modelresponse__pb2.ImageReply.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) - @staticmethod def DeleteDeployment(request, target, @@ -989,29 +581,3 @@ def DeleteDeployment(request, wait_for_ready, timeout, metadata) - - @staticmethod - def Terminate(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary( - request, - target, - '/modelresponse.DeploymentManagement/Terminate', - google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, - google_dot_protobuf_dot_empty__pb2.Empty.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata) From e1b6d230967e6727b2535bbd885f2ee1d9424ed3 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Fri, 28 Jul 2023 23:09:05 +0000 Subject: [PATCH 61/69] Fixing LB process not terminating --- mii/client.py | 36 +++++---------- mii/grpc_related/modelresponse_server.py | 3 ++ mii/grpc_related/proto/build_script.sh | 2 +- mii/grpc_related/proto/modelresponse.proto | 1 + mii/grpc_related/proto/modelresponse_pb2.py | 5 ++- .../proto/modelresponse_pb2_grpc.py | 45 +++++++++++++++++++ mii/models/score/score_template.py | 21 --------- mii/server.py | 13 ------ 8 files changed, 63 insertions(+), 63 deletions(-) diff --git a/mii/client.py b/mii/client.py index 1236876b..c1edd93f 100644 --- a/mii/client.py +++ b/mii/client.py @@ -19,19 +19,6 @@ def _get_deployment_configs(deployment_tag): configs = mii.utils.import_score_file(deployment_tag).configs for deployment in configs[mii.constants.DEPLOYMENTS_KEY].values(): deployment_name = deployment[mii.constants.DEPLOYMENT_NAME_KEY] - """ - data = { - 'deployment_name': deployment[mii.constants.DEPLOYMENT_NAME_KEY], - 
'task': deployment[mii.constants.TASK_NAME_KEY], - 'model': deployment[mii.constants.MODEL_NAME_KEY], - 'enable_deepspeed': deployment[mii.constants.ENABLE_DEEPSPEED_KEY], - 'enable_zero': deployment[mii.constants.ENABLE_DEEPSPEED_ZERO_KEY], - 'GPU_index_map': deployment[mii.constants.GPU_INDEX_KEY], - 'mii_config': deployment[mii.constants.MII_CONFIGS_KEY], - 'ds_config': deployment[mii.constants.DEEPSPEED_CONFIG_KEY], - 'version': 1, - } - """ deployments[deployment_name] = DeploymentConfig(**deployment) lb_config = configs.get(mii.constants.LOAD_BALANCER_CONFIG_KEY) model_path = configs[mii.constants.MODEL_PATH_KEY] @@ -103,16 +90,10 @@ def _get_deployment_task(self, deployment_name=None): assert len(self.deployments) == 1, "Must pass deployment_name to query when using multiple deployments" deployment = next(iter(self.deployments.values())) deployment_name = getattr(deployment, mii.constants.DEPLOYMENT_NAME_KEY) - #task = get_task(deployment.task) if isinstance(deployment.task, - #str) else deployment.task task = getattr(deployment, mii.constants.TASK_NAME_KEY) else: if deployment_name in self.deployments: deployment = self.deployments[deployment_name] - """ - task = get_task(deployment.task) if isinstance(deployment.task, - str) else deployment.task - """ task = getattr(deployment, mii.constants.TASK_NAME_KEY) else: assert False, f"{deployment_name} not found in list of deployments" @@ -137,7 +118,7 @@ def query(self, request_dict, **query_kwargs): **query_kwargs)) async def terminate_async(self): - await self.mr_stub.Terminate( + await self.lb_stub.Terminate( modelresponse_pb2.google_dot_protobuf_dot_empty__pb2.Empty()) def terminate(self): @@ -186,6 +167,15 @@ def __init__(self, self.port_map = port_map if port_map is not None else {} self.deployment_tag = deployment_tag + """ + async def terminate_async(self): + await self.lb_stub.Terminate( + modelresponse_pb2.google_dot_protobuf_dot_empty__pb2.Empty()) + + def terminate(self): + self.asyncio_loop.run_until_complete(self.terminate_async()) + """ + async def add_models_async(self, proto_request): await getattr(self.lb_stub, "AddDeployment")(proto_request) @@ -223,8 +213,6 @@ def add_models(self, mii.constants.DEPLOYMENT_NAME_KEY): deployment for deployment in deployments } - #for deployment in deployments: - # deployment.task = get_task(deployment.task) lb_config, self.port_map = allocate_processes(deps, self.port_map) lb_enabled = True if len(self.deployments) else False if self.lb_config is not None: @@ -238,10 +226,6 @@ def add_models(self, self.model_path = mii.constants.MII_MODEL_PATH_DEFAULT elif self.model_path is None and deployment_type == DeploymentType.AML: model_path = "model" - #for deployment in self.deployments.values(): - #if isinstance(deployment.task, str): - #deployment.task = get_task(deployment.task) - #lb_enabled = True if len(self.deployments) else False create_score_file(deployment_tag=self.deployment_tag, deployment_type=deployment_type, deployments=deps, diff --git a/mii/grpc_related/modelresponse_server.py b/mii/grpc_related/modelresponse_server.py index 026d4268..4aa485dc 100644 --- a/mii/grpc_related/modelresponse_server.py +++ b/mii/grpc_related/modelresponse_server.py @@ -35,6 +35,9 @@ def get_stop_event(self): class DeploymentManagement(ServiceBase, modelresponse_pb2_grpc.DeploymentManagementServicer): + def __init__(self): + ServiceBase.__init__(self) + def AddDeployment(self, request, context): return google_dot_protobuf_dot_empty__pb2.Empty() diff --git a/mii/grpc_related/proto/build_script.sh 
b/mii/grpc_related/proto/build_script.sh index 9aaf3bd2..d8615a85 100644 --- a/mii/grpc_related/proto/build_script.sh +++ b/mii/grpc_related/proto/build_script.sh @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 # DeepSpeed Team -python -m grpc_tools.protoc -I./ --python_out=. --grpc_python_out=. ./modelresponse.proto +python3 -m grpc_tools.protoc -I./ --python_out=. --grpc_python_out=. ./modelresponse.proto # update import to be global wrt mii sed -i 's/modelresponse_pb2/mii.grpc_related.proto.modelresponse_pb2/g' modelresponse_pb2_grpc.py diff --git a/mii/grpc_related/proto/modelresponse.proto b/mii/grpc_related/proto/modelresponse.proto index fc8a108f..146e1f30 100644 --- a/mii/grpc_related/proto/modelresponse.proto +++ b/mii/grpc_related/proto/modelresponse.proto @@ -39,6 +39,7 @@ service ModelResponse { service DeploymentManagement { rpc AddDeployment(AddDeployRequest) returns (google.protobuf.Empty) {} rpc DeleteDeployment(DeleteDeployRequest) returns (google.protobuf.Empty) {} + rpc Terminate (google.protobuf.Empty) returns (google.protobuf.Empty) {} } message Value { diff --git a/mii/grpc_related/proto/modelresponse_pb2.py b/mii/grpc_related/proto/modelresponse_pb2.py index fe37da18..72c33ed8 100644 --- a/mii/grpc_related/proto/modelresponse_pb2.py +++ b/mii/grpc_related/proto/modelresponse_pb2.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 # DeepSpeed Team +# -*- coding: utf-8 -*- # Generated by the protocol buffer compiler. DO NOT EDIT! # source: modelresponse.proto """Generated protocol buffer code.""" @@ -16,7 +17,7 @@ from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x42\x0e\n\x0coneof_values\"\x1f\n\tSessionID\x12\x12\n\nsession_id\x18\x01 \x01(\t\"\xed\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\x85\x01\n\x11SingleStringReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x84\x01\n\x10MultiStringReply\x12\x10\n\x08response\x18\x01 \x03(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\tQARequest\x12\x10\n\x08question\x18\x01 \x01(\t\x12\x0f\n\x07\x63ontext\x18\x02 \x01(\t\x12?\n\x0cquery_kwargs\x18\x03 
\x03(\x0b\x32).modelresponse.QARequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xd3\x02\n\x13\x43onversationRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x1c\n\x0f\x63onversation_id\x18\x02 \x01(\x03H\x00\x88\x01\x01\x12\x18\n\x10past_user_inputs\x18\x03 \x03(\t\x12\x1b\n\x13generated_responses\x18\x04 \x03(\t\x12I\n\x0cquery_kwargs\x18\x05 \x03(\x0b\x32\x33.modelresponse.ConversationRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x01\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_conversation_idB\x12\n\x10_deployment_name\"\xc3\x01\n\x11\x43onversationReply\x12\x17\n\x0f\x63onversation_id\x18\x01 \x01(\x03\x12\x18\n\x10past_user_inputs\x18\x02 \x03(\t\x12\x1b\n\x13generated_responses\x18\x03 \x03(\t\x12\x12\n\ntime_taken\x18\x04 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x05 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xaf\x01\n\nImageReply\x12\x0e\n\x06images\x18\x01 \x03(\x0c\x12\x1d\n\x15nsfw_content_detected\x18\x02 \x03(\x08\x12\x0c\n\x04mode\x18\x03 \x01(\t\x12\x0e\n\x06size_w\x18\x04 \x01(\x03\x12\x0e\n\x06size_h\x18\x05 \x01(\x03\x12\x12\n\ntime_taken\x18\x06 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x07 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x98\x01\n\x10\x41\x64\x64\x44\x65ployRequest\x12\x0c\n\x04task\x18\x01 \x01(\t\x12\x17\n\x0f\x64\x65ployment_name\x18\x02 \x01(\t\x12\x10\n\x08hostname\x18\x03 \x01(\t\x12\x1d\n\x15tensor_parallel_ports\x18\x04 \x03(\x03\x12\x17\n\x0ftorch_dist_port\x18\x05 \x01(\x03\x12\x13\n\x0bgpu_indices\x18\x06 \x03(\x03\".\n\x13\x44\x65leteDeployRequest\x12\x17\n\x0f\x64\x65ployment_name\x18\x01 \x01(\t2\xd4\x06\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x32\xb4\x01\n\x14\x44\x65ploymentManagement\x12J\n\rAddDeployment\x12\x1f.modelresponse.AddDeployRequest\x1a\x16.google.protobuf.Empty\"\x00\x12P\n\x10\x44\x65leteDeployment\x12\".modelresponse.DeleteDeployRequest\x1a\x16.google.protobuf.Empty\"\x00\x62\x06proto3' + b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"_\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 
\x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x42\x0e\n\x0coneof_values\"\x1f\n\tSessionID\x12\x12\n\nsession_id\x18\x01 \x01(\t\"\xed\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x03 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\x85\x01\n\x11SingleStringReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x84\x01\n\x10MultiStringReply\x12\x10\n\x08response\x18\x01 \x03(\t\x12\x12\n\ntime_taken\x18\x02 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x03 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xeb\x01\n\tQARequest\x12\x10\n\x08question\x18\x01 \x01(\t\x12\x0f\n\x07\x63ontext\x18\x02 \x01(\t\x12?\n\x0cquery_kwargs\x18\x03 \x03(\x0b\x32).modelresponse.QARequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x04 \x01(\tH\x00\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_deployment_name\"\xd3\x02\n\x13\x43onversationRequest\x12\x0c\n\x04text\x18\x01 \x01(\t\x12\x1c\n\x0f\x63onversation_id\x18\x02 \x01(\x03H\x00\x88\x01\x01\x12\x18\n\x10past_user_inputs\x18\x03 \x03(\t\x12\x1b\n\x13generated_responses\x18\x04 \x03(\t\x12I\n\x0cquery_kwargs\x18\x05 \x03(\x0b\x32\x33.modelresponse.ConversationRequest.QueryKwargsEntry\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x01\x88\x01\x01\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\x42\x12\n\x10_conversation_idB\x12\n\x10_deployment_name\"\xc3\x01\n\x11\x43onversationReply\x12\x17\n\x0f\x63onversation_id\x18\x01 \x01(\x03\x12\x18\n\x10past_user_inputs\x18\x02 \x03(\t\x12\x1b\n\x13generated_responses\x18\x03 \x03(\t\x12\x12\n\ntime_taken\x18\x04 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x05 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x06 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\xaf\x01\n\nImageReply\x12\x0e\n\x06images\x18\x01 \x03(\x0c\x12\x1d\n\x15nsfw_content_detected\x18\x02 \x03(\x08\x12\x0c\n\x04mode\x18\x03 \x01(\t\x12\x0e\n\x06size_w\x18\x04 \x01(\x03\x12\x0e\n\x06size_h\x18\x05 \x01(\x03\x12\x12\n\ntime_taken\x18\x06 \x01(\x02\x12\x1c\n\x0f\x64\x65ployment_name\x18\x07 \x01(\tH\x00\x88\x01\x01\x42\x12\n\x10_deployment_name\"\x98\x01\n\x10\x41\x64\x64\x44\x65ployRequest\x12\x0c\n\x04task\x18\x01 \x01(\t\x12\x17\n\x0f\x64\x65ployment_name\x18\x02 \x01(\t\x12\x10\n\x08hostname\x18\x03 \x01(\t\x12\x1d\n\x15tensor_parallel_ports\x18\x04 \x03(\x03\x12\x17\n\x0ftorch_dist_port\x18\x05 
\x01(\x03\x12\x13\n\x0bgpu_indices\x18\x06 \x03(\x03\".\n\x13\x44\x65leteDeployRequest\x12\x17\n\x0f\x64\x65ployment_name\x18\x01 \x01(\t2\xd4\x06\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12\x43\n\rCreateSession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12\x44\n\x0e\x44\x65stroySession\x12\x18.modelresponse.SessionID\x1a\x16.google.protobuf.Empty\"\x00\x12V\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a\x1f.modelresponse.MultiStringReply\"\x00\x12]\n\x13\x43lassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12V\n\x16QuestionAndAnswerReply\x12\x18.modelresponse.QARequest\x1a .modelresponse.SingleStringReply\"\x00\x12W\n\rFillMaskReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12\x62\n\x18TokenClassificationReply\x12\".modelresponse.SingleStringRequest\x1a .modelresponse.SingleStringReply\"\x00\x12]\n\x13\x43onversationalReply\x12\".modelresponse.ConversationRequest\x1a .modelresponse.ConversationReply\"\x00\x12N\n\x0cTxt2ImgReply\x12!.modelresponse.MultiStringRequest\x1a\x19.modelresponse.ImageReply\"\x00\x32\xf3\x01\n\x14\x44\x65ploymentManagement\x12J\n\rAddDeployment\x12\x1f.modelresponse.AddDeployRequest\x1a\x16.google.protobuf.Empty\"\x00\x12P\n\x10\x44\x65leteDeployment\x12\".modelresponse.DeleteDeployRequest\x1a\x16.google.protobuf.Empty\"\x00\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x62\x06proto3' ) _globals = globals() @@ -68,5 +69,5 @@ _globals['_MODELRESPONSE']._serialized_start = 2106 _globals['_MODELRESPONSE']._serialized_end = 2958 _globals['_DEPLOYMENTMANAGEMENT']._serialized_start = 2961 - _globals['_DEPLOYMENTMANAGEMENT']._serialized_end = 3141 + _globals['_DEPLOYMENTMANAGEMENT']._serialized_end = 3204 # @@protoc_insertion_point(module_scope) diff --git a/mii/grpc_related/proto/modelresponse_pb2_grpc.py b/mii/grpc_related/proto/modelresponse_pb2_grpc.py index e90d037d..5334f127 100644 --- a/mii/grpc_related/proto/modelresponse_pb2_grpc.py +++ b/mii/grpc_related/proto/modelresponse_pb2_grpc.py @@ -487,6 +487,12 @@ def __init__(self, channel): request_serializer=modelresponse__pb2.DeleteDeployRequest.SerializeToString, response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, ) + self.Terminate = channel.unary_unary( + '/modelresponse.DeploymentManagement/Terminate', + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty. + SerializeToString, + response_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + ) class DeploymentManagementServicer(object): @@ -503,6 +509,12 @@ def DeleteDeployment(self, request, context): context.set_details('Method not implemented!') raise NotImplementedError('Method not implemented!') + def Terminate(self, request, context): + """Missing associated documentation comment in .proto file.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + def add_DeploymentManagementServicer_to_server(servicer, server): rpc_method_handlers = { @@ -520,6 +532,13 @@ def add_DeploymentManagementServicer_to_server(servicer, server): response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. 
SerializeToString, ), + 'Terminate': + grpc.unary_unary_rpc_method_handler( + servicer.Terminate, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, + response_serializer=google_dot_protobuf_dot_empty__pb2.Empty. + SerializeToString, + ), } generic_handler = grpc.method_handlers_generic_handler( 'modelresponse.DeploymentManagement', @@ -581,3 +600,29 @@ def DeleteDeployment(request, wait_for_ready, timeout, metadata) + + @staticmethod + def Terminate(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary( + request, + target, + '/modelresponse.DeploymentManagement/Terminate', + google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, + google_dot_protobuf_dot_empty__pb2.Empty.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata) diff --git a/mii/models/score/score_template.py b/mii/models/score/score_template.py index c4905f3c..7c8208b8 100644 --- a/mii/models/score/score_template.py +++ b/mii/models/score/score_template.py @@ -20,28 +20,7 @@ def init(): deployments = [] lb_enabled = configs[mii.constants.DEPLOYED_KEY] for deployment in configs[mii.constants.DEPLOYMENTS_KEY].values(): - """ - data = { - 'deployment_name': deployment[mii.constants.DEPLOYMENT_NAME_KEY], - 'task': deployment[mii.constants.TASK_NAME_KEY], - 'model': deployment[mii.constants.MODEL_NAME_KEY], - 'enable_deepspeed': deployment[mii.constants.ENABLE_DEEPSPEED_KEY], - 'enable_zero': deployment[mii.constants.ENABLE_DEEPSPEED_ZERO_KEY], - 'GPU_index_map': deployment[mii.constants.GPU_INDEX_KEY], - 'mii_config': deployment[mii.constants.MII_CONFIGS_KEY], - 'ds_config': deployment[mii.constants.DEEPSPEED_CONFIG_KEY], - 'version': 1 - } - """ deployments.append(mii.DeploymentConfig(**deployment)) - """ - deployment_name = configs[mii.constants.DEPLOYMENT_NAME_KEY] - model_name = configs[mii.constants.MODEL_NAME_KEY] - task_name = configs[mii.constants.TASK_NAME_KEY] - - assert model_name is not None, "The model name should be set before calling init" - assert task_name is not None, "The task name should be set before calling init" - """ mii.MIIServer(deployment_tag, deployments, diff --git a/mii/server.py b/mii/server.py index 4fef14f4..30d69363 100644 --- a/mii/server.py +++ b/mii/server.py @@ -166,19 +166,6 @@ def _launch_load_balancer(self, model_path, lb_config): mii_env["TRANSFORMERS_CACHE"] = model_path logger.info(f"load balancer server launch: {cmd}") return subprocess.Popen(cmd, env=mii_env) - """ - return self._launch_server_process( - deployment_name, - model_name, - model_path, - ds_optimize, - ds_zero, - ds_config, - mii_configs, - mii_configs.port_number, - "load balancer", - ex_server_args=[f"--load-balancer {b64_config_str}"]) - """ def _launch_restful_gateway(self, deployment_name, From 1675bd8474f25ad96ff8ed7fa19a2860d203f8b3 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Mon, 31 Jul 2023 20:41:09 +0000 Subject: [PATCH 62/69] Adding multi_deployment and partial deploy/terminate unit tests --- tests/conftest.py | 99 ++++++++++++++++++++++++++++++++++ tests/test_multi_deployment.py | 35 ++++++++++++ 2 files changed, 134 insertions(+) create mode 100644 tests/test_multi_deployment.py diff --git a/tests/conftest.py b/tests/conftest.py index cb812069..ed224bee 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -108,6 +108,55 @@ def 
ds_config(request): return request.param +@pytest.fixture(scope="function", params=["Multi_Model_Tag"]) +def deployment_tag(request): + return request.param + + +@pytest.fixture(scope="function", params=[[]]) +def deployments(request): + ret = {} + gpu_index_map1 = {'master': [0]} + gpu_index_map2 = {'master': [1]} + gpu_index_map3 = {'master': [0, 1]} + + deployments = [] + + mii_configs1 = {"tensor_parallel": 2, "dtype": "fp16"} + mii_configs2 = {"tensor_parallel": 1} + + name = "bigscience/bloom-560m" + deployments.append( + mii.DeploymentConfig(task='text-generation', + model=name, + deployment_name=name + "_deployment", + GPU_index_map=gpu_index_map3, + mii_configs=mii.config.MIIConfig(**mii_configs1))) + + name = "microsoft/DialogRPT-human-vs-rand" + deployments.append( + mii.DeploymentConfig(task='text-classification', + model=name, + deployment_name=name + "_deployment", + GPU_index_map=gpu_index_map2)) + + name = "microsoft/DialoGPT-large" + deployments.append( + mii.DeploymentConfig(task='conversational', + model=name, + deployment_name=name + "_deployment", + GPU_index_map=gpu_index_map1, + mii_configs=mii.config.MIIConfig(**mii_configs2))) + + name = "deepset/roberta-large-squad2" + deployments.append( + mii.DeploymentConfig(task="question-answering", + model=name, + deployment_name=name + "-qa-deployment", + GPU_index_map=gpu_index_map2)) + return deployments + + @pytest.fixture(scope="function") def deployment_config(task_name: str, model_name: str, @@ -130,6 +179,19 @@ def deployment_config(task_name: str, return config +@pytest.fixture(scope="function") +def multi_deployment_config(deployments: list, + deployment_tag: str, + deployment_type: str): + config = SimpleNamespace(deployments=deployments, + deployment_type=deployment_type, + deployment_tag=deployment_tag, + model_path=os.getenv("TRANSFORMERS_CACHE", + None)) + validate_config(config) + return config + + @pytest.fixture(scope="function", params=[None]) def expected_failure(request): return request.param @@ -147,6 +209,43 @@ def deployment(deployment_config, expected_failure): mii.terminate(deployment_config.deployment_name) +@pytest.fixture(scope="function") +def multi_deployment(deployment_tag, multi_deployment_config): + mii.deploy(**multi_deployment_config.__dict__) + yield multi_deployment_config + mii.terminate(deployment_tag) + + @pytest.fixture(scope="function", params=[{"query": "DeepSpeed is the greatest"}]) def query(request): return request.param + + +@pytest.fixture(scope="function") +def multi_query(request): + queries = [] + queries.append({ + "query": ["DeepSpeed is", + "Seattle is"], + "deployment_name": "bloom-560m_deployment" + }) + + queries.append({ + 'query': "DeepSpeed is the greatest", + "deployment_name": "microsoft/DialogRPT-human-vs-rand_deployment" + }) + + queries.append({ + 'text': "DeepSpeed is the greatest", + 'conversation_id': 3, + 'past_user_inputs': [], + 'generated_responses': [], + "deployment_name": "microsoft/DialoGPT-large_deployment" + }) + + queries.append({ + 'question': "What is the greatest?", + 'context': "DeepSpeed is the greatest", + "deployment_name": "deepset/roberta-large-squad2" + "-qa-deployment" + }) + return queries diff --git a/tests/test_multi_deployment.py b/tests/test_multi_deployment.py new file mode 100644 index 00000000..9caa9828 --- /dev/null +++ b/tests/test_multi_deployment.py @@ -0,0 +1,35 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +import pytest +import mii + + +def test_multi_deploy(deployment_tag, multi_deployment, multi_query): + generator = mii.mii_query_handle(deployment_tag) + for query in multi_query: + result = generator.query(query) + assert result + + +@pytest.mark.parametrize( + "task_name, model_name, query", + [ + ( + "text-generation", + "bigscience/bloom-560m", + { + "query": ["DeepSpeed is the greatest", + 'Seattle is'] + }, + ), + ], +) +def test_partial_deploy(deployment_tag, multi_deployment, deployment_config, query): + generator = mii.mii_query_handle(deployment_tag) + generator.add_models(**deployment_config.__dict__) + query["deployment_name"] = deployment_config.deployment_name + result = generator.query(query) + generator.delete_model(deployment_config.deployment_name) + assert result From 8684a61ae65d166d1e25fa363941489313bc65ab Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Mon, 31 Jul 2023 21:01:49 +0000 Subject: [PATCH 63/69] Removing comments --- mii/client.py | 9 --------- mii/constants.py | 2 +- mii/terminate.py | 2 +- 3 files changed, 2 insertions(+), 11 deletions(-) diff --git a/mii/client.py b/mii/client.py index c1edd93f..e2eaaa38 100644 --- a/mii/client.py +++ b/mii/client.py @@ -167,15 +167,6 @@ def __init__(self, self.port_map = port_map if port_map is not None else {} self.deployment_tag = deployment_tag - """ - async def terminate_async(self): - await self.lb_stub.Terminate( - modelresponse_pb2.google_dot_protobuf_dot_empty__pb2.Empty()) - - def terminate(self): - self.asyncio_loop.run_until_complete(self.terminate_async()) - """ - async def add_models_async(self, proto_request): await getattr(self.lb_stub, "AddDeployment")(proto_request) diff --git a/mii/constants.py b/mii/constants.py index f4860cc9..f2dced4e 100644 --- a/mii/constants.py +++ b/mii/constants.py @@ -103,7 +103,7 @@ class ModelProvider(enum.Enum): CHECKPOINT_KEY = "checkpoint" DEPLOYED_KEY = "deployed" VERSION_KEY = "version" -MII_TERMINATE_DEP_NAME = "__MII_TERMINATE_CALL__" +MII_TERMINATE_DEP_KEY = "__MII_TERMINATE_CALL__" MII_CACHE_PATH = "MII_CACHE_PATH" MII_CACHE_PATH_DEFAULT = "/tmp/mii_cache" diff --git a/mii/terminate.py b/mii/terminate.py index 77df55ff..5585832b 100644 --- a/mii/terminate.py +++ b/mii/terminate.py @@ -14,7 +14,7 @@ def terminate(deployment_tag): generator.terminate() return try: - generator.query({'query': ''}, mii.constants.MII_TERMINATE_DEP_NAME) + generator.query({'query': ''}, mii.constants.MII_TERMINATE_DEP_KEY) except grpc.aio._call.AioRpcError as error: if error._code == grpc.StatusCode.UNAVAILABLE: mii.utils.logger.warn(f"Server for {deployment_tag} not found") From 56a7fcede7bda7eef2a86e6917cca49f1252bdba Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Tue, 1 Aug 2023 18:31:26 +0000 Subject: [PATCH 64/69] Fixing spelling issues --- mii/client.py | 2 +- tests/conftest.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mii/client.py b/mii/client.py index e2eaaa38..2e60149c 100644 --- a/mii/client.py +++ b/mii/client.py @@ -86,7 +86,7 @@ def __init__(self, deployments, host, port): def _get_deployment_task(self, deployment_name=None): task = None - if deployment_name is None or deployment_name == mii.constants.MII_TERMINATE_DEP_NAME: #mii.terminate() or single model + if deployment_name is None or deployment_name == mii.constants.MII_TERMINATE_DEP_KEY: #mii.terminate() or single model assert len(self.deployments) == 1, "Must pass deployment_name to query when using multiple deployments" 
deployment = next(iter(self.deployments.values())) deployment_name = getattr(deployment, mii.constants.DEPLOYMENT_NAME_KEY) diff --git a/tests/conftest.py b/tests/conftest.py index ed224bee..29be37be 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -227,7 +227,7 @@ def multi_query(request): queries.append({ "query": ["DeepSpeed is", "Seattle is"], - "deployment_name": "bloom-560m_deployment" + "deployment_name": "bigscience/bloom-560m_deployment" }) queries.append({ From fb70c3db2bd6bba91c42fe8defb0c8450d5abff0 Mon Sep 17 00:00:00 2001 From: TosinSeg <90005810+TosinSeg@users.noreply.github.com> Date: Tue, 1 Aug 2023 14:41:58 -0700 Subject: [PATCH 65/69] Update mii/client.py Co-authored-by: Michael Wyatt --- mii/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mii/client.py b/mii/client.py index 2e60149c..c795fef2 100644 --- a/mii/client.py +++ b/mii/client.py @@ -17,7 +17,7 @@ def _get_deployment_configs(deployment_tag): deployments = {} configs = mii.utils.import_score_file(deployment_tag).configs - for deployment in configs[mii.constants.DEPLOYMENTS_KEY].values(): + for deployment in configs.get(mii.constants.DEPLOYMENTS_KEY).values(): deployment_name = deployment[mii.constants.DEPLOYMENT_NAME_KEY] deployments[deployment_name] = DeploymentConfig(**deployment) lb_config = configs.get(mii.constants.LOAD_BALANCER_CONFIG_KEY) From e2cfe8a60e1528d3db4a4c4cfdb916d97d2b3275 Mon Sep 17 00:00:00 2001 From: TosinSeg <90005810+TosinSeg@users.noreply.github.com> Date: Tue, 1 Aug 2023 14:42:12 -0700 Subject: [PATCH 66/69] Update mii/client.py Co-authored-by: Michael Wyatt --- mii/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mii/client.py b/mii/client.py index c795fef2..a4f717d0 100644 --- a/mii/client.py +++ b/mii/client.py @@ -21,7 +21,7 @@ def _get_deployment_configs(deployment_tag): deployment_name = deployment[mii.constants.DEPLOYMENT_NAME_KEY] deployments[deployment_name] = DeploymentConfig(**deployment) lb_config = configs.get(mii.constants.LOAD_BALANCER_CONFIG_KEY) - model_path = configs[mii.constants.MODEL_PATH_KEY] + model_path = configs.get(mii.constants.MODEL_PATH_KEY) port_map = configs.get(mii.constants.PORT_MAP_KEY) return deployments, lb_config, model_path, port_map From 1312738637fa00665cd53cd421cfb351f8ffd4af Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Tue, 1 Aug 2023 22:00:55 +0000 Subject: [PATCH 67/69] Removing AML from addDeploy --- mii/client.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mii/client.py b/mii/client.py index a4f717d0..ab459cfa 100644 --- a/mii/client.py +++ b/mii/client.py @@ -87,7 +87,8 @@ def __init__(self, deployments, host, port): def _get_deployment_task(self, deployment_name=None): task = None if deployment_name is None or deployment_name == mii.constants.MII_TERMINATE_DEP_KEY: #mii.terminate() or single model - assert len(self.deployments) == 1, "Must pass deployment_name to query when using multiple deployments" + if deployment_name is None: + assert len(self.deployments) == 1, "Must pass deployment_name to query when using multiple deployments" deployment = next(iter(self.deployments.values())) deployment_name = getattr(deployment, mii.constants.DEPLOYMENT_NAME_KEY) task = getattr(deployment, mii.constants.TASK_NAME_KEY) @@ -182,7 +183,7 @@ def add_models(self, deployment_type=DeploymentType.LOCAL, model_path=None, version=1): - + assert deployment_type != DeploymentType.AML, "Cannot currently add models to AML deployment" _, deployments = 
validate_deployment(task=task, model=model, deployment_name=deployment_name, @@ -215,8 +216,7 @@ def add_models(self, mii.constants.DEPLOYMENT_NAME_KEY)] = deployment if self.model_path is None and deployment_type == DeploymentType.LOCAL: self.model_path = mii.constants.MII_MODEL_PATH_DEFAULT - elif self.model_path is None and deployment_type == DeploymentType.AML: - model_path = "model" + create_score_file(deployment_tag=self.deployment_tag, deployment_type=deployment_type, deployments=deps, From b0f0da4f2f35648edab60a5120a5620ebac93941 Mon Sep 17 00:00:00 2001 From: Tosin Segun Date: Wed, 2 Aug 2023 19:35:20 +0000 Subject: [PATCH 68/69] Refactoring MIIConfig and DeploymentConfig --- examples/multi_model/deploy.py | 14 +++-- mii/client.py | 72 ++++++++++++---------- mii/config.py | 94 ++++++++++++++++++++++++----- mii/constants.py | 1 + mii/deployment.py | 38 +++++++----- mii/launch/multi_gpu_server.py | 10 +++- mii/models/score/generate.py | 44 ++++++++++++-- mii/models/score/score_template.py | 8 ++- mii/server.py | 96 +++++++++++++++++------------- 9 files changed, 256 insertions(+), 121 deletions(-) diff --git a/examples/multi_model/deploy.py b/examples/multi_model/deploy.py index c0b93b56..525b2da3 100644 --- a/examples/multi_model/deploy.py +++ b/examples/multi_model/deploy.py @@ -19,7 +19,8 @@ model=name, deployment_name=name + "_deployment", GPU_index_map=gpu_index_map3, - mii_configs=mii.config.MIIConfig(**mii_configs1))) + tensor_parallel=2, + dtype="fp16")) # gpt2 name = "microsoft/DialogRPT-human-vs-rand" @@ -31,11 +32,12 @@ name = "microsoft/DialoGPT-large" deployments.append( - mii.DeploymentConfig(task='conversational', - model=name, - deployment_name=name + "_deployment", - GPU_index_map=gpu_index_map1, - mii_configs=mii.config.MIIConfig(**mii_configs2))) + mii.DeploymentConfig( + task='conversational', + model=name, + deployment_name=name + "_deployment", + GPU_index_map=gpu_index_map1, + )) name = "deepset/roberta-large-squad2" deployments.append( diff --git a/mii/client.py b/mii/client.py index ab459cfa..d478f7f2 100644 --- a/mii/client.py +++ b/mii/client.py @@ -10,8 +10,8 @@ from mii.grpc_related.proto import modelresponse_pb2, modelresponse_pb2_grpc from mii.constants import GRPC_MAX_MSG_SIZE, Tasks, DeploymentType from mii.method_table import GRPC_METHOD_TABLE -from mii.deployment import allocate_processes, create_score_file, validate_deployment -from mii.config import DeploymentConfig +from mii.deployment import allocate_processes +from mii.config import DeploymentConfig, MIIConfig def _get_deployment_configs(deployment_tag): @@ -23,7 +23,9 @@ def _get_deployment_configs(deployment_tag): lb_config = configs.get(mii.constants.LOAD_BALANCER_CONFIG_KEY) model_path = configs.get(mii.constants.MODEL_PATH_KEY) port_map = configs.get(mii.constants.PORT_MAP_KEY) - return deployments, lb_config, model_path, port_map + deployment_type = configs.get(mii.constants.DEPLOYMENT_TYPE_KEY) + mii_configs = MIIConfig(**configs.get(mii.constants.MII_CONFIGS_KEY)) + return deployments, lb_config, model_path, port_map, deployment_type, mii_configs def mii_query_handle(deployment_tag): @@ -43,15 +45,17 @@ def mii_query_handle(deployment_tag): inference_pipeline, task = mii.non_persistent_models[deployment_tag] return MIINonPersistentClient(task, deployment_tag) - deployments, lb_config, model_path, port_map = _get_deployment_configs(deployment_tag) - mii_configs = None + deployments, lb_config, model_path, port_map, deployment_type, mii_configs = _get_deployment_configs(deployment_tag) + 
"""mii_configs = None if len(deployments) > 0: mii_configs = getattr(next(iter(deployments.values())), mii.constants.MII_CONFIGS_KEY) + """ port_number = None if mii_configs == None else mii_configs.port_number - if port_number: + """if port_number: for deployment in deployments.values(): assert getattr(deployment, mii.constants.MII_CONFIGS_KEY).port_number == port_number, f"All port numbers is each deployments mii_configs must match" + """ return LBClient(deployments, "localhost", @@ -59,7 +63,9 @@ def mii_query_handle(deployment_tag): lb_config, model_path, port_map, - deployment_tag) + deployment_tag, + deployment_type, + mii_configs) def create_channel(host, port): @@ -157,7 +163,9 @@ def __init__(self, lb_config=None, model_path=None, port_map=None, - deployment_tag=None): + deployment_tag=None, + deployment_type=DeploymentType.LOCAL, + mii_configs={}): super().__init__(deployments, host, port) self.lb_stub = None if port is not None: @@ -167,24 +175,15 @@ def __init__(self, self.model_path = model_path self.port_map = port_map if port_map is not None else {} self.deployment_tag = deployment_tag + self.deployment_type = deployment_type + self.mii_configs = mii_configs async def add_models_async(self, proto_request): await getattr(self.lb_stub, "AddDeployment")(proto_request) - def add_models(self, - task=None, - model=None, - deployment_name=None, - enable_deepspeed=True, - enable_zero=False, - ds_config=None, - mii_config={}, - deployments=[], - deployment_type=DeploymentType.LOCAL, - model_path=None, - version=1): - assert deployment_type != DeploymentType.AML, "Cannot currently add models to AML deployment" - _, deployments = validate_deployment(task=task, + def add_models(self, deployments=[], model_path=None, version=1): + assert self.deployment_type != DeploymentType.AML, "Cannot currently add models to AML deployment" + """_, deployments = validate_deployment(task=task, model=model, deployment_name=deployment_name, enable_deepspeed=enable_deepspeed, @@ -196,7 +195,7 @@ def add_models(self, deployment_type=deployment_type, model_path=model_path, version=version) - + """ if not deployments: #Empty deployment return None @@ -205,7 +204,7 @@ def add_models(self, mii.constants.DEPLOYMENT_NAME_KEY): deployment for deployment in deployments } - lb_config, self.port_map = allocate_processes(deps, self.port_map) + lb_config, self.port_map = allocate_processes(deps, self.port_map, self.mii_configs) lb_enabled = True if len(self.deployments) else False if self.lb_config is not None: self.lb_config.replica_configs.extend(lb_config.replica_configs) @@ -214,21 +213,30 @@ def add_models(self, for deployment in deployments: self.deployments[getattr(deployment, mii.constants.DEPLOYMENT_NAME_KEY)] = deployment - if self.model_path is None and deployment_type == DeploymentType.LOCAL: + if self.model_path is None and self.deployment_type == DeploymentType.LOCAL: self.model_path = mii.constants.MII_MODEL_PATH_DEFAULT - - create_score_file(deployment_tag=self.deployment_tag, + """create_score_file(deployment_tag=self.deployment_tag, deployment_type=deployment_type, deployments=deps, model_path=self.model_path, port_map=self.port_map, lb_config=lb_config, deployed=lb_enabled) + if deployment_type == DeploymentType.LOCAL: mii.utils.import_score_file(self.deployment_tag).init() + """ + if not self.mii_configs: + self.mii_configs = mii.configs.MIIConfigs(**{}) + mii.MIIServer(self.deployment_tag, + deps.values(), + self.model_path, + lb_config=lb_config, + lb_enabled=lb_enabled, + 
mii_configs=self.mii_configs) + if self.lb_stub is None: - self.port_number = getattr(next(iter(self.deployments.values())), - mii.constants.MII_CONFIGS_KEY).port_number + self.port_number = self.mii_configs.port_number self.channel = create_channel(self.host, self.port_number) self.lb_stub = modelresponse_pb2_grpc.DeploymentManagementStub(self.channel) if not self.mr_stub: @@ -346,8 +354,8 @@ def terminate(self): def terminate_restful_gateway(deployment_tag): - deployments, _, _, _ = _get_deployment_configs(deployment_tag) + deployments, _, _, _, _, mii_configs = _get_deployment_configs(deployment_tag) for deployment in deployments.values(): - mii_configs = getattr(deployment, mii.constants.MII_CONFIGS_KEY) - if mii_configs.enable_restful_api: + #mii_configs = getattr(deployment, mii.constants.MII_CONFIGS_KEY) + if deployment.enable_restful_api: requests.get(f"http://localhost:{mii_configs.restful_api_port}/terminate") diff --git a/mii/config.py b/mii/config.py index b8325562..695054c3 100644 --- a/mii/config.py +++ b/mii/config.py @@ -8,6 +8,7 @@ from pydantic import BaseModel, validator, root_validator, Field from deepspeed.launcher.runner import DLTS_HOSTFILE from mii.utils import get_task +from mii.constants import DEPLOYMENT_NAME_KEY, TASK_NAME_KEY, MODEL_NAME_KEY, ENABLE_DEEPSPEED_KEY, ENABLE_DEEPSPEED_ZERO_KEY, GPU_INDEX_KEY, DEEPSPEED_CONFIG_KEY, VERSION_KEY class DtypeEnum(Enum): @@ -57,7 +58,7 @@ class MIIConfig(BaseModel): replica_num: int = 1 hostfile: str = DLTS_HOSTFILE trust_remote_code: bool = False - + """ @validator("deploy_rank") def deploy_valid(cls, field_value, values): if "tensor_parallel" not in values: @@ -76,8 +77,9 @@ def deploy_valid(cls, field_value, values): # number of ranks provided must be equal to TP size, DP is handled outside MII currently assert values["tensor_parallel"] == len(field_value), \ f"{len(field_value)} rank(s) provided in 'deploy_rank' does not align with tensor_parallel size of {values['tensor_parallel']}" - return field_value - + return field_value + """ + """ @validator('checkpoint_dict') def checkpoint_dict_valid(cls, value): if value is None: @@ -90,7 +92,8 @@ def checkpoint_dict_valid(cls, value): if not value.get(k, ''): raise ValueError(f"Missing key={k} in checkpoint_dict") return value - + """ + """ @root_validator def meta_tensor_or_sys_mem(cls, values): if values.get("meta_tensor") and values.get("load_with_sys_mem"): @@ -98,7 +101,7 @@ def meta_tensor_or_sys_mem(cls, values): "`meta_tensor` and `load_with_sys_mem` cannot be active at the same time." 
) return values - + """ class Config: validate_all = True validate_assignment = True @@ -131,16 +134,72 @@ class Config: class DeploymentConfig(BaseModel): - deployment_name: str = Field(alias="DEPLOYMENT_NAME_KEY") - task: str = Field(alias="TASK_NAME_KEY") - model: str = Field(alias="MODEL_NAME_KEY") - ds_optimize: bool = Field(default=True, alias="ENABLE_DEEPSPEED_KEY") - ds_zero: bool = Field(default=False, alias="ENABLE_DEEPSPEED_ZERO_KEY") - GPU_index_map: dict = Field(default=None, alias="GPU_INDEX_KEY") - mii_configs: MIIConfig = Field(default=MIIConfig.parse_obj({}), - alias="MII_CONFIGS_KEY") - ds_config: dict = Field(default=None, alias="DEEPSPEED_CONFIG_KEY") - version: int = Field(default=1, alias="VERSION_KEY") + deployment_name: str = Field(alias=DEPLOYMENT_NAME_KEY) + task: str = Field(alias=TASK_NAME_KEY) + model: str = Field(alias=MODEL_NAME_KEY) + ds_optimize: bool = Field(default=True, alias=ENABLE_DEEPSPEED_KEY) + ds_zero: bool = Field(default=False, alias=ENABLE_DEEPSPEED_ZERO_KEY) + GPU_index_map: dict = Field(default=None, alias=GPU_INDEX_KEY) + #mii_configs: MIIConfig = Field(default={}, alias=MII_CONFIGS_KEY) + ds_config: dict = Field(default=None, alias=DEEPSPEED_CONFIG_KEY) + version: int = Field(default=1, alias=VERSION_KEY) + tensor_parallel: int = 1 + dtype: DtypeEnum = torch.float32 + meta_tensor: bool = False + load_with_sys_mem: bool = False + replace_with_kernel_inject: bool = True + profile_model_time: bool = False + skip_model_check: bool = False + max_tokens: int = 1024 + enable_restful_api: bool = False + replica_num: int = 1 + hostfile: str = DLTS_HOSTFILE + deploy_rank: Union[int, List[int]] = -1 + enable_cuda_graph: bool = False + checkpoint_dict: Union[dict, None] = None + hf_auth_token: str = None + trust_remote_code: bool = False + + @validator('checkpoint_dict') + def checkpoint_dict_valid(cls, value): + if value is None: + return value + if value.get('base_dir', ''): + raise ValueError( + "please unset 'base_dir' it will be set w.r.t. the deployment 'model_path'" + ) + for k in ['checkpoints', 'parallelization', 'version', 'type']: + if not value.get(k, ''): + raise ValueError(f"Missing key={k} in checkpoint_dict") + return value + + @validator("deploy_rank") + def deploy_valid(cls, field_value, values): + if "tensor_parallel" not in values: + raise ValueError( + "'tensor_parallel' must be defined in the pydantic model before 'deploy_rank'" + ) + + # if deploy rank is not given, default to align with TP value + if field_value == -1: + field_value = list(range(values["tensor_parallel"])) + + # ensure deploy rank type is always list for easier consumption later + if not isinstance(field_value, list): + field_value = [field_value] + + # number of ranks provided must be equal to TP size, DP is handled outside MII currently + assert values["tensor_parallel"] == len(field_value), \ + f"{len(field_value)} rank(s) provided in 'deploy_rank' does not align with tensor_parallel size of {values['tensor_parallel']}" + return field_value + + @root_validator + def meta_tensor_or_sys_mem(cls, values): + if values.get("meta_tensor") and values.get("load_with_sys_mem"): + raise ValueError( + "`meta_tensor` and `load_with_sys_mem` cannot be active at the same time." 
+        )
+        return values
 
     @validator("task")
     def convert_task_str(cls, field_value, values):
@@ -148,3 +207,8 @@ def convert_task_str(cls, field_value, values):
 
     class Config:
         allow_population_by_field_name = True
+        validate_all = True
+        validate_assignment = True
+        use_enum_values = True
+        extra = 'forbid'
+        json_encoders = {torch.dtype: lambda x: str(x)}
diff --git a/mii/constants.py b/mii/constants.py
index f2dced4e..3d674efe 100644
--- a/mii/constants.py
+++ b/mii/constants.py
@@ -104,6 +104,7 @@ class ModelProvider(enum.Enum):
 DEPLOYED_KEY = "deployed"
 VERSION_KEY = "version"
 MII_TERMINATE_DEP_KEY = "__MII_TERMINATE_CALL__"
+DEPLOYMENT_TYPE_KEY = "deployment_type"
 
 MII_CACHE_PATH = "MII_CACHE_PATH"
 MII_CACHE_PATH_DEFAULT = "/tmp/mii_cache"
diff --git a/mii/deployment.py b/mii/deployment.py
index 5744d182..1976ca4a 100644
--- a/mii/deployment.py
+++ b/mii/deployment.py
@@ -68,6 +68,9 @@ def deploy(task=None,
         If deployment_type is `LOCAL`, returns just the name of the deployment
         that can be used to create a query handle using `mii.mii_query_handle(deployment_name)`
     """
+    if not mii_config:
+        mii_config = mii.config.MIIConfig(**{})
+
     if model_path is None and deployment_type == DeploymentType.LOCAL:
         model_path = MII_MODEL_PATH_DEFAULT
     elif model_path is None and deployment_type == DeploymentType.AML:
@@ -98,15 +101,15 @@ def deploy(task=None,
 
     # parse and validate mii config
     for deployment in deployments:
-        mii_config = getattr(deployment, mii.constants.MII_CONFIGS_KEY)
+        #mii_config = getattr(deployment, mii.constants.MII_CONFIGS_KEY)
        if getattr(deployment, mii.constants.ENABLE_DEEPSPEED_ZERO_KEY):
             if getattr(deployment,
                        mii.constants.DEEPSPEED_CONFIG_KEY).get("fp16",
                                                                {}).get("enabled",
                                                                        False):
-                assert (mii_config.dtype == torch.half), "MII Config Error: MII dtype and ZeRO dtype must match"
+                assert (deployment.dtype == torch.half), "MII Config Error: MII dtype and ZeRO dtype must match"
             else:
-                assert (mii_config.dtype == torch.float), "MII Config Error: MII dtype and ZeRO dtype must match"
+                assert (deployment.dtype == torch.float), "MII Config Error: MII dtype and ZeRO dtype must match"
         assert not (enable_deepspeed and enable_zero), "MII Config Error: DeepSpeed and ZeRO cannot both be enabled, select only one"
 
     # aml only allows certain characters for deployment names
@@ -137,12 +140,11 @@ def deploy(task=None,
             )
 
     deps = {deployment.deployment_name: deployment for deployment in deployments}
-    # In local deployments use default path if no model path set
 
     # add fields for replica deployment
     port_map = {}
-    lb_config, port_map = allocate_processes(deps, port_map)
+    lb_config, port_map = allocate_processes(deps, port_map, mii_config)
 
     if deployment_type != DeploymentType.NON_PERSISTENT:
         create_score_file(deployment_tag=deployment_tag,
@@ -150,7 +152,8 @@ def deploy(task=None,
                           deployments=deps,
                           model_path=model_path,
                           port_map=port_map,
-                          lb_config=lb_config)
+                          lb_config=lb_config,
+                          mii_configs=mii_config)
 
     if deployment_type == DeploymentType.AML:
         _deploy_aml(deployment_tag=deployment_tag, model_name=model, version=version)
@@ -166,33 +169,35 @@ def deploy(task=None,
                                  enable_deepspeed,
                                  enable_zero,
                                  provider,
-                                 mii_config),
+                                 deployment),
             task)
     else:
         raise Exception(f"Unknown deployment type: {deployment_type}")
 
 
-def allocate_processes(deployments, port_map):
+def allocate_processes(deployments, port_map, mii_config):
     replica_configs = []
     port_offset = 1
     for deployment in deployments.values():
-        mii_config = getattr(deployment, mii.constants.MII_CONFIGS_KEY)
-        replica_pool = _allocate_processes(mii_config.hostfile,
-                                           mii_config.tensor_parallel,
-                                           mii_config.replica_num,
-                                           deployment.GPU_index_map)
+        #mii_config = getattr(deployment, mii.constants.MII_CONFIGS_KEY)
+        replica_pool = _allocate_processes(
+            deployment.hostfile,
+            deployment.tensor_parallel,
+            deployment.replica_num,
+            getattr(deployment,
+                    mii.constants.GPU_INDEX_KEY))
         for i, (hostname, gpu_indices) in enumerate(replica_pool):
             # Reserver port for a LB proxy when replication is enabled
             if hostname not in port_map:
                 port_map[hostname] = set()
-            base_port = mii_config.port_number + i * mii_config.tensor_parallel + port_offset
+            base_port = mii_config.port_number + i * deployment.tensor_parallel + port_offset
             if base_port in port_map[hostname]:
                 base_port = max(port_map[hostname]) + 1
             tensor_parallel_ports = list(
                 range(base_port,
-                      base_port + mii_config.tensor_parallel))
-            for i in range(base_port, base_port + mii_config.tensor_parallel):
+                      base_port + deployment.tensor_parallel))
+            for i in range(base_port, base_port + deployment.tensor_parallel):
                 port_map[hostname].add(i)
             torch_dist_port = mii_config.torch_dist_port + i
             replica_configs.append(
@@ -236,6 +241,7 @@ def validate_deployment(task=None,
                                  deployment_type=deployment_type,
                                  deployments=None,
                                  model_path=model_path,
+                                 mii_configs={},
                                  port_map=None,
                                  lb_config=None)
         return deployment_tag, None
diff --git a/mii/launch/multi_gpu_server.py b/mii/launch/multi_gpu_server.py
index 1f7fc00a..194cc4a9 100644
--- a/mii/launch/multi_gpu_server.py
+++ b/mii/launch/multi_gpu_server.py
@@ -6,8 +6,8 @@
 import argparse
 import mii
 
-from mii import MIIConfig, LoadBalancerConfig
-
+from mii import MIIConfig, LoadBalancerConfig, DeploymentConfig
+from mii.utils import get_task_name
 from mii.models.load_models import load_models
 from mii.grpc_related.modelresponse_server import serve_inference, serve_load_balancing
 from mii.grpc_related.restful_gateway import RestfulGatewayThread
@@ -45,6 +45,7 @@ def main():
                         "--restful-gateway",
                         action='store_true',
                         help="launch restful api gateway")
+    parser.add_argument("-f", "--deployment", type=str, help="base64 encoded deployment")
 
     args = parser.parse_args()
 
@@ -53,6 +54,9 @@ def main():
 
     # convert dict -> mii config
     mii_config = MIIConfig(**config_dict)
+    deployment_dict = decode_config_from_str(args.deployment)
+    deployment_dict['task'] = get_task_name(mii.constants.Tasks(deployment_dict['task']))
+    deployment = DeploymentConfig(**deployment_dict)
 
     if args.restful_gateway:
         print(f"Starting RESTful API gateway on port: {mii_config.restful_api_port}")
         gateway_thread = RestfulGatewayThread(args.deployment_name,
@@ -77,7 +81,7 @@ def main():
                                          ds_zero=args.ds_zero,
                                          ds_config_path=args.ds_config,
                                          provider=provider,
-                                         mii_config=mii_config)
+                                         mii_config=deployment)
 
         print(f"Starting server on port: {port}")
         serve_inference(inference_pipeline, port)
diff --git a/mii/models/score/generate.py b/mii/models/score/generate.py
index 2f2bf8b0..6d608fc8 100644
--- a/mii/models/score/generate.py
+++ b/mii/models/score/generate.py
@@ -15,9 +15,13 @@ def create_score_file(deployment_tag,
                       model_path,
                       port_map,
                       lb_config,
+                      mii_configs={},
                       deployed=False):
     config_dict = {}
+    config_dict[
+        mii.constants.MII_CONFIGS_KEY] = mii_configs.dict() if mii_configs else {}
+    config_dict[mii.constants.DEPLOYMENT_TYPE_KEY] = deployment_type.value
     config_dict[mii.constants.MODEL_PATH_KEY] = model_path
     config_dict[mii.constants.DEPLOYMENT_TAG_KEY] = deployment_tag
     config_dict[mii.constants.DEPLOYED_KEY] = deployed
@@ -40,9 +44,9 @@ def create_score_file(deployment_tag,
             mii.constants.ENABLE_DEEPSPEED_KEY:
             getattr(deployment,
                     mii.constants.ENABLE_DEEPSPEED_KEY),
-            mii.constants.MII_CONFIGS_KEY:
-            getattr(deployment,
-                    mii.constants.MII_CONFIGS_KEY).dict(),
+            #mii.constants.MII_CONFIGS_KEY:
+            #getattr(deployment,
+            #        mii.constants.MII_CONFIGS_KEY).dict(),
             mii.constants.ENABLE_DEEPSPEED_ZERO_KEY:
             getattr(deployment,
                     mii.constants.ENABLE_DEEPSPEED_ZERO_KEY),
@@ -51,7 +55,39 @@ def create_score_file(deployment_tag,
                     mii.constants.DEEPSPEED_CONFIG_KEY),
             mii.constants.GPU_INDEX_KEY:
             getattr(deployment,
-                    mii.constants.GPU_INDEX_KEY)
+                    mii.constants.GPU_INDEX_KEY),
+            'tensor_parallel':
+            deployment.tensor_parallel,
+            'dtype':
+            deployment.dtype,
+            'meta_tensor':
+            deployment.meta_tensor,
+            'load_with_sys_mem':
+            deployment.load_with_sys_mem,
+            'replace_with_kernel_inject':
+            deployment.replace_with_kernel_inject,
+            'profile_model_time':
+            deployment.profile_model_time,
+            'skip_model_check':
+            deployment.skip_model_check,
+            'max_tokens':
+            deployment.max_tokens,
+            'enable_restful_api':
+            deployment.enable_restful_api,
+            'replica_num':
+            deployment.replica_num,
+            'hostfile':
+            deployment.hostfile,
+            'deploy_rank':
+            deployment.deploy_rank,
+            'enable_cuda_graph':
+            deployment.enable_cuda_graph,
+            'checkpoint_dict':
+            deployment.checkpoint_dict,
+            'hf_auth_token':
+            deployment.hf_auth_token,
+            'trust_remote_code':
+            deployment.trust_remote_code
         }
         config_dict[mii.constants.DEPLOYMENTS_KEY][
             deployment.deployment_name] = deployment_config
diff --git a/mii/models/score/score_template.py b/mii/models/score/score_template.py
index 7c8208b8..df4d94d0 100644
--- a/mii/models/score/score_template.py
+++ b/mii/models/score/score_template.py
@@ -8,7 +8,7 @@ import json
 import torch
 import mii
 
-from mii.config import LoadBalancerConfig, ReplicaConfig
+from mii.config import LoadBalancerConfig, ReplicaConfig, MIIConfig
 import time
 
 model = None
@@ -16,18 +16,20 @@ def init():
     model_path = mii.utils.full_model_path(configs[mii.constants.MODEL_PATH_KEY])
 
+    mii_configs = configs[mii.constants.MII_CONFIGS_KEY]
     deployment_tag = configs[mii.constants.DEPLOYMENT_TAG_KEY]
     deployments = []
     lb_enabled = configs[mii.constants.DEPLOYED_KEY]
     for deployment in configs[mii.constants.DEPLOYMENTS_KEY].values():
         deployments.append(mii.DeploymentConfig(**deployment))
-
+    mii_configs = MIIConfig(**mii_configs)
     mii.MIIServer(deployment_tag,
                   deployments,
                   model_path,
                   lb_config=configs.get(mii.constants.LOAD_BALANCER_CONFIG_KEY,
                                         None),
-                  lb_enabled=lb_enabled)
+                  lb_enabled=lb_enabled,
+                  mii_configs=mii_configs)
 
     global model
     model = None
diff --git a/mii/server.py b/mii/server.py
index 30d69363..1aeac364 100644
--- a/mii/server.py
+++ b/mii/server.py
@@ -33,26 +33,26 @@ def __init__(self,
                  deployments,
                  model_path,
                  lb_config=None,
-                 lb_enabled=False):
+                 lb_enabled=False,
+                 mii_configs={}):
         if len(deployments) > 0:
             self.lb_enabled = lb_enabled
             self.deployments = deployments
             for deployment in deployments:
-                mii_configs = getattr(deployment, mii.constants.MII_CONFIGS_KEY)
-                assert get_num_gpus(mii_configs) > 0, f"GPU count for {deployment.deployment_name} must be greater than 0"
-                if mii_configs.hostfile is None:
+                #mii_configs = getattr(deployment, mii.constants.MII_CONFIGS_KEY)
+                assert get_num_gpus(deployment) > 0, f"GPU count for {deployment.deployment_name} must be greater than 0"
+                if deployment.hostfile is None:
                     hostfile = tempfile.NamedTemporaryFile(delete=False)
                     num_gpu = torch.cuda.device_count()
                     with open(hostfile, "w") as f:
                         f.write(f"localhost slots={num_gpu}")
-                    mii.configs.hostfile = hostfile
-
-        processes = self._initialize_service(
-            deployment_tag,
-            deployments,
-            model_path,
-            lb_config,
-        )
+                    deployment.hostfile = hostfile
+        deps = {dep.deployment_name: dep for dep in deployments}
+        processes = self._initialize_service(deployment_tag,
+                                              deps,
+                                              model_path,
+                                              lb_config,
+                                              mii_configs)
         self._wait_until_server_is_live(processes, lb_config.replica_configs)
 
     def _wait_until_server_is_live(self, processes, deployment):
@@ -100,10 +100,11 @@ def _build_server_args(self,
                            ds_zero,
                            ds_config,
                            mii_configs,
-                           port):
+                           port,
+                           deployment):
         # serialize mii config
         b64_config_str = config_to_b64_str(mii_configs)
-
+        b64_deployment = config_to_b64_str(deployment)
         task = ""
         for deployment in self.deployments:
             if deployment_name == getattr(deployment, mii.constants.DEPLOYMENT_NAME_KEY):
@@ -117,6 +118,7 @@ def _build_server_args(self,
 
         server_args_str += f" --provider {provider}"
         server_args_str += f" --config {b64_config_str}"
+        server_args_str += f" -f {b64_deployment}"
         server_args_str += " --ds-zero" if ds_zero else ""
         if ds_zero and ds_config is not None:
             if isinstance(ds_config, dict):
@@ -175,17 +177,21 @@ def _launch_restful_gateway(self,
                                 ds_zero,
                                 ds_config,
                                 mii_configs,
-                                port):
-        return self._launch_server_process(deployment_name,
-                                           model_name,
-                                           model_path,
-                                           ds_optimize,
-                                           ds_zero,
-                                           ds_config,
-                                           mii_configs,
-                                           port,
-                                           "restful api gateway",
-                                           ex_server_args=["--restful-gateway"])
+                                port,
+                                deployment):
+        return self._launch_server_process(
+            deployment_name,
+            model_name,
+            model_path,
+            ds_optimize,
+            ds_zero,
+            ds_config,
+            mii_configs,
+            port,
+            "restful api gateway",
+            deployment,
+            ex_server_args=["--restful-gateway"],
+        )
 
     def _launch_server_process(self,
                                deployment_name,
@@ -197,6 +203,7 @@ def _launch_server_process(self,
                                mii_configs,
                                port,
                                msg_server_type,
+                               deployment,
                                ds_launch_str=None,
                                ex_server_args=[]):
         launch_str = f"{sys.executable} -m mii.launch.multi_gpu_server"
@@ -207,7 +214,8 @@ def _launch_server_process(self,
                                                  ds_zero,
                                                  ds_config,
                                                  mii_configs,
-                                                 port)
+                                                 port,
+                                                 deployment)
         server_args_str += f" " + \
             " ".join(ex_server_args) if ex_server_args else ""
@@ -233,7 +241,8 @@ def _launch_deepspeed(self,
                           host,
                           port,
                           master_port,
-                          deploy_ranks):
+                          deploy_ranks,
+                          deployment):
         # use different hostfiles for replica instances
         # pass /dev/null when no replica is used
         worker_str = f"-H {hostfile} "
@@ -256,12 +265,17 @@ def _launch_deepspeed(self,
             mii_configs,
             port,
             "MII server",
+            deployment,
             ds_launch_str=ds_launch_str)
 
-    def _initialize_service(self, deployment_tag, deployments, model_path, lb_config):
+    def _initialize_service(self,
+                            deployment_tag,
+                            deployments,
+                            model_path,
+                            lb_config,
+                            mii_configs):
         processes = []
-
         host_gpus = defaultdict(list)
         for repl_config in lb_config.replica_configs:
             host_gpus[repl_config.hostname].extend(repl_config.gpu_indices)
@@ -269,10 +283,11 @@ def _initialize_service(self, deployment_tag, deployments, model_path, lb_config
         # Start replica instances
         for i, repl_config in enumerate(lb_config.replica_configs):
             name = repl_config.deployment_name
-            deployment = None
-            for dep in deployments:
+            deployment = None if name not in deployments else deployments[name]
+            """for dep in deployments:
                 if getattr(dep, mii.constants.DEPLOYMENT_NAME_KEY) == name:
                     deployment = dep
+            """
             if deployment is None:
                 continue
             hostfile = tempfile.NamedTemporaryFile(delete=False)
@@ -291,15 +306,13 @@ def _initialize_service(self, deployment_tag, deployments, model_path, lb_config
                             mii.constants.ENABLE_DEEPSPEED_ZERO_KEY),
                     getattr(deployment,
                             mii.constants.DEEPSPEED_CONFIG_KEY),
-                    getattr(deployment,
-                            mii.constants.MII_CONFIGS_KEY),
+                    mii_configs,
                     hostfile.name,
                     repl_config.hostname,
                     repl_config.tensor_parallel_ports[0],
-                    getattr(deployment,
-                            mii.constants.MII_CONFIGS_KEY).torch_dist_port + (100 * i) +
-                    repl_config.gpu_indices[0],
-                    repl_config.gpu_indices))
+                    mii_configs.torch_dist_port + (100 * i) + repl_config.gpu_indices[0],
+                    repl_config.gpu_indices,
+                    deployment))
 
         # start load balancer here.
         # we don't use deepspeed launcher for the load balancer because it does not need a GPU.
@@ -309,7 +322,7 @@ def _initialize_service(self, deployment_tag, deployments, model_path, lb_config
         processes.append(self._launch_load_balancer(model_path, lb_config))
 
         for deployment in self.deployments:
-            if getattr(deployment, mii.constants.MII_CONFIGS_KEY).enable_restful_api:
+            if deployment.enable_restful_api:
                 # start rest api server
                 processes.append(
                     self._launch_restful_gateway(
@@ -324,10 +337,9 @@ def _initialize_service(self, deployment_tag, deployments, model_path, lb_config
                             mii.constants.ENABLE_DEEPSPEED_ZERO_KEY),
                         getattr(deployment,
                                 mii.constants.DEEPSPEED_CONFIG_KEY),
-                        getattr(deployment,
-                                mii.constants.MII_CONFIGS_KEY),
-                        getattr(deployment,
-                                mii.constants.MII_CONFIGS_KEY).port_number))
+                        mii_configs,
+                        mii_configs.port_number),
+                    deployment)
                 break
 
         return processes

From b78068ecb987f6932b27ae7f48832e52aa58f5ec Mon Sep 17 00:00:00 2001
From: Tosin Segun
Date: Fri, 11 Aug 2023 17:33:18 +0000
Subject: [PATCH 69/69] Partial deploy/termination example

---
 examples/multi_model/add_delete_models.py | 32 +++++++++++++++++++++
 1 file changed, 32 insertions(+)
 create mode 100644 examples/multi_model/add_delete_models.py

diff --git a/examples/multi_model/add_delete_models.py b/examples/multi_model/add_delete_models.py
new file mode 100644
index 00000000..2a85b0f3
--- /dev/null
+++ b/examples/multi_model/add_delete_models.py
@@ -0,0 +1,32 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+import mii
+
+deployments = []
+results = []
+name = 'bigscience/bloom-560m'
+mii_configs1 = {"tensor_parallel": 1, "dtype": "fp16"}
+deployments.append(
+    mii.DeploymentConfig(task='text-generation',
+                         model=name,
+                         deployment_name=name + "_deployment5",
+                         mii_configs=mii.config.MIIConfig(**mii_configs1)
+                         ))
+
+generator = mii.mii_query_handle("multi_models")
+generator.add_models(deployments=deployments)
+
+result = generator.query(
+    {
+        "query": ["DeepSpeed is",
+                  "Seattle is"],
+        "deployment_name": "bigscience/bloom-560m_deployment5"
+    },
+    do_sample=True,
+    max_new_tokens=30,
+)
+print(result)
+generator.delete_model("bigscience/bloom-560m_deployment5")
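Editor's note (not part of the patch series): the add_delete_models.py example above attaches to a deployment group tagged "multi_models", so a multi-model server carrying that tag presumably has to be deployed before the script is run. The sketch below shows one way the same handle could be exercised further, re-adding the model under a different deployment name and then removing it again. It is a minimal sketch that relies only on the APIs used in the example itself (DeploymentConfig, mii_query_handle, add_models, query, delete_model); the name "bigscience/bloom-560m_deployment6" is a hypothetical label, not something defined by the patches.

    # Sketch only, assuming the "multi_models" server from the example above is still running.
    import mii

    # Hypothetical second deployment of the same model under a new name.
    extra = mii.DeploymentConfig(
        task="text-generation",
        model="bigscience/bloom-560m",
        deployment_name="bigscience/bloom-560m_deployment6",  # hypothetical name
        mii_configs=mii.config.MIIConfig(tensor_parallel=1, dtype="fp16"))

    # Attach to the running multi-model server and register the new deployment.
    generator = mii.mii_query_handle("multi_models")
    generator.add_models(deployments=[extra])

    # Route a query to the newly added deployment by name.
    result = generator.query(
        {
            "query": ["DeepSpeed is"],
            "deployment_name": "bigscience/bloom-560m_deployment6"
        },
        max_new_tokens=30)
    print(result)

    # Partial termination: remove only this deployment, leaving the server up.
    generator.delete_model("bigscience/bloom-560m_deployment6")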