
Commit 9ed5a23

fix max-token conflict w/ DS (#49)
1 parent 508f33b commit 9ed5a23

File tree

3 files changed: +17 −4

llmserve/backend/llm/predictor.py

Lines changed: 2 additions & 1 deletion
@@ -19,6 +19,7 @@
     init_torch_dist_process_group_async,
     initialize_node,
     timeit,
+    get_max_token_size,
 )
 from llmserve.backend.logger import get_logger
 from llmserve.backend.server.models import Args, LLMConfig, Prompt, Response
@@ -94,7 +95,7 @@ def init_model(
 
         if llm_config.warmup and warmup_inputs:
             prowarmup_inputs_max = Prompt(prompt=warmup_inputs * (
-                int(llm_config.max_input_words / (len(warmup_inputs.split()) + 1)) + 1
+                int(get_max_token_size(llm_config) / (len(warmup_inputs.split()) + 1))
             ), use_prompt_format=False)
 
             logger.info(
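
The change above swaps the warmup budget from max_input_words to whatever get_max_token_size returns. Below is a minimal sketch of the repetition math; the warmup string and the 512-token budget are stand-ins for illustration, not values from the repository.

# A sketch of the warmup prompt sizing, assuming a hypothetical warmup string
# and token budget; get_max_token_size(llm_config) would supply the budget.
warmup_inputs = "Write a short story about a robot."   # hypothetical warmup text
max_token_size = 512                                   # stand-in for get_max_token_size(llm_config)

# Repeat the warmup text until the prompt roughly fills the token budget,
# using whitespace word count (+1) as a cheap proxy for token count.
repeats = int(max_token_size / (len(warmup_inputs.split()) + 1))
warmup_prompt = warmup_inputs * repeats
print(repeats, len(warmup_prompt.split()))   # e.g. 64 repeats of a 7-word string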

llmserve/backend/llm/utils.py

Lines changed: 13 additions & 1 deletion
@@ -19,7 +19,7 @@
 from torch.hub import _get_torch_home
 
 from llmserve.backend.logger import get_logger
-from llmserve.backend.server.models import S3MirrorConfig
+from llmserve.backend.server.models import S3MirrorConfig, LLMConfig
 
 logger = get_logger(__name__)
 
@@ -279,6 +279,14 @@ async def init_torch_dist_process_group_async(
         node_id = node_and_gpu_ids[rank][0]
         local_rank = node_to_workers[node_id].index(rank)
         local_world_size = len(node_to_workers[node_id])
+        logger.info("++++++++++++++")
+        logger.info(rank)
+        logger.info(world_size)
+        logger.info(local_rank)
+        logger.info(local_world_size)
+        logger.info(master_addr)
+        logger.info(master_port)
+        logger.info(list(node_to_gpu_ids[node_id]))
         setup_futures.append(
             worker.execute.remote(
                 _init_torch_distributed,
@@ -301,3 +309,7 @@ async def init_torch_dist_process_group_async(
     await asyncio.gather(*setup_futures)
 
     return local_ranks
+
+# Get the max input token size for warmup. With DeepSpeed there is a "max_tokens" field at "initializer/max_tokens" that conflicts with "max_input_words"; prefer "max_tokens" if both exist.
+def get_max_token_size(llm_config: LLMConfig):
+    return llm_config.initialization.initializer.max_tokens if hasattr(llm_config.initialization.initializer, "max_tokens") else llm_config.max_input_words
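
For readers following the helper added at the end of utils.py, the sketch below shows its preference order with mock config objects; pick_max_token_size and the SimpleNamespace configs are illustrative stand-ins, not the repository's LLMConfig model.

# Illustrative stand-in for the selection logic in get_max_token_size; the
# configs here are mock namespaces, not the real LLMConfig.
from types import SimpleNamespace

def pick_max_token_size(llm_config):
    initializer = llm_config.initialization.initializer
    # Prefer the DeepSpeed-style "max_tokens" when the initializer defines it,
    # otherwise fall back to the top-level "max_input_words".
    if hasattr(initializer, "max_tokens"):
        return initializer.max_tokens
    return llm_config.max_input_words

# DeepSpeed-style config: the initializer carries max_tokens, which wins.
ds_cfg = SimpleNamespace(
    max_input_words=800,
    initialization=SimpleNamespace(initializer=SimpleNamespace(max_tokens=1024)),
)
# Plain config: no max_tokens on the initializer, so max_input_words is used.
plain_cfg = SimpleNamespace(
    max_input_words=800,
    initialization=SimpleNamespace(initializer=SimpleNamespace()),
)

print(pick_max_token_size(ds_cfg))     # 1024
print(pick_max_token_size(plain_cfg))  # 800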

models/text-generation--bigscience--bloom-3b.yaml

Lines changed: 2 additions & 2 deletions
@@ -32,8 +32,8 @@ model_config:
   trust_remote_code: true
   pipeline: default
   generation:
-    max_batch_size: 2
-    batch_wait_timeout_s: 30
+    max_batch_size: 10
+    batch_wait_timeout_s: 0
     generate_kwargs:
       do_sample: false
       max_new_tokens: 512
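
The YAML change raises the batch size and removes the batch wait for the bloom-3b config. Assuming these keys map onto Ray Serve style dynamic batching, the sketch below shows roughly how such values would be applied; the deployment class and handler are illustrative, not the project's pipeline code.

# A rough sketch, assuming max_batch_size / batch_wait_timeout_s feed Ray Serve
# style dynamic batching; this is not the repository's pipeline code.
from ray import serve

@serve.deployment
class TextGenerator:
    @serve.batch(max_batch_size=10, batch_wait_timeout_s=0)
    async def handle_batch(self, prompts):
        # With batch_wait_timeout_s=0 the batcher does not hold requests to
        # fill a batch; it takes whatever is already queued, up to 10 at once.
        return [f"generated text for: {p}" for p in prompts]

    async def __call__(self, request):
        return await self.handle_batch(request.query_params["prompt"])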
