@@ -61,14 +61,20 @@ def __init__(self, controller_addr, worker_addr,
         else:
             self.model_name = model_name
 
-        self.device = device
         logger.info(f'Loading the model {self.model_name} on worker {worker_id} ...')
         from transformers import AutoTokenizer, CLIPImageProcessor
 
         self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-        self.model = InternVLChatModel.from_pretrained(
-            model_path, load_in_8bit=load_8bit, torch_dtype=torch.float16).eval()
-        if not load_8bit:
+        if device == 'auto':
+            import os
+            os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+            # Setting this makes distributed deployment work properly; the reason is unclear.
+            self.model = InternVLChatModel.from_pretrained(
+                model_path, load_in_8bit=load_8bit, torch_dtype=torch.float16, device_map='auto').eval()
+        else:
+            self.model = InternVLChatModel.from_pretrained(
+                model_path, load_in_8bit=load_8bit, torch_dtype=torch.float16).eval()
+        if not load_8bit and device != 'auto':
            self.model = self.model.cuda()
         self.image_size = self.model.config.force_image_size
         self.image_processor = CLIPImageProcessor(
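Background on the hunk above (not part of the diff itself): device_map='auto' delegates placement to Hugging Face Accelerate, which shards the model's layers across all visible GPUs according to free memory, so the explicit .cuda() call is skipped in that branch. CUDA_LAUNCH_BLOCKING=1 makes every CUDA kernel launch synchronous; it is normally a debugging setting and trades some throughput for clearer error reporting. A minimal sketch of the same loading pattern follows, with an illustrative checkpoint name (an assumption, not taken from this commit):

import torch
from transformers import AutoModel

# device_map='auto' lets Accelerate place each layer on a GPU (or the CPU)
# based on available memory; the returned model is already on-device.
model = AutoModel.from_pretrained(
    'OpenGVLab/InternVL-Chat-V1-5',  # illustrative checkpoint, an assumption
    torch_dtype=torch.float16,
    device_map='auto',
    trust_remote_code=True,
).eval()

# Accelerate records the placement, e.g. {'vision_model': 0, 'language_model': 1, ...}
print(model.hf_device_map)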
@@ -184,7 +190,7 @@ def generate_stream(self, params):
         stop_str = params.get('stop', None)
         do_sample = True if temperature > 0.001 else False
         logger.info(f'num_image_tokens: {num_image_tokens}')
-        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, num_image_tokens, return_tensors='pt').unsqueeze(0).to(self.device)
+        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, num_image_tokens, return_tensors='pt').unsqueeze(0).cuda()
         input_ids[input_ids == IMAGE_TOKEN_INDEX] = model.img_context_token_id
 
         keywords = [stop_str]
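Since self.device no longer exists, inputs are now moved with .cuda(), i.e. onto the default GPU (cuda:0). Under device_map='auto' the first shard (including the embedding layer) typically lands there, and Accelerate's hooks move activations between shards automatically during the forward pass. A hedged sketch of a more explicit alternative, relying on the hf_device_map attribute that Accelerate populates (this helper logic is an assumption, not part of the commit):

# Send the inputs to whichever device holds the first model shard.
first_device = next(iter(model.hf_device_map.values()))  # e.g. 0 or 'cuda:0'
if isinstance(first_device, int):
    first_device = f'cuda:{first_device}'
input_ids = input_ids.to(first_device)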