Commit 07ef6e9

Fix multi-GPU deployment (#133)
* Fix _no_split_modules
* Support device_map auto in model_worker
* Fix distributed deployment
1 parent b254ccc

2 files changed (+14, -8)


internvl_chat/internvl/model/internvl_chat/modeling_internvl_chat.py

Lines changed: 3 additions & 3 deletions
@@ -26,7 +26,7 @@
 class InternVLChatModel(PreTrainedModel):
     config_class = InternVLChatConfig
     main_input_name = 'pixel_values'
-    _no_split_modules = ['InternVisionEncoderLayer', 'LlamaDecoderLayer', 'LlamaForCausalLM']
+    _no_split_modules = ['InternVisionEncoderLayer', 'LlamaDecoderLayer']

     def __init__(self, config: InternVLChatConfig, vision_model=None, language_model=None):
         super().__init__(config)
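Why this hunk fixes multi-GPU loading: `_no_split_modules` tells accelerate which module classes must stay whole on a single device when `device_map='auto'` computes a placement. Listing `LlamaForCausalLM` marked the entire language model as unsplittable, so it could never be sharded; keeping only the individual layer classes lets accelerate spread them across GPUs. A minimal sketch of the loading this enables, assuming the repo's `internvl` package is importable and the checkpoint path is a placeholder, not an official model name:

import torch
from internvl.model.internvl_chat import InternVLChatModel

model = InternVLChatModel.from_pretrained(
    'path/to/internvl-chat-checkpoint',  # placeholder path
    torch_dtype=torch.float16,
    device_map='auto',  # accelerate shards the now-splittable decoder layers
).eval()

print(model.hf_device_map)  # which device each submodule was assigned to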
@@ -226,7 +226,7 @@ def extract_feature(self, pixel_values):
         vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
         vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
         vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
-        vit_embeds = self.mlp1(vit_embeds).to(pixel_values.device)
+        vit_embeds = self.mlp1(vit_embeds)#.to(pixel_values.device)
         return vit_embeds

     def chat(self, tokenizer, pixel_values, question, generation_config, history=None, return_history=False,
@@ -354,7 +354,7 @@ def generate(
             input_ids = input_ids.reshape(B * N)
             selected = (input_ids == self.img_context_token_id)
             assert selected.sum() != 0
-            input_embeds[selected] = vit_embeds.reshape(-1, C)
+            input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device)

             input_embeds = input_embeds.reshape(B, N, C)
         else:
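The two `.to(...)` changes are halves of the same fix. Under `device_map='auto'`, the `mlp1` projector and the language model's embedding table can land on different GPUs, so casting the vision features to `pixel_values.device` inside `extract_feature` was premature; the move now happens at the point where they are scattered into `input_embeds`. A toy reproduction of the failure mode, assuming two visible GPUs and arbitrary shapes:

import torch

B_N, C = 8, 16
input_embeds = torch.zeros(B_N, C, device='cuda:0')  # text embeddings on GPU 0
vit_embeds = torch.randn(3, C, device='cuda:1')      # vision features on GPU 1
selected = torch.zeros(B_N, dtype=torch.bool, device='cuda:0')
selected[2:5] = True                                 # image-context positions

# Without the cast, this assignment can fail with a device-mismatch error,
# because the value tensor lives on a different GPU than the target.
input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device)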

internvl_chat/internvl/serve/model_worker.py

Lines changed: 11 additions & 5 deletions
@@ -61,14 +61,20 @@ def __init__(self, controller_addr, worker_addr,
         else:
             self.model_name = model_name

-        self.device = device
         logger.info(f'Loading the model {self.model_name} on worker {worker_id} ...')
         from transformers import AutoTokenizer, CLIPImageProcessor

         self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-        self.model = InternVLChatModel.from_pretrained(
-            model_path, load_in_8bit=load_8bit, torch_dtype=torch.float16).eval()
-        if not load_8bit:
+        if device == 'auto':
+            import os
+            os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+            # This can make distributed deployment work properly, wonder why
+            self.model = InternVLChatModel.from_pretrained(
+                model_path, load_in_8bit=load_8bit, torch_dtype=torch.float16, device_map='auto').eval()
+        else:
+            self.model = InternVLChatModel.from_pretrained(
+                model_path, load_in_8bit=load_8bit, torch_dtype=torch.float16).eval()
+        if not load_8bit and not device == 'auto':
             self.model = self.model.cuda()
         self.image_size = self.model.config.force_image_size
         self.image_processor = CLIPImageProcessor(
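Two details in this branch are worth noting. First, `CUDA_LAUNCH_BLOCKING=1` forces CUDA kernels to launch synchronously; it is normally a debugging setting, and the inline comment records that it empirically makes the sharded deployment work without a known root cause. Second, the `.cuda()` guard is extended because calling `.cuda()` on a model accelerate has already sharded would try to pull every module back onto a single GPU. A standalone sketch of the same loading logic, with names mirroring the worker's:

import os
import torch
from internvl.model.internvl_chat import InternVLChatModel

def load_model(model_path, device='cuda', load_8bit=False):
    if device == 'auto':
        os.environ['CUDA_LAUNCH_BLOCKING'] = '1'  # synchronous kernel launches
        # Sharded load: accelerate places layers across GPUs; no .cuda() after.
        model = InternVLChatModel.from_pretrained(
            model_path, load_in_8bit=load_8bit,
            torch_dtype=torch.float16, device_map='auto').eval()
    else:
        model = InternVLChatModel.from_pretrained(
            model_path, load_in_8bit=load_8bit,
            torch_dtype=torch.float16).eval()
        if not load_8bit:
            # load_in_8bit already puts weights on the GPU; plain fp16 needs it.
            model = model.cuda()
    return model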
@@ -184,7 +190,7 @@ def generate_stream(self, params):
         stop_str = params.get('stop', None)
         do_sample = True if temperature > 0.001 else False
         logger.info(f'num_image_tokens: {num_image_tokens}')
-        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, num_image_tokens, return_tensors='pt').unsqueeze(0).to(self.device)
+        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, num_image_tokens, return_tensors='pt').unsqueeze(0).cuda()
         input_ids[input_ids==IMAGE_TOKEN_INDEX] = model.img_context_token_id

         keywords = [stop_str]
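Since `self.device` was removed from the constructor, the worker now moves input ids with `.cuda()`, i.e. onto `cuda:0`. With `device_map='auto'`, accelerate typically assigns the input embedding layer to the first GPU and installs hooks that forward activations between devices, so placing inputs on GPU 0 works for both the single-GPU and the sharded path. One way to double-check, continuing the hypothetical `load_model` sketch above:

model = load_model('path/to/internvl-chat-checkpoint', device='auto')
print(model.hf_device_map)  # the embedding layer is expected on device 0,
                            # which is where .cuda() puts the inputs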
