Commit 5e2584c

feat: audio server and audio multimodal support

1 parent 28bf517 commit 5e2584c

File tree

14 files changed, +643 -22 lines changed

lightllm/models/internvl/model.py

Lines changed: 48 additions & 1 deletion

@@ -6,7 +6,7 @@
 from lightllm.models.qwen2.model import Qwen2TpPartModel
 from lightllm.models.deepseek2.model import Deepseek2TpPartModel
 from lightllm.models.qwen_vl.layer_infer.pre_layer_infer import LlamaMultimodalPreLayerInfer
-from lightllm.server.multimodal_params import MultimodalParams, ImageItem
+from lightllm.server.multimodal_params import AudioItem, MultimodalParams, ImageItem
 from lightllm.common.build_utils import repair_config
 from lightllm.models.internvl.layer_weights.pre_and_post_layer_weight import (
     InternVLLlamaPreAndPostLayerWeight,

@@ -26,6 +26,8 @@
 IMG_START_TOKEN = "<img>"
 IMG_END_TOKEN = "</img>"
 IMG_TOKEN = "<image>"
+AUDIO_START_TOKEN = "<audio>"
+AUDIO_END_TOKEN = "</audio>"


 # Warp of the origal tokenizer

@@ -40,6 +42,12 @@ def __init__(self, tokenizer, model_cfg, **kwargs):

         self.image_end_tag = IMG_END_TOKEN
         self.image_end_id = tokenizer.convert_tokens_to_ids(self.image_end_tag)
+
+        self.audio_start_tag = AUDIO_START_TOKEN
+        self.audio_start_id = tokenizer.convert_tokens_to_ids(self.audio_start_tag)
+
+        self.audio_end_tag = AUDIO_END_TOKEN
+        self.audio_end_id = tokenizer.convert_tokens_to_ids(self.audio_end_tag)
         self.get_image_patch_func = get_image_patch_func(kwargs["weight_dir"])

     def init_imageItem_extral_params(

@@ -69,6 +77,20 @@ def get_image_token_length(self, img: ImageItem):
             * self.image_length
         )

+    def get_audio_token_length(self, audio: AudioItem):
+        L = audio.audio_length
+        L = L if L <= 480000 else 480000  # max_length < 30s
+        mel_len = L // 160
+        dilation = 1
+        L_in = mel_len
+        for (padding, kernel_size, stride) in eval("[(1,3,1)] + [(1,3,2)] "):
+            L_out = L_in + 2 * padding - dilation * (kernel_size - 1) - 1
+            L_out = 1 + L_out // stride
+            L_in = L_out
+        audio_len_after_cnn = L_out
+        audio_token_num = (audio_len_after_cnn - 2) // 2 + 1
+        return audio_token_num
+
     # only change the impl of the encode func:
     def encode(self, prompt, multimodal_params: MultimodalParams = None, **kwargs):
         # TEXT<image>TEXT<image>TEXT --> TEXT<img></img>TEXT<img></img>TEXT
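The token-count arithmetic above is easy to check by hand. The helper below is not part of the commit (the function name is mine, and the inline list stands in for the eval()'d string); it reproduces the same computation for a full 30 s clip and for 1 s of audio at 16 kHz:

def audio_tokens(num_samples: int) -> int:
    num_samples = min(num_samples, 480000)        # cap at 30 s of 16 kHz audio
    l_in = num_samples // 160                     # hop size 160 -> mel frames
    for padding, kernel_size, stride in [(1, 3, 1), (1, 3, 2)]:  # Whisper's two front-end convs
        l_out = l_in + 2 * padding - (kernel_size - 1) - 1
        l_in = 1 + l_out // stride
    return (l_in - 2) // 2 + 1                    # same formula used for token_num downstream

print(audio_tokens(480000))  # 750 placeholder tokens for a 30 s clip
print(audio_tokens(16000))   # 25 tokens for 1 s of audio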
@@ -103,6 +125,31 @@ def encode(self, prompt, multimodal_params: MultimodalParams = None, **kwargs):
             except ValueError:
                 break
         input_ids.extend(origin_ids[start_idx:])
+
+        # audio
+        origin_ids = input_ids
+        input_ids = []
+        audio_id = 0
+        start_idx = 0
+        while True:
+            try:
+                start_idx = origin_ids.index(self.audio_start_id, start_idx)
+                if start_idx + 1 >= len(origin_ids):
+                    break
+                if origin_ids[start_idx + 1] == self.audio_end_id:
+                    input_ids.extend(origin_ids[: start_idx + 1])
+                    token_id = multimodal_params.audios[audio_id].token_id
+                    token_num = multimodal_params.audios[audio_id].token_num
+                    input_ids.extend(range(token_id, token_id + token_num))
+                    input_ids.append(self.audio_end_id)
+                    origin_ids = origin_ids[start_idx + 2 :]
+                    start_idx = 0
+                    audio_id += 1
+                else:
+                    raise ValueError("audio token error")
+            except ValueError:
+                break
+        input_ids.extend(origin_ids[start_idx:])
         return input_ids

     def __getattr__(self, name):
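For a single <audio></audio> pair, the rewrite in encode() boils down to the following; the ids here are made up purely for illustration (in the real flow token_id and token_num come from the embed-cache server via MultimodalParams):

def expand_audio_span(ids, audio_start_id, audio_end_id, token_id, token_num):
    i = ids.index(audio_start_id)
    assert ids[i + 1] == audio_end_id, "audio token error"
    return ids[: i + 1] + list(range(token_id, token_id + token_num)) + [audio_end_id] + ids[i + 2 :]

# prompt ids [1, 2, 700, 701, 3] where 700=<audio>, 701=</audio>, token_id=9000, token_num=3
print(expand_audio_span([1, 2, 700, 701, 3], 700, 701, 9000, 3))
# -> [1, 2, 700, 9000, 9001, 9002, 701, 3]

The placeholder range is what the language model actually sees; its embeddings are overwritten with the projected Whisper features in the multimodal pre-layer infer below.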

lightllm/models/qwen_vl/layer_infer/pre_layer_infer.py

Lines changed: 1 addition & 1 deletion

@@ -43,7 +43,7 @@ def context_forward(self, input_ids, infer_state: LlamaInferStateInfo, layer_wei
         dtype = layer_weight.wte_weight_.dtype
         hidden_size = layer_weight.wte_weight_.shape[1]
         for batch_id, p in enumerate(infer_state.multimodal_params):
-            for img in p["images"]:
+            for img in p["images"] + p["audios"]:
                 # skip the same image
                 if img["token_id"] in img_start_token_ids:
                     continue
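The loop treats audio items exactly like image items, so an audio entry only has to honour the contract images already use. Roughly (field names are taken from the diff, the values are illustrative):

p = {
    "images": [{"token_id": 8000, "token_num": 256}],   # one image placeholder block
    "audios": [{"token_id": 9000, "token_num": 750}],   # one audio placeholder block
}

Because both kinds of items expose token_id and token_num, the existing embedding scatter written for images picks up audio embeddings without further changes.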

lightllm/models/whisper/__init__.py

Whitespace-only changes.

lightllm/models/whisper/defaults.py

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+MIN_AUDIO_LEN = 480  # minimum audio length
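(At the 16 kHz sample rate used throughout this commit, 480 samples is 30 ms of audio; clips shorter than this are zero-padded up to MIN_AUDIO_LEN before feature extraction, see the encode() path below.)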
Lines changed: 206 additions & 0 deletions
@@ -0,0 +1,206 @@
+import torch
+import librosa
+from io import BytesIO
+from typing import List, Union
+import numpy as np
+from torch import nn
+from safetensors.torch import load_file
+
+import json
+import torch.nn.functional as F
+import math
+import os
+import rpyc
+from transformers.processing_utils import ProcessorMixin
+from lightllm.server.embed_cache.utils import tensor2bytes, read_shm, create_shm, get_shm_name_data, get_shm_name_embed
+
+
+class WhisperProcessor(ProcessorMixin):
+    r"""
+    Constructs a Whisper processor which wraps a Whisper feature extractor and a Whisper tokenizer into a single
+    processor.
+
+    [`WhisperProcessor`] offers all the functionalities of [`WhisperFeatureExtractor`] and [`WhisperTokenizer`]. See
+    the [`~WhisperProcessor.__call__`] and [`~WhisperProcessor.decode`] for more information.
+
+    Args:
+        feature_extractor (`WhisperFeatureExtractor`):
+            An instance of [`WhisperFeatureExtractor`]. The feature extractor is a required input.
+        tokenizer (`WhisperTokenizer`):
+            An instance of [`WhisperTokenizer`]. The tokenizer is a required input.
+    """
+    attributes = ["feature_extractor"]
+    feature_extractor_class = "WhisperFeatureExtractor"
+    # tokenizer_class = "WhisperTokenizer"
+
+    def __init__(self, feature_extractor):
+        super().__init__(feature_extractor)
+        self.current_processor = self.feature_extractor
+        self._in_target_context_manager = False
+
+    def get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True):
+        return self.tokenizer.get_decoder_prompt_ids(task=task, language=language, no_timestamps=no_timestamps)
+
+    def get_T_after_cnn(self, L_in, dilation=1):
+        for (padding, kernel_size, stride) in eval("[(1,3,1)] + [(1,3,2)] "):
+            L_out = L_in + 2 * padding - dilation * (kernel_size - 1) - 1
+            L_out = 1 + L_out // stride
+            L_in = L_out
+        return L_out
+
+    def __call__(self, audios, audio_lens, *args, **kwargs):
+        """
+        Forwards the `audios` argument to WhisperFeatureExtractor's [`~WhisperFeatureExtractor.__call__`] and the `text`
+        argument to [`~WhisperTokenizer.__call__`]. Please refer to the docstring of the above two methods for more
+        information.
+        """
+        # For backward compatibility
+        if self._in_target_context_manager:
+            return self.current_processor(*args, **kwargs)
+
+        sampling_rate = kwargs.pop("sampling_rate", 16000)
+
+        audio_lens = np.where(audio_lens <= 480000, audio_lens, 480000)
+        audio_lens = audio_lens // 160
+        audio_lens_after_cnn = self.get_T_after_cnn(audio_lens)
+        padded_inputs = self.feature_extractor(audios, *args, sampling_rate=sampling_rate, **kwargs)
+
+        return padded_inputs["input_features"], audio_lens_after_cnn
+
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to WhisperTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to WhisperTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
+        the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+
+    def get_prompt_ids(self, text: str, return_tensors="np"):
+        return self.tokenizer.get_prompt_ids(text, return_tensors=return_tensors)
+
+
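Only the feature-extractor path of this trimmed-down processor is actually exercised: __call__ never touches a tokenizer, and get_decoder_prompt_ids / decode / batch_decode reference a self.tokenizer that this class never sets, so they would fail if called. A minimal usage sketch, assuming the checkpoint directory ships a standard Whisper preprocessor_config.json (the path and shapes are illustrative, not from the commit):

import numpy as np

processor = WhisperProcessor.from_pretrained("/path/to/checkpoint")   # loads only the WhisperFeatureExtractor
wave = np.zeros(16000 * 5, dtype=np.float32)                          # 5 s of silence at 16 kHz
lens = np.array([wave.shape[0]], dtype=np.int32)
features, lens_after_cnn = processor([wave], lens, sampling_rate=16000, return_tensors="pt")
# features: log-mel spectrogram padded to 30 s; lens_after_cnn: valid frame count after the two convs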
+class WhisperAudioModel:
+    def __init__(self, kvargs):
+        self.max_seconds = 30
+        self.sampling_rate = 16000
+        self.max_length = self.max_seconds * self.sampling_rate
+        self.cache_port = kvargs["client_port"]
+        self.cache_client = rpyc.connect("localhost", self.cache_port)
+        data_type = kvargs["data_type"]
+        if data_type in ["bf16", "bfloat16"]:
+            self.data_type = torch.bfloat16
+        else:
+            self.data_type = torch.float16
+
+    def cuda(self):
+        self.audio = self.audio.cuda()
+        for k, v in self.projector_weights.items():
+            self.projector_weights[k] = v.cuda()
+        self.device = torch.device("cuda")
+        return self
+
+    def load_model(self, weight_dir, config):
+        self.audio_processor = WhisperProcessor.from_pretrained(weight_dir)
+        from transformers.models.whisper.modeling_whisper import WhisperEncoder, WhisperConfig
+
+        self.audio = WhisperEncoder(WhisperConfig(**config["audio_config"])).to(self.data_type)
+        self.device = torch.device("cpu")
+        self.projector_weights = {}
+        self.load_weight(weight_dir)
117+
def load_weight(self, weight_dir):
118+
weight_path = os.path.join(weight_dir, "model.safetensors.index.json")
119+
weight_map = json.load(open(weight_path, "r"))["weight_map"]
120+
params_map = {}
121+
audio_weight = {}
122+
for k, v in weight_map.items():
123+
if "mlp2" not in k and "audio_model" not in k:
124+
continue
125+
filename = weight_map[k]
126+
if filename not in params_map:
127+
tensor_data = load_file(os.path.join(weight_dir, filename))
128+
params_map[filename] = tensor_data
129+
if "mlp2" in k:
130+
self.projector_weights[k] = params_map[filename][k].to(self.data_type)
131+
if "audio_model" in k:
132+
audio_weight[k[len("audio_model.encoder.") :]] = params_map[filename][k].to(self.data_type)
133+
134+
self.audio.load_state_dict(audio_weight)
135+
136+
assert "mlp2.0.bias" in self.projector_weights
137+
assert "mlp2.0.weight" in self.projector_weights
138+
assert "mlp2.1.bias" in self.projector_weights
139+
assert "mlp2.1.weight" in self.projector_weights
140+
assert "mlp2.3.bias" in self.projector_weights
141+
assert "mlp2.3.weight" in self.projector_weights
+
+    def forward(self, audio_values, audio_lens_after_cnn):
+        audio_values = audio_values.to(self.data_type).to(device=self.device)
+        audio_values = audio_values.squeeze(1)
+        audio_lens_after_cnn = torch.tensor(audio_lens_after_cnn).cuda()
+        max_len_in_batch = torch.max(audio_lens_after_cnn).item()
+
+        padding_mask = torch.ones([audio_values.size(0), max_len_in_batch]).to(
+            dtype=audio_values.dtype, device=audio_values.device
+        )
+        for index in range(len(audio_values)):
+            padding_mask[index, : audio_lens_after_cnn[index].item()] = 0
+        last_hidden_state = self.audio(audio_values, padding_mask).last_hidden_state
+        x = F.layer_norm(
+            last_hidden_state,
+            normalized_shape=(last_hidden_state.shape[-1],),
+            weight=self.projector_weights["mlp2.0.weight"],
+            bias=self.projector_weights["mlp2.0.bias"],
+        )
+        x = F.linear(x, weight=self.projector_weights["mlp2.1.weight"], bias=self.projector_weights["mlp2.1.bias"])
+        x = F.gelu(x)
+        x = F.linear(x, weight=self.projector_weights["mlp2.3.weight"], bias=self.projector_weights["mlp2.3.bias"])
+        return x
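The mlp2.* weights applied functionally in forward() amount to a small projector head; as a module it would look roughly like the sketch below (the layer sizes are placeholders, not read from the checkpoint):

import torch.nn as nn

encoder_hidden, llm_hidden = 1280, 4096    # illustrative sizes only
mlp2 = nn.Sequential(
    nn.LayerNorm(encoder_hidden),          # mlp2.0.{weight,bias}
    nn.Linear(encoder_hidden, llm_hidden), # mlp2.1.{weight,bias}
    nn.GELU(),                             # index 2 carries no parameters
    nn.Linear(llm_hidden, llm_hidden),     # mlp2.3.{weight,bias}
)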
+
+    def encode(self, audio_items: List[Union[str, BytesIO]]):
+        batch_audios = []
+        batch_audio_lens = np.zeros(len(audio_items), dtype=np.int32)
+        uuids = []
+        for i, item in enumerate(audio_items):
+            if isinstance(item, int):
+                uuids.append(item)
+                audio_data = read_shm(get_shm_name_data(item))
+                audio = BytesIO(audio_data)
+                audio, _ = librosa.load(audio, sr=16000)
+            elif isinstance(item, BytesIO):
+                audio, _ = librosa.load(item, sr=16000)
+            elif item.startswith("http://") or item.startswith("https://"):
+                import requests
+
+                audio = BytesIO(requests.get(item, stream=True).raw.read())
+                audio, _ = librosa.load(audio, sr=16000)
+            else:
+                raise ValueError(f"cannot read audio which type is {type(item)}!")
+
+            # padding to min audio len
+            from .defaults import MIN_AUDIO_LEN
+
+            if audio.shape[0] < MIN_AUDIO_LEN:
+                audio = np.pad(audio, (0, MIN_AUDIO_LEN - len(audio)), mode="constant", constant_values=0.0)
+
+            batch_audio_lens[i] = min(audio.shape[0], self.max_length)
+            batch_audios.append(audio)
+
+        audios, audio_lens_after_cnn = self.audio_processor(
+            batch_audios, batch_audio_lens, sampling_rate=16000, return_tensors="pt"
+        )
+        audios = self.forward(audios, audio_lens_after_cnn)
+        audio_lens_after_cnn = np.array(audio_lens_after_cnn, dtype=np.int32)
+        audio_token_num = (audio_lens_after_cnn - 2) // 2 + 1
+
+        for i in range(len(uuids)):
+            if not self.cache_client.root.get_item_embed(uuids[i]):
+                cur_embed_bytes = tensor2bytes(audios[i][: audio_token_num[i]])
+                create_shm(get_shm_name_embed(uuids[i]), cur_embed_bytes)
+                self.cache_client.root.set_item_embed(uuids[i])
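End to end, encode() is what the audio server calls per request: int items are shared-memory uuids handed out by the embed cache (the type hint above omits int), str items are URLs, and BytesIO items are raw audio bytes; only uuid items get their projected embeddings written back to shared memory. A rough driver sketch, with ports, paths and the config layout all assumed for illustration:

import json
from io import BytesIO

model = WhisperAudioModel({"client_port": 10002, "data_type": "bf16"})
with open("/path/to/checkpoint/config.json") as f:
    model.load_model("/path/to/checkpoint", json.load(f))
model.cuda()

with open("sample.wav", "rb") as f:
    model.encode([BytesIO(f.read())])   # encoded, but nothing is cached since a BytesIO item has no uuid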

lightllm/server/api_cli.py

Lines changed: 6 additions & 1 deletion
@@ -205,7 +205,12 @@ def make_argument_parser() -> argparse.ArgumentParser:
         use env FIRST_ALLOWED_TOKENS to set the range, like FIRST_ALLOWED_TOKENS=1,2 ..""",
     )
     parser.add_argument(
-        "--enable_multimodal", action="store_true", help="Whether or not to allow to load additional multimodal models."
+        "--enable_multimodal", action="store_true", help="Whether or not to allow to load additional visual models."
+    )
+    parser.add_argument(
+        "--enable_multimodal_audio",
+        action="store_true",
+        help="Whether or not to allow to load additional audio models (required --enable_multimodal).",
     )
     parser.add_argument(
         "--enable_mps", action="store_true", help="Whether to enable nvidia mps for multimodal service."

lightllm/server/api_start.py

Lines changed: 40 additions & 11 deletions
@@ -4,6 +4,7 @@
 import uuid
 import subprocess
 import signal
+from lightllm.server.audioserver.manager import start_audio_process
 from lightllm.utils.net_utils import alloc_can_use_network_port, PortLocker
 from lightllm.utils.start_utils import process_manager
 from .metrics.manager import start_metric_manager
@@ -173,11 +174,19 @@ def normal_or_p_d_start(args):

     node_world_size = args.tp // args.nnodes
     can_use_ports = alloc_can_use_network_port(
-        num=6 + node_world_size + args.visual_dp * args.visual_tp, used_nccl_ports=already_uesd_ports
+        num=7 + node_world_size + args.visual_dp * args.visual_tp, used_nccl_ports=already_uesd_ports
     )
     logger.info(f"alloced ports: {can_use_ports}")
-    router_port, detokenization_port, detokenization_pub_port, visual_port, cache_port, metric_port = can_use_ports[0:6]
-    can_use_ports = can_use_ports[6:]
+    (
+        router_port,
+        detokenization_port,
+        detokenization_pub_port,
+        visual_port,
+        audio_port,
+        cache_port,
+        metric_port,
+    ) = can_use_ports[0:7]
+    can_use_ports = can_use_ports[7:]

     visual_model_tp_ports = []
     for _ in range(args.visual_dp):
@@ -190,6 +199,7 @@ def normal_or_p_d_start(args):
     args.detokenization_port = detokenization_port
     args.detokenization_pub_port = detokenization_pub_port
     args.visual_port = visual_port
+    args.audio_port = audio_port
     args.cache_port = cache_port
     args.metric_port = metric_port

@@ -218,14 +228,33 @@
             ],
             start_args=[(cache_port, args)],
         )
-        process_manager.start_submodule_processes(
-            start_funcs=[
-                start_visual_process,
-            ],
-            start_args=[
-                (args, router_port, visual_port, cache_port, visual_model_tp_ports),
-            ],
-        )
+        if args.enable_multimodal_audio:
+            process_manager.start_submodule_processes(
+                start_funcs=[
+                    start_visual_process,
+                ],
+                start_args=[
+                    (args, audio_port, visual_port, cache_port, visual_model_tp_ports),
+                ],
+            )
+            process_manager.start_submodule_processes(
+                start_funcs=[
+                    start_audio_process,
+                ],
+                start_args=[
+                    (args, router_port, audio_port, cache_port),
+                ],
+            )
+
+        else:
+            process_manager.start_submodule_processes(
+                start_funcs=[
+                    start_visual_process,
+                ],
+                start_args=[
+                    (args, router_port, visual_port, cache_port, visual_model_tp_ports),
+                ],
+            )

         process_manager.start_submodule_processes(
             start_funcs=[
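The net effect on the process topology: with --enable_multimodal_audio set, the visual workers are started with audio_port as their downstream target instead of router_port, and the new audio server (start_audio_process) listens on audio_port and forwards to router_port, so embedding work flows visual -> audio -> router; without the flag, the original visual -> router wiring is preserved verbatim.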

lightllm/server/audioserver/__init__.py

Whitespace-only changes.
