Skip to content

Commit 1fb7fd0

Browse files
author
rtp-llm
committed
feat - support sparse & RoBERTa embedding, support calculating similarity
1 parent 65278e7 commit 1fb7fd0

19 files changed

+461
-177
lines changed

maga_transformer/async_decoder_engine/embedding/embedding_decoder_engine.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@ async def _generate_loop(self, streams: List[EmbeddingStream]) -> List[Embedding
4040
if all(finished):
4141
break
4242
await asyncio.sleep(0.001)
43-
4443
return [stream.output for stream in streams]
4544

4645
@torch.inference_mode()
@@ -57,7 +56,9 @@ def step(self):
5756
self.batch_input_.tp_sync()
5857
embedding_outputs = self.executor_.process(self.batch_input_)
5958
if g_parallel_info.tp_rank == 0:
60-
for idx, stream in enumerate(streams):
59+
# do synchronize before update result
60+
torch.cuda.synchronize()
61+
for idx, stream in enumerate(streams):
6162
stream.update(embedding_outputs[idx])
6263
self.report_metric(len(streams), t.cost_ms())
6364

maga_transformer/async_decoder_engine/embedding/embedding_model_executor.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from maga_transformer.config.gpt_init_model_parameters import GptInitModelParameters
77
from maga_transformer.ops.gpt_ops.gpt_op import GptOp
88
from maga_transformer.async_decoder_engine.embedding.embedding_stream import EmbeddingBatchedInput, EmbeddingOutput
9-
from maga_transformer.async_decoder_engine.embedding.post_process.post_process_factory import PostProcessFactory
9+
from maga_transformer.async_decoder_engine.embedding.post_process.post_process_module import PostProcessModule
1010

1111
class EmbeddingModelExecutor(object):
1212
def __init__(self, model: BaseModel, config: GptInitModelParameters):
@@ -15,10 +15,10 @@ def __init__(self, model: BaseModel, config: GptInitModelParameters):
1515
self.gpt_op_ = GptOp(self.config_, False)
1616
self.gpt_op_.set_weight(self.model_.weight)
1717

18-
self.post_process_module_ = PostProcessFactory.create_post_process_module(self.config_, self.model_.dtype)
18+
self.post_process_module_ = PostProcessModule(self.config_, self.model_.dtype, self.model_.tokenizer)
1919

20-
def _pre_process(self, batch_input: EmbeddingBatchedInput):
21-
combo_tokens_tensor = to_cuda(torch.IntTensor(batch_input.combo_tokens))
20+
def _pre_process(self, batch_input: EmbeddingBatchedInput):
21+
combo_tokens_tensor = to_cuda(torch.IntTensor(batch_input.combo_tokens))
2222
position_ids_tensor = to_cuda(self.model_.create_context_position_ids(batch_input.context_lengths_list))
2323
input_embeds = self.model_.async_input_word_embedding(combo_tokens_tensor, [])
2424
if self.model_.position_encoding is not None:
@@ -29,7 +29,7 @@ def _pre_process(self, batch_input: EmbeddingBatchedInput):
2929

3030
if self.model_.pre_decoder_layernorm is not None:
3131
input_embeds = self.model_.pre_decoder_layernorm(input_embeds)
32-
32+
3333
attention_mask = self.model_.create_context_decoder_mask(batch_input.context_lengths_list)
3434
return input_embeds, attention_mask, position_ids_tensor
3535

@@ -50,6 +50,6 @@ def process(self, batch_input: EmbeddingBatchedInput) -> List[EmbeddingOutput]:
5050
prefix_lengths=torch.IntTensor([0] * batch_input.batch_size),
5151
count_length=torch.BoolTensor([True]),
5252
max_prefix_length=torch.IntTensor([0]),
53-
lora_ids=torch.IntTensor([-1] * batch_input.batch_size))
54-
output = self.post_process_module_.process(batch_input, hidden_states, attention_mask)
53+
lora_ids=torch.IntTensor([-1] * batch_input.batch_size))
54+
output = self.post_process_module_.process(batch_input, hidden_states, attention_mask, batch_input.embedding_config)
5555
return output

maga_transformer/async_decoder_engine/embedding/embedding_scheduler.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@ def schedule(self) -> List[EmbeddingStream]:
3030
for stream in copy.copy(self.waiting_streams_):
3131
if total_len + stream.input.input_length > self.config_.max_context_batch_size * self.config_.max_seq_len:
3232
break
33+
# make sure embedding config is the same
34+
if len(new_streams) > 0 and stream.input.embedding_config != new_streams[0].input.embedding_config:
35+
break
3336
new_streams.append(stream)
3437
total_len += stream.input.input_length
3538

maga_transformer/async_decoder_engine/embedding/embedding_stream.py

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,20 @@
11
import torch
2-
from typing import Any, List, Optional
2+
from typing import Any, List, Dict, Optional
33
from maga_transformer.utils.util import to_cuda, to_cpu
44

55
from maga_transformer.distribute.worker_info import g_parallel_info
6-
from maga_transformer.config.generate_config import GenerateConfig
6+
from maga_transformer.embedding.embedding_config import EmbeddingGenerateConfig
77
from maga_transformer.config.base_model_config import PyDanticModelBase
88

99
class EmbeddingInput(PyDanticModelBase):
1010
token_ids: List[int]
1111
token_type_ids: List[int]
1212
input_length: int
13-
generate_config: GenerateConfig
13+
embedding_config: EmbeddingGenerateConfig
1414

1515
class EmbeddingOutput(PyDanticModelBase):
1616
sentence_embedding: Optional[torch.Tensor] = None
17-
sparse_embedding: Optional[torch.Tensor] = None
17+
sparse_embedding: Optional[Dict[str, float]] = None
1818
colbert_embedding: Optional[torch.Tensor] = None
1919

2020
class EmbeddingStream(PyDanticModelBase):
@@ -26,10 +26,9 @@ class EmbeddingStream(PyDanticModelBase):
2626
def set_error(self, error: str):
2727
self.error_info = error
2828

29-
def update(self,
30-
embedding_output: EmbeddingOutput):
31-
self.finished = True
29+
def update(self, embedding_output: EmbeddingOutput):
3230
self.output = embedding_output
31+
self.finished = True
3332

3433
class EmbeddingBatchedInput(object):
3534
def __init__(self, nccl_op: Any) -> None:
@@ -41,6 +40,8 @@ def clear(self):
4140
self.context_lengths_list: List[int] = []
4241
self.combo_tokens: List[int] = []
4342
self.combo_token_type_ids: List[int] = []
43+
# no need to broadcast embedding config since only tp=0 will use it
44+
self.embedding_config = EmbeddingGenerateConfig()
4445

4546
def generate_model_input(self, streams: List[EmbeddingStream]):
4647
self.clear()
@@ -51,6 +52,7 @@ def generate_model_input(self, streams: List[EmbeddingStream]):
5152
self.combo_tokens.extend(stream.input.token_ids)
5253
self.combo_token_type_ids.extend(stream.input.token_type_ids)
5354
self.batch_size = len(self.context_lengths_list)
55+
self.embedding_config = streams[0].input.embedding_config
5456
self.token_num = len(self.combo_tokens)
5557

5658
def tp_sync(self):
@@ -64,13 +66,13 @@ def tp_sync(self):
6466
torch.cuda.current_stream().synchronize()
6567
shape_hints = shape_hints.cpu().numpy()
6668
assert shape_hints[0] == check_num and shape_hints[-1] == check_num2, 'check sum error'
67-
69+
6870
if g_parallel_info.tp_rank == 0:
6971
context_length_tensor = to_cuda(torch.IntTensor(self.context_lengths_list))
7072
combo_tokens_tensor = to_cuda(torch.IntTensor(self.combo_tokens))
7173
combo_token_type_ids_tensor = to_cuda(torch.IntTensor(self.combo_token_type_ids))
7274
else:
73-
self.batch_size = shape_hints[1]
75+
self.batch_size = shape_hints[1]
7476
self.token_num = shape_hints[2]
7577
context_length_tensor = torch.zeros([self.batch_size], dtype=torch.int32, device="cuda:0")
7678
combo_tokens_tensor = torch.zeros([self.token_num], dtype=torch.int32, device="cuda:0")
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import os
2+
from numpy.typing import NDArray
3+
import numpy as np
4+
import torch
5+
from typing import List, Dict, Union, Optional
6+
7+
from maga_transformer.config.gpt_init_model_parameters import GptInitModelParameters
8+
9+
class ColBertEmbeddingModule(object):
    """ColBERT-style per-token embedding head.

    Applies a learned linear projection to every token's hidden state
    (skipping the leading cls position) and returns one variable-length
    tensor of token embeddings per sequence in the batch.
    """

    def __init__(self, hidden_size: int, state_dict: Dict[str, torch.Tensor], dtype: Union[str, torch.dtype]):
        linear = torch.nn.Linear(in_features=hidden_size, out_features=hidden_size)
        linear.load_state_dict(state_dict)
        # Cast to the model dtype, then move onto the GPU.
        self.colbert_linear = linear.to(dtype).cuda()

    def _process_colbert_vecs(self, colbert_vecs: torch.Tensor, tokens_num: int):
        # Drop the vectors of padding tokens; the cls embedding is unused,
        # so only the first tokens_num - 1 rows are kept.
        return colbert_vecs[:tokens_num - 1]

    def __call__(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, input_length: List[int], do_normalize: bool=True) -> List[torch.Tensor]:
        """Project hidden states and return a per-sequence list of token embeddings.

        hidden_states / attention_mask are batched with the cls position at
        index 0, which is sliced off before projecting.
        """
        projected = self.colbert_linear(hidden_states[:, 1:])
        # Zero out padding positions (the mask is sliced the same way as the states).
        projected = projected * attention_mask[:, 1:][:, :, None].float()
        if do_normalize:
            projected = torch.nn.functional.normalize(projected, dim=-1)
        on_cpu = projected.cpu()
        return [self._process_colbert_vecs(vecs, length) for vecs, length in zip(on_cpu, input_length)]
26+
27+
def init_colbert_embedding_module(config: GptInitModelParameters, dtype: Union[str, torch.dtype]) -> Optional[ColBertEmbeddingModule]:
    """Build a ColBertEmbeddingModule when the checkpoint ships colbert weights.

    Returns None when `colbert_linear.pt` is absent from the checkpoint
    directory, i.e. the model does not support colbert embeddings.
    """
    weight_path = os.path.join(config.ckpt_path, 'colbert_linear.pt')
    if not os.path.exists(weight_path):
        return None
    colbert_state_dict = torch.load(weight_path, map_location='cpu')
    return ColBertEmbeddingModule(config.hidden_size, colbert_state_dict, dtype)
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
import os
2+
import json
3+
from collections import OrderedDict
4+
import torch
5+
import torch.nn as nn
6+
from typing import List, Dict, Union
7+
8+
from sentence_transformers.util import import_from_string
9+
from sentence_transformers.models import Transformer, Normalize
10+
11+
from maga_transformer.config.gpt_init_model_parameters import GptInitModelParameters
12+
13+
class DenseEmbeddingModule(object):
    """Interface for dense sentence-embedding heads.

    Subclasses turn per-token hidden states into a single embedding
    per sequence.
    """

    def __call__(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, input_length: List[int], do_normalize: bool) -> torch.Tensor:
        # Concrete subclasses must supply the pooling logic.
        raise NotImplementedError()
16+
17+
def init_dense_embedding_module(config: GptInitModelParameters, dtype: Union[str, torch.dtype]) -> DenseEmbeddingModule:
    """Select the dense head for this checkpoint.

    Uses the sentence-transformers module stack when the checkpoint contains
    a `modules.json`; otherwise falls back to plain first/last-token pooling.
    """
    if os.path.exists(os.path.join(config.ckpt_path, 'modules.json')):
        return SentenceTransformerModule(config, dtype)
    return NormalModule(config.is_causal)
23+
24+
class NormalModule(DenseEmbeddingModule):
    """Dense pooling without extra learned weights.

    Causal (decoder-style) models are pooled at the last valid token of each
    sequence; otherwise the first token (cls position) is used.
    """

    def __init__(self, is_casual: bool):
        # NOTE(review): "casual" is a pre-existing misspelling of "causal";
        # the name is kept so existing keyword callers do not break.
        self.is_casual = is_casual

    def __call__(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, input_lengths: List[int], do_normalize: bool) -> torch.Tensor:
        """Return one pooled embedding per sequence, optionally L2-normalized."""
        if self.is_casual:
            # Last valid token of each sequence.
            ts = torch.stack([hidden_states[idx][length - 1] for idx, length in enumerate(input_lengths)])
        else:
            # First token (cls position) of each sequence.
            ts = torch.stack([hidden_states[idx][0] for idx in range(len(input_lengths))])

        if do_normalize:
            ts = torch.nn.functional.normalize(ts, dim=1)
        return ts
38+
39+
class SentenceTransformerModule(DenseEmbeddingModule):
    """Dense head built from a sentence-transformers module stack.

    Reconstructs the pooling/normalize pipeline described by the checkpoint's
    `modules.json`, while deliberately skipping the Transformer backbone
    module — presumably because the backbone forward pass is executed by this
    engine rather than by sentence-transformers (TODO confirm).
    """

    def __init__(self, config: GptInitModelParameters, dtype: Union[str, torch.dtype]):
        modules_config_path = os.path.join(config.ckpt_path, 'modules.json')
        assert os.path.exists(modules_config_path), "not found modules.json from sentence_transformer"
        with open(modules_config_path) as fIn:
            modules_config = json.load(fIn)
        # Ordered: sentence-transformers applies modules strictly in sequence.
        modules: OrderedDict[str, nn.Module] = OrderedDict()
        for module_config in modules_config:
            module_class = import_from_string(module_config["type"])
            # For Transformer, don't load the full directory, rely on `transformers` instead
            # But, do load the config file first.
            if module_class == Transformer and module_config["path"] == "":
                pass
            else:
                # Normalize does not require any files to be loaded
                if module_class == Normalize:
                    module_path = None
                else:
                    module_path = os.path.join(config.ckpt_path, module_config["path"])
                module = module_class.load(module_path)
                modules[module_config["name"]] = module
        self.model = nn.Sequential(modules).cuda().to(dtype)

    def __call__(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, input_length: List[int], do_normalize: bool) -> torch.Tensor:
        """Run the module stack on token embeddings and return sentence embeddings.

        NOTE(review): `do_normalize` is ignored here — normalization happens
        only if the checkpoint's module stack includes a Normalize module;
        confirm that callers expect this.
        """
        input = {
            "token_embeddings": hidden_states,
            "attention_mask": attention_mask
        }
        return self.model(input)['sentence_embedding']

maga_transformer/async_decoder_engine/embedding/post_process/post_process_factory.py

Lines changed: 0 additions & 24 deletions
This file was deleted.
Lines changed: 73 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,77 @@
11
import torch
2-
from typing import List
3-
from maga_transformer.async_decoder_engine.embedding.embedding_stream import EmbeddingBatchedInput, EmbeddingOutput
2+
from typing import List, Union, Optional, Dict, Tuple
3+
from torch.nn.utils.rnn import pad_sequence
4+
from transformers import PreTrainedTokenizerBase
45

6+
from maga_transformer.utils.util import to_cuda
7+
from maga_transformer.embedding.embedding_config import EmbeddingGenerateConfig, EmbeddingType
8+
from maga_transformer.config.gpt_init_model_parameters import GptInitModelParameters
9+
from maga_transformer.async_decoder_engine.embedding.post_process.dense_embedding_module import init_dense_embedding_module
10+
from maga_transformer.async_decoder_engine.embedding.post_process.sparse_emebdding_module import init_sparse_embedding_module
11+
from maga_transformer.async_decoder_engine.embedding.post_process.colbert_embedding_module import init_colbert_embedding_module
12+
from maga_transformer.async_decoder_engine.embedding.embedding_stream import EmbeddingBatchedInput, EmbeddingOutput
513

614
class PostProcessModule(object):
    """Converts packed decoder hidden states into per-request embedding outputs.

    Depending on the request's EmbeddingGenerateConfig.type this produces a
    dense sentence embedding, a sparse (token-weight) embedding, or colbert
    per-token embeddings. The sparse and colbert heads are optional: they are
    None when the checkpoint does not ship their extra weights, and selecting
    them then raises.
    """

    def __init__(self, config: GptInitModelParameters, dtype: Union[torch.dtype, str], tokenizer: PreTrainedTokenizerBase):
        self.config_ = config
        self.dtype_ = dtype
        self.tokenizer_ = tokenizer
        # Fall back to 0 when the tokenizer defines no pad token id.
        self.pad_token_id_ = self.tokenizer_.pad_token_id if self.tokenizer_.pad_token_id is not None else 0
        self.dense_embedding_module_ = init_dense_embedding_module(config, dtype)
        # These two may be None when the checkpoint lacks the corresponding weights.
        self.sparse_embedding_module_ = init_sparse_embedding_module(config, tokenizer, dtype)
        self.colbert_embedding_module_ = init_colbert_embedding_module(config, dtype)

    # attention_mask from [batch, max_seq, max_seq] to [batch, max_seq]
    # hidden_states/input_ids from [combo_length, hidden_states] to [batch, max_seq, hidden_states]
    def _reorder_input(self, batch_input: EmbeddingBatchedInput, hidde_states: torch.Tensor, attention_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Unpack the concatenated batch into padded per-sequence tensors."""
        sliced_hidden_states: List[torch.Tensor] = []
        sliced_input_ids: List[torch.Tensor] = []
        attention_mask_indexs: List[int] = []
        hidden_bias = 0
        mask_bias = 0
        for input_length in batch_input.context_lengths_list:
            sliced_hidden_states.append(hidde_states[hidden_bias: hidden_bias + input_length])
            sliced_input_ids.append(torch.IntTensor(batch_input.combo_tokens[hidden_bias: hidden_bias + input_length]))
            # Index of the mask row that covers the full sequence length.
            attention_mask_indexs.append(mask_bias + input_length - 1)
            mask_bias += attention_mask.shape[1]
            hidden_bias += input_length
        # NOTE(review): hidden states are padded with pad_token_id_, a token id
        # rather than an embedding value — presumably harmless because padded
        # positions are masked downstream; confirm.
        batched_hidden_states = pad_sequence(sliced_hidden_states, batch_first=True, padding_value=self.pad_token_id_)
        batched_input_ids = pad_sequence(sliced_input_ids, batch_first=True, padding_value=self.pad_token_id_)
        batched_attention_mask = attention_mask.reshape(-1, attention_mask.shape[2])[attention_mask_indexs].contiguous()
        return batched_input_ids, batched_hidden_states, batched_attention_mask

    def _set_outputs(self, outputs: List[EmbeddingOutput],
                     dense_embedding: Optional[torch.Tensor],
                     sparse_embedding: Optional[List[Dict[str, float]]],
                     colbert_embedding: Optional[List[torch.Tensor]]):
        """Scatter whichever embedding kind was computed into per-request outputs."""
        if dense_embedding is not None:
            for index, dense in enumerate(dense_embedding):
                outputs[index].sentence_embedding = dense
        if sparse_embedding is not None:
            for index, sparse in enumerate(sparse_embedding):
                outputs[index].sparse_embedding = sparse
        if colbert_embedding is not None:
            for index, colbert in enumerate(colbert_embedding):
                outputs[index].colbert_embedding = colbert

    def process(self, batch_input: EmbeddingBatchedInput, hidde_states: torch.Tensor, attention_mask: torch.Tensor, embedding_config: EmbeddingGenerateConfig) -> List[EmbeddingOutput]:
        """Run the head selected by embedding_config.type over the batch.

        Raises when a sparse/colbert embedding is requested but the checkpoint
        does not provide the corresponding module.
        """
        outputs = [EmbeddingOutput() for _ in range(batch_input.batch_size)]
        batch_input_ids, batch_hidden_states, batch_attention_mask = self._reorder_input(batch_input, hidde_states, attention_mask)
        dense_embedding = None
        sparse_embedding = None
        colbert_embedding = None
        # The embedding types are mutually exclusive, so dispatch with elif.
        if embedding_config.type == EmbeddingType.DENSE:
            dense_embedding = self.dense_embedding_module_(hidden_states=batch_hidden_states, attention_mask=batch_attention_mask,
                                                           input_length=batch_input.context_lengths_list,
                                                           do_normalize=embedding_config.do_normalize)
        elif embedding_config.type == EmbeddingType.SPARSE:
            if self.sparse_embedding_module_ is None:
                raise Exception("module not support sparse embedding")
            sparse_embedding = self.sparse_embedding_module_(batch_input_ids, batch_hidden_states)
        elif embedding_config.type == EmbeddingType.COLBERT:
            if self.colbert_embedding_module_ is None:
                raise Exception("module not support colbert embedding")
            colbert_embedding = self.colbert_embedding_module_(batch_hidden_states, batch_attention_mask, batch_input.context_lengths_list, do_normalize=embedding_config.do_normalize)
        self._set_outputs(outputs, dense_embedding, sparse_embedding, colbert_embedding)
        return outputs

0 commit comments

Comments
 (0)