Commit 2cba23b (parent: b242810)

Improve the BERT model's ability to handle long texts, addressing the 512-token limit

File tree: 4 files changed, +61 −9 lines

  .gitignore
  flask4modelcache.py
  modelcache/adapter/adapter.py
  modelcache/embedding/data2vec.py

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -147,3 +147,4 @@ dmypy.json
 /embedding_npy
 /flask_server
 *.bin
+*ini
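A note on the new ignore pattern: in gitignore glob syntax, `*ini` matches any file or directory whose name merely ends in "ini" (for example config.ini, but also a file named mini); if only .ini files are meant, `*.ini` would be the stricter spelling.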

flask4modelcache.py

Lines changed: 5 additions & 3 deletions
@@ -1,10 +1,9 @@
 # -*- coding: utf-8 -*-
-import json
+import time
+from datetime import datetime
 from flask import Flask, request
 import logging
-from datetime import datetime
 import configparser
-import time
 import json
 from modelcache import cache
 from modelcache.adapter import adapter
@@ -105,10 +104,12 @@ def user_backend():
 
     if request_type == 'query':
         try:
+            start_time = time.time()
             response = adapter.ChatCompletion.create_query(
                 scope={"model": model},
                 query=query
             )
+            delta_time = '{}s'.format(round(time.time() - start_time, 2))
             if response is None:
                 result = {"errorCode": 0, "errorDesc": '', "cacheHit": False, "delta_time": delta_time, "hit_query": '',
                           "answer": ''}
@@ -120,6 +121,7 @@ def user_backend():
             hit_query = response_hitquery(response)
             result = {"errorCode": 0, "errorDesc": '', "cacheHit": True, "delta_time": delta_time,
                       "hit_query": hit_query, "answer": answer}
+            delta_time_log = round(time.time() - start_time, 2)
             future = executor.submit(save_query_info, result, model, query, delta_time_log)
         except Exception as e:
             result = {"errorCode": 202, "errorDesc": e, "cacheHit": False, "delta_time": 0,

modelcache/adapter/adapter.py

Lines changed: 1 addition & 0 deletions
@@ -35,6 +35,7 @@ def create_insert(cls, *args, **kwargs):
             logging.info('adapt_insert_e: {}'.format(e))
             return 'adapt_insert_exception'
 
+
     @classmethod
     def create_remove(cls, *args, **kwargs):
         try:

modelcache/embedding/data2vec.py

Lines changed: 54 additions & 6 deletions
@@ -28,18 +28,66 @@ def __init__(self, model: str = "sentence-transformers/all-MiniLM-L6-v2"):
         config = AutoConfig.from_pretrained(model)
         self.__dimension = config.hidden_size
 
+        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
         self.tokenizer = BertTokenizer.from_pretrained(model, local_files_only=True)
         self.model = BertModel.from_pretrained(model, local_files_only=True)
 
     def to_embeddings(self, data, **_):
         encoded_input = self.tokenizer(data, padding=True, truncation=True, return_tensors='pt')
-        with torch.no_grad():
-            model_output = self.model(**encoded_input)
+        num_tokens = sum(map(len, encoded_input['input_ids']))
 
-        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
-        sentence_embeddings = sentence_embeddings.squeeze(0).detach().numpy()
-        embedding_array = np.array(sentence_embeddings).astype("float32")
-        return embedding_array
+        if num_tokens <= 512:
+            with torch.no_grad():
+                encoded_input = {k: v.to(self.device) for k, v in encoded_input.items()}
+                model_output = self.model(**encoded_input)
+            sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
+            sentence_embeddings = sentence_embeddings.squeeze(0).detach().cpu().numpy()
+            embedding_array = np.array(sentence_embeddings).astype("float32")
+            return embedding_array
+        else:
+            window_size = 510
+            start = 0
+            input_ids = encoded_input['input_ids']
+            input_ids = input_ids[:, 1:-1]
+            start_token = self.tokenizer.cls_token
+            end_token = self.tokenizer.sep_token
+            start_token_id = self.tokenizer.convert_tokens_to_ids(start_token)
+            end_token_id = self.tokenizer.convert_tokens_to_ids(end_token)
+            begin_element = torch.tensor([[start_token_id]])
+            end_element = torch.tensor([[end_token_id]])
+
+            embedding_array_list = list()
+            while start < num_tokens:
+                # Calculate the ending position of the sliding window.
+                end = start + window_size
+                # If the ending position exceeds the length, adjust it to the length.
+                if end > num_tokens:
+                    end = num_tokens
+                # Retrieve the data within the sliding window.
+                input_ids_window = input_ids[:, start:end]
+                # Insert a new element at position 0.
+                input_ids_window = torch.cat([begin_element, input_ids_window[:, 0:]], dim=1)
+                # Insert a new element at the last position.
+                input_ids_window = torch.cat([input_ids_window, end_element], dim=1)
+                input_ids_window_length = sum(map(len, input_ids_window))
+                token_type_ids = torch.tensor([[0] * input_ids_window_length])
+                attention_mask = torch.tensor([[1] * input_ids_window_length])
+
+                # Concatenate new input_ids
+                encoded_input_window = {'input_ids': input_ids_window, 'token_type_ids': token_type_ids,
+                                        'attention_mask': attention_mask}
+                with torch.no_grad():
+                    encoded_input_window = {k: v.to(self.device) for k, v in encoded_input_window.items()}
+                    model_output_window = self.model(**encoded_input_window)
+
+                sentence_embeddings_window = mean_pooling(model_output_window, encoded_input_window['attention_mask'])
+                sentence_embeddings_window = sentence_embeddings_window.squeeze(0).detach().cpu().numpy()
+                embedding_array_window = np.array(sentence_embeddings_window).astype("float32")
+                embedding_array_list.append(embedding_array_window)
+                start = end
+
+            embedding_array = np.mean(embedding_array_list, axis=0)
+            return embedding_array
 
     def post_proc(self, token_embeddings, inputs):
         attention_mask = inputs["attention_mask"]
