@@ -755,6 +755,45 @@ def test_imagetext2text_generation_zai_glm(self):
         )
         print(output_text)
 
+    @never_test()
+    def test_sentence_similarity_alibaba_nlp_gte(self):
760+ """
761+ clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k alibaba
762+ """
+        import torch.nn.functional as F
+        from transformers import AutoModel, AutoTokenizer
+
+        input_texts = [
+            "what is the capital of China?",
+            "how to implement quick sort in python?",
+            "Beijing",
+            "sorting algorithms",
+        ]
+
+        model_path = "Alibaba-NLP/gte-large-en-v1.5"
+        tokenizer = AutoTokenizer.from_pretrained(model_path)
+        model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
+
+        # Tokenize the input texts (gte-large-en-v1.5 accepts up to 8192 tokens)
+        batch_dict = tokenizer(
+            input_texts, max_length=8192, padding=True, truncation=True, return_tensors="pt"
+        )
+
+        print("-- type:", type(model))
+        print("-- subclasses:", type(model).__subclasses__())
+        print("-- inputs:", self.string_type(batch_dict, with_shape=True))
+        outputs = model(**batch_dict)
+        print("-- outputs:", self.string_type(outputs, with_shape=True))
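+        # Pool by taking the [CLS] token (position 0) as the sentence embedding.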
+        embeddings = outputs.last_hidden_state[:, 0]
+
+        # (Optionally) normalize embeddings
+        embeddings = F.normalize(embeddings, p=2, dim=1)
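+        # With unit-norm embeddings, the matmul yields cosine similarities
+        # between the query (first text) and the three candidates.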
+        scores = (embeddings[:1] @ embeddings[1:].T) * 100
+        print(scores.tolist())
+
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)
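
For a quick sanity check of the scoring step without downloading the model, here is a
minimal sketch with random tensors standing in for the [CLS] embeddings (the hidden
size of 1024 for gte-large-en-v1.5 is an assumption):

import torch
import torch.nn.functional as F

# Hypothetical stand-ins for the four [CLS] embeddings the test produces.
embeddings = torch.randn(4, 1024)

# Same scoring as the test: after L2 normalization, dot products equal
# cosine similarities, scaled here to roughly [-100, 100].
embeddings = F.normalize(embeddings, p=2, dim=1)
scores = (embeddings[:1] @ embeddings[1:].T) * 100

print(scores.shape)  # torch.Size([1, 3]): the query against three candidates
print(scores.tolist())

With random vectors the scores stay near zero; with the real model, "Beijing" should
score markedly higher against the capital question than "sorting algorithms" does.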