@@ -860,21 +860,27 @@ def call(self, inputs):
         half_sample = sample[:half_sample_len]
 
         # Tokenize the text
-        half_sample_tokenized = tokenizer(half_sample)
+        half_sample_tokenized = tokenizer(
+            half_sample,
+            max_length=max_seq_length,
+            padding='max_length',
+            truncation=True,
+            add_special_tokens=False
+        )['input_ids']
 
-        # Extract token IDs as a list of integers (not tensors)
-        if isinstance(half_sample_tokenized, dict):
-            # If tokenizer returns a dict, extract the token IDs
-            token_ids = half_sample_tokenized['input_ids']  # or 'token_ids' depending on your tokenizer
-        else:
-            # If tokenizer returns a list directly
-            token_ids = half_sample_tokenized
+        # # Extract token IDs as a list of integers (not tensors)
+        # if isinstance(half_sample_tokenized, dict):
+        #     # If tokenizer returns a dict, extract the token IDs
+        #     token_ids = half_sample_tokenized['input_ids']  # or 'token_ids' depending on your tokenizer
+        # else:
+        #     # If tokenizer returns a list directly
+        #     token_ids = half_sample_tokenized
 
-        # Convert to Python list of integers if it's a tensor
-        if hasattr(token_ids, 'numpy'):
-            token_ids = token_ids.numpy().tolist()
-        if not isinstance(token_ids, list):
-            token_ids = list(token_ids)
+        # # Convert to Python list of integers if it's a tensor
+        # if hasattr(token_ids, 'numpy'):
+        #     token_ids = token_ids.numpy().tolist()
+        # if not isinstance(token_ids, list):
+        #     token_ids = list(token_ids)
 
         # Now pass the list of integers to your generate method
         generated_tokens = generator.generate(
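
A minimal sketch of what the rewritten call returns, assuming a Hugging Face-style tokenizer (the checkpoint name, sample text, and max_seq_length value below are illustrative, not taken from this repo):

# Sketch only: assumes a Hugging Face tokenizer; names and values are illustrative.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
max_seq_length = 128

half_sample = "the quick brown fox jumps over"

# padding='max_length' and truncation=True give every sample exactly
# max_seq_length IDs, and ['input_ids'] is already a plain Python list of
# ints, which is why the dict/tensor handling below is commented out.
input_ids = tokenizer(
    half_sample,
    max_length=max_seq_length,
    padding='max_length',
    truncation=True,
    add_special_tokens=False
)['input_ids']

print(len(input_ids))   # 128, regardless of the text length
print(input_ids[:6])    # leading token IDs; the remainder is the PAD id
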
@@ -960,15 +966,21 @@ def call(self, inputs):
         half_sample = sample[:half_sample_len]
 
         # Tokenize the text
-        half_sample_tokenized = tokenizer(half_sample)
+        half_sample_tokenized = tokenizer(
+            half_sample,
+            max_length=max_seq_length,
+            padding='max_length',
+            truncation=True,
+            add_special_tokens=False
+        )['input_ids']
 
-        # Extract token IDs as a list of integers (not tensors)
-        if isinstance(half_sample_tokenized, dict):
-            # If tokenizer returns a dict, extract the token IDs
-            token_ids = half_sample_tokenized['input_ids']  # or 'token_ids' depending on your tokenizer
-        else:
-            # If tokenizer returns a list directly
-            token_ids = half_sample_tokenized
+        # # Extract token IDs as a list of integers (not tensors)
+        # if isinstance(half_sample_tokenized, dict):
+        #     # If tokenizer returns a dict, extract the token IDs
+        #     token_ids = half_sample_tokenized['input_ids']  # or 'token_ids' depending on your tokenizer
+        # else:
+        #     # If tokenizer returns a list directly
+        #     token_ids = half_sample_tokenized
 
         # Convert to Python list of integers if it's a tensor
         if hasattr(token_ids, 'numpy'):
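
This second hunk keeps the tensor-to-list conversion in place. An illustrative check of that logic, assuming eager TensorFlow tensors (the helper name to_id_list is hypothetical, introduced only for the example):

# Sketch only: eager tf.Tensor objects expose .numpy(), so they take the
# first branch; other sequences that are not already lists fall through to
# list(). The helper name is made up for this example.
import tensorflow as tf

def to_id_list(token_ids):
    if hasattr(token_ids, 'numpy'):            # e.g. an eager tf.Tensor
        token_ids = token_ids.numpy().tolist()
    if not isinstance(token_ids, list):        # e.g. a tuple
        token_ids = list(token_ids)
    return token_ids

print(to_id_list(tf.constant([101, 2023, 2003])))  # [101, 2023, 2003]
print(to_id_list((101, 2023, 2003)))               # [101, 2023, 2003]
print(to_id_list([101, 2023, 2003]))               # unchanged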