@@ -859,32 +859,24 @@ def call(self, inputs):
859859 half_sample_len = int (np .ceil (len (sample ) / 2 ))
860860 half_sample = sample [:half_sample_len ]
861861
862- # Tokenize the text
862+ # Tokenize without padding or truncation so only the prompt's real tokens are returned
863863 half_sample_tokenized = tokenizer (
864864 half_sample ,
865- max_length = MAX_SEQ_LENGTH ,
866- padding = 'max_length' ,
867- truncation = True ,
868865 add_special_tokens = False
869866 )['input_ids' ]
870867
871- # # Extract token IDs as a list of integers (not tensors)
872- # if isinstance(half_sample_tokenized, dict):
873- # # If tokenizer returns a dict, extract the token IDs
874- # token_ids = half_sample_tokenized['input_ids'] # or 'token_ids' depending on your tokenizer
875- # else:
876- # # If tokenizer returns a list directly
877- # token_ids = half_sample_tokenized
868+ # Convert to Python list of integers
869+ if hasattr (half_sample_tokenized , 'numpy' ):
870+ token_ids = half_sample_tokenized .numpy ().tolist ()
871+ else :
872+ token_ids = [int (token_id ) for token_id in half_sample_tokenized ]
878873
879- # # Convert to Python list of integers if it's a tensor
880- # if hasattr(token_ids, 'numpy'):
881- # token_ids = token_ids.numpy().tolist()
882- # if not isinstance(token_ids, list):
883- # token_ids = list(token_ids)
874+ print (f"Actual token count: { len (token_ids )} " )
875+ print (f"First 10 tokens: { token_ids [:10 ]} " )
884876
885877 # Pass the list of integer token IDs to the generator's generate method
886878 generated_tokens = generator .generate (
887- token_ids = half_sample_tokenized , # This should now be a list of integers
879+ token_ids = token_ids , # Just the actual tokens, no padding
888880 do_sample = False ,
889881 max_new_tokens = 40
890882 )
@@ -962,39 +954,30 @@ def call(self, inputs):
962954reconstituted_generator = tf .keras .models .load_model (model_save_path )
963955print ("Model reconstituted successfully!" )
964956
965- ##### here <--------<<<<<<
966957
967958counter = 0
968959for sample in non_instruct_samples :
969960 half_sample_len = int (np .ceil (len (sample ) / 2 ))
970961 half_sample = sample [:half_sample_len ]
971962
972- # Tokenize the text
963+ # Tokenize without padding or truncation so only the prompt's real tokens are returned
973964 half_sample_tokenized = tokenizer (
974965 half_sample ,
975- max_length = MAX_SEQ_LENGTH ,
976- padding = 'max_length' ,
977- truncation = True ,
978966 add_special_tokens = False
979967 )['input_ids' ]
980968
981- # # Extract token IDs as a list of integers (not tensors)
982- # if isinstance(half_sample_tokenized, dict):
983- # # If tokenizer returns a dict, extract the token IDs
984- # token_ids = half_sample_tokenized['input_ids'] # or 'token_ids' depending on your tokenizer
985- # else:
986- # # If tokenizer returns a list directly
987- # token_ids = half_sample_tokenized
969+ # Convert to Python list of integers
970+ if hasattr (half_sample_tokenized , 'numpy' ):
971+ token_ids = half_sample_tokenized .numpy ().tolist ()
972+ else :
973+ token_ids = [int (token_id ) for token_id in half_sample_tokenized ]
988974
989- # # Convert to Python list of integers if it's a tensor
990- # if hasattr(token_ids, 'numpy'):
991- # token_ids = token_ids.numpy().tolist()
992- # if not isinstance(token_ids, list):
993- # token_ids = list(token_ids)
975+ print (f"Actual token count: { len (token_ids )} " )
976+ print (f"First 10 tokens: { token_ids [:10 ]} " )
994977
995978 # Pass the list of integer token IDs to the reconstituted generator's generate method
996- generated_tokens = reconstituted_generator .generate (
997- token_ids = half_sample_tokenized , # This should now be a list of integers
979+ generated_tokens = reconstituted_generator .generate (
980+ token_ids = token_ids , # Just the actual tokens, no padding
998981 do_sample = False ,
999982 max_new_tokens = 40
1000983 )
@@ -1004,6 +987,7 @@ def call(self, inputs):
1004987 print (f"PROMPT number { counter } : { half_sample } ; RESPONSE: { full_generated_text } " )
1005988 counter += 1
1006989
990+
1007991# # Test with all original data samples - REAL WORLD DEMO (reconstituted)
1008992# print("\n" + "="*50)
1009993# print("GENERATED TEXT SAMPLES FROM ALL DATA - REAL WORLD USAGE (reconstituted)")
0 commit comments