@@ -231,7 +231,7 @@ def package_non_instruct_text(text: str, desired_samples: int, max_length_tokens
231231 return samples
232232
233233# Separate into samples
234- non_instruct_samples = package_non_instruct_text (text = bible , desired_samples = 30 , max_length_tokens = 1200 )
234+ non_instruct_samples = package_non_instruct_text(text=bible, desired_samples=30, max_length_tokens=int(np.ceil(MAX_SEQ_LENGTH * .8)))
235235
236236del (bible )
237237collect ()
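
The replacement above ties the packing cap to the model's context window instead of the previous hard-coded 1,200 tokens, leaving roughly 20% of the window free for generated tokens. A minimal sketch of the arithmetic, assuming MAX_SEQ_LENGTH = 1024 purely for illustration (the script defines its own value earlier):

import numpy as np

MAX_SEQ_LENGTH = 1024  # illustrative assumption, not the value used by the script
max_length_tokens = int(np.ceil(MAX_SEQ_LENGTH * 0.8))
print(max_length_tokens)  # 820 -> packed samples stay within ~80% of the window
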
@@ -641,9 +641,9 @@ def reset_state(self):
641641
642642best_model_found = cerebros_automl .get_best_model ()
643643best_model_found .save (MODEL_FILE_NAME )
644- del (best_model_found )
645- del (cerebros_automl )
646- collect ()
644+ # del(best_model_found)
645+ # del(cerebros_automl)
646+ # collect()
647647
648648file_size_bytes = getsize (MODEL_FILE_NAME )
649649print (f"Model size on disk: { file_size_bytes / (1024 * 1024 ):.2f} MB" )
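
Commenting out these del calls matters later in the commit: the generator wrapper now holds self.model = best_model_found, so the in-memory object has to outlive the search. A hedged sketch of the alternative, keeping the cleanup and working from the serialized copy instead (MODEL_FILE_NAME is the same constant used above; depending on the custom Cerebros layers, load_model may also need a custom_objects argument):

from gc import collect
import tensorflow as tf

# Free the search artifacts, then reload the model that was just saved to disk.
del best_model_found, cerebros_automl
collect()
reconstituted_model = tf.keras.models.load_model(MODEL_FILE_NAME)
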
@@ -659,62 +659,62 @@ def reset_state(self):
659659pad_token_id = tokenizer .pad_token_id
660660end_prompt_token_id = tokenizer .encode ("</prompt>" , add_special_tokens = False )[0 ]
661661
662- # Generate text for first 5 test samples (Working)
663- generated_texts = []
664- for i in range (min (5 , len (x_test_packaged [0 ]))):
665- original_input = x_test_packaged [0 ][i ].numpy ()
662+ # # Generate text for first 5 test samples (Working)
663+ # generated_texts = []
664+ # for i in range(min(5, len(x_test_packaged[0]))):
665+ # original_input = x_test_packaged[0][i].numpy()
666666
667- # Find the end of the prompt
668- try :
669- end_prompt_index = list (original_input ).index (end_prompt_token_id )
670- except ValueError :
671- end_prompt_index = 0
667+ # # Find the end of the prompt
668+ # try:
669+ # end_prompt_index = list(original_input).index(end_prompt_token_id)
670+ # except ValueError:
671+ # end_prompt_index = 0
672672
673- # Extract the prompt part
674- prompt_tokens = original_input [:end_prompt_index + 1 ].tolist ()
673+ # # Extract the prompt part
674+ # prompt_tokens = original_input[:end_prompt_index+1].tolist()
675675
676- # Generate tokens sequentially
677- generated_tokens = []
678- current_input = prompt_tokens .copy ()
676+ # # Generate tokens sequentially
677+ # generated_tokens = []
678+ # current_input = prompt_tokens.copy()
679679
680- # Generate up to 100 tokens or until pad token
681- for _ in range (100 ):
682- # Pad or truncate to MAX_SEQ_LENGTH
683- input_tensor = tf .constant ([current_input + [pad_token_id ] * (MAX_SEQ_LENGTH - len (current_input ))], dtype = tf .int32 )
680+ # # Generate up to 100 tokens or until pad token
681+ # for _ in range(100):
682+ # # Pad or truncate to MAX_SEQ_LENGTH
683+ # input_tensor = tf.constant([current_input + [pad_token_id] * (MAX_SEQ_LENGTH - len(current_input))], dtype=tf.int32)
684684
685- # Get prediction
686- prediction = reconstituted_model (input_tensor )
687- next_token_id = int (tf .argmax (prediction [0 ], axis = - 1 ).numpy ())
685+ # # Get prediction
686+ # prediction = reconstituted_model(input_tensor)
687+ # next_token_id = int(tf.argmax(prediction[0], axis=-1).numpy())
688688
689- # Stop if pad token generated
690- if next_token_id == pad_token_id :
691- break
689+ # # Stop if pad token generated
690+ # if next_token_id == pad_token_id:
691+ # break
692692
693- generated_tokens .append (next_token_id )
694- current_input .append (next_token_id )
693+ # generated_tokens.append(next_token_id)
694+ # current_input.append(next_token_id)
695695
696- # Stop if we exceed max length
697- if len (current_input ) >= MAX_SEQ_LENGTH :
698- break
696+ # # Stop if we exceed max length
697+ # if len(current_input) >= MAX_SEQ_LENGTH:
698+ # break
699699
700- generated_texts .append ((prompt_tokens , generated_tokens ))
700+ # generated_texts.append((prompt_tokens, generated_tokens))
701701
702- # Decode and print with proper formatting
703- for idx , (prompt_tokens , generated_tokens ) in enumerate (generated_texts ):
704- # Decode prompt
705- prompt_text = tokenizer .decode (prompt_tokens , skip_special_tokens = False )
702+ # # Decode and print with proper formatting
703+ # for idx, (prompt_tokens, generated_tokens) in enumerate(generated_texts):
704+ # # Decode prompt
705+ # prompt_text = tokenizer.decode(prompt_tokens, skip_special_tokens=False)
706706
707- # Extract original prompt content
708- if '<prompt>' in prompt_text and '</prompt>' in prompt_text :
709- original_prompt = prompt_text .split ('<prompt>' )[- 1 ].split ('</prompt>' )[0 ]
710- else :
711- original_prompt = prompt_text [:50 ] + "..." if len (prompt_text ) > 50 else prompt_text
707+ # # Extract original prompt content
708+ # if '<prompt>' in prompt_text and '</prompt>' in prompt_text:
709+ # original_prompt = prompt_text.split('<prompt>')[-1].split('</prompt>')[0]
710+ # else:
711+ # original_prompt = prompt_text[:50] + "..." if len(prompt_text) > 50 else prompt_text
712712
713- # Decode generated text
714- generated_text = tokenizer .decode (generated_tokens , skip_special_tokens = False ) if generated_tokens else ""
713+ # # Decode generated text
714+ # generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=False) if generated_tokens else ""
715715
716- print (f"\n Generated text from sample { idx + 1 } :" )
717- print (f"<prompt>{ original_prompt } </prompt>{ generated_text } " )
716+ # print(f"\nGenerated text from sample {idx+1}:")
717+ # print(f"<prompt>{original_prompt}</prompt>{generated_text}")
718718
719719
720720
@@ -747,7 +747,7 @@ def __init__(self, config, **kwargs):
747747 self .max_sequence_length = config .max_sequence_length
748748 self .padding_token = config .padding_token
749749 # Make self.model = the reconstituted model (constant)
750- self .model = reconstituted_model
750+ self .model = best_model_found # reconstituted_model
751751
752752 def get_config (self ):
753753 return {
@@ -786,6 +786,7 @@ def generate(self, token_ids, do_sample=False, max_new_tokens=None):
786786 current_tokens = token_ids .copy ()
787787
788788 # Autoregressive generation loop
789+ temp_gen_count = 0 # <--------<< Debug code to remove later
789790 for _ in range (max_new_tokens ):
790791 # Pad or truncate to max_sequence_length (CORRECTED PADDING LOGIC)
791792 if len (current_tokens ) > self .max_sequence_length :
@@ -802,12 +803,18 @@ def generate(self, token_ids, do_sample=False, max_new_tokens=None):
802803 # Get next token based on sampling strategy
803804 if do_sample :
804805 # Sample from the distribution
805- probabilities = tf .nn .softmax (logits [0 ], axis = - 1 )
806- next_token_id = tf .random .categorical (tf .math .log (probabilities )[None , :], 1 )[0 , 0 ].numpy ()
806+ # probabilities = tf.nn.softmax(logits[0], axis=-1)  # Model already applies softmax
807+ next_token_id = tf.random.categorical(tf.math.log(logits[0])[None, :], 1)[0, 0].numpy()
807808 else :
808809 # Greedy sampling (argmax)
809810 next_token_id = int (tf .argmax (logits [0 ], axis = - 1 ).numpy ())
810-
811+ # Debug code to remove later
812+ print(f"Generating {temp_gen_count}")
813+ print(f"... next_token_id: {next_token_id}")
814+ next_word = tokenizer.decode(next_token_id)
815+ print(f"Next decoded word: {next_word}")
816+ temp_gen_count += 1
817+
811818 # Check for termination condition
812819 if next_token_id == self .padding_token :
813820 break
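
The sampling change in this hunk removes a double softmax: the model's output head already yields a probability distribution, and tf.random.categorical expects log-scores, so the commit now logs the model output directly instead of softmaxing it again. A self-contained sketch of that behaviour with a toy distribution (the probabilities below are illustrative only):

import tensorflow as tf

probs = tf.constant([0.7, 0.2, 0.1])       # stand-in for one softmax output row
log_probs = tf.math.log(probs)[None, :]    # shape [1, vocab], as categorical expects
next_token_id = tf.random.categorical(log_probs, 1)[0, 0].numpy()
print(int(next_token_id))                  # index 0 roughly 70% of the time

# Re-applying tf.nn.softmax to probs before the log (the line now commented out)
# would flatten the distribution and push sampling toward uniform.
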
@@ -845,54 +852,73 @@ def call(self, inputs):
845852)
846853generator = CerebrosAutoregressiveTextGenerator (config )
847854
848- # Process ALL original samples from data - REAL WORLD USAGE
849- generated_texts = []
850- for i , original_text in enumerate (data [:5 ]): # Process first 5 samples
851- print (f"\n Processing sample { i + 1 } ..." )
852-
853- # Extract prompt part (everything up to and including </prompt>)
854- if '</prompt>' in original_text :
855- prompt_part = original_text .split ('</prompt>' )[0 ] + '</prompt>'
856- else :
857- prompt_part = original_text
858-
859- # Tokenize the prompt part
860- tokenized = tokenizer (
861- prompt_part ,
862- add_special_tokens = False , # We handle special tokens manually
863- return_tensors = None # Return lists, not tensors
864- )
865- prompt_tokens = tokenized ['input_ids' ]
866-
867- print (f"Original prompt: { prompt_part [:100 ]} ..." )
868- print (f"Tokenized prompt length: { len (prompt_tokens )} tokens" )
869-
870- # Generate tokens using the wrapper class - REAL WORLD USAGE
855+ print("########### BEFORE SERIALIZING THE GENERATIVE MODEL")
856+
857+ counter = 0
858+ for sample in non_instruct_samples :
859+ half_sample_len = int(np.ceil(len(sample) / 2))
860+ half_sample = sample[:half_sample_len]
861+ half_sample_tokenized = tokenizer(half_sample)['input_ids']
871862 generated_tokens = generator .generate (
872- token_ids = prompt_tokens ,
863+ token_ids = half_sample_tokenized ,
873864 do_sample = False ,
874- max_new_tokens = 100
865+ max_new_tokens = 40
875866 )
876-
877- # Decode the full generated text
878867 full_generated_text = tokenizer .decode (generated_tokens , skip_special_tokens = False )
868+ print(f"PROMPT number {counter}: {half_sample}; RESPONSE: {full_generated_text}")
869+ counter += 1
870+
871+
872+
873+
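
The new smoke test above prompts the generator with the first half of each raw sample (a character-level split) and asks for up to 40 new tokens. The same pattern as a hedged, reusable helper; tokenizer and generator are the objects built earlier in the script, and generate() is assumed to return the prompt tokens plus the newly generated ones, as the wrapper above does:

import numpy as np

def continue_from_half(sample, tokenizer, generator, max_new_tokens=40):
    # Use the first half of the raw text as the prompt.
    half_len = int(np.ceil(len(sample) / 2))
    half_sample = sample[:half_len]
    prompt_ids = tokenizer(half_sample)["input_ids"]
    generated = generator.generate(token_ids=prompt_ids, do_sample=False,
                                   max_new_tokens=max_new_tokens)
    return half_sample, tokenizer.decode(generated, skip_special_tokens=False)

# for i, sample in enumerate(non_instruct_samples):
#     prompt, completion = continue_from_half(sample, tokenizer, generator)
#     print(f"PROMPT number {i}: {prompt}; RESPONSE: {completion}")
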
874+ # # Process ALL original samples from data - REAL WORLD USAGE
875+ # generated_texts = []
876+ # for i, original_text in enumerate(data[:5]): # Process first 5 samples
877+ # print(f"\nProcessing sample {i+1}...")
878+
879+ # # Extract prompt part (everything up to and including </prompt>)
880+ # if '</prompt>' in original_text:
881+ # prompt_part = original_text.split('</prompt>')[0] + '</prompt>'
882+ # else:
883+ # prompt_part = original_text
884+
885+ # # Tokenize the prompt part
886+ # tokenized = tokenizer(
887+ # prompt_part,
888+ # add_special_tokens=False, # We handle special tokens manually
889+ # return_tensors=None # Return lists, not tensors
890+ # )
891+ # prompt_tokens = tokenized['input_ids']
892+
893+ # print(f"Original prompt: {prompt_part[:100]}...")
894+ # print(f"Tokenized prompt length: {len(prompt_tokens)} tokens")
895+
896+ # # Generate tokens using the wrapper class - REAL WORLD USAGE
897+ # generated_tokens = generator.generate(
898+ # token_ids=prompt_tokens,
899+ # do_sample=False,
900+ # max_new_tokens=100
901+ # )
902+
903+ # # Decode the full generated text
904+ # full_generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=False)
879905
880- # Extract just the newly generated part (after the prompt)
881- generated_part = full_generated_text [len (prompt_part ):]
906+ # # Extract just the newly generated part (after the prompt)
907+ # generated_part = full_generated_text[len(prompt_part):]
882908
883- generated_texts .append ((prompt_part , generated_part ))
909+ # generated_texts.append((prompt_part, generated_part))
884910
885- print (f"Generated response: { generated_part } ..." )
911+ # print(f"Generated response: {generated_part}...")
886912
887- # Display results with proper formatting
888- print ("\n " + "=" * 50 )
889- print ("FINAL GENERATED RESULTS" )
890- print ("=" * 50 )
913+ # # Display results with proper formatting
914+ # print("\n" + "="*50)
915+ # print("FINAL GENERATED RESULTS")
916+ # print("="*50)
891917
892- for idx , (original_prompt , generated_response ) in enumerate (generated_texts ):
893- print (f"\n Sample { idx + 1 } :" )
894- print (f"Prompt:{ original_prompt } " )
895- print (f"Response: { generated_response } " )
918+ # for idx, (original_prompt, generated_response) in enumerate(generated_texts):
919+ # print(f"\nSample {idx+1}:")
920+ # print(f"Prompt:{original_prompt}")
921+ # print(f"Response: {generated_response}")
896922
897923# Save the model
898924model_save_path = f"{ TIME } _cerebros-autoregressive-text-generator.keras"
@@ -909,53 +935,68 @@ def call(self, inputs):
909935reconstituted_generator = tf .keras .models .load_model (model_save_path )
910936print ("Model reconstituted successfully!" )
911937
912- # Test with all original data samples - REAL WORLD DEMO (reconstituted)
913- print ("\n " + "=" * 50 )
914- print ("GENERATED TEXT SAMPLES FROM ALL DATA - REAL WORLD USAGE (reconstituted)" )
915- print ("=" * 50 )
938+ ##### here <--------<<<<<<
916939
917- generated_texts_all = []
918- for i , text in enumerate (data ):
919- # Extract prompt part (everything up to and including </prompt>)
920- if '</prompt>' in text :
921- prompt_text = text .split ('</prompt>' )[0 ] + '</prompt>'
922- else :
923- prompt_text = text
924-
925- # Tokenize the prompt part for model input
926- tokenized = tokenizer (
927- prompt_text ,
928- max_length = MAX_SEQ_LENGTH ,
929- padding = 'max_length' ,
930- truncation = True ,
931- add_special_tokens = False
932- )
933- token_ids = tokenized ['input_ids' ]
934-
935- # Generate using the reconstituted model
936- generated_token_ids = reconstituted_generator .generate (
937- token_ids = token_ids ,
940+ counter = 0
941+ for sample in non_instruct_samples :
942+ half_sample_len = int(np.ceil(len(sample) / 2))
943+ half_sample = sample[:half_sample_len]
944+ half_sample_tokenized = tokenizer(half_sample)['input_ids']
945+ generated_tokens = reconstituted_generator .generate (
946+ token_ids = half_sample_tokenized ,
938947 do_sample = False ,
939- max_new_tokens = 100
948+ max_new_tokens = 40
940949 )
950+ full_generated_text = tokenizer .decode (generated_tokens , skip_special_tokens = False )
951+ print(f"PROMPT number {counter}: {half_sample}; RESPONSE: {full_generated_text}")
952+ counter += 1
953+ # # Test with all original data samples - REAL WORLD DEMO (reconstituted)
954+ # print("\n" + "="*50)
955+ # print("GENERATED TEXT SAMPLES FROM ALL DATA - REAL WORLD USAGE (reconstituted)")
956+ # print("="*50)
957+
958+ # generated_texts_all = []
959+ # for i, text in enumerate(data):
960+ # # Extract prompt part (everything up to and including </prompt>)
961+ # if '</prompt>' in text:
962+ # prompt_text = text.split('</prompt>')[0] + '</prompt>'
963+ # else:
964+ # prompt_text = text
965+
966+ # # Tokenize the prompt part for model input
967+ # tokenized = tokenizer(
968+ # prompt_text,
969+ # max_length=MAX_SEQ_LENGTH,
970+ # padding='max_length',
971+ # truncation=True,
972+ # add_special_tokens=False
973+ # )
974+ # token_ids = tokenized['input_ids']
975+
976+ # # Generate using the reconstituted model
977+ # generated_token_ids = reconstituted_generator.generate(
978+ # token_ids=token_ids,
979+ # do_sample=False,
980+ # max_new_tokens=100
981+ # )
941982
942- # Decode generated text
943- generated_text = tokenizer .decode (generated_token_ids , skip_special_tokens = False )
944- generated_texts_all .append (generated_text )
983+ # # Decode generated text
984+ # generated_text = tokenizer.decode(generated_token_ids, skip_special_tokens=False)
985+ # generated_texts_all.append(generated_text)
945986
946987
947- print (f"\n Sample { i + 1 } :" )
948- print (f"Prompt: { prompt_text } " )
949- print (f"Generated: { generated_text } " )
950- # [len(prompt_text):][:200]}...")
988+ # print(f"\nSample {i+1}:")
989+ # print(f"Prompt: {prompt_text}")
990+ # print(f"Generated: {generated_text}")
991+ # # [len(prompt_text):][:200]}...")
951992
952993print ("\n All samples processed with reconstituted model!" )
953994
954995
955- # Test with all original data samples
956- print ("\n " + "=" * 50 )
957- print ("GENERATED TEXT SAMPLES FROM ALL DATA" )
958- print ("=" * 50 )
996+ # # Test with all original data samples
997+ # print("\n" + "="*50)
998+ # print("GENERATED TEXT SAMPLES FROM ALL DATA")
999+ # print("="*50)
9591000
9601001# generated_texts_all = []
9611002# for i, text in enumerate(data[:3]): # Process first 3 for demo