Commit e974dfd

Update phishing_email_detection_gpt2.py
Refactor generation examples....
1 parent f3049e2 commit e974dfd

1 file changed

phishing_email_detection_gpt2.py

Lines changed: 168 additions & 127 deletions
@@ -231,7 +231,7 @@ def package_non_instruct_text(text: str, desired_samples: int, max_length_tokens
     return samples
 
 # Separate into samples
-non_instruct_samples = package_non_instruct_text(text=bible, desired_samples=30, max_length_tokens=1200)
+non_instruct_samples = package_non_instruct_text(text=bible, desired_samples=30, max_length_tokens=int(np.ceil(MAX_SEQ_LENGTH * .8)))
 
 del(bible)
 collect()
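
The new cap ties the packaged sample length to the model's context window rather than a hard-coded 1200 tokens. A minimal sketch of the arithmetic, assuming MAX_SEQ_LENGTH is the context length defined earlier in the script (the value 1024 below is only for illustration):

import numpy as np

MAX_SEQ_LENGTH = 1024                                    # illustrative value, not the script's actual setting
max_length_tokens = int(np.ceil(MAX_SEQ_LENGTH * 0.8))   # -> 820
# Roughly 20% of the context window stays free for tokens generated on top of the prompt.
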
@@ -641,9 +641,9 @@ def reset_state(self):
 
 best_model_found = cerebros_automl.get_best_model()
 best_model_found.save(MODEL_FILE_NAME)
-del(best_model_found)
-del(cerebros_automl)
-collect()
+# del(best_model_found)
+# del(cerebros_automl)
+# collect()
 
 file_size_bytes = getsize(MODEL_FILE_NAME)
 print(f"Model size on disk: {file_size_bytes / (1024*1024):.2f} MB")
@@ -659,62 +659,62 @@ def reset_state(self):
 pad_token_id = tokenizer.pad_token_id
 end_prompt_token_id = tokenizer.encode("</prompt>", add_special_tokens=False)[0]
 
-# Generate text for first 5 test samples (Working)
-generated_texts = []
-for i in range(min(5, len(x_test_packaged[0]))):
-    original_input = x_test_packaged[0][i].numpy()
+# # Generate text for first 5 test samples (Working)
+# generated_texts = []
+# for i in range(min(5, len(x_test_packaged[0]))):
+#     original_input = x_test_packaged[0][i].numpy()
 
-    # Find the end of the prompt
-    try:
-        end_prompt_index = list(original_input).index(end_prompt_token_id)
-    except ValueError:
-        end_prompt_index = 0
+#     # Find the end of the prompt
+#     try:
+#         end_prompt_index = list(original_input).index(end_prompt_token_id)
+#     except ValueError:
+#         end_prompt_index = 0
 
-    # Extract the prompt part
-    prompt_tokens = original_input[:end_prompt_index+1].tolist()
+#     # Extract the prompt part
+#     prompt_tokens = original_input[:end_prompt_index+1].tolist()
 
-    # Generate tokens sequentially
-    generated_tokens = []
-    current_input = prompt_tokens.copy()
+#     # Generate tokens sequentially
+#     generated_tokens = []
+#     current_input = prompt_tokens.copy()
 
-    # Generate up to 100 tokens or until pad token
-    for _ in range(100):
-        # Pad or truncate to MAX_SEQ_LENGTH
-        input_tensor = tf.constant([current_input + [pad_token_id] * (MAX_SEQ_LENGTH - len(current_input))], dtype=tf.int32)
+#     # Generate up to 100 tokens or until pad token
+#     for _ in range(100):
+#         # Pad or truncate to MAX_SEQ_LENGTH
+#         input_tensor = tf.constant([current_input + [pad_token_id] * (MAX_SEQ_LENGTH - len(current_input))], dtype=tf.int32)
 
-        # Get prediction
-        prediction = reconstituted_model(input_tensor)
-        next_token_id = int(tf.argmax(prediction[0], axis=-1).numpy())
+#         # Get prediction
+#         prediction = reconstituted_model(input_tensor)
+#         next_token_id = int(tf.argmax(prediction[0], axis=-1).numpy())
 
-        # Stop if pad token generated
-        if next_token_id == pad_token_id:
-            break
+#         # Stop if pad token generated
+#         if next_token_id == pad_token_id:
+#             break
 
-        generated_tokens.append(next_token_id)
-        current_input.append(next_token_id)
+#         generated_tokens.append(next_token_id)
+#         current_input.append(next_token_id)
 
-        # Stop if we exceed max length
-        if len(current_input) >= MAX_SEQ_LENGTH:
-            break
+#         # Stop if we exceed max length
+#         if len(current_input) >= MAX_SEQ_LENGTH:
+#             break
 
-    generated_texts.append((prompt_tokens, generated_tokens))
+#     generated_texts.append((prompt_tokens, generated_tokens))
 
-# Decode and print with proper formatting
-for idx, (prompt_tokens, generated_tokens) in enumerate(generated_texts):
-    # Decode prompt
-    prompt_text = tokenizer.decode(prompt_tokens, skip_special_tokens=False)
+# # Decode and print with proper formatting
+# for idx, (prompt_tokens, generated_tokens) in enumerate(generated_texts):
+#     # Decode prompt
+#     prompt_text = tokenizer.decode(prompt_tokens, skip_special_tokens=False)
 
-    # Extract original prompt content
-    if '<prompt>' in prompt_text and '</prompt>' in prompt_text:
-        original_prompt = prompt_text.split('<prompt>')[-1].split('</prompt>')[0]
-    else:
-        original_prompt = prompt_text[:50] + "..." if len(prompt_text) > 50 else prompt_text
+#     # Extract original prompt content
+#     if '<prompt>' in prompt_text and '</prompt>' in prompt_text:
+#         original_prompt = prompt_text.split('<prompt>')[-1].split('</prompt>')[0]
+#     else:
+#         original_prompt = prompt_text[:50] + "..." if len(prompt_text) > 50 else prompt_text
 
-    # Decode generated text
-    generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=False) if generated_tokens else ""
+#     # Decode generated text
+#     generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=False) if generated_tokens else ""
 
-    print(f"\nGenerated text from sample {idx+1}:")
-    print(f"<prompt>{original_prompt}</prompt>{generated_text}")
+#     print(f"\nGenerated text from sample {idx+1}:")
+#     print(f"<prompt>{original_prompt}</prompt>{generated_text}")
 
 
 
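
The loop being commented out above is plain greedy decoding: pad the running token list to the fixed input width, take the argmax of the model's next-token distribution, and stop on the pad token or the length limit. A minimal standalone sketch of that idea, assuming a model that maps a (1, MAX_SEQ_LENGTH) tensor of token ids to a next-token distribution (the helper name is hypothetical):

import tensorflow as tf

def greedy_decode(model, prompt_tokens, pad_token_id, max_seq_length, max_new_tokens=100):
    # Mirrors the loop shown above; illustration only.
    current = list(prompt_tokens)
    generated = []
    for _ in range(max_new_tokens):
        padded = current + [pad_token_id] * (max_seq_length - len(current))
        probs = model(tf.constant([padded], dtype=tf.int32))
        next_id = int(tf.argmax(probs[0], axis=-1).numpy())
        if next_id == pad_token_id:
            break                      # model signalled end of text
        generated.append(next_id)
        current.append(next_id)
        if len(current) >= max_seq_length:
            break                      # no room left in the context window
    return generated
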
@@ -747,7 +747,7 @@ def __init__(self, config, **kwargs):
         self.max_sequence_length = config.max_sequence_length
         self.padding_token = config.padding_token
         # Make self.model = the reconstituted model (constant)
-        self.model = reconstituted_model
+        self.model = best_model_found # reconstituted_model
 
     def get_config(self):
         return {
@@ -786,6 +786,7 @@ def generate(self, token_ids, do_sample=False, max_new_tokens=None):
         current_tokens = token_ids.copy()
 
         # Autoregressive generation loop
+        temp_gen_count = 0 # <--------<< Debug code to remove later
         for _ in range(max_new_tokens):
             # Pad or truncate to max_sequence_length (CORRECTED PADDING LOGIC)
             if len(current_tokens) > self.max_sequence_length:
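
The pad-or-truncate step referenced in the comment above is the usual fixed-width input preparation. A minimal sketch, assuming the max_sequence_length and padding_token values from the config; keeping the most recent tokens on truncation is one reasonable choice, not necessarily the one the class uses:

def pad_or_truncate(tokens, max_sequence_length, padding_token):
    # Hypothetical helper, illustration only.
    if len(tokens) > max_sequence_length:
        return tokens[-max_sequence_length:]
    return tokens + [padding_token] * (max_sequence_length - len(tokens))
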
@@ -802,12 +803,18 @@ def generate(self, token_ids, do_sample=False, max_new_tokens=None):
             # Get next token based on sampling strategy
             if do_sample:
                 # Sample from the distribution
-                probabilities = tf.nn.softmax(logits[0], axis=-1)
-                next_token_id = tf.random.categorical(tf.math.log(probabilities)[None, :], 1)[0, 0].numpy()
+                # probabilities = tf.nn.softmax(logits[0], axis=-1) # Model already applies softmax
+                next_token_id = tf.random.categorical(tf.math.log(logits[0])[None, :], 1)[0, 0].numpy()
             else:
                 # Greedy sampling (argmax)
                 next_token_id = int(tf.argmax(logits[0], axis=-1).numpy())
-
+            # Debug code to remove later
+            print(f"Generating {temp_gen_count}")
+            print(f"... next_token_id: {next_token_id}")
+            next_word = tokenizer.decode(next_token_id)
+            print(f"Next decoded word: {next_word}")
+            temp_gen_count += 1
+
             # Check for termination condition
             if next_token_id == self.padding_token:
                 break
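
The sampling change above removes a second softmax: if the model's output layer already returns probabilities, re-normalizing them before tf.random.categorical flattens the distribution, since categorical expects log-probabilities (or unnormalized logits). A minimal sketch of the difference, assuming a probability vector that already sums to 1:

import tensorflow as tf

probs = tf.constant([0.7, 0.2, 0.1])   # assumed softmax output from the model
# Pass log-probabilities straight to categorical:
next_id = tf.random.categorical(tf.math.log(probs)[None, :], 1)[0, 0]
# Applying softmax again would sample from roughly [0.46, 0.28, 0.25] instead of [0.7, 0.2, 0.1]:
double_softmax = tf.nn.softmax(probs)
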
@@ -845,54 +852,73 @@ def call(self, inputs):
 )
 generator = CerebrosAutoregressiveTextGenerator(config)
 
-# Process ALL original samples from data - REAL WORLD USAGE
-generated_texts = []
-for i, original_text in enumerate(data[:5]): # Process first 5 samples
-    print(f"\nProcessing sample {i+1}...")
-
-    # Extract prompt part (everything up to and including </prompt>)
-    if '</prompt>' in original_text:
-        prompt_part = original_text.split('</prompt>')[0] + '</prompt>'
-    else:
-        prompt_part = original_text
-
-    # Tokenize the prompt part
-    tokenized = tokenizer(
-        prompt_part,
-        add_special_tokens=False, # We handle special tokens manually
-        return_tensors=None # Return lists, not tensors
-    )
-    prompt_tokens = tokenized['input_ids']
-
-    print(f"Original prompt: {prompt_part[:100]}...")
-    print(f"Tokenized prompt length: {len(prompt_tokens)} tokens")
-
-    # Generate tokens using the wrapper class - REAL WORLD USAGE
+print("########### BEFORE SERIALIZING THE GENERATIVE MODEL")
+
+counter = 0
+for sample in non_instruct_samples:
+    half_sample_len = int(np.ceil(len(sample) / 2))
+    half_sample = sample[:half_sample_len]
+    half_sample_tokenized = tokenizer(half_sample)['input_ids']
     generated_tokens = generator.generate(
-        token_ids=prompt_tokens,
+        token_ids=half_sample_tokenized,
         do_sample=False,
-        max_new_tokens=100
+        max_new_tokens=40
     )
-
-    # Decode the full generated text
     full_generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=False)
+    print(f"PROMPT number {counter}: {half_sample}; RESPONSE: {full_generated_text}")
+    counter += 1
+
+
+
+
+# # Process ALL original samples from data - REAL WORLD USAGE
+# generated_texts = []
+# for i, original_text in enumerate(data[:5]): # Process first 5 samples
+#     print(f"\nProcessing sample {i+1}...")
+
+#     # Extract prompt part (everything up to and including </prompt>)
+#     if '</prompt>' in original_text:
+#         prompt_part = original_text.split('</prompt>')[0] + '</prompt>'
+#     else:
+#         prompt_part = original_text
+
+#     # Tokenize the prompt part
+#     tokenized = tokenizer(
+#         prompt_part,
+#         add_special_tokens=False, # We handle special tokens manually
+#         return_tensors=None # Return lists, not tensors
+#     )
+#     prompt_tokens = tokenized['input_ids']
+
+#     print(f"Original prompt: {prompt_part[:100]}...")
+#     print(f"Tokenized prompt length: {len(prompt_tokens)} tokens")
+
+#     # Generate tokens using the wrapper class - REAL WORLD USAGE
+#     generated_tokens = generator.generate(
+#         token_ids=prompt_tokens,
+#         do_sample=False,
+#         max_new_tokens=100
+#     )
+
+#     # Decode the full generated text
+#     full_generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=False)
 
-    # Extract just the newly generated part (after the prompt)
-    generated_part = full_generated_text[len(prompt_part):]
+#     # Extract just the newly generated part (after the prompt)
+#     generated_part = full_generated_text[len(prompt_part):]
 
-    generated_texts.append((prompt_part, generated_part))
+#     generated_texts.append((prompt_part, generated_part))
 
-    print(f"Generated response: {generated_part}...")
+#     print(f"Generated response: {generated_part}...")
 
-# Display results with proper formatting
-print("\n" + "="*50)
-print("FINAL GENERATED RESULTS")
-print("="*50)
+# # Display results with proper formatting
+# print("\n" + "="*50)
+# print("FINAL GENERATED RESULTS")
+# print("="*50)
 
-for idx, (original_prompt, generated_response) in enumerate(generated_texts):
-    print(f"\nSample {idx+1}:")
-    print(f"Prompt:{original_prompt}")
-    print(f"Response: {generated_response}")
+# for idx, (original_prompt, generated_response) in enumerate(generated_texts):
+#     print(f"\nSample {idx+1}:")
+#     print(f"Prompt:{original_prompt}")
+#     print(f"Response: {generated_response}")
 
 # Save the model
 model_save_path = f"{TIME}_cerebros-autoregressive-text-generator.keras"
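
Both new demo loops build their prompts the same way: take the first half of a raw text sample, tokenize it, and greedily generate a short continuation. A minimal sketch of that prompt construction pulled into a helper (the function name is hypothetical, not part of the script), assuming tokenizer is the Hugging Face tokenizer used throughout:

import numpy as np

def make_half_prompt(sample: str, tokenizer):
    # Illustration only; mirrors the inline logic in the loops above.
    half_len = int(np.ceil(len(sample) / 2))      # character count, not token count
    half_sample = sample[:half_len]
    token_ids = tokenizer(half_sample)['input_ids']
    return half_sample, token_ids

The returned token_ids can then be passed straight to generator.generate(token_ids=token_ids, do_sample=False, max_new_tokens=40).
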
@@ -909,53 +935,68 @@ def call(self, inputs):
 reconstituted_generator = tf.keras.models.load_model(model_save_path)
 print("Model reconstituted successfully!")
 
-# Test with all original data samples - REAL WORLD DEMO (reconstituted)
-print("\n" + "="*50)
-print("GENERATED TEXT SAMPLES FROM ALL DATA - REAL WORLD USAGE (reconstituted)")
-print("="*50)
+##### here <--------<<<<<<
 
-generated_texts_all = []
-for i, text in enumerate(data):
-    # Extract prompt part (everything up to and including </prompt>)
-    if '</prompt>' in text:
-        prompt_text = text.split('</prompt>')[0] + '</prompt>'
-    else:
-        prompt_text = text
-
-    # Tokenize the prompt part for model input
-    tokenized = tokenizer(
-        prompt_text,
-        max_length=MAX_SEQ_LENGTH,
-        padding='max_length',
-        truncation=True,
-        add_special_tokens=False
-    )
-    token_ids = tokenized['input_ids']
-
-    # Generate using the reconstituted model
-    generated_token_ids = reconstituted_generator.generate(
-        token_ids=token_ids,
+counter = 0
+for sample in non_instruct_samples:
+    half_sample_len = int(np.ceil(len(sample) / 2))
+    half_sample = sample[:half_sample_len]
+    half_sample_tokenized = tokenizer(half_sample)['input_ids']
+    generated_tokens = reconstituted_generator.generate(
+        token_ids=half_sample_tokenized,
         do_sample=False,
-        max_new_tokens=100
+        max_new_tokens=40
     )
+    full_generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=False)
+    print(f"PROMPT number {counter}: {half_sample}; RESPONSE: {full_generated_text}")
+    counter += 1
+# # Test with all original data samples - REAL WORLD DEMO (reconstituted)
+# print("\n" + "="*50)
+# print("GENERATED TEXT SAMPLES FROM ALL DATA - REAL WORLD USAGE (reconstituted)")
+# print("="*50)
+
+# generated_texts_all = []
+# for i, text in enumerate(data):
+#     # Extract prompt part (everything up to and including </prompt>)
+#     if '</prompt>' in text:
+#         prompt_text = text.split('</prompt>')[0] + '</prompt>'
+#     else:
+#         prompt_text = text
+
+#     # Tokenize the prompt part for model input
+#     tokenized = tokenizer(
+#         prompt_text,
+#         max_length=MAX_SEQ_LENGTH,
+#         padding='max_length',
+#         truncation=True,
+#         add_special_tokens=False
+#     )
+#     token_ids = tokenized['input_ids']
+
+#     # Generate using the reconstituted model
+#     generated_token_ids = reconstituted_generator.generate(
+#         token_ids=token_ids,
+#         do_sample=False,
+#         max_new_tokens=100
+#     )
 
-    # Decode generated text
-    generated_text = tokenizer.decode(generated_token_ids, skip_special_tokens=False)
-    generated_texts_all.append(generated_text)
+#     # Decode generated text
+#     generated_text = tokenizer.decode(generated_token_ids, skip_special_tokens=False)
+#     generated_texts_all.append(generated_text)
 
 
-    print(f"\nSample {i+1}:")
-    print(f"Prompt: {prompt_text}")
-    print(f"Generated: {generated_text}")
-    # [len(prompt_text):][:200]}...")
+#     print(f"\nSample {i+1}:")
+#     print(f"Prompt: {prompt_text}")
+#     print(f"Generated: {generated_text}")
+#     # [len(prompt_text):][:200]}...")
 
 print("\nAll samples processed with reconstituted model!")
 
 
-# Test with all original data samples
-print("\n" + "="*50)
-print("GENERATED TEXT SAMPLES FROM ALL DATA")
-print("="*50)
+# # Test with all original data samples
+# print("\n" + "="*50)
+# print("GENERATED TEXT SAMPLES FROM ALL DATA")
+# print("="*50)
 
 # generated_texts_all = []
 # for i, text in enumerate(data[:3]): # Process first 3 for demo