Skip to content

Commit 3bc1800

Browse files
Update phishing_email_detection_gpt2.py
Debug generation examples...
1 parent 57edcc3 commit 3bc1800

File tree

1 file changed

+20
-36
lines changed

1 file changed

+20
-36
lines changed

phishing_email_detection_gpt2.py

Lines changed: 20 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -859,32 +859,24 @@ def call(self, inputs):
859859
half_sample_len = int(np.ceil(len(sample) / 2))
860860
half_sample = sample[:half_sample_len]
861861

862-
# Tokenize the text
862+
# Tokenize the text without padding first to get actual tokens
863863
half_sample_tokenized = tokenizer(
864864
half_sample,
865-
max_length=MAX_SEQ_LENGTH,
866-
padding='max_length',
867-
truncation=True,
868865
add_special_tokens=False
869866
)['input_ids']
870867

871-
# # Extract token IDs as a list of integers (not tensors)
872-
# if isinstance(half_sample_tokenized, dict):
873-
# # If tokenizer returns a dict, extract the token IDs
874-
# token_ids = half_sample_tokenized['input_ids'] # or 'token_ids' depending on your tokenizer
875-
# else:
876-
# # If tokenizer returns a list directly
877-
# token_ids = half_sample_tokenized
868+
# Convert to Python list of integers
869+
if hasattr(half_sample_tokenized, 'numpy'):
870+
token_ids = half_sample_tokenized.numpy().tolist()
871+
else:
872+
token_ids = [int(token_id) for token_id in half_sample_tokenized]
878873

879-
# # Convert to Python list of integers if it's a tensor
880-
# if hasattr(token_ids, 'numpy'):
881-
# token_ids = token_ids.numpy().tolist()
882-
# if not isinstance(token_ids, list):
883-
# token_ids = list(token_ids)
874+
print(f"Actual token count: {len(token_ids)}")
875+
print(f"First 10 tokens: {token_ids[:10]}")
884876

885877
# Now pass the list of integers to your generate method
886878
generated_tokens = generator.generate(
887-
token_ids=half_sample_tokenized, # This should now be a list of integers
879+
token_ids=token_ids, # Just the actual tokens, no padding
888880
do_sample=False,
889881
max_new_tokens=40
890882
)
@@ -962,39 +954,30 @@ def call(self, inputs):
962954
reconstituted_generator = tf.keras.models.load_model(model_save_path)
963955
print("Model reconstituted successfully!")
964956

965-
##### here <--------<<<<<<
966957

967958
counter = 0
968959
for sample in non_instruct_samples:
969960
half_sample_len = int(np.ceil(len(sample) / 2))
970961
half_sample = sample[:half_sample_len]
971962

972-
# Tokenize the text
963+
# Tokenize the text without padding first to get actual tokens
973964
half_sample_tokenized = tokenizer(
974965
half_sample,
975-
max_length=MAX_SEQ_LENGTH,
976-
padding='max_length',
977-
truncation=True,
978966
add_special_tokens=False
979967
)['input_ids']
980968

981-
# # Extract token IDs as a list of integers (not tensors)
982-
# if isinstance(half_sample_tokenized, dict):
983-
# # If tokenizer returns a dict, extract the token IDs
984-
# token_ids = half_sample_tokenized['input_ids'] # or 'token_ids' depending on your tokenizer
985-
# else:
986-
# # If tokenizer returns a list directly
987-
# token_ids = half_sample_tokenized
969+
# Convert to Python list of integers
970+
if hasattr(half_sample_tokenized, 'numpy'):
971+
token_ids = half_sample_tokenized.numpy().tolist()
972+
else:
973+
token_ids = [int(token_id) for token_id in half_sample_tokenized]
988974

989-
# # Convert to Python list of integers if it's a tensor
990-
# if hasattr(token_ids, 'numpy'):
991-
# token_ids = token_ids.numpy().tolist()
992-
# if not isinstance(token_ids, list):
993-
# token_ids = list(token_ids)
975+
print(f"Actual token count: {len(token_ids)}")
976+
print(f"First 10 tokens: {token_ids[:10]}")
994977

995978
# Now pass the list of integers to your generate method
996-
generated_tokens = reconstituted_generator.generate(
997-
token_ids=half_sample_tokenized, # This should now be a list of integers
979+
generated_tokens = reconstituted_generator.generate(
980+
token_ids=token_ids, # Just the actual tokens, no padding
998981
do_sample=False,
999982
max_new_tokens=40
1000983
)
@@ -1004,6 +987,7 @@ def call(self, inputs):
1004987
print(f"PROMPT number {counter}: {half_sample}; RESPONSE: {full_generated_text}")
1005988
counter += 1
1006989

990+
1007991
# # Test with all original data samples - REAL WORLD DEMO (reconstituted)
1008992
# print("\n" + "="*50)
1009993
# print("GENERATED TEXT SAMPLES FROM ALL DATA - REAL WORLD USAGE (reconstituted)")

0 commit comments

Comments (0)