Skip to content

Commit d11b947

Browse files
Update phishing_email_detection_gpt2.py
Debug generation ...
1 parent 66bf4c6 commit d11b947

File tree

1 file changed

+46
-4
lines changed

1 file changed

+46
-4
lines changed

phishing_email_detection_gpt2.py

Lines changed: 46 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -856,16 +856,37 @@ def call(self, inputs):
856856

857857
counter = 0
858858
for sample in non_instruct_samples:
859-
half_sample_len = int(np.ceil(len(sample)))
859+
half_sample_len = int(np.ceil(len(sample) / 2))
860860
half_sample = sample[:half_sample_len]
861+
862+
# Tokenize the text
861863
half_sample_tokenized = tokenizer(half_sample)
864+
865+
# Extract token IDs as a list of integers (not tensors)
866+
if isinstance(half_sample_tokenized, dict):
867+
# If tokenizer returns a dict, extract the token IDs
868+
token_ids = half_sample_tokenized['input_ids'] # or 'token_ids' depending on your tokenizer
869+
else:
870+
# If tokenizer returns a list directly
871+
token_ids = half_sample_tokenized
872+
873+
# Convert to Python list of integers if it's a tensor
874+
if hasattr(token_ids, 'numpy'):
875+
token_ids = token_ids.numpy().tolist()
876+
if not isinstance(token_ids, list):
877+
token_ids = list(token_ids)
878+
879+
# Now pass the list of integers to your generate method
862880
generated_tokens = generator.generate(
863-
token_ids=half_sample_tokenized,
881+
token_ids=token_ids, # This should now be a list of integers
864882
do_sample=False,
865883
max_new_tokens=40
866884
)
885+
886+
# Decode the result
867887
full_generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=False)
868888
print(f"PROMPT number {counter}: {half_sample}; RESPONSE: {full_generated_text}")
889+
counter += 1
869890

870891

871892

@@ -939,16 +960,37 @@ def call(self, inputs):
939960

940961
counter = 0
941962
for sample in non_instruct_samples:
942-
half_sample_len = int(np.ceil(len(sample)))
963+
half_sample_len = int(np.ceil(len(sample) / 2))
943964
half_sample = sample[:half_sample_len]
965+
966+
# Tokenize the text
944967
half_sample_tokenized = tokenizer(half_sample)
968+
969+
# Extract token IDs as a list of integers (not tensors)
970+
if isinstance(half_sample_tokenized, dict):
971+
# If tokenizer returns a dict, extract the token IDs
972+
token_ids = half_sample_tokenized['input_ids'] # or 'token_ids' depending on your tokenizer
973+
else:
974+
# If tokenizer returns a list directly
975+
token_ids = half_sample_tokenized
976+
977+
# Convert to Python list of integers if it's a tensor
978+
if hasattr(token_ids, 'numpy'):
979+
token_ids = token_ids.numpy().tolist()
980+
if not isinstance(token_ids, list):
981+
token_ids = list(token_ids)
982+
983+
# Now pass the list of integers to your generate method
945984
generated_tokens = reconstituted_generator.generate(
946-
token_ids=half_sample_tokenized,
985+
token_ids=token_ids, # This should now be a list of integers
947986
do_sample=False,
948987
max_new_tokens=40
949988
)
989+
990+
# Decode the result
950991
full_generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=False)
951992
print(f"PROMPT number {counter}: {half_sample}; RESPONSE: {full_generated_text}")
993+
counter += 1
952994

953995
# # Test with all original data samples - REAL WORLD DEMO (reconstituted)
954996
# print("\n" + "="*50)

0 commit comments

Comments (0)