Skip to content

Commit 295b252

Browse files
Update phishing_email_detection_gpt2.py
Fix generation example: pass tokenized prompts to generate() as plain lists of integer token IDs.
1 parent 495c0ae commit 295b252

File tree

1 file changed

+46
-8
lines changed

1 file changed

+46
-8
lines changed

phishing_email_detection_gpt2.py

Lines changed: 46 additions & 8 deletions
Original file line number · Diff line number · Diff line change
@@ -856,18 +856,37 @@ def call(self, inputs):
856856

857857
counter = 0
858858
for sample in non_instruct_samples:
859-
half_sample_len = int(np.ceil(len(sample)))
859+
half_sample_len = int(np.ceil(len(sample) / 2))
860860
half_sample = sample[:half_sample_len]
861+
862+
# Tokenize the text
861863
half_sample_tokenized = tokenizer(half_sample)
864+
865+
# Extract token IDs as a list of integers (not tensors)
866+
if isinstance(half_sample_tokenized, dict):
867+
# If tokenizer returns a dict, extract the token IDs
868+
token_ids = half_sample_tokenized['input_ids'] # or 'token_ids' depending on your tokenizer
869+
else:
870+
# If tokenizer returns a list directly
871+
token_ids = half_sample_tokenized
872+
873+
# Convert to Python list of integers if it's a tensor
874+
if hasattr(token_ids, 'numpy'):
875+
token_ids = token_ids.numpy().tolist()
876+
if not isinstance(token_ids, list):
877+
token_ids = list(token_ids)
878+
879+
# Now pass the list of integers to your generate method
862880
generated_tokens = generator.generate(
863-
token_ids=half_sample_tokenized,
881+
token_ids=token_ids, # This should now be a list of integers
864882
do_sample=False,
865883
max_new_tokens=40
866884
)
885+
886+
# Decode the result
867887
full_generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=False)
868888
print(f"PROMPT number {counter}: {half_sample}; RESPONSE: {full_generated_text}")
869-
870-
889+
counter += 1
871890

872891

873892

@@ -935,20 +954,39 @@ def call(self, inputs):
935954
reconstituted_generator = tf.keras.models.load_model(model_save_path)
936955
print("Model reconstituted successfully!")
937956

938-
##### here <--------<<<<<<
939-
940957
counter = 0
941958
for sample in non_instruct_samples:
942-
half_sample_len = int(np.ceil(len(sample)))
959+
half_sample_len = int(np.ceil(len(sample) / 2))
943960
half_sample = sample[:half_sample_len]
961+
962+
# Tokenize the text
944963
half_sample_tokenized = tokenizer(half_sample)
964+
965+
# Extract token IDs as a list of integers (not tensors)
966+
if isinstance(half_sample_tokenized, dict):
967+
# If tokenizer returns a dict, extract the token IDs
968+
token_ids = half_sample_tokenized['input_ids'] # or 'token_ids' depending on your tokenizer
969+
else:
970+
# If tokenizer returns a list directly
971+
token_ids = half_sample_tokenized
972+
973+
# Convert to Python list of integers if it's a tensor
974+
if hasattr(token_ids, 'numpy'):
975+
token_ids = token_ids.numpy().tolist()
976+
if not isinstance(token_ids, list):
977+
token_ids = list(token_ids)
978+
979+
# Now pass the list of integers to your generate method
945980
generated_tokens = reconstituted_generator.generate(
946-
token_ids=half_sample_tokenized,
981+
token_ids=token_ids, # This should now be a list of integers
947982
do_sample=False,
948983
max_new_tokens=40
949984
)
985+
986+
# Decode the result
950987
full_generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=False)
951988
print(f"PROMPT number {counter}: {half_sample}; RESPONSE: {full_generated_text}")
989+
counter += 1
952990

953991
# # Test with all original data samples - REAL WORLD DEMO (reconstituted)
954992
# print("\n" + "="*50)

0 commit comments

Comments (0)