Skip to content

Commit fe7cd57

Browse files
Update phishing_email_detection_gpt2.py
Clean up the code a little.
1 parent 4903762 commit fe7cd57

File tree

1 file changed

+12
-9
lines changed

1 file changed

+12
-9
lines changed

phishing_email_detection_gpt2.py

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -807,7 +807,7 @@ def complete_text(text):
807807
tokenizer.decode(generated_tokens).replace(text, "")
808808
return generated_text
809809

810-
test_text = "I saw the sun and it was"
810+
test_text = "I saw the sun and it was as"
811811
response = complete_text(test_text)
812812

813813
print(f"I ask the generator: {test_text}... It responds:")
@@ -822,8 +822,8 @@ def complete_text(text):
822822
sample,
823823
add_special_tokens=False
824824
)['input_ids']
825-
half_index = int(np.ceil(len(sample_tokenized) * 0.5))
826-
half_sample_tokenized = sample_tokenized[:half_index]
825+
start_generate_index = int(np.ceil(len(sample_tokenized) * 0.5))
826+
half_sample_tokenized = sample_tokenized[:start_generate_index]
827827

828828
# Convert to Python list of integers
829829
if hasattr(half_sample_tokenized, 'numpy'):
@@ -868,14 +868,14 @@ def complete_text(text):
868868

869869
counter = 0
870870
for sample in non_instruct_samples:
871-
half_sample_len = int(np.ceil(len(sample) / 2))
872-
half_sample = sample[:half_sample_len]
873-
871+
874872
# Tokenize the text without padding first to get actual tokens
875-
half_sample_tokenized = tokenizer(
876-
half_sample,
873+
sample_tokenized = tokenizer(
874+
sample,
877875
add_special_tokens=False
878876
)['input_ids']
877+
start_generate_index = int(np.ceil(len(sample_tokenized) * 0.5))
878+
half_sample_tokenized = sample_tokenized[:start_generate_index]
879879

880880
# Convert to Python list of integers
881881
if hasattr(half_sample_tokenized, 'numpy'):
@@ -894,7 +894,10 @@ def complete_text(text):
894894
)
895895

896896
# Decode the result
897-
full_generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=False)
897+
half_sample = tokenizer.decode(half_sample_tokenized)
898+
full_generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=False)\
899+
.replace(half_sample, "")
900+
898901
print(f"PROMPT number {counter}: {half_sample}; RESPONSE: {full_generated_text}")
899902
counter += 1
900903

0 commit comments

Comments (0)