Skip to content

Commit a95a646

Browse files
Update phishing_email_detection_gpt2.py
Proper tokenization: pad/truncate prompts to max_seq_length and disable special tokens.
1 parent 295b252 commit a95a646

File tree

1 file changed

+33
-21
lines changed

1 file changed

+33
-21
lines changed

phishing_email_detection_gpt2.py

Lines changed: 33 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -860,21 +860,27 @@ def call(self, inputs):
860860
half_sample = sample[:half_sample_len]
861861

862862
# Tokenize the text
863-
half_sample_tokenized = tokenizer(half_sample)
863+
half_sample_tokenized = tokenizer(
864+
half_sample,
865+
max_length=max_seq_length,
866+
padding='max_length',
867+
truncation=True,
868+
add_special_tokens=False
869+
)['input_ids']
864870

865-
# Extract token IDs as a list of integers (not tensors)
866-
if isinstance(half_sample_tokenized, dict):
867-
# If tokenizer returns a dict, extract the token IDs
868-
token_ids = half_sample_tokenized['input_ids'] # or 'token_ids' depending on your tokenizer
869-
else:
870-
# If tokenizer returns a list directly
871-
token_ids = half_sample_tokenized
871+
# # Extract token IDs as a list of integers (not tensors)
872+
# if isinstance(half_sample_tokenized, dict):
873+
# # If tokenizer returns a dict, extract the token IDs
874+
# token_ids = half_sample_tokenized['input_ids'] # or 'token_ids' depending on your tokenizer
875+
# else:
876+
# # If tokenizer returns a list directly
877+
# token_ids = half_sample_tokenized
872878

873-
# Convert to Python list of integers if it's a tensor
874-
if hasattr(token_ids, 'numpy'):
875-
token_ids = token_ids.numpy().tolist()
876-
if not isinstance(token_ids, list):
877-
token_ids = list(token_ids)
879+
# # Convert to Python list of integers if it's a tensor
880+
# if hasattr(token_ids, 'numpy'):
881+
# token_ids = token_ids.numpy().tolist()
882+
# if not isinstance(token_ids, list):
883+
# token_ids = list(token_ids)
878884

879885
# Now pass the list of integers to your generate method
880886
generated_tokens = generator.generate(
@@ -960,15 +966,21 @@ def call(self, inputs):
960966
half_sample = sample[:half_sample_len]
961967

962968
# Tokenize the text
963-
half_sample_tokenized = tokenizer(half_sample)
969+
half_sample_tokenized = tokenizer(
970+
half_sample,
971+
max_length=max_seq_length,
972+
padding='max_length',
973+
truncation=True,
974+
add_special_tokens=False
975+
)['input_ids']
964976

965-
# Extract token IDs as a list of integers (not tensors)
966-
if isinstance(half_sample_tokenized, dict):
967-
# If tokenizer returns a dict, extract the token IDs
968-
token_ids = half_sample_tokenized['input_ids'] # or 'token_ids' depending on your tokenizer
969-
else:
970-
# If tokenizer returns a list directly
971-
token_ids = half_sample_tokenized
977+
# # Extract token IDs as a list of integers (not tensors)
978+
# if isinstance(half_sample_tokenized, dict):
979+
# # If tokenizer returns a dict, extract the token IDs
980+
# token_ids = half_sample_tokenized['input_ids'] # or 'token_ids' depending on your tokenizer
981+
# else:
982+
# # If tokenizer returns a list directly
983+
# token_ids = half_sample_tokenized
972984

973985
# Convert to Python list of integers if it's a tensor
974986
if hasattr(token_ids, 'numpy'):

0 commit comments

Comments (0)