Skip to content

Commit c15225b

Browse files
author
Tong Li
committed
modify data loader
1 parent 070907d commit c15225b

File tree

1 file changed

+3
-6
lines changed
  • applications/ColossalChat/coati/dataset

1 file changed

+3
-6
lines changed

applications/ColossalChat/coati/dataset/loader.py

Lines changed: 3 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -387,11 +387,6 @@ def apply_chat_template_and_mask(
387387
if padding and len(tokens) < max_length:
388388
to_pad = max_length - len(tokens)
389389
# Left padding for generation.
390-
# if tokenizer.padding_side == "right":
391-
# tokens.extend([tokenizer.pad_token_id] * to_pad)
392-
# assistant_mask.extend([False] * to_pad)
393-
# attention_mask.extend([0] * to_pad)
394-
# else:
395390
tokens = [tokenizer.pad_token_id] * to_pad + tokens
396391
assistant_mask = [False] * to_pad + assistant_mask
397392
attention_mask = [0] * to_pad + attention_mask
@@ -405,7 +400,9 @@ def apply_chat_template_and_mask(
405400
labels[~torch.tensor(assistant_mask, dtype=torch.bool)] = ignore_idx
406401

407402
if gt_answer is not None:
408-
gt_answer = tokenizer.encode(gt_answer, padding="max_length", max_length=64, return_tensors="pt")
403+
gt_answer = tokenizer.encode(
404+
gt_answer, padding="max_length", truncation=True, max_length=128, return_tensors="pt"
405+
)
409406
gt_answer = gt_answer.squeeze(1)
410407
return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels, "gt_answer": gt_answer}
411408

0 commit comments

Comments
 (0)