diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py
index 3841e263e..1297f6f07 100644
--- a/megatron/data/dataset_utils.py
+++ b/megatron/data/dataset_utils.py
@@ -261,10 +261,16 @@ def create_masked_lm_predictions(tokens,
             continue
         # Note(mingdachen):
         # Skip current piece if they are covered in lm masking or previous ngrams.
+        is_covered = False
         for index_set in cand_index_set[0]:
             for index in index_set:
                 if index in covered_indexes:
-                    continue
+                    is_covered = True
+                    break
+            if is_covered:
+                break
+        if is_covered:
+            continue
 
         if not geometric_dist:
             n = np_rng.choice(ngrams[:len(cand_index_set)],
diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py
index 09304b1dd..b14ac3df4 100644
--- a/megatron/tokenizer/tokenizer.py
+++ b/megatron/tokenizer/tokenizer.py
@@ -296,7 +296,7 @@ def __init__(self, vocab_file, merge_file):
 
     @property
     def vocab_size(self):
-        return len(self.tokenizer.encoder)
+        return len(self.tokenizer)
 
     @property
     def vocab(self):
diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py
index 68dd649fd..d733ec24b 100644
--- a/tasks/eval_harness/evaluate.py
+++ b/tasks/eval_harness/evaluate.py
@@ -133,7 +133,7 @@ def _collate(x):
             for _, context_enc, continuation_enc in chunk:
                 # when too long to fit in context, truncate from the left
                 inp = torch.tensor(
-                    (context_enc + continuation_enc)[-(self.max_length + 1):][:-1]
+                    (context_enc + continuation_enc)[-(self.max_length + 1):]
                     , dtype=torch.long).to(self.device)
 
                 inplen, = inp.shape
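
Note on the dataset_utils.py hunk: the old inner `continue` only advanced the
innermost loop, so a candidate n-gram overlapping `covered_indexes` was never
actually rejected. The fix hoists the test into an `is_covered` flag and breaks
out of both loops before skipping the candidate. A minimal standalone sketch of
the flag pattern (the helper name `candidate_is_covered` and the data are made
up for illustration):

    def candidate_is_covered(index_sets, covered_indexes):
        # Flag-based escape from nested loops: Python has no labeled break.
        is_covered = False
        for index_set in index_sets:
            for index in index_set:
                if index in covered_indexes:
                    is_covered = True
                    break          # exits the inner loop only...
            if is_covered:
                break              # ...so the outer loop re-checks the flag
        return is_covered

    covered = {3, 7}
    print(candidate_is_covered([[1, 2], [3, 4]], covered))  # True: 3 is covered
    print(candidate_is_covered([[1, 2], [5, 6]], covered))  # False: no overlap

Equivalently, `any(index in covered_indexes for index_set in index_sets for
index in index_set)` expresses the same test in one line; the patch keeps the
loop form to stay close to the original code.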
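
Note on the tokenizer.py hunk: `len(self.tokenizer.encoder)` counts only the
base BPE vocabulary, while `len(self.tokenizer)` also counts registered special
tokens, so `vocab_size` previously under-reported the vocabulary whenever
special tokens were in use. A sketch of the discrepancy using Hugging Face's
GPT2Tokenizer as a stand-in (the vendored Megatron tokenizer defines `__len__`
similarly, but its details may differ):

    from transformers import GPT2Tokenizer

    tok = GPT2Tokenizer.from_pretrained("gpt2")
    print(len(tok.encoder))   # 50257: base BPE vocabulary only
    print(len(tok))           # 50257: no extra tokens registered yet

    tok.add_special_tokens({"additional_special_tokens": ["<mask>"]})
    print(len(tok.encoder))   # still 50257: encoder ignores added tokens
    print(len(tok))           # 50258: __len__ includes the added token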
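
Note on the evaluate.py hunk: for an autoregressive LM, the logits at position
i score token i+1, which is why the upstream lm-eval-harness feeds seq[:-1] and
compares against seq[1:]. Dropping the `[:-1]` here keeps the final token in
`inp`, presumably because this adapter applies the one-token shift when it
slices the output logits rather than when it builds the input. A sketch of the
alignment that shift must preserve (illustrative tensors only; `logits` stands
in for a real model call):

    import torch

    vocab_size = 16
    seq = torch.tensor([5, 9, 2, 7])      # context + continuation tokens
    inputs, targets = seq[:-1], seq[1:]   # logits[i] is scored against seq[i + 1]

    logits = torch.randn(len(inputs), vocab_size)  # stand-in for model(inputs)
    logprobs = logits.log_softmax(-1).gather(-1, targets.unsqueeze(-1)).squeeze(-1)
    print(logprobs.shape)                 # torch.Size([3]): one score per target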