From ce3f6c08098094729f267791992836fcea9f1e0c Mon Sep 17 00:00:00 2001
From: janEbert
Date: Fri, 17 Feb 2023 13:00:26 +0100
Subject: [PATCH 1/3] Fix covered index skipping

---
 megatron/data/dataset_utils.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py
index 3841e263e..1297f6f07 100644
--- a/megatron/data/dataset_utils.py
+++ b/megatron/data/dataset_utils.py
@@ -261,10 +261,16 @@ def create_masked_lm_predictions(tokens,
             continue
         # Note(mingdachen):
         # Skip current piece if they are covered in lm masking or previous ngrams.
+        is_covered = False
         for index_set in cand_index_set[0]:
             for index in index_set:
                 if index in covered_indexes:
-                    continue
+                    is_covered = True
+                    break
+            if is_covered:
+                break
+        if is_covered:
+            continue
 
         if not geometric_dist:
             n = np_rng.choice(ngrams[:len(cand_index_set)],

From f7c583f315faed0362bde9bfb7c083f111364b8d Mon Sep 17 00:00:00 2001
From: janEbert
Date: Fri, 24 Feb 2023 11:58:05 +0100
Subject: [PATCH 2/3] Fix GPT tokenizer vocab size query

Did not include additional special tokens.
---
 megatron/tokenizer/tokenizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py
index 09304b1dd..b14ac3df4 100644
--- a/megatron/tokenizer/tokenizer.py
+++ b/megatron/tokenizer/tokenizer.py
@@ -296,7 +296,7 @@ def __init__(self, vocab_file, merge_file):
 
     @property
     def vocab_size(self):
-        return len(self.tokenizer.encoder)
+        return len(self.tokenizer)
 
     @property
     def vocab(self):

From cfd6374e01eccf6f19f210c541c94e8d1697362b Mon Sep 17 00:00:00 2001
From: janEbert
Date: Tue, 28 Feb 2023 14:47:13 +0100
Subject: [PATCH 3/3] Do not remove last token

This corrupts the targets. There is no good reason for this.
---
 tasks/eval_harness/evaluate.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py
index 68dd649fd..d733ec24b 100644
--- a/tasks/eval_harness/evaluate.py
+++ b/tasks/eval_harness/evaluate.py
@@ -133,7 +133,7 @@ def _collate(x):
             for _, context_enc, continuation_enc in chunk:
                 # when too long to fit in context, truncate from the left
                 inp = torch.tensor(
-                    (context_enc + continuation_enc)[-(self.max_length + 1):][:-1]
+                    (context_enc + continuation_enc)[-(self.max_length + 1):]
                     , dtype=torch.long).to(self.device)
                 inplen, = inp.shape
 
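
The intent of PATCH 1/3 can be illustrated outside the diff: in the old code, the inner "continue" only advanced the innermost loop, so a candidate n-gram overlapping already covered indexes was never actually skipped. The sketch below restates the fixed behavior with a helper function instead of the patch's is_covered flag, and uses made-up stand-in data rather than the real ngram_indexes structure from dataset_utils.py.

# Sketch of the corrected skip logic from PATCH 1/3 (stand-in data, not the
# real ngram_indexes structure built in dataset_utils.py).
covered_indexes = {3, 4}

def is_candidate_covered(cand_index_set, covered_indexes):
    """Return True if any index of the candidate n-gram is already covered."""
    for index_set in cand_index_set[0]:
        for index in index_set:
            if index in covered_indexes:
                return True
    return False

cand_index_sets = [
    [[[1, 2]]],       # no overlap with covered_indexes -> kept
    [[[3], [5, 6]]],  # overlaps covered index 3 -> skipped
]
for cand_index_set in cand_index_sets:
    if is_candidate_covered(cand_index_set, covered_indexes):
        # With the old inner `continue`, execution still fell through to the
        # masking step below; the patch makes the skip reach this outer loop.
        continue
    print("would mask candidate", cand_index_set)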
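
PATCH 2/3 relies on self.tokenizer reporting its full size via len(). With a HuggingFace transformers tokenizer, for example, the base encoder does not count tokens added later, while len(tokenizer) does, which matches the commit message about additional special tokens. The snippet below is illustrative only and assumes the transformers package; the tokenizer class wrapped in megatron/tokenizer/tokenizer.py may differ.

# Illustration: base vocab size vs. len() once special tokens are added.
# Assumes HuggingFace `transformers` is installed and can fetch "gpt2".
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
print(tokenizer.vocab_size)  # base vocabulary only: 50257
print(len(tokenizer))        # 50257 as well, nothing added yet

tokenizer.add_special_tokens(
    {"additional_special_tokens": ["<mask>", "<sep>"]})
print(tokenizer.vocab_size)  # still 50257: ignores added tokens
print(len(tokenizer))        # 50259: includes the added special tokens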
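
PATCH 3/3 keeps the full (context + continuation) window in inp. If, as the commit message implies, the downstream code derives both the model inputs and the shifted targets from this same tensor, then trimming the last token here drops the final continuation token from the targets, so it is never scored. The following is a toy illustration of that shift logic under that assumption, not the actual code path in tasks/eval_harness/evaluate.py.

# Toy illustration: pre-trimming the last token loses the final target when
# inputs and labels are both derived from the same token sequence.
import torch

context_enc = [11, 12, 13]
continuation_enc = [21, 22]
max_length = 8

full = torch.tensor((context_enc + continuation_enc)[-(max_length + 1):])

def split_inputs_targets(tokens):
    # Typical causal-LM scoring split: predict tokens[1:] from tokens[:-1].
    return tokens[:-1], tokens[1:]

_, targets_full = split_inputs_targets(full)
_, targets_trimmed = split_inputs_targets(full[:-1])  # removed `[:-1]` applied first

print(targets_full.tolist())     # [12, 13, 21, 22] -> last continuation token kept
print(targets_trimmed.tolist())  # [12, 13, 21]     -> final continuation token lost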