Skip to content

Commit 0b00762

Browse files
committed
Hopefully this fixes MLM
1 parent fc05c7f commit 0b00762

File tree

1 file changed

+4
-3
lines changed

1 file changed

+4
-3
lines changed

megatron/data/mlm_dataset.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -275,13 +275,14 @@ def __init__(
275275
# according to `noise_density` and `mean_noise_span_length`. We can also define the label length accordingly.
276276
number_of_raw_tokens, inputs_length, targets_length, num_noise_spans = compute_input_and_target_lengths(
277277
# +1 is used so that we can compute the loss, as autoregressive systems require us to add one more token.
278-
sequence_length=self.sequence_length + 1,
278+
sequence_length=self.sequence_length,
279279
noise_density=self.noise_density,
280280
mean_noise_span_length=self.mean_noise_span_length
281281
)
282-
self.number_of_raw_tokens = number_of_raw_tokens
283282
self.inputs_length = inputs_length
284-
self.targets_length = targets_length
283+
# For the loss computation, we add one extra token at the end
284+
self.number_of_raw_tokens = number_of_raw_tokens + 1
285+
self.targets_length = targets_length +1
285286
self.num_noise_spans = num_noise_spans
286287

287288
# Build the samples mapping.

0 commit comments

Comments
 (0)