Skip to content

Commit bda8fe9

Browse files
committed
do not use sample packing for running predictions
1 parent b4366cd commit bda8fe9

File tree

2 files changed

+9
-1
lines changed

2 files changed

+9
-1
lines changed

src/cehrbert/data_generators/hf_data_generator/hf_dataset_collator.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,9 @@ def __call__(self, examples):
9595
)
9696
# The attention_mask is set to 1 to enable attention for the CLS token
9797
batch["attention_mask"] = torch.cat([torch.full((batch_size, 1), 1.0), batch["attention_mask"]], dim=1)
98+
assert (
99+
batch["attention_mask"].shape[0] == 0
100+
), f"batch['attention_mask'].shape[0] must be 0 in sample packing"
98101
# Set the age of the CLS token to the starting age
99102
batch["ages"] = torch.cat([batch["ages"][:, 0:1], batch["ages"]], dim=1)
100103
# Set the age of the CLS token to the starting date

src/cehrbert/runners/hf_cehrbert_finetune_runner.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -330,7 +330,12 @@ def main():
330330
dataset=processed_dataset["test"],
331331
batch_size=per_device_eval_batch_size,
332332
num_workers=training_args.dataloader_num_workers,
333-
collate_fn=data_collator,
333+
collate_fn=CehrBertDataCollator(
334+
tokenizer=tokenizer,
335+
max_length=config.max_position_embeddings,
336+
is_pretraining=False,
337+
mlm_probability=config.mlm_probability,
338+
),
334339
pin_memory=training_args.dataloader_pin_memory,
335340
)
336341
do_predict(test_dataloader, model_args, training_args)

0 commit comments

Comments
 (0)