removed the logic to collect cache files associated with filtering the dataset

ChaoPang · ChaoPang · commit e62c362b1d89 · 2025-04-06T22:53:36.000-04:00
diff --git a/src/cehrbert/data_generators/hf_data_generator/hf_dataset.py b/src/cehrbert/data_generators/hf_data_generator/hf_dataset.py
@@ -43,8 +43,6 @@ def create_cehrbert_pretraining_dataset(
 
     # Remove patients without any records
     dataset = filter_dataset(dataset, data_args)
-    if cache_file_collector:
-        cache_file_collector.add_cache_files(dataset)
     # If the data is already in meds, we don't need to sort the sequence anymore
     if data_args.is_data_in_meds:
         mapping_functions = [HFTokenizationMapping(concept_tokenizer, True)]
@@ -85,8 +83,6 @@ def create_cehrbert_finetuning_dataset(
 
     # Remove patients without any records
     dataset = filter_dataset(dataset, data_args)
-    if cache_file_collector:
-        cache_file_collector.add_cache_files(dataset)
     if data_args.is_data_in_meds:
         mapping_functions = [
             HFFineTuningMapping(),
diff --git a/src/cehrbert/runners/hf_cehrbert_pretrain_runner.py b/src/cehrbert/runners/hf_cehrbert_pretrain_runner.py
@@ -230,7 +230,6 @@ def main():
                     f"validation_split_num: {data_args.validation_split_num}\n"
                     f"streaming: {data_args.streaming}"
                 )
-            cache_file_collector.add_cache_files(dataset)
         # Create the CEHR-BERT tokenizer if it's not available in the output folder
         tokenizer = load_and_create_tokenizer(data_args=data_args, model_args=model_args, dataset=dataset)
         # sort the patient features chronologically and tokenize the data
diff --git a/tests/integration_tests/runners/hf_cehrbert_pretrain_runner_test.py b/tests/integration_tests/runners/hf_cehrbert_pretrain_runner_test.py
@@ -67,7 +67,7 @@ def test_train_model(self):
             "10",
             "--save_strategy",
             "steps",
-            "--evaluation_strategy",
+            "--eval_strategy",
             "steps",
             "--do_train",
             "true",

Original file line number	Diff line number	Diff line change
`@@ -230,7 +230,6 @@ def main():`
`230`	`230`	`f"validation_split_num: {data_args.validation_split_num}\n"`
`231`	`231`	`f"streaming: {data_args.streaming}"`
`232`	`232`	`)`
`233`		`- cache_file_collector.add_cache_files(dataset)`
`234`	`233`	`# Create the CEHR-BERT tokenizer if it's not available in the output folder`
`235`	`234`	`tokenizer = load_and_create_tokenizer(data_args=data_args, model_args=model_args, dataset=dataset)`
`236`	`235`	`# sort the patient features chronologically and tokenize the data`