Commit 27c0e4f

Cleanup model arguments (#102)
* added attn_implementation to the model arguments
* added a check on the concept_value
* set None unit to a default value N/A
* set None value in concept_values to 0.0
* set _supports_sdpa = True in BertPreTrainedModel
* implemented flash attn
* do not overwrite the attention mask when flash attention is enabled
* upgraded huggingface transformers
* updated the logic for splitting heads
* make sure we load the model using the specified torch_dtype
* set the entire model to the corresponding dtype
* removed keyword arguments from hf_cehrgpt
* updated BertSelfFlashAttention.forward to return a tuple because the BERT layer expects such output
* test gpt2 implementation
* test gpt2 implementation
* pass the attn_implementation and torch_dtype to the model during fine-tuning
* set the default value of torch_dtype to auto
* convert age_at_index to the same data type as the bert output
* added logic to convert float32 to the corresponding precision
* removed mlm_skip_values
* updated the unit test after removing mlm_skip_values
* set the default value of torch_dtype to None
* convert concept_value_masks to torch.bool before using it in torch.where
* convert tensors back to the original dtype in the flash attention implementation
* check if torch_dtype is None before trying to get it from torch
1 parent d13338b commit 27c0e4f
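
A recurring thread in these changes is plumbing attn_implementation and torch_dtype from the run arguments through to the Hugging Face model. A minimal sketch of that pattern, assuming a generic AutoModel checkpoint (the checkpoint name and dtype value below are illustrative, not this repo's actual configuration):

    import torch
    from transformers import AutoModel

    # Resolve the dtype only when it is explicitly set; the string "auto"
    # tells transformers to use the dtype recorded in the checkpoint.
    torch_dtype = "bfloat16"  # illustrative value from a config/CLI argument
    if torch_dtype is not None and torch_dtype != "auto":
        torch_dtype = getattr(torch, torch_dtype)

    model = AutoModel.from_pretrained(
        "bert-base-uncased",                      # placeholder checkpoint
        torch_dtype=torch_dtype,                  # load weights in this precision
        attn_implementation="flash_attention_2",  # or "sdpa" / "eager"
    )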

File tree

12 files changed: +324 −31 lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion

@@ -52,7 +52,7 @@ dependencies = [
     "tqdm>=4.66.1",
     "torch==2.4.0",
     "tokenizers>=0.19.0",
-    "transformers>=4.40.0",
+    "transformers>=4.41.0",
     "accelerate>=0.31.0",
     "Werkzeug==3.0.1",
     "wandb>=0.17.8",

src/cehrbert/data_generators/hf_data_generator/hf_dataset_collator.py

Lines changed: 0 additions & 10 deletions

@@ -111,18 +111,8 @@ def __call__(self, examples):

         # This is the most crucial logic for generating the training labels
         if self.is_pretraining:
-
-            batch_mlm_skip_values = [
-                self._convert_to_tensor(example["mlm_skip_values"]).to(torch.bool) for example in examples
-            ]
-            batch["mlm_skip_values"] = pad_sequence(batch_mlm_skip_values, batch_first=True, padding_value=False)
-            # Set the mlm_skip_values of the CLS token to a default value False
-            batch["mlm_skip_values"] = torch.cat([torch.full((batch_size, 1), False), batch["mlm_skip_values"]], dim=1)
-
             # If the labels field is already provided, we will build the MLM labels off of that.
             # The labels value indicates the positions that are not allowed for MLM.
-            # For example, the mlm_skip_values=1, this means this is a lab value and
-            # we don't want to predict the tokens at this position
             if "labels" in examples[0]:
                 batch_labels = [self._convert_to_tensor(example["labels"]) for example in examples]
                 batch["labels"] = pad_sequence(batch_labels, batch_first=True, padding_value=-100)

src/cehrbert/data_generators/hf_data_generator/hf_dataset_mapping.py

Lines changed: 25 additions & 3 deletions

@@ -17,7 +17,7 @@
 from meds.schema import birth_code, death_code
 from pandas import Series

-from cehrbert.med_extension.schema_extension import Event, Visit
+from cehrbert.med_extension.schema_extension import Event
 from cehrbert.models.hf_models.tokenization_hf_cehrbert import CehrBertTokenizer
 from cehrbert.runners.hf_runner_argument_dataclass import DataTrainingArguments

@@ -573,17 +573,39 @@ def __init__(self, concept_tokenizer: CehrBertTokenizer, is_pretraining: bool):
         self._is_pretraining = is_pretraining
         self._lab_token_ids = self._concept_tokenizer.lab_token_ids

+    @staticmethod
+    def fill_na_value(values, value_to_fill):
+        none_values = np.array([x is None for x in values])
+        if none_values.any():
+            values = values.copy()
+            values[none_values] = value_to_fill
+        return values
+
     def transform(self, record: Dict[str, Any]) -> Dict[str, Any]:

         input_ids = self._concept_tokenizer.encode(record["concept_ids"])
         record["input_ids"] = input_ids
         concept_value_masks = record["concept_value_masks"]
+
+        # These fields may not exist in the old version of the datasets
+        if "units" in record:
+            record["units"] = self.fill_na_value(record["units"], NA)
+        if "concept_as_values" in record:
+            record["concept_as_values"] = self.fill_na_value(record["concept_as_values"], NA)
+
         # Backward compatibility
         if "concept_values" not in record:
             record["concept_values"] = record["number_as_values"]

-        if np.isnan(record["concept_values"]).any():
-            record["concept_values"] = [v if not pd.isna(v) else 0.0 for v in record["concept_values"]]
+        concept_value_is_nan = np.isnan(record["concept_values"])
+        if concept_value_is_nan.any():
+            # Create a writeable copy
+            concept_value_masks = concept_value_masks.copy()
+            concept_value_masks[concept_value_is_nan] = 0
+            record["concept_value_masks"] = concept_value_masks
+            concept_values = record["concept_values"].copy()
+            concept_values[concept_value_is_nan] = 0.0
+            record["concept_values"] = concept_values

         assert len(input_ids) == len(
             record["concept_ids"]
