
Commit 01571d6

added logic to slice out records from the tokenized dataset
1 parent e84b131 commit 01571d6

File tree: 7 files changed (+339, -32 lines)

src/cehrbert/cehrbert_utils.py

Lines changed: 90 additions & 0 deletions
@@ -0,0 +1,90 @@
+import re
+from datetime import datetime, timedelta, timezone
+from typing import List, Optional, Union
+
+from transformers.utils import logging
+
+# Regular expression pattern to match inpatient attendance tokens
+MEDS_CODE_PATTERN = re.compile(r".*/.*")
+INPATIENT_ATT_PATTERN = re.compile(r"(?:VS-|i-)D(\d+)(?:-VE)?")
+DEMOGRAPHIC_PROMPT_SIZE = 4
+logger = logging.get_logger("transformers")
+
+
+def construct_time_sequence(
+    concept_ids: List[str], epoch_times: Optional[List[Union[int, float]]] = None
+) -> List[float]:
+    if epoch_times is not None:
+        return epoch_times
+
+    if concept_ids[0].lower().startswith("year"):
+        year_str = concept_ids[0].split(":")[1]
+    else:
+        year_str = "1985"
+
+    datetime_cursor = datetime(int(year_str), month=1, day=1, hour=0, minute=0, second=0).replace(tzinfo=timezone.utc)
+    epoch_times = []
+    for concept_id in concept_ids:
+        if is_att_token(concept_id):
+            att_days = extract_time_interval_in_days(concept_id)
+            datetime_cursor += timedelta(days=att_days)
+        epoch_times.append(datetime_cursor.timestamp())
+    return epoch_times
+
+
+def is_att_token(token: str):
+    """
+    Check if the token is an attention token.
+
+    :param token: Token to check.
+    :return: True if the token is an attention token, False otherwise.
+    """
+    if bool(re.match(r"^D\d+", token)):  # day tokens
+        return True
+    elif bool(re.match(r"^W\d+", token)):  # week tokens
+        return True
+    elif bool(re.match(r"^M\d+", token)):  # month tokens
+        return True
+    elif bool(re.match(r"^Y\d+", token)):  # year tokens
+        return True
+    elif token == "LT":
+        return True
+    elif token[:3] == "VS-":  # VS-D7-VE
+        return True
+    elif token[:2] == "i-" and not token.startswith("i-H"):  # i-D7 and exclude hour tokens
+        return True
+    return False
+
+
+def extract_time_interval_in_days(token: str):
+    """
+    Extract the time interval in days from a token.
+
+    :param token: Token to extract from.
+    :return: Time interval in days.
+    :raises ValueError: If the token is invalid.
+    """
+    try:
+        if token[0] == "D":  # day tokens
+            return int(token[1:])
+        elif token[0] == "W":  # week tokens
+            return int(token[1:]) * 7
+        elif token[0] == "M":  # month tokens
+            return int(token[1:]) * 30
+        elif token[0] == "Y":  # year tokens
+            return int(token[1:]) * 365
+        elif token == "LT":
+            return 365 * 3
+        elif token[:3] == "VS-":  # VS-D7-VE
+            part = token.split("-")[1]
+            if part.startswith("LT"):
+                return 365 * 3
+            return int(part[1:])
+        elif token[:2] == "i-":  # i-D7
+            part = token.split("-")[1]
+            if part.startswith("LT"):
+                return 365 * 3
+            return int(token.split("-")[1][1:])
+    except Exception:
+        raise ValueError(f"Invalid time token: {token}")
+    raise ValueError(f"Invalid time token: {token}")

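For orientation, here is a small hypothetical sketch (not part of the commit) of how these helpers behave; the concept IDs are made-up OMOP-style tokens:

# Hypothetical usage: rebuilding epoch times for a sequence that was
# tokenized before the epoch_times column existed.
from cehrbert.cehrbert_utils import (
    construct_time_sequence,
    extract_time_interval_in_days,
    is_att_token,
)

concept_ids = ["year:2010", "age:45", "8507", "0", "VS", "320128", "VE", "D7", "VS", "201826", "VE"]
epoch_times = construct_time_sequence(concept_ids)  # cursor anchored at 2010-01-01 UTC

assert is_att_token("D7") and not is_att_token("320128")
assert extract_time_interval_in_days("D7") == 7
assert extract_time_interval_in_days("W2") == 14           # weeks -> days
assert extract_time_interval_in_days("VS-LT-VE") == 365 * 3  # long-term token
# Every token before "D7" is stamped 2010-01-01; "D7" and everything after it
# are stamped seven days later, since the cursor advances before appending.
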
src/cehrbert/data_generators/hf_data_generator/hf_dataset.py

Lines changed: 1 addition & 0 deletions
@@ -26,6 +26,7 @@
     "num_of_visits",
     "number_as_values",
     "concept_as_values",
+    "epoch_times",
 ]
 
 TRANSFORMER_COLUMNS = ["input_ids", "labels"]

src/cehrbert/data_generators/hf_data_generator/hf_dataset_mapping.py

Lines changed: 103 additions & 1 deletion
@@ -4,6 +4,7 @@
 import itertools
 import re
 from abc import ABC, abstractmethod
+from collections import defaultdict
 from dataclasses import dataclass
 from enum import Enum
 from typing import Any, Dict, Generator, List, Optional, Union
@@ -17,6 +18,7 @@
 from meds.schema import birth_code, death_code
 from pandas import Series
 
+from cehrbert.cehrbert_utils import construct_time_sequence
 from cehrbert.med_extension.schema_extension import Event
 from cehrbert.models.hf_models.tokenization_hf_cehrbert import CehrBertTokenizer
 from cehrbert.runners.hf_runner_argument_dataclass import DataTrainingArguments
@@ -284,6 +286,7 @@ def remove_columns(self):
     def _update_cehrbert_record(
         cehrbert_record: Dict[str, Any],
         code: str,
+        time: datetime.datetime,
         visit_segment: int = 0,
         date: int = 0,
         age: int = -1,
@@ -304,6 +307,7 @@ def _update_cehrbert_record(
         cehrbert_record["concept_values"].append(concept_value)
         cehrbert_record["units"].append(unit)
         cehrbert_record["mlm_skip_values"].append(mlm_skip_value)
+        cehrbert_record["epoch_times"].append(time.replace(tzinfo=datetime.timezone.utc).timestamp())
 
     def transform(self, record: Dict[str, Any]) -> Dict[str, Any]:
@@ -320,6 +324,7 @@ def transform(self, record: Dict[str, Any]) -> Dict[str, Any]:
             "units": [],
             "mlm_skip_values": [],
             "visit_concept_ids": [],
+            "epoch_times": [],
         }
         # Extract the demographic information
         birth_datetime = record["birth_datetime"]
@@ -340,7 +345,10 @@ def transform(self, record: Dict[str, Any]) -> Dict[str, Any]:
         year_str = f"year:{str(first_visit_start_datetime.year)}"
         age_str = f"age:{str(relativedelta(first_visit_start_datetime, birth_datetime).years)}"
 
-        self._update_cehrbert_record(cehrbert_record, year_str)
+        self._update_cehrbert_record(
+            cehrbert_record,
+            year_str,
+        )
         self._update_cehrbert_record(cehrbert_record, age_str)
         self._update_cehrbert_record(cehrbert_record, gender)
         self._update_cehrbert_record(cehrbert_record, race)
@@ -377,6 +385,7 @@ def transform(self, record: Dict[str, Any]) -> Dict[str, Any]:
                 cehrbert_record,
                 code=self._time_token_function(time_delta),
                 visit_concept_order=i + 1,
+                time=visit_start_datetime,
             )
 
             # Add the VS token to the patient timeline to mark the start of a visit
@@ -393,6 +402,7 @@ def transform(self, record: Dict[str, Any]) -> Dict[str, Any]:
                 date=date,
                 visit_segment=visit_segment,
                 visit_concept_id=visit_type,
+                time=date_cursor,
             )
 
             if self._include_auxiliary_token:
@@ -404,6 +414,7 @@ def transform(self, record: Dict[str, Any]) -> Dict[str, Any]:
                     date=date,
                     visit_segment=visit_segment,
                     visit_concept_id=visit_type,
+                    time=date_cursor,
                )
             # Keep track of the existing outpatient events, we don't want to add them again
             existing_outpatient_events = list()
@@ -450,6 +461,7 @@ def transform(self, record: Dict[str, Any]) -> Dict[str, Any]:
                         visit_concept_order=i + 1,
                         visit_segment=visit_segment,
                         visit_concept_id=visit_type,
+                        time=date_cursor,
                     )
                 else:
                     # For outpatient visits, we use the visit time stamp to calculate age and time because we assume
@@ -471,6 +483,7 @@ def transform(self, record: Dict[str, Any]) -> Dict[str, Any]:
                         concept_value=concept_value,
                         unit=unit,
                         mlm_skip_value=concept_value_mask,
+                        time=date_cursor,
                     )
                     existing_outpatient_events.append((date, code, concept_value))
 
@@ -496,6 +509,7 @@ def transform(self, record: Dict[str, Any]) -> Dict[str, Any]:
                 visit_concept_order=i + 1,
                 visit_segment=visit_segment,
                 visit_concept_id=visit_type,
+                time=date_cursor,
             )
 
             # Reuse the age and date calculated for the last event in the patient timeline
@@ -507,6 +521,7 @@ def transform(self, record: Dict[str, Any]) -> Dict[str, Any]:
                 visit_concept_order=i + 1,
                 visit_segment=visit_segment,
                 visit_concept_id=visit_type,
+                time=date_cursor,
             )
 
         # Toggle visit_segment_indicator
@@ -519,11 +534,17 @@ def transform(self, record: Dict[str, Any]) -> Dict[str, Any]:
         cehrbert_record["num_of_concepts"] = len(cehrbert_record["concept_ids"])
         cehrbert_record["num_of_visits"] = len(visits)
 
+        if record.get("index_date", None) is not None:
+            cehrbert_record["index_date"] = record["index_date"].replace(tzinfo=datetime.timezone.utc).timestamp()
         if "label" in record:
             cehrbert_record["label"] = record["label"]
         if "age_at_index" in record:
             cehrbert_record["age_at_index"] = record["age_at_index"]
 
+        assert len(cehrbert_record["epoch_times"]) == len(
+            cehrbert_record["concept_ids"]
+        ), "The number of time stamps must match with the number of concepts in the sequence"
+
         return cehrbert_record
@@ -594,6 +615,7 @@ def transform(self, record: Dict[str, Any]) -> Dict[str, Any]:
         input_ids = self._concept_tokenizer.encode(record["concept_ids"])
         record["input_ids"] = input_ids
         concept_value_masks = record["concept_value_masks"]
+        record["epoch_times"] = construct_time_sequence(record["concept_ids"], record.get("epoch_times", None))
 
         # These fields may not exist in the old version of the datasets
         if "units" in record:
@@ -651,6 +673,86 @@ def transform(self, record: Dict[str, Any]) -> Dict[str, Any]:
         return record
 
 
+class ExtractTokenizedSequenceDataMapping:
+    def __init__(
+        self,
+        person_index_date_map: Dict[int, List[Dict[str, Any]]],
+        observation_window: int = 0,
+    ):
+        self.person_index_date_map = person_index_date_map
+        self.observation_window = observation_window
+
+    def _calculate_prediction_start_time(self, prediction_time: float):
+        if self.observation_window and self.observation_window > 0:
+            return max(prediction_time - self.observation_window * 24 * 3600, 0)
+        return 0
+
+    def transform(self, record: Dict[str, Any]) -> Dict[str, Any]:
+        person_id = record["person_id"]
+        prediction_times = self.person_index_date_map[person_id]
+        prediction_start_end_times = [
+            (
+                self._calculate_prediction_start_time(
+                    prediction_time_label_map["index_date"].replace(tzinfo=datetime.timezone.utc).timestamp()
+                ),
+                prediction_time_label_map["index_date"].replace(tzinfo=datetime.timezone.utc).timestamp(),
+                prediction_time_label_map["label"],
+            )
+            for prediction_time_label_map in prediction_times
+        ]
+        observation_window_indices = np.zeros((len(prediction_times), len(record["epoch_times"])), dtype=bool)
+        for i, epoch_time in enumerate(record["epoch_times"]):
+            for sample_n, (
+                feature_extraction_time_start,
+                feature_extraction_end_end,
+                _,
+            ) in enumerate(prediction_start_end_times):
+                if feature_extraction_time_start <= epoch_time <= feature_extraction_end_end:
+                    observation_window_indices[sample_n][i] = True
+
+        seq_length = len(record["epoch_times"])
+        time_series_columns = ["concept_ids", "input_ids"]
+        static_inputs = dict()
+        for k, v in record.items():
+            if k in ["concept_ids", "input_ids"]:
+                continue
+            if isinstance(v, (list, np.ndarray)) and len(v) == seq_length:
+                time_series_columns.append(k)
+            else:
+                static_inputs[k] = v
+
+        batched_samples = defaultdict(list)
+        for (_, index_date, label), observation_window_index in zip(
+            prediction_start_end_times, observation_window_indices
+        ):
+            for k, v in static_inputs.items():
+                batched_samples[k].append(v)
+            batched_samples["classifier_label"].append(label)
+            batched_samples["index_date"].append(index_date)
+            try:
+                start_age = int(record["concept_ids"][1].split(":")[1])
+            except Exception:
+                start_age = -1
+            batched_samples["age_at_index"].append(start_age)
+            for time_series_column in time_series_columns:
+                batched_samples[time_series_column].append(
+                    np.asarray(record[time_series_column])[observation_window_index]
+                )
+        return batched_samples
+
+    def batch_transform(self, record: Dict[str, Any]) -> Dict[str, Any]:
+        all_batched_record = defaultdict(list)
+        all_columns = record.keys()
+        for i in range(len(record["concept_ids"])):
+            one_record = {}
+            for column in all_columns:
+                one_record[column] = record[column][i]
+            new_batched_record = self.transform(one_record)
+            for k, v in new_batched_record.items():
+                all_batched_record[k].extend(v)
+        return all_batched_record
+
+
 class HFFineTuningMapping(DatasetMapping):
     """Consider removing this transformation in the future."""

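The new ExtractTokenizedSequenceDataMapping does the slicing the commit message describes: for each (person, prediction time) pair it builds a boolean mask over epoch_times, keeps only the positions inside [prediction_time - observation_window, prediction_time], slices every sequence-aligned column with that mask, and copies static columns through. A hypothetical walkthrough on a single record (all values are illustrative, not from the commit):

# Hypothetical walkthrough of transform() on one record.
import datetime

from cehrbert.cehrbert_utils import construct_time_sequence
from cehrbert.data_generators.hf_data_generator.hf_dataset_mapping import (
    ExtractTokenizedSequenceDataMapping,
)

record = {
    "person_id": 1,
    "concept_ids": ["year:2020", "age:60", "8507", "0", "VS", "320128", "VE", "D30", "VS", "201826", "VE"],
    "input_ids": [10, 11, 12, 13, 2, 105, 3, 40, 2, 106, 3],
}
# Reconstruct timestamps with the new helper: the first visit is stamped
# 2020-01-01 UTC, the second visit 30 days later.
record["epoch_times"] = construct_time_sequence(record["concept_ids"])

mapping = ExtractTokenizedSequenceDataMapping(
    person_index_date_map={1: [{"index_date": datetime.datetime(2020, 1, 15), "label": 1}]},
    observation_window=14,  # days of history before each prediction time
)
samples = mapping.transform(record)
# The 14-day window ending 2020-01-15 contains the first visit but not the
# second, so the sliced sequence keeps only the first seven tokens.
assert list(samples["concept_ids"][0]) == ["year:2020", "age:60", "8507", "0", "VS", "320128", "VE"]
assert samples["classifier_label"] == [1] and samples["age_at_index"] == [60]
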
src/cehrbert/linear_prob/compute_cehrbert_features.py

Lines changed: 19 additions & 15 deletions
@@ -22,6 +22,7 @@
 from cehrbert.data_generators.hf_data_generator.sample_packing_sampler import SamplePackingBatchSampler
 from cehrbert.models.hf_models.hf_cehrbert import CehrBertForPreTraining
 from cehrbert.models.hf_models.tokenization_hf_cehrbert import CehrBertTokenizer
+from cehrbert.runners.data_utils import extract_cohort_sequences
 from cehrbert.runners.hf_cehrbert_finetune_runner import prepare_finetune_dataset
 from cehrbert.runners.runner_util import generate_prepared_ds_path, parse_runner_args
 
@@ -85,21 +86,24 @@ def main():
         LOG.info("Prepared dataset loaded from disk...")
 
     if processed_dataset is None:
-        # Organize them into a single DatasetDict
-        final_splits = prepare_finetune_dataset(data_args, training_args, cache_file_collector)
-
-        # TODO: temp solution, this column is mixed typed and causes an issue when transforming the data
-        if not data_args.streaming:
-            all_columns = final_splits["train"].column_names
-            if "visit_concept_ids" in all_columns:
-                final_splits = final_splits.remove_columns(["visit_concept_ids"])
-
-        processed_dataset = create_cehrbert_finetuning_dataset(
-            dataset=final_splits,
-            concept_tokenizer=cehrgpt_tokenizer,
-            data_args=data_args,
-            cache_file_collector=cache_file_collector,
-        )
+        if cehrbert_args.tokenized_full_dataset_path is not None:
+            processed_dataset = extract_cohort_sequences(data_args, cehrbert_args, cache_file_collector)
+        else:
+            # Organize them into a single DatasetDict
+            final_splits = prepare_finetune_dataset(data_args, training_args, cache_file_collector)
+
+            # TODO: temp solution, this column is mixed typed and causes an issue when transforming the data
+            if not data_args.streaming:
+                all_columns = final_splits["train"].column_names
+                if "visit_concept_ids" in all_columns:
+                    final_splits = final_splits.remove_columns(["visit_concept_ids"])
+
+            processed_dataset = create_cehrbert_finetuning_dataset(
+                dataset=final_splits,
+                concept_tokenizer=cehrgpt_tokenizer,
+                data_args=data_args,
+                cache_file_collector=cache_file_collector,
+            )
     if not data_args.streaming:
         processed_dataset.save_to_disk(prepared_ds_path)
         processed_dataset.cleanup_cache_files()

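extract_cohort_sequences itself lives in cehrbert.runners.data_utils and is not shown in this diff; presumably it applies the new mapping over the pre-tokenized dataset. A hypothetical sketch of such wiring with Hugging Face datasets (the path, the cohort map, and the helper's internals are all assumptions):

# Hypothetical sketch (not part of the commit) of applying the new mapping
# to a pre-tokenized dataset loaded from disk.
import datetime

from datasets import load_from_disk

from cehrbert.data_generators.hf_data_generator.hf_dataset_mapping import (
    ExtractTokenizedSequenceDataMapping,
)

tokenized_dataset = load_from_disk("/path/to/tokenized_full_dataset")
mapping = ExtractTokenizedSequenceDataMapping(
    # Assumes every person_id in the dataset has an entry here; the mapping
    # indexes this dict directly and would raise KeyError otherwise.
    person_index_date_map={1: [{"index_date": datetime.datetime(2021, 6, 1), "label": 1}]},
    observation_window=365,  # days of history to keep before each prediction time
)
# batched=True lets batch_transform return a different number of rows than it
# received: one output sample per (person, prediction time) pair.
cohort_dataset = tokenized_dataset.map(
    mapping.batch_transform,
    batched=True,
    remove_columns=tokenized_dataset.column_names,
)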