Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,16 @@ jobs:
uses: actions/setup-python@v3
with:
python-version: "3.10"

- name: Free up disk space
run: |
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
sudo docker image prune --all --force
df -h
- name: Install dependencies
run: |
python -m pip install --upgrade pip
Expand Down
1 change: 1 addition & 0 deletions constraints.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
xformers==0 # blocks installation
8 changes: 2 additions & 6 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,19 +39,15 @@ dependencies = [
"numpy>=1.24.3",
"packaging>=23.2",
"pandas>=2.2.0",
"peft>=0.10.0",
"peft==0.10.0",
"Pillow>=10.3.0",
"pyarrow>=15.0.0",
"python-dateutil==2.8.2",
"PyYAML==6.0.1",
"scikit-learn==1.4.0",
"scipy==1.12.0",
"tensorflow>=2.15.0",
"keras==2.15.0",
"tensorflow-metal>=1.1.0; sys_platform == 'darwin'", # macOS only
"tensorflow-datasets>=4.5.2",
"tqdm>=4.66.1",
"torch==2.4.0",
"torch>=2.4.0, <=2.8.0",
"tokenizers>=0.19.0",
"transformers>=4.41.0, <= 4.45.0",
"accelerate>=0.31.0",
Expand Down
44 changes: 44 additions & 0 deletions src/cehrbert/baseline_evaluation/evaluation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import pandas as pd

from .evaluation_parse_args import create_evaluation_args
from .model_evaluators.frequency_model_evaluators import (
LogisticRegressionModelEvaluator,
XGBClassifierEvaluator,
)


def evaluate_baseline_models(args):
    """Run the logistic-regression and XGBoost baseline evaluators.

    The dataset is read from ``args.data_path``. When
    ``args.patient_splits_folder`` is set, that parquet is read as well and
    the rows whose ``split`` column equals ``"test"`` are handed to the
    evaluators as ``test_person_ids``; otherwise ``None`` is passed.

    A ``LogisticRegressionModelEvaluator`` and then an
    ``XGBClassifierEvaluator`` are constructed with identical settings and
    ``eval_model()`` is called on each, in that order.

    :param args: parsed command-line namespace providing ``data_path``,
        ``patient_splits_folder``, ``evaluation_folder``, ``num_of_folds``,
        ``is_transfer_learning`` and ``training_percentage``;
        ``k_fold_test`` is optional and treated as ``False`` when absent.
    """
    # Load the training data
    dataset = pd.read_parquet(args.data_path)

    test_person_ids = None
    if args.patient_splits_folder:
        patient_splits = pd.read_parquet(args.patient_splits_folder)
        test_person_ids = patient_splits[patient_splits.split == "test"]

    evaluator_kwargs = {
        "dataset": dataset,
        "evaluation_folder": args.evaluation_folder,
        "num_of_folds": args.num_of_folds,
        "is_transfer_learning": args.is_transfer_learning,
        "training_percentage": args.training_percentage,
        # The argument parser may not register ``k_fold_test``; reading the
        # attribute directly would raise AttributeError, so fall back to
        # False when the namespace does not carry it.
        "k_fold_test": getattr(args, "k_fold_test", False),
        "test_person_ids": test_person_ids,
    }

    # Same configuration for both baselines; only the evaluator class differs.
    for evaluator_class in (LogisticRegressionModelEvaluator, XGBClassifierEvaluator):
        evaluator_class(**evaluator_kwargs).eval_model()


def main(args):
    """Script entry point: evaluate the baseline models with parsed ``args``."""
    evaluate_baseline_models(args)


if __name__ == "__main__":
main(create_evaluation_args().parse_args())
46 changes: 46 additions & 0 deletions src/cehrbert/baseline_evaluation/evaluation_parse_args.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import argparse


def create_evaluation_args():
    """Build the command-line parser for the baseline model evaluation.

    Options:
        ``-d`` / ``--data_path`` (required): the training data path.
        ``--patient_splits_folder``: optional path to the test person_ids data.
        ``-ef`` / ``--evaluation_folder`` (required).
        ``-n`` / ``--num_of_folds``: int, defaults to 4.
        ``--is_transfer_learning``: flag, False unless given.
        ``--training_percentage``: float, defaults to 1.0.
        ``--k_fold_test``: flag, False unless given.

    :return: the configured ``argparse.ArgumentParser`` (not yet parsed).
    """
    main_parser = argparse.ArgumentParser(description="Arguments for evaluating the models")
    main_parser.add_argument(
        "-d",
        "--data_path",
        dest="data_path",
        action="store",
        help="The training data path",
        required=True,
    )
    main_parser.add_argument(
        "--patient_splits_folder",
        dest="patient_splits_folder",
        action="store",
        help="The test person_ids data",
        required=False,
    )
    main_parser.add_argument(
        "-ef",
        "--evaluation_folder",
        dest="evaluation_folder",
        action="store",
        required=True,
    )
    main_parser.add_argument(
        "-n",
        "--num_of_folds",
        dest="num_of_folds",
        action="store",
        required=False,
        type=int,
        default=4,
    )
    main_parser.add_argument("--is_transfer_learning", dest="is_transfer_learning", action="store_true")
    main_parser.add_argument(
        "--training_percentage",
        dest="training_percentage",
        required=False,
        action="store",
        type=float,
        default=1.0,
    )
    # The evaluation entry point reads ``args.k_fold_test``; register the flag
    # so the parsed namespace always carries it (False unless given).
    main_parser.add_argument("--k_fold_test", dest="k_fold_test", action="store_true")
    return main_parser
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from tensorflow.keras.preprocessing.text import Tokenizer
from xgboost import XGBClassifier

from cehrbert.evaluations.model_evaluators.model_evaluators import AbstractModelEvaluator
from .model_evaluators import AbstractModelEvaluator
from cehrbert.utils.model_utils import compute_binary_metrics


Expand Down
Original file line number Diff line number Diff line change
@@ -1,43 +1,62 @@
import copy
import os
import pathlib
from abc import abstractmethod
import logging
from abc import ABC, abstractmethod
from cehrbert.utils.model_utils import create_folder_if_not_exist

import tensorflow as tf

from cehrbert.trainers.model_trainer import AbstractModel
class AbstractModel(ABC):
def __init__(self, *args, **kwargs):
super().__init__()
self._model = self._create_model(*args, **kwargs)

@abstractmethod
def _create_model(self, *args, **kwargs):
pass

@abstractmethod
def train_model(self, *args, **kwargs):
pass

@abstractmethod
def get_model_folder(self):
pass

def get_model_metrics_folder(self):
return create_folder_if_not_exist(self.get_model_folder(), "metrics")

def get_model_test_metrics_folder(self):
return create_folder_if_not_exist(self.get_model_folder(), "test_metrics")

def get_model_test_prediction_folder(self):
return create_folder_if_not_exist(self.get_model_folder(), "test_prediction")

def get_metrics():
"""
Standard metrics used for compiling the models.
def get_model_history_folder(self):
return create_folder_if_not_exist(self.get_model_folder(), "history")

:return:
"""
@classmethod
def get_logger(cls):
return logging.getLogger(cls.__name__)

return [
"binary_accuracy",
tf.keras.metrics.Recall(name="recall"),
tf.keras.metrics.Precision(name="precision"),
tf.keras.metrics.AUC(curve="PR", name="pr_auc"),
tf.keras.metrics.AUC(name="auc"),
]
def __str__(self):
return str(self.__class__.__name__)


class AbstractModelEvaluator(AbstractModel):
def __init__(
self,
dataset,
evaluation_folder,
num_of_folds,
is_transfer_learning: bool = False,
training_percentage: float = 1.0,
learning_rate: float = 1e-4,
is_chronological_test: bool = False,
k_fold_test: bool = False,
test_person_ids=None,
*args,
**kwargs,
self,
dataset,
evaluation_folder,
num_of_folds,
is_transfer_learning: bool = False,
training_percentage: float = 1.0,
learning_rate: float = 1e-4,
is_chronological_test: bool = False,
k_fold_test: bool = False,
test_person_ids=None,
*args,
**kwargs,
):
self._dataset = copy.copy(dataset)
self._evaluation_folder = evaluation_folder
Expand Down
13 changes: 0 additions & 13 deletions src/cehrbert/config/grid_search_config.py

This file was deleted.

9 changes: 0 additions & 9 deletions src/cehrbert/config/output_names.py

This file was deleted.

33 changes: 0 additions & 33 deletions src/cehrbert/data_generators/data_classes.py

This file was deleted.

Loading