05-incremental_learning.py (260 changes: 145 additions & 115 deletions)
@@ -14,121 +14,137 @@
from szdetect import pull_features as pf
from szdetect import project_settings as s

inference = False

def main():
max_per_range = 2500
files_count = len(glob.glob(str(s.FEATURES_DIR)+"/*.parquet"))
print(f"files_count {files_count}")

ranges = []
start = 1

while start <= files_count:
        end = min(start + max_per_range - 1, files_count)  # ensure we don't exceed files_count
ranges.append((start, end))
start = end + 1 # Move to the next range

# mid = files_count // 2
# ranges = [(i, min(i + mid - 1, files_count)) for i in range(0, files_count, mid)]
# print(f"ranges {ranges}")

params = {'max_depth': 9, 'min_child_weight': 15,
'scale_pos_weight': 13,
'max_delta_step': 1,
'eval_metric':'aucpr', 'reg_alpha': 50,
'learning_rate': 0.5, 'gamma': 0.3, 'booster': 'gbtree'}

sel = MRMR(method="FCQ", regression=False)
sc = StandardScaler()
model = xgb.XGBClassifier()
model.set_params(**params)


iteration = 0
booster = None
for r in ranges:
print(f"iteration {iteration}")
df = pf.pull_features(
feature_dir=s.FEATURES_DIR,
label_file=s.LABELS_FILE,
feature_group="all",
train_only=True,
step_size=s.PREPROCESSING_KWARGS['segment_eeg']['step_size'],
start_eeg=r[0],
end_eeg=r[1],
)

index_col = [
"dataset_name",
"subject",
"session",
"run",
"unique_id",
"timestamp",
"second",
"label"
]

feature_col = ["region_side", "freqs", "feature"]

wide_df = df.select(index_col + feature_col + ["value"]).pivot(
values="value", index=index_col, on=feature_col, maintain_order=True
)
del df

n_neg = len(wide_df.filter(pl.col("label")==False))
n_pos = len(wide_df.filter(pl.col("label")==True))

try:
scale_pos_weight = int(n_neg / n_pos)
except ZeroDivisionError:
scale_pos_weight = 1

# Update weight balancing
params["scale_pos_weight"]= scale_pos_weight

X = wide_df.drop(index_col)
X = X.to_pandas()
y_true = wide_df.select("label")
y_true = y_true.to_pandas().values.ravel()

if iteration < 1:
print("Training...")
tt1 = datetime.datetime.now()
# fit selector and scaler only once
X = sel.fit_transform(X, y_true)
X = sc.fit_transform(X)
model.fit(X, y_true)
booster = model.get_booster()

with open(s.MRMR_FILE, 'wb') as f:
pickle.dump(sel, f)
with open(s.SCALER_FILE, 'wb') as f:
pickle.dump(sc, f)
with open(s.MODEL_FILE, 'wb') as f:
pickle.dump(model, f)
iteration += 1
tt2 = datetime.datetime.now()
print(f'\n\t\tTraining time for one model in outer fold is {tt2-tt1}')
else:
# incremental learning
try:
assert booster is not None
model.fit(X, y_true, xgb_model=booster)
mod_name = s.PIPE_DIR / f'iter_{iteration}_xgb.sav'
if mod_name.exists():
print(f"Model {iteration} already stored")
else:
with open(mod_name, 'wb') as f:
pickle.dump(model, f)
if not inference:
max_per_range = 2500
files_count = len(glob.glob(str(s.FEATURES_DIR)+"/*.parquet"))
print(f"files_count {files_count}")

ranges = []
start = 1

while start <= files_count:
            end = min(start + max_per_range - 1, files_count)  # ensure we don't exceed files_count
ranges.append((start, end))
start = end + 1 # Move to the next range

# mid = files_count // 2
# ranges = [(i, min(i + mid - 1, files_count)) for i in range(0, files_count, mid)]
# print(f"ranges {ranges}")

params = {'max_depth': 9, 'min_child_weight': 15,
'scale_pos_weight': 13,
'max_delta_step': 1,
'eval_metric':'aucpr', 'reg_alpha': 50,
'learning_rate': 0.1, 'gamma': 0.3, 'booster': 'gbtree'}

sel = MRMR(method="FCQ", regression=False)
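        # MRMR (minimum Redundancy Maximum Relevance) keeps features relevant to
        # the label while penalizing mutual redundancy; "FCQ" is assumed here to
        # be the usual F-test relevance / correlation redundancy quotient scheme.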
sc = StandardScaler()
model = xgb.XGBClassifier()
model.set_params(**params)


iteration = 0
booster = None
for r in ranges:
print(f"iteration {iteration}")
df = pf.pull_features(
feature_dir=s.FEATURES_DIR,
label_file=s.LABELS_FILE,
feature_group="all",
train_only=True,
step_size=s.PREPROCESSING_KWARGS['segment_eeg']['step_size'],
start_eeg=r[0],
end_eeg=r[1],
)

index_col = [
"dataset_name",
"subject",
"session",
"run",
"unique_id",
"timestamp",
"second",
"label"
]

feature_col = ["region_side", "freqs", "feature"]

wide_df = df.select(index_col + feature_col + ["value"]).pivot(
values="value", index=index_col, on=feature_col, maintain_order=True
)
del df

print("Training data stats:")
datasets = wide_df.select("dataset_name").unique().to_series().to_list()
for dataset in datasets:
df_dt = wide_df.filter(pl.col("dataset_name")==dataset)
nb_eegs = df_dt.select("unique_id").unique().to_series().to_list()
subjects = df_dt.select("subject").unique().to_series().to_list()
sz_eegs = df_dt.filter(pl.col("label")==True).select("unique_id").unique().to_series().to_list()

print(f"Stats for {dataset}")
print(f"\tNb EEG files {len(nb_eegs)}")
print(f"\tNb subjects {len(subjects)}")
print(f"\tNb EEG files with seizures {len(sz_eegs)}")

n_neg = len(wide_df.filter(pl.col("label")==False))
n_pos = len(wide_df.filter(pl.col("label")==True))

try:
scale_pos_weight = int(n_neg / n_pos)
except ZeroDivisionError:
scale_pos_weight = 1

# Update weight balancing
params["scale_pos_weight"]= scale_pos_weight


X = wide_df.drop(index_col)
X = X.to_pandas()
y_true = wide_df.select("label")
y_true = y_true.to_pandas().values.ravel()

if iteration < 1:
print("Training...")
tt1 = datetime.datetime.now()
# fit selector and scaler only once
X = sel.fit_transform(X, y_true)
X = sc.fit_transform(X)
model.fit(X, y_true)
booster = model.get_booster()

with open(s.MRMR_FILE, 'wb') as f:
pickle.dump(sel, f)
with open(s.SCALER_FILE, 'wb') as f:
pickle.dump(sc, f)
with open(s.MODEL_FILE, 'wb') as f:
pickle.dump(model, f)
iteration += 1
            except AssertionError as err:
                print(err)
                print("Model has to be previously fitted to continue training from.")

del X, y_true
del wide_df
print(f"XGB model fitted {iteration} time(s).")
tt2 = datetime.datetime.now()
print(f'\n\t\tTraining time for one model in outer fold is {tt2-tt1}')
else:
# incremental learning
try:
assert booster is not None
model.fit(X, y_true, xgb_model=booster)
mod_name = s.PIPE_DIR / f'iter_{iteration}_xgb.sav'
if mod_name.exists():
print(f"Model {iteration} already stored")
else:
with open(mod_name, 'wb') as f:
pickle.dump(model, f)
iteration += 1
            except AssertionError as err:
                print(err)
                print("Model has to be previously fitted to continue training from.")

del X, y_true
del wide_df
print(f"XGB model fitted {iteration} time(s).")


print("Pulling test only data ... ")
@@ -157,21 +173,35 @@ def main():
)
del df

print("Testing data stats:")
datasets = wide_df.select("dataset_name").unique().to_series().to_list()
for dataset in datasets:
df_dt = wide_df.filter(pl.col("dataset_name")==dataset)
nb_eegs = df_dt.select("unique_id").unique().to_series().to_list()
subjects = df_dt.select("subject").unique().to_series().to_list()
sz_eegs = df_dt.filter(pl.col("label")==True).select("unique_id").unique().to_series().to_list()

print(f"Stats for {dataset}")
print(f"\tNb EEG files {len(nb_eegs)}")
print(f"\tNb subjects {len(subjects)}")
print(f"\tNb EEG files with seizures {len(sz_eegs)}")

model = pickle.load(open(s.MODEL_FILE, "rb"))
scaler = pickle.load(open(s.SCALER_FILE, "rb"))
mrmr = pickle.load(open(s.MRMR_FILE, "rb"))

X_test = wide_df.drop(index_col)
X_test = X_test.to_pandas()
X_test_fs = mrmr.transform(X_test)
X_test_scaled = scaler.transform(X_test_fs)
y_pred = model.predict(X_test_scaled)
X_test = mrmr.transform(X_test)
X_test = scaler.transform(X_test)
y_pred = model.predict(X_test)


df_pred = wide_df.select(index_col).with_row_index()
y_pred = np.array(y_pred, dtype=bool)
df_pred = df_pred.with_columns(pl.Series("y_pred", y_pred))
df_pred.write_parquet(s.PIPE_DIR / f'y_pred_test.parquet')
print("y_pred saved.")


if __name__ == "__main__":
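Aside: the long-to-wide reshape both hunks perform via polars' pivot can be hard to picture; a toy sketch with hypothetical feature rows (the script uses its full index_col/feature_col lists instead):

```python
import polars as pl

# Long format: one row per (recording, feature) pair — values are made up.
df = pl.DataFrame({
    "unique_id": ["eeg1", "eeg1", "eeg2", "eeg2"],
    "label":     [False, False, True,  True],
    "feature":   ["line_length", "band_power", "line_length", "band_power"],
    "value":     [0.12, 3.40, 0.98, 5.60],
})

# Each distinct `feature` becomes its own column, keyed by the index columns.
wide = df.pivot(values="value", index=["unique_id", "label"], on="feature",
                maintain_order=True)
print(wide)  # 2 rows x 4 cols: unique_id | label | line_length | band_power
```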
config.toml (3 changes: 2 additions & 1 deletion)
@@ -16,7 +16,8 @@ labels_file = "/mnt/data/SeizureDetectionChallenge2025/output/labels.parquet"

[features]
features_config = "features_config.yaml"
features_dir = "data/cleaned/inference/"
# features_dir = "data/cleaned/inference/"
features_dir = "/mnt/data/SeizureDetectionChallenge2025/data/cleaned/features_v4/"
num_workers = 10
overwrite = false
log_dir = "logs/inference"
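The repointed features_dir is consumed through project_settings, whose loader is not part of this diff; a minimal sketch of reading the same key with stdlib tomllib (Python >= 3.11), under that assumption:

```python
import tomllib
from pathlib import Path

# Hypothetical loader — project_settings presumably does something equivalent
# when it exposes s.FEATURES_DIR from the [features] features_dir key.
with open("config.toml", "rb") as f:
    cfg = tomllib.load(f)

FEATURES_DIR = Path(cfg["features"]["features_dir"])
print(FEATURES_DIR)  # .../data/cleaned/features_v4
```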
tests/find_sz_files.py (29 changes: 29 additions & 0 deletions)
@@ -0,0 +1,29 @@
import os
import pandas as pd
from szdetect import project_settings as s

features_dir = s.FEATURES_DIR
labels_file = s.LABELS_FILE
df_labels = pd.read_parquet(labels_file)

sz_files = df_labels[df_labels.label == True].unique_id.unique()
sz_files_train = df_labels[
(df_labels.label == True) & (df_labels.training == True)
].unique_id.unique()
sz_files_test = df_labels[
(df_labels.label == True) & (df_labels.training == False)
].unique_id.unique()
exists_train = [os.path.exists(os.path.join(features_dir, file+'.parquet')) for file in sz_files_train]
exists_test = [os.path.exists(os.path.join(features_dir, file+'.parquet')) for file in sz_files_test]


# Get only the files that exist
existing_files_train = [file for file, exist in zip(sz_files_train, exists_train) if exist]
existing_files_test = [file for file, exist in zip(sz_files_test, exists_test) if exist]


print("existing_files_train \n", existing_files_train)
print("existing_files_test \n", existing_files_test)


