@@ -103,7 +103,9 @@ def extract_k_best_modalities_per_task(self):
             representations[task.model.name] = {}
             for modality in self.modalities:
                 k_best_results, cached_data = (
-                    self.optimization_results.get_k_best_results(modality, self.k, task)
+                    self.optimization_results.get_k_best_results(
+                        modality, self.k, task, self.scoring_metric
+                    )
                 )
                 representations[task.model.name][modality.modality_id] = k_best_results
                 self.k_best_representations[task.model.name].extend(k_best_results)
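Note: the extra argument suggests get_k_best_results now takes the scoring metric used to rank the unimodal results. A minimal caller sketch under that assumption; the variable values below are illustrative, only the four-argument call shape is taken from the diff:

    # Hypothetical call shape for the extended lookup.
    k_best_results, cached_data = optimization_results.get_k_best_results(
        modality,    # modality whose cached unimodal results are ranked
        3,           # k: number of top representations to keep
        task,        # task whose scores are compared
        "accuracy",  # scoring metric used for ranking (new argument)
    )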
18 changes: 13 additions & 5 deletions src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py
@@ -19,6 +19,7 @@
 #
 # -------------------------------------------------------------
 import os
+import torch
 import multiprocessing as mp
 import itertools
 import threading
@@ -83,8 +84,9 @@ def _evaluate_dag_worker(dag_pickle, task_pickle, modalities_pickle, debug=False

     return OptimizationResult(
         dag=dag_copy,
-        train_score=scores[0],
-        val_score=scores[1],
+        train_score=scores[0].average_scores,
+        val_score=scores[1].average_scores,
+        test_score=scores[2].average_scores,
         runtime=total_time,
         task_name=task_copy.model.name,
         task_time=eval_time,
@@ -106,6 +108,7 @@ def __init__(
         debug: bool = True,
         min_modalities: int = 2,
         max_modalities: int = None,
+        metric: str = "accuracy",
     ):
         self.modalities = modalities
         self.tasks = tasks
@@ -116,6 +119,7 @@ def __init__(

         self.operator_registry = Registry()
         self.fusion_operators = self.operator_registry.get_fusion_operators()
+        self.metric_name = metric

         self.k_best_representations = self._extract_k_best_representations(
             unimodal_optimization_results
@@ -242,7 +246,7 @@ def _extract_k_best_representations(
             for modality in self.modalities:
                 k_best_results, cached_data = (
                     unimodal_optimization_results.get_k_best_results(
-                        modality, self.k, task
+                        modality, self.k, task, self.metric_name
                     )
                 )

@@ -367,6 +371,8 @@ def _evaluate_dag(self, dag: RepresentationDag, task: Task) -> "OptimizationResu
             task_copy,
         )

+        torch.cuda.empty_cache()
+
         if fused_representation is None:
             return None

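Note: torch.cuda.empty_cache() is called unconditionally after fusing the representation; on CPU-only runs this is effectively a no-op. A guarded variant (a suggestion, not part of this change) keeps the intent explicit:

    if torch.cuda.is_available():
        torch.cuda.empty_cache()  # release cached GPU memory between DAG evaluations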
@@ -388,8 +394,9 @@ def _evaluate_dag(self, dag: RepresentationDag, task: Task) -> "OptimizationResu

         return OptimizationResult(
             dag=dag_copy,
-            train_score=scores[0],
-            val_score=scores[1],
+            train_score=scores[0].average_scores,
+            val_score=scores[1].average_scores,
+            test_score=scores[2].average_scores,
             runtime=total_time,
             representation_time=total_time - eval_time,
             task_name=task_copy.model.name,
@@ -479,6 +486,7 @@ class OptimizationResult:
     dag: RepresentationDag
     train_score: PerformanceMeasure = None
     val_score: PerformanceMeasure = None
+    test_score: PerformanceMeasure = None
     runtime: float = 0.0
     task_time: float = 0.0
     representation_time: float = 0.0
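Note on the score fields: both evaluation paths now store scores[i].average_scores rather than the raw PerformanceMeasure objects, so train_score, val_score and test_score presumably hold per-metric averages even though the dataclass annotations still say PerformanceMeasure. A consumer sketch under the assumption that the stored value behaves like a dict keyed by metric name; that shape is an assumption, not shown in the diff:

    # Hypothetical helper; assumes result.val_score == {"accuracy": 0.81, ...}
    def best_result(results, metric="accuracy"):
        return max(results, key=lambda r: r.val_score.get(metric, 0.0))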
25 changes: 14 additions & 11 deletions src/main/python/systemds/scuro/drsearch/ranking.py
@@ -19,8 +19,7 @@
 #
 # -------------------------------------------------------------

-from dataclasses import replace
-from typing import Callable, Iterable, List, Optional
+from typing import Callable, Iterable, Optional


 def rank_by_tradeoff(
@@ -31,14 +30,15 @@ def rank_by_tradeoff(
     runtime_accessor: Optional[Callable[[object], float]] = None,
     cache_scores: bool = True,
     score_attr: str = "tradeoff_score",
-) -> List:
+):
     entries = list(entries)
     if not entries:
         return []

     performance_score_accessor = lambda entry: getattr(entry, "val_score")[
         performance_metric_name
     ]
+
     if runtime_accessor is None:

         def runtime_accessor(entry):
@@ -77,14 +77,17 @@ def safe_normalize(values, vmin, vmax):
     if cache_scores:
         for entry, score in zip(entries, scores):
             if hasattr(entry, score_attr):
-                try:
-                    new_entry = replace(entry, **{score_attr: score})
-                    entries[entries.index(entry)] = new_entry
-                except TypeError:
-                    setattr(entry, score_attr, score)
+                setattr(entry, score_attr, score)
             else:
                 setattr(entry, score_attr, score)

-    return sorted(
-        entries, key=lambda entry: getattr(entry, score_attr, 0.0), reverse=True
-    )
+    sorted_entries = sorted(entries, key=lambda e: e.tradeoff_score, reverse=True)
+
+    sorted_indices = [
+        i
+        for i, _ in sorted(
+            enumerate(entries), key=lambda pair: pair[1].tradeoff_score, reverse=True
+        )
+    ]
+
+    return sorted_entries, sorted_indices
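rank_by_tradeoff now returns a pair, the tradeoff-sorted entries plus the positions those entries held in the input list, so callers that previously expected a single sorted list need to unpack. A usage sketch, assuming performance_metric_name is one of the parameters not shown in this hunk and that entries expose a val_score dict and a runtime attribute as the accessors imply:

    # Illustrative call; `results` is a placeholder list of optimization results.
    ranked, order = rank_by_tradeoff(results, performance_metric_name="accuracy")
    best = ranked[0]              # highest tradeoff_score after ranking
    best_original_idx = order[0]  # where that entry sat in `results`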
123 changes: 68 additions & 55 deletions src/main/python/systemds/scuro/drsearch/task.py
@@ -20,12 +20,10 @@
 # -------------------------------------------------------------
 import copy
 import time
-from typing import List, Union
-from systemds.scuro.modality.modality import Modality
-from systemds.scuro.representations.representation import Representation
+from typing import List
 from systemds.scuro.models.model import Model
 import numpy as np
-from sklearn.model_selection import KFold
+from sklearn.model_selection import train_test_split


 class PerformanceMeasure:
@@ -69,7 +67,8 @@ def __init__(
         val_indices: List,
         kfold=5,
         measure_performance=True,
-        performance_measures="accuracy",
+        performance_measures=["accuracy"],
+        fusion_train_split=0.8,
     ):
         """
         Parent class for the prediction task that is performed on top of the aligned representation
@@ -85,7 +84,7 @@ def __init__(
         self.model = model
         self.labels = labels
         self.train_indices = train_indices
-        self.val_indices = val_indices
+        self.test_indices = val_indices
         self.kfold = kfold
         self.measure_performance = measure_performance
         self.inference_time = []
@@ -94,6 +93,47 @@ def __init__(
         self.performance_measures = performance_measures
         self.train_scores = PerformanceMeasure("train", performance_measures)
         self.val_scores = PerformanceMeasure("val", performance_measures)
+        self.test_scores = PerformanceMeasure("test", performance_measures)
+        self.fusion_train_indices = None
+        self._create_cv_splits()
+
+    def _create_cv_splits(self):
+        train_labels = [self.labels[i] for i in self.train_indices]
+        train_labels_array = np.array(train_labels)
+
+        train_indices_array = np.array(self.train_indices)
+
+        self.cv_train_indices = []
+        self.cv_val_indices = []
+
+        for fold_idx in range(self.kfold):
+            fold_train_indices_array, fold_val_indices_array, _, _ = train_test_split(
+                train_indices_array,
+                train_labels_array,
+                test_size=0.2,
+                shuffle=True,
+                random_state=11 + fold_idx,
+            )
+
+            fold_train_indices = fold_train_indices_array.tolist()
+            fold_val_indices = fold_val_indices_array.tolist()
+
+            self.cv_train_indices.append(fold_train_indices)
+            self.cv_val_indices.append(fold_val_indices)
+
+            overlap = set(fold_train_indices) & set(fold_val_indices)
+            if overlap:
+                raise ValueError(
+                    f"Fold {fold_idx}: Overlap detected between train and val indices: {overlap}"
+                )
+
+        all_val_indices = set()
+        for val_indices in self.cv_val_indices:
+            all_val_indices.update(val_indices)
+
+        self.fusion_train_indices = [
+            idx for idx in self.train_indices if idx not in all_val_indices
+        ]

     def create_model(self):
         """
@@ -107,12 +147,12 @@ def create_model(self):
     def get_train_test_split(self, data):
         X_train = [data[i] for i in self.train_indices]
         y_train = [self.labels[i] for i in self.train_indices]
-        if self.val_indices is None:
+        if self.test_indices is None:
             X_test = None
             y_test = None
         else:
-            X_test = [data[i] for i in self.val_indices]
-            y_test = [self.labels[i] for i in self.val_indices]
+            X_test = [data[i] for i in self.test_indices]
+            y_test = [self.labels[i] for i in self.test_indices]

         return X_train, y_train, X_test, y_test

@@ -125,71 +165,44 @@ def run(self, data):
         """
         self._reset_params()
         model = self.create_model()
-        skf = KFold(n_splits=self.kfold, shuffle=True, random_state=11)

-        fold = 0
-        X, y, _, _ = self.get_train_test_split(data)
+        test_X = np.array([data[i] for i in self.test_indices])
+        test_y = np.array([self.labels[i] for i in self.test_indices])

+        for fold_idx in range(self.kfold):
+            fold_train_indices = self.cv_train_indices[fold_idx]
+            fold_val_indices = self.cv_val_indices[fold_idx]
+
-        for train, test in skf.split(X, y):
-            train_X = np.array(X)[train]
-            train_y = np.array(y)[train]
-            test_X = np.array(X)[test]
-            test_y = np.array(y)[test]
-            self._run_fold(model, train_X, train_y, test_X, test_y)
-            fold += 1
+            train_X = np.array([data[i] for i in fold_train_indices])
+            train_y = np.array([self.labels[i] for i in fold_train_indices])
+            val_X = np.array([data[i] for i in fold_val_indices])
+            val_y = np.array([self.labels[i] for i in fold_val_indices])

+            self._run_fold(model, train_X, train_y, val_X, val_y, test_X, test_y)

         return [
             self.train_scores.compute_averages(),
             self.val_scores.compute_averages(),
+            self.test_scores.compute_averages(),
         ]

     def _reset_params(self):
         self.inference_time = []
         self.training_time = []
         self.train_scores = PerformanceMeasure("train", self.performance_measures)
         self.val_scores = PerformanceMeasure("val", self.performance_measures)
+        self.test_scores = PerformanceMeasure("test", self.performance_measures)

-    def _run_fold(self, model, train_X, train_y, test_X, test_y):
+    def _run_fold(self, model, train_X, train_y, val_X, val_y, test_X, test_y):
         train_start = time.time()
-        train_score = model.fit(train_X, train_y, test_X, test_y)
+        train_score = model.fit(train_X, train_y, val_X, val_y)
         train_end = time.time()
         self.training_time.append(train_end - train_start)
         self.train_scores.add_scores(train_score[0])
+        val_score = model.test(val_X, val_y)
         test_start = time.time()
+        test_score = model.test(np.array(test_X), test_y)
         test_end = time.time()
         self.inference_time.append(test_end - test_start)
-        self.val_scores.add_scores(test_score[0])
-
-    def create_representation_and_run(
-        self,
-        representation: Representation,
-        modalities: Union[List[Modality], Modality],
-    ):
-        self._reset_params()
-        skf = KFold(n_splits=self.kfold, shuffle=True, random_state=11)
-
-        fold = 0
-        X, y, _, _ = self.get_train_test_split(data)
-
-        for train, test in skf.split(X, y):
-            train_X = np.array(X)[train]
-            train_y = np.array(y)[train]
-            test_X = s.transform(np.array(X)[test])
-            test_y = np.array(y)[test]
-
-            if isinstance(modalities, Modality):
-                rep = modality.apply_representation(representation())
-            else:
-                representation().transform(
-                    train_X, train_y
-                )  # TODO: think about a way how to handle masks
-
-            self._run_fold(train_X, train_y, test_X, test_y)
-            fold += 1
-
-        if self.measure_performance:
-            self.inference_time = np.mean(self.inference_time)
-            self.training_time = np.mean(self.training_time)
-
-        return [np.mean(train_scores), np.mean(test_scores)]
+        self.val_scores.add_scores(val_score[0])
+        self.test_scores.add_scores(test_score[0])
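Taken together, run() now evaluates each fold's model against both its validation split and the fixed test split (self.test_indices) and returns three averaged PerformanceMeasure summaries. A minimal driver sketch, assuming this __init__ belongs to the Task class and that the constructor keywords match the parameter names shown above; the model object, labels, and index ranges are placeholders:

    # Illustrative only; `my_model` must provide fit(...) and test(...) as used in _run_fold.
    task = Task(
        model=my_model,
        labels=labels,
        train_indices=list(range(800)),
        val_indices=list(range(800, 1000)),   # stored as self.test_indices
        kfold=5,
        performance_measures=["accuracy"],
    )
    train_avg, val_avg, test_avg = task.run(representation_data)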