2020# -------------------------------------------------------------
2121import copy
2222import time
23- from typing import List , Union
24- from systemds .scuro .modality .modality import Modality
25- from systemds .scuro .representations .representation import Representation
23+ from typing import List
2624from systemds .scuro .models .model import Model
2725import numpy as np
28- from sklearn .model_selection import KFold
26+ from sklearn .model_selection import train_test_split
2927
3028
3129class PerformanceMeasure :
@@ -69,7 +67,8 @@ def __init__(
6967 val_indices : List ,
7068 kfold = 5 ,
7169 measure_performance = True ,
72- performance_measures = "accuracy" ,
70+ performance_measures = ["accuracy" ],
71+ fusion_train_split = 0.8 ,
7372 ):
7473 """
7574 Parent class for the prediction task that is performed on top of the aligned representation
@@ -85,7 +84,7 @@ def __init__(
8584 self .model = model
8685 self .labels = labels
8786 self .train_indices = train_indices
88- self .val_indices = val_indices
87+ self .test_indices = val_indices
8988 self .kfold = kfold
9089 self .measure_performance = measure_performance
9190 self .inference_time = []
@@ -94,6 +93,47 @@ def __init__(
9493 self .performance_measures = performance_measures
9594 self .train_scores = PerformanceMeasure ("train" , performance_measures )
9695 self .val_scores = PerformanceMeasure ("val" , performance_measures )
96+ self .test_scores = PerformanceMeasure ("test" , performance_measures )
97+ self .fusion_train_indices = None
98+ self ._create_cv_splits ()
99+
def _create_cv_splits(self):
    """
    Build the per-fold train/validation index lists used by the task.

    For each of the ``self.kfold`` folds, ``self.train_indices`` is split
    80/20 with ``train_test_split`` (shuffled, seeded with ``11 + fold_idx``
    so every fold gets a different but reproducible split). The labels are
    passed to the splitter, but the returned label partitions are discarded.

    Results are stored on the instance:

    * ``self.cv_train_indices`` / ``self.cv_val_indices``: one plain list of
      indices per fold.
    * ``self.fusion_train_indices``: the entries of ``self.train_indices``
      that do not appear in the validation part of any fold.

    :raises ValueError: if a fold's train and validation indices overlap
    """
    label_pool = np.array([self.labels[i] for i in self.train_indices])
    index_pool = np.array(self.train_indices)

    self.cv_train_indices = []
    self.cv_val_indices = []

    for fold_idx in range(self.kfold):
        train_part, val_part, _, _ = train_test_split(
            index_pool,
            label_pool,
            test_size=0.2,
            shuffle=True,
            random_state=11 + fold_idx,
        )

        fold_train_indices = train_part.tolist()
        fold_val_indices = val_part.tolist()
        self.cv_train_indices.append(fold_train_indices)
        self.cv_val_indices.append(fold_val_indices)

        # Sanity check: a sample must never be in both parts of one fold.
        overlap = set(fold_train_indices).intersection(fold_val_indices)
        if overlap:
            raise ValueError(
                f"Fold { fold_idx } : Overlap detected between train and val indices: { overlap } "
            )

    # Every index that served as validation data in at least one fold.
    used_for_validation = set().union(*self.cv_val_indices)
    self.fusion_train_indices = [
        idx for idx in self.train_indices if idx not in used_for_validation
    ]
97137
98138 def create_model (self ):
99139 """
@@ -107,12 +147,12 @@ def create_model(self):
def get_train_test_split(self, data):
    """
    Select the train and test portions of ``data`` and of the task labels.

    :param data: indexable collection; it is indexed with the entries of
        ``self.train_indices`` and ``self.test_indices``
    :return: tuple ``(X_train, y_train, X_test, y_test)`` of lists; the two
        test entries are ``None`` when ``self.test_indices`` is ``None``
    """

    def _pick(indices):
        # Data first, then labels, for the same index sequence.
        return [data[i] for i in indices], [self.labels[i] for i in indices]

    X_train, y_train = _pick(self.train_indices)

    if self.test_indices is not None:
        X_test, y_test = _pick(self.test_indices)
    else:
        X_test = y_test = None

    return X_train, y_train, X_test, y_test
118158
@@ -125,71 +165,44 @@ def run(self, data):
125165 """
126166 self ._reset_params ()
127167 model = self .create_model ()
128- skf = KFold (n_splits = self .kfold , shuffle = True , random_state = 11 )
129168
130- fold = 0
131- X , y , _ , _ = self .get_train_test_split (data )
169+ test_X = np .array ([data [i ] for i in self .test_indices ])
170+ test_y = np .array ([self .labels [i ] for i in self .test_indices ])
171+
172+ for fold_idx in range (self .kfold ):
173+ fold_train_indices = self .cv_train_indices [fold_idx ]
174+ fold_val_indices = self .cv_val_indices [fold_idx ]
132175
133- for train , test in skf .split (X , y ):
134- train_X = np .array (X )[train ]
135- train_y = np .array (y )[train ]
136- test_X = np .array (X )[test ]
137- test_y = np .array (y )[test ]
138- self ._run_fold (model , train_X , train_y , test_X , test_y )
139- fold += 1
176+ train_X = np .array ([data [i ] for i in fold_train_indices ])
177+ train_y = np .array ([self .labels [i ] for i in fold_train_indices ])
178+ val_X = np .array ([data [i ] for i in fold_val_indices ])
179+ val_y = np .array ([self .labels [i ] for i in fold_val_indices ])
180+
181+ self ._run_fold (model , train_X , train_y , val_X , val_y , test_X , test_y )
140182
141183 return [
142184 self .train_scores .compute_averages (),
143185 self .val_scores .compute_averages (),
186+ self .test_scores .compute_averages (),
144187 ]
145188
def _reset_params(self):
    """
    Clear the timing lists and replace the score trackers with fresh ones.

    ``self.inference_time`` and ``self.training_time`` become new empty lists,
    and ``self.train_scores``, ``self.val_scores`` and ``self.test_scores`` are
    re-created as ``PerformanceMeasure`` objects for
    ``self.performance_measures``.
    """
    self.inference_time, self.training_time = [], []
    for split_name in ("train", "val", "test"):
        setattr(
            self,
            f"{split_name}_scores",
            PerformanceMeasure(split_name, self.performance_measures),
        )
151195
152- def _run_fold (self , model , train_X , train_y , test_X , test_y ):
196+ def _run_fold (self , model , train_X , train_y , val_X , val_y , test_X , test_y ):
153197 train_start = time .time ()
154- train_score = model .fit (train_X , train_y , test_X , test_y )
198+ train_score = model .fit (train_X , train_y , val_X , val_y )
155199 train_end = time .time ()
156200 self .training_time .append (train_end - train_start )
157201 self .train_scores .add_scores (train_score [0 ])
202+ val_score = model .test (val_X , val_y )
158203 test_start = time .time ()
159204 test_score = model .test (np .array (test_X ), test_y )
160205 test_end = time .time ()
161206 self .inference_time .append (test_end - test_start )
162- self .val_scores .add_scores (test_score [0 ])
163-
164- def create_representation_and_run (
165- self ,
166- representation : Representation ,
167- modalities : Union [List [Modality ], Modality ],
168- ):
169- self ._reset_params ()
170- skf = KFold (n_splits = self .kfold , shuffle = True , random_state = 11 )
171-
172- fold = 0
173- X , y , _ , _ = self .get_train_test_split (data )
174-
175- for train , test in skf .split (X , y ):
176- train_X = np .array (X )[train ]
177- train_y = np .array (y )[train ]
178- test_X = s .transform (np .array (X )[test ])
179- test_y = np .array (y )[test ]
180-
181- if isinstance (modalities , Modality ):
182- rep = modality .apply_representation (representation ())
183- else :
184- representation ().transform (
185- train_X , train_y
186- ) # TODO: think about a way how to handle masks
187-
188- self ._run_fold (train_X , train_y , test_X , test_y )
189- fold += 1
190-
191- if self .measure_performance :
192- self .inference_time = np .mean (self .inference_time )
193- self .training_time = np .mean (self .training_time )
194-
195- return [np .mean (train_scores ), np .mean (test_scores )]
207+ self .val_scores .add_scores (val_score [0 ])
208+ self .test_scores .add_scores (test_score [0 ])
0 commit comments