1 change: 1 addition & 0 deletions src/main/python/systemds/scuro/dataloader/json_loader.py
@@ -55,5 +55,6 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
except:
text = json_file[self.field]

text = " ".join(text)
self.data.append(text)
self.metadata[idx] = self.modality_type.create_metadata(len(text), text)
@@ -174,9 +174,13 @@ def visit_node(node_id):
all_results.append(result)

if self.maximize_metric:
best_params, best_score = max(all_results, key=lambda x: x[1])
best_params, best_score = max(
all_results, key=lambda x: x[1].scores[self.scoring_metric]
)
else:
best_params, best_score = min(all_results, key=lambda x: x[1])
best_params, best_score = min(
all_results, key=lambda x: x[1].scores[self.scoring_metric]
)

tuning_time = time.time() - start_time

20 changes: 12 additions & 8 deletions src/main/python/systemds/scuro/drsearch/multimodal_optimizer.py
@@ -24,7 +24,7 @@
import threading
from dataclasses import dataclass
from typing import List, Dict, Any, Generator
from systemds.scuro.drsearch.task import Task
from systemds.scuro.drsearch.task import Task, PerformanceMeasure
from systemds.scuro.drsearch.representation_dag import (
RepresentationDag,
RepresentationDAGBuilder,
@@ -87,7 +87,8 @@ def _evaluate_dag_worker(dag_pickle, task_pickle, modalities_pickle, debug=False
val_score=scores[1],
runtime=total_time,
task_name=task_copy.model.name,
evaluation_time=eval_time,
task_time=eval_time,
representation_time=total_time - eval_time,
)
except Exception:
if debug:
@@ -390,8 +391,9 @@ def _evaluate_dag(self, dag: RepresentationDag, task: Task) -> "OptimizationResu
train_score=scores[0],
val_score=scores[1],
runtime=total_time,
representation_time=total_time - eval_time,
task_name=task_copy.model.name,
evaluation_time=eval_time,
task_time=eval_time,
)

except Exception as e:
@@ -475,8 +477,10 @@ def store_results(self, file_name=None):
@dataclass
class OptimizationResult:
dag: RepresentationDag
train_score: float
val_score: float
runtime: float
task_name: str
evaluation_time: float = 0.0
train_score: PerformanceMeasure = None
val_score: PerformanceMeasure = None
runtime: float = 0.0
task_time: float = 0.0
representation_time: float = 0.0
task_name: str = ""
tradeoff_score: float = 0.0
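
For orientation, a short sketch of how the reworked OptimizationResult fields fit together; the scores, timings, and task name below are made up for illustration, and a real result is produced by the optimizer itself.

# Illustrative only; values and task name are hypothetical.
from systemds.scuro.drsearch.multimodal_optimizer import OptimizationResult
from systemds.scuro.drsearch.task import PerformanceMeasure

val = PerformanceMeasure("val", "accuracy")
val.add_scores({"accuracy": 0.83})
result = OptimizationResult(
    dag=None,  # placeholder; normally a RepresentationDag
    val_score=val.compute_averages(),
    runtime=12.5,
    task_time=2.5,
    representation_time=10.0,
    task_name="cnn",
)
print(result.val_score.average_scores["accuracy"])    # 0.83
print(result.representation_time + result.task_time)  # equals result.runtime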
90 changes: 90 additions & 0 deletions src/main/python/systemds/scuro/drsearch/ranking.py
@@ -0,0 +1,90 @@
# -------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# -------------------------------------------------------------

from dataclasses import replace
from typing import Callable, Iterable, List, Optional


def rank_by_tradeoff(
entries: Iterable,
*,
weights=(0.7, 0.3),
performance_metric_name: str = "accuracy",
runtime_accessor: Optional[Callable[[object], float]] = None,
cache_scores: bool = True,
score_attr: str = "tradeoff_score",
) -> List:
entries = list(entries)
if not entries:
return []

performance_score_accessor = lambda entry: getattr(entry, "val_score")[
performance_metric_name
]
if runtime_accessor is None:

def runtime_accessor(entry):
if hasattr(entry, "runtime"):
return getattr(entry, "runtime")
rep = getattr(entry, "representation_time", 0.0)
task = getattr(entry, "task_time", 0.0)
return rep + task

performance = [float(performance_score_accessor(e)) for e in entries]
runtimes = [float(runtime_accessor(e)) for e in entries]

perf_min, perf_max = min(performance), max(performance)
run_min, run_max = min(runtimes), max(runtimes)

def safe_normalize(values, vmin, vmax):
if vmax - vmin == 0.0:
return [1.0] * len(values)
return [(v - vmin) / (vmax - vmin) for v in values]

norm_perf = safe_normalize(performance, perf_min, perf_max)
norm_run = safe_normalize(runtimes, run_min, run_max)
norm_run = [1.0 - r for r in norm_run]

acc_w, run_w = weights
total_w = (acc_w or 0.0) + (run_w or 0.0)
if total_w == 0.0:
acc_w = 1.0
run_w = 0.0
else:
acc_w /= total_w
run_w /= total_w

scores = [acc_w * a + run_w * r for a, r in zip(norm_perf, norm_run)]

if cache_scores:
for entry, score in zip(entries, scores):
if hasattr(entry, score_attr):
try:
new_entry = replace(entry, **{score_attr: score})
entries[entries.index(entry)] = new_entry
except TypeError:
setattr(entry, score_attr, score)
else:
setattr(entry, score_attr, score)

return sorted(
entries, key=lambda entry: getattr(entry, score_attr, 0.0), reverse=True
)
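
A minimal usage sketch of rank_by_tradeoff, assuming entries whose val_score can be indexed by metric name (here a plain dict); the _Entry dataclass and the candidate values are illustrative stand-ins, not part of this patch.

from dataclasses import dataclass
from systemds.scuro.drsearch.ranking import rank_by_tradeoff

@dataclass
class _Entry:  # illustrative stand-in for an optimizer result
    name: str
    val_score: dict  # metric name -> validation score
    runtime: float
    tradeoff_score: float = 0.0

candidates = [
    _Entry("bert+mfcc", {"accuracy": 0.81}, runtime=120.0),
    _Entry("tfidf+mfcc", {"accuracy": 0.78}, runtime=15.0),
]
ranked = rank_by_tradeoff(candidates, weights=(0.7, 0.3), performance_metric_name="accuracy")
for entry in ranked:  # best performance/runtime trade-off first
    print(entry.name, round(entry.tradeoff_score, 3))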
58 changes: 49 additions & 9 deletions src/main/python/systemds/scuro/drsearch/task.py
@@ -28,6 +28,37 @@
from sklearn.model_selection import KFold


class PerformanceMeasure:
def __init__(self, name, metrics, higher_is_better=True):
self.average_scores = None
self.name = name
self.metrics = metrics
self.higher_is_better = higher_is_better
self.scores = {}

if isinstance(metrics, list):
for metric in metrics:
self.scores[metric] = []
else:
self.scores[metrics] = []

def add_scores(self, scores):
if isinstance(self.metrics, list):
for metric in self.metrics:
self.scores[metric].append(scores[metric])
else:
self.scores[self.metrics].append(scores[self.metrics])

def compute_averages(self):
self.average_scores = {}
if isinstance(self.metrics, list):
for metric in self.metrics:
self.average_scores[metric] = np.mean(self.scores[metric])
else:
self.average_scores[self.metrics] = np.mean(self.scores[self.metrics])
return self


class Task:
def __init__(
self,
@@ -38,6 +69,7 @@ def __init__(
val_indices: List,
kfold=5,
measure_performance=True,
performance_measures="accuracy",
):
"""
Parent class for the prediction task that is performed on top of the aligned representation
@@ -59,8 +91,9 @@ def __init__(
self.inference_time = []
self.training_time = []
self.expected_dim = 1
self.train_scores = []
self.val_scores = []
self.performance_measures = performance_measures
self.train_scores = PerformanceMeasure("train", performance_measures)
self.val_scores = PerformanceMeasure("val", performance_measures)

def create_model(self):
"""
@@ -74,8 +107,12 @@ def create_model(self):
def get_train_test_split(self, data):
X_train = [data[i] for i in self.train_indices]
y_train = [self.labels[i] for i in self.train_indices]
X_test = [data[i] for i in self.val_indices]
y_test = [self.labels[i] for i in self.val_indices]
if self.val_indices is None:
X_test = None
y_test = None
else:
X_test = [data[i] for i in self.val_indices]
y_test = [self.labels[i] for i in self.val_indices]

return X_train, y_train, X_test, y_test

@@ -101,25 +138,28 @@ def run(self, data):
self._run_fold(model, train_X, train_y, test_X, test_y)
fold += 1

return [np.mean(self.train_scores), np.mean(self.val_scores)]
return [
self.train_scores.compute_averages(),
self.val_scores.compute_averages(),
]

def _reset_params(self):
self.inference_time = []
self.training_time = []
self.train_scores = []
self.val_scores = []
self.train_scores = PerformanceMeasure("train", self.performance_measures)
self.val_scores = PerformanceMeasure("val", self.performance_measures)

def _run_fold(self, model, train_X, train_y, test_X, test_y):
train_start = time.time()
train_score = model.fit(train_X, train_y, test_X, test_y)
train_end = time.time()
self.training_time.append(train_end - train_start)
self.train_scores.append(train_score)
self.train_scores.add_scores(train_score[0])
test_start = time.time()
test_score = model.test(np.array(test_X), test_y)
test_end = time.time()
self.inference_time.append(test_end - test_start)
self.val_scores.append(test_score)
self.val_scores.add_scores(test_score[0])

def create_representation_and_run(
self,
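
To show how the new PerformanceMeasure accumulates per-fold scores, a short sketch assuming a single "accuracy" metric; the fold values are made up for illustration.

from systemds.scuro.drsearch.task import PerformanceMeasure

val = PerformanceMeasure("val", "accuracy")
for fold_result in ({"accuracy": 0.80}, {"accuracy": 0.84}, {"accuracy": 0.82}):  # made-up fold scores
    val.add_scores(fold_result)
val.compute_averages()
print(val.scores["accuracy"])          # [0.8, 0.84, 0.82]
print(val.average_scores["accuracy"])  # ~0.82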