Commit 5e4044b

mpolson64 authored and facebook-github-bot committed
Remove unnecessary Curve metrics (facebook#2486)
Summary: Pull Request resolved: facebook#2486
Reviewed By: Balandat
Differential Revision: D57922964
fbshipit-source-id: 893efaf9311023d6398c95d48261fac6bc9f2138
1 parent 128cfdd commit 5e4044b

1 file changed

ax/metrics/curve.py

Lines changed: 0 additions & 197 deletions
@@ -18,20 +18,16 @@
from logging import Logger
from typing import Any, Dict, Iterable, Optional, Set, Union

-import numpy as np
import pandas as pd
from ax.core.base_trial import BaseTrial
from ax.core.batch_trial import BatchTrial
from ax.core.experiment import Experiment
from ax.core.map_data import MapData, MapKeyInfo
from ax.core.map_metric import MapMetric
from ax.core.metric import Metric, MetricFetchE, MetricFetchResult
-from ax.core.trial import Trial
-from ax.early_stopping.utils import align_partial_results
from ax.exceptions.core import UnsupportedError
from ax.utils.common.logger import get_logger
from ax.utils.common.result import Err, Ok
-from ax.utils.common.typeutils import checked_cast


logger: Logger = get_logger(__name__)

@@ -268,66 +264,6 @@ def get_df_from_curve_series(
        )


-class AbstractScalarizedCurveMetric(AbstractCurveMetric):
-    """A linear scalarization of (partial) learning curves of ML model training jobs:
-
-        scalarized_curve = offset + sum_i(coefficients[i] * curve[i]).
-
-    It is assumed that the output of `get_curves_from_ids` contains all of the curves
-    necessary for performing the scalarization.
-    """
-
-    def __init__(
-        self,
-        name: str,
-        coefficients: Dict[str, float],
-        offset: float = 0.0,
-        lower_is_better: bool = True,
-        cumulative_best: bool = False,
-        smoothing_window: Optional[int] = None,
-    ) -> None:
-        """Construct a AbstractScalarizedCurveMetric.
-
-        Args:
-            name: Name of metric.
-            coefficients: A mapping from learning curve names to their
-                scalarization coefficients.
-            offset: The offset of the affine scalarization.
-            lower_is_better: If True, lower values (of the scalarized metric) are
-                considered better.
-            cumulative_best: If True, for each trial, apply cumulative best to
-                the curve (i.e., if lower is better, then we return a curve
-                representing the cumulative min of the raw curve).
-            smoothing_window: If not None, specifies the window size used for a
-                rolling mean applied to the raw curve data. This can be helpful
-                if the underlying data is expected to be very noisy.
-        """
-        MapMetric.__init__(self, name=name, lower_is_better=lower_is_better)
-        self.coefficients = coefficients
-        self.offset = offset
-        self.cumulative_best = cumulative_best
-        self.smoothing_window = smoothing_window
-
-    @property
-    def curve_names(self) -> Set[str]:
-        return set(self.coefficients.keys())
-
-    def get_df_from_curve_series(
-        self,
-        experiment: Experiment,
-        all_curve_series: Dict[Union[int, str], Dict[str, pd.Series]],
-        metrics: Iterable[Metric],
-        trial_idx_to_id: Dict[int, Union[int, str]],
-    ) -> Optional[pd.DataFrame]:
-        return get_df_from_scalarized_curve_series(
-            experiment=experiment,
-            all_curve_series=all_curve_series,
-            metrics=metrics,
-            trial_idx_to_id=trial_idx_to_id,
-            map_key=self.map_key_info.key,
-        )
-
-
def get_df_from_curve_series(
    experiment: Experiment,
    all_curve_series: Dict[Union[int, str], Dict[str, pd.Series]],
@@ -377,91 +313,6 @@ def get_df_from_curve_series(
    return pd.concat(dfs, axis=0, ignore_index=True)


-def get_df_from_scalarized_curve_series(
-    experiment: Experiment,
-    all_curve_series: Dict[Union[int, str], Dict[str, pd.Series]],
-    metrics: Iterable[Metric],
-    trial_idx_to_id: Dict[int, Union[int, str]],
-    map_key: str,
-) -> Optional[pd.DataFrame]:
-    """Convert a `all_curve_series` dict (from `get_curves_from_ids`) into
-    a dataframe. For each metric, we first get all curves represented in
-    `coefficients` and then perform scalarization.
-
-    Args:
-        experiment: The experiment.
-        all_curve_series: A dict containing curve data, as output from
-            `get_curves_from_ids`.
-        metrics: The metrics from which data is being fetched.
-        trial_idx_to_id: A dict mapping trial index to ids.
-        map_key: The progression key of the metric's MapKeyInfo.
-
-    Returns:
-        A dataframe containing curve data or None if no curve data could be found.
-    """
-    dfs = []
-    complete_metrics_by_trial = {trial_idx: [] for trial_idx in trial_idx_to_id.keys()}
-    for trial_idx, id_ in trial_idx_to_id.items():
-        if id_ not in all_curve_series:
-            logger.info(f"Could not get curve data for id {id_}. Ignoring.")
-            continue
-        curve_series = all_curve_series[id_]
-        for m in metrics:
-            curve_dfs = []
-            for curve_name in m.coefficients.keys():  # pyre-ignore[16]
-                if curve_name in curve_series:
-                    curve_df = _get_single_curve(
-                        curve_series=curve_series,
-                        curve_name=curve_name,
-                        map_key=map_key,
-                        trial=experiment.trials[trial_idx],
-                        cumulative_best=m.cumulative_best,  # pyre-ignore[16]
-                        lower_is_better=m.lower_is_better,  # pyre-ignore[6]
-                        smoothing_window=m.smoothing_window,  # pyre-ignore[16]
-                    )
-                    curve_dfs.append(curve_df)
-                else:
-                    logger.info(
-                        f"{curve_name} not present in curves from {id_}, so the "
-                        f"scalarization for {m.name} cannot be computed. Returning "
-                        "without this metric."
-                    )
-                    break
-            if len(curve_dfs) == len(m.coefficients):
-                # only keep if all curves needed by the metric are available
-                dfs.extend(curve_dfs)
-                # mark metrics who have all underlying curves
-                complete_metrics_by_trial[trial_idx].append(m)
-
-    if len(dfs) == 0:
-        return None
-
-    all_data_df = pd.concat(dfs, axis=0, ignore_index=True)
-    sub_dfs = []
-    # Do not create a common index across trials, only across the curves
-    # involved in the scalarized metric.
-    for trial_idx, dfi in all_data_df.groupby("trial_index"):
-        # the `do_forward_fill = True` pads with the latest
-        # observation to handle situations where learning curves
-        # report different amounts of data.
-        trial_curves = dfi["metric_name"].unique().tolist()
-        dfs_mean, dfs_sem = align_partial_results(
-            dfi,
-            progr_key=map_key,
-            metrics=trial_curves,
-            do_forward_fill=True,
-        )
-        for metric in complete_metrics_by_trial[trial_idx]:
-            sub_df = _get_scalarized_curve_metric_sub_df(
-                dfs_mean=dfs_mean,
-                dfs_sem=dfs_sem,
-                metric=metric,
-                trial=checked_cast(Trial, experiment.trials[trial_idx]),
-            )
-            sub_dfs.append(sub_df)
-    return pd.concat(sub_dfs, axis=0, ignore_index=True)
-
-
def _get_single_curve(
    curve_series: Dict[str, pd.Series],
    curve_name: str,
@@ -492,51 +343,3 @@ def _get_single_curve(
        dfi["mean"] = dfi["mean"].cummin() if lower_is_better else dfi["mean"].cummax()
    # pyre-fixme[7]: Expected `DataFrame` but got `Optional[DataFrame]`.
    return dfi.drop_duplicates()
-
-
-def _get_scalarized_curve_metric_sub_df(
-    dfs_mean: Dict[str, pd.DataFrame],
-    dfs_sem: Dict[str, pd.DataFrame],
-    metric: AbstractScalarizedCurveMetric,
-    trial: Trial,
-) -> pd.DataFrame:
-    """Helper to construct sub-dfs for a ScalarizedCurveMetric.
-
-    Args:
-        df_mean: A mapping from Curve metric names to a dataframe
-            containing the means of the respective metric. The progression
-            indices are assumed to be aliged across metrics (e.g. as
-            obtained via `align_partial_results`).
-        df_sem: A mapping from Curve metric names to a dataframe
-            containing the sems of the respective metric. If empty,
-            assume the metrics are subject to noise of unknown magnitude.
-        metric: The ScalarizedCurveMetric to perform the aggregation for.
-        trial: The trial associated with the data in `df_mean` and `df_sem`.
-
-    Returns:
-        A dataframe with the scalarized mean and sem in `mean` and `sem`
-        columns, respectively.
-    """
-    sub_df = metric.offset + sum(
-        coeff * dfs_mean[metric] for metric, coeff in metric.coefficients.items()
-    )
-    sub_df = sub_df.rename(columns={trial.index: "mean"})  # pyre-ignore [16]
-    if dfs_sem:
-        var_df = sum(
-            (coeff * dfs_sem[metric]) ** 2
-            for metric, coeff in metric.coefficients.items()
-        )
-        sem_df = var_df.apply(np.sqrt).rename(  # pyre-ignore [16]
-            columns={trial.index: "sem"}
-        )
-        sub_df = pd.concat([sub_df, sem_df], axis=1)
-    else:
-        sub_df["sem"] = float("nan")
-    sub_df = sub_df.reset_index()
-    sub_df["trial_index"] = trial.index
-    sub_df["arm_name"] = trial.arm.name  # pyre-ignore [16]
-    sub_df["metric_name"] = metric.name
-    # When scalarizing curves, sometimes the last progression will be different
-    # across curves, even for the same trial. This dropna() will only keep the
-    # progressions that are available for all curves.
-    return sub_df.dropna(subset=["mean"])
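
For context, the removed AbstractScalarizedCurveMetric computed an affine scalarization of several learning curves, scalarized_curve = offset + sum_i(coefficients[i] * curve[i]), with SEMs propagated under an independent-noise assumption. The sketch below reproduces that arithmetic on toy pandas Series; the curve names, coefficients, and values are hypothetical, and the code is not part of this commit or of the Ax API.

# Illustrative sketch only; mirrors the arithmetic of the removed helpers on made-up data.
import numpy as np
import pandas as pd

# Hypothetical per-curve means and SEMs, indexed by training progression.
means = {
    "train_loss": pd.Series([1.0, 0.8, 0.7], index=[10, 20, 30]),
    "val_loss": pd.Series([1.2, 1.0, 0.9], index=[10, 20, 30]),
}
sems = {
    "train_loss": pd.Series([0.05, 0.04, 0.04], index=[10, 20, 30]),
    "val_loss": pd.Series([0.10, 0.08, 0.07], index=[10, 20, 30]),
}
coefficients = {"train_loss": 0.3, "val_loss": 0.7}
offset = 0.0

# scalarized_curve = offset + sum_i(coefficients[i] * curve[i])
scalarized_mean = offset + sum(c * means[name] for name, c in coefficients.items())

# SEM of the scalarization, assuming independent noise per curve:
# sem = sqrt(sum_i (coefficients[i] * sem_i) ** 2)
scalarized_sem = np.sqrt(
    sum((c * sems[name]) ** 2 for name, c in coefficients.items())
)

# The removed _get_single_curve applied smoothing and cumulative-best per raw
# curve (before scalarization); shown here on the scalarized series only for
# illustration of the smoothing_window and cumulative_best options.
smoothed = scalarized_mean.rolling(window=2, min_periods=1).mean()
running_best = scalarized_mean.cummin()  # lower_is_better=True

print(pd.DataFrame({"mean": scalarized_mean, "sem": scalarized_sem}))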
