Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 101 additions & 3 deletions bofire/data_models/domain/domain.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,31 @@ def validate_constraints(self):
c.validate_inputs(self.inputs)
return self

@model_validator(mode="after")
def validate_timeseries_config(self):
    """Ensure that at most one input feature is flagged as a timeseries.

    Every input that is a ``NumericalInput`` and whose ``is_timeseries``
    attribute is truthy is collected; the domain is rejected as soon as
    more than one such feature exists.

    Raises:
        ValueError: If more than one feature is marked as timeseries.

    Returns:
        self: The validated domain instance.
    """
    # Function-scope import, kept as in the original
    # (presumably to avoid a circular import -- TODO confirm).
    from bofire.data_models.features.numerical import NumericalInput

    flagged_keys = []
    for feature in self.inputs:
        # Only numerical inputs can act as the timeseries axis.
        if not isinstance(feature, NumericalInput):
            continue
        if getattr(feature, "is_timeseries", False):
            flagged_keys.append(feature.key)

    # Zero or one flagged feature is a valid configuration.
    if len(flagged_keys) <= 1:
        return self

    raise ValueError(
        f"Multiple features ({', '.join(flagged_keys)}) are marked as timeseries. Only one is allowed."
    )

# TODO: tidy this up
def get_nchoosek_combinations(self, exhaustive: bool = False):
"""Get all possible NChooseK combinations
Expand Down Expand Up @@ -310,9 +335,9 @@ def aggregate_by_duplicates(
experiments = self.coerce_invalids(experiments)

# group and aggregate
agg: Dict[str, Any] = {
feat: method for feat in self.outputs.get_keys(ContinuousOutput)
}
agg: Dict[str, Any] = dict.fromkeys(
self.outputs.get_keys(ContinuousOutput), method
)
agg["labcode"] = lambda x: delimiter.join(sorted(x.tolist()))
for feat in self.outputs.get_keys(Output):
agg[f"valid_{feat}"] = lambda x: 1
Expand Down Expand Up @@ -383,6 +408,79 @@ def validate_experiments(
strict=strict,
)
experiments = self.outputs.validate_experiments(experiments=experiments)

# Check for _trajectory_id if timeseries features are present
from bofire.data_models.features.numerical import NumericalInput

timeseries_features = [
f.key
for f in self.inputs
if isinstance(f, NumericalInput) and getattr(f, "is_timeseries", False)
]

if len(timeseries_features) > 0:
trajectory_col = "_trajectory_id"
if trajectory_col not in experiments.columns:
raise ValueError(
f"Timeseries feature '{timeseries_features[0]}' detected, but required column '{trajectory_col}' "
f"is not present in the experiments. When using timeseries features, you must include a "
f"'{trajectory_col}' column that identifies which trajectory/experiment each row belongs to."
)

return experiments

def add_trajectory_id(
    self,
    experiments: pd.DataFrame,
    eps: float = 1e-6,
) -> pd.DataFrame:
    """Return a copy of ``experiments`` with an inferred ``_trajectory_id`` column.

    The grouping itself is delegated to
    ``bofire.utils.timeseries.infer_trajectory_id``, which is called with a
    copy of the experiments, this domain and ``eps``. Rows that agree on
    all non-timeseries input features receive the same trajectory ID,
    which is handy when several runs were recorded without labelling the
    run each observation belongs to.

    Args:
        experiments (pd.DataFrame): Experimental data containing a column
            for every input feature of the domain. The passed dataframe is
            not modified. An existing ``_trajectory_id`` column is
            overwritten in the returned copy.
        eps (float, optional): Tolerance forwarded to
            ``infer_trajectory_id`` for matching continuous values.
            Defaults to 1e-6.

    Returns:
        pd.DataFrame: Copy of ``experiments`` whose ``_trajectory_id``
            column holds the inferred trajectory IDs.

    Raises:
        ValueError: Propagated from ``infer_trajectory_id``, e.g. when the
            domain has no timeseries feature or required input columns
            are missing from ``experiments``.

    Example:
        >>> domain = Domain(
        ...     inputs=Inputs(features=[
        ...         ContinuousInput(key="time", bounds=(0, 100), is_timeseries=True),
        ...         ContinuousInput(key="temperature", bounds=(20, 80)),
        ...     ]),
        ...     outputs=Outputs(features=[ContinuousOutput(key="yield")]),
        ... )
        >>> experiments = pd.DataFrame({
        ...     'time': [0, 10, 20, 0, 10, 20],
        ...     'temperature': [25, 25, 25, 30, 30, 30],
        ...     'yield': [0.1, 0.2, 0.3, 0.2, 0.3, 0.4],
        ...     'valid_yield': [1] * 6,
        ... })
        >>> labelled = domain.add_trajectory_id(experiments)
        >>> # temperature=25 rows share one ID, temperature=30 rows another
    """
    from bofire.utils.timeseries import infer_trajectory_id

    # Work on a copy so the caller's dataframe stays untouched.
    labelled = experiments.copy()
    trajectory_ids = infer_trajectory_id(labelled, self, eps=eps)
    labelled["_trajectory_id"] = trajectory_ids
    return labelled

def describe_experiments(self, experiments: pd.DataFrame) -> pd.DataFrame:
Expand Down
5 changes: 5 additions & 0 deletions bofire/data_models/features/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import numpy as np
import pandas as pd
from pydantic import Field

from bofire.data_models.features.feature import Input, TTransform

Expand All @@ -11,6 +12,10 @@ class NumericalInput(Input):
"""Abstract base class for all numerical (ordinal) input features."""

unit: Optional[str] = None
is_timeseries: bool = Field(
default=False,
description="Field to mark if this feature represents time in a timeseries",
)

@staticmethod
def valid_transform_types() -> List:
Expand Down
42 changes: 39 additions & 3 deletions bofire/surrogates/trainable.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,12 @@

import numpy as np
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit, KFold, StratifiedKFold
from sklearn.model_selection import (
GroupKFold,
GroupShuffleSplit,
KFold,
StratifiedKFold,
)

from bofire.data_models.enum import OutputFilteringEnum
from bofire.data_models.features.api import (
Expand Down Expand Up @@ -85,6 +90,7 @@ def cross_validate(
random_state: Optional[int] = None,
stratified_feature: Optional[str] = None,
group_split_column: Optional[str] = None,
use_shuffle_split: bool = False,
hooks: Optional[
Dict[
str,
Expand Down Expand Up @@ -118,6 +124,9 @@ def cross_validate(
training and testing sets. This is useful in scenarios where data points are related or dependent on each
other, and splitting them into different sets would violate the assumption of independence. The number of
unique groups must be greater than or equal to the number of folds. Defaults to None.
use_shuffle_split (bool, optional): When group_split_column is provided, use GroupShuffleSplit
instead of GroupKFold. GroupKFold (default) ensures each group is tested exactly once,
while GroupShuffleSplit allows flexible test_size but may not test all groups. Defaults to False.
hooks (Dict[str, Callable[[Model, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame], Any]], optional):
Dictionary of callable hooks that are called within the CV loop. The callable retrieves the current trained
modeld and the current CV folds in the following order: X_train, y_train, X_test, y_test. Defaults to {}.
Expand Down Expand Up @@ -156,6 +165,26 @@ def cross_validate(
"The feature to be stratified needs to be a DiscreteInput, CategoricalInput, CategoricalOutput, or ContinuousOutput",
)

# Auto-detect timeseries and use appropriate group split
if group_split_column is None:
# Check if any input feature is marked as timeseries
timeseries_features = [
feat
for feat in self.inputs.get() # type: ignore
if hasattr(feat, "is_timeseries") and feat.is_timeseries
]
if len(timeseries_features) > 0:
# When timeseries feature is present, require _trajectory_id column
trajectory_col = "_trajectory_id"
if trajectory_col in experiments.columns:
group_split_column = trajectory_col
else:
raise ValueError(
f"Timeseries feature '{timeseries_features[0].key}' detected, but required column '{trajectory_col}' "
f"is not present in the experiments. When using timeseries features, you must include a "
f"'{trajectory_col}' column that identifies which trajectory/experiment each row belongs to."
)

if group_split_column is not None:
# check if the group split column is present in the experiments
if group_split_column not in experiments.columns:
Expand Down Expand Up @@ -187,6 +216,7 @@ def cross_validate(
stratified_feature=stratified_feature,
group_split_column=group_split_column,
random_state=random_state,
use_shuffle_split=use_shuffle_split,
)

key = self.outputs.get_keys()[0] # type: ignore
Expand Down Expand Up @@ -301,8 +331,9 @@ def _make_cv_split(
stratified_feature: Optional[str] = None,
group_split_column: Optional[str] = None,
random_state: Optional[int] = None,
use_shuffle_split: bool = False,
) -> Tuple[
Union[KFold, StratifiedKFold, GroupShuffleSplit],
Union[KFold, StratifiedKFold, GroupKFold, GroupShuffleSplit],
Generator[Tuple[np.ndarray, np.ndarray], None, None],
]:
"""
Expand All @@ -321,7 +352,12 @@ def _make_cv_split(
if stratified_feature is None:
if group_split_column is not None:
# GROUP SPLIT FUNCTIONALITY
cv = GroupShuffleSplit(n_splits=folds, random_state=random_state)
if use_shuffle_split:
# Use GroupShuffleSplit for flexible test_size
cv = GroupShuffleSplit(n_splits=folds, random_state=random_state)
else:
# Use GroupKFold for exhaustive testing (default)
cv = GroupKFold(n_splits=folds)
cv_func = cv.split(experiments, groups=experiments[group_split_column])
else:
cv = KFold(n_splits=folds, shuffle=True, random_state=random_state)
Expand Down
152 changes: 152 additions & 0 deletions bofire/utils/timeseries.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
"""Utilities for working with timeseries data in BoFire."""

from typing import TYPE_CHECKING

import numpy as np
import pandas as pd


if TYPE_CHECKING:
from bofire.data_models.domain.domain import Domain


def infer_trajectory_id(
    experiments: pd.DataFrame,
    domain: "Domain",
    eps: float = 1e-6,
) -> pd.Series:
    """Infer trajectory IDs by grouping rows on their non-timeseries inputs.

    Every row of ``experiments`` is reduced to a grouping key built from the
    values of all input features except the timeseries feature. Rows with
    equal keys receive the same trajectory ID; IDs are numbered 0, 1, 2, ...
    in order of first appearance.

    Continuous values are made comparable by rounding them to
    ``max(0, int(-log10(eps)))`` decimal places, so ``eps`` acts as a
    binning resolution rather than an exact pairwise distance: two values
    closer than ``eps`` that straddle a rounding boundary still end up in
    different trajectories. Missing continuous values all map to the same
    key component (``None``).

    This is useful when the data stems from several runs in which only the
    timeseries feature varies, but the run membership was never recorded.

    Args:
        experiments: DataFrame with experimental data. Must contain a column
            for every non-timeseries input feature of the domain.
        domain: Domain whose inputs define the features and mark the
            timeseries feature via ``is_timeseries=True``. If several
            features are marked, the first one found is used.
        eps: Tolerance controlling the rounding of continuous values (see
            above). Must be a positive, finite number. Default: 1e-6.
            Categorical, discrete and molecular features are always
            compared exactly.

    Returns:
        pd.Series: Integer trajectory IDs aligned with ``experiments.index``.
            An empty ``experiments`` frame yields an empty integer Series.

    Raises:
        ValueError: If ``eps`` is not a positive, finite number.
        ValueError: If no timeseries feature is found in the domain.
        ValueError: If required input feature columns are missing from
            experiments.

    Example:
        >>> from bofire.data_models.domain.api import Domain, Inputs, Outputs
        >>> from bofire.data_models.features.api import ContinuousInput, ContinuousOutput
        >>> from bofire.utils.timeseries import infer_trajectory_id
        >>> import pandas as pd
        >>>
        >>> inputs = Inputs(features=[
        ...     ContinuousInput(key="time", bounds=(0, 100), is_timeseries=True),
        ...     ContinuousInput(key="temperature", bounds=(20, 80)),
        ... ])
        >>> outputs = Outputs(features=[ContinuousOutput(key="yield")])
        >>> domain = Domain(inputs=inputs, outputs=outputs)
        >>>
        >>> # Three blocks of rows; the first and the last share
        >>> # temperature=25 and are therefore merged into one trajectory.
        >>> experiments = pd.DataFrame({
        ...     'time': [0, 10, 20, 0, 10, 20, 0, 10, 20],
        ...     'temperature': [25, 25, 25, 30, 30, 30, 25, 25, 25],
        ...     'yield': [0.1, 0.2, 0.3, 0.2, 0.3, 0.4, 0.1, 0.2, 0.3],
        ...     'valid_yield': [1] * 9,
        ... })
        >>> experiments['_trajectory_id'] = infer_trajectory_id(experiments, domain)
        >>> print(experiments['_trajectory_id'].tolist())
        [0, 0, 0, 1, 1, 1, 0, 0, 0]
    """
    from bofire.data_models.features.categorical import CategoricalInput
    from bofire.data_models.features.discrete import DiscreteInput
    from bofire.data_models.features.molecular import MolecularInput
    from bofire.data_models.features.numerical import NumericalInput

    # Fail early with a clear message instead of an OverflowError/ValueError
    # raised by int(-log10(eps)) somewhere inside the row processing.
    if not (np.isfinite(eps) and eps > 0):
        raise ValueError(f"eps must be a positive, finite number, got {eps}.")

    # Identify timeseries feature
    timeseries_features = [
        f
        for f in domain.inputs
        if isinstance(f, NumericalInput) and getattr(f, "is_timeseries", False)
    ]

    if not timeseries_features:
        raise ValueError(
            "No timeseries feature found in the domain. "
            "At least one input feature must be marked with is_timeseries=True."
        )

    timeseries_key = timeseries_features[0].key

    # All remaining inputs define which rows belong together.
    grouping_features = [f for f in domain.inputs if f.key != timeseries_key]

    if not grouping_features:
        # Only the timeseries feature exists: everything is one trajectory.
        return pd.Series(0, index=experiments.index)

    missing_cols = [
        f.key for f in grouping_features if f.key not in experiments.columns
    ]
    if missing_cols:
        raise ValueError(
            f"Required input feature columns missing from experiments: {missing_cols}"
        )

    if len(experiments) == 0:
        # Nothing to group; keep the integer dtype of the regular result.
        return pd.Series([], index=experiments.index, dtype="int64")

    # Loop-invariant settings, computed once instead of per row and feature.
    decimal_places = max(0, int(-np.log10(eps)))
    # DiscreteInput is listed among the exact types so that it is matched
    # exactly even if it also passes the NumericalInput check below.
    exact_types = (CategoricalInput, DiscreteInput, MolecularInput)

    # Build one list of key components per grouping feature (column-wise).
    key_columns = []
    for feature in grouping_features:
        values = experiments[feature.key].tolist()
        is_rounded = isinstance(feature, NumericalInput) and not isinstance(
            feature, exact_types
        )
        if is_rounded:
            # Continuous feature: round to absorb floating point noise and
            # map missing values to a single, hashable placeholder.
            values = [
                None if pd.isna(v) else round(float(v), decimal_places)
                for v in values
            ]
        key_columns.append(values)

    # One hashable tuple per row, aligned with the original index.
    grouping_keys = pd.Series(
        list(zip(*key_columns)),
        index=experiments.index,
        dtype=object,
    )

    # Number the distinct keys in order of first appearance.
    unique_keys = grouping_keys.unique()
    key_to_id = {key: idx for idx, key in enumerate(unique_keys)}

    return grouping_keys.map(key_to_id)
1 change: 1 addition & 0 deletions docs/examples.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ Optimization with some of the challenges faced in real-world experiments:
- [A toy example for optimizing a reaction](reaction_optimization.ipynb)
- [Using a Tanimoto fingerprint kernel to optimize over molecules](fingerprint_bayesopt.ipynb)
- [Using a MultiFidelity strategy with cheap, approximate experiments](multifidelity_bo.ipynb)
- [Handling timeseries data and kinetic measurements](timeseries_kinetic_measurements.ipynb)

## API with BoFire

Expand Down
Loading
Loading