diff --git a/bofire/data_models/domain/domain.py b/bofire/data_models/domain/domain.py
index 73f6ed8c2..1120340ca 100644
--- a/bofire/data_models/domain/domain.py
+++ b/bofire/data_models/domain/domain.py
@@ -138,6 +138,31 @@ def validate_constraints(self):
c.validate_inputs(self.inputs)
return self
+ @model_validator(mode="after")
+ def validate_timeseries_config(self):
+ """Validate the timeseries configuration of the domain.
+
+ Raises:
+ ValueError: If multiple features are marked as timeseries.
+
+ Returns:
+ self: The validated domain instance.
+ """
+ # Get all numerical inputs that have the is_timeseries attribute
+ from bofire.data_models.features.numerical import NumericalInput
+
+ timeseries_features = [
+ f.key
+ for f in self.inputs
+ if isinstance(f, NumericalInput) and getattr(f, "is_timeseries", False)
+ ]
+
+ if len(timeseries_features) > 1:
+ raise ValueError(
+ f"Multiple features ({', '.join(timeseries_features)}) are marked as timeseries. Only one is allowed."
+ )
+ return self
+
# TODO: tidy this up
def get_nchoosek_combinations(self, exhaustive: bool = False):
"""Get all possible NChooseK combinations
@@ -310,9 +335,9 @@ def aggregate_by_duplicates(
experiments = self.coerce_invalids(experiments)
# group and aggregate
- agg: Dict[str, Any] = {
- feat: method for feat in self.outputs.get_keys(ContinuousOutput)
- }
+ agg: Dict[str, Any] = dict.fromkeys(
+ self.outputs.get_keys(ContinuousOutput), method
+ )
agg["labcode"] = lambda x: delimiter.join(sorted(x.tolist()))
for feat in self.outputs.get_keys(Output):
agg[f"valid_{feat}"] = lambda x: 1
@@ -383,6 +408,79 @@ def validate_experiments(
strict=strict,
)
experiments = self.outputs.validate_experiments(experiments=experiments)
+
+ # Check for _trajectory_id if timeseries features are present
+ from bofire.data_models.features.numerical import NumericalInput
+
+ timeseries_features = [
+ f.key
+ for f in self.inputs
+ if isinstance(f, NumericalInput) and getattr(f, "is_timeseries", False)
+ ]
+
+ if len(timeseries_features) > 0:
+ trajectory_col = "_trajectory_id"
+ if trajectory_col not in experiments.columns:
+ raise ValueError(
+ f"Timeseries feature '{timeseries_features[0]}' detected, but required column '{trajectory_col}' "
+ f"is not present in the experiments. When using timeseries features, you must include a "
+ f"'{trajectory_col}' column that identifies which trajectory/experiment each row belongs to."
+ )
+
+ return experiments
+
+ def add_trajectory_id(
+ self,
+ experiments: pd.DataFrame,
+ eps: float = 1e-6,
+ ) -> pd.DataFrame:
+ """Add _trajectory_id column to experiments by inferring trajectory groupings.
+
+ This method automatically groups experiments into trajectories based on their
+ non-timeseries input feature values. Experiments with the same (or nearly same,
+ within eps) values for all non-timeseries features are assigned the same
+ trajectory ID.
+
+ This is useful when you have experimental data from multiple runs/trajectories
+ but haven't manually labeled which observations belong to which trajectory.
+
+ Args:
+ experiments (pd.DataFrame): Dataframe with experimental data. Must contain
+ columns for all input features defined in the domain. If _trajectory_id
+ already exists, it will be overwritten.
+ eps (float, optional): Tolerance for comparing continuous values. Two
+ continuous values are considered equal if their absolute difference is
+ less than eps. Default: 1e-6. Does not apply to discrete, categorical,
+ or molecular features.
+
+ Returns:
+ pd.DataFrame: Copy of experiments with _trajectory_id column added or updated.
+
+ Raises:
+ ValueError: If no timeseries feature is found in the domain.
+ ValueError: If required input feature columns are missing from experiments.
+
+ Example:
+ >>> domain = Domain(
+ ... inputs=Inputs(features=[
+ ... ContinuousInput(key="time", bounds=(0, 100), is_timeseries=True),
+ ... ContinuousInput(key="temperature", bounds=(20, 80)),
+ ... ]),
+ ... outputs=Outputs(features=[ContinuousOutput(key="yield")]),
+ ... )
+ >>> experiments = pd.DataFrame({
+ ... 'time': [0, 10, 20, 0, 10, 20],
+ ... 'temperature': [25, 25, 25, 30, 30, 30],
+ ... 'yield': [0.1, 0.2, 0.3, 0.2, 0.3, 0.4],
+ ... 'valid_yield': [1] * 6,
+ ... })
+ >>> experiments = domain.add_trajectory_id(experiments)
+ >>> # Rows with temperature=25 get one ID, temperature=30 get another
+ """
+ from bofire.utils.timeseries import infer_trajectory_id
+
+ experiments = experiments.copy()
+ experiments["_trajectory_id"] = infer_trajectory_id(experiments, self, eps=eps)
return experiments
def describe_experiments(self, experiments: pd.DataFrame) -> pd.DataFrame:
diff --git a/bofire/data_models/features/numerical.py b/bofire/data_models/features/numerical.py
index a8e4aa2d0..278610a67 100644
--- a/bofire/data_models/features/numerical.py
+++ b/bofire/data_models/features/numerical.py
@@ -3,6 +3,7 @@
import numpy as np
import pandas as pd
+from pydantic import Field
from bofire.data_models.features.feature import Input, TTransform
@@ -11,6 +12,10 @@ class NumericalInput(Input):
"""Abstract base class for all numerical (ordinal) input features."""
unit: Optional[str] = None
+ is_timeseries: bool = Field(
+ default=False,
+ description="Field to mark if this feature represents time in a timeseries",
+ )
@staticmethod
def valid_transform_types() -> List:
diff --git a/bofire/surrogates/trainable.py b/bofire/surrogates/trainable.py
index eb91ddd65..1280eb982 100644
--- a/bofire/surrogates/trainable.py
+++ b/bofire/surrogates/trainable.py
@@ -4,7 +4,12 @@
import numpy as np
import pandas as pd
-from sklearn.model_selection import GroupShuffleSplit, KFold, StratifiedKFold
+from sklearn.model_selection import (
+ GroupKFold,
+ GroupShuffleSplit,
+ KFold,
+ StratifiedKFold,
+)
from bofire.data_models.enum import OutputFilteringEnum
from bofire.data_models.features.api import (
@@ -85,6 +90,7 @@ def cross_validate(
random_state: Optional[int] = None,
stratified_feature: Optional[str] = None,
group_split_column: Optional[str] = None,
+ use_shuffle_split: bool = False,
hooks: Optional[
Dict[
str,
@@ -118,6 +124,9 @@ def cross_validate(
training and testing sets. This is useful in scenarios where data points are related or dependent on each
other, and splitting them into different sets would violate the assumption of independence. The number of
unique groups must be greater than or equal to the number of folds. Defaults to None.
+ use_shuffle_split (bool, optional): When group_split_column is provided, use GroupShuffleSplit
+ instead of GroupKFold. GroupKFold (default) ensures each group is tested exactly once,
+ while GroupShuffleSplit allows flexible test_size but may not test all groups. Defaults to False.
hooks (Dict[str, Callable[[Model, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame], Any]], optional):
Dictionary of callable hooks that are called within the CV loop. The callable retrieves the current trained
modeld and the current CV folds in the following order: X_train, y_train, X_test, y_test. Defaults to {}.
@@ -156,6 +165,26 @@ def cross_validate(
"The feature to be stratified needs to be a DiscreteInput, CategoricalInput, CategoricalOutput, or ContinuousOutput",
)
+ # Auto-detect timeseries and use appropriate group split
+ if group_split_column is None:
+ # Check if any input feature is marked as timeseries
+ timeseries_features = [
+ feat
+ for feat in self.inputs.get() # type: ignore
+ if hasattr(feat, "is_timeseries") and feat.is_timeseries
+ ]
+ if len(timeseries_features) > 0:
+ # When timeseries feature is present, require _trajectory_id column
+ trajectory_col = "_trajectory_id"
+ if trajectory_col in experiments.columns:
+ group_split_column = trajectory_col
+ else:
+ raise ValueError(
+ f"Timeseries feature '{timeseries_features[0].key}' detected, but required column '{trajectory_col}' "
+ f"is not present in the experiments. When using timeseries features, you must include a "
+ f"'{trajectory_col}' column that identifies which trajectory/experiment each row belongs to."
+ )
+
if group_split_column is not None:
# check if the group split column is present in the experiments
if group_split_column not in experiments.columns:
@@ -187,6 +216,7 @@ def cross_validate(
stratified_feature=stratified_feature,
group_split_column=group_split_column,
random_state=random_state,
+ use_shuffle_split=use_shuffle_split,
)
key = self.outputs.get_keys()[0] # type: ignore
@@ -301,8 +331,9 @@ def _make_cv_split(
stratified_feature: Optional[str] = None,
group_split_column: Optional[str] = None,
random_state: Optional[int] = None,
+ use_shuffle_split: bool = False,
) -> Tuple[
- Union[KFold, StratifiedKFold, GroupShuffleSplit],
+ Union[KFold, StratifiedKFold, GroupKFold, GroupShuffleSplit],
Generator[Tuple[np.ndarray, np.ndarray], None, None],
]:
"""
@@ -321,7 +352,12 @@ def _make_cv_split(
if stratified_feature is None:
if group_split_column is not None:
# GROUP SPLIT FUNCTIONALITY
- cv = GroupShuffleSplit(n_splits=folds, random_state=random_state)
+ if use_shuffle_split:
+ # Use GroupShuffleSplit for flexible test_size
+ cv = GroupShuffleSplit(n_splits=folds, random_state=random_state)
+ else:
+ # Use GroupKFold for exhaustive testing (default)
+ cv = GroupKFold(n_splits=folds)
cv_func = cv.split(experiments, groups=experiments[group_split_column])
else:
cv = KFold(n_splits=folds, shuffle=True, random_state=random_state)
diff --git a/bofire/utils/timeseries.py b/bofire/utils/timeseries.py
new file mode 100644
index 000000000..904b4e6b1
--- /dev/null
+++ b/bofire/utils/timeseries.py
@@ -0,0 +1,152 @@
+"""Utilities for working with timeseries data in BoFire."""
+
+from typing import TYPE_CHECKING
+
+import numpy as np
+import pandas as pd
+
+
+if TYPE_CHECKING:
+ from bofire.data_models.domain.domain import Domain
+
+
+def infer_trajectory_id(
+ experiments: pd.DataFrame,
+ domain: "Domain",
+ eps: float = 1e-6,
+) -> pd.Series:
+ """
+ Automatically infer trajectory IDs by grouping experiments with the same
+ non-timeseries input feature values.
+
+ For each row in the experiments DataFrame, this function identifies which
+ trajectory it belongs to by comparing the values of all non-timeseries input
+ features. Rows with the same (or nearly same, within eps) values for all
+ non-timeseries features are assigned the same trajectory ID.
+
+ This is useful when you have experimental data from multiple runs/trajectories
+ where the timeseries feature varies, but you haven't manually labeled which
+ observations belong to which trajectory.
+
+ Args:
+ experiments: DataFrame with experimental data. Must contain columns for
+ all input features defined in the domain.
+ domain: Domain object that defines the input features and identifies
+ which feature (if any) is marked as timeseries.
+ eps: Tolerance for comparing continuous values. Two continuous values
+ are considered equal if their absolute difference is less than eps.
+ Default: 1e-6. Does not apply to discrete, categorical, or molecular
+ features, which use exact equality.
+
+ Returns:
+ pd.Series: Series of integer trajectory IDs (0, 1, 2, ...) corresponding
+ to each row in experiments. Rows with the same trajectory ID belong
+ to the same experimental run/trajectory.
+
+ Raises:
+ ValueError: If no timeseries feature is found in the domain.
+ ValueError: If required input feature columns are missing from experiments.
+
+ Example:
+ >>> from bofire.data_models.domain.api import Domain, Inputs, Outputs
+ >>> from bofire.data_models.features.api import ContinuousInput, ContinuousOutput
+ >>> from bofire.utils.timeseries import infer_trajectory_id
+ >>> import pandas as pd
+ >>>
+ >>> # Define domain with timeseries
+ >>> inputs = Inputs(features=[
+ ... ContinuousInput(key="time", bounds=(0, 100), is_timeseries=True),
+ ... ContinuousInput(key="temperature", bounds=(20, 80)),
+ ... ])
+ >>> outputs = Outputs(features=[ContinuousOutput(key="yield")])
+ >>> domain = Domain(inputs=inputs, outputs=outputs)
+ >>>
+ >>> # Create experiments (3 trajectories, temperature varies between them)
+ >>> experiments = pd.DataFrame({
+ ... 'time': [0, 10, 20, 0, 10, 20, 0, 10, 20],
+ ... 'temperature': [25, 25, 25, 30, 30, 30, 25, 25, 25],
+ ... 'yield': [0.1, 0.2, 0.3, 0.2, 0.3, 0.4, 0.1, 0.2, 0.3],
+ ... 'valid_yield': [1] * 9,
+ ... })
+ >>>
+ >>> # Infer trajectory IDs
+ >>> experiments['_trajectory_id'] = infer_trajectory_id(experiments, domain)
+ >>> print(experiments['_trajectory_id'].tolist())
+ [0, 0, 0, 1, 1, 1, 0, 0, 0] # Rows with temp=25 get same ID
+ """
+ from bofire.data_models.features.categorical import CategoricalInput
+ from bofire.data_models.features.discrete import DiscreteInput
+ from bofire.data_models.features.molecular import MolecularInput
+ from bofire.data_models.features.numerical import NumericalInput
+
+ # Identify timeseries feature
+ timeseries_features = [
+ f
+ for f in domain.inputs
+ if isinstance(f, NumericalInput) and getattr(f, "is_timeseries", False)
+ ]
+
+ if len(timeseries_features) == 0:
+ raise ValueError(
+ "No timeseries feature found in the domain. "
+ "At least one input feature must be marked with is_timeseries=True."
+ )
+
+ timeseries_key = timeseries_features[0].key
+
+ # Get all non-timeseries input features
+ grouping_features = [f for f in domain.inputs if f.key != timeseries_key]
+
+ if len(grouping_features) == 0:
+ # Special case: only timeseries feature exists
+ # All rows belong to the same trajectory
+ return pd.Series(0, index=experiments.index)
+
+ # Check that all required columns are present
+ missing_cols = [
+ f.key for f in grouping_features if f.key not in experiments.columns
+ ]
+ if missing_cols:
+ raise ValueError(
+ f"Required input feature columns missing from experiments: {missing_cols}"
+ )
+
+ # Build a grouping key for each row based on non-timeseries features
+ # We'll create a tuple for each row representing its grouping values
+
+ def make_grouping_key(row):
+ """Create a hashable grouping key for a row."""
+ key_parts = []
+ for feature in grouping_features:
+ value = row[feature.key]
+
+ if isinstance(feature, (CategoricalInput, DiscreteInput, MolecularInput)):
+ # Categorical, discrete, and molecular features: use exact value
+ key_parts.append(value)
+ elif isinstance(feature, NumericalInput):
+ # Continuous features: round to tolerance for grouping
+ # This handles floating point comparison issues
+ if pd.isna(value):
+ key_parts.append(None)
+ else:
+ # Round to appropriate decimal places based on eps
+ decimal_places = max(0, int(-np.log10(eps)))
+ rounded_value = round(float(value), decimal_places)
+ key_parts.append(rounded_value)
+ else:
+ # Fallback: use exact value
+ key_parts.append(value)
+
+ return tuple(key_parts)
+
+ # Create grouping keys for all rows
+ grouping_keys = experiments.apply(make_grouping_key, axis=1)
+
+ # Map unique grouping keys to trajectory IDs
+ unique_keys = grouping_keys.unique()
+ key_to_id = {key: idx for idx, key in enumerate(unique_keys)}
+
+ # Assign trajectory IDs
+ trajectory_ids = grouping_keys.map(key_to_id)
+
+ return trajectory_ids
diff --git a/docs/examples.md b/docs/examples.md
index 2a6cd94d7..f126af1f3 100644
--- a/docs/examples.md
+++ b/docs/examples.md
@@ -19,6 +19,7 @@ Optimization with some of the challenges faced in real-world experiments:
- [A toy example for optimizing a reaction](reaction_optimization.ipynb)
- [Using a Tanimoto fingerprint kernel to optimize over molecules](fingerprint_bayesopt.ipynb)
- [Using a MultiFidelity strategy with cheap, approximate experiments](multifidelity_bo.ipynb)
+- [Handling timeseries data and kinetic measurements](timeseries_kinetic_measurements.ipynb)
## API with BoFire
diff --git a/tests/bofire/data_models/domain/test_domain_validators.py b/tests/bofire/data_models/domain/test_domain_validators.py
index d712734fb..bb10f3a33 100644
--- a/tests/bofire/data_models/domain/test_domain_validators.py
+++ b/tests/bofire/data_models/domain/test_domain_validators.py
@@ -11,7 +11,7 @@
ConstraintNotFulfilledError,
LinearEqualityConstraint,
)
-from bofire.data_models.domain.api import Domain
+from bofire.data_models.domain.api import Domain, Inputs, Outputs
from bofire.data_models.features.api import (
CategoricalDescriptorInput,
CategoricalInput,
@@ -405,3 +405,128 @@ def test_outputs_add_valid_columns():
experiments["valid_out1"] = _test_val
with pytest.raises(ValueError):
domain0.outputs.add_valid_columns(experiments)
+
+
+def test_domain_timeseries_validation():
+ """Test that the Domain validates timeseries configuration correctly."""
+
+ # Test 1: Single timeseries feature - should work
+ time_input = ContinuousInput(key="time", bounds=(0, 100), is_timeseries=True)
+ x_input = ContinuousInput(key="x", bounds=(0, 10))
+ y_output = ContinuousOutput(key="y")
+
+ domain = Domain(
+ inputs=Inputs(features=[time_input, x_input]),
+ outputs=Outputs(features=[y_output]),
+ )
+ # Check that the domain was created successfully
+ assert (
+ len(
+ [
+ f
+ for f in domain.inputs
+ if hasattr(f, "is_timeseries") and f.is_timeseries
+ ]
+ )
+ == 1
+ )
+
+ # Test 2: Multiple timeseries features - should fail
+ time_input2 = ContinuousInput(key="time", bounds=(0, 100), is_timeseries=True)
+ step_input2 = DiscreteInput(key="step", values=[0, 1, 2, 3], is_timeseries=True)
+ x_input2 = ContinuousInput(key="x", bounds=(0, 10))
+ y_output2 = ContinuousOutput(key="y")
+
+ with pytest.raises(
+ ValueError, match="Multiple features .* are marked as timeseries"
+ ):
+ Domain(
+ inputs=Inputs(features=[time_input2, step_input2, x_input2]),
+ outputs=Outputs(features=[y_output2]),
+ )
+
+ # Test 3: Normal domain without timeseries - should work
+ x_input3 = ContinuousInput(key="x", bounds=(0, 10))
+ y_output3 = ContinuousOutput(key="y")
+
+ domain3 = Domain(
+ inputs=Inputs(features=[x_input3]),
+ outputs=Outputs(features=[y_output3]),
+ )
+ # Check that no timeseries features exist
+ assert (
+ len(
+ [
+ f
+ for f in domain3.inputs
+ if hasattr(f, "is_timeseries") and f.is_timeseries
+ ]
+ )
+ == 0
+ )
+
+
+def test_domain_timeseries_trajectory_id_validation():
+ """Test that validate_experiments checks for _trajectory_id when timeseries features are present."""
+
+ # Create domain with timeseries feature
+ time_input = ContinuousInput(key="time", bounds=(0, 100), is_timeseries=True)
+ x_input = ContinuousInput(key="x", bounds=(0, 10))
+ y_output = ContinuousOutput(key="y")
+
+ domain = Domain(
+ inputs=Inputs(features=[time_input, x_input]),
+ outputs=Outputs(features=[y_output]),
+ )
+
+ # Test 1: Experiments with _trajectory_id - should work
+ experiments_with_trajectory = pd.DataFrame(
+ {
+ "_trajectory_id": [0, 0, 0, 1, 1, 1],
+ "time": [0, 10, 20, 0, 10, 20],
+ "x": [1, 2, 3, 4, 5, 6],
+ "y": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
+ "valid_y": [1, 1, 1, 1, 1, 1],
+ }
+ )
+
+ # Should not raise an error
+ validated = domain.validate_experiments(experiments_with_trajectory)
+ assert validated is not None
+ assert "_trajectory_id" in validated.columns
+
+ # Test 2: Experiments without _trajectory_id - should fail
+ experiments_without_trajectory = pd.DataFrame(
+ {
+ "time": [0, 10, 20, 0, 10, 20],
+ "x": [1, 2, 3, 4, 5, 6],
+ "y": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
+ "valid_y": [1, 1, 1, 1, 1, 1],
+ }
+ )
+
+ with pytest.raises(
+ ValueError,
+ match="Timeseries feature 'time' detected, but required column '_trajectory_id' is not present",
+ ):
+ domain.validate_experiments(experiments_without_trajectory)
+
+ # Test 3: Non-timeseries domain doesn't require _trajectory_id
+ domain_no_timeseries = Domain(
+ inputs=Inputs(features=[x_input]), # No timeseries feature
+ outputs=Outputs(features=[y_output]),
+ )
+
+ experiments_no_timeseries = pd.DataFrame(
+ {
+ "x": [1, 2, 3, 4, 5, 6],
+ "y": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
+ "valid_y": [1, 1, 1, 1, 1, 1],
+ }
+ )
+
+ # Should work without _trajectory_id since there's no timeseries feature
+ validated_no_ts = domain_no_timeseries.validate_experiments(
+ experiments_no_timeseries
+ )
+ assert validated_no_ts is not None
diff --git a/tests/bofire/data_models/features/test_continuous.py b/tests/bofire/data_models/features/test_continuous.py
index 5fb93f38a..952e69a95 100644
--- a/tests/bofire/data_models/features/test_continuous.py
+++ b/tests/bofire/data_models/features/test_continuous.py
@@ -7,7 +7,11 @@
from pandas.testing import assert_series_equal
import tests.bofire.data_models.specs.api as specs
-from bofire.data_models.features.api import ContinuousDescriptorInput, ContinuousInput
+from bofire.data_models.features.api import (
+ ContinuousDescriptorInput,
+ ContinuousInput,
+ DiscreteInput,
+)
def test_continuous_input_invalid_stepsize():
@@ -324,3 +328,20 @@ def test_continuous_input_feature_to_unit_range(feature, x, expected, real):
def test_continuous_input_feature_is_fixed(input_feature, expected, expected_value):
assert input_feature.is_fixed() == expected
assert input_feature.fixed_value() == expected_value
+
+
+def test_continuous_input_is_timeseries():
+ """Test that the is_timeseries flag works correctly for ContinuousInput."""
+ # Default value should be False
+ feature = ContinuousInput(key="x", bounds=(0, 10))
+ assert not feature.is_timeseries
+
+ # Should be able to set to True
+ time_feature = ContinuousInput(key="time", bounds=(0, 100), is_timeseries=True)
+ assert time_feature.is_timeseries
+
+ # DiscreteInput should also support is_timeseries
+ discrete_time_feature = DiscreteInput(
+ key="timestep", values=[0, 1, 2, 3, 4], is_timeseries=True
+ )
+ assert discrete_time_feature.is_timeseries
diff --git a/tests/bofire/data_models/specs/features.py b/tests/bofire/data_models/specs/features.py
index 5395c08a6..bdc62c80a 100644
--- a/tests/bofire/data_models/specs/features.py
+++ b/tests/bofire/data_models/specs/features.py
@@ -21,6 +21,7 @@
"values": [random.random(), random.random() + 3],
"unit": random.choice(["°C", "mg", "mmol/l", None]),
"rtol": 1e-7,
+ "is_timeseries": False,
},
)
@@ -45,6 +46,7 @@
"unit": random.choice(["°C", "mg", "mmol/l", None]),
"local_relative_bounds": None,
"stepsize": None,
+ "is_timeseries": False,
"allow_zero": False,
},
)
@@ -69,6 +71,7 @@
"key": str(uuid.uuid4()),
"bounds": [3, 5.3],
"descriptors": ["d1", "d2"],
+ "is_timeseries": False,
"values": [1.0, 2.0],
"unit": random.choice(["°C", "mg", "mmol/l", None]),
"local_relative_bounds": None,
diff --git a/tests/bofire/surrogates/test_cross_validate.py b/tests/bofire/surrogates/test_cross_validate.py
index b5430f0a5..bd6b953c9 100644
--- a/tests/bofire/surrogates/test_cross_validate.py
+++ b/tests/bofire/surrogates/test_cross_validate.py
@@ -1,6 +1,11 @@
import pandas as pd
import pytest
-from sklearn.model_selection import GroupShuffleSplit, KFold, StratifiedKFold
+from sklearn.model_selection import (
+ GroupKFold,
+ GroupShuffleSplit,
+ KFold,
+ StratifiedKFold,
+)
import bofire.surrogates.api as surrogates
from bofire.data_models.domain.api import Inputs, Outputs
@@ -645,13 +650,26 @@ def test_make_cv_split():
assert isinstance(cv, StratifiedKFold)
assert len(list(cv_func)) == 5
- # Test GroupShuffleSplit split
+ # Test GroupKFold split (default for group_split_column)
cv, cv_func = model._make_cv_split(
experiments,
folds=2,
random_state=1,
stratified_feature=None,
group_split_column="group",
+ use_shuffle_split=False, # Use GroupKFold (default)
+ )
+ assert isinstance(cv, GroupKFold)
+ assert len(list(cv_func)) == 2
+
+ # Test GroupShuffleSplit split (optional)
+ cv, cv_func = model._make_cv_split(
+ experiments,
+ folds=2,
+ random_state=1,
+ stratified_feature=None,
+ group_split_column="group",
+ use_shuffle_split=True, # Use GroupShuffleSplit
)
assert isinstance(cv, GroupShuffleSplit)
assert len(list(cv_func)) == 2
@@ -691,3 +709,373 @@ def test_check_valid_nfolds():
model._check_valid_nfolds(1, 10)
with pytest.raises(ValueError, match="Experiments is empty."):
model._check_valid_nfolds(5, 0)
+
+
+def test_model_cross_validate_timeseries():
+ """Test cross-validation with timeseries data using GroupKFold."""
+ # Create inputs with a timeseries feature
+ inputs = Inputs(
+ features=[
+ ContinuousInput(
+ key="time",
+ bounds=(0, 100),
+ is_timeseries=True, # Mark as timeseries
+ ),
+ ContinuousInput(
+ key="x",
+ bounds=(-4, 4),
+ ),
+ ],
+ )
+ outputs = Outputs(features=[ContinuousOutput(key="y")])
+
+ # Create experiments with overlapping time values across trajectories with realistic noise
+ # This reflects realistic timeseries where different experiments run over similar time ranges
+ # with slight measurement variations
+ experiments = pd.DataFrame(
+ {
+ "time": [
+ 0.0,
+ 4.95,
+ 10.1,
+ 14.9, # trajectory 0
+ 0.05,
+ 5.02,
+ 9.98,
+ 15.05, # trajectory 1
+ 0.0,
+ 5.1,
+ 10.03,
+ 15.01, # trajectory 2
+ 0.02,
+ 4.99,
+ 9.95,
+ 14.98, # trajectory 3
+ ],
+ "x": [-4, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3],
+ "y": [1, 2, 3, 4, 5, 6, 7, 8, 2, 3, 4, 5, 6, 7, 8, 9],
+ "_trajectory_id": [
+ 0,
+ 0,
+ 0,
+ 0,
+ 1,
+ 1,
+ 1,
+ 1,
+ 2,
+ 2,
+ 2,
+ 2,
+ 3,
+ 3,
+ 3,
+ 3,
+ ], # Trajectory/experiment groups
+ "valid_y": [1] * 16,
+ }
+ )
+
+ # Initialize model
+ model = SingleTaskGPSurrogate(
+ inputs=inputs,
+ outputs=outputs,
+ )
+ model = surrogates.map(model)
+
+ # Test with automatic group_split_column for timeseries data
+ # Since we have a timeseries feature and _trajectory_id column exists,
+ # it should be used automatically
+ train_cv, test_cv, _ = model.cross_validate(
+ experiments,
+ folds=4,
+ # No group_split_column specified - should auto-use _trajectory_id
+ )
+
+ # Verify that groups are kept together
+ test_indices = []
+ train_indices = []
+ for cvresults in test_cv.results:
+ test_indices.append(list(cvresults.observed.index))
+
+ for cvresults in train_cv.results:
+ train_indices.append(list(cvresults.observed.index))
+
+ # Get indices for each experiment group
+ exp0_indices = experiments[experiments["_trajectory_id"] == 0].index.tolist()
+ exp1_indices = experiments[experiments["_trajectory_id"] == 1].index.tolist()
+ exp2_indices = experiments[experiments["_trajectory_id"] == 2].index.tolist()
+ exp3_indices = experiments[experiments["_trajectory_id"] == 3].index.tolist()
+
+ all_exp_indices = [exp0_indices, exp1_indices, exp2_indices, exp3_indices]
+
+ # Verify that each experiment group is entirely in either train or test set
+ for test_index, train_index in zip(test_indices, train_indices):
+ for exp_indices in all_exp_indices:
+ test_set = set(test_index)
+ train_set = set(train_index)
+ # Each experiment should be entirely in either test or train set
+ assert test_set.issuperset(exp_indices) or train_set.issuperset(exp_indices)
+
+
+def test_model_cross_validate_groupkfold_vs_shufflesplit():
+ """Test that GroupKFold is default and GroupShuffleSplit can be enabled with flag."""
+ inputs = Inputs(
+ features=[
+ ContinuousInput(
+ key="time",
+ bounds=(0, 100),
+ is_timeseries=True, # Timeseries feature
+ ),
+ ContinuousInput(key="x", bounds=(-4, 4)),
+ ],
+ )
+ outputs = Outputs(features=[ContinuousOutput(key="y")])
+
+ experiments = pd.DataFrame(
+ {
+ "time": [0, 5, 10, 15],
+ "x": [-4, -3, -2, -1],
+ "y": [1, 2, 3, 4],
+ "group": [0, 0, 1, 1],
+ "valid_y": [1] * 4,
+ }
+ )
+
+ model = SingleTaskGPSurrogate(inputs=inputs, outputs=outputs)
+ model = surrogates.map(model)
+
+ # Test 1: Default behavior - should use GroupKFold
+ cv_default, _ = model._make_cv_split(
+ experiments,
+ folds=2,
+ group_split_column="group",
+ )
+ assert isinstance(cv_default, GroupKFold)
+
+ # Test 2: With use_shuffle_split=True - should use GroupShuffleSplit
+ cv_shuffle, _ = model._make_cv_split(
+ experiments,
+ folds=2,
+ group_split_column="group",
+ use_shuffle_split=True,
+ random_state=42,
+ )
+ assert isinstance(cv_shuffle, GroupShuffleSplit)
+
+
+def test_model_cross_validate_timeseries_automatic_trajectory_id():
+ """Test that cross_validate automatically uses _trajectory_id column for timeseries."""
+ # Create inputs with a timeseries feature
+ inputs = Inputs(
+ features=[
+ ContinuousInput(
+ key="time",
+ bounds=(0, 100),
+ is_timeseries=True,
+ ),
+ ContinuousInput(
+ key="x",
+ bounds=(-4, 4),
+ ),
+ ],
+ )
+ outputs = Outputs(features=[ContinuousOutput(key="y")])
+
+ # Create experiments WITH a _trajectory_id column
+ experiments = pd.DataFrame(
+ {
+ "_trajectory_id": [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3],
+ "time": [0, 5, 10, 0, 5, 10, 0, 5, 10, 0, 5, 10],
+ "x": [-4, -3, -2, -1, 0, 1, 2, 3, 4, -4, -3, -2],
+ "y": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
+ "valid_y": [1] * 12,
+ }
+ )
+
+ model = SingleTaskGPSurrogate(
+ inputs=inputs,
+ outputs=outputs,
+ )
+ model = surrogates.map(model)
+
+ # This should automatically use _trajectory_id as group_split_column
+ cv_train, cv_test, _ = model.cross_validate(
+ experiments,
+ folds=2,
+ include_X=True, # Include X to check trajectory groups
+ # No group_split_column specified - should auto-use _trajectory_id
+ )
+
+ # Check that results are returned
+ assert len(cv_train.results) == 2
+ assert len(cv_test.results) == 2
+
+ # CRITICAL: Verify that trajectory groups are never split between train and test
+ for fold_idx in range(2):
+ train_data = cv_train.results[fold_idx].X
+ test_data = cv_test.results[fold_idx].X
+
+ # Get the trajectory IDs from the original experiments
+ train_indices = train_data.index
+ test_indices = test_data.index
+
+ train_trajectories = set(experiments.loc[train_indices, "_trajectory_id"])
+ test_trajectories = set(experiments.loc[test_indices, "_trajectory_id"])
+
+ # Ensure no overlap between train and test trajectory groups
+ assert (
+ len(train_trajectories.intersection(test_trajectories)) == 0
+ ), f"Fold {fold_idx}: Trajectory groups are mixed between train and test!"
+
+ # Ensure all trajectories are accounted for
+ assert train_trajectories.union(test_trajectories) == {
+ 0,
+ 1,
+ 2,
+ 3,
+ }, f"Fold {fold_idx}: Not all trajectories are covered!"
+
+
+def test_model_cross_validate_timeseries_use_shuffle_split():
+ """Test that use_shuffle_split parameter works correctly with timeseries data."""
+ inputs = Inputs(
+ features=[
+ ContinuousInput(key="time", bounds=(0, 100), is_timeseries=True),
+ ContinuousInput(key="x", bounds=(-4, 4)),
+ ],
+ )
+ outputs = Outputs(features=[ContinuousOutput(key="y")])
+
+ # Create experiments with 4 trajectories
+ experiments = pd.DataFrame(
+ {
+ "_trajectory_id": [0, 0, 1, 1, 2, 2, 3, 3],
+ "time": [0, 10, 0, 10, 0, 10, 0, 10],
+ "x": [-4, -3, -2, -1, 0, 1, 2, 3],
+ "y": [1, 2, 3, 4, 5, 6, 7, 8],
+ "valid_y": [1] * 8,
+ }
+ )
+
+ model = SingleTaskGPSurrogate(inputs=inputs, outputs=outputs)
+ model = surrogates.map(model)
+
+ # Test with GroupKFold (default) - should use all trajectories
+ train_cv_kfold, test_cv_kfold, _ = model.cross_validate(
+ experiments,
+ folds=2,
+ )
+
+ # Test with GroupShuffleSplit - may not use all trajectories
+ train_cv_shuffle, test_cv_shuffle, _ = model.cross_validate(
+ experiments,
+ folds=2,
+ use_shuffle_split=True,
+ random_state=42,
+ )
+
+ # Both should return results
+ assert len(train_cv_kfold.results) == 2
+ assert len(test_cv_kfold.results) == 2
+ assert len(train_cv_shuffle.results) == 2
+ assert len(test_cv_shuffle.results) == 2
+
+
+def test_model_cross_validate_groupkfold_exhaustive():
+ """Test that GroupKFold ensures every trajectory is tested exactly once."""
+ inputs = Inputs(
+ features=[
+ ContinuousInput(key="time", bounds=(0, 100), is_timeseries=True),
+ ContinuousInput(key="x", bounds=(-4, 4)),
+ ],
+ )
+ outputs = Outputs(features=[ContinuousOutput(key="y")])
+
+ # Create experiments with 6 trajectories
+ n_trajectories = 6
+ experiments = pd.DataFrame(
+ {
+ "_trajectory_id": [i for i in range(n_trajectories) for _ in range(3)],
+ "time": [0, 5, 10] * n_trajectories,
+ "x": list(range(18)),
+ "y": list(range(18)),
+ "valid_y": [1] * 18,
+ }
+ )
+
+ model = SingleTaskGPSurrogate(inputs=inputs, outputs=outputs)
+ model = surrogates.map(model)
+
+ # Use 3 folds - each trajectory should appear in exactly one test fold
+ train_cv, test_cv, _ = model.cross_validate(
+ experiments,
+ folds=3,
+ )
+
+ # Collect all tested trajectory IDs across folds
+ tested_trajectories = set()
+ for cv_result in test_cv.results:
+ test_indices = cv_result.observed.index
+ test_data = experiments.iloc[test_indices]
+ tested_trajectories.update(test_data["_trajectory_id"].unique())
+
+ # Every trajectory should be tested exactly once
+ assert tested_trajectories == set(range(n_trajectories))
+
+ # Check that no trajectory appears in multiple test sets
+ trajectory_test_counts = {i: 0 for i in range(n_trajectories)}
+ for cv_result in test_cv.results:
+ test_indices = cv_result.observed.index
+ test_data = experiments.iloc[test_indices]
+ for traj_id in test_data["_trajectory_id"].unique():
+ trajectory_test_counts[traj_id] += 1
+
+ # Each trajectory should be tested exactly once
+ assert all(count == 1 for count in trajectory_test_counts.values())
+
+
+def test_model_cross_validate_timeseries_missing_column():
+ """Test that error is raised when _trajectory_id column is missing with timeseries feature."""
+ # Create inputs with a timeseries feature
+ inputs = Inputs(
+ features=[
+ ContinuousInput(
+ key="time",
+ bounds=(0, 100),
+ is_timeseries=True,
+ ),
+ ContinuousInput(
+ key="x",
+ bounds=(-4, 4),
+ ),
+ ],
+ )
+ outputs = Outputs(features=[ContinuousOutput(key="y")])
+
+ # Create experiments WITHOUT the _trajectory_id column
+ experiments = pd.DataFrame(
+ {
+ "time": [0, 5, 10, 15],
+ "x": [-4, -3, -2, -1],
+ "y": [1, 2, 3, 4],
+ "valid_y": [1] * 4,
+ }
+ )
+
+ model = SingleTaskGPSurrogate(
+ inputs=inputs,
+ outputs=outputs,
+ )
+ model = surrogates.map(model)
+
+ # Should raise error about missing _trajectory_id column
+ with pytest.raises(
+ ValueError,
+ match="Timeseries feature 'time' detected, but required column '_trajectory_id' is not present",
+ ):
+ model.cross_validate(
+ experiments,
+ folds=2,
+ # No group_split_column specified - should error on missing _trajectory_id
+ )
diff --git a/tests/bofire/utils/test_timeseries.py b/tests/bofire/utils/test_timeseries.py
new file mode 100644
index 000000000..6d40054a1
--- /dev/null
+++ b/tests/bofire/utils/test_timeseries.py
@@ -0,0 +1,364 @@
+"""Tests for timeseries utilities."""
+
+import pandas as pd
+import pytest
+
+from bofire.data_models.domain.api import Domain, Inputs, Outputs
+from bofire.data_models.features.api import (
+ CategoricalInput,
+ ContinuousInput,
+ ContinuousOutput,
+ DiscreteInput,
+ MolecularInput,
+)
+from bofire.utils.timeseries import infer_trajectory_id
+
+
+def test_infer_trajectory_id_continuous_only():
+ """Check that rows sharing the same temperature/pressure values receive one trajectory ID."""
+ # Domain: "time" is the timeseries input; temperature and pressure are the remaining inputs
+ inputs = Inputs(
+ features=[
+ ContinuousInput(key="time", bounds=(0, 100), is_timeseries=True),
+ ContinuousInput(key="temperature", bounds=(20, 80)),
+ ContinuousInput(key="pressure", bounds=(1, 5)),
+ ]
+ )
+ outputs = Outputs(features=[ContinuousOutput(key="yield")])
+ domain = Domain(inputs=inputs, outputs=outputs)
+
+ # Three (temperature, pressure) combinations - (25, 2), (30, 2), (25, 3) - with three time points each
+ experiments = pd.DataFrame(
+ {
+ "time": [0, 10, 20, 0, 10, 20, 0, 10, 20],
+ "temperature": [25, 25, 25, 30, 30, 30, 25, 25, 25],
+ "pressure": [2, 2, 2, 2, 2, 2, 3, 3, 3],
+ "yield": [0.1, 0.2, 0.3, 0.2, 0.3, 0.4, 0.15, 0.25, 0.35],
+ "valid_yield": [1] * 9,
+ }
+ )
+
+ trajectory_ids = infer_trajectory_id(experiments, domain)
+
+ # Exactly three distinct trajectory IDs are expected
+ assert len(trajectory_ids.unique()) == 3
+
+ # Each consecutive block of three rows (same temperature/pressure) must share one ID
+ assert trajectory_ids.iloc[0] == trajectory_ids.iloc[1] == trajectory_ids.iloc[2]
+ assert trajectory_ids.iloc[3] == trajectory_ids.iloc[4] == trajectory_ids.iloc[5]
+ assert trajectory_ids.iloc[6] == trajectory_ids.iloc[7] == trajectory_ids.iloc[8]
+
+ # The first row of each block must carry a different ID from the other two blocks
+ assert (
+ len({trajectory_ids.iloc[0], trajectory_ids.iloc[3], trajectory_ids.iloc[6]})
+ == 3
+ )
+
+
+def test_infer_trajectory_id_with_categorical():
+ """Check that a categorical input (catalyst) takes part in trajectory grouping alongside temperature."""
+ inputs = Inputs(
+ features=[
+ ContinuousInput(key="time", bounds=(0, 100), is_timeseries=True),
+ CategoricalInput(key="catalyst", categories=["A", "B", "C"]),
+ ContinuousInput(key="temperature", bounds=(20, 80)),
+ ]
+ )
+ outputs = Outputs(features=[ContinuousOutput(key="yield")])
+ domain = Domain(inputs=inputs, outputs=outputs)
+
+ experiments = pd.DataFrame(
+ {
+ "time": [0, 10, 20, 0, 10, 20, 0, 10, 20],
+ "catalyst": ["A", "A", "A", "B", "B", "B", "A", "A", "A"],
+ "temperature": [25, 25, 25, 25, 25, 25, 30, 30, 30],
+ "yield": [0.1, 0.2, 0.3, 0.2, 0.3, 0.4, 0.15, 0.25, 0.35],
+ "valid_yield": [1] * 9,
+ }
+ )
+
+ trajectory_ids = infer_trajectory_id(experiments, domain)
+
+ # Three (catalyst, temperature) combinations are present: (A, 25), (B, 25), (A, 30)
+ assert len(trajectory_ids.unique()) == 3
+
+ # Rows 0-2 share catalyst A at temperature 25
+ assert trajectory_ids.iloc[0] == trajectory_ids.iloc[1] == trajectory_ids.iloc[2]
+
+ # Rows 3-5 share catalyst B at temperature 25
+ assert trajectory_ids.iloc[3] == trajectory_ids.iloc[4] == trajectory_ids.iloc[5]
+
+ # Rows 6-8 share catalyst A at temperature 30
+ assert trajectory_ids.iloc[6] == trajectory_ids.iloc[7] == trajectory_ids.iloc[8]
+
+ # The three blocks must be assigned pairwise different IDs
+ assert (
+ len({trajectory_ids.iloc[0], trajectory_ids.iloc[3], trajectory_ids.iloc[6]})
+ == 3
+ )
+
+
+def test_infer_trajectory_id_with_discrete():
+ """Check that a discrete input (n_cycles) separates trajectories when temperature is constant."""
+ inputs = Inputs(
+ features=[
+ ContinuousInput(key="time", bounds=(0, 100), is_timeseries=True),
+ DiscreteInput(key="n_cycles", values=[1, 2, 3, 4, 5]),
+ ContinuousInput(key="temperature", bounds=(20, 80)),
+ ]
+ )
+ outputs = Outputs(features=[ContinuousOutput(key="yield")])
+ domain = Domain(inputs=inputs, outputs=outputs)
+
+ experiments = pd.DataFrame(
+ {
+ "time": [0, 10, 20, 0, 10, 20],
+ "n_cycles": [2, 2, 2, 3, 3, 3],
+ "temperature": [25, 25, 25, 25, 25, 25],
+ "yield": [0.1, 0.2, 0.3, 0.2, 0.3, 0.4],
+ "valid_yield": [1] * 6,
+ }
+ )
+
+ trajectory_ids = infer_trajectory_id(experiments, domain)
+
+ # Temperature is identical everywhere, so only n_cycles (2 vs 3) distinguishes the two trajectories
+ assert len(trajectory_ids.unique()) == 2
+
+ # Rows 0-2 (n_cycles=2) must share one ID
+ assert trajectory_ids.iloc[0] == trajectory_ids.iloc[1] == trajectory_ids.iloc[2]
+
+ # Rows 3-5 (n_cycles=3) must share one ID
+ assert trajectory_ids.iloc[3] == trajectory_ids.iloc[4] == trajectory_ids.iloc[5]
+
+ # The n_cycles=2 and n_cycles=3 groups must not share an ID
+ assert trajectory_ids.iloc[0] != trajectory_ids.iloc[3]
+
+
+def test_infer_trajectory_id_mixed_features():
+ """Check trajectory grouping when categorical, discrete and continuous inputs are combined."""
+ inputs = Inputs(
+ features=[
+ ContinuousInput(key="time", bounds=(0, 100), is_timeseries=True),
+ CategoricalInput(
+ key="solvent", categories=["water", "ethanol", "methanol"]
+ ),
+ DiscreteInput(key="stirring_speed", values=[100, 200, 300]),
+ ContinuousInput(key="temperature", bounds=(20, 80)),
+ ]
+ )
+ outputs = Outputs(features=[ContinuousOutput(key="yield")])
+ domain = Domain(inputs=inputs, outputs=outputs)
+
+ experiments = pd.DataFrame(
+ {
+ "time": [0, 10, 20, 0, 10, 20, 0, 10, 20],
+ "solvent": [
+ "water",
+ "water",
+ "water",
+ "ethanol",
+ "ethanol",
+ "ethanol",
+ "water",
+ "water",
+ "water",
+ ],
+ "stirring_speed": [100, 100, 100, 100, 100, 100, 200, 200, 200],
+ "temperature": [25, 25, 25, 25, 25, 25, 25, 25, 25],
+ "yield": [0.1, 0.2, 0.3, 0.2, 0.3, 0.4, 0.15, 0.25, 0.35],
+ "valid_yield": [1] * 9,
+ }
+ )
+
+ trajectory_ids = infer_trajectory_id(experiments, domain)
+
+ # Three (solvent, stirring_speed) combinations: (water, 100), (ethanol, 100), (water, 200); temperature is constant
+ assert len(trajectory_ids.unique()) == 3
+
+ # Each consecutive block of three rows must share one ID
+ assert trajectory_ids.iloc[0] == trajectory_ids.iloc[1] == trajectory_ids.iloc[2]
+ assert trajectory_ids.iloc[3] == trajectory_ids.iloc[4] == trajectory_ids.iloc[5]
+ assert trajectory_ids.iloc[6] == trajectory_ids.iloc[7] == trajectory_ids.iloc[8]
+
+
+def test_infer_trajectory_id_with_eps():
+ """Check that eps decides whether nearly equal continuous values are grouped into one trajectory."""
+ inputs = Inputs(
+ features=[
+ ContinuousInput(key="time", bounds=(0, 100), is_timeseries=True),
+ ContinuousInput(key="temperature", bounds=(20, 80)),
+ ]
+ )
+ outputs = Outputs(features=[ContinuousOutput(key="yield")])
+ domain = Domain(inputs=inputs, outputs=outputs)
+
+ # Two temperature groups that differ by only 1e-7 (25.0 vs 25.0000001)
+ experiments = pd.DataFrame(
+ {
+ "time": [0, 10, 20, 0, 10, 20],
+ "temperature": [25.0, 25.0, 25.0, 25.0000001, 25.0000001, 25.0000001],
+ "yield": [0.1, 0.2, 0.3, 0.2, 0.3, 0.4],
+ "valid_yield": [1] * 6,
+ }
+ )
+
+ # The 1e-7 gap is smaller than eps=1e-6, so all rows are expected to form one trajectory
+ trajectory_ids_tight = infer_trajectory_id(experiments, domain, eps=1e-6)
+ assert len(trajectory_ids_tight.unique()) == 1
+
+ # The 1e-7 gap is larger than eps=1e-8, so the two groups are expected to be separate trajectories
+ trajectory_ids_strict = infer_trajectory_id(experiments, domain, eps=1e-8)
+ assert len(trajectory_ids_strict.unique()) == 2
+
+
+def test_infer_trajectory_id_only_timeseries():
+ """Check the edge case where the timeseries feature is the only input of the domain."""
+ inputs = Inputs(
+ features=[
+ ContinuousInput(key="time", bounds=(0, 100), is_timeseries=True),
+ ]
+ )
+ outputs = Outputs(features=[ContinuousOutput(key="yield")])
+ domain = Domain(inputs=inputs, outputs=outputs)
+
+ experiments = pd.DataFrame(
+ {
+ "time": [0, 10, 20, 30],
+ "yield": [0.1, 0.2, 0.3, 0.4],
+ "valid_yield": [1] * 4,
+ }
+ )
+
+ trajectory_ids = infer_trajectory_id(experiments, domain)
+
+ # No other input can distinguish rows, so a single trajectory with ID 0 is expected
+ assert len(trajectory_ids.unique()) == 1
+ assert trajectory_ids.iloc[0] == 0
+
+
+def test_infer_trajectory_id_no_timeseries_error():
+ """Check that infer_trajectory_id raises ValueError when no input is flagged is_timeseries=True."""
+ inputs = Inputs(
+ features=[
+ ContinuousInput(key="temperature", bounds=(20, 80)),
+ ContinuousInput(key="pressure", bounds=(1, 5)),
+ ]
+ )
+ outputs = Outputs(features=[ContinuousOutput(key="yield")])
+ domain = Domain(inputs=inputs, outputs=outputs)
+
+ experiments = pd.DataFrame(
+ {
+ "temperature": [25, 30],
+ "pressure": [2, 3],
+ "yield": [0.1, 0.2],
+ "valid_yield": [1, 1],
+ }
+ )
+
+ with pytest.raises(ValueError, match="No timeseries feature found"):
+ infer_trajectory_id(experiments, domain)
+
+
+def test_infer_trajectory_id_missing_columns_error():
+ """Check that infer_trajectory_id raises ValueError when an input column (temperature) is absent."""
+ inputs = Inputs(
+ features=[
+ ContinuousInput(key="time", bounds=(0, 100), is_timeseries=True),
+ ContinuousInput(key="temperature", bounds=(20, 80)),
+ ]
+ )
+ outputs = Outputs(features=[ContinuousOutput(key="yield")])
+ domain = Domain(inputs=inputs, outputs=outputs)
+
+ experiments = pd.DataFrame(
+ {
+ "time": [0, 10, 20],
+ # "temperature" is declared in the domain but deliberately left out here
+ "yield": [0.1, 0.2, 0.3],
+ "valid_yield": [1, 1, 1],
+ }
+ )
+
+ with pytest.raises(ValueError, match="Required input feature columns missing"):
+ infer_trajectory_id(experiments, domain)
+
+
+def test_domain_add_trajectory_id():
+ """Check that Domain.add_trajectory_id returns a frame with `_trajectory_id` and leaves its input untouched."""
+ inputs = Inputs(
+ features=[
+ ContinuousInput(key="time", bounds=(0, 100), is_timeseries=True),
+ ContinuousInput(key="temperature", bounds=(20, 80)),
+ ]
+ )
+ outputs = Outputs(features=[ContinuousOutput(key="yield")])
+ domain = Domain(inputs=inputs, outputs=outputs)
+
+ experiments = pd.DataFrame(
+ {
+ "time": [0, 10, 20, 0, 10, 20],
+ "temperature": [25, 25, 25, 30, 30, 30],
+ "yield": [0.1, 0.2, 0.3, 0.2, 0.3, 0.4],
+ "valid_yield": [1] * 6,
+ }
+ )
+
+ # Call the convenience method on a frame that has no _trajectory_id column yet
+ result = domain.add_trajectory_id(experiments)
+
+ # The returned frame must contain the new column
+ assert "_trajectory_id" in result.columns
+
+ # The caller's frame must not have gained the column
+ assert "_trajectory_id" not in experiments.columns
+
+ # Two temperature groups (25 and 30) give two IDs; rows are spot-checked within and across groups
+ assert len(result["_trajectory_id"].unique()) == 2
+ assert result["_trajectory_id"].iloc[0] == result["_trajectory_id"].iloc[1]
+ assert result["_trajectory_id"].iloc[3] == result["_trajectory_id"].iloc[4]
+ assert result["_trajectory_id"].iloc[0] != result["_trajectory_id"].iloc[3]
+
+
+def test_infer_trajectory_id_with_molecular():
+ """Check that a molecular input (solvent strings) takes part in trajectory grouping alongside temperature."""
+ inputs = Inputs(
+ features=[
+ ContinuousInput(key="time", bounds=(0, 100), is_timeseries=True),
+ MolecularInput(key="solvent"),
+ ContinuousInput(key="temperature", bounds=(20, 80)),
+ ]
+ )
+ outputs = Outputs(features=[ContinuousOutput(key="yield")])
+ domain = Domain(inputs=inputs, outputs=outputs)
+
+ experiments = pd.DataFrame(
+ {
+ "time": [0, 10, 20, 0, 10, 20, 0, 10, 20],
+ "solvent": [
+ "O",
+ "O",
+ "O",
+ "CCO",
+ "CCO",
+ "CCO",
+ "O",
+ "O",
+ "O",
+ ], # SMILES strings: water, ethanol, water
+ "temperature": [25, 25, 25, 25, 25, 25, 30, 30, 30],
+ "yield": [0.1, 0.2, 0.3, 0.2, 0.3, 0.4, 0.15, 0.25, 0.35],
+ "valid_yield": [1] * 9,
+ }
+ )
+
+ trajectory_ids = infer_trajectory_id(experiments, domain)
+
+ # Three (solvent, temperature) combinations: (water, 25), (ethanol, 25), (water, 30)
+ assert len(trajectory_ids.unique()) == 3
+
+ # Each consecutive block of three rows must share one ID
+ assert trajectory_ids.iloc[0] == trajectory_ids.iloc[1] == trajectory_ids.iloc[2]
+ assert trajectory_ids.iloc[3] == trajectory_ids.iloc[4] == trajectory_ids.iloc[5]
+ assert trajectory_ids.iloc[6] == trajectory_ids.iloc[7] == trajectory_ids.iloc[8]
diff --git a/tutorials/basic_examples/timeseries_kinetic_measurements.ipynb b/tutorials/basic_examples/timeseries_kinetic_measurements.ipynb
new file mode 100644
index 000000000..d07b77cfa
--- /dev/null
+++ b/tutorials/basic_examples/timeseries_kinetic_measurements.ipynb
@@ -0,0 +1,1128 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Timeseries Features in BoFire: Preventing Data Leakage in Kinetic Measurements\n",
+ "\n",
+ "This notebook demonstrates the importance of properly handling timeseries data in cross-validation when working with kinetic measurements. We'll show how improper cross-validation can lead to data leakage and overly optimistic performance metrics.\n",
+ "\n",
+ "## Scenario: Catalytic Reaction Optimization\n",
+ "\n",
+ "We're optimizing a catalytic reaction where we measure:\n",
+ "- **Yield** and **impurity** over time (only yield is modeled in the cross-validation demonstration)\n",
+ "- Variables: Temperature, Catalyst amount, Catalyst type (A, B, C), Base type (A, B, C), Base equivalents\n",
+ "- Each experiment consists of 8 time points (0, 15, 30, 45, 60, 90, 120, 180 minutes)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import matplotlib.pyplot as plt\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import seaborn as sns\n",
+ "from sklearn.model_selection import GroupKFold, KFold\n",
+ "\n",
+ "import bofire.surrogates.api as surrogates\n",
+ "from bofire.data_models.domain.api import Domain, Inputs, Outputs\n",
+ "from bofire.data_models.enum import RegressionMetricsEnum\n",
+ "from bofire.data_models.features.api import (\n",
+ " CategoricalInput,\n",
+ " ContinuousInput,\n",
+ " ContinuousOutput,\n",
+ ")\n",
+ "from bofire.data_models.kernels.api import HammingDistanceKernel, MaternKernel\n",
+ "from bofire.data_models.objectives.api import MaximizeObjective\n",
+ "from bofire.data_models.priors.api import MBO_LENGTHSCALE_PRIOR, MBO_NOISE_PRIOR\n",
+ "from bofire.data_models.surrogates.api import MixedSingleTaskGPSurrogate\n",
+ "\n",
+ "\n",
+ "# Set random seed for reproducibility\n",
+ "np.random.seed(42)\n",
+ "\n",
+ "# Configure plotting\n",
+ "sns.set_style(\"whitegrid\")\n",
+ "plt.rcParams[\"figure.figsize\"] = (12, 6)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1. Generate Synthetic Kinetic Data\n",
+ "\n",
+ "We'll create realistic kinetic data with the following characteristics:\n",
+ "- Catalyst B doesn't work (no conversion)\n",
+ "- Base B doesn't work (no conversion)\n",
+ "- Catalyst A + Base A gives excellent yield\n",
+ "- High base loading increases impurities\n",
+ "- Temperature affects reaction rate"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Generated 128 data points from 16 experiments\n",
+ "\n",
+ "First few rows:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " _trajectory_id | \n",
+ " time | \n",
+ " temperature | \n",
+ " catalyst_amount | \n",
+ " base_equiv | \n",
+ " catalyst | \n",
+ " base | \n",
+ " yield | \n",
+ " impurity | \n",
+ " valid_yield | \n",
+ " valid_impurity | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " 0.000000 | \n",
+ " 74.981605 | \n",
+ " 9.556429 | \n",
+ " 2.463988 | \n",
+ " A | \n",
+ " A | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 0 | \n",
+ " 14.765263 | \n",
+ " 74.981605 | \n",
+ " 9.556429 | \n",
+ " 2.463988 | \n",
+ " A | \n",
+ " A | \n",
+ " 33.628679 | \n",
+ " 0.562401 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 0 | \n",
+ " 29.767135 | \n",
+ " 74.981605 | \n",
+ " 9.556429 | \n",
+ " 2.463988 | \n",
+ " A | \n",
+ " A | \n",
+ " 52.252612 | \n",
+ " 0.119179 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 0 | \n",
+ " 44.137541 | \n",
+ " 74.981605 | \n",
+ " 9.556429 | \n",
+ " 2.463988 | \n",
+ " A | \n",
+ " A | \n",
+ " 65.710234 | \n",
+ " 0.000000 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 0 | \n",
+ " 60.157124 | \n",
+ " 74.981605 | \n",
+ " 9.556429 | \n",
+ " 2.463988 | \n",
+ " A | \n",
+ " A | \n",
+ " 73.651333 | \n",
+ " 0.170375 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 0 | \n",
+ " 90.732824 | \n",
+ " 74.981605 | \n",
+ " 9.556429 | \n",
+ " 2.463988 | \n",
+ " A | \n",
+ " A | \n",
+ " 83.852661 | \n",
+ " 0.273337 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 0 | \n",
+ " 119.287626 | \n",
+ " 74.981605 | \n",
+ " 9.556429 | \n",
+ " 2.463988 | \n",
+ " A | \n",
+ " A | \n",
+ " 90.243038 | \n",
+ " 1.294397 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 0 | \n",
+ " 179.424503 | \n",
+ " 74.981605 | \n",
+ " 9.556429 | \n",
+ " 2.463988 | \n",
+ " A | \n",
+ " A | \n",
+ " 92.994677 | \n",
+ " 1.819754 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 1 | \n",
+ " 0.000000 | \n",
+ " 62.602064 | \n",
+ " 9.539970 | \n",
+ " 2.931264 | \n",
+ " B | \n",
+ " B | \n",
+ " 0.751396 | \n",
+ " 0.000000 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 1 | \n",
+ " 15.926139 | \n",
+ " 62.602064 | \n",
+ " 9.539970 | \n",
+ " 2.931264 | \n",
+ " B | \n",
+ " B | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " _trajectory_id time temperature catalyst_amount base_equiv \\\n",
+ "0 0 0.000000 74.981605 9.556429 2.463988 \n",
+ "1 0 14.765263 74.981605 9.556429 2.463988 \n",
+ "2 0 29.767135 74.981605 9.556429 2.463988 \n",
+ "3 0 44.137541 74.981605 9.556429 2.463988 \n",
+ "4 0 60.157124 74.981605 9.556429 2.463988 \n",
+ "5 0 90.732824 74.981605 9.556429 2.463988 \n",
+ "6 0 119.287626 74.981605 9.556429 2.463988 \n",
+ "7 0 179.424503 74.981605 9.556429 2.463988 \n",
+ "8 1 0.000000 62.602064 9.539970 2.931264 \n",
+ "9 1 15.926139 62.602064 9.539970 2.931264 \n",
+ "\n",
+ " catalyst base yield impurity valid_yield valid_impurity \n",
+ "0 A A 0.000000 0.000000 1 1 \n",
+ "1 A A 33.628679 0.562401 1 1 \n",
+ "2 A A 52.252612 0.119179 1 1 \n",
+ "3 A A 65.710234 0.000000 1 1 \n",
+ "4 A A 73.651333 0.170375 1 1 \n",
+ "5 A A 83.852661 0.273337 1 1 \n",
+ "6 A A 90.243038 1.294397 1 1 \n",
+ "7 A A 92.994677 1.819754 1 1 \n",
+ "8 B B 0.751396 0.000000 1 1 \n",
+ "9 B B 0.000000 0.000000 1 1 "
+ ]
+ },
+ "execution_count": null,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "def generate_kinetic_data(n_experiments: int = 16) -> pd.DataFrame:\n",
+ " \"\"\"Generate synthetic kinetic measurement data for catalytic reactions.\"\"\"\n",
+ "\n",
+ " # Time points for kinetic measurements (in minutes)\n",
+ " time_points = np.array([0, 15, 30, 45, 60, 90, 120, 180])\n",
+ "\n",
+ " # Generate experimental conditions\n",
+ " experiments = []\n",
+ "\n",
+ " for exp_id in range(n_experiments):\n",
+ " # Randomly sample conditions\n",
+ " temperature = np.random.uniform(60, 100) # °C\n",
+ " catalyst_amount = np.random.uniform(1, 10) # mol%\n",
+ " base_equiv = np.random.uniform(1, 3) # equivalents\n",
+ " catalyst = np.random.choice([\"A\", \"B\", \"C\"])\n",
+ " base = np.random.choice([\"A\", \"B\", \"C\"])\n",
+ "\n",
+ " # Generate kinetic profiles based on conditions\n",
+ " for _i, time in enumerate(time_points):\n",
+ " # Base reaction kinetics\n",
+ " k_rate = 0.01 * (temperature - 50) / 50 # Temperature effect on rate\n",
+ "\n",
+ " # Catalyst effects\n",
+ " if catalyst == \"B\":\n",
+ " catalyst_factor = 0.0 # Catalyst B doesn't work\n",
+ " elif catalyst == \"A\":\n",
+ " catalyst_factor = 1.5 # Catalyst A is excellent\n",
+ " else: # catalyst == 'C'\n",
+ " catalyst_factor = 0.7 # Catalyst C is moderate\n",
+ "\n",
+ " # Base effects\n",
+ " if base == \"B\":\n",
+ " base_factor = 0.0 # Base B doesn't work\n",
+ " elif base == \"A\":\n",
+ " base_factor = 1.2 # Base A is good\n",
+ " else: # base == 'C'\n",
+ " base_factor = 0.8 # Base C is moderate\n",
+ "\n",
+ " # Synergistic effect: Catalyst A + Base A\n",
+ " synergy = 1.5 if (catalyst == \"A\" and base == \"A\") else 1.0\n",
+ "\n",
+ " # Calculate yield (follows first-order kinetics)\n",
+ " effective_rate = (\n",
+ " k_rate * catalyst_factor * base_factor * synergy * catalyst_amount / 5\n",
+ " )\n",
+ " max_yield = min(\n",
+ " 95, 60 * catalyst_factor * base_factor * synergy\n",
+ " ) # Asymptotic max\n",
+ " yield_val = max_yield * (1 - np.exp(-effective_rate * time))\n",
+ "\n",
+ " # Calculate impurity (increases with base loading and time)\n",
+ " impurity_rate = 0.002 * base_equiv * (temperature - 50) / 50\n",
+ " impurity = 2 * base_equiv * (1 - np.exp(-impurity_rate * time))\n",
+ "\n",
+ " # Add some noise\n",
+ " yield_val += np.random.normal(0, 2)\n",
+ " impurity += np.random.normal(0, 0.5)\n",
+ "\n",
+ " # Add realistic noise to time measurements\n",
+ " actual_time = time + np.random.normal(0, 0.5) if time > 0 else time\n",
+ "\n",
+ " experiments.append(\n",
+ " {\n",
+ " \"_trajectory_id\": exp_id,\n",
+ " \"time\": actual_time,\n",
+ " \"temperature\": temperature,\n",
+ " \"catalyst_amount\": catalyst_amount,\n",
+ " \"base_equiv\": base_equiv,\n",
+ " \"catalyst\": catalyst,\n",
+ " \"base\": base,\n",
+ " \"yield\": max(0, yield_val), # Ensure non-negative\n",
+ " \"impurity\": max(0, impurity), # Ensure non-negative\n",
+ " }\n",
+ " )\n",
+ "\n",
+ " df = pd.DataFrame(experiments)\n",
+ " # Add validity columns for BoFire\n",
+ " df[\"valid_yield\"] = 1\n",
+ " df[\"valid_impurity\"] = 1\n",
+ "\n",
+ " return df\n",
+ "\n",
+ "\n",
+ "# Generate the data\n",
+ "kinetic_data = generate_kinetic_data(n_experiments=16)\n",
+ "print(\n",
+ " f\"Generated {len(kinetic_data)} data points from {kinetic_data['_trajectory_id'].nunique()} experiments\"\n",
+ ")\n",
+ "print(\"\\nFirst few rows:\")\n",
+ "kinetic_data.head(10)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2. Visualize the Kinetic Data\n",
+ "\n",
+ "Let's visualize some of the kinetic profiles to understand our data better."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Catalyst-Base combinations in dataset:\n",
+ "catalyst base\n",
+ "A A 5\n",
+ " C 3\n",
+ "B B 1\n",
+ " C 3\n",
+ "C A 2\n",
+ " B 1\n",
+ " C 1\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Plot kinetic profiles for a few experiments\n",
+ "fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))\n",
+ "\n",
+ "# Select a few interesting experiments to plot\n",
+ "selected_exps = kinetic_data[\"_trajectory_id\"].unique()[:6]\n",
+ "colors = plt.cm.tab10(np.linspace(0, 0.6, len(selected_exps)))\n",
+ "\n",
+ "for idx, exp_id in enumerate(selected_exps):\n",
+ " exp_data = kinetic_data[kinetic_data[\"_trajectory_id\"] == exp_id].sort_values(\n",
+ " \"time\"\n",
+ " )\n",
+ "\n",
+ " # Get experiment conditions for label\n",
+ " catalyst = exp_data.iloc[0][\"catalyst\"]\n",
+ " base = exp_data.iloc[0][\"base\"]\n",
+ " temp = exp_data.iloc[0][\"temperature\"]\n",
+ "\n",
+ " label = f\"Exp {exp_id}: Cat-{catalyst}, Base-{base}, T={temp:.0f}°C\"\n",
+ "\n",
+ " ax1.plot(\n",
+ " exp_data[\"time\"],\n",
+ " exp_data[\"yield\"],\n",
+ " \"o-\",\n",
+ " color=colors[idx],\n",
+ " label=label,\n",
+ " alpha=0.7,\n",
+ " )\n",
+ " ax2.plot(\n",
+ " exp_data[\"time\"],\n",
+ " exp_data[\"impurity\"],\n",
+ " \"o-\",\n",
+ " color=colors[idx],\n",
+ " label=label,\n",
+ " alpha=0.7,\n",
+ " )\n",
+ "\n",
+ "ax1.set_xlabel(\"Time (minutes)\")\n",
+ "ax1.set_ylabel(\"Yield (%)\")\n",
+ "ax1.set_title(\"Yield Kinetic Profiles\")\n",
+ "ax1.legend(loc=\"best\", fontsize=8)\n",
+ "ax1.grid(True, alpha=0.3)\n",
+ "\n",
+ "ax2.set_xlabel(\"Time (minutes)\")\n",
+ "ax2.set_ylabel(\"Impurity (%)\")\n",
+ "ax2.set_title(\"Impurity Kinetic Profiles\")\n",
+ "ax2.legend(loc=\"best\", fontsize=8)\n",
+ "ax2.grid(True, alpha=0.3)\n",
+ "\n",
+ "plt.tight_layout()\n",
+ "plt.show()\n",
+ "\n",
+ "# Show distribution of catalyst/base combinations\n",
+ "print(\"\\nCatalyst-Base combinations in dataset:\")\n",
+ "combo_counts = kinetic_data.groupby([\"catalyst\", \"base\"])[\"_trajectory_id\"].nunique()\n",
+ "print(combo_counts.to_string())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 3. Define the Optimization Domain\n",
+ "\n",
+ "We'll create two versions of the domain:\n",
+ "1. **Without** the `is_timeseries` flag (incorrect approach)\n",
+ "2. **With** the `is_timeseries` flag (correct approach)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Domain created successfully!\n",
+ "Inputs: base_equiv, catalyst_amount, temperature, time, base, catalyst\n",
+ "Output: yield\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Domain WITHOUT timeseries flag (incorrect)\n",
+ "def create_domain_without_timeseries() -> Domain:\n",
+ " inputs = Inputs(\n",
+ " features=[\n",
+ " ContinuousInput(\n",
+ " key=\"time\",\n",
+ " bounds=(0, 180),\n",
+ " unit=\"min\",\n",
+ " # Note: is_timeseries=False (default)\n",
+ " ),\n",
+ " ContinuousInput(\n",
+ " key=\"temperature\",\n",
+ " bounds=(60, 100),\n",
+ " unit=\"°C\",\n",
+ " ),\n",
+ " ContinuousInput(\n",
+ " key=\"catalyst_amount\",\n",
+ " bounds=(1, 10),\n",
+ " unit=\"mol%\",\n",
+ " ),\n",
+ " ContinuousInput(\n",
+ " key=\"base_equiv\",\n",
+ " bounds=(1, 3),\n",
+ " unit=\"equiv\",\n",
+ " ),\n",
+ " CategoricalInput(\n",
+ " key=\"catalyst\",\n",
+ " categories=[\"A\", \"B\", \"C\"],\n",
+ " ),\n",
+ " CategoricalInput(\n",
+ " key=\"base\",\n",
+ " categories=[\"A\", \"B\", \"C\"],\n",
+ " ),\n",
+ " ]\n",
+ " )\n",
+ "\n",
+ " # Single output for cross-validation\n",
+ " outputs = Outputs(\n",
+ " features=[\n",
+ " ContinuousOutput(\n",
+ " key=\"yield\",\n",
+ " objective=MaximizeObjective(w=1.0),\n",
+ " unit=\"%\",\n",
+ " ),\n",
+ " ]\n",
+ " )\n",
+ "\n",
+ " return Domain(inputs=inputs, outputs=outputs)\n",
+ "\n",
+ "\n",
+ "# Domain WITH timeseries flag (correct)\n",
+ "def create_domain_with_timeseries() -> Domain:\n",
+ " inputs = Inputs(\n",
+ " features=[\n",
+ " ContinuousInput(\n",
+ " key=\"time\",\n",
+ " bounds=(0, 180),\n",
+ " unit=\"min\",\n",
+ " is_timeseries=True, # Mark as timeseries!\n",
+ " ),\n",
+ " ContinuousInput(\n",
+ " key=\"temperature\",\n",
+ " bounds=(60, 100),\n",
+ " unit=\"°C\",\n",
+ " ),\n",
+ " ContinuousInput(\n",
+ " key=\"catalyst_amount\",\n",
+ " bounds=(1, 10),\n",
+ " unit=\"mol%\",\n",
+ " ),\n",
+ " ContinuousInput(\n",
+ " key=\"base_equiv\",\n",
+ " bounds=(1, 3),\n",
+ " unit=\"equiv\",\n",
+ " ),\n",
+ " CategoricalInput(\n",
+ " key=\"catalyst\",\n",
+ " categories=[\"A\", \"B\", \"C\"],\n",
+ " ),\n",
+ " CategoricalInput(\n",
+ " key=\"base\",\n",
+ " categories=[\"A\", \"B\", \"C\"],\n",
+ " ),\n",
+ " ]\n",
+ " )\n",
+ "\n",
+ " # Single output for cross-validation\n",
+ " outputs = Outputs(\n",
+ " features=[\n",
+ " ContinuousOutput(\n",
+ " key=\"yield\",\n",
+ " objective=MaximizeObjective(w=1.0),\n",
+ " unit=\"%\",\n",
+ " ),\n",
+ " ]\n",
+ " )\n",
+ "\n",
+ " return Domain(inputs=inputs, outputs=outputs)\n",
+ "\n",
+ "\n",
+ "domain_without_ts = create_domain_without_timeseries()\n",
+ "domain_with_ts = create_domain_with_timeseries()\n",
+ "\n",
+ "print(\"Domain created successfully!\")\n",
+ "print(f\"Inputs: {', '.join(domain_with_ts.inputs.get_keys())}\")\n",
+ "print(f\"Output: {', '.join(domain_with_ts.outputs.get_keys())}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 4. Cross-Validation WITHOUT Timeseries Flag (Incorrect)\n",
+ "\n",
+ "First, let's see what happens when we don't properly handle timeseries data. This approach will randomly split data points, potentially putting different time points from the same experiment in both training and test sets."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Performing cross-validation WITHOUT timeseries handling...\n",
+ "This will use regular KFold, potentially splitting time points from the same experiment.\n",
+ "\n",
+ "\n",
+ "=== Results WITHOUT timeseries handling ===\n",
+ "\n",
+ "Yield metrics:\n",
+ " Train R²: 0.9990\n",
+ " Train RMSE: 1.0029\n",
+ " Test R²: 0.9865\n",
+ " Test RMSE: 3.7298\n",
+ "\n",
+ "⚠️ Note the suspiciously high test R² value!\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Create and train model without timeseries handling\n",
+ "# Use better priors for lengthscales and noise\n",
+ "model_spec_without_ts = MixedSingleTaskGPSurrogate(\n",
+ " inputs=domain_without_ts.inputs,\n",
+ " outputs=domain_without_ts.outputs,\n",
+ " continuous_kernel=MaternKernel(\n",
+ " ard=True, nu=2.5, lengthscale_prior=MBO_LENGTHSCALE_PRIOR()\n",
+ " ),\n",
+ " categorical_kernel=HammingDistanceKernel(ard=True),\n",
+ " noise_prior=MBO_NOISE_PRIOR(),\n",
+ ")\n",
+ "model_without_ts = surrogates.map(model_spec_without_ts)\n",
+ "\n",
+ "# Perform cross-validation (will use regular KFold, mixing time points from same experiments)\n",
+ "print(\"Performing cross-validation WITHOUT timeseries handling...\")\n",
+ "print(\n",
+ " \"This will use regular KFold, potentially splitting time points from the same experiment.\\n\"\n",
+ ")\n",
+ "\n",
+ "cv_train_no_ts, cv_test_no_ts, _ = model_without_ts.cross_validate(\n",
+ " experiments=kinetic_data,\n",
+ " folds=4,\n",
+ " random_state=42,\n",
+ ")\n",
+ "\n",
+ "# Get metrics for yield (RMSE = sqrt(MSD))\n",
+ "yield_metrics_no_ts = {\n",
+ " \"Train R²\": float(cv_train_no_ts.get_metric(RegressionMetricsEnum.R2).iloc[0]),\n",
+ " \"Train RMSE\": np.sqrt(\n",
+ " float(cv_train_no_ts.get_metric(RegressionMetricsEnum.MSD).iloc[0])\n",
+ " ),\n",
+ " \"Test R²\": float(cv_test_no_ts.get_metric(RegressionMetricsEnum.R2).iloc[0]),\n",
+ " \"Test RMSE\": np.sqrt(\n",
+ " float(cv_test_no_ts.get_metric(RegressionMetricsEnum.MSD).iloc[0])\n",
+ " ),\n",
+ "}\n",
+ "\n",
+ "print(\"\\n=== Results WITHOUT timeseries handling ===\")\n",
+ "print(\"\\nYield metrics:\")\n",
+ "for metric, value in yield_metrics_no_ts.items():\n",
+ " print(f\" {metric}: {value:.4f}\")\n",
+ "\n",
+ "print(\"\\n⚠️ Note the suspiciously high test R² value!\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 5. Cross-Validation WITH Timeseries Flag (Correct)\n",
+ "\n",
+ "Now let's use the correct approach with the `is_timeseries` flag. This will ensure that all time points from the same experiment stay together in either the training set or the test set."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Performing cross-validation WITH timeseries handling...\n",
+ "This will automatically detect the timeseries feature and use GroupKFold.\n",
+ "All time points from the same experiment will stay together.\n",
+ "\n",
+ "\n",
+ "=== Results WITH proper timeseries handling ===\n",
+ "\n",
+ "Yield metrics:\n",
+ " Train R²: 0.9991\n",
+ " Train RMSE: 0.9549\n",
+ " Test R²: 0.8544\n",
+ " Test RMSE: 12.2661\n",
+ "\n",
+ "✅ These are more realistic test metrics!\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Create and train model with proper timeseries handling\n",
+ "# Use better priors for lengthscales and noise\n",
+ "model_spec_with_ts = MixedSingleTaskGPSurrogate(\n",
+ " inputs=domain_with_ts.inputs,\n",
+ " outputs=domain_with_ts.outputs,\n",
+ " continuous_kernel=MaternKernel(\n",
+ " ard=True, nu=2.5, lengthscale_prior=MBO_LENGTHSCALE_PRIOR()\n",
+ " ),\n",
+ " categorical_kernel=HammingDistanceKernel(ard=True),\n",
+ " noise_prior=MBO_NOISE_PRIOR(),\n",
+ ")\n",
+ "model_with_ts = surrogates.map(model_spec_with_ts)\n",
+ "\n",
+ "# Perform cross-validation (will automatically use GroupKFold with _trajectory_id)\n",
+ "print(\"Performing cross-validation WITH timeseries handling...\")\n",
+ "print(\"This will automatically detect the timeseries feature and use GroupKFold.\")\n",
+ "print(\"All time points from the same experiment will stay together.\\n\")\n",
+ "\n",
+ "cv_train_ts, cv_test_ts, _ = model_with_ts.cross_validate(\n",
+ " experiments=kinetic_data,\n",
+ " folds=4,\n",
+ " random_state=42,\n",
+ ")\n",
+ "\n",
+ "# Get metrics for yield (RMSE = sqrt(MSD))\n",
+ "yield_metrics_ts = {\n",
+ " \"Train R²\": float(cv_train_ts.get_metric(RegressionMetricsEnum.R2).iloc[0]),\n",
+ " \"Train RMSE\": np.sqrt(\n",
+ " float(cv_train_ts.get_metric(RegressionMetricsEnum.MSD).iloc[0])\n",
+ " ),\n",
+ " \"Test R²\": float(cv_test_ts.get_metric(RegressionMetricsEnum.R2).iloc[0]),\n",
+ " \"Test RMSE\": np.sqrt(\n",
+ " float(cv_test_ts.get_metric(RegressionMetricsEnum.MSD).iloc[0])\n",
+ " ),\n",
+ "}\n",
+ "\n",
+ "print(\"\\n=== Results WITH proper timeseries handling ===\")\n",
+ "print(\"\\nYield metrics:\")\n",
+ "for metric, value in yield_metrics_ts.items():\n",
+ " print(f\" {metric}: {value:.4f}\")\n",
+ "\n",
+ "print(\"\\n✅ These are more realistic test metrics!\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 6. Compare Results: Data Leakage Analysis\n",
+ "\n",
+ "Let's compare the cross-validation results to understand the impact of data leakage."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create comparison dataframe\n",
+ "comparison_data = pd.DataFrame(\n",
+ " {\n",
+ " \"Metric\": [\"Train R²\", \"Test R²\", \"Train RMSE\", \"Test RMSE\"],\n",
+ " \"Without Timeseries\": [\n",
+ " yield_metrics_no_ts[\"Train R²\"],\n",
+ " yield_metrics_no_ts[\"Test R²\"],\n",
+ " yield_metrics_no_ts[\"Train RMSE\"],\n",
+ " yield_metrics_no_ts[\"Test RMSE\"],\n",
+ " ],\n",
+ " \"With Timeseries\": [\n",
+ " yield_metrics_ts[\"Train R²\"],\n",
+ " yield_metrics_ts[\"Test R²\"],\n",
+ " yield_metrics_ts[\"Train RMSE\"],\n",
+ " yield_metrics_ts[\"Test RMSE\"],\n",
+ " ],\n",
+ " }\n",
+ ")\n",
+ "\n",
+ "# Plot comparison\n",
+ "fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))\n",
+ "\n",
+ "# R² Comparison\n",
+ "ax = ax1\n",
+ "x = np.arange(2) # Train and Test\n",
+ "width = 0.35\n",
+ "\n",
+ "train_no_ts = yield_metrics_no_ts[\"Train R²\"]\n",
+ "test_no_ts = yield_metrics_no_ts[\"Test R²\"]\n",
+ "train_ts = yield_metrics_ts[\"Train R²\"]\n",
+ "test_ts = yield_metrics_ts[\"Test R²\"]\n",
+ "\n",
+ "# Bars for R²\n",
+ "bars1 = ax.bar(\n",
+ " x - width / 2,\n",
+ " [train_no_ts, test_no_ts],\n",
+ " width,\n",
+ " label=\"Without Timeseries\",\n",
+ " color=[\"lightblue\", \"lightcoral\"],\n",
+ " edgecolor=[\"darkblue\", \"darkred\"],\n",
+ " linewidth=2,\n",
+ ")\n",
+ "bars2 = ax.bar(\n",
+ " x + width / 2,\n",
+ " [train_ts, test_ts],\n",
+ " width,\n",
+ " label=\"With Timeseries\",\n",
+ " color=[\"lightgreen\", \"darkgreen\"],\n",
+ " edgecolor=\"darkgreen\",\n",
+ " linewidth=2,\n",
+ ")\n",
+ "\n",
+ "ax.set_ylabel(\"R² Score\")\n",
+ "ax.set_title(\"R² Comparison: Impact of Timeseries Handling\")\n",
+ "ax.set_xticks(x)\n",
+ "ax.set_xticklabels([\"Train\", \"Test\"])\n",
+ "ax.legend()\n",
+ "ax.grid(True, alpha=0.3)\n",
+ "ax.set_ylim([0, 1.05])\n",
+ "\n",
+ "# Add value labels on bars\n",
+ "for bars in [bars1, bars2]:\n",
+ " for bar in bars:\n",
+ " height = bar.get_height()\n",
+ " ax.text(\n",
+ " bar.get_x() + bar.get_width() / 2.0,\n",
+ " height,\n",
+ " f\"{height:.3f}\",\n",
+ " ha=\"center\",\n",
+ " va=\"bottom\",\n",
+ " fontsize=9,\n",
+ " )\n",
+ "\n",
+ "# RMSE Comparison\n",
+ "ax2 = ax2\n",
+ "train_rmse_no_ts = yield_metrics_no_ts[\"Train RMSE\"]\n",
+ "test_rmse_no_ts = yield_metrics_no_ts[\"Test RMSE\"]\n",
+ "train_rmse_ts = yield_metrics_ts[\"Train RMSE\"]\n",
+ "test_rmse_ts = yield_metrics_ts[\"Test RMSE\"]\n",
+ "\n",
+ "bars3 = ax2.bar(\n",
+ " x - width / 2,\n",
+ " [train_rmse_no_ts, test_rmse_no_ts],\n",
+ " width,\n",
+ " label=\"Without Timeseries\",\n",
+ " color=[\"lightblue\", \"lightcoral\"],\n",
+ " edgecolor=[\"darkblue\", \"darkred\"],\n",
+ " linewidth=2,\n",
+ ")\n",
+ "bars4 = ax2.bar(\n",
+ " x + width / 2,\n",
+ " [train_rmse_ts, test_rmse_ts],\n",
+ " width,\n",
+ " label=\"With Timeseries\",\n",
+ " color=[\"lightgreen\", \"darkgreen\"],\n",
+ " edgecolor=\"darkgreen\",\n",
+ " linewidth=2,\n",
+ ")\n",
+ "\n",
+ "ax2.set_ylabel(\"RMSE\")\n",
+ "ax2.set_title(\"RMSE Comparison: Impact of Timeseries Handling\")\n",
+ "ax2.set_xticks(x)\n",
+ "ax2.set_xticklabels([\"Train\", \"Test\"])\n",
+ "ax2.legend()\n",
+ "ax2.grid(True, alpha=0.3)\n",
+ "\n",
+ "# Add value labels on bars\n",
+ "for bars in [bars3, bars4]:\n",
+ " for bar in bars:\n",
+ " height = bar.get_height()\n",
+ " ax2.text(\n",
+ " bar.get_x() + bar.get_width() / 2.0,\n",
+ " height,\n",
+ " f\"{height:.2f}\",\n",
+ " ha=\"center\",\n",
+ " va=\"bottom\",\n",
+ " fontsize=9,\n",
+ " )\n",
+ "\n",
+ "plt.tight_layout()\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 7. Understanding Data Leakage\n",
+ "\n",
+ "Let's inspect exactly how data leakage occurs when we don't use proper timeseries handling, by printing which experiments end up in the train and test sets of a single fold."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "WITHOUT timeseries handling (regular KFold):\n",
+ "Train experiments: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]\n",
+ "Test experiments: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]\n",
+ "\n",
+ "🚨 Experiments split between train and test: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]\n",
+ "Number of leaked experiments: 16 out of 16\n",
+ "\n",
+ "Example - Experiment 0:\n",
+ " Total time points: 8\n",
+ " Time points in TRAIN: 6 - Times: ['14.8', '29.8', '44.1', '90.7', '119.3', '179.4']\n",
+ " Time points in TEST: 2 - Times: ['0.0', '60.2']\n",
+ "\n",
+ " ⚠️ The model sees some time points during training and predicts\n",
+ " neighboring time points during testing - this is cheating!\n",
+ "\n",
+ "============================================================\n",
+ "\n",
+ "WITH timeseries handling (GroupKFold with _trajectory_id):\n",
+ "Train experiments: [0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14]\n",
+ "Test experiments: [3, 7, 11, 15]\n",
+ "\n",
+ "✅ Experiments split between train and test: None\n",
+ "Number of leaked experiments: 0\n",
+ "\n",
+ "✅ Each experiment's complete kinetic profile stays together!\n"
+ ]
+ }
+ ],
+ "source": [
+ "# KFold comes from sklearn.model_selection (assumed to be imported in an earlier cell)\n",
+ "\n",
+ "\n",
+ "# Demonstrate data leakage with a single fold\n",
+ "kf = KFold(n_splits=4, shuffle=True, random_state=42)\n",
+ "train_idx, test_idx = next(kf.split(kinetic_data))\n",
+ "\n",
+ "# Check which experiments have points in both train and test\n",
+ "train_experiments = set(kinetic_data.iloc[train_idx][\"_trajectory_id\"])\n",
+ "test_experiments = set(kinetic_data.iloc[test_idx][\"_trajectory_id\"])\n",
+ "leaked_experiments = train_experiments.intersection(test_experiments)\n",
+ "\n",
+ "print(\"WITHOUT timeseries handling (regular KFold):\")\n",
+ "print(f\"Train experiments: {sorted(train_experiments)}\")\n",
+ "print(f\"Test experiments: {sorted(test_experiments)}\")\n",
+ "print(f\"\\n🚨 Experiments split between train and test: {sorted(leaked_experiments)}\")\n",
+ "print(\n",
+ " f\"Number of leaked experiments: {len(leaked_experiments)} out of {kinetic_data['_trajectory_id'].nunique()}\"\n",
+ ")\n",
+ "\n",
+ "# Show example of leakage for one experiment\n",
+ "if leaked_experiments:\n",
+ " example_exp = list(leaked_experiments)[0]\n",
+ " exp_data = kinetic_data[kinetic_data[\"_trajectory_id\"] == example_exp]\n",
+ " exp_train_idx = [i for i in train_idx if i in exp_data.index]\n",
+ " exp_test_idx = [i for i in test_idx if i in exp_data.index]\n",
+ "\n",
+ " train_times = sorted(exp_data.loc[exp_train_idx, \"time\"].values)\n",
+ " test_times = sorted(exp_data.loc[exp_test_idx, \"time\"].values)\n",
+ "\n",
+ " print(f\"\\nExample - Experiment {example_exp}:\")\n",
+ " print(f\" Total time points: {len(exp_data)}\")\n",
+ " print(\n",
+ " f\" Time points in TRAIN: {len(exp_train_idx)} - Times: {[f'{t:.1f}' for t in train_times]}\"\n",
+ " )\n",
+ " print(\n",
+ " f\" Time points in TEST: {len(exp_test_idx)} - Times: {[f'{t:.1f}' for t in test_times]}\"\n",
+ " )\n",
+ " print(\"\\n ⚠️ The model sees some time points during training and predicts\")\n",
+ " print(\" neighboring time points during testing - this is cheating!\")\n",
+ "\n",
+ "print(\"\\n\" + \"=\" * 60)\n",
+ "\n",
+ "# Now show correct handling\n",
+ "# GroupKFold comes from sklearn.model_selection (assumed to be imported in an earlier cell)\n",
+ "\n",
+ "\n",
+ "gkf = GroupKFold(n_splits=4)\n",
+ "train_idx_correct, test_idx_correct = next(\n",
+ " gkf.split(kinetic_data, groups=kinetic_data[\"_trajectory_id\"])\n",
+ ")\n",
+ "\n",
+ "train_experiments_correct = set(kinetic_data.iloc[train_idx_correct][\"_trajectory_id\"])\n",
+ "test_experiments_correct = set(kinetic_data.iloc[test_idx_correct][\"_trajectory_id\"])\n",
+ "leaked_experiments_correct = train_experiments_correct.intersection(\n",
+ " test_experiments_correct\n",
+ ")\n",
+ "\n",
+ "print(\"\\nWITH timeseries handling (GroupKFold with _trajectory_id):\")\n",
+ "print(f\"Train experiments: {sorted(train_experiments_correct)}\")\n",
+ "print(f\"Test experiments: {sorted(test_experiments_correct)}\")\n",
+ "print(\n",
+ " f\"\\n✅ Experiments split between train and test: {sorted(leaked_experiments_correct) if leaked_experiments_correct else 'None'}\"\n",
+ ")\n",
+ "print(f\"Number of leaked experiments: {len(leaked_experiments_correct)}\")\n",
+ "print(\"\\n✅ Each experiment's complete kinetic profile stays together!\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 8. Key Takeaways\n",
+ "\n",
+ "### 🔑 **Critical Insights:**\n",
+ "\n",
+ "1. **Data Leakage in Timeseries**: Without proper handling, cross-validation splits time points from the same experiment between train and test sets. The model learns to interpolate between neighboring time points rather than truly predicting new experiments.\n",
+ "\n",
+ "2. **Overly Optimistic Metrics**: The test R² without timeseries handling is artificially high because the model has seen other time points from the same experiments during training.\n",
+ "\n",
+ "3. **Proper Validation**: Using the `is_timeseries` flag ensures that complete kinetic profiles stay together, providing realistic performance estimates.\n",
+ "\n",
+ "### 📋 **Best Practices for Kinetic/Timeseries Data:**\n",
+ "\n",
+ "- Always mark your time feature with `is_timeseries=True`\n",
+ "- Include a `_trajectory_id` column to identify which measurements belong to the same experiment\n",
+ "- Use GroupKFold (the default) or GroupShuffleSplit (`use_shuffle_split=True`) for cross-validation\n",
+ "- Be skeptical of suspiciously high test metrics - they might indicate data leakage!\n",
+ "\n",
+ "### 🎯 **When to Use Timeseries Features:**\n",
+ "\n",
+ "- Kinetic measurements (reaction progress over time)\n",
+ "- Process monitoring (temperature, pressure profiles)\n",
+ "- Batch processes (fermentation, crystallization)\n",
+ "- Any sequential measurements where order and grouping matter"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 9. Bonus: Using GroupShuffleSplit for Flexible Test Size\n",
+ "\n",
+ "Sometimes you might want more control over your test set size. Here's how to use GroupShuffleSplit:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Using GroupShuffleSplit for flexible test set size...\n",
+ "\n",
+ "GroupShuffleSplit Results (5 random 80/20 splits):\n",
+ "Yield Test R²: 0.8629\n",
+ "Yield Test RMSE: 12.7704\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Cross-validation with GroupShuffleSplit\n",
+ "print(\"Using GroupShuffleSplit for flexible test set size...\\n\")\n",
+ "\n",
+ "cv_train_shuffle, cv_test_shuffle, _ = model_with_ts.cross_validate(\n",
+ " experiments=kinetic_data,\n",
+ " folds=5, # Number of random splits\n",
+ " use_shuffle_split=True, # Use GroupShuffleSplit instead of GroupKFold\n",
+ " random_state=42,\n",
+ ")\n",
+ "\n",
+ "print(\"GroupShuffleSplit Results (5 random 80/20 splits):\")\n",
+ "print(\n",
+ " f\"Yield Test R²: {float(cv_test_shuffle.get_metric(RegressionMetricsEnum.R2).iloc[0]):.4f}\"\n",
+ ")\n",
+ "print(\n",
+ " f\"Yield Test RMSE: {np.sqrt(float(cv_test_shuffle.get_metric(RegressionMetricsEnum.MSD).iloc[0])):.4f}\"\n",
+ ")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}