Skip to content

Commit d304449

Browse files
authored
675 doe_strategy additional experiments required using candidate rank (#684)
* Core new functionality on doe_strategy to calculate candidate rank and use that to get amended experiment number (in new method) * added a test for #631 functionality tensor_to_model_matrix, then one for to get categorical and discrete handling on the matrix level * was mis-using _transform_candidates_to_new_domain, which does not do as its name suggests, but fills in NaNs for the aux var. Created a new function in utils for what I need (to convert candidates to new domain) and used that in the rank function * fixed some naming that is confusing for internal variables * tests for everything * generated a tutorial Some flaky test fixing and flaky tutorial fixing : * loosening scip params to match downstream validation logic * loosening constraints to match candidate validation, should fix flaky test * trying to fix the tolerance at the strategy and doe_strategy level. Flaky tests * proposals = strategy.ask(n, raise_validation_error=False) to fix failing tutorial on genetic algorithm * handling some botorch warnings. Outdated tests used aEHVI and qNEHVI while the core methods had already been switched to use qLogEHVI and qLogNEHVI * fixed flaky test, test_multitask_gps. test was flaky because random sampling could produce training data with only one task, causing BoTorch's MultiTaskGP to fail when predicting on the other task. Now the test explicitly samples 5 experiments from each task to ensure both tasks are always present in the training data. --------- Co-authored-by: eldredr1 <rosonaeldred@users.noreply.github.com>
1 parent a3a1aec commit d304449

File tree

10 files changed

+818
-57
lines changed

10 files changed

+818
-57
lines changed

bofire/strategies/doe/utils_categorical_discrete.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -415,3 +415,55 @@ def upper_bound(w, x):
415415
),
416416
columns=columns,
417417
)
418+
419+
420+
def encode_candidates_to_relaxed_domain(
    candidates: pd.DataFrame,
    mappings_categorical_var_key_to_aux_var_key_state_pairs: Dict[str, Dict[str, str]],
    mapping_discrete_input_to_discrete_aux: Dict[str, List[str]],
    domain: Domain,
) -> pd.DataFrame:
    """Add the auxiliary indicator columns of the relaxed domain to candidates.

    The input frame is copied and never modified. All original columns are
    kept; one float column (1.0 / 0.0) is added per auxiliary variable.

    Args:
        candidates: Candidates expressed in the original domain
            (e.g. x1=5, cat='A').
        mappings_categorical_var_key_to_aux_var_key_state_pairs: For every
            categorical input key, a dict mapping each auxiliary column name
            to the category that column stands for.
        mapping_discrete_input_to_discrete_aux: For every discrete input key,
            the list of auxiliary column names. They are paired positionally
            with the ``values`` of the discrete input.
        domain: The original (non-relaxed) domain. It is only consulted to
            look up the allowed values of discrete inputs.

    Returns:
        A new DataFrame holding the original columns plus the auxiliary
        columns (e.g. x1=5, cat='A', aux_cat_A=1.0, aux_cat_B=0.0).
    """
    relaxed = candidates.copy()

    # Categorical inputs: one indicator column per (aux column, category) pair.
    categorical_items = mappings_categorical_var_key_to_aux_var_key_state_pairs.items()
    for categorical_key, state_by_aux_key in categorical_items:
        if categorical_key not in relaxed.columns:
            # Mapped inputs that the candidates do not contain are skipped.
            continue
        for aux_key, state in state_by_aux_key.items():
            is_active = relaxed[categorical_key] == state
            relaxed[aux_key] = is_active.astype(float)

    # Discrete inputs: one indicator column per allowed value.
    for key, aux_keys in mapping_discrete_input_to_discrete_aux.items():
        if key not in relaxed.columns:
            continue
        feature = domain.inputs.get_by_key(key)
        assert isinstance(feature, DiscreteInput)

        # zip pairs aux columns and allowed values by position and stops at
        # the shorter of the two sequences.
        for aux_key, allowed_value in zip(aux_keys, feature.values):
            matches = relaxed[key] == allowed_value
            relaxed[aux_key] = matches.astype(float)

    return relaxed

bofire/strategies/doe_strategy.py

Lines changed: 119 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from typing import Dict, List, Optional, cast
22

33
import pandas as pd
4+
import torch
45
from pydantic.types import PositiveInt
56
from typing_extensions import Self
67

@@ -13,10 +14,11 @@
1314
DoEOptimalityCriterion,
1415
)
1516
from bofire.strategies.doe.design import find_local_max_ipopt, get_n_experiments
16-
from bofire.strategies.doe.objective import get_objective_function
17+
from bofire.strategies.doe.objective import ModelBasedObjective, get_objective_function
1718
from bofire.strategies.doe.utils import get_formula_from_string, n_zero_eigvals
1819
from bofire.strategies.doe.utils_categorical_discrete import (
1920
create_continuous_domain,
21+
encode_candidates_to_relaxed_domain,
2022
filter_out_categorical_and_categorical_auxilliary_vars,
2123
filter_out_discrete_auxilliary_vars,
2224
project_candidates_into_domain,
@@ -45,31 +47,35 @@ def __init__(
4547
if self._data_model.sampling is not None
4648
else None
4749
)
48-
self._return_fixed_candidates = data_model.return_fixed_candidates
50+
self._return_fixed_candidates = (
51+
data_model.return_fixed_candidates
52+
) # this defaults to False in the data model
53+
# DoE optimization has larger numerical errors (~1e-4) due to SCIP solver precision
54+
self._validation_tol = 1e-4
4955

5056
def set_candidates(self, candidates: pd.DataFrame):
5157
original_columns = self.domain.inputs.get_keys(includes=Input)
52-
to_many_columns = []
58+
too_many_columns = []
5359
for col in candidates.columns:
5460
if col not in original_columns:
55-
to_many_columns.append(col)
56-
if len(to_many_columns) > 0:
61+
too_many_columns.append(col)
62+
if len(too_many_columns) > 0:
5763
raise AttributeError(
58-
f"provided candidates have columns: {(*to_many_columns,)}, which do not exist in original domain",
64+
f"provided candidates have columns: {(*too_many_columns,)}, which do not exist in original domain",
5965
)
6066

61-
to_few_columns = []
67+
too_few_columns = []
6268
for col in original_columns:
6369
if col not in candidates.columns:
64-
to_few_columns.append(col)
65-
if len(to_few_columns) > 0:
70+
too_few_columns.append(col)
71+
if len(too_few_columns) > 0:
6672
raise AttributeError(
67-
f"provided candidates are missing columns: {(*to_few_columns,)} which exist in original domain",
73+
f"provided candidates are missing columns: {(*too_few_columns,)} which exist in original domain",
6874
)
6975

7076
self._candidates = candidates
7177

72-
def _ask(self, candidate_count: PositiveInt) -> pd.DataFrame: # type: ignore
78+
def _ask(self, candidate_count: PositiveInt) -> pd.DataFrame: # type: ignore # due to inheriting from Strategy, we then later call this using self.candidates
7379
(
7480
relaxed_domain,
7581
mappings_categorical_var_key_to_aux_var_key_state_pairs,
@@ -78,35 +84,41 @@ def _ask(self, candidate_count: PositiveInt) -> pd.DataFrame: # type: ignore
7884
mapped_aux_categorical_inputs,
7985
mapped_continous_inputs,
8086
) = create_continuous_domain(domain=self.domain)
81-
fixed_experiments_count = 0
82-
_candidate_count = candidate_count
83-
if self.candidates is not None:
84-
adapted_partially_fixed_candidates = (
85-
self._transform_candidates_to_new_domain(
86-
relaxed_domain,
87-
self.candidates,
88-
)
87+
88+
# if you have fixed experiments, so-called _candidates, you need to relax them and add them to the total number of experiments
89+
if self.candidates is not None: # aka if self._candidates is not None
90+
# transform candidates to new domain
91+
relaxed_candidates = self._transform_candidates_to_new_domain(
92+
relaxed_domain,
93+
self.candidates,
8994
)
90-
else:
91-
adapted_partially_fixed_candidates = None
92-
if self.candidates is not None:
9395
fixed_experiments_count = self.candidates.notnull().all(axis=1).sum()
94-
_candidate_count = candidate_count + fixed_experiments_count
96+
else:
97+
relaxed_candidates = None
98+
fixed_experiments_count = 0
99+
100+
# total number of experiments that will go into the design
101+
_total_count = candidate_count + fixed_experiments_count
102+
95103
objective_function = get_objective_function(
96104
self._data_model.criterion,
97105
domain=relaxed_domain,
98-
n_experiments=_candidate_count,
106+
n_experiments=_total_count,
99107
inputs_for_formula=self.domain.inputs,
100108
)
101109
assert objective_function is not None, "Criterion type is not supported!"
110+
102111
design = find_local_max_ipopt(
103112
relaxed_domain,
104-
fixed_experiments=None,
105-
partially_fixed_experiments=adapted_partially_fixed_candidates,
113+
fixed_experiments=None, # effectively deprecated, but others use it so we have not removed it yet
114+
partially_fixed_experiments=relaxed_candidates, # technically fixed experiments are also partially_fixed, so we only use this
106115
ipopt_options=self._data_model.ipopt_options,
107116
objective_function=objective_function,
108117
)
118+
119+
# if categorical or discrete variables are present, filter out all the aux vars and project back into the original domain
109120
if len(self.domain.inputs.get([DiscreteInput, CategoricalInput])) > 0:
121+
# deal with the categoricals first
110122
design_no_categoricals, design_categoricals = (
111123
filter_out_categorical_and_categorical_auxilliary_vars(
112124
design,
@@ -139,7 +151,7 @@ def _ask(self, candidate_count: PositiveInt) -> pd.DataFrame: # type: ignore
139151
aux_vars_for_discrete=aux_vars_for_discrete,
140152
)
141153
design = pd.concat([design, design_categoricals], axis=1)
142-
if self._return_fixed_candidates:
154+
if self._return_fixed_candidates: # this is asking if the fixed candidates should be returned together with the new ones, or just the new ones. Default just the new ones.
143155
fixed_experiments_count = 0
144156
return design.iloc[fixed_experiments_count:, :].reset_index(
145157
drop=True,
@@ -148,7 +160,7 @@ def _ask(self, candidate_count: PositiveInt) -> pd.DataFrame: # type: ignore
148160
def get_required_number_of_experiments(self) -> Optional[int]:
149161
if isinstance(self._data_model.criterion, DoEOptimalityCriterion):
150162
if self.domain.inputs.get([DiscreteInput, CategoricalInput]):
151-
_domain, _, _, _, _, _ = create_continuous_domain(domain=self.domain)
163+
_domain, *_ = create_continuous_domain(domain=self.domain)
152164
else:
153165
_domain = self.domain
154166
formula = get_formula_from_string(
@@ -162,6 +174,85 @@ def get_required_number_of_experiments(self) -> Optional[int]:
162174
f"Only {AnyDoEOptimalityCriterion} type have required number of experiments."
163175
)
164176

177+
def get_candidate_rank(self) -> int:
    """Return the rank of the model matrix built from the fixed candidates.

    Only fully specified candidate rows (no missing value in any column) are
    used. This matches ``_ask``, which counts only such rows as fixed
    experiments. Rows that still contain missing values are ignored here.

    Returns:
        0 if no candidates are set or none of them is fully specified;
        otherwise the rank of the model matrix evaluated at the fully
        specified candidates.

    Raises:
        ValueError: If the criterion is not a ``DoEOptimalityCriterion`` or
            the resulting objective is not a ``ModelBasedObjective``.
    """
    candidates = self.candidates
    if candidates is None:
        return 0

    # Rows with missing entries would otherwise be encoded as all-zero
    # auxiliary columns and put NaN into the tensor below.
    is_complete = candidates.notnull().all(axis=1)
    complete_candidates = candidates.loc[is_complete]
    if complete_candidates.empty:
        return 0

    # Only works for DoEOptimalityCriterion (model-based criteria).
    if not isinstance(self._data_model.criterion, DoEOptimalityCriterion):
        raise ValueError(
            "get_candidate_rank() only works with DoEOptimalityCriterion"
        )

    # Relax the domain; only the first three return values are needed here.
    (
        relaxed_domain,
        categorical_mappings,
        discrete_mappings,
        *_,
    ) = create_continuous_domain(domain=self.domain)

    # Express the candidates in the relaxed domain (adds auxiliary columns).
    relaxed_candidates = encode_candidates_to_relaxed_domain(
        candidates=complete_candidates,
        mappings_categorical_var_key_to_aux_var_key_state_pairs=categorical_mappings,
        mapping_discrete_input_to_discrete_aux=discrete_mappings,
        domain=self.domain,
    )

    objective_function = get_objective_function(
        criterion=self._data_model.criterion,
        domain=relaxed_domain,
        n_experiments=len(complete_candidates),
        inputs_for_formula=self.domain.inputs,
    )
    if not isinstance(objective_function, ModelBasedObjective):
        raise ValueError(
            "Only ModelBasedObjective supports Fisher Information Matrix rank calculation"
        )

    # Keep exactly the relaxed-domain input columns, in the domain's key order.
    expected_columns = relaxed_domain.inputs.get_keys()
    candidates_tensor = torch.tensor(
        relaxed_candidates[expected_columns].to_numpy(),
        dtype=torch.float64,
    )

    candidates_model_matrix = objective_function.tensor_to_model_matrix(
        candidates_tensor
    )
    return int(torch.linalg.matrix_rank(candidates_model_matrix).item())
239+
240+
def get_additional_experiments_needed(self) -> Optional[int]:
    """Return how many experiments are still needed beyond the candidates.

    The result is ``get_required_number_of_experiments()`` minus
    ``get_candidate_rank()``.

    Returns:
        Optional[int]: The difference described above, or None when
            ``get_required_number_of_experiments()`` returns None. In that
            case the candidate rank is not computed at all.
    """
    n_required = self.get_required_number_of_experiments()
    return None if n_required is None else n_required - self.get_candidate_rank()
255+
165256
def has_sufficient_experiments(
166257
self,
167258
) -> bool:

bofire/strategies/strategy.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ def __init__(
3838
self.seed_seq = np.random.SeedSequence(seed)
3939
self._experiments = None
4040
self._candidates = None
41+
# Default validation tolerance - subclasses can override this
42+
self._validation_tol = 1e-5
4143

4244
@property
4345
def domain(self) -> Domain:
@@ -159,6 +161,7 @@ def ask(
159161
self.domain.validate_candidates(
160162
candidates=candidates,
161163
only_inputs=True,
164+
tol=self._validation_tol,
162165
raise_validation_error=raise_validation_error,
163166
)
164167

docs/tutorials/advanced_examples/genetic_algorithm.qmd

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ def get_proposals(domain, n: int = 10) -> pd.DataFrame:
8484
strategy = strategies.map(strategy)
8585
strategy.tell(experiments)
8686
t0 = time()
87-
proposals = strategy.ask(n)
87+
proposals = strategy.ask(n, raise_validation_error=False)
8888
print(f"Generated {len(proposals)} experiments, Time taken: {time() - t0:.2f}s")
8989
return proposals
9090
```

0 commit comments

Comments
 (0)