experimental-design
diff --git a/‎bofire/strategies/doe/utils_categorical_discrete.py‎
Lines changed: 52 additions & 0 deletions b/‎bofire/strategies/doe/utils_categorical_discrete.py‎
Lines changed: 52 additions & 0 deletions
diff --git a/‎bofire/strategies/doe_strategy.py‎
Lines changed: 119 additions & 28 deletions b/‎bofire/strategies/doe_strategy.py‎
Lines changed: 119 additions & 28 deletions
diff --git a/‎bofire/strategies/strategy.py‎
Lines changed: 3 additions & 0 deletions b/‎bofire/strategies/strategy.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎docs/tutorials/advanced_examples/genetic_algorithm.qmd‎
Lines changed: 1 addition & 1 deletion b/‎docs/tutorials/advanced_examples/genetic_algorithm.qmd‎
Lines changed: 1 addition & 1 deletion
@@ -415,3 +415,55 @@ def upper_bound(w, x):
         ),
         columns=columns,
     )
+
+
+def encode_candidates_to_relaxed_domain(
+    candidates: pd.DataFrame,
+    mappings_categorical_var_key_to_aux_var_key_state_pairs: Dict[str, Dict[str, str]],
+    mapping_discrete_input_to_discrete_aux: Dict[str, List[str]],
+    domain: Domain,
+) -> pd.DataFrame:
+    """
+    Encode candidates from the original domain to the relaxed domain with proper auxiliary variable values.
+
+    For categorical inputs, creates one-hot encoding with aux_{key}_{category} naming.
+    For discrete inputs, creates auxiliary variables with values based on the discrete input value.
+    Continuous inputs are kept as-is.
+
+    Args:
+        candidates: DataFrame with candidates in original domain (e.g., x1=5, cat='A')
+        mappings_categorical_var_key_to_aux_var_key_state_pairs: Mapping from categorical keys to
+            their auxiliary variable names and category values
+        mapping_discrete_input_to_discrete_aux: Mapping from discrete input keys to their
+            auxiliary variable names
+        domain: The original (non-relaxed) domain
+
+    Returns:
+        DataFrame with candidates in relaxed domain (e.g., x1=5, aux_cat_A=1, aux_cat_B=0)
+    """
+    encoded = candidates.copy()
+
+    # Encode categorical inputs
+    for (
+        cat_key,
+        aux_mapping,
+    ) in mappings_categorical_var_key_to_aux_var_key_state_pairs.items():
+        if cat_key in encoded.columns:
+            # For each category, create a column with 1 if that category is active, 0 otherwise
+            for aux_key, category_value in aux_mapping.items():
+                encoded[aux_key] = (encoded[cat_key] == category_value).astype(float)
+
+    # Encode discrete inputs
+    for discrete_key, aux_keys in mapping_discrete_input_to_discrete_aux.items():
+        if discrete_key in encoded.columns:
+            discrete_input = domain.inputs.get_by_key(discrete_key)
+            assert isinstance(discrete_input, DiscreteInput)
+
+            # For each allowed discrete value, create an auxiliary variable
+            for aux_key, discrete_value in zip(aux_keys, discrete_input.values):
+                # Set to 1 if the discrete input equals this value, 0 otherwise
+                encoded[aux_key] = (encoded[discrete_key] == discrete_value).astype(
+                    float
+                )
+
+    return encoded
@@ -1,6 +1,7 @@
 from typing import Dict, List, Optional, cast
 
 import pandas as pd
+import torch
 from pydantic.types import PositiveInt
 from typing_extensions import Self
 
@@ -13,10 +14,11 @@
     DoEOptimalityCriterion,
 )
 from bofire.strategies.doe.design import find_local_max_ipopt, get_n_experiments
-from bofire.strategies.doe.objective import get_objective_function
+from bofire.strategies.doe.objective import ModelBasedObjective, get_objective_function
 from bofire.strategies.doe.utils import get_formula_from_string, n_zero_eigvals
 from bofire.strategies.doe.utils_categorical_discrete import (
     create_continuous_domain,
+    encode_candidates_to_relaxed_domain,
     filter_out_categorical_and_categorical_auxilliary_vars,
     filter_out_discrete_auxilliary_vars,
     project_candidates_into_domain,
@@ -45,31 +47,35 @@ def __init__(
             if self._data_model.sampling is not None
             else None
         )
-        self._return_fixed_candidates = data_model.return_fixed_candidates
+        self._return_fixed_candidates = (
+            data_model.return_fixed_candidates
+        )  # this defaults to False in the data model
+        # DoE optimization has larger numerical errors (~1e-4) due to SCIP solver precision
+        self._validation_tol = 1e-4
 
     def set_candidates(self, candidates: pd.DataFrame):
         original_columns = self.domain.inputs.get_keys(includes=Input)
-        to_many_columns = []
+        too_many_columns = []
         for col in candidates.columns:
             if col not in original_columns:
-                to_many_columns.append(col)
-        if len(to_many_columns) > 0:
+                too_many_columns.append(col)
+        if len(too_many_columns) > 0:
             raise AttributeError(
-                f"provided candidates have columns: {(*to_many_columns,)},  which do not exist in original domain",
+                f"provided candidates have columns: {(*too_many_columns,)},  which do not exist in original domain",
             )
 
-        to_few_columns = []
+        too_few_columns = []
         for col in original_columns:
             if col not in candidates.columns:
-                to_few_columns.append(col)
-        if len(to_few_columns) > 0:
+                too_few_columns.append(col)
+        if len(too_few_columns) > 0:
             raise AttributeError(
-                f"provided candidates are missing columns: {(*to_few_columns,)} which exist in original domain",
+                f"provided candidates are missing columns: {(*too_few_columns,)} which exist in original domain",
             )
 
         self._candidates = candidates
 
-    def _ask(self, candidate_count: PositiveInt) -> pd.DataFrame:  # type: ignore
+    def _ask(self, candidate_count: PositiveInt) -> pd.DataFrame:  # type: ignore # due to inheriting from Strategy, we then later call this using self.candidates
         (
             relaxed_domain,
             mappings_categorical_var_key_to_aux_var_key_state_pairs,
@@ -78,35 +84,41 @@ def _ask(self, candidate_count: PositiveInt) -> pd.DataFrame:  # type: ignore
             mapped_aux_categorical_inputs,
             mapped_continous_inputs,
         ) = create_continuous_domain(domain=self.domain)
-        fixed_experiments_count = 0
-        _candidate_count = candidate_count
-        if self.candidates is not None:
-            adapted_partially_fixed_candidates = (
-                self._transform_candidates_to_new_domain(
-                    relaxed_domain,
-                    self.candidates,
-                )
+
+        # if you have fixed experiments (so-called _candidates), you need to relax them and add them to the total number of experiments
+        if self.candidates is not None:  # aka if self._candidates is not None
+            # transform candidates to new domain
+            relaxed_candidates = self._transform_candidates_to_new_domain(
+                relaxed_domain,
+                self.candidates,
             )
-        else:
-            adapted_partially_fixed_candidates = None
-        if self.candidates is not None:
             fixed_experiments_count = self.candidates.notnull().all(axis=1).sum()
-            _candidate_count = candidate_count + fixed_experiments_count
+        else:
+            relaxed_candidates = None
+            fixed_experiments_count = 0
+
+        # total number of experiments that will go into the design
+        _total_count = candidate_count + fixed_experiments_count
+
         objective_function = get_objective_function(
             self._data_model.criterion,
             domain=relaxed_domain,
-            n_experiments=_candidate_count,
+            n_experiments=_total_count,
             inputs_for_formula=self.domain.inputs,
         )
         assert objective_function is not None, "Criterion type is not supported!"
+
         design = find_local_max_ipopt(
             relaxed_domain,
-            fixed_experiments=None,
-            partially_fixed_experiments=adapted_partially_fixed_candidates,
+            fixed_experiments=None,  # effectively deprecated, but others use it so we have not removed it yet
+            partially_fixed_experiments=relaxed_candidates,  # technically fixed experiments are also partially_fixed, so we only use this
             ipopt_options=self._data_model.ipopt_options,
             objective_function=objective_function,
         )
+
+        # if categorical or discrete inputs are present, filter out all the aux vars and project back into the original domain
         if len(self.domain.inputs.get([DiscreteInput, CategoricalInput])) > 0:
+            # deal with the categoricals first
             design_no_categoricals, design_categoricals = (
                 filter_out_categorical_and_categorical_auxilliary_vars(
                     design,
@@ -139,7 +151,7 @@ def _ask(self, candidate_count: PositiveInt) -> pd.DataFrame:  # type: ignore
                     aux_vars_for_discrete=aux_vars_for_discrete,
                 )
                 design = pd.concat([design, design_categoricals], axis=1)
-        if self._return_fixed_candidates:
+        if self._return_fixed_candidates:  # this is asking if the fixed candidates should be returned together with the new ones, or just the new ones. Default just the new ones.
             fixed_experiments_count = 0
         return design.iloc[fixed_experiments_count:, :].reset_index(
             drop=True,
@@ -148,7 +160,7 @@ def _ask(self, candidate_count: PositiveInt) -> pd.DataFrame:  # type: ignore
     def get_required_number_of_experiments(self) -> Optional[int]:
         if isinstance(self._data_model.criterion, DoEOptimalityCriterion):
             if self.domain.inputs.get([DiscreteInput, CategoricalInput]):
-                _domain, _, _, _, _, _ = create_continuous_domain(domain=self.domain)
+                _domain, *_ = create_continuous_domain(domain=self.domain)
             else:
                 _domain = self.domain
             formula = get_formula_from_string(
@@ -162,6 +174,85 @@ def get_required_number_of_experiments(self) -> Optional[int]:
                 f"Only {AnyDoEOptimalityCriterion} type have required number of experiments."
             )
 
+    def get_candidate_rank(self) -> int:
+        """Get the rank of the model matrix with the current candidates."""
+        if self.candidates is None:
+            return 0
+
+        # Only works for DoEOptimalityCriterion (model-based criteria)
+        if not isinstance(self._data_model.criterion, DoEOptimalityCriterion):
+            raise ValueError(
+                "get_candidate_rank() only works with DoEOptimalityCriterion"
+            )
+
+        # Step 1: get_relaxed_domain(original_domain)
+        (
+            relaxed_domain,
+            mappings_categorical_var_key_to_aux_var_key_state_pairs,
+            mapping_discrete_input_to_discrete_aux,
+            aux_vars_for_discrete,
+            mapped_aux_categorical_inputs,
+            mapped_continous_inputs,
+        ) = create_continuous_domain(domain=self.domain)
+
+        # Step 2: Properly encode candidates to relaxed domain
+        relaxed_candidates = encode_candidates_to_relaxed_domain(
+            candidates=self.candidates,
+            mappings_categorical_var_key_to_aux_var_key_state_pairs=mappings_categorical_var_key_to_aux_var_key_state_pairs,
+            mapping_discrete_input_to_discrete_aux=mapping_discrete_input_to_discrete_aux,
+            domain=self.domain,
+        )
+
+        # Step 3: get_objective_function (combines model + objective)
+        n_candidates = len(self.candidates)
+        objective_function = get_objective_function(
+            criterion=self._data_model.criterion,
+            domain=relaxed_domain,
+            n_experiments=n_candidates,
+            inputs_for_formula=self.domain.inputs,
+        )
+
+        # Step 4 & 5: Combined tensor_to_model_matrix + rank calculation
+        if isinstance(objective_function, ModelBasedObjective):
+            # Ensure we only use columns that match the relaxed domain inputs
+            expected_columns = relaxed_domain.inputs.get_keys()
+            relaxed_candidates_clean = relaxed_candidates[expected_columns]
+
+            # Convert to tensor
+            candidates_tensor = torch.tensor(
+                relaxed_candidates_clean.to_numpy(), dtype=torch.float64
+            )
+
+            # Get candidate model matrix using objective
+            candidates_model_matrix = objective_function.tensor_to_model_matrix(
+                candidates_tensor
+            )
+
+            model_matrix_rank = torch.linalg.matrix_rank(candidates_model_matrix).item()
+
+            return model_matrix_rank
+
+        else:
+            raise ValueError(
+                "Only ModelBasedObjective supports Fisher Information Matrix rank calculation"
+            )
+
+    def get_additional_experiments_needed(self) -> Optional[int]:
+        """Calculate the additional number of experiments needed beyond current candidates.
+        This method computes: get_required_number_of_experiments() - get_candidate_rank()
+
+        Returns:
+            Optional[int]: Number of additional experiments needed, or None if required number
+                          cannot be calculated (e.g., for SpaceFillingCriterion).
+        """
+        required_experiments = self.get_required_number_of_experiments()
+        if required_experiments is None:
+            return None
+
+        candidate_rank = self.get_candidate_rank()
+        difference = required_experiments - candidate_rank
+        return difference
+
     def has_sufficient_experiments(
         self,
     ) -> bool:
 
@@ -38,6 +38,8 @@ def __init__(
         self.seed_seq = np.random.SeedSequence(seed)
         self._experiments = None
         self._candidates = None
+        # Default validation tolerance - subclasses can override this
+        self._validation_tol = 1e-5
 
     @property
     def domain(self) -> Domain:
@@ -159,6 +161,7 @@ def ask(
         self.domain.validate_candidates(
             candidates=candidates,
             only_inputs=True,
+            tol=self._validation_tol,
             raise_validation_error=raise_validation_error,
         )
 
 
@@ -84,7 +84,7 @@ def get_proposals(domain, n: int = 10) -> pd.DataFrame:
     strategy = strategies.map(strategy)
     strategy.tell(experiments)
     t0 = time()
-    proposals = strategy.ask(n)
+    proposals = strategy.ask(n, raise_validation_error=False)
     print(f"Generated {len(proposals)} experiments, Time taken: {time() - t0:.2f}s")
     return proposals
 ```