Merged
129 changes: 124 additions & 5 deletions docs/usage.md
Original file line number Diff line number Diff line change
@@ -19,16 +19,59 @@ dataset = Dataset(data_path)

# Load data for specified models and properties
data_dict = dataset.load_data(
models=["FRDM12", "HFB24", "D1M",
"UNEDF1", "BCPM", "AME2020"],
models=["FRDM12", "HFB24", "D1M", "UNEDF1", "BCPM", "AME2020"],
keys=["BE"],
domain_keys=["N", "Z"],
truth_column_name="AME2020" # Specify which model is the truth data
)
```

!!! note "Truth Data with Smaller Domain"
The `truth_column_name` parameter allows the truth/experimental data to have a smaller domain than the prediction models. When specified:

- Prediction models are inner-joined to find their common domain
- Truth data is left-joined, allowing it to have fewer points
- Domain points without truth data will have NaN values in the truth column

This enables training on available experimental data while making predictions across the full model domain.
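
The join behavior described in the note can be sketched with plain pandas. This is a toy illustration of the inner-join/left-join pattern, not pybmc internals; the model names match the tutorial but the values are made up:

```python
import pandas as pd

# Two "prediction models" sharing a domain of three (N, Z) points,
# plus a truth set (AME2020) that only covers two of them.
frdm = pd.DataFrame({"N": [8, 9, 10], "Z": [8, 8, 8], "FRDM12": [1.0, 1.1, 1.2]})
hfb = pd.DataFrame({"N": [8, 9, 10], "Z": [8, 8, 8], "HFB24": [1.05, 1.15, 1.25]})
truth = pd.DataFrame({"N": [8, 9], "Z": [8, 8], "AME2020": [1.02, 1.12]})

# Prediction models are inner-joined on the domain keys.
common = pd.merge(frdm, hfb, on=["N", "Z"], how="inner")

# Truth data is left-joined, so the N=10 row keeps a NaN in AME2020.
combined = pd.merge(common, truth, on=["N", "Z"], how="left")
print(combined)
print(combined["AME2020"].isna().sum())  # 1 point without truth data
```

The left join is what lets the combined DataFrame retain the full model domain while flagging missing experimental coverage with NaN.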

### Alternative: Traditional Loading (All Models Share Domain)

If you want all models to share the same domain, simply omit the `truth_column_name` parameter:

```python
# All models must have data at the same domain points
data_dict = dataset.load_data(
models=["FRDM12", "HFB24", "D1M", "UNEDF1", "BCPM"],
keys=["BE"],
domain_keys=["N", "Z"]
)
```

## 2. Split the Data

Next, we split the data into training, validation, and test sets. `pybmc` supports random splitting as shown below.

!!! tip "Training with Smaller Truth Domain"
When using `truth_column_name`, only rows where truth data is available (non-NaN) should be used for training. You can filter the data like this:

```python
# Filter to only include rows where truth data is available
df_with_truth = data_dict["BE"][data_dict["BE"]["AME2020"].notna()]

# Split only the data with truth values
train_df, val_df, test_df = dataset.split_data(
{"BE": df_with_truth},
"BE",
splitting_algorithm="random",
train_size=0.6,
val_size=0.2,
test_size=0.2,
)
```

For cases where all models share the same domain:

```python
# Split the data into training, validation, and test sets
train_df, val_df, test_df = dataset.split_data(
Expand All @@ -43,13 +86,13 @@ train_df, val_df, test_df = dataset.split_data(

## 3. Initialize and Train the BMC Model

Now, we initialize the `BayesianModelCombination` class. We provide the list of models, the data dictionary, and the name of the column containing the ground truth values.
Now, we initialize the `BayesianModelCombination` class. We provide the list of models (excluding the truth column), the data dictionary, and the name of the column containing the ground truth values.

```python
# Initialize the Bayesian Model Combination
# Note: models_list should only include prediction models, not the truth data
bmc = BayesianModelCombination(
models_list=["FRDM12", "HFB24", "D1M",
"UNEDF1", "BCPM", "AME2020"],
models_list=["FRDM12", "HFB24", "D1M", "UNEDF1", "BCPM"],
data_dict=data_dict,
truth_column_name="AME2020",
)
@@ -73,8 +116,16 @@ bmc.train(training_options={"iterations": 50000, "sampler": "gibbs_sampling"})

After training, we can use the `predict` method to generate predictions with uncertainty quantification. The method returns the full posterior draws, as well as DataFrames for the lower, median, and upper credible intervals.

!!! note "Predictions Across Full Domain"
When truth data has a smaller domain, predictions can still be made for all domain points (including those without truth data). This allows you to:

- Train on available experimental data
- Make predictions beyond the experimental coverage
- Quantify uncertainty for all predictions

```python
# Make predictions with uncertainty quantification
# Predictions are made for ALL domain points, including those without truth data
rndm_m, lower_df, median_df, upper_df = bmc.predict("BE")

# Display the first 5 rows of the median predictions
@@ -85,10 +136,78 @@ print(median_df.head())

Finally, we can evaluate the performance of our model combination using the `evaluate` method. This calculates the coverage of the credible intervals, which tells us how often the true values fall within the predicted intervals.

!!! note "Evaluation on Training Data"
The `evaluate` method only evaluates on data points where truth values are available. Points with NaN truth values are automatically excluded from the evaluation.

```python
# Evaluate the model's coverage
coverage_results = bmc.evaluate()

# Print the coverage for a 95% credible interval
print(f"Coverage for 95% credible interval: {coverage_results[19]:.2f}%")
```
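
Empirical coverage itself is simple to compute: it is the fraction of truth values that land inside their predicted credible interval. A generic sketch with toy numbers (not pybmc's `evaluate` implementation):

```python
import numpy as np

# Toy truth values and a credible interval for each one.
truth = np.array([1.0, 2.0, 3.0, 4.0])
lower = np.array([0.5, 2.2, 2.5, 3.8])
upper = np.array([1.5, 2.4, 3.5, 4.3])

# A point is covered when it falls inside [lower, upper].
covered = (truth >= lower) & (truth <= upper)
coverage = 100.0 * covered.mean()
print(f"{coverage:.1f}%")  # 75.0% — one point (2.0) falls outside its interval
```

For a well-calibrated 95% credible interval, this fraction should come out close to 95% on held-out data; large deviations in either direction signal over- or under-confident uncertainty estimates.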

## Complete Example: Truth Data with Smaller Domain

Here's a complete example demonstrating the workflow when truth/experimental data is only available for a subset of domain points:

```python
import pandas as pd
from pybmc.data import Dataset
from pybmc.bmc import BayesianModelCombination

# Initialize dataset
dataset = Dataset(data_path="pybmc/selected_data.h5")

# Load data with truth_column_name parameter
# This allows AME2020 (truth) to have fewer domain points than the models
data_dict = dataset.load_data(
models=["FRDM12", "HFB24", "D1M", "UNEDF1", "BCPM", "AME2020"],
keys=["BE"],
domain_keys=["N", "Z"],
truth_column_name="AME2020" # Identifies the truth data
)

# Check the data structure
df = data_dict["BE"]
print(f"Total domain points: {len(df)}")
print(f"Points with truth data: {df['AME2020'].notna().sum()}")
print(f"Points without truth data: {df['AME2020'].isna().sum()}")

# Filter to only rows with truth data for training
df_with_truth = df[df["AME2020"].notna()].copy()

# Split the data (only using points with truth)
train_df, val_df, test_df = dataset.split_data(
{"BE": df_with_truth},
"BE",
splitting_algorithm="random",
train_size=0.6,
val_size=0.2,
test_size=0.2,
)

# Initialize BMC (models_list excludes the truth column)
bmc = BayesianModelCombination(
models_list=["FRDM12", "HFB24", "D1M", "UNEDF1", "BCPM"],
data_dict=data_dict,
truth_column_name="AME2020",
)

# Orthogonalize and train on the subset with truth data
bmc.orthogonalize("BE", train_df, components_kept=3)
bmc.train(training_options={"iterations": 50000})

# Make predictions for ALL domain points
# This includes points where AME2020 (truth) is NaN
rndm_m, lower_df, median_df, upper_df = bmc.predict("BE")

print(f"Predictions made for {len(median_df)} domain points")
print("This includes both points with and without experimental truth data!")

# Evaluate coverage (only on points with truth data)
coverage_results = bmc.evaluate()
print(f"Coverage for 95% credible interval: {coverage_results[19]:.2f}%")
```


34 changes: 29 additions & 5 deletions pybmc/data.py
@@ -32,7 +32,7 @@ def __init__(self, data_source=None, verbose=True):
self.logger.addHandler(handler)
self.logger.setLevel(logging.INFO if verbose else logging.WARNING)

def load_data(self, models, keys=None, domain_keys=None, model_column='model'):
def load_data(self, models, keys=None, domain_keys=None, model_column='model', truth_column_name=None):
"""
Load data for each property and return a dictionary of synchronized DataFrames.
Each DataFrame has columns: domain_keys + one column per model for that property.
@@ -43,11 +43,15 @@ def load_data(self, models, keys=None, domain_keys=None, model_column='model'):
domain_keys (list, optional): List of columns used to define the common domain (default ['N', 'Z']).
model_column (str, optional): Name of the column in the CSV that identifies which model each row belongs to.
Only used for CSV files; ignored for HDF5 files.
truth_column_name (str, optional): Name of the truth model. If provided, the truth data will be
left-joined to the common domain of the other models, allowing
the truth data to have a smaller domain than the models.

Returns:
dict: Dictionary where each key is a property name and each value is a DataFrame with columns:
domain_keys + one column per model for that property.
The DataFrames are synchronized to the intersection of the domains for all models.
If truth_column_name is provided, truth data is left-joined (may have NaN values).

Supports both .h5 and .csv files.
"""
@@ -64,8 +68,12 @@ def load_data(self, models, keys=None, domain_keys=None, model_column='model'):

for prop in keys:
dfs = []
truth_df = None
skipped_models = []

# Separate regular models from truth model
regular_models = [m for m in models if m != truth_column_name]

if self.data_source.endswith('.h5'):
for model in models:
df = pd.read_hdf(self.data_source, key=model)
@@ -77,7 +85,13 @@ def load_data(self, models, keys=None, domain_keys=None, model_column='model'):
continue
temp = df[domain_keys + [prop]].copy()
temp.rename(columns={prop: model}, inplace=True) # type: ignore
dfs.append(temp)

# Store truth data separately if truth_column_name is provided
if truth_column_name and model == truth_column_name:
truth_df = temp
else:
dfs.append(temp)

elif self.data_source.endswith('.csv'):
df = pd.read_csv(self.data_source)
for model in models:
@@ -91,7 +105,12 @@ def load_data(self, models, keys=None, domain_keys=None, model_column='model'):
continue
temp = model_df[domain_keys + [prop]].copy()
temp.rename(columns={prop: model}, inplace=True)
dfs.append(temp)

# Store truth data separately if truth_column_name is provided
if truth_column_name and model == truth_column_name:
truth_df = temp
else:
dfs.append(temp)
else:
raise ValueError("Unsupported file format. Only .h5 and .csv are supported.")

@@ -100,12 +119,17 @@ def load_data(self, models, keys=None, domain_keys=None, model_column='model'):
result[prop] = pd.DataFrame(columns=domain_keys + [m for m in models if m not in skipped_models])
continue

# Intersect domain for this property
# Intersect domain for regular models only
common_df = dfs[0]
for other_df in dfs[1:]:
common_df = pd.merge(common_df, other_df, on=domain_keys, how="inner")
# Drop rows with NaN in any required column
# Drop rows with NaN in any required column (for regular models)
common_df = common_df.dropna()

# Left join truth data if it exists and was specified
if truth_df is not None:
common_df = pd.merge(common_df, truth_df, on=domain_keys, how="left")

result[prop] = common_df
self.data = result
return result