Skip to content

Commit af9799f

Browse files
authored
Adding calibration by potential based on CPT (Additional calibration style) (#1959)
* Adding calibration by potential based on CPT Adding a new function to calibrate based on potentials instead of likelihood, and allowing to calibrate based on the cost per target (e.g: cost per install, cost per registration) * Adding more tests to codecov * Adjustments by Juanito * Solving issues
1 parent 64cc9de commit af9799f

File tree

6 files changed

+5935
-585
lines changed

6 files changed

+5935
-585
lines changed

docs/source/notebooks/mmm/mmm_lift_test.ipynb

Lines changed: 1607 additions & 581 deletions
Large diffs are not rendered by default.

docs/source/notebooks/mmm/mmm_upper_funnel_causal_approach.ipynb

Lines changed: 3616 additions & 0 deletions
Large diffs are not rendered by default.

pymc_marketing/mmm/lift_test.py

Lines changed: 118 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,9 @@
1313
# limitations under the License.
1414
"""Adding lift tests as observations of saturation function.
1515
16-
This provides the inner workings of `MMM.add_lift_test_measurements` method. Use that
17-
method directly while working with the `MMM` class.
18-
16+
This provides the inner workings of `MMM.add_lift_test_measurements` method.
17+
`MMM.add_cost_per_target_calibration` is an alternative calibration method.
18+
Use any of these methods directly while working with the `MMM` class.
1919
"""
2020

2121
from collections.abc import Callable, Sequence
@@ -779,3 +779,118 @@ def add_lift_measurements_to_likelihood_from_saturation(
779779
get_indices=get_indices,
780780
variable_indexer_factory=variable_indexer_factory,
781781
)
782+
783+
784+
def add_cost_per_target_potentials(
    calibration_df: pd.DataFrame,
    *,
    model: pm.Model | None = None,
    cpt_variable_name: str = "cost_per_target",
    name_prefix: str = "cpt_calibration",
    get_indices: Callable[[pd.DataFrame, pm.Model], Indices] = exact_row_indices,
) -> None:
    """Add a ``pm.Potential`` penalty to calibrate cost-per-target.

    For each row, the mean of ``cpt_variable_name`` across the ``date``
    dimension is taken for the (dims, channel) slice the row refers to, and a
    soft quadratic penalty is computed:

    ``penalty = - (cpt_mean - target)^2 / (2 * sigma^2)``.

    The per-row penalties are summed and registered as a single
    ``pm.Potential`` named ``name_prefix``.

    Parameters
    ----------
    calibration_df : pd.DataFrame
        Must include columns ``channel``, ``sigma``, and a target column. By
        default the target column is assumed to be ``cost_per_target``; if a column
        matching ``cpt_variable_name`` is present it will be used instead. The
        DataFrame must also include one column per model dimension found in the
        CPT variable (excluding ``date``).
    model : pm.Model, optional
        Model containing the ``cpt_variable_name`` variable with dims that
        include ``"date"``. If None, uses the current model context.
    cpt_variable_name : str
        Name of the cost-per-target variable in the model.
    name_prefix : str
        Name given to the single aggregated potential.
    get_indices : Callable[[pd.DataFrame, pm.Model], Indices]
        Alignment function mapping rows to model coordinate indices.

    Raises
    ------
    KeyError
        If required columns or dimension columns are missing from
        ``calibration_df``, or ``cpt_variable_name`` is not in the model.
    ValueError
        If the CPT variable has no ``date`` dimension, if any target is not
        finite, or if any ``sigma`` is not finite and strictly positive.

    Examples
    --------
    .. code-block:: python

        calibration_df = pd.DataFrame(
            {
                "channel": ["C1", "C2"],
                "geo": ["US", "US"],  # add dims as needed
                "cost_per_target": [30.0, 45.0],
                "sigma": [2.0, 3.0],
            }
        )

        add_cost_per_target_potentials(
            calibration_df=calibration_df,
            model=mmm.model,
            cpt_variable_name="cost_per_target",
            name_prefix="cpt_calibration",
        )
    """
    current_model: pm.Model = pm.modelcontext(model)

    # Prefer a column named after the CPT variable; otherwise use the default name
    target_column = (
        cpt_variable_name
        if cpt_variable_name in calibration_df.columns
        else "cost_per_target"
    )

    required_cols = {"channel", target_column, "sigma"}
    missing = required_cols - set(calibration_df.columns)
    if missing:
        raise KeyError(f"Missing required columns in calibration_df: {sorted(missing)}")

    if cpt_variable_name not in current_model.named_vars:
        raise KeyError(
            f"Variable {cpt_variable_name!r} not found in model; create it before calibration."
        )

    # Determine dims from the CPT variable in the model
    cpt_dims = current_model.named_vars_to_dims[cpt_variable_name]
    if "date" not in cpt_dims:
        # Same exception type that the bare ``.index`` lookup would raise,
        # but with an actionable message.
        raise ValueError(
            f"Variable {cpt_variable_name!r} must have a 'date' dimension; found dims {tuple(cpt_dims)}."
        )
    date_axis = cpt_dims.index("date")
    non_date_dims = [d for d in cpt_dims if d != "date"]

    # Ensure calibration_df contains all needed dimension columns
    missing_dims = [d for d in non_date_dims if d not in calibration_df.columns]
    if missing_dims:
        raise KeyError(
            f"Calibration data missing dimension columns: {missing_dims}. Required dims: {non_date_dims}"
        )

    # Build indices for selection in model coordinates (date excluded: we average over it)
    indices = get_indices(calibration_df[non_date_dims], current_model)

    targets: npt.NDArray[np.float64] = calibration_df[target_column].to_numpy(
        dtype=float
    )
    sigmas: npt.NDArray[np.float64] = calibration_df["sigma"].to_numpy(dtype=float)

    # Reject values that would make the penalty inf/NaN before touching the
    # model graph, so the failure is reported here instead of at sampling time.
    if not np.all(np.isfinite(targets)):
        raise ValueError(
            f"Column {target_column!r} in calibration_df must contain only finite values."
        )
    if not np.all(np.isfinite(sigmas) & (sigmas > 0)):
        raise ValueError(
            "Column 'sigma' in calibration_df must contain only finite, strictly positive values."
        )

    with current_model:
        # Compute mean over the date dimension once
        cpt_full = current_model[cpt_variable_name]
        cpt_mean = pt.mean(cpt_full, axis=date_axis)

        # Build advanced indexing arrays for remaining dims (including channel),
        # preserving the order present in cpt_dims (excluding date)
        indexers = [
            pt.as_tensor_variable(indices[dim])  # type: ignore[index]
            for dim in non_date_dims
        ]

        # Gather the cpt mean for each calibration row as a vector
        gathered_cpt = cpt_mean[tuple(indexers)]

        # Vectorized quadratic penalties and single aggregated Potential.
        # Squaring makes an explicit absolute value unnecessary.
        penalties = -((gathered_cpt - targets) ** 2) / (2 * (sigmas**2))
        pm.Potential(name_prefix, pt.sum(penalties))

pymc_marketing/mmm/multidimensional.py

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,7 @@
187187
from pymc_marketing.mmm.fourier import YearlyFourier
188188
from pymc_marketing.mmm.hsgp import HSGPBase
189189
from pymc_marketing.mmm.lift_test import (
190+
add_cost_per_target_potentials,
190191
add_lift_measurements_to_likelihood_from_saturation,
191192
scale_lift_measurements,
192193
)
@@ -1613,6 +1614,22 @@ def _set_xarray_data(
16131614
else:
16141615
data["target_data"] = target_values
16151616

1617+
# Handle optional spend data used for CPT calibration if available
1618+
if (
1619+
hasattr(self, "_calibration_spend_xarray")
1620+
and "channel_data_spend" in model.named_vars
1621+
):
1622+
spend_values = self._calibration_spend_xarray._channel
1623+
# Align to new coords
1624+
reindex_coords = {"date": coords["date"], "channel": coords["channel"]}
1625+
for dim in self.dims:
1626+
reindex_coords[dim] = coords[dim]
1627+
spend_values = spend_values.reindex(reindex_coords, fill_value=0)
1628+
# Ensure no NaNs are passed into pm.Data updates
1629+
spend_values = spend_values.fillna(0)
1630+
original_dtype = model.named_vars["channel_data_spend"].type.dtype
1631+
data["channel_data_spend"] = spend_values.astype(original_dtype)
1632+
16161633
self.new_updated_data = data
16171634
self.new_updated_coords = coords
16181635
self.new_updated_model = model
@@ -1950,6 +1967,132 @@ def add_lift_test_measurements(
19501967
name=name,
19511968
)
19521969

1970+
def add_cost_per_target_calibration(
    self,
    data: pd.DataFrame,
    calibration_data: pd.DataFrame,
    cpt_variable_name: str = "cost_per_target",
    name_prefix: str = "cpt_calibration",
) -> None:
    """Calibrate cost-per-target using constraints via ``pm.Potential``.

    This adds a deterministic ``cpt_variable_name`` computed as
    ``channel_data_spend / channel_contribution_original_scale`` and adds a
    penalty based on ``calibration_data`` using a quadratic term per row:

    ``penalty = - |cpt_mean - target|^2 / (2 * sigma^2)``.

    All columns of ``calibration_data`` are validated before the model is
    modified, so an invalid frame does not leave partially registered
    variables behind.

    Parameters
    ----------
    data : pd.DataFrame
        Feature-like DataFrame with columns matching training ``X`` but with
        channel values representing spend (original units). Must include the
        same ``date`` and any model ``dims`` columns.
    calibration_data : pd.DataFrame
        DataFrame with rows specifying calibration targets. Must include:
          - ``channel``: channel name in ``self.channel_columns``
          - ``cost_per_target``: desired CPT value (a column named after
            ``cpt_variable_name`` is used instead when present)
          - ``sigma``: accepted deviation; larger => weaker penalty
        and one column per dimension in ``self.dims``.
    cpt_variable_name : str
        Name for the cost-per-target Deterministic in the model.
    name_prefix : str
        Name passed on to ``add_cost_per_target_potentials`` for the
        generated potential.

    Raises
    ------
    RuntimeError
        If the model has not been built yet.
    KeyError
        If ``calibration_data`` lacks ``channel``, a ``self.dims`` column,
        ``sigma``, or the target column.

    Examples
    --------
    Build a model and calibrate CPT for selected (dims, channel):

    .. code-block:: python

        # spend data in original scale with the same structure as X
        spend_df = X.copy()
        # e.g., if X contains impressions, replace with monetary spend
        # spend_df[channels] = ...

        calibration_df = pd.DataFrame(
            {
                "channel": ["C1", "C2"],
                "geo": ["US", "US"],  # dims columns as needed
                "cost_per_target": [30.0, 45.0],
                "sigma": [2.0, 3.0],
            }
        )

        mmm.add_cost_per_target_calibration(
            data=spend_df,
            calibration_data=calibration_df,
            cpt_variable_name="cost_per_target",
            name_prefix="cpt_calibration",
        )
    """
    if not hasattr(self, "model"):
        raise RuntimeError("Model must be built before adding calibration.")

    # Validate required columns in calibration_data
    if "channel" not in calibration_data.columns:
        raise KeyError("'channel' column missing in calibration_data")
    for dim in self.dims:
        if dim not in calibration_data.columns:
            raise KeyError(
                f"The {dim} column is required in calibration_data to map to model dims."
            )

    # Check the value columns here as well: the potentials helper would only
    # reject them after ``channel_data_spend`` and the CPT deterministic have
    # already been registered on the model.
    target_column = (
        cpt_variable_name
        if cpt_variable_name in calibration_data.columns
        else "cost_per_target"
    )
    missing_value_cols = [
        col for col in (target_column, "sigma") if col not in calibration_data.columns
    ]
    if missing_value_cols:
        raise KeyError(
            f"Missing required columns in calibration_data: {missing_value_cols}"
        )

    # Prepare spend data as xarray (original units)
    spend_ds = self._create_xarray_from_pandas(
        data=data,
        date_column=self.date_column,
        dims=self.dims,
        metric_list=self.channel_columns,
        metric_coordinate_name="channel",
    ).transpose("date", *self.dims, "channel")
    # Cache for predictive alignment
    self._calibration_spend_xarray = spend_ds

    spend_dims = ("date", *self.dims, "channel")

    with self.model:
        # Ensure original-scale contribution exists
        if "channel_contribution_original_scale" not in self.model.named_vars:
            self.add_original_scale_contribution_variable(
                [
                    "channel_contribution",
                ]
            )

        # Create pm.Data for spend aligned to current model coords
        spend_values = spend_ds._channel
        # Reindex to model coords to ensure ordering matches
        reindex_coords = {dim: self.model.coords[dim] for dim in spend_dims}
        spend_values = spend_values.reindex(reindex_coords, fill_value=0)
        # Replace any existing NaNs in spend with zeros to satisfy pm.Data
        spend_values = spend_values.fillna(0)

        pm.Data(
            name="channel_data_spend",
            value=spend_values.values,
            dims=spend_dims,
        )

        # Build cost_per_target deterministic safely (avoid division by ~0)
        denom = pt.clip(
            self.model["channel_contribution_original_scale"], 1e-12, np.inf
        )
        pm.Deterministic(
            name=cpt_variable_name,
            var=self.model["channel_data_spend"] / denom,
            dims=spend_dims,
        )

        # Add the calibration penalty built from the rows of calibration_data
        add_cost_per_target_potentials(
            calibration_df=calibration_data,
            model=self.model,
            cpt_variable_name=cpt_variable_name,
            name_prefix=name_prefix,
        )
2095+
19532096
def create_fit_data(
19542097
self,
19552098
X: pd.DataFrame | xr.Dataset | xr.DataArray,

0 commit comments

Comments
 (0)