Skip to content
Open
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
b860f26
Added monthly precipitation data for tests. The files were generated …
Jan 13, 2025
89e8379
Added functionality for performing a PCA. Very preliminary and not ve…
Jan 13, 2025
f8666dc
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 13, 2025
1a887f0
Preliminary version of a script for generating mesmer-m-tp temperatur…
Jan 13, 2025
6aa64b8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 14, 2025
dd107ef
Integrating MESMER-M-TP part-1 (temperature integration): Added gener…
Apr 14, 2025
280c2d5
Added style changes
Apr 14, 2025
a201b92
first working draft
Jan 9, 2026
d12984d
Merge branch 'main' into mesmermtp_precipitation
Jan 9, 2026
41516be
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 9, 2026
2ce820b
remove duplicated notebooks
mathause Jan 13, 2026
64d74e7
move pr data to new location
mathause Jan 13, 2026
ae35f10
remove output
mathause Jan 13, 2026
be0f0ad
update notebooks to current mesmer/ xarray
mathause Jan 14, 2026
2c20be8
Merge branch 'main' into mesmermtp_precipitation
mathause Jan 14, 2026
f164511
apply _ignore_warnings
mathause Jan 14, 2026
8468527
small updates
mathause Jan 15, 2026
4ef816e
Merge branch 'main' into mesmermtp_precipitation
mathause Jan 19, 2026
2c4a4e6
speed up data loading and stacking
mathause Jan 22, 2026
fa77591
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 22, 2026
af0d1c4
remove commented code block
mathause Feb 9, 2026
2a6b192
Merge branch 'main' into mesmermtp_precipitation
mathause Feb 9, 2026
376a66e
fix forgotten parens
mathause Feb 11, 2026
227b886
small restructuring
mathause Feb 11, 2026
b6b6f4c
small refactoring
mathause Feb 17, 2026
820b019
n_cov -> n_params
mathause Feb 19, 2026
6bf8812
example_mesmer_mtp_precipitation.ipynb: some refactorings and cleanups
mathause Feb 19, 2026
86543c2
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 19, 2026
2d0f360
Changes to the Xarray Transformer and the GLM implementation. The GLM…
Feb 26, 2026
01b9244
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 26, 2026
fee6952
Merge branch 'main' into mesmermtp_precipitation
mathause Mar 4, 2026
aaddfe9
use best_alpha instead of best_model internally
mathause Mar 5, 2026
8c71c1e
clear output from notebook
mathause Mar 5, 2026
e204f7a
small updates to the notebook
mathause Mar 5, 2026
98e6c6e
X_var had to be transposed
mathause Mar 5, 2026
11f57da
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 5, 2026
84e2a12
update comment
mathause Mar 9, 2026
9d29750
Merge branch 'main' into mesmermtp_precipitation
mathause Mar 10, 2026
8154e3c
Merge branch 'main' into mesmermtp_precipitation
mathause Mar 12, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file not shown.
Binary file not shown.
16 changes: 16 additions & 0 deletions mesmer/stats/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,16 @@
find_localized_empirical_covariance_monthly,
)
from mesmer.stats._power_transformer import YeoJohnsonTransformer
from mesmer.stats._principal_component_decomposition import (
fit_principal_components,
inverse_transform_principal_components,
transform_principal_components,
)
from mesmer.stats._regularized_glm import GammaGLMXarray
from mesmer.stats._smoothing import lowess
from mesmer.stats._xarray_kde import GroupedKDEXarray
from mesmer.stats._xarray_pipelines import XarrayPipeline
from mesmer.stats._xarray_transformers import SklearnXarrayTransformer

__all__ = [
# auto regression
Expand Down Expand Up @@ -45,4 +54,11 @@
"predict_harmonic_model",
# power transformer
"YeoJohnsonTransformer",
"GammaGLMXarray",
"GroupedKDEXarray",
"XarrayPipeline",
"SklearnXarrayTransformer",
"fit_principal_components",
"inverse_transform_principal_components",
"transform_principal_components",
]
176 changes: 176 additions & 0 deletions mesmer/stats/_principal_component_decomposition.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
import numpy as np
import xarray as xr

# TODO: validate the inputs using mesmer.core.utils._check_dataarray_form and
# _check_dataset_form
from sklearn.preprocessing import StandardScaler


def fit_principal_components(X: xr.DataArray, n_components=None):
    """
    Fit a principal component decomposition.

    Parameters
    ----------
    X : xr.DataArray
        DataArray to decompose. Must be 2D; the PCA is performed over the
        second dimension (features), the first dimension indexes the samples.
    n_components : int, optional
        Number of principal components to retain. Per default all components
        are kept (i.e. the size of the second dimension of ``X``).

    Returns
    -------
    params : xr.Dataset
        Dataset of fitted parameters (PCA components and standardization
        parameters), suitable for ``transform_principal_components`` and
        ``inverse_transform_principal_components``.
    """

    if n_components is None:
        # default: keep as many components as there are features
        n_components = X.shape[1]

    params = _fit_principal_component_decomposition_xr(X=X, n_components=n_components)

    return params


def _fit_principal_component_decomposition_xr(
    X: xr.DataArray,
    n_components: int,
) -> xr.Dataset:
    """
    Fit a principal component decomposition on standardized data.

    The data is first standardized (zero mean, unit variance per feature) with
    a ``StandardScaler``; the PCA is then fitted on the standardized values.

    Parameters
    ----------
    X : xr.DataArray
        DataArray to decompose. Must be 2D: (samples, features).
    n_components : int
        Number of principal components to retain.

    Returns
    -------
    params : :obj:`xr.Dataset`
        Dataset of fitted parameters: the PCA components (``coeffs``),
        ``mean`` and ``explained_variance``, plus the parameters of the fitted
        ``StandardScaler`` (all prefixed ``std_``), which are needed to
        (inverse-)transform data later.
    """

    from sklearn.decomposition import PCA

    X_np = X.values

    # standardize each feature before the decomposition
    std = StandardScaler().fit(X_np)
    X_np_std = std.transform(X_np)

    pca = PCA(n_components=n_components).fit(X_np_std)

    # name of the feature dimension (PCA is performed over the second dim)
    feature_dim = X.dims[1]

    params = xr.Dataset(
        {
            "coeffs": (("component", feature_dim), pca.components_),
            "mean": (feature_dim, pca.mean_),
            "explained_variance": ("component", pca.explained_variance_),
            # parameters of the fitted StandardScaler (``std_`` prefix)
            "std_scale": (feature_dim, std.scale_),
            "std_mean": (feature_dim, std.mean_),
            "std_var": (feature_dim, std.var_),
        },
        coords={feature_dim: X[feature_dim], "component": np.arange(n_components)},
    )

    return params


def transform_principal_components(
    X: xr.DataArray,
    params: xr.Dataset,
) -> xr.DataArray:
    """
    Project input data onto the fitted eigenspace.

    Parameters
    ----------
    X : xr.DataArray
        DataArray to project onto the eigenspace. Must be 2D:
        (samples, features), with the same feature dimension the
        decomposition was fitted on.
    params : xr.Dataset
        Fitted parameters as returned by ``fit_principal_components``.

    Returns
    -------
    T : xr.DataArray
        Principal component scores with dims (samples, "component").
    """

    T = _transform_principal_component_decomposition_xr(X=X, params=params)

    return T


def _transform_principal_component_decomposition_xr(
    X: xr.DataArray, params: xr.Dataset
) -> xr.DataArray:
    """
    Standardize ``X`` and project it onto the previously fitted components.

    Re-creates the fitted ``StandardScaler`` and ``PCA`` objects from the
    parameters stored in ``params`` and applies them to ``X``.
    """

    from sklearn.decomposition import PCA

    # re-create the fitted StandardScaler from the stored parameters
    std = StandardScaler()
    std.n_features_in_ = len(params["std_scale"])
    std.scale_ = params["std_scale"].values
    std.mean_ = params["std_mean"].values
    std.var_ = params["std_var"].values

    # re-create the fitted PCA from the stored parameters
    pca = PCA()
    pca.n_components_ = params["coeffs"].shape[0]
    pca.components_ = params["coeffs"].values
    pca.mean_ = params["mean"].values
    pca.explained_variance_ = params["explained_variance"].values

    X_trans_np = pca.transform(std.transform(X.values))

    sample_dim = X.dims[0]
    X_trans = xr.DataArray(
        X_trans_np,
        dims=[sample_dim, "component"],
        coords={sample_dim: X[sample_dim], "component": np.arange(pca.n_components_)},
    )
    return X_trans


def inverse_transform_principal_components(
    T: xr.DataArray,
    params: xr.Dataset,
) -> xr.DataArray:
    """
    Map principal component scores back to the original data space.

    Parameters
    ----------
    T : xr.DataArray
        Principal component scores with dims (samples, "component"), as
        returned by ``transform_principal_components``.
    params : xr.Dataset
        Fitted parameters as returned by ``fit_principal_components``.

    Returns
    -------
    X : xr.DataArray
        Reconstructed data with dims (samples, features).
    """

    X = _inverse_transform_principal_component_decomposition_xr(T=T, params=params)

    return X


def _inverse_transform_principal_component_decomposition_xr(
    T: xr.DataArray, params: xr.Dataset
) -> xr.DataArray:
    """
    Undo the PCA projection and the standardization.

    Re-creates the fitted ``PCA`` and ``StandardScaler`` objects from the
    parameters stored in ``params`` and applies their inverse transforms.
    """

    from sklearn.decomposition import PCA

    # re-create the fitted PCA from the stored parameters
    pca = PCA()
    pca.n_components_ = params["coeffs"].shape[0]
    pca.components_ = params["coeffs"].values
    pca.mean_ = params["mean"].values
    pca.explained_variance_ = params["explained_variance"].values

    # back to the standardized data space
    X_np_std = pca.inverse_transform(T.values)

    # re-create the fitted StandardScaler and undo the standardization
    std = StandardScaler()
    std.n_features_in_ = len(params["std_scale"])
    std.scale_ = params["std_scale"].values
    std.mean_ = params["std_mean"].values
    std.var_ = params["std_var"].values

    X_np = std.inverse_transform(X_np_std)

    sample_dim = T.dims[0]
    # take the feature dimension from params: it was hard-coded to "gridcell",
    # which broke the DataArray construction (dims/coords mismatch) whenever
    # the feature dimension had a different name
    feature_dim = params["coeffs"].dims[1]

    X = xr.DataArray(
        X_np,
        dims=[sample_dim, feature_dim],
        coords={
            sample_dim: T[sample_dim],
            feature_dim: params["coeffs"][feature_dim],
        },
    )
    return X
182 changes: 182 additions & 0 deletions mesmer/stats/_regularized_glm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
# MESMER-M, land-climate dynamics group, S.I. Seneviratne
# Copyright (c) 2021 ETH Zurich, MESMER contributors listed in AUTHORS.
# Licensed under the GNU General Public License v3.0 or later see LICENSE or
# https://www.gnu.org/licenses/

"""
Functions to train monthly trend module of MESMER-M
"""


import numpy as np
import statsmodels.api as sm
import xarray as xr
from joblib import Parallel, delayed

from mesmer._core.utils import _ignore_warnings


class GammaGLMXarray:
    """Regularized Gamma GLM (log link), fitted per (gridcell, month).

    For every grid cell and calendar month an elastic-net regularized Gamma
    regression is fitted, regressing precipitation (``pr``) on temperature and
    squared temperature at the closest grid cells.
    """

    def __init__(self, alphas, l1_wt=0.0001, n_jobs=-1):
        """
        Parameters
        ----------
        alphas : iterable of (scalar or array_like)
            Candidate penalty weights, tried in order (see ``_fit_single``).
            A scalar applies the same penalty weight to all variables in the
            model; a vector must have the same length as the coefficient
            vector and gives one penalty weight per coefficient.
        l1_wt : float, default 0.0001
            Must be in [0, 1]. The L1 penalty has weight ``l1_wt`` and the
            L2 penalty has weight ``1 - l1_wt``.
        n_jobs : int, default -1
            Number of joblib workers used in :meth:`fit` (-1 uses all cores).
        """

        self.alphas = alphas
        self.l1_wt = l1_wt
        self.n_jobs = n_jobs
        # set by ``fit``: DataArray of coefficients, dims (gridcell, month, covariate)
        self.params_ = None

    @_ignore_warnings(
        [
            "Elastic net fitting did not converge",
            "divide by zero encountered",
            "invalid value encountered",
        ]
    )
    def _fit_single(self, X, y):
        """Fit one regularized Gamma GLM and return its coefficient vector.

        Each alpha in ``self.alphas`` is tried in order; the first fit whose
        residuals are all <= 40 % of ``y.max()`` is returned. If no alpha
        passes this criterion the last successful fit is used; if every fit
        raised, a vector of NaNs is returned.
        """
        y_max = y.max()

        family = sm.families.Gamma
        link = sm.families.links.Log

        glm = sm.GLM(y, X, family=family(link=link))

        last_res = None

        for alpha in self.alphas:
            try:
                res = glm.fit_regularized(
                    alpha=alpha,
                    L1_wt=self.l1_wt,
                    refit=False,
                )
                last_res = res
            except Exception:
                # fitting can fail for unsuitable alphas - try the next one
                continue

            resid = res.fittedvalues - y
            if np.all(resid <= 0.4 * y_max):
                return res.params

        # safe fallback
        if last_res is None:
            return np.full(X.shape[1], np.nan)

        return last_res.params

    def fit(self, tas, tas_sq, pr, closest_locations):
        """
        Estimate regression coefficients for every (gridcell, month).

        Parameters
        ----------
        tas, tas_sq, pr:
            DataArrays with dims (gridcell, year, month)

        closest_locations:
            DataArray with dims (gridcell, closest_gridcells) holding, for
            each grid cell, the labels of its closest grid cells (the
            locations whose temperatures act as predictors).

        Returns
        -------
        self
            The fitted estimator; coefficients are stored in ``params_``.
        """

        gridcells = tas.gridcell.values
        months = tas.month.values

        n_years = tas.sizes["year"]
        n_closest = closest_locations.sizes["closest_gridcells"]
        # intercept + tas and tas_sq at each of the closest grid cells
        n_params = 1 + 2 * n_closest

        def _compute(i_grid, mon):
            # design matrix: intercept | tas at neighbours | tas^2 at neighbours
            nbrs = closest_locations.sel(gridcell=i_grid).values

            X = np.c_[
                np.ones(n_years),
                tas.sel(gridcell=nbrs, month=mon).T.values,
                tas_sq.sel(gridcell=nbrs, month=mon).T.values,
            ]

            y = pr.sel(gridcell=i_grid, month=mon).values
            return self._fit_single(X, y)

        results = Parallel(n_jobs=self.n_jobs, backend="loky")(
            delayed(_compute)(i_grid, mon) for mon in months for i_grid in gridcells
        )

        # results are ordered month-major -> reshape, then move gridcell to front
        params = (
            np.stack(results)
            .reshape(len(months), len(gridcells), n_params)
            .transpose(1, 0, 2)
        )

        covariates = (
            ["intercept"]
            + [f"tas_{i}" for i in range(n_closest)]
            + [f"tas_sq_{i}" for i in range(n_closest)]
        )

        self.params_ = xr.DataArray(
            params,
            dims=("gridcell", "month", "covariate"),
            coords={
                "gridcell": tas.gridcell,
                "month": tas.month,
                "covariate": covariates,
            },
            name="params",
        )

        return self

    def predict(self, tas, tas_sq, closest_locations):
        """
        Compute μ = exp(Xβ) for every (gridcell, year, month).

        Parameters
        ----------
        tas, tas_sq:
            DataArrays with dims (gridcell, year, month).
        closest_locations:
            DataArray with dims (gridcell, closest_gridcells); must match the
            one used in :meth:`fit`.

        Returns
        -------
        xr.DataArray
            Predicted mean ``mu`` with dims (gridcell, year, month).

        Raises
        ------
        RuntimeError
            If the model has not been fitted yet.
        """

        if self.params_ is None:
            raise RuntimeError("Model must be fitted first.")

        gridcells = tas.gridcell.values
        months = tas.month.values
        n_years = tas.sizes["year"]

        mu = np.empty((len(gridcells), n_years, len(months)))

        for ig, i_grid in enumerate(gridcells):
            nbrs = closest_locations.sel(gridcell=i_grid).values

            for im, mon in enumerate(months):
                beta = self.params_.sel(gridcell=i_grid, month=mon).values

                # same design matrix layout as in ``fit``
                X = np.c_[
                    np.ones(n_years),
                    tas.sel(gridcell=nbrs, month=mon).T.values,
                    tas_sq.sel(gridcell=nbrs, month=mon).T.values,
                ]

                mu[ig, :, im] = np.exp(X @ beta)

        return xr.DataArray(
            mu,
            dims=("gridcell", "year", "month"),
            coords={
                "gridcell": tas.gridcell,
                "year": tas.year,
                "month": tas.month,
                "lat": tas.lat,
                "lon": tas.lon,
            },
            name="mu",
        )

    def residuals(self, pr, mu):
        """Return log-ratio residuals, ``log(pr / mu)``, named "residuals"."""
        return (np.log(pr / mu)).rename("residuals")
Loading
Loading