Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "teehr"
version = "0.5.1dev6"
version = "0.5.1dev9"
description = "Tools for Exploratory Evaluation in Hydrologic Research"
authors = [
"RTI International",
Expand Down
2 changes: 1 addition & 1 deletion src/teehr/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Initialize the TEEHR package."""
import warnings

__version__ = "0.5.1dev8"
__version__ = "0.5.1dev9"

with warnings.catch_warnings():
warnings.simplefilter("ignore", UserWarning)
Expand Down
60 changes: 46 additions & 14 deletions src/teehr/metrics/deterministic_funcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import numpy as np
import numpy.typing as npt
import pandas as pd
from scipy.stats import rankdata

from teehr.models.metrics.basemodels import MetricsBasemodel
from teehr.models.metrics.basemodels import TransformEnum
Expand Down Expand Up @@ -201,7 +202,21 @@ def pearson_correlation(model: MetricsBasemodel) -> Callable:
def pearson_correlation_inner(p: pd.Series, s: pd.Series) -> float:
    """Pearson Correlation Coefficient between primary and secondary series.

    Parameters
    ----------
    p : pd.Series
        Primary (observed) values.
    s : pd.Series
        Secondary (simulated) values.

    Returns
    -------
    float
        Correlation coefficient in [-1, 1]. When ``model.add_epsilon``
        is set, EPSILON is added to the denominator so constant series
        (zero standard deviation) yield a finite value instead of NaN.
    """
    p, s = _transform(p, s, model)

    if model.add_epsilon:
        # Covariance between p and s. bias=True (ddof=0) matches the
        # np.nanstd normalization below; the np.cov default ddof=1
        # would inflate the result by N / (N - 1) and allow |r| > 1.
        numerator = np.cov(p, s, bias=True)[0, 1]

        # Product of standard deviations, nudged away from zero so a
        # constant series cannot trigger a divide-by-zero NaN.
        # NOTE(review): np.nanstd ignores NaNs but np.cov does not —
        # confirm upstream that NaNs are already filtered out here.
        denominator = np.nanstd(p) * np.nanstd(s) + EPSILON

        result = numerator / denominator
    else:
        # corrcoef is symmetric, so the (s, p) argument order is fine.
        result = np.corrcoef(s, p)[0][1]

    return result

return pearson_correlation_inner

Expand Down Expand Up @@ -236,8 +251,23 @@ def r_squared(model: MetricsBasemodel) -> Callable:
def r_squared_inner(p: pd.Series, s: pd.Series) -> float:
    """R-squared: the square of the Pearson correlation coefficient.

    Parameters
    ----------
    p : pd.Series
        Primary (observed) values.
    s : pd.Series
        Secondary (simulated) values.

    Returns
    -------
    float
        Coefficient of determination in [0, 1]. When
        ``model.add_epsilon`` is set, EPSILON is added to the
        denominator so constant series (zero standard deviation)
        yield a finite value instead of NaN.
    """
    p, s = _transform(p, s, model)

    if model.add_epsilon:
        # bias=True (ddof=0) keeps the covariance normalization
        # consistent with np.nanstd (ddof=0); the np.cov default
        # ddof=1 would inflate r by N / (N - 1), so r**2 could
        # exceed 1.
        numerator = np.cov(p, s, bias=True)[0, 1]

        # NOTE(review): np.nanstd ignores NaNs but np.cov does not —
        # confirm upstream that NaNs are already filtered out here.
        denominator = np.nanstd(p) * np.nanstd(s) + EPSILON

        pearson_correlation_coefficient = numerator / denominator
    else:
        pearson_correlation_coefficient = np.corrcoef(s, p)[0][1]

    return np.power(pearson_correlation_coefficient, 2)

return r_squared_inner

Expand Down Expand Up @@ -311,19 +341,21 @@ def spearman_correlation_inner(p: pd.Series, s: pd.Series) -> float:
"""Spearman Rank Correlation Coefficient."""
p, s = _transform(p, s, model)

primary_rank = p.rank()
secondary_rank = s.rank()
count = len(p)
# calculate ranks (average method for ties)
primary_ranks = rankdata(p, method='average')
secondary_ranks = rankdata(s, method='average')

# calculate covariance between p_rank and s_rank
covariance = np.cov(primary_ranks, secondary_ranks)[0, 1]

# calculate standard deviations of ranks
std_primary = np.std(primary_ranks)
std_secondary = np.std(secondary_ranks)

if model.add_epsilon:
result = 1 - (
6 * np.sum(np.abs(primary_rank - secondary_rank)**2)
/ (count * (count**2 - 1)) + EPSILON
)
result = covariance / (std_primary * std_secondary + EPSILON)
else:
result = 1 - (
6 * np.sum(np.abs(primary_rank - secondary_rank)**2)
/ (count * (count**2 - 1))
)
result = covariance / (std_primary * std_secondary)

return result
Copy link
Collaborator

@samlamont samlamont Oct 27, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was doing some manual testing using the setup_v0_3_study evaluation and was getting slightly different results between this function and the pandas (https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.corr.html) and scipy (https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.spearmanr.html#scipy.stats.spearmanr) methods (which both produced identical results to each other).

Not really sure what the difference in the functions is but it might be good to understand why they're different. We could also just make use of the pandas or scipy method here? Looks like scipy has a nan_policy argument that could be helpful

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After thorough testing and discussion, it would appear that the current implementation is the most accurate with regards to handling ties in the ranked data while also allowing error handling via Epsilon for the edge-case where the primary/secondary timeseries are constant arrays resulting in a divide by zero (this results in NaN result when using scipy.stats.spearmanr() or pandas.corr(method='spearman')). The differing results between the proposed implementation and the scipy/pandas built-ins seems to result from their use of the Spearman approximation ( res = 1 - (6 * np.sum(d**2)) / (n * (n**2 - 1)) ) -- with the results differing more when more ties are present.

Requesting a rereview for merge.


Expand Down
44 changes: 40 additions & 4 deletions tests/query/test_get_metrics_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -574,7 +574,7 @@ def test_ensemble_metrics(tmpdir):
def test_metrics_transforms(tmpdir):
"""Test applying metric transforms (non-bootstrap)."""
# Define the evaluation object.
eval = setup_v0_3_study(tmpdir)
test_eval = setup_v0_3_study(tmpdir)

# define metric requiring p,s
kge = DeterministicMetrics.KlingGuptaEfficiency()
Expand All @@ -592,21 +592,21 @@ def test_metrics_transforms(tmpdir):
mvtd_t.transform = 'log'

# get metrics_df
metrics_df_tansformed_e = eval.metrics.query(
metrics_df_tansformed_e = test_eval.metrics.query(
group_by=["primary_location_id", "configuration_name"],
include_metrics=[
kge_t_e,
mvtd_t
]
).to_pandas()
metrics_df_transformed = eval.metrics.query(
metrics_df_transformed = test_eval.metrics.query(
group_by=["primary_location_id", "configuration_name"],
include_metrics=[
kge_t,
mvtd_t
]
).to_pandas()
metrics_df = eval.metrics.query(
metrics_df = test_eval.metrics.query(
group_by=["primary_location_id", "configuration_name"],
include_metrics=[
kge,
Expand All @@ -628,6 +628,42 @@ def test_metrics_transforms(tmpdir):
assert result_kge != result_kge_t
assert result_mvtd == result_mvtd_t

# test epsilon on R2 and Pearson
r2 = DeterministicMetrics.Rsquared()
r2_e = DeterministicMetrics.Rsquared()
r2_e.add_epsilon = True
pearson = DeterministicMetrics.PearsonCorrelation()
pearson_e = DeterministicMetrics.PearsonCorrelation()
pearson_e.add_epsilon = True

# ensure we can obtain a divide by zero error
sdf = test_eval.joined_timeseries.to_sdf()
from pyspark.sql.functions import lit
sdf = sdf.withColumn("primary_value", lit(100.0))
test_eval.joined_timeseries._write_spark_df(sdf, write_mode="overwrite")

# get metrics df control and assert divide by zero occurs
metrics_df_e_control = test_eval.metrics.query(
group_by=["primary_location_id", "configuration_name"],
include_metrics=[
r2,
pearson
]
).to_pandas()
assert np.isnan(metrics_df_e_control.r_squared.values).all()
assert np.isnan(metrics_df_e_control.pearson_correlation.values).all()

# get metrics df test and ensure no divide by zero occurs
metrics_df_e_test = test_eval.metrics.query(
group_by=["primary_location_id", "configuration_name"],
include_metrics=[
r2_e,
pearson_e
]
).to_pandas()
assert np.isfinite(metrics_df_e_test.r_squared.values).all()
assert np.isfinite(metrics_df_e_test.pearson_correlation.values).all()


def test_bootstrapping_transforms(tmpdir):
"""Test applying metric transforms (bootstrap)."""
Expand Down