Skip to content

Commit e6e6c63

Browse files
committed
v0.4.1 - fixing alpha for piecewise-regression (OLS) method - respect breakpoint confidence interval
1 parent 646993a commit e6e6c63

24 files changed

+1065
-62
lines changed

Examples/32_Segmented_Regression/README.md

Lines changed: 57 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -22,62 +22,79 @@ import MannKS as mk
2222
from MannKS.segmented_trend_test import find_best_segmentation, calculate_breakpoint_probability
2323
from MannKS import plot_segmented_trend
2424

25+
# -----------------------------------------------------------------------------
2526
# 1. Generate Synthetic Data with a Structural Break
27+
# -----------------------------------------------------------------------------
2628
# Scenario: A river's pollutant levels were stable/increasing until a
2729
# Policy Reform was introduced in 2010, after which they started decreasing.
30+
# We create a datetime range spanning 20 years.
2831
np.random.seed(42)
2932
dates = pd.date_range(start='2000-01-01', end='2020-01-01', freq='ME')
3033

31-
# Use numeric time (seconds) for precise linear trend generation
34+
# Convert dates to numeric seconds for precise linear trend generation.
35+
# This ensures the underlying true signal is exactly piecewise linear before noise is added.
3236
t_sec = dates.astype(np.int64) // 10**9
3337
t_sec = t_sec - t_sec[0] # Start at 0
3438

35-
# True Breakpoint: June 2010
39+
# True Breakpoint: June 2010 (Policy Reform)
3640
break_date = pd.Timestamp('2010-06-01')
3741
break_sec = (break_date - dates[0]).total_seconds()
3842

3943
# Define Slopes (units per second)
40-
# Approx 0.1 units/month increasing, then -0.3 units/month decreasing
41-
seconds_per_month = 30.44 * 24 * 3600
42-
# Target slopes per year for readability:
43-
# Slope 1: +1.2 units/year
44-
# Slope 2: -3.6 units/year
44+
# - Period 1 (before June 2010): Increasing trend (+1.2 units/year)
45+
# - Period 2 (from June 2010 onward): Decreasing trend (-3.6 units/year)
4546
slope1_per_year = 1.2
4647
slope2_per_year = -3.6
4748
slope1 = slope1_per_year / (365.25 * 24 * 3600)
4849
slope2 = slope2_per_year / (365.25 * 24 * 3600)
4950

50-
# Generate values
51+
# Generate true values (piecewise linear function)
5152
values = np.zeros(len(dates))
5253
mask_before = t_sec < break_sec
5354
mask_after = t_sec >= break_sec
5455

5556
values[mask_before] = slope1 * t_sec[mask_before]
56-
# Continuous hinge
57+
# Ensure continuous hinge at the breakpoint
5758
val_at_break = slope1 * break_sec
5859
values[mask_after] = val_at_break + slope2 * (t_sec[mask_after] - break_sec)
5960

60-
# Add noise
61+
# Add Gaussian noise to simulate measurement error
6162
values += np.random.normal(0, 0.5, len(dates))
6263

63-
# Add some censored data (values < 1.0)
64+
# -----------------------------------------------------------------------------
65+
# 2. Simulate Censored Data
66+
# -----------------------------------------------------------------------------
67+
# In environmental monitoring, low concentrations often fall below a
68+
# detection limit (e.g., < 1.0). We simulate this by marking values < 1.0
69+
# as censored strings ("<1.0").
6470
censored_mask = values < 1.0
6571
values_str = values.astype(str)
6672
values_str[censored_mask] = '<1.0'
6773

68-
# Pre-process censored data
74+
# Pre-process censored data into a format suitable for MannKS
75+
# This converts "<1.0" into numeric 1.0 and sets censored=True
6976
df_censored = mk.prepare_censored_data(values_str)
7077
df_censored['date'] = dates
7178

72-
# --- SCENARIO A: Censored Data Analysis ---
79+
# -----------------------------------------------------------------------------
80+
# 3. SCENARIO A: Analyzing Censored Data
81+
# -----------------------------------------------------------------------------
7382
print("--- SCENARIO A: Censored Data Analysis ---")
83+
# We use 'find_best_segmentation' to automatically determine the optimal
84+
# number of breakpoints (0, 1, or 2) using the Bayesian Information Criterion (BIC).
85+
#
86+
# Key Parameters:
87+
# - max_breakpoints=2: Search for up to 2 changes in trend.
88+
# - use_bagging=True: Use Bootstrap Aggregating to find robust breakpoint locations.
89+
# This is crucial for censored/noisy data to avoid local minima.
90+
# - slope_scaling='year': Report slopes in units per year (easier to interpret).
7491
print("Running Model Selection (0-2 breakpoints) on Censored Data...")
7592
result_censored, summary_censored = find_best_segmentation(
7693
x=df_censored,
7794
t=df_censored['date'],
7895
max_breakpoints=2,
7996
use_bagging=True,
80-
n_bootstrap=20,
97+
n_bootstrap=20, # Use >=100 for production
8198
alpha=0.05,
8299
slope_scaling='year'
83100
)
@@ -86,7 +103,8 @@ print("\nModel Selection Summary (Censored):")
86103
print(summary_censored.to_markdown(index=False))
87104
print(f"\nBest Model (Censored): {result_censored.n_breakpoints} Breakpoints")
88105

89-
# Visualize Censored
106+
# Visualize the result
107+
# The plot will show the segments, confidence intervals, and breakpoints.
90108
plot_path_censored = os.path.join(os.path.dirname(__file__), 'segmented_plot_censored.png')
91109
plot_segmented_trend(
92110
result_censored,
@@ -96,10 +114,12 @@ plot_segmented_trend(
96114
)
97115
print(f"Plot saved to {plot_path_censored}")
98116

99-
# --- SCENARIO B: Uncensored Data Analysis ---
117+
# -----------------------------------------------------------------------------
118+
# 4. SCENARIO B: Analyzing Uncensored Data (Hypothetical)
119+
# -----------------------------------------------------------------------------
100120
print("\n--- SCENARIO B: Uncensored Data Analysis (Hypothetical) ---")
101-
# If we had better detection limits, the data would look like the raw 'values'.
102-
# We run the analysis on the raw numeric values.
121+
# For comparison, we run the same analysis on the raw numeric values,
122+
# assuming we had a perfect instrument with no detection limit.
103123
print("Running Model Selection (0-2 breakpoints) on Uncensored Data...")
104124
result_uncensored, summary_uncensored = find_best_segmentation(
105125
x=values,
@@ -125,18 +145,26 @@ plot_segmented_trend(
125145
)
126146
print(f"Plot saved to {plot_path_uncensored}")
127147

128-
# Compare Breakpoints with Standard OLS (No Bagging) for Reference
148+
# -----------------------------------------------------------------------------
149+
# 5. Deep Dive: Bootstrap vs Standard OLS
150+
# -----------------------------------------------------------------------------
151+
# We compare two methods for calculating breakpoint confidence intervals (CIs):
152+
# 1. Bootstrap (Bagging): Non-parametric, handles complex error distributions.
153+
# Often yields wider, asymmetric CIs that better reflect reality.
154+
# 2. Standard OLS: Parametric, assumes normal errors. Often yields symmetric,
155+
# optimistically narrow CIs.
129156
print("\n--- CI Comparison: Bootstrap vs Standard OLS ---")
130157

131158
# Re-run Censored without bagging to get Standard OLS CIs
132159
if result_censored.n_breakpoints > 0:
133-
# Bootstrap CI
160+
# Bootstrap CI (from previous run)
134161
bp_cens = result_censored.breakpoints[0]
135162
ci_cens = result_censored.breakpoint_cis[0]
136163
print(f"Censored (Bootstrap): {bp_cens} (CI: {ci_cens[0]} to {ci_cens[1]})")
137164

138165
# Standard OLS CI
139-
# We fix n_breakpoints to match the best result found above
166+
# We fix n_breakpoints to match the best result found above.
167+
    # Setting use_bagging=False triggers the standard OLS path.
140168
res_cens_std = mk.segmented_trend_test(
141169
df_censored, df_censored['date'],
142170
n_breakpoints=result_censored.n_breakpoints,
@@ -184,6 +212,13 @@ if result_uncensored.n_breakpoints > 0:
184212
)
185213
print(f"Standard OLS Uncensored Plot saved to {plot_path_uncens_ols}")
186214

215+
# -----------------------------------------------------------------------------
216+
# 6. Breakpoint Probability Analysis
217+
# -----------------------------------------------------------------------------
218+
# Using bagging results, we can ask probabilistic questions:
219+
# "What is the probability that the trend change occurred in 2010?"
220+
# This aggregates the counts from all bootstrap iterations.
221+
187222
# Calculate Probability for Uncensored
188223
prob_uncens = calculate_breakpoint_probability(
189224
result_uncensored,
@@ -192,7 +227,7 @@ prob_uncens = calculate_breakpoint_probability(
192227
)
193228
print(f"Uncensored: Probability change occurred in 2010: {prob_uncens:.1%}")
194229

195-
# Calculate Probability for Censored (since we used bagging there too)
230+
# Calculate Probability for Censored
196231
prob_cens = calculate_breakpoint_probability(
197232
result_censored,
198233
start_date='2010-01-01',

Examples/32_Segmented_Regression/run_example.py

Lines changed: 62 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -15,71 +15,91 @@
1515
from MannKS.segmented_trend_test import find_best_segmentation, calculate_breakpoint_probability
1616
from MannKS import plot_segmented_trend
1717
18+
# -----------------------------------------------------------------------------
1819
# 1. Generate Synthetic Data with a Structural Break
20+
# -----------------------------------------------------------------------------
1921
# Scenario: A river's pollutant levels were stable/increasing until a
2022
# Policy Reform was introduced in 2010, after which they started decreasing.
23+
# We create a datetime range spanning 20 years.
2124
np.random.seed(42)
2225
dates = pd.date_range(start='2000-01-01', end='2020-01-01', freq='ME')
2326
24-
# Use numeric time (seconds) for precise linear trend generation
27+
# Convert dates to numeric seconds for precise linear trend generation.
28+
# This ensures the underlying true signal is exactly piecewise linear before noise is added.
2529
t_sec = dates.astype(np.int64) // 10**9
2630
t_sec = t_sec - t_sec[0] # Start at 0
2731
28-
# True Breakpoint: June 2010
32+
# True Breakpoint: June 2010 (Policy Reform)
2933
break_date = pd.Timestamp('2010-06-01')
3034
break_sec = (break_date - dates[0]).total_seconds()
3135
3236
# Define Slopes (units per second)
33-
# Approx 0.1 units/month increasing, then -0.3 units/month decreasing
34-
seconds_per_month = 30.44 * 24 * 3600
35-
# Target slopes per year for readability:
36-
# Slope 1: +1.2 units/year
37-
# Slope 2: -3.6 units/year
37+
# - Period 1 (before June 2010): Increasing trend (+1.2 units/year)
38+
# - Period 2 (from June 2010 onward): Decreasing trend (-3.6 units/year)
3839
slope1_per_year = 1.2
3940
slope2_per_year = -3.6
4041
slope1 = slope1_per_year / (365.25 * 24 * 3600)
4142
slope2 = slope2_per_year / (365.25 * 24 * 3600)
4243
43-
# Generate values
44+
# Generate true values (piecewise linear function)
4445
values = np.zeros(len(dates))
4546
mask_before = t_sec < break_sec
4647
mask_after = t_sec >= break_sec
4748
4849
values[mask_before] = slope1 * t_sec[mask_before]
49-
# Continuous hinge
50+
# Ensure continuous hinge at the breakpoint
5051
val_at_break = slope1 * break_sec
5152
values[mask_after] = val_at_break + slope2 * (t_sec[mask_after] - break_sec)
5253
53-
# Add noise
54+
# Add Gaussian noise to simulate measurement error
5455
values += np.random.normal(0, 0.5, len(dates))
5556
56-
# Add some censored data (values < 1.0)
57+
# -----------------------------------------------------------------------------
58+
# 2. Simulate Censored Data
59+
# -----------------------------------------------------------------------------
60+
# In environmental monitoring, low concentrations often fall below a
61+
# detection limit (e.g., < 1.0). We simulate this by marking values < 1.0
62+
# as censored strings ("<1.0").
5763
censored_mask = values < 1.0
5864
values_str = values.astype(str)
5965
values_str[censored_mask] = '<1.0'
6066
61-
# Pre-process censored data
67+
# Pre-process censored data into a format suitable for MannKS
68+
# This converts "<1.0" into numeric 1.0 and sets censored=True
6269
df_censored = mk.prepare_censored_data(values_str)
6370
df_censored['date'] = dates
6471
65-
# --- SCENARIO A: Censored Data Analysis ---
72+
# -----------------------------------------------------------------------------
73+
# 3. SCENARIO A: Analyzing Censored Data
74+
# -----------------------------------------------------------------------------
6675
print("--- SCENARIO A: Censored Data Analysis ---")
76+
# We use 'find_best_segmentation' to automatically determine the optimal
77+
# number of breakpoints (0, 1, or 2) using the Bayesian Information Criterion (BIC).
78+
#
79+
# Key Parameters:
80+
# - max_breakpoints=2: Search for up to 2 changes in trend.
81+
# - use_bagging=True: Use Bootstrap Aggregating to find robust breakpoint locations.
82+
# This is crucial for censored/noisy data to avoid local minima.
83+
# - slope_scaling='year': Report slopes in units per year (easier to interpret).
84+
# - random_state=42: Set seed for reproducibility of bootstrap results.
6785
print("Running Model Selection (0-2 breakpoints) on Censored Data...")
6886
result_censored, summary_censored = find_best_segmentation(
6987
x=df_censored,
7088
t=df_censored['date'],
7189
max_breakpoints=2,
7290
use_bagging=True,
73-
n_bootstrap=20,
91+
n_bootstrap=20, # Use >=100 for production
7492
alpha=0.05,
75-
slope_scaling='year'
93+
slope_scaling='year',
94+
random_state=42
7695
)
7796
7897
print("\\nModel Selection Summary (Censored):")
7998
print(summary_censored.to_markdown(index=False))
8099
print(f"\\nBest Model (Censored): {result_censored.n_breakpoints} Breakpoints")
81100
82-
# Visualize Censored
101+
# Visualize the result
102+
# The plot will show the segments, confidence intervals, and breakpoints.
83103
plot_path_censored = os.path.join(os.path.dirname(__file__), 'segmented_plot_censored.png')
84104
plot_segmented_trend(
85105
result_censored,
@@ -89,10 +109,12 @@
89109
)
90110
print(f"Plot saved to {plot_path_censored}")
91111
92-
# --- SCENARIO B: Uncensored Data Analysis ---
112+
# -----------------------------------------------------------------------------
113+
# 4. SCENARIO B: Analyzing Uncensored Data (Hypothetical)
114+
# -----------------------------------------------------------------------------
93115
print("\\n--- SCENARIO B: Uncensored Data Analysis (Hypothetical) ---")
94-
# If we had better detection limits, the data would look like the raw 'values'.
95-
# We run the analysis on the raw numeric values.
116+
# For comparison, we run the same analysis on the raw numeric values,
117+
# assuming we had a perfect instrument with no detection limit.
96118
print("Running Model Selection (0-2 breakpoints) on Uncensored Data...")
97119
result_uncensored, summary_uncensored = find_best_segmentation(
98120
x=values,
@@ -101,7 +123,8 @@
101123
use_bagging=True,
102124
n_bootstrap=20,
103125
alpha=0.05,
104-
slope_scaling='year'
126+
slope_scaling='year',
127+
random_state=42
105128
)
106129
107130
print("\\nModel Selection Summary (Uncensored):")
@@ -118,18 +141,26 @@
118141
)
119142
print(f"Plot saved to {plot_path_uncensored}")
120143
121-
# Compare Breakpoints with Standard OLS (No Bagging) for Reference
144+
# -----------------------------------------------------------------------------
145+
# 5. Deep Dive: Bootstrap vs Standard OLS
146+
# -----------------------------------------------------------------------------
147+
# We compare two methods for calculating breakpoint confidence intervals (CIs):
148+
# 1. Bootstrap (Bagging): Non-parametric, handles complex error distributions.
149+
# Often yields wider, asymmetric CIs that better reflect reality.
150+
# 2. Standard OLS: Parametric, assumes normal errors. Often yields symmetric,
151+
# optimistically narrow CIs.
122152
print("\\n--- CI Comparison: Bootstrap vs Standard OLS ---")
123153
124154
# Re-run Censored without bagging to get Standard OLS CIs
125155
if result_censored.n_breakpoints > 0:
126-
# Bootstrap CI
156+
# Bootstrap CI (from previous run)
127157
bp_cens = result_censored.breakpoints[0]
128158
ci_cens = result_censored.breakpoint_cis[0]
129159
print(f"Censored (Bootstrap): {bp_cens} (CI: {ci_cens[0]} to {ci_cens[1]})")
130160
131161
# Standard OLS CI
132-
# We fix n_breakpoints to match the best result found above
162+
# We fix n_breakpoints to match the best result found above.
163+
    # Setting use_bagging=False triggers the standard OLS path.
133164
res_cens_std = mk.segmented_trend_test(
134165
df_censored, df_censored['date'],
135166
n_breakpoints=result_censored.n_breakpoints,
@@ -177,6 +208,13 @@
177208
)
178209
print(f"Standard OLS Uncensored Plot saved to {plot_path_uncens_ols}")
179210
211+
# -----------------------------------------------------------------------------
212+
# 6. Breakpoint Probability Analysis
213+
# -----------------------------------------------------------------------------
214+
# Using bagging results, we can ask probabilistic questions:
215+
# "What is the probability that the trend change occurred in 2010?"
216+
# This aggregates the counts from all bootstrap iterations.
217+
180218
# Calculate Probability for Uncensored
181219
prob_uncens = calculate_breakpoint_probability(
182220
result_uncensored,
@@ -185,7 +223,7 @@
185223
)
186224
print(f"Uncensored: Probability change occurred in 2010: {prob_uncens:.1%}")
187225
188-
# Calculate Probability for Censored (since we used bagging there too)
226+
# Calculate Probability for Censored
189227
prob_cens = calculate_breakpoint_probability(
190228
result_censored,
191229
start_date='2010-01-01',

0 commit comments

Comments
 (0)