@@ -22,62 +22,79 @@ import MannKS as mk
2222from MannKS.segmented_trend_test import find_best_segmentation, calculate_breakpoint_probability
2323from MannKS import plot_segmented_trend
2424
25+ # -----------------------------------------------------------------------------
2526# 1. Generate Synthetic Data with a Structural Break
27+ # -----------------------------------------------------------------------------
2628# Scenario: A river's pollutant levels were stable/increasing until a
2729# Policy Reform was introduced in 2010, after which they started decreasing.
30+ # We create a datetime range spanning 20 years.
2831np.random.seed(42 )
2932dates = pd.date_range(start = ' 2000-01-01' , end = ' 2020-01-01' , freq = ' ME' )
3033
31- # Use numeric time (seconds) for precise linear trend generation
34+ # Convert dates to numeric seconds for precise linear trend generation.
35+ # This ensures the underlying true signal is perfectly linear before adding noise.
3236t_sec = dates.astype(np.int64) // 10 ** 9
3337t_sec = t_sec - t_sec[0 ] # Start at 0
3438
35- # True Breakpoint: June 2010
39+ # True Breakpoint: June 2010 (Policy Reform)
3640break_date = pd.Timestamp(' 2010-06-01' )
3741break_sec = (break_date - dates[0 ]).total_seconds()
3842
3943# Define Slopes (units per second)
40- # Approx 0.1 units/month increasing, then -0.3 units/month decreasing
41- seconds_per_month = 30.44 * 24 * 3600
42- # Target slopes per year for readability:
43- # Slope 1: +1.2 units/year
44- # Slope 2: -3.6 units/year
44+ # - Period 1 (Pre-2010): Increasing trend (+1.2 units/year)
45+ # - Period 2 (Post-2010): Decreasing trend (-3.6 units/year)
4546slope1_per_year = 1.2
4647slope2_per_year = - 3.6
4748slope1 = slope1_per_year / (365.25 * 24 * 3600 )
4849slope2 = slope2_per_year / (365.25 * 24 * 3600 )
4950
50- # Generate values
51+ # Generate true values (piecewise linear function)
5152values = np.zeros(len (dates))
5253mask_before = t_sec < break_sec
5354mask_after = t_sec >= break_sec
5455
5556values[mask_before] = slope1 * t_sec[mask_before]
56- # Continuous hinge
57+ # Ensure continuous hinge at the breakpoint
5758val_at_break = slope1 * break_sec
5859values[mask_after] = val_at_break + slope2 * (t_sec[mask_after] - break_sec)
5960
60- # Add noise
61+ # Add Gaussian noise to simulate measurement error
6162values += np.random.normal(0 , 0.5 , len (dates))
6263
63- # Add some censored data (values < 1.0)
64+ # -----------------------------------------------------------------------------
65+ # 2. Simulate Censored Data
66+ # -----------------------------------------------------------------------------
67+ # In environmental monitoring, low concentrations often fall below a
68+ # detection limit (e.g., < 1.0). We simulate this by marking values < 1.0
69+ # as censored strings ("<1.0").
6470censored_mask = values < 1.0
6571values_str = values.astype(str )
6672values_str[censored_mask] = ' <1.0'
6773
68- # Pre-process censored data
74+ # Pre-process censored data into a format suitable for MannKS
75+ # This converts "<1.0" into numeric 1.0 and sets censored=True
6976df_censored = mk.prepare_censored_data(values_str)
7077df_censored[' date' ] = dates
7178
72- # --- SCENARIO A: Censored Data Analysis ---
79+ # -----------------------------------------------------------------------------
80+ # 3. SCENARIO A: Analyzing Censored Data
81+ # -----------------------------------------------------------------------------
7382print (" --- SCENARIO A: Censored Data Analysis ---" )
83+ # We use 'find_best_segmentation' to automatically determine the optimal
84+ # number of breakpoints (0, 1, or 2) using the BIC criterion.
85+ #
86+ # Key Parameters:
87+ # - max_breakpoints=2: Search for up to 2 changes in trend.
88+ # - use_bagging=True: Use Bootstrap Aggregating to find robust breakpoint locations.
89+ # This is crucial for censored/noisy data to avoid local minima.
90+ # - slope_scaling='year': Report slopes in units per year (easier to interpret).
7491print (" Running Model Selection (0-2 breakpoints) on Censored Data..." )
7592result_censored, summary_censored = find_best_segmentation(
7693 x = df_censored,
7794 t = df_censored[' date' ],
7895 max_breakpoints = 2 ,
7996 use_bagging = True ,
80- n_bootstrap = 20 ,
97+ n_bootstrap = 20 , # Use >=100 for production
8198 alpha = 0.05 ,
8299 slope_scaling = ' year'
83100)
@@ -86,7 +103,8 @@ print("\nModel Selection Summary (Censored):")
86103print (summary_censored.to_markdown(index = False ))
87104print (f " \n Best Model (Censored): { result_censored.n_breakpoints} Breakpoints " )
88105
89- # Visualize Censored
106+ # Visualize the result
107+ # The plot will show the segments, confidence intervals, and breakpoints.
90108plot_path_censored = os.path.join(os.path.dirname(__file__ ), ' segmented_plot_censored.png' )
91109plot_segmented_trend(
92110 result_censored,
@@ -96,10 +114,12 @@ plot_segmented_trend(
96114)
97115print (f " Plot saved to { plot_path_censored} " )
98116
99- # --- SCENARIO B: Uncensored Data Analysis ---
117+ # -----------------------------------------------------------------------------
118+ # 4. SCENARIO B: Analyzing Uncensored Data (Hypothetical)
119+ # -----------------------------------------------------------------------------
100120print (" \n --- SCENARIO B: Uncensored Data Analysis (Hypothetical) ---" )
101- # If we had better detection limits, the data would look like the raw ' values'.
102- # We run the analysis on the raw numeric values .
121+ # For comparison, we run the same analysis on the raw numeric values,
122+ # assuming we had a perfect instrument with no detection limit .
103123print (" Running Model Selection (0-2 breakpoints) on Uncensored Data..." )
104124result_uncensored, summary_uncensored = find_best_segmentation(
105125 x = values,
@@ -125,18 +145,26 @@ plot_segmented_trend(
125145)
126146print (f " Plot saved to { plot_path_uncensored} " )
127147
128- # Compare Breakpoints with Standard OLS (No Bagging) for Reference
148+ # -----------------------------------------------------------------------------
149+ # 5. Deep Dive: Bootstrap vs Standard OLS
150+ # -----------------------------------------------------------------------------
151+ # We compare two methods for calculating breakpoint confidence intervals (CIs):
152+ # 1. Bootstrap (Bagging): Non-parametric, handles complex error distributions.
153+ # Often yields wider, asymmetric CIs that better reflect reality.
154+ # 2. Standard OLS: Parametric, assumes normal errors. Often yields symmetric,
155+ # optimistically narrow CIs.
129156print (" \n --- CI Comparison: Bootstrap vs Standard OLS ---" )
130157
131158# Re-run Censored without bagging to get Standard OLS CIs
132159if result_censored.n_breakpoints > 0 :
133- # Bootstrap CI
160+ # Bootstrap CI (from previous run)
134161 bp_cens = result_censored.breakpoints[0 ]
135162 ci_cens = result_censored.breakpoint_cis[0 ]
136163 print (f " Censored (Bootstrap): { bp_cens} (CI: { ci_cens[0 ]} to { ci_cens[1 ]} ) " )
137164
138165 # Standard OLS CI
139- # We fix n_breakpoints to match the best result found above
166+ # We fix n_breakpoints to match the best result found above.
167+ # setting use_bagging=False triggers the standard OLS path.
140168 res_cens_std = mk.segmented_trend_test(
141169 df_censored, df_censored[' date' ],
142170 n_breakpoints = result_censored.n_breakpoints,
@@ -184,6 +212,13 @@ if result_uncensored.n_breakpoints > 0:
184212 )
185213 print (f " Standard OLS Uncensored Plot saved to { plot_path_uncens_ols} " )
186214
215+ # -----------------------------------------------------------------------------
216+ # 6. Breakpoint Probability Analysis
217+ # -----------------------------------------------------------------------------
218+ # Using bagging results, we can ask probabilistic questions:
219+ # "What is the probability that the trend change occurred in 2010?"
220+ # This aggregates the counts from all bootstrap iterations.
221+
187222# Calculate Probability for Uncensored
188223prob_uncens = calculate_breakpoint_probability(
189224 result_uncensored,
@@ -192,7 +227,7 @@ prob_uncens = calculate_breakpoint_probability(
192227)
193228print (f " Uncensored: Probability change occurred in 2010: { prob_uncens:.1% } " )
194229
195- # Calculate Probability for Censored (since we used bagging there too)
230+ # Calculate Probability for Censored
196231prob_cens = calculate_breakpoint_probability(
197232 result_censored,
198233 start_date = ' 2010-01-01' ,
0 commit comments