-
Notifications
You must be signed in to change notification settings - Fork 9
Expand file tree
/
Copy pathcsfrat_integrated.py
More file actions
5280 lines (4418 loc) · 244 KB
/
csfrat_integrated.py
File metadata and controls
5280 lines (4418 loc) · 244 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python3
"""
C-SFRAT Command Line Interface - Integrated Version
This script properly integrates with the core C-SFRAT functionality
"""
# =====================================================================
# 🚨 FOR EASY CONFIGURATION: Use csfrat_config.py instead! 🚨
# =====================================================================
# This script is now best used via the user-friendly configuration file.
#
# ✅ RECOMMENDED: Edit settings in 'csfrat_config.py' and run:
# python csfrat_config.py
#
# 🔧 ADVANCED: Use CLI arguments directly with this script:
# python csfrat_integrated.py --input data.csv --output report.pdf
#
# The values below are default fallbacks used when no CLI args are provided.
# =====================================================================
# ---------------------------------------------------------------------
# Built-in default configuration values (used as fallbacks)
# ---------------------------------------------------------------------

# Input / output locations and the confidence level
input_file = 'C-SFRAT/datasets/ds1.csv'
output_filename = 'outputs/integrated_report.pdf'
confidence_level = 0.95

# Which model / covariate combinations to run and how many to tabulate
run_all_models = True
run_all_covariates = True
show_all_models_in_table = False
max_models_in_comparison_table = 20

# Data selection
selected_sheet = None
data_subset_limit = None
psse_subset_parameter = 0.9

# Covariate scaling
enable_covariate_scaling = True
covariate_scaling_range = (0, 10)

# Ranking and plotting
ranking_method = 'mean'
multi_model_plots = True
num_models_to_compare = 3

# Per-model prediction output
individual_model_predictions = True
show_model_predictions_separately = True
max_models_for_individual_predictions = 3

# Effort-per-interval prediction settings
effort_per_interval_enabled = True
effort_per_interval_settings = {
    'effort_values': {'E': 1.00, 'F': 2.00, 'C': 3.00},
    'number_of_intervals_to_predict': 5,
    'failure_intensity_target': 0.3,
    'use_model_specific_covariates': True,
    'default_effort_for_unknown_covariates': 1.0,
}

# Failure prediction settings
prediction_parameters = {
    'predict_failures': True,
    'num_intervals_to_predict': 5,
    'prediction_time_horizon': None,
    'include_failure_intensity': True,
    'prediction_intervals': False,
}

# Import the optional user configuration module early. When it cannot be
# imported, `config` is None and every lookup below yields its default.
try:
    import csfrat_config as config
except ImportError:
    config = None
    CONFIG_LOADED = False
else:
    CONFIG_LOADED = True

# Optimization parameters: taken from the config module when it defines
# them, otherwise the listed default. getattr() on None returns the
# default, so one literal covers both the "config loaded" and the
# "config missing" case.
optimization_parameters = {
    'enable_optimization': getattr(config, 'ENABLE_OPTIMIZATION', True),
    'allocation_1_enabled': getattr(config, 'ALLOCATION_1_ENABLED', True),
    'allocation_2_enabled': getattr(config, 'ALLOCATION_2_ENABLED', True),
    'total_budget': getattr(config, 'TOTAL_BUDGET', 100),
    'target_additional_defects': getattr(config, 'TARGET_ADDITIONAL_DEFECTS', 3),
    'optimization_method': getattr(config, 'OPTIMIZATION_METHOD', 'both_allocations'),
}

# Metric weights, resolved the same way as the optimization parameters.
metric_weights = {
    'llf': getattr(config, 'WEIGHT_LLF', 0.0),
    'aic': getattr(config, 'WEIGHT_AIC', 2.0),
    'bic': getattr(config, 'WEIGHT_BIC', 1.0),
    'sse': getattr(config, 'WEIGHT_SSE', 1.0),
    'psse': getattr(config, 'WEIGHT_PSSE', 1.0),
}
import sys
import os
import logging
import traceback

# Configure logging BEFORE the first log call. The module-level helpers
# (logging.info / logging.warning) implicitly call basicConfig() with
# defaults when the root logger has no handler, and any later
# basicConfig(...) call is then silently ignored -- which would leave the
# root logger at WARNING and drop every INFO message this script emits.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

# Log configuration status (config was already imported earlier)
if CONFIG_LOADED:
    logging.info("✅ Configuration loaded from csfrat_config.py")
else:
    logging.warning("⚠️ csfrat_config.py not found, using default values")

import numpy as np
import pandas as pd
from datetime import datetime
from itertools import combinations, chain
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter, landscape
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, Image, PageBreak, KeepTogether
from io import BytesIO
import argparse
import symengine
import csv
import matplotlib.colors as mcolors
import colorsys

# Add C-SFRAT directory to Python path (must precede the core/models imports)
sys.path.append(os.path.abspath('C-SFRAT'))

# Configuration for Model class
from core.model import Model
Model.maxCovariates = 3  # Set the maximum number of covariates

# Import C-SFRAT modules
from core.dataClass import Data
from core.goodnessOfFit import Comparison, PSSE
from core.allocation import EffortAllocation
import core.prediction as prediction
from scipy.optimize import shgo

# Import models
from models.geometric import Geometric
from models.discreteWeibull2 import DiscreteWeibull2
from models.negativeBinomial2 import NegativeBinomial2
from models.S_Distribution import S_Distribution
from models.IFR_SB import IFR_SB
from models.IFR_generalized_SB import IFR_Generalized_SB
from models.truncatedLogistic import TruncatedLogistic

# Dictionary of available models (display name -> model class)
MODEL_CLASSES = {
    "Geometric": Geometric,
    "DiscreteWeibull2": DiscreteWeibull2,
    "NegativeBinomial2": NegativeBinomial2,
    "S_Distribution": S_Distribution,
    "IFR_SB": IFR_SB,
    "IFR_Generalized_SB": IFR_Generalized_SB,
    "TruncatedLogistic": TruncatedLogistic
}

# Global toggle for extra plot annotations (ranking footer, model extensions, etc.)
show_plot_annotations = False
# Effort allocation with bounded per-covariate effort
class ConstrainedEffortAllocation:
    """
    Effort allocation that bounds the effort the optimizer may assign to each covariate.

    Two modes are selected by ``allocation_type``:

    * ``1``: ``args[0]`` is a budget ``B``. The model's MVF is maximized with
      ``shgo`` subject to ``sum(effort) <= B`` and a per-covariate bound of
      ``min(B, 50)``. Results are stored on ``res``, ``mvfVal``, ``H`` and
      ``percentages``.
    * anything else: ``args[0]`` is a target ``f``. The total effort is
      minimized subject to the equality constraint ``f - H == 0`` (``H`` being
      the MVF gain over ``model.mvf_array[-1]``) with a per-covariate bound of
      20. Results are stored on ``res2``, ``effort`` and ``percentages2``.
    """

    def __init__(self, model, covariate_data, allocation_type, *args):
        self.model = model
        self.covariate_data = covariate_data

        # Extend the model's hazard values by one extra entry, evaluated at n + 1.
        known_hazard = self.model.hazard_array
        extra_hazard = self.model.hazardNumerical(self.model.n + 1, self.model.modelParameters)
        self.hazard_array = np.concatenate((known_hazard, [extra_hazard]))

        if allocation_type == 1:
            self.B = args[0]
            self.runConstrainedAllocation1()
            self.percentages = self.organizeResults(self.res.x, self.B)
        else:
            self.f = args[0]
            self.runConstrainedAllocation2()
            self.percentages2 = self.organizeResults(self.res2.x, self.effort)

    def runConstrainedAllocation1(self):
        """Maximize the MVF under the budget ``self.B``; sets ``res``, ``mvfVal`` and ``H``."""

        def remaining_budget(x):
            # Non-negative while the summed effort stays within the budget.
            return self.B - sum(x[i] for i in range(self.model.numCovariates))

        budget_constraint = {'type': 'ineq', 'fun': remaining_budget}

        # Each covariate's effort lies in [0, min(B, 50)]; the cap of 50 keeps
        # the optimizer away from extreme single-covariate solutions.
        upper = min(self.B, 50)
        effort_bounds = ((0, upper),) * self.model.numCovariates

        self.res = shgo(
            self.allocationFunction,
            args=(self.covariate_data,),
            bounds=effort_bounds,
            constraints=budget_constraint,
        )
        # allocationFunction returns the negated MVF, so flip the sign back.
        self.mvfVal = -self.res.fun
        self.H = self.mvfVal - self.model.mvf_array[-1]

    def runConstrainedAllocation2(self):
        """Minimize total effort needed to reach target ``self.f``; sets ``res2`` and ``effort``."""
        target_constraint = {'type': 'eq', 'fun': self.optimization2, 'args': (self.covariate_data,)}

        # Tighter per-covariate bound (20) than allocation 1.
        upper = 20
        effort_bounds = ((0, upper),) * self.model.numCovariates

        def total_effort(x):
            return sum(x[i] for i in range(self.model.numCovariates))

        self.res2 = shgo(total_effort, bounds=effort_bounds, constraints=target_constraint)
        self.effort = np.sum(self.res2.x)

    def allocationFunction(self, x, covariate_data):
        """Return the negated MVF for effort vector ``x`` (objective for minimization)."""
        return -(self.allocationFunction2(x, covariate_data))

    def optimization2(self, x, covariate_data):
        """Equality-constraint residual: target ``self.f`` minus the MVF gain for ``x``."""
        gain = self.allocationFunction2(x, covariate_data) - self.model.mvf_array[-1]
        return self.f - gain

    def allocationFunction2(self, x, covariate_data):
        """Return the MVF at the last column after appending ``x`` as a new covariate column."""
        extended = np.concatenate((covariate_data, x[:, None]), axis=1)
        omega = self.model.calcOmega(self.hazard_array, self.model.betas, extended)
        last_index = extended.shape[1] - 1
        return self.model.MVF(self.model.mle_array, omega, self.hazard_array, last_index, extended)

    def organizeResults(self, results, effort):
        """Express ``results`` as percentages of ``effort``; all zeros when effort is not positive."""
        if not effort > 0.0:
            return [0.0 for _ in range(len(results))]
        return np.multiply(np.divide(results, effort), 100)
# Shared table styling for the PDF report
def get_standardized_table_style(num_highlight_rows=0, highlight_color=None, use_alternating_rows=True):
    """Build the common TableStyle used for tables in the PDF report.

    Args:
        num_highlight_rows: How many rows directly below the header receive
            the highlight background (default: 0).
        highlight_color: Background color for highlighted rows; light green
            (#D5F5E3) when None.
        use_alternating_rows: Add alternating white / light-grey row
            backgrounds to the data rows (default: True).

    Returns:
        TableStyle: The assembled style.
    """
    if highlight_color is None:
        highlight_color = colors.HexColor('#D5F5E3')  # Light green

    header_blue = colors.HexColor('#2C3E50')
    border_grey = colors.HexColor('#BDC3C7')

    header_row = ((0, 0), (-1, 0))
    data_rows = ((0, 1), (-1, -1))
    whole_table = ((0, 0), (-1, -1))

    # Header: dark blue band, white bold centered text, thicker rule below.
    header_commands = [
        ('BACKGROUND', *header_row, header_blue),
        ('TEXTCOLOR', *header_row, colors.white),
        ('ALIGN', *header_row, 'CENTER'),
        ('FONTNAME', *header_row, 'Helvetica-Bold'),
        ('FONTSIZE', *header_row, 8),
        ('BOTTOMPADDING', *header_row, 6),
        ('TOPPADDING', *header_row, 6),
        ('LINEBELOW', *header_row, 1.5, header_blue),
    ]

    # Data rows: regular font, left aligned by default.
    body_commands = [
        ('FONTNAME', *data_rows, 'Helvetica'),
        ('FONTSIZE', *data_rows, 9),
        ('ALIGN', *data_rows, 'LEFT'),
    ]

    # Padding, grid lines, outer box, vertical alignment and header wrapping.
    layout_commands = [
        ('LEFTPADDING', *whole_table, 4),
        ('RIGHTPADDING', *whole_table, 4),
        ('TOPPADDING', *data_rows, 5),
        ('BOTTOMPADDING', *data_rows, 5),
        ('GRID', *whole_table, 0.5, border_grey),
        ('BOX', *whole_table, 0.5, border_grey),
        ('VALIGN', *whole_table, 'MIDDLE'),
        ('WORDWRAP', *header_row, True),
    ]

    style_commands = header_commands + body_commands + layout_commands

    if use_alternating_rows:
        striping = [colors.white, colors.HexColor('#F8F9FA')]
        style_commands.append(('ROWBACKGROUNDS', *data_rows, striping))

    # Highlights are appended last so they are applied after the striping.
    if num_highlight_rows > 0:
        style_commands.extend(
            ('BACKGROUND', (0, row), (-1, row), highlight_color)
            for row in range(1, num_highlight_rows + 1)  # row 0 is the header
        )

    return TableStyle(style_commands)
# Class to mimic the GUI's critic calculation
class CriticCalculator:
    """
    Weighted multi-metric ("critic") scoring of fitted models.

    Intended to mirror the Comparison class of the C-SFRAT GUI: each of the
    five goodness-of-fit metrics (LLF, AIC, BIC, SSE, PSSE) is rescaled to a
    per-model score in ``[0, weight]``, and the per-model mean and median of
    those scores are normalized so the best model scores 1.0.

    Attributes set by ``calculate_critic_values``:
        raw_mean_values / raw_median_values: scores before normalization.
        mean_values / median_values: scores normalized by their maximum.
        best_mean_idx / best_median_idx: index of the highest-scoring model.
    """

    def __init__(self, weights=None):
        self.mean_values = None
        self.median_values = None
        self.raw_mean_values = None
        self.raw_median_values = None
        self.best_mean_idx = None
        self.best_median_idx = None
        self.num_metrics = 5  # LLF, AIC, BIC, SSE, PSSE
        self.weights = weights if weights is not None else {
            'llf': 1.0, 'aic': 1.0, 'bic': 1.0, 'sse': 1.0, 'psse': 1.0
        }

    def calculate_critic_values(self, models, weights=None):
        """
        Calculate normalized critic mean/median values for a list of models.

        Args:
            models: List of fitted model objects exposing ``llfVal``,
                ``aicVal``, ``bicVal``, ``sseVal`` and optionally ``psseVal``
                (treated as NaN when absent).
            weights: Optional dict with keys 'llf', 'aic', 'bic', 'sse',
                'psse' overriding the instance weights.

        Returns:
            dict: Maps model index to a ``(critic_mean, critic_median)``
            tuple; empty when ``models`` is empty.
        """
        if not models:
            return {}

        # Use provided weights or instance weights
        active_weights = weights if weights is not None else self.weights

        # Validate weights (warning only; out-of-range weights are still used)
        for metric, weight in active_weights.items():
            if not (0.0 <= weight <= 10.0):
                logging.warning(f"Weight for {metric} ({weight}) is outside recommended range 0.0-10.0")

        # Extract metrics for each model
        llf_values = [model.llfVal for model in models]
        aic_values = [model.aicVal for model in models]
        bic_values = [model.bicVal for model in models]
        sse_values = [model.sseVal for model in models]
        psse_values = [getattr(model, 'psseVal', float('nan')) for model in models]

        # Sum of weights; fall back to equal weights when everything is zero
        weight_sum = sum(active_weights.values())
        if weight_sum == 0:
            logging.warning("All weights are zero. Using equal weights instead.")
            active_weights = {'llf': 1.0, 'aic': 1.0, 'bic': 1.0, 'sse': 1.0, 'psse': 1.0}
            weight_sum = sum(active_weights.values())

        # Log the weights being used
        logging.info(f"Using metric weights: {active_weights}")

        # Per-metric scores, one entry per model
        num_models = len(models)
        llf_ahp = np.zeros(num_models)
        aic_ahp = np.zeros(num_models)
        bic_ahp = np.zeros(num_models)
        sse_ahp = np.zeros(num_models)
        psse_ahp = np.zeros(num_models)
        for i in range(num_models):
            llf_ahp[i] = self._ahp_calc(llf_values, i, active_weights['llf'], weight_sum, higher_is_better=True)  # Higher LLF is better
            aic_ahp[i] = self._ahp_calc(aic_values, i, active_weights['aic'], weight_sum, higher_is_better=False)  # Lower AIC is better
            bic_ahp[i] = self._ahp_calc(bic_values, i, active_weights['bic'], weight_sum, higher_is_better=False)  # Lower BIC is better
            sse_ahp[i] = self._ahp_calc(sse_values, i, active_weights['sse'], weight_sum, higher_is_better=False)  # Lower SSE is better
            psse_ahp[i] = self._ahp_calc(psse_values, i, active_weights['psse'], weight_sum, higher_is_better=False)  # Lower PSSE is better

        # Rows are metrics, columns are models
        ahp_array = np.array([llf_ahp, aic_ahp, bic_ahp, sse_ahp, psse_ahp])

        # DEBUG: Print the AHP values for debugging (first 5 models)
        logging.info(f"AHP DEBUG - llf_ahp: {llf_ahp[:5]}")
        logging.info(f"AHP DEBUG - aic_ahp: {aic_ahp[:5]}")
        logging.info(f"AHP DEBUG - bic_ahp: {bic_ahp[:5]}")
        logging.info(f"AHP DEBUG - sse_ahp: {sse_ahp[:5]}")
        logging.info(f"AHP DEBUG - psse_ahp: {psse_ahp[:5]}")

        # Raw mean and median across the five metrics, per model
        raw_mean = np.mean(ahp_array, axis=0)
        raw_median = np.median(ahp_array, axis=0)
        logging.info(f"AHP DEBUG - raw_mean: {raw_mean[:5]}")
        logging.info(f"AHP DEBUG - raw_median: {raw_median[:5]}")

        # Store raw values for legend display
        self.raw_mean_values = raw_mean
        self.raw_median_values = raw_median

        # With a single positively-weighted metric the median over five rows
        # (four of them zero) carries no information, so reuse the mean.
        active_metrics = sum(1 for weight in active_weights.values() if weight > 0.0)
        if active_metrics == 1:
            logging.info("Single-metric configuration detected: Setting median values equal to mean values")
            raw_median = raw_mean.copy()
            self.raw_median_values = raw_median

        # Normalize to 0.0-1.0 scale
        try:
            # NaN scores count as 0 for normalization
            raw_mean_clean = np.nan_to_num(raw_mean, nan=0.0)
            raw_median_clean = np.nan_to_num(raw_median, nan=0.0)
            max_mean = np.max(raw_mean_clean)
            max_median = np.max(raw_median_clean)
            logging.info(f"NORMALIZATION DEBUG - max_mean: {max_mean:.6f}, max_median: {max_median:.6f}")
            logging.info(f"NORMALIZATION DEBUG - len(models): {len(models)}")

            # Fall back to equal scores when nothing is positive
            if max_mean > 0 and not np.isnan(max_mean):
                self.mean_values = np.divide(raw_mean_clean, max_mean)
                logging.info(f"NORMALIZATION DEBUG - self.mean_values: {self.mean_values[:5]}")
            else:
                self.mean_values = np.ones(num_models) / num_models  # Equal weights if all zero
                logging.info(f"NORMALIZATION DEBUG - Using equal weights for mean: {self.mean_values[:5]}")

            if max_median > 0 and not np.isnan(max_median):
                self.median_values = np.divide(raw_median_clean, max_median)
                logging.info(f"NORMALIZATION DEBUG - self.median_values: {self.median_values[:5]}")
            else:
                self.median_values = np.ones(num_models) / num_models  # Equal weights if all zero
                logging.info(f"NORMALIZATION DEBUG - Using equal weights for median: {self.median_values[:5]}")

            # Find best combinations
            self.best_mean_idx = np.argmax(self.mean_values)
            self.best_median_idx = np.argmax(self.median_values)
        except (ValueError, ZeroDivisionError):
            # Handle edge cases: equal scores, first model is "best"
            self.mean_values = np.ones(num_models) / num_models
            self.median_values = np.ones(num_models) / num_models
            self.best_mean_idx = 0
            self.best_median_idx = 0

        # Return results as a dictionary mapping model index to (mean, median) tuple
        results = {}
        for i in range(num_models):
            results[i] = (self.mean_values[i], self.median_values[i])
            # DEBUG: Print the final results for first 5 models
            if i < 5:
                logging.info(f"FINAL RESULTS DEBUG - Model {i}: mean={self.mean_values[i]:.6f}, median={self.median_values[i]:.6f}")
        return results

    def _ahp_calc(self, measure_array, index, weight_value, weight_sum, higher_is_better=True):
        """
        Score one model on one metric, scaled to ``[0, weight]``.

        Args:
            measure_array: Metric values for all models.
            index: Index of the model being scored.
            weight_value: Weight assigned to this metric (0 disables it).
            weight_sum: Sum of all weights.
            higher_is_better: True when larger values are better (LLF);
                False when smaller values are better (AIC, BIC, SSE, PSSE).

        Returns:
            float: ``weight`` for the best model on this metric, 0.0 for the
            worst, linearly interpolated in between. Returns 0.0 when the
            weight is 0 or the model's own value is NaN/inf, and
            ``1 / num_metrics`` when there is only one model.
        """
        # If weight is 0, the metric is not considered
        if weight_value == 0:
            return 0.0

        try:
            weight = weight_value / weight_sum
        except ZeroDivisionError:
            # If all weights are zero, use equal weighting
            weight = 1.0 / float(self.num_metrics)

        # With a single model there is nothing to compare against
        if len(measure_array) <= 1:
            return 1.0 / float(self.num_metrics)

        values = np.asarray(measure_array, dtype=float)
        if not higher_is_better:
            # Lower-is-better metrics are compared by magnitude.
            values = np.absolute(values)
        # Higher-is-better metrics keep their sign: LLF is typically
        # negative, and comparing magnitudes would rank the most negative
        # (worst) log-likelihood highest.

        current_val = values[index]
        if not np.isfinite(current_val):
            # A model whose metric could not be computed scores nothing here.
            return 0.0

        # Ignore NaN/inf entries of other models so one failed metric does
        # not poison the range used to score everyone else.
        usable = values[np.isfinite(values)]
        min_val = usable.min()
        max_val = usable.max()

        # DEBUG: limited to lower-is-better metrics and the first 3 models
        if not higher_is_better and weight_value > 0 and index < 3:
            logging.info(f"AHP DEBUG - measure_array range: [{min_val:.6f}, {max_val:.6f}], current: {current_val:.6f}, weight: {weight_value}, weight_sum: {weight_sum}")

        if max_val == min_val:
            # All values are the same, give equal weight
            ahp_val = weight
            if not higher_is_better and weight_value > 0 and index < 3:
                logging.info(f"AHP DEBUG - All values same, ahp_val = weight = {ahp_val}")
        elif higher_is_better:
            # Higher values get higher scores
            ahp_val = (current_val - min_val) / (max_val - min_val) * weight
        else:
            # Lower values get higher scores
            ahp_val = (max_val - current_val) / (max_val - min_val) * weight
            if weight_value > 0 and index < 3:  # Debug for first 3 models
                logging.info(f"AHP DEBUG - Calculated ahp_val = ({max_val:.6f} - {current_val:.6f}) / ({max_val:.6f} - {min_val:.6f}) * {weight:.6f} = {ahp_val:.6f}")
        return ahp_val
def powerset(iterable):
    """Lazily yield every subset of ``iterable`` as a tuple, smallest subsets first.

    The empty tuple comes first and the full set last; within one size the
    order is that of ``itertools.combinations``.
    """
    items = list(iterable)
    subsets_by_size = (combinations(items, size) for size in range(len(items) + 1))
    return chain.from_iterable(subsets_by_size)
def setup_output_directory(output_dir='outputs'):
    """Create the output directory (and any missing parents) if it doesn't exist.

    Args:
        output_dir: Directory to create. Defaults to 'outputs', the
            directory this function always created before the parameter
            existed. An empty value is treated as the current directory.
    """
    # exist_ok makes repeated calls harmless.
    os.makedirs(output_dir or '.', exist_ok=True)
    logging.info("Output directory ready")
def load_data(file_path, sheet_name=None, subset_limit=None):
    """
    Load failure data from a CSV or Excel file.

    Args:
        file_path: Path to input data file (.csv, .xlsx or .xls; the
            extension is matched case-insensitively).
        sheet_name: Sheet name for Excel files (None = use first sheet).
        subset_limit: Limit analysis to first N intervals (None = use all data).

    Returns:
        tuple: ``(df, covariates)`` where ``df`` is the loaded DataFrame with
        a 'CFC' column (cumulative sum of 'FC', added when missing) and
        ``covariates`` lists every column other than 'T', 'FC' and 'CFC'.

    Raises:
        FileNotFoundError: ``file_path`` does not exist.
        ValueError: Unsupported extension, requested sheet could not be
            read, or a required column ('T', 'FC') is missing.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Input file not found: {file_path}")

    available_sheets = None
    lowered_path = file_path.lower()

    # Load data directly with pandas based on file extension
    if lowered_path.endswith('.csv'):
        df = pd.read_csv(file_path)
        available_sheets = ["CSV (Single Sheet)"]
    elif lowered_path.endswith(('.xlsx', '.xls')):
        # Open the workbook once and close it deterministically.
        with pd.ExcelFile(file_path) as workbook:
            available_sheets = workbook.sheet_names
            if sheet_name:
                try:
                    df = pd.read_excel(workbook, sheet_name=sheet_name)
                except ValueError as exc:
                    raise ValueError(
                        f"Sheet '{sheet_name}' not found. Available sheets: {available_sheets}"
                    ) from exc
                logging.info(f"Loaded Excel sheet: '{sheet_name}'")
            else:
                # Load first sheet by default
                df = pd.read_excel(workbook, sheet_name=available_sheets[0])
                logging.info(f"Loaded first Excel sheet: '{available_sheets[0]}' (available: {available_sheets})")
    else:
        raise ValueError(f"Unsupported file format: {file_path}")

    # Data subsetting: keep only the first `subset_limit` rows when it is a
    # positive number smaller than the data set.
    original_rows = len(df)
    if subset_limit and subset_limit > 0 and subset_limit < len(df):
        # copy() so the CFC column added below is not written into a slice
        df = df.head(subset_limit).copy()
        logging.info(f"Applied data subset: using first {subset_limit} intervals (of {original_rows} total)")
        print(f"INFO: Data subset applied - analyzing first {subset_limit} intervals (GUI slider functionality)")

    # Debug the loaded data
    print(f"\nDEBUG - Loaded data frame:\n{df.head(5)}")
    print(f"DataFrame shape: {df.shape}")
    print(f"DataFrame columns: {df.columns.tolist()}")
    print(f"Total rows: {len(df)}")
    if subset_limit:
        print(f"Original file had {original_rows} rows, using first {len(df)} rows (subset)")

    # Ensure required columns exist
    for col in ('T', 'FC'):
        if col not in df.columns:
            raise ValueError(f"Required column '{col}' not found in the data file")

    # Add CFC (Cumulative Failure Count) if not exists
    if 'CFC' not in df.columns:
        df['CFC'] = df['FC'].cumsum()

    # Identify covariates (any column that is not T, FC, or CFC)
    covariates = [col for col in df.columns if col not in ['T', 'FC', 'CFC']]

    # Log information about the loaded data
    logging.info(f"Loaded data from {file_path}")
    logging.info(f"Data dimensions: {df.shape}")
    logging.info(f"Covariates: {covariates}")
    if available_sheets is not None:
        logging.info(f"Available sheets: {available_sheets}")
    return df, covariates
def prepare_model_data(data_df, covariate_names):
    """Assemble a fresh DataFrame holding T, FC, a recomputed CFC and the chosen covariates.

    CFC is always rebuilt as the cumulative sum of ``data_df['FC']``.
    Covariate names that are not columns of ``data_df`` are silently skipped.
    """
    present_covariates = [name for name in covariate_names if name in data_df.columns]

    # Column order: T, FC, CFC, then the covariates in the order requested.
    column_values = [
        ('T', data_df['T'].values),
        ('FC', data_df['FC'].values),
        ('CFC', data_df['FC'].cumsum().values),
    ]
    column_values.extend((name, data_df[name].values) for name in present_covariates)

    model_data = pd.DataFrame()
    for label, values in column_values:
        model_data[label] = values

    # Debug info
    print(f"Data shape: {model_data.shape}")
    print(f"CFC values: {model_data['CFC'].values}")
    return model_data
# MVF prediction over the full data set, used for PSSE
def prediction_psse(model, data_df):
    """
    Evaluate a fitted model's MVF at every row of ``data_df``.

    The model's stored hazard values (``model.hazard_array``, when present)
    are extended with ``model.hazardNumerical`` for indices ``model.n`` up to
    the number of rows, so a model fitted on a prefix of the data can be
    evaluated over all of it.

    Args:
        model: The fitted model object.
        data_df: DataFrame containing the data; must hold a column for each
            name in ``model.metricNames``.

    Returns:
        array: MVF value for each row index of ``data_df``.
    """
    n_points = len(data_df)

    # Covariate values stacked one row per covariate; None without covariates.
    if model.metricNames:
        covariate_matrix = np.array([data_df[column].values for column in model.metricNames])
    else:
        covariate_matrix = None

    # Hazard values for the points beyond those the model was fitted on.
    extrapolated = np.array([
        model.hazardNumerical(step, model.modelParameters)
        for step in range(model.n, n_points)
    ])
    if hasattr(model, 'hazard_array'):
        full_hazard = np.concatenate((model.hazard_array, extrapolated))
    else:
        full_hazard = extrapolated

    omega = model.calcOmega(full_hazard, model.betas, covariate_matrix)
    return np.array([
        model.MVF(model.mle_array, omega, full_hazard, point, covariate_matrix)
        for point in range(n_points)
    ])
def run_all_combinations(df, covariates):
    """Run all model and covariate combinations with direct data access

    Every class in the module-level MODEL_CLASSES is fitted against every
    subset of `covariates` (including the empty subset). For each fit a
    second model is trained on the leading `psse_subset_parameter` fraction
    of the rows to compute PSSE. Fits that raise are logged and skipped.

    Args:
        df: DataFrame with the failure data; the 'T', 'FC' and 'CFC' columns
            and the covariate columns are read here or in the helpers called.
        covariates: Iterable of covariate column names to combine.

    Returns:
        list: The fitted model objects, annotated with psseVal, critic
        values and covariateNames, in the order produced by
        apply_final_ranking.
    """
    results = []
    # Generate all covariate combinations
    covariate_combinations = list(powerset(covariates))
    logging.info(f"Generated {len(covariate_combinations)} covariate combinations")
    # For each model and covariate combination, fit the model
    for model_name, model_class in MODEL_CLASSES.items():
        for covs in covariate_combinations:
            covs_list = list(covs)
            logging.info(f"Running {model_name} with covariates: {covs_list}")
            # Create model data for this combination
            model_data = prepare_model_data(df, covs_list)
            try:
                # Initialize and run the model
                model = initialize_model(model_class, model_data, covs_list)
                # Store arrays for actual and fitted values
                model.t = model_data['T'].values
                model.CFC = model_data['CFC'].values
                # Ensure we have the MVF values available for metrics calculation
                if not hasattr(model, 'mvfList') or model.mvfList is None:
                    # For models that don't directly set mvfList, try to access it differently
                    if hasattr(model, 'mvf_array'):
                        model.mvfList = model.mvf_array
                    elif hasattr(model, 'modeledFailures'):
                        model.mvfList = model.modeledFailures
                    elif hasattr(model, 'omega') and hasattr(model, 'beta'):
                        # Generate simple MVF values for models that follow the basic reliability pattern
                        model.mvfList = [model.omega * (1 - np.exp(-model.beta * t)) for t in model.t]
                    else:
                        # Last resort - create dummy values that match the observation count
                        logging.warning(f"Creating fallback MVF values for {model_name}")
                        model.mvfList = model.CFC.copy()  # Use actual values as a fallback
                # Ensure we have the intensity values available for metrics calculation
                if not hasattr(model, 'intensityList') or model.intensityList is None:
                    # Intensity is the first difference of the MVF values;
                    # the first entry is the first MVF value itself
                    model.intensityList = []
                    for i in range(len(model.mvfList)):
                        if i == 0:
                            intensity = model.mvfList[0]
                        else:
                            intensity = model.mvfList[i] - model.mvfList[i-1]
                        model.intensityList.append(intensity)
                # Calculate additional goodness-of-fit measures
                try:
                    # PSSE: fit a second model on a leading subset of the data
                    # (size set by psse_subset_parameter), then compare its
                    # predictions against the full dataset
                    # Create data subset for PSSE model fitting
                    subset_size = int(len(df) * psse_subset_parameter)
                    if subset_size < 5:  # Minimum 5 data points (or all rows if fewer)
                        subset_size = min(5, len(df))
                    if subset_size >= len(df):  # Maximum n-1 data points
                        subset_size = len(df) - 1
                    df_subset = df.head(subset_size)
                    logging.info(f"PSSE calculation: using subset of {subset_size} points (fraction: {psse_subset_parameter}) from {len(df)} total points")
                    # Prepare model data for the subset
                    subset_model_data = prepare_model_data(df_subset, covs_list)
                    # Create and fit model on subset data
                    psse_model = initialize_model(model_class, subset_model_data, covs_list)
                    psse_model.t = subset_model_data['T'].values
                    psse_model.CFC = subset_model_data['CFC'].values
                    # Get fitted values using the subset-trained model on full data
                    fitted_array = prediction_psse(psse_model, df)
                    # Debug the PSSE calculation
                    logging.info(f"PSSE calculation debug - subset_model.n: {psse_model.n}, fitted_array shape: {fitted_array.shape}, full CFC shape: {df['CFC'].values.shape}")
                    # Use the subset model's n (number of fitted points) as the intervals parameter
                    psse_val = PSSE(fitted_array, df['CFC'].values, psse_model.n)
                    model.psseVal = psse_val
                    # The weighted critic values are computed after all fits;
                    # the simple versions are kept for debugging and as a fallback
                    simple_critic_mean, simple_critic_median = calculate_prequential_metrics(model.CFC, model.mvfList)
                    model.simple_critic_mean = simple_critic_mean
                    model.simple_critic_median = simple_critic_median
                    logging.info(f"Additional metrics - PSSE: {model.psseVal:.6f}, Simple Critic Mean: {model.simple_critic_mean:.6f}, Simple Critic Median: {model.simple_critic_median:.6f}")
                except Exception as e:
                    # Any failure here leaves the model in the results with NaN metrics
                    logging.warning(f"Error calculating additional metrics: {str(e)}")
                    logging.warning(traceback.format_exc())
                    model.psseVal = float('nan')
                    model.simple_critic_mean = float('nan')
                    model.simple_critic_median = float('nan')
                # Add to results if model converged
                logging.info(f"Model {model.name} with covariates {covs_list} converged successfully")
                logging.info(f"AIC: {model.aicVal}, BIC: {model.bicVal}, LLF: {model.llfVal}")
                # Store covariate names for reporting
                model.covariateNames = covs_list
                results.append(model)
            except Exception as e:
                # A failed fit is logged and this combination is skipped
                logging.error(f"Error running {model_name} with covariates {covs_list}: {str(e)}")
                logging.error(traceback.format_exc())
    # Sort models by AIC (ascending); models without aicVal go last
    results.sort(key=lambda x: x.aicVal if hasattr(x, 'aicVal') else float('inf'))
    # Calculate critic values with CriticCalculator
    if results:
        try:
            # Use the global metric_weights configuration
            global metric_weights
            calculator = CriticCalculator(weights=metric_weights)
            critic_values = calculator.calculate_critic_values(results)
            # Assign the calculated critic values to each model
            for i, model in enumerate(results):
                if i in critic_values:
                    model.criticMean, model.criticMedian = critic_values[i]
                    # Also store raw (unnormalized) values for legend display
                    model.rawCriticMean = calculator.raw_mean_values[i]
                    model.rawCriticMedian = calculator.raw_median_values[i]
                    logging.info(f"GUI-style critic values for {model.name}: Mean={model.criticMean:.6f}, Median={model.criticMedian:.6f}")
        except Exception as e:
            logging.error(f"Error calculating GUI-style critic values: {str(e)}")
            logging.error(traceback.format_exc())
            # Fall back to simple critic values if AHP calculation fails
            for model in results:
                if hasattr(model, 'simple_critic_mean'):
                    model.criticMean = model.simple_critic_mean
                    model.criticMedian = model.simple_critic_median
    # Apply final ranking based on user's ranking method preference
    results = apply_final_ranking(results)
    return results
def apply_final_ranking(models):
    """Sort fitted models by the user's preferred critic score, best first.

    The module-level ``ranking_method`` setting selects the score:
    ``'median'`` (case-insensitive) ranks by ``criticMedian``; ``'mean'``
    ranks by ``criticMean``. Any other value (``None``, an empty string, a
    non-string, or an unknown name) is replaced by ``'mean'`` in the module
    global and a warning is logged.

    Args:
        models: List of fitted model objects. Models without the selected
            critic attribute are scored as 0; models whose score is NaN are
            ranked last. The list is sorted in place.

    Returns:
        The same list object, sorted in descending order of the selected
        critic score (higher is better).
    """
    global ranking_method

    # Fall back to 'mean' for anything that is not a recognised method name.
    # The isinstance check keeps a non-string setting from crashing .lower().
    if (not isinstance(ranking_method, str)
            or ranking_method.lower() not in ('mean', 'median')):
        ranking_method = 'mean'
        logging.warning("Invalid or missing ranking method. Defaulting to 'mean'")

    method = ranking_method.lower()
    logging.info(f"Ranking models using {method} critic values")

    score_attr = 'criticMedian' if method == 'median' else 'criticMean'

    def _rank_key(model):
        score = getattr(model, score_attr, 0)
        # NaN compares false against every value, which would leave the sort
        # order arbitrary; push NaN scores to the bottom of the ranking.
        return float('-inf') if score != score else score

    # list.sort is stable, so models with equal scores keep their prior order.
    models.sort(key=_rank_key, reverse=True)
    logging.info(f"Models ranked by critic {method} values")

    # Log the top 3 models for verification. Critic values may be absent
    # (the sort above tolerates that), so read them defensively here too.
    nan = float('nan')
    for rank, model in enumerate(models[:3], start=1):
        if hasattr(model, 'aicVal'):
            critic_mean = getattr(model, 'criticMean', nan)
            critic_median = getattr(model, 'criticMedian', nan)
            logging.info(
                f"Rank {rank}: {model.name} - "
                f"Mean: {critic_mean:.6f}, Median: {critic_median:.6f}"
            )

    return models
def calculate_prequential_metrics(actual, fitted):
    """Return the mean and median absolute error between fitted and actual values.

    This is the simplified, direct "critic" calculation. Per the original
    author's note, it differs from the GUI's Analytic Hierarchy Process (AHP)
    approach and is kept for comparison and fallback purposes.

    Args:
        actual: Sequence of actual cumulative failures.
        fitted: Sequence of fitted MVF values from the model.

    Returns:
        tuple: (critic_mean, critic_median). If the calculation raises for
        any reason, a warning is logged and (nan, nan) is returned.
    """
    try:
        observed = np.array(actual)
        predicted = np.array(fitted)

        # Element-wise absolute deviation of the fit from the observations
        abs_errors = np.abs(predicted - observed)

        return np.mean(abs_errors), np.median(abs_errors)
    except Exception as e:
        logging.warning(f"Error calculating prequential metrics: {str(e)}")
        nan = float('nan')
        return nan, nan
def create_growth_curve_plot(models, single_model_mode=False, predictions=None, individual_predictions=None):
"""Create a growth curve plot matching the original C-SFRAT tool style
Args:
models: Single model object or list of models
single_model_mode: If True, only plot the first model regardless of multi_model_plots setting
predictions: Dictionary containing prediction data for the best model
individual_predictions: Dictionary containing individual predictions for multiple models
"""
# Debug logging to see what the plotting function receives
logging.info(f"PLOT FUNCTION DEBUG - predictions type: {type(predictions)}")
logging.info(f"PLOT FUNCTION DEBUG - individual_predictions: {individual_predictions}")
logging.info(f"PLOT FUNCTION DEBUG - predictions method: {predictions.get('prediction_method', 'N/A') if predictions else 'N/A'}")
# Handle both single model and multi-model cases
if not isinstance(models, list):
models = [models]
if not models or len(models) == 0:
return None
# Determine how many models to plot
global multi_model_plots, num_models_to_compare, individual_model_predictions, show_model_predictions_separately
if single_model_mode or not multi_model_plots:
models_to_plot = models[:1] # Only plot the best model
plot_title = f'MVF - {models[0].name}'
else:
models_to_plot = models[:min(num_models_to_compare, len(models))]
if len(models_to_plot) > 1:
plot_title = f'MVF Comparison - Top {len(models_to_plot)} Models'
else:
plot_title = f'MVF - {models[0].name}'
# Simplified title - predictions are now seamlessly integrated
# No need to mention predictions in title since they appear as continuous extensions
fig, ax = plt.subplots(figsize=(9.5, 6.5))
# Create consistent color mapping across all plots
color_mapping, original_colors = create_model_color_mapping(models, individual_predictions)
# Plot imported data first as black step plot (matching original C-SFRAT)
if hasattr(models[0], 't') and hasattr(models[0], 'CFC'):
# Step plot with right alignment (stepMode='right' in original)
ax.step(models[0].t, models[0].CFC, where='post', color='black', linewidth=3,
label='Data', zorder=10)
# Plot each fitted model with original C-SFRAT style
for i, model in enumerate(models_to_plot):
# Create consistent model key for color mapping
model_key = model.name
if hasattr(model, 'covariateNames') and model.covariateNames:
cov_str = ",".join([c[:3] for c in model.covariateNames])
model_key += f"({cov_str})"
color = color_mapping.get(model_key, original_colors[i % len(original_colors)])
# Ensure model has required attributes for plotting
if not hasattr(model, 't') or not hasattr(model, 'CFC'):
logging.warning(f"Model {model.name} missing required plotting attributes")
continue
# Get Mean Value Function values (use mvf_array to match original)
mvf_data = []
if hasattr(model, 'mvf_array') and model.mvf_array is not None:
mvf_data = model.mvf_array
elif hasattr(model, 'mvfList') and model.mvfList is not None:
mvf_data = model.mvfList
elif hasattr(model, 'omega'):
# Generate simple MVF data if actual values not available
mvf_data = [model.omega * (1 - np.exp(-model.beta * t)) for t in model.t]
if len(mvf_data) > 0:
# Ensure data is properly converted to numpy arrays
try:
time_data = np.array([float(x) for x in model.t])
mvf_array = np.array([float(x) for x in mvf_data])
except (ValueError, TypeError) as e:
# Fallback for complex data types
logging.warning(f"Array conversion fallback for model {model.name}: {e}")
time_data = np.asfarray(model.t)
mvf_array = np.asfarray(mvf_data)
# Create model label
if len(models_to_plot) == 1:
model_label = model.name
if hasattr(model, 'covariateNames') and model.covariateNames:
model_label += f" ({', '.join(model.covariateNames)})"
else:
model_label = model.name
if hasattr(model, 'covariateNames') and model.covariateNames:
cov_str = ",".join([c[:3] for c in model.covariateNames]) # Abbreviated covariates
model_label += f" ({cov_str})"
# Add ranking information for multi-model plots
if hasattr(model, 'criticMean') and hasattr(model, 'criticMedian'):
global ranking_method
if ranking_method.lower() == 'median':
model_label += f" [MD:{model.criticMedian:.4f}]"
else:
model_label += f" [M:{model.criticMean:.4f}]"
# Plot fitted data with circles and lines (matching original C-SFRAT style)
ax.plot(time_data, mvf_array, color=color, linewidth=3, marker='o', markersize=4,
markerfacecolor=color, markeredgecolor=color, label=model_label,
alpha=0.9, zorder=5)
# REMOVED: Green effort prediction line (individual model predictions provide smooth behavior)
effort_prediction_shown = False
# Handle individual model predictions as continuous extensions (professor feedback)
if individual_predictions and individual_model_predictions and show_model_predictions_separately:
for model_key, pred_data in individual_predictions.items():
# Use consistent color mapping for predictions (same as model fit)
pred_color = color_mapping.get(model_key, original_colors[0])
# Get the model's last fitted point for connection
model = pred_data['model']
if not hasattr(model, 't') or not hasattr(model, 'mvf_array') and not hasattr(model, 'mvfList'):
continue
# Get MVF data for connection
if hasattr(model, 'mvf_array') and model.mvf_array is not None:
last_mvf = model.mvf_array[-1]
elif hasattr(model, 'mvfList') and model.mvfList is not None:
last_mvf = model.mvfList[-1]
else:
continue
# Plot individual model predictions
pred_times = pred_data['future_times']
pred_mvf = pred_data['future_mvf']
if len(pred_times) > 0 and len(pred_mvf) > 0:
# Convert prediction data to numpy arrays with better error handling
try:
pred_times_array = np.array([float(x) for x in pred_times])
pred_mvf_array = np.array([float(x) for x in pred_mvf])
# Connect the last fitted point to the first prediction point
connect_times = np.array([float(model.t[-1])] + [float(x) for x in pred_times_array])
connect_mvf = np.array([float(last_mvf)] + [float(x) for x in pred_mvf_array])