@@ -56,7 +56,11 @@ def min_max_normalized_mae(y_true: ArrayLike, y_pred: ArrayLike) -> float:
5656
5757
def mean_absolute_percentage_error(
    y_true: ArrayLike,
    y_pred: ArrayLike,
    epsilon: float = 1e-5,
    aggregate: bool = True,
    ignore_inf: bool = True,
) -> "float | np.ndarray":
    """
    Calculate the Mean Absolute Percentage Error (MAPE).

    The error is returned as a fraction (``0.05`` means 5 %), computed as
    ``abs((y_true - y_pred) / y_true)`` after shifting both inputs by
    ``epsilon``.

    Parameters
    ----------
    y_true : ArrayLike
        Reference values.
    y_pred : ArrayLike
        Values compared against ``y_true``.
    epsilon : float, optional
        Small offset added to both inputs to avoid division by zero.
    aggregate : bool, optional
        If True (default), return the mean over all elements. If False,
        return the element-wise absolute percentage errors.
    ignore_inf : bool, optional
        If True (default), drop every pair in which either value is not
        finite (``inf``, ``-inf`` or ``NaN``). The remaining values are
        flattened to one dimension. If False, inputs keep their shape and
        non-finite values propagate into the result.

    Returns
    -------
    float or np.ndarray
        The mean error if ``aggregate`` is True, otherwise the array of
        element-wise errors.
    """
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    if ignore_inf:
        # Build ONE mask from both inputs so the pairs stay aligned; filtering
        # each array on its own would shift elements against each other (or
        # produce arrays of different length) whenever only one side holds a
        # non-finite value.
        y_true, y_pred = np.broadcast_arrays(y_true, y_pred)
        finite = np.isfinite(y_true) & np.isfinite(y_pred)
        y_true = y_true[finite]
        y_pred = y_pred[finite]

    # Avoid division by zero
    y_true = y_true + epsilon
    y_pred = y_pred + epsilon

    # Element-wise absolute percentage errors
    mape = np.abs((y_true - y_pred) / y_true)

    if aggregate:
        mape = np.mean(mape)

    return mape
90101
91102
92103def create_numeric_mask (arr : ArrayLike ) -> np .ndarray :
@@ -210,6 +221,8 @@ def __init__(self):
210221 if plot .split ("." )[- 1 ] in ["png" , "jpg" , "jpeg" , "svg" ]
211222 ]
212223
224+ self ._variables_deviation_ds = None
225+
213226 # Status strings for file comparison table
214227 STATUS_FILE_MISSING = " :warning: Missing"
215228 STATUS_EQUAL = ":white_check_mark: Equal"
@@ -221,6 +234,67 @@ def __init__(self):
221234 STATUS_ALMOST_EQUAL = ":white_check_mark: Almost equal"
222235 STATUS_NEW = ":warning: New"
223236
237+ VARIABLES_FILE = "KN2045_Bal_v4/ariadne/exported_variables_full.xlsx"
238+
@property
def variables_deviation_ds(self):
    """
    Per-variable deviation between the main and feature variable exports.

    Reads ``VARIABLES_FILE`` from both ``dir_main`` and ``dir_feature``,
    keeps only the columns whose label looks numeric (digits with at most
    one dot -- presumably year columns, verify against the export format),
    and computes the element-wise absolute percentage error between both
    tables. Each row's errors are rounded to two decimals and averaged; the
    resulting series is indexed by ``Variable`` and sorted in descending
    order.

    The result is computed once and cached on the instance.
    """
    cached = self._variables_deviation_ds
    if cached is not None:
        return cached

    def numeric_label_columns(raw):
        # Index by variable name and keep only numeric-looking column labels.
        keep = [
            label
            for label in raw.columns
            if str(label).replace(".", "", 1).isdigit()
        ]
        return raw.set_index("Variable").loc[:, keep]

    # Read both files before any processing, as the comparison needs both.
    raw_main = pd.read_excel(self.dir_main / self.VARIABLES_FILE)
    raw_feature = pd.read_excel(self.dir_feature / self.VARIABLES_FILE)
    main_vars = numeric_label_columns(raw_main)
    feature_vars = numeric_label_columns(raw_feature)

    # Rows are compared positionally, so both exports must list the same
    # variables in the same order.
    assert main_vars.index.equals(feature_vars.index)

    per_cell = mean_absolute_percentage_error(
        main_vars, feature_vars, ignore_inf=False, aggregate=False
    )
    per_variable = np.round(per_cell, 2).mean(axis=1)
    ranked = pd.Series(per_variable, index=main_vars.index).sort_values(
        ascending=False
    )

    self._variables_deviation_ds = ranked
    return self._variables_deviation_ds
264+
@property
def variables_comparison(self) -> str:
    """
    HTML table of variables whose MAPE exceeds the 5 % threshold.

    Returns an empty string when ``VARIABLES_FILE`` is missing in either
    ``dir_main`` or ``dir_feature``. Otherwise returns the table rendered
    with ``DataFrame.to_html`` followed by a short legend.

    ``variables_deviation_ds`` holds fractions (``0.05`` is 5 %), so the
    threshold is applied as ``0.05`` and values are rendered with the ``%``
    format type, which multiplies by 100. This keeps the table consistent
    with the "MAPE > 5%" legend.
    """
    main_file = self.dir_main / self.VARIABLES_FILE
    if not main_file.exists():
        return ""
    feature_file = self.dir_feature / self.VARIABLES_FILE
    if not feature_file.exists():
        return ""

    threshold = 0.05  # 5 %, expressed as a fraction like the deviations
    deviation = self.variables_deviation_ds
    table = (
        deviation.loc[deviation > threshold]
        .apply(lambda x: f"{x:.2%}")
        .to_frame("MAPE")
    )
    table.index.name = ""

    return (
        f"{table.to_html(escape=False)}\n"
        f"\n"
        f"MAPE: Mean Absolute Percentage Error\n"
        f"Threshold: MAPE > 5%\n"
        f"Only variables reaching the threshold are shown. Find the equivalent "
        f"plot for all of them below.\n\n"
    )
287+
@property
def changed_variables_plots(self) -> str:
    """
    Plots for the changed variables (placeholder).

    Currently returns an empty string in every case: either because
    ``VARIABLES_FILE`` is missing in ``dir_main`` or ``dir_feature``, or
    because the plot section is not implemented yet.
    """
    main_file = self.dir_main / self.VARIABLES_FILE
    if not main_file.exists():
        return ""
    feature_file = self.dir_feature / self.VARIABLES_FILE
    if not feature_file.exists():
        return ""
    # Not implemented yet
    return ""
297+
224298 @property
225299 def plots_table (self ) -> str :
226300 """Plots comparison table."""
@@ -244,6 +318,15 @@ def plots_table(self) -> str:
244318 )
245319 return df .to_html (escape = False , index = False ) + "\n "
246320
def _read_to_dataframe(self, path: Path) -> "pd.DataFrame | None":
    """
    Read a file to a dataframe.

    The reader is chosen from the file suffix, compared case-insensitively:
    ``.csv`` is read with ``pd.read_csv``, ``.xlsx`` / ``.xls`` with
    ``pd.read_excel``.

    Returns
    -------
    pd.DataFrame or None
        The loaded table, or ``None`` if the suffix is not supported.
        Callers must handle the ``None`` case.
    """
    # Normalise the case so that e.g. "DATA.CSV" is not silently skipped.
    suffix = path.suffix.lower()
    if suffix == ".csv":
        return pd.read_csv(path)
    if suffix in (".xlsx", ".xls"):
        return pd.read_excel(path)
    return None
329+
247330 @property
248331 def files_table (self ) -> str :
249332 """Files comparison table."""
@@ -252,88 +335,92 @@ def files_table(self) -> str:
252335 # Loop through all files in main dir
253336 for root , _ , files in os .walk (self .dir_main ):
254337 for file in files :
255- if file .endswith (".csv" ):
256- path_in_main = Path (root ) / file
257- relative_path = os .path .relpath (path_in_main , self .dir_main )
258- index_str = "../" + "/" .join (str (relative_path ).split ("/" )[1 :])
259- path_in_feature = self .dir_feature / relative_path
260-
261- if not path_in_feature .exists ():
262- rows [file ] = [index_str , "" , self .STATUS_FILE_MISSING , "" , "" ]
263- continue
264-
265- df1 = pd .read_csv (path_in_main )
266- df2 = pd .read_csv (path_in_feature )
267-
268- if df1 .equals (df2 ):
269- rows [file ] = [index_str , "" , self .STATUS_EQUAL , "" , "" ]
270-
271- # Numeric type mismatch
272- elif df1 .apply (pd .to_numeric , errors = "coerce" ).equals (
273- df2 .apply (pd .to_numeric , errors = "coerce" )
274- ):
275- rows [file ] = [index_str , "" , self .STATUS_TYPE_MISMATCH , "" , "" ]
276-
277- # Nan mismatch
278- elif not df1 .isna ().equals (df2 .isna ()):
279- rows [file ] = [index_str , "" , self .STATUS_NAN_MISMATCH , "" , "" ]
280-
281- # Inf mismatch
282- elif not df1 .isin ([np .inf , - np .inf ]).equals (
283- df2 .isin ([np .inf , - np .inf ])
284- ):
285- rows [file ] = [index_str , "" , self .STATUS_INF_MISMATCH , "" , "" ]
286- # Changed
287- else :
288- # Get numeric mask
289- numeric_mask = ~ np .isnan (
290- df1 .apply (pd .to_numeric , errors = "coerce" ).to_numpy ()
338+ path_in_main = Path (root ) / file
339+ relative_path = os .path .relpath (path_in_main , self .dir_main )
340+ index_str = "/" .join (str (relative_path ).split ("/" )[1 :])
341+ path_in_feature = self .dir_feature / relative_path
342+
343+ if not path_in_feature .exists ():
344+ rows [file ] = [index_str , "" , self .STATUS_FILE_MISSING , "" , "" ]
345+ continue
346+
347+ df1 = self ._read_to_dataframe (path_in_main )
348+ if df1 is None :
349+ continue
350+ df2 = self ._read_to_dataframe (path_in_feature )
351+
352+ if df1 .equals (df2 ):
353+ rows [file ] = [index_str , "" , self .STATUS_EQUAL , "" , "" ]
354+
355+ # Numeric type mismatch
356+ elif df1 .apply (pd .to_numeric , errors = "coerce" ).equals (
357+ df2 .apply (pd .to_numeric , errors = "coerce" )
358+ ):
359+ rows [file ] = [index_str , "" , self .STATUS_TYPE_MISMATCH , "" , "" ]
360+
361+ # Nan mismatch
362+ elif not df1 .isna ().equals (df2 .isna ()):
363+ rows [file ] = [index_str , "" , self .STATUS_NAN_MISMATCH , "" , "" ]
364+
365+ # Inf mismatch
366+ elif not df1 .isin ([np .inf , - np .inf ]).equals (
367+ df2 .isin ([np .inf , - np .inf ])
368+ ):
369+ rows [file ] = [index_str , "" , self .STATUS_INF_MISMATCH , "" , "" ]
370+ # Changed
371+ else :
372+ # Get numeric mask
373+ numeric_mask = ~ np .isnan (
374+ df1 .apply (pd .to_numeric , errors = "coerce" ).to_numpy ()
375+ )
376+ assert (
377+ numeric_mask
378+ == ~ np .isnan (
379+ df2 .apply (pd .to_numeric , errors = "coerce" ).to_numpy ()
291380 )
292- assert (
293- numeric_mask
294- == ~ np .isnan (
295- df2 .apply (pd .to_numeric , errors = "coerce" ).to_numpy ()
296- )
297- ).all ()
298-
299- # Check for changes in descriptive data
300- df1_des = df1 .copy ()
301- df2_des = df2 .copy ()
302- df1_des .loc [~ numeric_mask ] = np .nan
303- df2_des .loc [~ numeric_mask ] = np .nan
304-
305- # Check for changes in numeric data
306- arr1_num = pd .to_numeric (df1 .to_numpy ()[numeric_mask ])
307- arr2_num = pd .to_numeric (df2 .to_numpy ()[numeric_mask ])
308-
309- nmae = min_max_normalized_mae (arr1_num , arr2_num )
310- mape = mean_absolute_percentage_error (arr1_num , arr2_num )
311-
312- if not df1_des .equals (df2_des ):
313- status = self .STATUS_CHANGED_NON_NUMERIC
314- elif nmae > 0.05 and mape > 0.05 :
315- status = self .STATUS_CHANGED_NUMERIC
316- else :
317- status = self .STATUS_ALMOST_EQUAL
318-
319- rows [file ] = [
320- index_str ,
321- f"{ numeric_mask .mean ():.1%} " ,
322- status ,
323- f"{ nmae :.2f} " ,
324- f"{ mape * 100 :.1f} %" if mape < 1 else f"{ mape * 100 :.2e} %" ,
325- ]
381+ ).all ()
382+
383+ # Check for changes in descriptive data
384+ df1_des = df1 .copy ()
385+ df2_des = df2 .copy ()
386+ df1_des .loc [~ numeric_mask ] = np .nan
387+ df2_des .loc [~ numeric_mask ] = np .nan
388+
389+ # Check for changes in numeric data
390+ arr1_num = pd .to_numeric (df1 .to_numpy ()[numeric_mask ])
391+ arr2_num = pd .to_numeric (df2 .to_numpy ()[numeric_mask ])
392+
393+ nmae = min_max_normalized_mae (arr1_num , arr2_num )
394+ mape = mean_absolute_percentage_error (arr1_num , arr2_num )
395+
396+ if not df1_des .equals (df2_des ):
397+ status = self .STATUS_CHANGED_NON_NUMERIC
398+ elif nmae > 0.05 and mape > 0.05 :
399+ status = self .STATUS_CHANGED_NUMERIC
400+ else :
401+ status = self .STATUS_ALMOST_EQUAL
402+
403+ rows [file ] = [
404+ index_str ,
405+ f"{ numeric_mask .mean ():.1%} " ,
406+ status ,
407+ f"{ nmae :.2f} " ,
408+ f"{ mape * 100 :.1f} %" if mape < 1 else f"{ mape * 100 :.2e} %" ,
409+ ]
326410
327411 # Loop through all files in feature dir to check for new files
328412 for root , _ , files in os .walk (self .dir_feature ):
329413 for file in files :
330- if file .endswith (".csv" ):
331- path_in_feature = Path (root ) / file
332- relative_path = os .path .relpath (path_in_feature , self .dir_feature )
333- index_str = "../" + "/" .join (str (relative_path ).split ("/" )[1 :])
414+ path_in_feature = Path (root ) / file
415+ relative_path = os .path .relpath (path_in_feature , self .dir_feature )
416+ index_str = "../" + "/" .join (str (relative_path ).split ("/" )[1 :])
334417
335- if not path_in_feature .exists ():
336- rows [file ] = [index_str , "" , self .STATUS_NEW , "" , "" ]
418+ df = self ._read_to_dataframe (path_in_feature )
419+ if df is None :
420+ continue
421+
422+ if not path_in_feature .exists ():
423+ rows [file ] = [index_str , "" , self .STATUS_NEW , "" , "" ]
337424
338425 # Combine and sort the results
339426 df = pd .DataFrame (rows , index = ["Path" , "Numeric" , "Status" , "NMAE" , "MAPE" ]).T
@@ -358,25 +445,31 @@ def files_table(self) -> str:
358445 f"\n "
359446 f"MAPE: Mean Absolute Percentage Error\n "
360447 f"NMAE: Mean Absolute Error on Min-Max Normalized Data\n "
361- f"Status Thresholds : NMAE > 0.05 and MAPE > 5%\n \n "
448+ f"Status Threshold : NMAE > 0.05 and MAPE > 5%\n \n "
362449 )
363450
@property
def body(self) -> str:
    """
    Body text for a successful run.

    Concatenates one collapsible ``<details>`` block per report section, in
    this order: variables comparison, changed-variables plots, general plots
    comparison, general files comparison. A section whose content is empty
    is left out entirely.
    """
    sections = (
        ("Variables comparison", self.variables_comparison),
        ("Variables changed plots", self.changed_variables_plots),
        ("General Plots comparison", self.plots_table),
        ("General Files comparison", self.files_table),
    )

    rendered = []
    for summary, content in sections:
        if not content:
            # Skip empty sections so no empty <details> element is emitted.
            continue
        rendered.append(
            f"<details>\n"
            f" <summary>{summary}</summary>\n"
            f"{content}"
            f"</details>\n"
            f"\n"
            f"\n"
        )
    return "".join(rendered)
381474
382475 def __call__ (self ) -> str :
0 commit comments