Skip to content

Commit 60a23a8

Browse files
authored
feat: add variable comparison table (#7)
1 parent b1a5abb commit 60a23a8

File tree

2 files changed

+190
-95
lines changed

2 files changed

+190
-95
lines changed

draft_comment.py

Lines changed: 187 additions & 94 deletions
Original file line number | Diff line number | Diff line change
@@ -56,7 +56,11 @@ def min_max_normalized_mae(y_true: ArrayLike, y_pred: ArrayLike) -> float:
5656

5757

5858
def mean_absolute_percentage_error(
59-
y_true: ArrayLike, y_pred: ArrayLike, epsilon: float = 1e-5
59+
y_true: ArrayLike,
60+
y_pred: ArrayLike,
61+
epsilon: float = 1e-5,
62+
aggregate: bool = True,
63+
ignore_inf=True,
6064
) -> float:
6165
"""
6266
Calculate the Mean Absolute Percentage Error (MAPE).
@@ -78,15 +82,22 @@ def mean_absolute_percentage_error(
7882
y_true = np.array(y_true)
7983
y_pred = np.array(y_pred)
8084

81-
# Ignore -inf and inf values in y_true
82-
y_true = y_true[np.isfinite(y_true)]
83-
y_pred = y_pred[np.isfinite(y_pred)]
85+
# Ignore -inf and inf values
86+
if ignore_inf:
87+
y_true = y_true[np.isfinite(y_true)]
88+
y_pred = y_pred[np.isfinite(y_pred)]
8489

8590
# Avoid division by zero
8691
y_true = y_true + epsilon
8792
y_pred = y_pred + epsilon
8893

89-
return np.mean(np.abs((y_true - y_pred) / y_true))
94+
# Calculate the absolute percentage errors
95+
mape = np.abs((y_true - y_pred) / y_true)
96+
97+
if aggregate:
98+
mape = np.mean(mape)
99+
100+
return mape
90101

91102

92103
def create_numeric_mask(arr: ArrayLike) -> np.ndarray:
@@ -210,6 +221,8 @@ def __init__(self):
210221
if plot.split(".")[-1] in ["png", "jpg", "jpeg", "svg"]
211222
]
212223

224+
self._variables_deviation_ds = None
225+
213226
# Status strings for file comparison table
214227
STATUS_FILE_MISSING = " :warning: Missing"
215228
STATUS_EQUAL = ":white_check_mark: Equal"
@@ -221,6 +234,67 @@ def __init__(self):
221234
STATUS_ALMOST_EQUAL = ":white_check_mark: Almost equal"
222235
STATUS_NEW = ":warning: New"
223236

237+
VARIABLES_FILE = "KN2045_Bal_v4/ariadne/exported_variables_full.xlsx"
238+
239+
@property
240+
def variables_deviation_ds(self):
241+
if self._variables_deviation_ds is not None:
242+
return self._variables_deviation_ds
243+
vars1 = pd.read_excel(self.dir_main / self.VARIABLES_FILE)
244+
vars2 = pd.read_excel(self.dir_feature / self.VARIABLES_FILE)
245+
vars1 = vars1.set_index("Variable").loc[
246+
:, [col for col in vars1.columns if str(col).replace(".", "", 1).isdigit()]
247+
]
248+
vars2 = vars2.set_index("Variable").loc[
249+
:, [col for col in vars2.columns if str(col).replace(".", "", 1).isdigit()]
250+
]
251+
252+
assert vars1.index.equals(vars2.index)
253+
254+
deviation = mean_absolute_percentage_error(
255+
vars1, vars2, ignore_inf=False, aggregate=False
256+
)
257+
deviation = pd.Series(
258+
np.round(deviation, 2).mean(axis=1), index=vars1.index
259+
).sort_values(ascending=False)
260+
261+
self._variables_deviation_ds = deviation
262+
263+
return self._variables_deviation_ds
264+
265+
@property
266+
def variables_comparison(self) -> str:
267+
if (
268+
not (self.dir_main / self.VARIABLES_FILE).exists()
269+
or not (self.dir_feature / self.VARIABLES_FILE).exists()
270+
):
271+
return ""
272+
273+
df = self.variables_deviation_ds.loc[self.variables_deviation_ds > 5].apply(
274+
lambda x: f"{x:.2f}%"
275+
)
276+
df = pd.DataFrame(df, columns=["MAPE"])
277+
df.index.name = ""
278+
279+
return (
280+
f"{df.to_html(escape=False)}\n"
281+
f"\n"
282+
f"MAPE: Mean Absolute Percentage Error\n"
283+
f"Threshold: MAPE > 5%\n"
284+
f"Only variables reaching the threshold are shown. Find the equivalent "
285+
f"plot for all of them below.\n\n"
286+
)
287+
288+
@property
289+
def changed_variables_plots(self) -> str:
290+
if (
291+
not (self.dir_main / self.VARIABLES_FILE).exists()
292+
or not (self.dir_feature / self.VARIABLES_FILE).exists()
293+
):
294+
return ""
295+
# Not implemented yet
296+
return ""
297+
224298
@property
225299
def plots_table(self) -> str:
226300
"""Plots comparison table."""
@@ -244,6 +318,15 @@ def plots_table(self) -> str:
244318
)
245319
return df.to_html(escape=False, index=False) + "\n"
246320

321+
def _read_to_dataframe(self, path: Path) -> pd.DataFrame:
322+
"""Read a file to a dataframe."""
323+
if path.suffix == ".csv":
324+
return pd.read_csv(path)
325+
elif path.suffix in [".xlsx", ".xls"]:
326+
return pd.read_excel(path)
327+
else:
328+
return None
329+
247330
@property
248331
def files_table(self) -> str:
249332
"""Files comparison table."""
@@ -252,88 +335,92 @@ def files_table(self) -> str:
252335
# Loop through all files in main dir
253336
for root, _, files in os.walk(self.dir_main):
254337
for file in files:
255-
if file.endswith(".csv"):
256-
path_in_main = Path(root) / file
257-
relative_path = os.path.relpath(path_in_main, self.dir_main)
258-
index_str = "../" + "/".join(str(relative_path).split("/")[1:])
259-
path_in_feature = self.dir_feature / relative_path
260-
261-
if not path_in_feature.exists():
262-
rows[file] = [index_str, "", self.STATUS_FILE_MISSING, "", ""]
263-
continue
264-
265-
df1 = pd.read_csv(path_in_main)
266-
df2 = pd.read_csv(path_in_feature)
267-
268-
if df1.equals(df2):
269-
rows[file] = [index_str, "", self.STATUS_EQUAL, "", ""]
270-
271-
# Numeric type mismatch
272-
elif df1.apply(pd.to_numeric, errors="coerce").equals(
273-
df2.apply(pd.to_numeric, errors="coerce")
274-
):
275-
rows[file] = [index_str, "", self.STATUS_TYPE_MISMATCH, "", ""]
276-
277-
# Nan mismatch
278-
elif not df1.isna().equals(df2.isna()):
279-
rows[file] = [index_str, "", self.STATUS_NAN_MISMATCH, "", ""]
280-
281-
# Inf mismatch
282-
elif not df1.isin([np.inf, -np.inf]).equals(
283-
df2.isin([np.inf, -np.inf])
284-
):
285-
rows[file] = [index_str, "", self.STATUS_INF_MISMATCH, "", ""]
286-
# Changed
287-
else:
288-
# Get numeric mask
289-
numeric_mask = ~np.isnan(
290-
df1.apply(pd.to_numeric, errors="coerce").to_numpy()
338+
path_in_main = Path(root) / file
339+
relative_path = os.path.relpath(path_in_main, self.dir_main)
340+
index_str = "/".join(str(relative_path).split("/")[1:])
341+
path_in_feature = self.dir_feature / relative_path
342+
343+
if not path_in_feature.exists():
344+
rows[file] = [index_str, "", self.STATUS_FILE_MISSING, "", ""]
345+
continue
346+
347+
df1 = self._read_to_dataframe(path_in_main)
348+
if df1 is None:
349+
continue
350+
df2 = self._read_to_dataframe(path_in_feature)
351+
352+
if df1.equals(df2):
353+
rows[file] = [index_str, "", self.STATUS_EQUAL, "", ""]
354+
355+
# Numeric type mismatch
356+
elif df1.apply(pd.to_numeric, errors="coerce").equals(
357+
df2.apply(pd.to_numeric, errors="coerce")
358+
):
359+
rows[file] = [index_str, "", self.STATUS_TYPE_MISMATCH, "", ""]
360+
361+
# Nan mismatch
362+
elif not df1.isna().equals(df2.isna()):
363+
rows[file] = [index_str, "", self.STATUS_NAN_MISMATCH, "", ""]
364+
365+
# Inf mismatch
366+
elif not df1.isin([np.inf, -np.inf]).equals(
367+
df2.isin([np.inf, -np.inf])
368+
):
369+
rows[file] = [index_str, "", self.STATUS_INF_MISMATCH, "", ""]
370+
# Changed
371+
else:
372+
# Get numeric mask
373+
numeric_mask = ~np.isnan(
374+
df1.apply(pd.to_numeric, errors="coerce").to_numpy()
375+
)
376+
assert (
377+
numeric_mask
378+
== ~np.isnan(
379+
df2.apply(pd.to_numeric, errors="coerce").to_numpy()
291380
)
292-
assert (
293-
numeric_mask
294-
== ~np.isnan(
295-
df2.apply(pd.to_numeric, errors="coerce").to_numpy()
296-
)
297-
).all()
298-
299-
# Check for changes in descriptive data
300-
df1_des = df1.copy()
301-
df2_des = df2.copy()
302-
df1_des.loc[~numeric_mask] = np.nan
303-
df2_des.loc[~numeric_mask] = np.nan
304-
305-
# Check for changes in numeric data
306-
arr1_num = pd.to_numeric(df1.to_numpy()[numeric_mask])
307-
arr2_num = pd.to_numeric(df2.to_numpy()[numeric_mask])
308-
309-
nmae = min_max_normalized_mae(arr1_num, arr2_num)
310-
mape = mean_absolute_percentage_error(arr1_num, arr2_num)
311-
312-
if not df1_des.equals(df2_des):
313-
status = self.STATUS_CHANGED_NON_NUMERIC
314-
elif nmae > 0.05 and mape > 0.05:
315-
status = self.STATUS_CHANGED_NUMERIC
316-
else:
317-
status = self.STATUS_ALMOST_EQUAL
318-
319-
rows[file] = [
320-
index_str,
321-
f"{numeric_mask.mean():.1%}",
322-
status,
323-
f"{nmae:.2f}",
324-
f"{mape*100:.1f}%" if mape < 1 else f"{mape*100:.2e}%",
325-
]
381+
).all()
382+
383+
# Check for changes in descriptive data
384+
df1_des = df1.copy()
385+
df2_des = df2.copy()
386+
df1_des.loc[~numeric_mask] = np.nan
387+
df2_des.loc[~numeric_mask] = np.nan
388+
389+
# Check for changes in numeric data
390+
arr1_num = pd.to_numeric(df1.to_numpy()[numeric_mask])
391+
arr2_num = pd.to_numeric(df2.to_numpy()[numeric_mask])
392+
393+
nmae = min_max_normalized_mae(arr1_num, arr2_num)
394+
mape = mean_absolute_percentage_error(arr1_num, arr2_num)
395+
396+
if not df1_des.equals(df2_des):
397+
status = self.STATUS_CHANGED_NON_NUMERIC
398+
elif nmae > 0.05 and mape > 0.05:
399+
status = self.STATUS_CHANGED_NUMERIC
400+
else:
401+
status = self.STATUS_ALMOST_EQUAL
402+
403+
rows[file] = [
404+
index_str,
405+
f"{numeric_mask.mean():.1%}",
406+
status,
407+
f"{nmae:.2f}",
408+
f"{mape*100:.1f}%" if mape < 1 else f"{mape*100:.2e}%",
409+
]
326410

327411
# Loop through all files in feature dir to check for new files
328412
for root, _, files in os.walk(self.dir_feature):
329413
for file in files:
330-
if file.endswith(".csv"):
331-
path_in_feature = Path(root) / file
332-
relative_path = os.path.relpath(path_in_feature, self.dir_feature)
333-
index_str = "../" + "/".join(str(relative_path).split("/")[1:])
414+
path_in_feature = Path(root) / file
415+
relative_path = os.path.relpath(path_in_feature, self.dir_feature)
416+
index_str = "../" + "/".join(str(relative_path).split("/")[1:])
334417

335-
if not path_in_feature.exists():
336-
rows[file] = [index_str, "", self.STATUS_NEW, "", ""]
418+
df = self._read_to_dataframe(path_in_feature)
419+
if df is None:
420+
continue
421+
422+
if not path_in_feature.exists():
423+
rows[file] = [index_str, "", self.STATUS_NEW, "", ""]
337424

338425
# Combine and sort the results
339426
df = pd.DataFrame(rows, index=["Path", "Numeric", "Status", "NMAE", "MAPE"]).T
@@ -358,25 +445,31 @@ def files_table(self) -> str:
358445
f"\n"
359446
f"MAPE: Mean Absolute Percentage Error\n"
360447
f"NMAE: Mean Absolute Error on Min-Max Normalized Data\n"
361-
f"Status Thresholds: NMAE > 0.05 and MAPE > 5%\n\n"
448+
f"Status Threshold: NMAE > 0.05 and MAPE > 5%\n\n"
362449
)
363450

364451
@property
365452
def body(self) -> str:
366453
"""Body text for successful run."""
454+
455+
def create_details_block(summary: str, content: str) -> str:
456+
if content:
457+
return (
458+
f"<details>\n"
459+
f" <summary>{summary}</summary>\n"
460+
f"{content}"
461+
f"</details>\n"
462+
f"\n"
463+
f"\n"
464+
)
465+
else:
466+
return ""
467+
367468
return (
368-
f"<details>\n"
369-
f" <summary>Result plots comparison</summary>\n"
370-
f"{self.plots_table}"
371-
f"</details>\n"
372-
f"\n"
373-
f"\n"
374-
f"<details>\n"
375-
f" <summary>Result files comparison</summary>\n"
376-
f"{self.files_table}"
377-
f"</details>\n"
378-
f"\n"
379-
f"\n"
469+
f"{create_details_block('Variables comparison', self.variables_comparison)}"
470+
f"{create_details_block('Variables changed plots', self.changed_variables_plots)}"
471+
f"{create_details_block('General Plots comparison', self.plots_table)}"
472+
f"{create_details_block('General Files comparison', self.files_table)}"
380473
)
381474

382475
def __call__(self) -> str:

requirements.txt

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1 +1,3 @@
1-
pandas
1+
numpy
2+
pandas
3+
openpyxl

0 commit comments

Comments
 (0)