Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 100 additions & 5 deletions utils/compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,9 +128,9 @@ def merge_values(values, merge_function):


def get_values(values):
# Create data view without diff column.
if "diff" in values.columns:
values = values[[c for c in values.columns if c != "diff"]]
# Create data view without diff column and statistical columns.
exclude_cols = ["diff", "std_lhs", "std_rhs", "t-value", "p-value", "significant"]
values = values[[c for c in values.columns if c not in exclude_cols]]
has_two_runs = len(values.columns) == 2
if has_two_runs:
return (values.iloc[:, 0], values.iloc[:, 1])
Expand All @@ -150,6 +150,57 @@ def add_diff_column(metric, values, absolute_diff=False):
return values


def compute_statistics(lhs_d, rhs_d, metrics, alpha):
    """Compute per-program comparison statistics between two sets of runs.

    For each metric and each program (level 1 of the (run, program) row
    MultiIndex), compute the sample standard deviation of both sides and an
    independent two-sample t-test on the raw (unmerged) values.

    Args:
        lhs_d: Left-hand-side dataframe, rows indexed by (run, program).
        rhs_d: Right-hand-side dataframe with the same index layout.
        metrics: Iterable of metric column names to analyze.
        alpha: Significance level for the t-test.

    Returns:
        dict mapping metric -> program -> {'std_lhs', 'std_rhs',
        't-value', 'p-value', 'significant'}.
    """
    stats_dict = {}

    for metric in metrics:
        # Skip metrics absent from either side: indexing a missing column
        # below would raise a KeyError (checking only the lhs is not enough).
        if metric not in lhs_d.columns or metric not in rhs_d.columns:
            continue

        stats_dict[metric] = {}

        # Programs present on the rhs, hoisted out of the per-program loop.
        rhs_programs = set(rhs_d.index.get_level_values(1))

        # Group by program (level 1 of the (run, program) index).
        for program in lhs_d.index.get_level_values(1).unique():
            # A program missing from the rhs cannot be compared; leave it out
            # and let downstream code fill NaN / "" for it.
            if program not in rhs_programs:
                continue

            lhs_values = lhs_d.loc[(slice(None), program), metric].dropna()
            rhs_values = rhs_d.loc[(slice(None), program), metric].dropna()

            # Sample standard deviation (ddof=1) needs at least two samples.
            stats_dict[metric][program] = {
                'std_lhs': lhs_values.std(ddof=1) if len(lhs_values) >= 2 else float('nan'),
                'std_rhs': rhs_values.std(ddof=1) if len(rhs_values) >= 2 else float('nan'),
            }

            # Compute t-test only if both sides have enough samples.
            if len(lhs_values) >= 2 and len(rhs_values) >= 2:
                t_stat, p_val = stats.ttest_ind(lhs_values, rhs_values)
                stats_dict[metric][program]['t-value'] = t_stat
                stats_dict[metric][program]['p-value'] = p_val
                stats_dict[metric][program]['significant'] = "✅" if p_val < alpha else "❌"
            else:
                stats_dict[metric][program]['t-value'] = float('nan')
                stats_dict[metric][program]['p-value'] = float('nan')
                stats_dict[metric][program]['significant'] = ""

    return stats_dict


def add_precomputed_statistics(data, stats_dict):
    """Add precomputed statistics to the unstacked dataframe.

    For every metric present in both ``data`` (level 0 of the column
    MultiIndex) and ``stats_dict``, append (metric, stat) columns aligned
    with ``data.index``. Programs missing from ``stats_dict`` are filled
    with NaN, except the textual 'significant' marker which uses "".

    Returns the (mutated) dataframe.
    """
    stat_names = ['std_lhs', 'std_rhs', 't-value', 'p-value', 'significant']
    for metric in data.columns.levels[0]:
        if metric not in stats_dict:
            continue

        per_program = stats_dict[metric]
        for stat_name in stat_names:
            # Hoist the missing-value sentinel out of the per-program loop:
            # "" for the textual marker, NaN for the numeric statistics.
            missing = "" if stat_name == 'significant' else float('nan')
            data[(metric, stat_name)] = [
                per_program.get(program, {}).get(stat_name, missing)
                for program in data.index
            ]

    return data


def add_geomean_row(metrics, data, dataout):
"""
Normalize values1 over values0, compute geomean difference and add a
Expand Down Expand Up @@ -272,6 +323,16 @@ def print_result(
if not absolute_diff:
for m in metrics:
formatters[(m, "diff")] = format_relative_diff
# Add formatters for statistical columns
for m in metrics:
if (m, "p-value") in dataout.columns:
formatters[(m, "p-value")] = lambda x: "%.4f" % x if not pd.isna(x) else ""
if (m, "t-value") in dataout.columns:
formatters[(m, "t-value")] = lambda x: "%.3f" % x if not pd.isna(x) else ""
if (m, "std_lhs") in dataout.columns:
formatters[(m, "std_lhs")] = lambda x: "%.3f" % x if not pd.isna(x) else ""
if (m, "std_rhs") in dataout.columns:
formatters[(m, "std_rhs")] = lambda x: "%.3f" % x if not pd.isna(x) else ""
# Turn index into a column so we can format it...
formatted_program = dataout.index.to_series()
if shorten_names:
Expand Down Expand Up @@ -320,7 +381,10 @@ def float_format(x):
formatters=formatters,
)
print(out)
print(d.describe())
# Exclude statistical columns from summary statistics
exclude_from_summary = ["std_lhs", "std_rhs", "t-value", "p-value", "significant"]
d_summary = d.drop(columns=exclude_from_summary, level=1, errors='ignore')
print(d_summary.describe())


def main():
Expand Down Expand Up @@ -400,6 +464,19 @@ def main():
default=False,
help="Don't use abs() when sorting results",
)
parser.add_argument(
"--statistics",
action="store_true",
dest="statistics",
default=False,
help="Add statistical analysis columns (std, t-value, p-value, significance)",
)
parser.add_argument(
"--alpha",
type=float,
default=0.05,
help="Significance level for statistical tests (default: 0.05)",
)
config = parser.parse_args()

if config.show_diff is None:
Expand All @@ -425,15 +502,30 @@ def main():

# Read inputs
files = config.files
stats_dict = None
if "vs" in files:
split = files.index("vs")
lhs = files[0:split]
rhs = files[split + 1 :]

# Combine the multiple left and right hand sides.
lhs_d = readmulti(lhs)
lhs_merged = merge_values(lhs_d, config.merge_function)
rhs_d = readmulti(rhs)

# Compute statistics on raw data before merging (if requested)
if config.statistics:
# Get metrics early for statistics computation
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This part duplicates logic from after the conditional, and lacks some checks that exist there. Need to unify this logic

temp_metrics = config.metrics
if len(temp_metrics) == 0:
defaults = ["Exec_Time", "exec_time", "Value", "Runtime"]
for defkey in defaults:
if defkey in lhs_d.columns:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe check `if defkey in lhs_d.columns or defkey in rhs_d.columns` to stay compatible with the existing flow, or unify the two code paths another way.

temp_metrics = [defkey]
break
stats_dict = compute_statistics(lhs_d, rhs_d, temp_metrics, alpha=config.alpha)

# Merge data
lhs_merged = merge_values(lhs_d, config.merge_function)
rhs_merged = merge_values(rhs_d, config.merge_function)

# Combine to new dataframe
Expand Down Expand Up @@ -508,6 +600,9 @@ def main():
for metric in data.columns.levels[0]:
data = add_diff_column(metric, data, absolute_diff=config.absolute_diff)

if config.statistics and stats_dict is not None:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Better check if config.statistics and stats_dict is not None and stats_dict: for when compute returns an empty dict (it cannot return None)

data = add_precomputed_statistics(data, stats_dict)

sortkey = "diff"
# TODO: should we still be sorting by diff even if the diff is hidden?
if len(config.files) == 1:
Expand Down