-
Notifications
You must be signed in to change notification settings - Fork 365
compare.py: Add --statistics to show statistical significance with a t-test. #288
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -128,9 +128,9 @@ def merge_values(values, merge_function): | |
|
||
|
||
def get_values(values): | ||
# Create data view without diff column. | ||
if "diff" in values.columns: | ||
values = values[[c for c in values.columns if c != "diff"]] | ||
# Create data view without diff column and statistical columns. | ||
exclude_cols = ["diff", "std_lhs", "std_rhs", "t-value", "p-value", "significant"] | ||
values = values[[c for c in values.columns if c not in exclude_cols]] | ||
has_two_runs = len(values.columns) == 2 | ||
if has_two_runs: | ||
return (values.iloc[:, 0], values.iloc[:, 1]) | ||
|
@@ -150,6 +150,57 @@ def add_diff_column(metric, values, absolute_diff=False): | |
return values | ||
|
||
|
||
def compute_statistics(lhs_d, rhs_d, metrics, alpha): | ||
stats_dict = {} | ||
|
||
for metric in metrics: | ||
if metric not in lhs_d.columns: | ||
continue | ||
|
||
stats_dict[metric] = {} | ||
|
||
# Group by program | ||
for program in lhs_d.index.get_level_values(1).unique(): | ||
lhs_values = lhs_d.loc[(slice(None), program), metric].dropna() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Cant we do something more efficient like this pseudo code:
|
||
rhs_values = rhs_d.loc[(slice(None), program), metric].dropna() | ||
|
||
stats_dict[metric][program] = { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sink this conditional calculation into |
||
'std_lhs': lhs_values.std(ddof=1) if len(lhs_values) >= 2 else float('nan'), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should take the name from |
||
'std_rhs': rhs_values.std(ddof=1) if len(rhs_values) >= 2 else float('nan'), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should take the name from |
||
} | ||
|
||
# Compute t-test if we have enough samples | ||
if len(lhs_values) >= 2 and len(rhs_values) >= 2: | ||
t_stat, p_val = stats.ttest_ind(lhs_values, rhs_values) | ||
stats_dict[metric][program]['t-value'] = t_stat | ||
stats_dict[metric][program]['p-value'] = p_val | ||
stats_dict[metric][program]['significant'] = "✅" if p_val < alpha else "❌" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe change to something more parsable but still human readable like Y/N |
||
else: | ||
stats_dict[metric][program]['t-value'] = float('nan') | ||
stats_dict[metric][program]['p-value'] = float('nan') | ||
stats_dict[metric][program]['significant'] = "" | ||
|
||
return stats_dict | ||
|
||
|
||
def add_precomputed_statistics(data, stats_dict): | ||
"""Add precomputed statistics to the unstacked dataframe.""" | ||
for metric in data.columns.levels[0]: | ||
if metric not in stats_dict: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As part of metrics logic unification, should this effectively become an assert (inverted)? |
||
continue | ||
|
||
for stat_name in ['std_lhs', 'std_rhs', 't-value', 'p-value', 'significant']: | ||
values = [] | ||
for program in data.index: | ||
if program in stats_dict[metric]: | ||
values.append(stats_dict[metric][program].get(stat_name, float('nan') if stat_name != 'significant' else "")) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Eliminate duplicated expression |
||
else: | ||
values.append(float('nan') if stat_name != 'significant' else "") | ||
data[(metric, stat_name)] = values | ||
|
||
return data | ||
|
||
|
||
def add_geomean_row(metrics, data, dataout): | ||
""" | ||
Normalize values1 over values0, compute geomean difference and add a | ||
|
@@ -272,6 +323,16 @@ def print_result( | |
if not absolute_diff: | ||
for m in metrics: | ||
formatters[(m, "diff")] = format_relative_diff | ||
# Add formatters for statistical columns | ||
for m in metrics: | ||
if (m, "p-value") in dataout.columns: | ||
formatters[(m, "p-value")] = lambda x: "%.4f" % x if not pd.isna(x) else "" | ||
if (m, "t-value") in dataout.columns: | ||
formatters[(m, "t-value")] = lambda x: "%.3f" % x if not pd.isna(x) else "" | ||
if (m, "std_lhs") in dataout.columns: | ||
formatters[(m, "std_lhs")] = lambda x: "%.3f" % x if not pd.isna(x) else "" | ||
if (m, "std_rhs") in dataout.columns: | ||
formatters[(m, "std_rhs")] = lambda x: "%.3f" % x if not pd.isna(x) else "" | ||
# Turn index into a column so we can format it... | ||
formatted_program = dataout.index.to_series() | ||
if shorten_names: | ||
|
@@ -320,7 +381,10 @@ def float_format(x): | |
formatters=formatters, | ||
) | ||
print(out) | ||
print(d.describe()) | ||
# Exclude statistical columns from summary statistics | ||
exclude_from_summary = ["std_lhs", "std_rhs", "t-value", "p-value", "significant"] | ||
d_summary = d.drop(columns=exclude_from_summary, level=1, errors='ignore') | ||
print(d_summary.describe()) | ||
|
||
|
||
def main(): | ||
|
@@ -400,6 +464,19 @@ def main(): | |
default=False, | ||
help="Don't use abs() when sorting results", | ||
) | ||
parser.add_argument( | ||
"--statistics", | ||
action="store_true", | ||
dest="statistics", | ||
default=False, | ||
help="Add statistical analysis columns (std, t-value, p-value, significance)", | ||
) | ||
parser.add_argument( | ||
"--alpha", | ||
type=float, | ||
default=0.05, | ||
help="Significance level for statistical tests (default: 0.05)", | ||
) | ||
config = parser.parse_args() | ||
|
||
if config.show_diff is None: | ||
|
@@ -425,15 +502,30 @@ def main(): | |
|
||
# Read inputs | ||
files = config.files | ||
stats_dict = None | ||
if "vs" in files: | ||
split = files.index("vs") | ||
lhs = files[0:split] | ||
rhs = files[split + 1 :] | ||
|
||
# Combine the multiple left and right hand sides. | ||
lhs_d = readmulti(lhs) | ||
lhs_merged = merge_values(lhs_d, config.merge_function) | ||
rhs_d = readmulti(rhs) | ||
|
||
# Compute statistics on raw data before merging (if requested) | ||
if config.statistics: | ||
# Get metrics early for statistics computation | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This part duplicates logic from after the conditional, and lacks some checks that exist there. Need to unify this logic |
||
temp_metrics = config.metrics | ||
if len(temp_metrics) == 0: | ||
defaults = ["Exec_Time", "exec_time", "Value", "Runtime"] | ||
for defkey in defaults: | ||
if defkey in lhs_d.columns: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe check |
||
temp_metrics = [defkey] | ||
break | ||
stats_dict = compute_statistics(lhs_d, rhs_d, temp_metrics, alpha=config.alpha) | ||
|
||
# Merge data | ||
lhs_merged = merge_values(lhs_d, config.merge_function) | ||
rhs_merged = merge_values(rhs_d, config.merge_function) | ||
|
||
# Combine to new dataframe | ||
|
@@ -508,6 +600,9 @@ def main(): | |
for metric in data.columns.levels[0]: | ||
data = add_diff_column(metric, data, absolute_diff=config.absolute_diff) | ||
|
||
if config.statistics and stats_dict is not None: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Better check |
||
data = add_precomputed_statistics(data, stats_dict) | ||
|
||
sortkey = "diff" | ||
# TODO: should we still be sorting by diff even if the diff is hidden? | ||
if len(config.files) == 1: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe check
if metric not in lhs_d.columns or metric not in rhs_d.columns:
to be compatible with the existing flow, or unify elseway