Commit ed0a3e2

rename 'error' column to 'Error'
1 parent: 69006d8

8 files changed: +41 -43 lines changed
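
The user-visible effect of the rename is that benchmark results now carry failure messages in an 'Error' column rather than 'error'. A minimal sketch of how downstream code might read the renamed column, assuming sdgym.benchmark_single_table as the entry point; the synthesizer and dataset arguments are purely illustrative:

import sdgym

# Illustrative arguments; any single-table synthesizer/dataset combination works the same way.
results = sdgym.benchmark_single_table(
    synthesizers=['GaussianCopulaSynthesizer'],
    sdv_datasets=['fake_companies'],
)

# After this commit, failures are reported under 'Error' instead of 'error'.
if 'Error' in results.columns:
    failed = results[results['Error'].notna()]
    print(failed[['Synthesizer', 'Dataset', 'Error']])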

sdgym/benchmark.py

Lines changed: 12 additions & 12 deletions
@@ -514,7 +514,7 @@ def _compute_scores(
         for metric_name, metric in metrics.items():
             scores.append({
                 'metric': metric_name,
-                'error': 'Metric Timeout',
+                'Error': 'Metric Timeout',
             })
             # re-inject list to multiprocessing output
             output['scores'] = scores
@@ -537,7 +537,7 @@ def _compute_scores(
         scores[-1].update({
             'score': score,
             'normalized_score': normalized_score,
-            'error': error,
+            'Error': error,
             'metric_time': calculate_score_time(start),
         })
         # re-inject list to multiprocessing output
@@ -603,7 +603,7 @@ def _score(
     output = {}

     output['timeout'] = True  # To be deleted if there is no error
-    output['error'] = 'Load Timeout'  # To be deleted if there is no error
+    output['Error'] = 'Load Timeout'  # To be deleted if there is no error
     try:
         LOGGER.info(
             'Running %s on %s dataset %s; %s',
@@ -615,7 +615,7 @@ def _score(

         output['dataset_size'] = get_size_of(data) / N_BYTES_IN_MB
         # To be deleted if there is no error
-        output['error'] = 'Synthesizer Timeout'
+        output['Error'] = 'Synthesizer Timeout'

         try:
             synthetic_data, train_time, sample_time, synthesizer_size, peak_memory = _synthesize(
@@ -642,7 +642,7 @@ def _score(
             )

             # No error so far. _compute_scores tracks its own errors by metric
-            del output['error']
+            del output['Error']
             _compute_scores(
                 metrics,
                 data,
@@ -671,14 +671,14 @@ def _score(
            output['peak_memory'] = err.peak_memory

            output['exception'] = err.exception
-           output['error'] = err.error
+           output['Error'] = err.error
            output['timeout'] = False

     except Exception:
         LOGGER.exception('Error running %s on dataset %s;', synthesizer['name'], dataset_name)
         exception, error = format_exception()
         output['exception'] = exception
-        output['error'] = error
+        output['Error'] = error
         output['timeout'] = False  # There was no timeout

     finally:
@@ -744,7 +744,7 @@ def _score_with_timeout(
     thread.join(timeout)
     if thread.is_alive():
         LOGGER.error('Timeout running %s on dataset %s;', synthesizer['name'], dataset_name)
-        return {'timeout': True, 'error': 'Synthesizer Timeout'}
+        return {'timeout': True, 'Error': 'Synthesizer Timeout'}

     return output

@@ -815,8 +815,8 @@ def _format_output(
     for score in output.get('scores', []):
         scores.insert(len(scores.columns), score['metric'], score['normalized_score'])

-    if 'error' in output:
-        scores['error'] = output['error']
+    if 'Error' in output:
+        scores['Error'] = output['Error']

     return scores

@@ -1085,8 +1085,8 @@ def _add_adjusted_scores(scores, timeout):

         fit_times = scores.loc[dataset_mask, 'Train_Time'].fillna(0)
         sample_times = scores.loc[dataset_mask, 'Sample_Time'].fillna(0)
-        if 'error' in scores.columns:
-            errors = scores.loc[dataset_mask, 'error']
+        if 'Error' in scores.columns:
+            errors = scores.loc[dataset_mask, 'Error']
         else:
             errors = pd.Series([None] * dataset_mask.sum(), index=scores.index[dataset_mask])
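
The hunks in _score above follow a pre-set/delete pattern: output['Error'] is written with a timeout message before each long-running step and deleted once the step returns, so only an interrupted step leaves the message behind. A condensed sketch of that control flow; the helper name and arguments are made up for illustration and are not part of this commit:

def _run_step(output, step, timeout_message):
    # Record the timeout message up front so it survives if step() never returns.
    output['Error'] = timeout_message
    result = step()
    # The step finished, so no timeout should be reported.
    del output['Error']
    return result

output = {'timeout': True}
data = _run_step(output, lambda: 'loaded data', 'Load Timeout')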

sdgym/cli/__main__.py

Lines changed: 4 additions & 4 deletions
@@ -39,13 +39,13 @@ def _print_table(data, sort=None, reverse=False, format=None):
        for field, formatter in format.items():
            data[field] = data[field].apply(formatter)

-    if 'error' in data:
-        error = data['error']
+    if 'Error' in data:
+        error = data['Error']
        if pd.isna(error).all():
-            del data['error']
+            del data['Error']
        else:
            long_error = error.str.len() > 30
-            data.loc[long_error, 'error'] = error[long_error].str[:30] + '...'
+            data.loc[long_error, 'Error'] = error[long_error].str[:30] + '...'

    print(tabulate.tabulate(data, tablefmt='github', headers=data.columns, showindex=False))  # noqa: T201
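
The truncation in _print_table can be reproduced on its own. A small sketch with made-up data showing how messages longer than 30 characters are shortened for display:

import pandas as pd

data = pd.DataFrame({'Error': ['ValueError: something went terribly wrong in step 3', None]})

error = data['Error']
long_error = error.str.len() > 30  # NaN lengths compare as False, so missing errors are left alone
data.loc[long_error, 'Error'] = error[long_error].str[:30] + '...'
print(data)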

sdgym/cli/summary.py

Lines changed: 6 additions & 6 deletions
@@ -35,11 +35,11 @@ def preprocess(data):
    bydataset = grouped.mean()
    data = bydataset.reset_index()

-    if 'error' in data.columns:
+    if 'Error' in data.columns:
        errors = data.error.fillna('')
        for message, column in KNOWN_ERRORS:
            data[column] = errors.str.contains(message)
-            data.loc[data[column], 'error'] = np.nan
+            data.loc[data[column], 'Error'] = np.nan

    return data

@@ -122,7 +122,7 @@ def summarize(data, baselines=(), datasets=None):
        baseline_scores = baseline_data.set_index('Dataset').Quality_Score
        results[f'beat_{baseline.lower()}'] = _beat_baseline(data, baseline_scores)

-    if 'error' in data.columns:
+    if 'Error' in data.columns:
        grouped = data.groupby('Synthesizer')
        for _, error_column in KNOWN_ERRORS:
            results[error_column] = grouped[error_column].sum()
@@ -135,7 +135,7 @@ def summarize(data, baselines=(), datasets=None):


def _error_counts(data):
-    if 'error' in data.columns:
+    if 'Error' in data.columns:
        return data.error.value_counts()
    return 0

@@ -158,8 +158,8 @@ def errors_summary(data):
    Returns:
        pandas.DataFrame
    """
-    if 'error' in data.columns:
-        all_errors = pd.DataFrame(_error_counts(data)).rename(columns={'error': 'all'})
+    if 'Error' in data.columns:
+        all_errors = pd.DataFrame(_error_counts(data)).rename(columns={'Error': 'all'})
        synthesizer_errors = data.groupby('Synthesizer').apply(_error_counts).pivot_table(level=0)
        for synthesizer, errors in synthesizer_errors.items():
            all_errors[synthesizer] = errors.fillna(0).astype(int)
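
For context, preprocess() buckets error messages into the boolean columns listed in KNOWN_ERRORS and blanks out each cell once it has been categorized. A self-contained sketch; the (message, column) pairs below are stand-ins, not the real KNOWN_ERRORS from sdgym/cli/summary.py:

import numpy as np
import pandas as pd

KNOWN_ERRORS = [  # stand-in pairs for illustration only
    ('Synthesizer Timeout', 'timeout'),
    ('MemoryError', 'memory_error'),
]

data = pd.DataFrame({'Error': ['Synthesizer Timeout', 'MemoryError: out of memory', None]})

if 'Error' in data.columns:
    errors = data['Error'].fillna('')
    for message, column in KNOWN_ERRORS:
        data[column] = errors.str.contains(message)
        data.loc[data[column], 'Error'] = np.nan

print(data)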

sdgym/run_benchmark/upload_benchmark_results.py

Lines changed: 3 additions & 5 deletions
@@ -231,8 +231,6 @@ def get_model_details(summary, results, df_to_plot, modality):
    with open(SYNTHESIZER_DESCRIPTION_PATH, 'r', encoding='utf-8') as f:
        synthesizer_info = yaml.safe_load(f) or {}

-    err_column = next((c for c in ('error', 'Error') if c in results.columns), None)
-    err_column_flag = err_column is not None
    paretos_synthesizers = (
        df_to_plot.loc[df_to_plot['Pareto'].eq(True), 'Synthesizer'].astype(str).add('Synthesizer')
    )
@@ -259,16 +257,16 @@ def get_model_details(summary, results, df_to_plot, modality):
    model_details['Number of datasets - Wins'] = (
        model_details['Synthesizer'].map(wins).fillna(0).astype(int)
    )
-    if err_column_flag:
+    if 'Error' in results.columns:
        timeout_counts = (
            results
-            .loc[results[err_column].eq('Synthesizer Timeout')]
+            .loc[results['Error'].eq('Synthesizer Timeout')]
            .groupby('Synthesizer')['Dataset']
            .nunique()
        )
        error_counts = (
            results
-            .loc[results[err_column].notna() & ~results[err_column].eq('Synthesizer Timeout')]
+            .loc[results['Error'].notna() & ~results['Error'].eq('Synthesizer Timeout')]
            .groupby('Synthesizer')['Dataset']
            .nunique()
        )
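
The split in get_model_details between timeouts and all other errors can be shown in isolation. A sketch over a toy results frame; the real table has many more columns:

import pandas as pd

results = pd.DataFrame({
    'Synthesizer': ['CTGANSynthesizer', 'CTGANSynthesizer', 'GaussianCopulaSynthesizer'],
    'Dataset': ['D1', 'D2', 'D1'],
    'Error': ['Synthesizer Timeout', 'MemoryError: out of memory', None],
})

# Timeouts and other errors are each counted as distinct datasets per synthesizer.
timeout_counts = (
    results.loc[results['Error'].eq('Synthesizer Timeout')]
    .groupby('Synthesizer')['Dataset']
    .nunique()
)
error_counts = (
    results.loc[results['Error'].notna() & ~results['Error'].eq('Synthesizer Timeout')]
    .groupby('Synthesizer')['Dataset']
    .nunique()
)
print(timeout_counts, error_counts, sep='\n')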

tests/integration/test_benchmark.py

Lines changed: 6 additions & 6 deletions
@@ -142,7 +142,7 @@ def test_benchmark_single_table_error_handling():
    assert not output.empty
    assert 'Train_Time' in output
    assert 'Sample_Time' in output
-    assert output['error'].to_list() == [np.nan, np.nan, np.nan, 'ValueError: random error']
+    assert output['Error'].to_list() == [np.nan, np.nan, np.nan, 'ValueError: random error']


def test_benchmark_single_table_compute_quality_score():
@@ -318,7 +318,7 @@ def test_benchmark_single_table_timeout():
            'Diagnostic_Score': None,
            'Quality_Score': None,
            'Privacy_Score': None,
-            'error': 'Synthesizer Timeout',
+            'Error': 'Synthesizer Timeout',
            'Adjusted_Total_Time': 1 + fallback_train_time + fallback_sample_time,
            'Adjusted_Quality_Score': None,
        },
@@ -508,7 +508,7 @@ def test_benchmark_single_table_no_synthesizers_with_parameters():
        .all()
    )
    assert result['Evaluate_Time'] is None
-    assert result['error'] == 'ValueError: Unknown single_table metric: a'
+    assert result['Error'] == 'ValueError: Unknown single_table metric: a'


def test_benchmark_single_table_custom_synthesizer():
@@ -811,7 +811,7 @@ def fit(self, data):
    )

    # Assert
-    assert result['error'].to_list() == [
+    assert result['Error'].to_list() == [
        'Exception: Fitting error',
        np.nan,
        np.nan,
@@ -855,7 +855,7 @@ def sample(self, num_rows):
    )

    # Assert
-    assert result['error'].to_list() == [
+    assert result['Error'].to_list() == [
        'Exception: Sampling error',
        np.nan,
        np.nan,
@@ -1034,7 +1034,7 @@ def _augment_tables(self, data):
    )

    # Assert
-    assert result['error'].to_list() == [
+    assert result['Error'].to_list() == [
        'Exception: Fitting error',
        np.nan,
        'Exception: Fitting error',

tests/unit/run_benchmark/test_upload_benchmark_result.py

Lines changed: 1 addition & 1 deletion
@@ -308,7 +308,7 @@ def test_get_model_details(mock_open, mock_yaml_load):
            'CTGANSynthesizer',
        ],
        'Quality_Score': [0.1, 0.2, 0.3, 0.15, 0.25],
-        'error': [
+        'Error': [
            'Synthesizer Timeout',  # timeout on D1 for GaussianCopulaSynthesizer
            'Other Error',  # error on D2 for GaussianCopulaSynthesizer
            None,  # no error on D3 for GaussianCopulaSynthesizer

tests/unit/test_benchmark.py

Lines changed: 8 additions & 8 deletions
@@ -235,7 +235,7 @@ def test_benchmark_single_table_with_timeout(mock_multiprocessing, mock__score):
    # Setup
    mocked_process = mock_multiprocessing.Process.return_value
    manager = mock_multiprocessing.Manager.return_value
-    manager_dict = {'timeout': True, 'error': 'Synthesizer Timeout'}
+    manager_dict = {'timeout': True, 'Error': 'Synthesizer Timeout'}
    manager.__enter__.return_value.dict.return_value = manager_dict

    # Run
@@ -261,7 +261,7 @@ def test_benchmark_single_table_with_timeout(mock_multiprocessing, mock__score):
        'Diagnostic_Score': {0: None},
        'Quality_Score': {0: None},
        'Privacy_Score': {0: None},
-        'error': {0: 'Synthesizer Timeout'},
+        'Error': {0: 'Synthesizer Timeout'},
        'Adjusted_Total_Time': {0: None},
        'Adjusted_Quality_Score': {0: None},
    })
@@ -357,14 +357,14 @@ def test__format_output():
        'scores': [
            {
                'metric': 'NewRowSynthesis',
-                'error': None,
+                'Error': None,
                'score': 0.998,
                'normalized_score': 0.998,
                'metric_time': 6.0,
            },
            {
                'metric': 'NewMetric',
-                'error': None,
+                'Error': None,
                'score': 0.998,
                'normalized_score': 0.998,
                'metric_time': 5.0,
@@ -985,15 +985,15 @@ def test__add_adjusted_scores_timeout():
        'Train_Time': [np.nan, 0.5],
        'Sample_Time': [np.nan, 0.25],
        'Quality_Score': [np.nan, 0.5],
-        'error': ['Synthesizer Timeout', np.nan],
+        'Error': ['Synthesizer Timeout', np.nan],
    })
    expected = pd.DataFrame({
        'Synthesizer': ['GaussianCopulaSynthesizer', 'UniformSynthesizer'],
        'Dataset': ['dataset1', 'dataset1'],
        'Train_Time': [np.nan, 0.5],
        'Sample_Time': [np.nan, 0.25],
        'Quality_Score': [np.nan, 0.5],
-        'error': ['Synthesizer Timeout', np.nan],
+        'Error': ['Synthesizer Timeout', np.nan],
        'Adjusted_Total_Time': [10.75, 1.25],
        'Adjusted_Quality_Score': [0.5, 0.5],
    })
@@ -1014,15 +1014,15 @@ def test__add_adjusted_scores_errors():
        'Train_Time': [np.nan, 1.0, 1.0, 0.5],
        'Sample_Time': [np.nan, np.nan, 2.0, 0.25],
        'Quality_Score': [np.nan, np.nan, np.nan, 0.5],
-        'error': ['ValueError', 'RuntimeError', 'KeyError', np.nan],
+        'Error': ['ValueError', 'RuntimeError', 'KeyError', np.nan],
    })
    expected = pd.DataFrame({
        'Synthesizer': ['ErrorOnTrain', 'ErrorOnSample', 'ErrorAfterSample', 'UniformSynthesizer'],
        'Dataset': ['dataset1', 'dataset1', 'dataset1', 'dataset1'],
        'Train_Time': [np.nan, 1.0, 1.0, 0.5],
        'Sample_Time': [np.nan, np.nan, 2.0, 0.25],
        'Quality_Score': [np.nan, np.nan, np.nan, 0.5],
-        'error': ['ValueError', 'RuntimeError', 'KeyError', np.nan],
+        'Error': ['ValueError', 'RuntimeError', 'KeyError', np.nan],
        'Adjusted_Total_Time': [0.75, 1.75, 3.75, 1.25],
        'Adjusted_Quality_Score': [0.5, 0.5, 0.5, 0.5],
    })

tests/unit/test_summary.py

Lines changed: 1 addition & 1 deletion
@@ -64,7 +64,7 @@ def test_make_summary_spreadsheet(
        index=['synth1', 'synth2'],
    )
    preprocessed_data = pd.DataFrame({'modality': ['single-table']})
-    errors = pd.DataFrame({'synth1': [0], 'synth2': [1], 'error': ['RuntimeError: error.']})
+    errors = pd.DataFrame({'synth1': [0], 'synth2': [1], 'Error': ['RuntimeError: error.']})
    preprocess_mock.return_value = preprocessed_data
    summarize_mock.return_value = data
    errors_summary_mock.return_value = errors
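
Because the column is renamed rather than duplicated, code outside this repository that still looks up the lowercase name will no longer find it. A hypothetical one-line compatibility shim such callers could apply; this is not part of the commit:

import pandas as pd

results = pd.DataFrame({'Synthesizer': ['GaussianCopulaSynthesizer'], 'error': [None]})  # toy frame

# Map the old lowercase column name onto the new capitalized one.
if 'error' in results.columns and 'Error' not in results.columns:
    results = results.rename(columns={'error': 'Error'})
print(results.columns.tolist())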
