Skip to content

Commit 1a2a9a6

Browse files
authored
fix: don't use categoricals in output format (#63)
* fix: don't use categoricals in output format
* Remove categorical usage from fetch metrics buckets output format
1 parent fd45146 commit 1a2a9a6

File tree

2 files changed

+120
-28
lines changed

2 files changed

+120
-28
lines changed

src/neptune_query/internal/output_format.py

Lines changed: 37 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -301,14 +301,13 @@ def generate_categorized_rows() -> Generator[Tuple, None, None]:
301301
np.fromiter(generate_categorized_rows(), dtype=types),
302302
)
303303

304-
experiment_dtype = pd.CategoricalDtype(categories=label_mapping)
305-
df[index_column_name] = pd.Categorical.from_codes(df[index_column_name], dtype=experiment_dtype)
306304
if timestamp_column_name:
307305
df[timestamp_column_name] = pd.to_datetime(df[timestamp_column_name], unit="ms", origin="unix", utc=True)
308306

309-
df = _pivot_and_reindex_df(df, include_point_previews, index_column_name, timestamp_column_name)
307+
df = _pivot_df(df, include_point_previews, index_column_name, timestamp_column_name)
308+
df = _restore_labels_in_index(df, index_column_name, label_mapping)
310309
df = _restore_path_column_names(df, path_mapping, "float_series" if type_suffix_in_column_names else None)
311-
df = _sort_indices(df)
310+
df = _sort_index_and_columns(df, index_column_name)
312311

313312
return df
314313

@@ -390,14 +389,13 @@ def generate_categorized_rows() -> Generator[Tuple, None, None]:
390389
np.fromiter(generate_categorized_rows(), dtype=types),
391390
)
392391

393-
experiment_dtype = pd.CategoricalDtype(categories=label_mapping)
394-
df[index_column_name] = pd.Categorical.from_codes(df[index_column_name], dtype=experiment_dtype)
395392
if timestamp_column_name:
396393
df[timestamp_column_name] = pd.to_datetime(df[timestamp_column_name], unit="ms", origin="unix", utc=True)
397394

398-
df = _pivot_and_reindex_df(df, False, index_column_name, timestamp_column_name)
395+
df = _pivot_df(df, False, index_column_name, timestamp_column_name)
396+
df = _restore_labels_in_index(df, index_column_name, label_mapping)
399397
df = _restore_path_column_names(df, path_mapping, None)
400-
df = _sort_indices(df)
398+
df = _sort_index_and_columns(df, index_column_name)
401399

402400
return df
403401

@@ -461,8 +459,6 @@ def generate_categorized_rows() -> Generator[Tuple, None, None]:
461459
df = pd.DataFrame(
462460
np.fromiter(generate_categorized_rows(), dtype=types),
463461
)
464-
experiment_dtype = pd.CategoricalDtype(categories=label_mapping)
465-
df[container_column_name] = pd.Categorical.from_codes(df[container_column_name], dtype=experiment_dtype)
466462

467463
df["bucket"] = pd.IntervalIndex.from_arrays(df["from_x"], df["to_x"], closed="right")
468464
df = df.drop(columns=["from_x", "to_x"])
@@ -475,11 +471,8 @@ def generate_categorized_rows() -> Generator[Tuple, None, None]:
475471
dropna=False,
476472
sort=False,
477473
)
478-
df.columns = df.columns.set_levels(
479-
df.columns.get_level_values(container_column_name).unique().astype(str),
480-
level=container_column_name,
481-
)
482474

475+
df = _restore_labels_in_columns(df, container_column_name, label_mapping)
483476
df = _restore_path_column_names(df, path_mapping, None)
484477

485478
# Clear out any columns that were not requested, but got added because of dropna=False
@@ -546,7 +539,7 @@ def _collapse_open_buckets(df: pd.DataFrame) -> pd.DataFrame:
546539
return df
547540

548541

549-
def _pivot_and_reindex_df(
542+
def _pivot_df(
550543
df: pd.DataFrame,
551544
include_point_previews: bool,
552545
index_column_name: str,
@@ -559,7 +552,7 @@ def _pivot_and_reindex_df(
559552
df[[index_column_name, "step"]]
560553
.astype(
561554
{
562-
index_column_name: "category",
555+
index_column_name: "uint32",
563556
"step": "float64",
564557
}
565558
)
@@ -588,15 +581,31 @@ def _pivot_and_reindex_df(
588581
)
589582

590583
# Include only observed (experiment, step) pairs
591-
df = df.filter(observed_idx, axis="index")
584+
return df.filter(observed_idx, axis="index")
592585

593-
# Replace categorical codes in `index_column_name` with strings
594-
df.index = df.index.set_levels(
595-
df.index.get_level_values(index_column_name).unique().astype(str),
596-
level=index_column_name,
597-
)
598586

599-
return df.sort_index(level=[index_column_name, "step"])
587+
def _restore_labels_in_index(
588+
df: pd.DataFrame,
589+
column_name: str,
590+
label_mapping: list[str],
591+
) -> pd.DataFrame:
592+
if df.index.empty:
593+
df.index = df.index.set_levels(df.index.get_level_values(column_name).astype(str), level=column_name)
594+
return df
595+
596+
return df.rename(index={i: label for i, label in enumerate(label_mapping)}, level=column_name)
597+
598+
599+
def _restore_labels_in_columns(
600+
df: pd.DataFrame,
601+
column_name: str,
602+
label_mapping: list[str],
603+
) -> pd.DataFrame:
604+
if df.index.empty:
605+
df.columns = df.columns.set_levels(df.columns.get_level_values(column_name).astype(str), level=column_name)
606+
return df
607+
608+
return df.rename(columns={i: label for i, label in enumerate(label_mapping)}, level=column_name)
600609

601610

602611
def _restore_path_column_names(
@@ -622,16 +631,18 @@ def _restore_path_column_names(
622631
return df.rename(columns=reverse_mapping)
623632

624633

625-
def _sort_indices(df: pd.DataFrame) -> pd.DataFrame:
634+
def _sort_index_and_columns(df: pd.DataFrame, index_column_name: str) -> pd.DataFrame:
626635
# MultiIndex DFs need to have column index order swapped: value/metric_name -> metric_name/value.
627636
# We also sort columns, but only after the original names have been restored.
628637
if isinstance(df.columns, pd.MultiIndex):
629638
df.columns.names = (None, None)
630639
df = df.swaplevel(axis="columns")
631-
return df.sort_index(axis="columns", level=0)
640+
df = df.sort_index(axis="columns", level=0)
632641
else:
633642
df.columns.name = None
634-
return df.sort_index(axis="columns")
643+
df = df.sort_index(axis="columns")
644+
645+
return df.sort_index(axis="index", level=[index_column_name, "step"])
635646

636647

637648
def create_files_dataframe(

tests/unit/internal/test_output_format.py

Lines changed: 83 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -500,10 +500,13 @@ def _generate_bucket_metrics(
500500
}
501501

502502

503+
def _a_timestamp(seconds_delta) -> datetime:
504+
return datetime(2023, 1, 1, 0, 0, 0, 0, timezone.utc) + timedelta(seconds=seconds_delta)
505+
506+
503507
def _generate_float_point_value(step: int, preview: bool) -> FloatPointValue:
504-
timestamp = datetime(2023, 1, 1, 0, 0, 0, 0, timezone.utc) + timedelta(seconds=step)
505508
return (
506-
timestamp.timestamp(),
509+
_a_timestamp(seconds_delta=step).timestamp(),
507510
float(step),
508511
float(step) * 100,
509512
preview,
@@ -605,6 +608,84 @@ def test_create_metrics_dataframe_shape(include_preview):
605608
), "The list of tuples should have the same number of rows as the DataFrame"
606609

607610

611+
def test_create_metrics_dataframe_from_exp_with_no_points():
612+
df = create_metrics_dataframe(
613+
# This input data produces a "hole" in our categorical mapping of experiment names to integers
614+
metrics_data={
615+
_generate_run_attribute_definition(1, 1): [_generate_float_point_value(1, False)],
616+
_generate_run_attribute_definition(2, 2): [],
617+
_generate_run_attribute_definition(3, 1): [_generate_float_point_value(2, False)],
618+
},
619+
sys_id_label_mapping={
620+
SysId("sysid1"): "exp1",
621+
SysId("sysid2"): "exp2",
622+
SysId("sysid3"): "exp3",
623+
},
624+
include_point_previews=False,
625+
type_suffix_in_column_names=False,
626+
index_column_name="experiment",
627+
)
628+
629+
expected_df = pd.DataFrame(
630+
data={
631+
"path1": [
632+
100.0,
633+
200.0,
634+
],
635+
},
636+
index=pd.MultiIndex.from_tuples(
637+
tuples=[
638+
("exp1", 1.0),
639+
("exp3", 2.0),
640+
],
641+
names=["experiment", "step"],
642+
),
643+
)
644+
pd.testing.assert_frame_equal(df, expected_df)
645+
646+
647+
def test_create_metrics_dataframe_from_exp_with_no_points_preview():
648+
df = create_metrics_dataframe(
649+
# This input data produces a "hole" in our categorical mapping of experiment names to integers
650+
metrics_data={
651+
_generate_run_attribute_definition(1, 1): [_generate_float_point_value(1, True)],
652+
_generate_run_attribute_definition(2, 2): [],
653+
_generate_run_attribute_definition(3, 1): [_generate_float_point_value(2, True)],
654+
},
655+
sys_id_label_mapping={
656+
SysId("sysid1"): "exp1",
657+
SysId("sysid2"): "exp2",
658+
SysId("sysid3"): "exp3",
659+
},
660+
include_point_previews=True,
661+
type_suffix_in_column_names=False,
662+
index_column_name="experiment",
663+
)
664+
665+
expected_df = pd.DataFrame(
666+
data={
667+
("path1", "is_preview"): [
668+
True,
669+
True,
670+
],
671+
("path1", "preview_completion"): [
672+
0.999,
673+
0.998,
674+
],
675+
("path1", "value"): [100.0, 200.0],
676+
},
677+
index=pd.MultiIndex.from_tuples(
678+
tuples=[
679+
("exp1", 1.0),
680+
("exp3", 2.0),
681+
],
682+
names=["experiment", "step"],
683+
),
684+
)
685+
expected_df[("path1", "is_preview")] = expected_df[("path1", "is_preview")].astype("object")
686+
pd.testing.assert_frame_equal(df, expected_df)
687+
688+
608689
@pytest.mark.parametrize("type_suffix_in_column_names", [True, False])
609690
@pytest.mark.parametrize("include_preview", [True, False])
610691
def test_create_metrics_dataframe_with_absolute_timestamp(type_suffix_in_column_names: bool, include_preview: bool):

0 commit comments

Comments (0)