Skip to content

Commit 3277476

Browse files
authored
feat: merge 1st and 2nd bucket (#44)
* feat: merge 1st and 2nd bucket
* fix: add several tests
* fix: limit 1000 and range ends equality

---------

Co-authored-by: Michał Sośnicki <michal.sosnicki@neptune.ai>
1 parent d55d625 commit 3277476

File tree

7 files changed

+149
-33
lines changed

7 files changed

+149
-33
lines changed

src/neptune_query/__init__.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,6 @@
4848
resolve_files,
4949
resolve_metrics_y,
5050
resolve_sort_by,
51-
validate_limit,
5251
)
5352
from neptune_query.exceptions import NeptuneUserError
5453
from neptune_query.internal.composition import download_files as _download_files
@@ -488,7 +487,6 @@ def fetch_metric_buckets(
488487
project_identifier = get_default_project_identifier(project)
489488
experiments_filter = resolve_experiments_filter(experiments)
490489
resolved_y = resolve_metrics_y(y)
491-
validate_limit(limit, max_limit=1000)
492490

493491
return _fetch_metric_buckets.fetch_metric_buckets(
494492
project_identifier=project_identifier,

src/neptune_query/_internal.py

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,7 @@
2323
import pandas as pd
2424

2525
from neptune_query import filters
26-
from neptune_query.exceptions import (
27-
NeptuneProjectNotProvided,
28-
NeptuneUserError,
29-
)
26+
from neptune_query.exceptions import NeptuneProjectNotProvided
3027
from neptune_query.internal import filters as _filters
3128
from neptune_query.internal.context import get_context
3229
from neptune_query.internal.identifiers import ProjectIdentifier
@@ -160,10 +157,3 @@ def resolve_destination_path(destination: Optional[Union[str, pathlib.Path]]) ->
160157
return destination.resolve()
161158
else:
162159
return pathlib.Path(destination).resolve()
163-
164-
165-
def validate_limit(limit: int, max_limit: int) -> None:
166-
if limit <= 0:
167-
raise NeptuneUserError(f"Limit must be a positive integer. Got: {limit}")
168-
if limit > max_limit:
169-
raise NeptuneUserError(f"Limit cannot be greater than {max_limit}. Got: {limit}")

src/neptune_query/internal/composition/fetch_metric_buckets.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,8 +68,9 @@ def fetch_metric_buckets(
6868
container_type: ContainerType,
6969
) -> pd.DataFrame:
7070
validation.validate_metrics_x(x)
71-
validation.validate_limit(limit)
71+
validation.validate_bucket_limit(limit)
7272
restricted_y = validation.restrict_attribute_filter_type(y, type_in={"float_series"})
73+
limit = limit + 1 # we request one extra bucket because the 1st one is (-inf, 1st point] and we merge it
7374

7475
valid_context = validate_context(context or get_context())
7576
client = get_client(context=valid_context)

src/neptune_query/internal/composition/validation.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,15 @@ def validate_limit(limit: Optional[int]) -> None:
9494
_validate_optional_positive_int(limit, "limit")
9595

9696

97+
def validate_bucket_limit(limit: int, max_limit: int = 1000) -> None:
    """Validate the number of buckets requested by the caller.

    Args:
        limit: requested number of buckets; must be an ``int`` in ``[1, max_limit]``.
        max_limit: largest accepted value of ``limit`` (1000 by default).

    Raises:
        ValueError: if ``limit`` is not an integer, is not positive, or exceeds ``max_limit``.
    """
    # bool is a subclass of int, so a bare isinstance check would accept True as "1 bucket".
    if isinstance(limit, bool) or not isinstance(limit, int):
        raise ValueError("limit must be an integer")
    if limit <= 0:
        raise ValueError(f"limit must be a positive integer. Got: {limit}")
    if limit > max_limit:
        raise ValueError(f"limit cannot be greater than {max_limit}. Got: {limit}")
104+
105+
97106
def validate_metrics_x(x: Literal["step"]) -> Literal["step"]:
98107
"""Validate that x is 'step' (the only valid value for now)."""
99108
if x not in ("step",):

src/neptune_query/internal/output_format.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -487,6 +487,49 @@ def generate_categorized_rows() -> Generator[Tuple, None, None]:
487487
df.index.name = None
488488
df.columns.names = (container_column_name, "metric", "bucket")
489489

490+
df = _collapse_open_buckets(df)
491+
492+
return df
493+
494+
495+
def _last_valid_value(column: pd.Series):
    """Return the last non-missing value of ``column``, or NaN when every value is missing."""
    present = column.dropna()
    return present.iloc[-1] if len(present) else np.nan


def _collapse_open_buckets(df: pd.DataFrame) -> pd.DataFrame:
    """
    Merge the leading open-ended bucket into the bucket that follows it.

    The 1st returned bucket is always (-inf, first_point], which we merge with the 2nd bucket
    (first_point, end], resulting in a new bucket [first_point, end] closed on both sides.

    Args:
        df: frame whose index holds one ``pd.Interval`` per bucket; the first two index entries
            are treated as the open-ended bucket and its successor.

    Returns:
        A frame with an object-dtype index. The frame passed in is never modified.
        - Empty input: returned with only the index dtype changed.
        - Single bucket with exactly one finite end ``p``: relabelled to the point bucket ``[p, p]``.
        - Otherwise the first two rows are merged (the last non-missing value per column wins),
          or, when there is a gap between them, the first bucket is relabelled to start at its
          right end and span the same length as the second bucket.
    """
    # Work on a shallow copy so that re-labelling the index never leaks into the caller's frame.
    df = df.copy(deep=False)
    # IntervalIndex cannot mix Intervals closed from different sides, hence a plain object index.
    df.index = df.index.astype(object)

    if df.index.empty:
        return df

    if len(df.index) == 1:
        only = df.index[0]
        left_finite = bool(np.isfinite(only.left))
        right_finite = bool(np.isfinite(only.right))
        # Exactly one finite end: the bucket degenerates to that single point.
        if left_finite != right_finite:
            point = only.left if left_finite else only.right
            df.index = pd.Index([pd.Interval(left=point, right=point, closed="both")], dtype=object)
        return df

    first, second = df.index[0], df.index[1]
    # Floats can be imprecise, so half of the bucket length is used as the adjacency tolerance.
    if first.right >= second.left - second.length * 0.5:
        merged = pd.Interval(left=first.right, right=second.right, closed="both")
        merged_row = df.iloc[0:2].apply(_last_valid_value, axis="index")
        df = df.drop(index=[first, second])
        df.loc[merged] = merged_row
        df = df.sort_index()
        # Enlarging a frame emptied by the drop can infer an IntervalIndex; keep the dtype uniform.
        df.index = df.index.astype(object)
    else:
        # The 2nd bucket is not adjacent: keep its row and only relabel the 1st bucket so that it
        # starts at the first point and has the same length as the other buckets.
        widened = pd.Interval(left=first.right, right=first.right + second.length, closed="both")
        df.index = pd.Index([widened, *df.index[1:]], dtype=object)

    return df
491534

492535

tests/e2e/v1/test_fetch_metric_buckets.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ def create_expected_data_dict(
114114
limit: int,
115115
include_point_previews: bool, # TODO - add to the test data?
116116
) -> pd.DataFrame:
117-
bucket_ranges_x = _calculate_ranges_x(data, limit)
117+
bucket_ranges_x = _calculate_ranges_x(data, limit + 1)
118118

119119
bucket_data: dict[RunAttributeDefinition, list[TimeseriesBucket]] = {}
120120
for experiment_name, experiment_data in data.items():
@@ -238,7 +238,7 @@ def test__fetch_metric_buckets__filter_variants(
238238

239239
@pytest.mark.parametrize(
240240
"limit",
241-
[2, 3, 10, NUMBER_OF_STEPS + 10],
241+
[1, 2, 3, 10, NUMBER_OF_STEPS + 10, 1000],
242242
)
243243
@pytest.mark.parametrize(
244244
"include_point_previews",

tests/unit/internal/test_output_format.py

Lines changed: 92 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import numpy as np
1111
import pandas as pd
1212
import pytest
13+
from pandas import Interval
1314
from pandas._testing import assert_frame_equal
1415

1516
import neptune_query as npt
@@ -1358,12 +1359,8 @@ def test_create_empty_metric_buckets_dataframe():
13581359
)
13591360

13601361
# Then
1361-
expected_df = (
1362-
pd.DataFrame(data={"bucket": []}).astype(dtype={"bucket": "interval[float64, right]"}).set_index("bucket")
1363-
)
1364-
expected_df.columns = pd.MultiIndex.from_product(
1365-
[[], [], ["local_min", "local_max"]], names=["experiment", "metric", "bucket"]
1366-
)
1362+
expected_df = pd.DataFrame(data={"bucket": []}).astype(dtype={"bucket": "object"}).set_index("bucket")
1363+
expected_df.columns = pd.MultiIndex.from_product([[], [], ["x", "y"]], names=["experiment", "metric", "bucket"])
13671364
expected_df.index.name = None
13681365

13691366
pd.testing.assert_frame_equal(df, expected_df)
@@ -1384,7 +1381,7 @@ def test_create_metric_buckets_dataframe():
13841381
assert not df.empty, "DataFrame should not be empty"
13851382

13861383
# Check the shape of the DataFrame
1387-
num_expected_rows = BUCKETS
1384+
num_expected_rows = BUCKETS - 1
13881385
assert df.shape[0] == num_expected_rows, f"DataFrame should have {num_expected_rows} rows"
13891386

13901387
# Check the columns of the DataFrame
@@ -1403,6 +1400,77 @@ def test_create_metric_buckets_dataframe():
14031400
assert df.columns.get_level_values(2).nunique() == len(METRICS), f"DataFrame should have {METRICS} metrics"
14041401

14051402

1403+
@pytest.mark.parametrize(
    "data,expected_df",
    [
        # A lone open-ended bucket collapses to a single-point bucket.
        (
            {
                _generate_run_attribute_definition(experiment=1, path=1): [
                    _generate_bucket_metric(index=0),
                ]
            },
            pd.DataFrame(
                {
                    ("exp1", "path1", "x"): [20.0],
                    ("exp1", "path1", "y"): [0.0],
                },
                index=pd.Index([Interval(20.0, 20.0, closed="both")], dtype="object"),
            ),
        ),
        # The bucket right after the open-ended one is missing (indices 0 and 2).
        (
            {
                _generate_run_attribute_definition(experiment=1, path=1): [
                    _generate_bucket_metric(index=0),
                    _generate_bucket_metric(index=2),
                ]
            },
            pd.DataFrame(
                {
                    ("exp1", "path1", "x"): [20.0, 58.0],
                    ("exp1", "path1", "y"): [0.0, 200.0],
                },
                index=pd.Index(
                    [Interval(20.0, 40.0, closed="both"), Interval(40.0, 60.0, closed="right")],
                    dtype="object",
                ),
            ),
        ),
        # A wider gap after the open-ended bucket (indices 0 and 3).
        (
            {
                _generate_run_attribute_definition(experiment=1, path=1): [
                    _generate_bucket_metric(index=0),
                    _generate_bucket_metric(index=3),
                ]
            },
            pd.DataFrame(
                {
                    ("exp1", "path1", "x"): [20.0, 78.0],
                    ("exp1", "path1", "y"): [0.0, 300.0],
                },
                index=pd.Index(
                    [Interval(20.0, 40.0, closed="both"), Interval(60.0, 80.0, closed="right")],
                    dtype="object",
                ),
            ),
        ),
    ],
)
def test_create_metric_buckets_dataframe_parametrized(data, expected_df):
    """Check the bucket frame built from ``data`` against the hand-written ``expected_df``."""
    # Given
    expected_df.columns.names = ["experiment", "metric", "bucket"]
    label_by_sys_id = {SysId("sysid1"): "exp1"}

    # When
    actual_df = create_metric_buckets_dataframe(
        buckets_data=data,
        sys_id_label_mapping=label_by_sys_id,
        container_column_name="experiment",
    )

    # Then
    pd.testing.assert_frame_equal(actual_df, expected_df)
1472+
1473+
14061474
def test_create_metric_buckets_dataframe_missing_values():
14071475
# Given
14081476
data = {
@@ -1432,17 +1500,17 @@ def test_create_metric_buckets_dataframe_missing_values():
14321500

14331501
# Then
14341502
expected = {
1435-
("exp1", "path1", "x"): [20.0, 38.0, np.nan],
1436-
("exp1", "path1", "y"): [0.0, 100.0, np.nan],
1437-
("exp1", "path2", "x"): [np.nan, 38.0, 58.0],
1438-
("exp1", "path2", "y"): [np.nan, 100.0, 200.0],
1439-
("exp2", "path1", "x"): [20.0, np.nan, 58.0],
1440-
("exp2", "path1", "y"): [0.0, np.nan, 200.00],
1503+
("exp1", "path1", "x"): [38.0, np.nan],
1504+
("exp1", "path1", "y"): [100.0, np.nan],
1505+
("exp1", "path2", "x"): [38.0, 58.0],
1506+
("exp1", "path2", "y"): [100.0, 200.0],
1507+
("exp2", "path1", "x"): [20.0, 58.0],
1508+
("exp2", "path1", "y"): [0.0, 200.00],
14411509
}
14421510

14431511
expected_df = pd.DataFrame(
14441512
dict(sorted(expected.items())),
1445-
index=pd.IntervalIndex.from_tuples([(float("-inf"), 20.0), (20.0, 40.0), (40.0, 60.0)]),
1513+
index=pd.Index([Interval(20.0, 40.0, closed="both"), Interval(40.0, 60.0, closed="right")]),
14461514
)
14471515
expected_df.columns.names = ["experiment", "metric", "bucket"]
14481516

@@ -1456,6 +1524,7 @@ def test_create_metric_buckets_dataframe_sorted():
14561524
_generate_bucket_metric(index=2),
14571525
_generate_bucket_metric(index=0),
14581526
_generate_bucket_metric(index=1),
1527+
_generate_bucket_metric(index=3),
14591528
],
14601529
}
14611530
sys_id_label_mapping = {
@@ -1470,13 +1539,19 @@ def test_create_metric_buckets_dataframe_sorted():
14701539

14711540
# Then
14721541
expected = {
1473-
("exp1", "path1", "x"): [20.0, 38.0, 58.0],
1474-
("exp1", "path1", "y"): [0.0, 100.0, 200.0],
1542+
("exp1", "path1", "x"): [38.0, 58.0, 78.0],
1543+
("exp1", "path1", "y"): [100.0, 200.0, 300.0],
14751544
}
14761545

14771546
expected_df = pd.DataFrame(
14781547
dict(sorted(expected.items())),
1479-
index=pd.IntervalIndex.from_tuples([(float("-inf"), 20.0), (20.0, 40.0), (40.0, 60.0)]),
1548+
index=pd.Index(
1549+
[
1550+
Interval(20.0, 40.0, closed="both"),
1551+
Interval(40.0, 60.0, closed="right"),
1552+
Interval(60.0, 80.0, closed="right"),
1553+
]
1554+
),
14801555
)
14811556
expected_df.columns.names = ["experiment", "metric", "bucket"]
14821557

0 commit comments

Comments
 (0)