Update z-order query tests

keller-mark · keller-mark · commit 176e954aaf7d · 2025-09-25T15:54:01.000-04:00
diff --git a/tests/test_sdata_points_zorder.py b/tests/test_sdata_points_zorder.py
@@ -1,5 +1,6 @@
 import pytest
 from os.path import join
+import numpy as np
 
 from spatialdata import read_zarr
 
@@ -36,6 +37,7 @@ def test_zorder_sorting():
     assert is_sorted(morton_sorted)
 
 
+
 def test_zorder_query():
     sdata = get_sdata()
 
@@ -55,11 +57,18 @@ def test_zorder_query():
     assert df.shape[0] == 42638083
 
     # Do the same query the "dumb" way, by checking all points
+
+    # We need an epsilon for the "dumb" query since the normalization
+    # introduces rounding issues. We can instead verify that a slightly
+    # smaller rectangle is fully contained in the morton code query
+    # estimated results.
+    EXACT_BOUNDARY_EPSILON = 1
+
     in_rect = (
-        (df["x"] >= orig_rect[0][0])
-        & (df["x"] <= orig_rect[1][0])
-        & (df["y"] >= orig_rect[0][1])
-        & (df["y"] <= orig_rect[1][1])
+        (df["x"] >= orig_rect[0][0] + EXACT_BOUNDARY_EPSILON)
+        & (df["x"] <= orig_rect[1][0] - EXACT_BOUNDARY_EPSILON)
+        & (df["y"] >= orig_rect[0][1] + EXACT_BOUNDARY_EPSILON)
+        & (df["y"] <= orig_rect[1][1] - EXACT_BOUNDARY_EPSILON)
     )
     dumb_df_subset = df.loc[in_rect]
     # Get the row indices of the points in the rectangle
@@ -69,7 +78,7 @@ def test_zorder_query():
     # Check that the estimated rows 100% contain the exact rows.
     # A.issubset(B) checks that all elements of A are in B ("A is a subset of B").
     assert set(exact_row_indices).issubset(set(estimated_row_indices))
-    assert len(exact_row_indices) == 614
+    assert len(exact_row_indices) == 552
     assert len(estimated_row_indices) <= 631
 
     # Check that the number of rows checked is less than the total number of points
@@ -89,52 +98,81 @@ def test_zorder_query():
         orig_coord_to_norm_coord(orig_rect[1], orig_x_min=x_min, orig_x_max=x_max, orig_y_min=y_min, orig_y_max=y_max)
     ]
 
+    norm_rect_rounded = [
+        # TODO: should we use floor/ceil instead of round?
+        [np.floor(norm_rect[0][0]), np.floor(norm_rect[0][1])],
+        [np.floor(norm_rect[1][0]), np.floor(norm_rect[1][1])]
+    ]
+
     in_rect_norm = (
-        (df["x_uint"] >= norm_rect[0][0])
-        & (df["x_uint"] <= norm_rect[1][0])
-        & (df["y_uint"] >= norm_rect[0][1])
-        & (df["y_uint"] <= norm_rect[1][1])
+        (df["x_uint"] >= norm_rect_rounded[0][0] + EXACT_BOUNDARY_EPSILON)
+        & (df["x_uint"] < norm_rect_rounded[1][0] - EXACT_BOUNDARY_EPSILON)
+        & (df["y_uint"] >= norm_rect_rounded[0][1] + EXACT_BOUNDARY_EPSILON)
+        & (df["y_uint"] < norm_rect_rounded[1][1] - EXACT_BOUNDARY_EPSILON)
     )
     dumb_df_subset_norm = df.loc[in_rect_norm]
     # Get the row indices of the points in the rectangle
     # (these are the indices in the original dataframe)
     exact_row_indices_norm = dumb_df_subset_norm.index.tolist()
+
+    # A.issubset(B)
+    # True if A is a subset of B and False otherwise.
     assert set(exact_row_indices_norm).issubset(set(estimated_row_indices))
-    assert len(exact_row_indices_norm) == 617
-    assert len(estimated_row_indices) <= 631
-    
 
+    assert len(exact_row_indices_norm) == 609
+    assert len(estimated_row_indices) <= 631
 
-    """
     # ========= Another query ==========
-    orig_rect = [[500, 500], [600, 600]] # x0, y0, x1, y1
+    orig_rect = [[500.0, 500.0], [600.0, 600.0]] # x0, y0, x1, y1
 
     # Query using z-order
     matching_row_ranges, rows_checked = sdata_morton_query_rect_debug(sdata, "transcripts", orig_rect)
     rect_row_indices = row_ranges_to_row_indices(matching_row_ranges)
     estimated_row_indices = df.iloc[rect_row_indices].index.tolist()
 
-    # Query the "dumb" way
+    # Do the same query the "dumb" way, by checking all points
     in_rect = (
-        (df["x"] >= orig_rect[0][0])
-        & (df["x"] <= orig_rect[1][0])
-        & (df["y"] >= orig_rect[0][1])
-        & (df["y"] <= orig_rect[1][1])
+        (df["x"] >= orig_rect[0][0] + EXACT_BOUNDARY_EPSILON)
+        & (df["x"] <= orig_rect[1][0] - EXACT_BOUNDARY_EPSILON)
+        & (df["y"] >= orig_rect[0][1] + EXACT_BOUNDARY_EPSILON)
+        & (df["y"] <= orig_rect[1][1] - EXACT_BOUNDARY_EPSILON)
     )
     dumb_df_subset = df.loc[in_rect]
+    # Get the row indices of the points in the rectangle
+    # (these are the indices in the original dataframe)
     exact_row_indices = dumb_df_subset.index.tolist()
 
-    diff_rows = set(estimated_row_indices) - set(exact_row_indices)
-    # print("Rows in estimated but not exact:", diff_rows)
-    print(df.iloc[list(diff_rows)])
-    raise NotImplementedError("Debugging")
+    # Query 2: Do a second check, this time against x_uint/y_uint (the normalized coordinates)
+    norm_rect = [
+        orig_coord_to_norm_coord(orig_rect[0], orig_x_min=x_min, orig_x_max=x_max, orig_y_min=y_min, orig_y_max=y_max),
+        orig_coord_to_norm_coord(orig_rect[1], orig_x_min=x_min, orig_x_max=x_max, orig_y_min=y_min, orig_y_max=y_max)
+    ]
+
+    norm_rect_rounded = [
+        # TODO: should we use floor/ceil instead of round?
+        [np.floor(norm_rect[0][0]), np.floor(norm_rect[0][1])],
+        [np.floor(norm_rect[1][0]), np.floor(norm_rect[1][1])]
+    ]
+
+    in_rect_norm = (
+        (df["x_uint"] >= norm_rect_rounded[0][0] + EXACT_BOUNDARY_EPSILON)
+        & (df["x_uint"] < norm_rect_rounded[1][0] - EXACT_BOUNDARY_EPSILON)
+        & (df["y_uint"] >= norm_rect_rounded[0][1] + EXACT_BOUNDARY_EPSILON)
+        & (df["y_uint"] < norm_rect_rounded[1][1] - EXACT_BOUNDARY_EPSILON)
+    )
+    dumb_df_subset_norm = df.loc[in_rect_norm]
+    # Get the row indices of the points in the rectangle
+    # (these are the indices in the original dataframe)
+    exact_row_indices_norm = dumb_df_subset_norm.index.tolist()
+
+    # A.issubset(B)
+    # True if A is a subset of B and False otherwise.
+    assert set(exact_row_indices_norm).issubset(set(estimated_row_indices))
 
     # Check that the estimated rows contain all of the exact rows.
-    assert len(set(exact_row_indices).intersection(set(estimated_row_indices))) == 0
-    assert len(exact_row_indices) <= 1123 # TODO: update
-    assert len(estimated_row_indices) <= 1163 # TODO: update
+    assert len(exact_row_indices) == 16678
+    assert len(estimated_row_indices) <= 17643
     
-    """