
Commit 33c3bb3
WIP: unit tests
1 parent b71ba6e

File tree: 2 files changed (+192, -9 lines)

src/vitessce/data_utils/spatialdata_points_zorder.py (47 additions, 9 deletions)
@@ -156,7 +156,7 @@ def sdata_morton_sort_points(sdata, element):
 
     return sdata
 
-def sdata_morton_query_rect(sdata, element, orig_rect):
+def sdata_morton_query_rect_aux(sdata, element, orig_rect):
     #orig_rect = [[50, 50], [100, 150]] # [[x0, y0], [x1, y1]]
     #norm_rect = [
     #    orig_coord_to_norm_coord(orig_rect[0], orig_x_min=0, orig_x_max=100, orig_y_min=0, orig_y_max=200),
@@ -190,15 +190,34 @@ def sdata_morton_query_rect(sdata, element, orig_rect):
         stop_level = None,
         merge = True,
     )
+
+    return morton_intervals
+
+
+def sdata_morton_query_rect(sdata, element, orig_rect):
+    sorted_ddf = sdata.points[element]
+    morton_intervals = sdata_morton_query_rect_aux(sdata, element, orig_rect)
+
     # Get morton code column as a list of integers
     morton_sorted = sorted_ddf["morton_code_2d"].compute().values.tolist()
 
     # Get a list of row ranges that match the morton intervals.
     # (This uses binary searches internally to find the matching row indices).
     # [ (row_start, row_end), ... ]
     matching_row_ranges = zquery_rows(morton_sorted, morton_intervals, merge = True)
+
     return matching_row_ranges
 
+def sdata_morton_query_rect_debug(sdata, element, orig_rect):
+    # This is the same as the above sdata_morton_query_rect function,
+    # but it also returns the list of row indices that were checked
+    # during the binary searches.
+    sorted_ddf = sdata.points[element]
+    morton_intervals = sdata_morton_query_rect_aux(sdata, element, orig_rect)
+    morton_sorted = sorted_ddf["morton_code_2d"].compute().values.tolist()
+    matching_row_ranges, rows_checked = zquery_rows_aux(morton_sorted, morton_intervals, merge = True)
+    return matching_row_ranges, rows_checked
+
 # --------------------------
 # Functions for rectangle queries.
 # --------------------------
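Note on the refactor above: sdata_morton_query_rect_aux computes the Morton intervals covering the rectangle, sdata_morton_query_rect turns them into row ranges, and sdata_morton_query_rect_debug additionally reports which keys the binary searches touched. The encoding of morton_code_2d itself is not shown in this hunk; a minimal illustrative sketch of 2-D Morton (Z-order) bit interleaving, using a hypothetical helper name rather than the module's own implementation, might look like:

# Illustrative sketch only (not the module's implementation): build a 2-D
# Morton code by interleaving the bits of two normalized unsigned integers.
def interleave_bits_2d(x_uint: int, y_uint: int, bits: int = 16) -> int:
    code = 0
    for i in range(bits):
        code |= ((x_uint >> i) & 1) << (2 * i)       # bit i of x -> even bit 2*i
        code |= ((y_uint >> i) & 1) << (2 * i + 1)   # bit i of y -> odd bit 2*i + 1
    return code

# Nearby points get nearby codes, so sorting rows by this column lets a
# rectangle query scan a few contiguous row ranges instead of the whole table.
assert interleave_bits_2d(0b11, 0b01) == 0b0111  # x=3, y=1 -> z=7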
@@ -334,23 +353,42 @@ def zcover_rectangle(rx0:int, ry0:int, rx1:int, ry1:int, bits:int, stop_level: O
 # Morton intervals -> row ranges in a Morton-sorted column
 # --------------------------
 
-def zquery_rows(morton_sorted: List[int], intervals: List[Tuple[int,int]], merge: bool = True) -> List[Tuple[int,int]]:
+def zquery_rows_aux(morton_sorted: List[int], intervals: List[Tuple[int,int]], merge: bool = True) -> Tuple[List[Tuple[int,int]], List[int]]:
     """
     For each Z-interval [zlo, zhi], binary-search in the sorted Morton column
     and return row index half-open ranges [i, j) to scan.
     """
+
+    # Keep track of which keys were looked at during the binary searches.
+    # This is used for analysis / debugging, for instance, to enable
+    # evaluating how many HTTP requests would be needed in network-based case
+    # (which will also depend on Arrow row group size).
+    recorded_keys = []
+    def record_key_check(k: int) -> int:
+        # TODO: Does recorded_keys need to be marked as a global here?
+        recorded_keys.append(k)
+        return k
+
     ranges: List[Tuple[int,int]] = []
+    # TODO: can these multiple binary searches be optimized?
+    # Since we are doing many searches in the same array, and in each search we learn where more elements are located.
     for zlo, zhi in intervals:
-        i = bisect_left(morton_sorted, zlo)
-        j = bisect_right(morton_sorted, zhi)
+        i = bisect_left(morton_sorted, zlo, key=record_key_check)
+        # TODO: use lo=i in bisect_right to limit the search range?
+        # TODO: can the second binary search be further optimized since we just did a binary search via bisect_left?
+        j = bisect_right(morton_sorted, zhi, key=record_key_check)
         if i < j:
             ranges.append((i, j))
 
-    # TODO: record exactly which rows were queried,
-    # to enable evaluating how many HTTP requests would be needed in network-based case
-    # (will also depend on Arrow row group size)
-
-    return merge_adjacent(ranges) if merge else ranges
+    result = merge_adjacent(ranges) if merge else ranges
+    return result, recorded_keys
+
+def zquery_rows(morton_sorted: List[int], intervals: List[Tuple[int,int]], merge: bool = True) -> List[Tuple[int,int]]:
+    """
+    For each Z-interval [zlo, zhi], binary-search in the sorted Morton column
+    and return row index half-open ranges [i, j) to scan.
+    """
+    return zquery_rows_aux(morton_sorted, intervals, merge=merge)[0]
 
 
 def row_ranges_to_row_indices(intervals: List[Tuple[int,int]]) -> List[int]:
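A note on the key= arguments added above: since Python 3.10, bisect_left and bisect_right apply key to the elements of the list being searched (not to the probe value), so record_key_check is called once for every Morton code examined during the binary searches; this is what rows_checked counts in sdata_morton_query_rect_debug. Below is a small self-contained sketch of the row-range logic, using local stand-ins for zquery_rows and merge_adjacent (whose implementations are not part of this diff):

from bisect import bisect_left, bisect_right

morton_sorted = [0, 1, 4, 5, 6, 9, 12, 13]  # sorted Morton codes, one per row
intervals = [(4, 6), (9, 9)]                # Z-intervals covering a query rectangle

# For each Z-interval, binary-search for the half-open row range [i, j).
ranges = []
for zlo, zhi in intervals:
    i = bisect_left(morton_sorted, zlo)     # first row with code >= zlo
    j = bisect_right(morton_sorted, zhi)    # one past the last row with code <= zhi
    if i < j:
        ranges.append((i, j))

# Merge ranges that touch or overlap (a stand-in for merge_adjacent).
merged = []
for start, end in ranges:
    if merged and start <= merged[-1][1]:
        merged[-1] = (merged[-1][0], max(merged[-1][1], end))
    else:
        merged.append((start, end))

assert ranges == [(2, 5), (5, 6)]
assert merged == [(2, 6)]  # rows 2..5 hold codes 4, 5, 6, 9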

tests/test_sdata_points_zorder.py (145 additions, 0 deletions)
@@ -0,0 +1,145 @@
+import pytest
+from os.path import join
+
+from spatialdata import read_zarr
+
+from vitessce.data_utils.spatialdata_points_zorder import (
+    # Function for computing codes and sorting
+    sdata_morton_sort_points,
+    # Functions for querying
+    sdata_morton_query_rect_debug,
+    row_ranges_to_row_indices,
+    orig_coord_to_norm_coord,
+)
+
+def is_sorted(l):
+    return all(l[i] <= l[i + 1] for i in range(len(l) - 1))
+
+def get_sdata():
+    data_dir = join("docs", "notebooks", "data")
+    spatialdata_filepath = join(data_dir, "xenium_rep1_io.spatialdata.zarr")
+
+    sdata = read_zarr(spatialdata_filepath)
+    return sdata
+
+@pytest.mark.skip(reason="Temporarily disable")
+def test_zorder_sorting():
+    # TODO: use fixture here
+    sdata = get_sdata()
+
+    sdata_morton_sort_points(sdata, "transcripts")
+
+    # Check that the morton codes are sorted
+    sorted_ddf = sdata.points["transcripts"]
+    morton_sorted = sorted_ddf["morton_code_2d"].compute().values.tolist()
+
+    assert is_sorted(morton_sorted)
+
+
+def test_zorder_query():
+    sdata = get_sdata()
+
+    sdata_morton_sort_points(sdata, "transcripts")
+
+    # Query a rectangle that should return some points
+    orig_rect = [[50.0, 50.0], [100.0, 150.0]] # x0, y0, x1, y1
+    matching_row_ranges, rows_checked = sdata_morton_query_rect_debug(sdata, "transcripts", orig_rect)
+    rect_row_indices = row_ranges_to_row_indices(matching_row_ranges)
+
+    # Cannot use df.iloc on a dask dataframe, so convert it to pandas first
+    ddf = sdata.points["transcripts"]
+    df = ddf.compute()
+    df = df.reset_index(drop=True)
+    estimated_row_indices = df.iloc[rect_row_indices].index.tolist()
+
+    assert df.shape[0] == 42638083
+
+    # Do the same query the "dumb" way, by checking all points
+    in_rect = (
+        (df["x"] >= orig_rect[0][0])
+        & (df["x"] <= orig_rect[1][0])
+        & (df["y"] >= orig_rect[0][1])
+        & (df["y"] <= orig_rect[1][1])
+    )
+    dumb_df_subset = df.loc[in_rect]
+    # Get the row indices of the points in the rectangle
+    # (these are the indices in the original dataframe)
+    exact_row_indices = dumb_df_subset.index.tolist()
+
+    # Check that the estimated rows 100% contain the exact rows.
+    # A.issubset(B) checks that all elements of A are in B ("A is a subset of B").
+    assert set(exact_row_indices).issubset(set(estimated_row_indices))
+    assert len(exact_row_indices) == 614
+    assert len(estimated_row_indices) <= 631
+
+    # Check that the number of rows checked is less than the total number of points
+    assert len(rows_checked) <= 45237
+    assert len(matching_row_ranges) == 24 # Kind of an implementation detail.
+
+    # Do a second check, this time against x_uint/y_uint (the normalized coordinates)
+    # TODO: does this ensure that estimated == exact?
+
+    bounding_box = ddf.attrs["bounding_box"]
+    x_min = bounding_box["x_min"]
+    x_max = bounding_box["x_max"]
+    y_min = bounding_box["y_min"]
+    y_max = bounding_box["y_max"]
+    norm_rect = [
+        orig_coord_to_norm_coord(orig_rect[0], orig_x_min=x_min, orig_x_max=x_max, orig_y_min=y_min, orig_y_max=y_max),
+        orig_coord_to_norm_coord(orig_rect[1], orig_x_min=x_min, orig_x_max=x_max, orig_y_min=y_min, orig_y_max=y_max)
+    ]
+
+    in_rect_norm = (
+        (df["x_uint"] >= norm_rect[0][0])
+        & (df["x_uint"] <= norm_rect[1][0])
+        & (df["y_uint"] >= norm_rect[0][1])
+        & (df["y_uint"] <= norm_rect[1][1])
+    )
+    dumb_df_subset_norm = df.loc[in_rect_norm]
+    # Get the row indices of the points in the rectangle
+    # (these are the indices in the original dataframe)
+    exact_row_indices_norm = dumb_df_subset_norm.index.tolist()
+    assert set(exact_row_indices_norm).issubset(set(estimated_row_indices))
+    assert len(exact_row_indices_norm) == 617
+    assert len(estimated_row_indices) <= 631
+
+
+
+"""
+# ========= Another query ==========
+orig_rect = [[500, 500], [600, 600]] # x0, y0, x1, y1
+
+# Query using z-order
+matching_row_ranges, rows_checked = sdata_morton_query_rect_debug(sdata, "transcripts", orig_rect)
+rect_row_indices = row_ranges_to_row_indices(matching_row_ranges)
+estimated_row_indices = df.iloc[rect_row_indices].index.tolist()
+
+# Query the "dumb" way
+in_rect = (
+    (df["x"] >= orig_rect[0][0])
+    & (df["x"] <= orig_rect[1][0])
+    & (df["y"] >= orig_rect[0][1])
+    & (df["y"] <= orig_rect[1][1])
+)
+dumb_df_subset = df.loc[in_rect]
+exact_row_indices = dumb_df_subset.index.tolist()
+
+diff_rows = set(estimated_row_indices) - set(exact_row_indices)
+# print("Rows in estimated but not exact:", diff_rows)
+print(df.iloc[list(diff_rows)])
+raise NotImplementedError("Debugging")
+
+# Check that the estimated rows contain all of the exact rows.
+assert len(set(exact_row_indices).intersection(set(estimated_row_indices))) == 0
+assert len(exact_row_indices) <= 1123 # TODO: update
+assert len(estimated_row_indices) <= 1163 # TODO: update
+
+"""
+
+
+
+
+
+
+
+
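test_zorder_query asserts containment (issubset) rather than equality because the Morton-interval cover of a rectangle can include rows whose coordinates fall just outside it; the candidate rows still need a final x/y filter. A toy illustration of that check, assuming row_ranges_to_row_indices simply expands the half-open ranges documented by zquery_rows:

# Hypothetical toy values, for illustrating the assertion pattern only.
matching_row_ranges = [(2, 6)]  # half-open row ranges from the Morton query

# Equivalent inline expansion of row_ranges_to_row_indices on this toy input:
rect_row_indices = [i for start, end in matching_row_ranges for i in range(start, end)]
assert rect_row_indices == [2, 3, 4, 5]

estimated_row_indices = rect_row_indices  # candidate rows (superset)
exact_row_indices = [3, 4]                # rows whose x/y actually fall inside the rectangle

# The Z-order query may return extra rows, but must never miss an exact hit.
assert set(exact_row_indices).issubset(set(estimated_row_indices))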
