11import pytest
22from os .path import join
3+ import numpy as np
34
45from spatialdata import read_zarr
56
@@ -36,6 +37,7 @@ def test_zorder_sorting():
3637 assert is_sorted (morton_sorted )
3738
3839
40+
3941def test_zorder_query ():
4042 sdata = get_sdata ()
4143
@@ -55,11 +57,18 @@ def test_zorder_query():
5557 assert df .shape [0 ] == 42638083
5658
5759 # Do the same query the "dumb" way, by checking all points
60+
61+ # We need an epsilon for the "dumb" query since the normalization
62+ # introduces rounding issues. We can instead verify that a slightly
63+ # smaller rectangle is fully contained in the morton code query
64+ # estimated results.
65+ EXACT_BOUNDARY_EPSILON = 1
66+
5867 in_rect = (
59- (df ["x" ] >= orig_rect [0 ][0 ])
60- & (df ["x" ] <= orig_rect [1 ][0 ])
61- & (df ["y" ] >= orig_rect [0 ][1 ])
62- & (df ["y" ] <= orig_rect [1 ][1 ])
68+ (df ["x" ] >= orig_rect [0 ][0 ] + EXACT_BOUNDARY_EPSILON )
69+ & (df ["x" ] <= orig_rect [1 ][0 ] - EXACT_BOUNDARY_EPSILON )
70+ & (df ["y" ] >= orig_rect [0 ][1 ] + EXACT_BOUNDARY_EPSILON )
71+ & (df ["y" ] <= orig_rect [1 ][1 ] - EXACT_BOUNDARY_EPSILON )
6372 )
6473 dumb_df_subset = df .loc [in_rect ]
6574 # Get the row indices of the points in the rectangle
@@ -69,7 +78,7 @@ def test_zorder_query():
6978 # Check that the estimated rows 100% contain the exact rows.
7079 # A.issubset(B) checks that all elements of A are in B ("A is a subset of B").
7180 assert set (exact_row_indices ).issubset (set (estimated_row_indices ))
72- assert len (exact_row_indices ) == 614
81+ assert len (exact_row_indices ) == 552
7382 assert len (estimated_row_indices ) <= 631
7483
7584 # Check that the number of rows checked is less than the total number of points
@@ -89,52 +98,81 @@ def test_zorder_query():
8998 orig_coord_to_norm_coord (orig_rect [1 ], orig_x_min = x_min , orig_x_max = x_max , orig_y_min = y_min , orig_y_max = y_max )
9099 ]
91100
101+ norm_rect_rounded = [
102+ # TODO: should we use floor/ceil instead of round?
103+ [np .floor (norm_rect [0 ][0 ]), np .floor (norm_rect [0 ][1 ])],
104+ [np .floor (norm_rect [1 ][0 ]), np .floor (norm_rect [1 ][1 ])]
105+ ]
106+
92107 in_rect_norm = (
93- (df ["x_uint" ] >= norm_rect [0 ][0 ])
94- & (df ["x_uint" ] <= norm_rect [1 ][0 ])
95- & (df ["y_uint" ] >= norm_rect [0 ][1 ])
96- & (df ["y_uint" ] <= norm_rect [1 ][1 ])
108+ (df ["x_uint" ] >= norm_rect_rounded [0 ][0 ] + EXACT_BOUNDARY_EPSILON )
109+ & (df ["x_uint" ] < norm_rect_rounded [1 ][0 ] - EXACT_BOUNDARY_EPSILON )
110+ & (df ["y_uint" ] >= norm_rect_rounded [0 ][1 ] + EXACT_BOUNDARY_EPSILON )
111+ & (df ["y_uint" ] < norm_rect_rounded [1 ][1 ] - EXACT_BOUNDARY_EPSILON )
97112 )
98113 dumb_df_subset_norm = df .loc [in_rect_norm ]
99114 # Get the row indices of the points in the rectangle
100115 # (these are the indices in the original dataframe)
101116 exact_row_indices_norm = dumb_df_subset_norm .index .tolist ()
117+
118+ # A.issubset(B)
119+ # True if A is a subset of B and False otherwise.
102120 assert set (exact_row_indices_norm ).issubset (set (estimated_row_indices ))
103- assert len (exact_row_indices_norm ) == 617
104- assert len (estimated_row_indices ) <= 631
105-
106121
122+ assert len (exact_row_indices_norm ) == 609
123+ assert len (estimated_row_indices ) <= 631
107124
108- """
109125 # ========= Another query ==========
110- orig_rect = [[500, 500], [600, 600]] # x0, y0, x1, y1
126+ orig_rect = [[500.0 , 500.0 ], [600.0 , 600.0 ]] # x0, y0, x1, y1
111127
112128 # Query using z-order
113129 matching_row_ranges , rows_checked = sdata_morton_query_rect_debug (sdata , "transcripts" , orig_rect )
114130 rect_row_indices = row_ranges_to_row_indices (matching_row_ranges )
115131 estimated_row_indices = df .iloc [rect_row_indices ].index .tolist ()
116132
117- # Query the "dumb" way
133+ # Do the same query the "dumb" way, by checking all points
118134 in_rect = (
119- (df["x"] >= orig_rect[0][0])
120- & (df["x"] <= orig_rect[1][0])
121- & (df["y"] >= orig_rect[0][1])
122- & (df["y"] <= orig_rect[1][1])
135+ (df ["x" ] >= orig_rect [0 ][0 ] + EXACT_BOUNDARY_EPSILON )
136+ & (df ["x" ] <= orig_rect [1 ][0 ] - EXACT_BOUNDARY_EPSILON )
137+ & (df ["y" ] >= orig_rect [0 ][1 ] + EXACT_BOUNDARY_EPSILON )
138+ & (df ["y" ] <= orig_rect [1 ][1 ] - EXACT_BOUNDARY_EPSILON )
123139 )
124140 dumb_df_subset = df .loc [in_rect ]
141+ # Get the row indices of the points in the rectangle
142+ # (these are the indices in the original dataframe)
125143 exact_row_indices = dumb_df_subset .index .tolist ()
126144
127- diff_rows = set(estimated_row_indices) - set(exact_row_indices)
128- # print("Rows in estimated but not exact:", diff_rows)
129- print(df.iloc[list(diff_rows)])
130- raise NotImplementedError("Debugging")
145+ # Query 2: Do a second check, this time against x_uint/y_uint (the normalized coordinates)
146+ norm_rect = [
147+ orig_coord_to_norm_coord (orig_rect [0 ], orig_x_min = x_min , orig_x_max = x_max , orig_y_min = y_min , orig_y_max = y_max ),
148+ orig_coord_to_norm_coord (orig_rect [1 ], orig_x_min = x_min , orig_x_max = x_max , orig_y_min = y_min , orig_y_max = y_max )
149+ ]
150+
151+ norm_rect_rounded = [
152+ # TODO: should we use floor/ceil instead of round?
153+ [np .floor (norm_rect [0 ][0 ]), np .floor (norm_rect [0 ][1 ])],
154+ [np .floor (norm_rect [1 ][0 ]), np .floor (norm_rect [1 ][1 ])]
155+ ]
156+
157+ in_rect_norm = (
158+ (df ["x_uint" ] >= norm_rect_rounded [0 ][0 ] + EXACT_BOUNDARY_EPSILON )
159+ & (df ["x_uint" ] < norm_rect_rounded [1 ][0 ] - EXACT_BOUNDARY_EPSILON )
160+ & (df ["y_uint" ] >= norm_rect_rounded [0 ][1 ] + EXACT_BOUNDARY_EPSILON )
161+ & (df ["y_uint" ] < norm_rect_rounded [1 ][1 ] - EXACT_BOUNDARY_EPSILON )
162+ )
163+ dumb_df_subset_norm = df .loc [in_rect_norm ]
164+ # Get the row indices of the points in the rectangle
165+ # (these are the indices in the original dataframe)
166+ exact_row_indices_norm = dumb_df_subset_norm .index .tolist ()
167+
168+ # A.issubset(B)
169+ # True if A is a subset of B and False otherwise.
170+ assert set (exact_row_indices_norm ).issubset (set (estimated_row_indices ))
131171
132172 # Check that the estimated rows contain all of the exact rows.
133- assert len(set(exact_row_indices).intersection(set(estimated_row_indices))) == 0
134- assert len(exact_row_indices) <= 1123 # TODO: update
135- assert len(estimated_row_indices) <= 1163 # TODO: update
173+ assert len (exact_row_indices ) == 16678
174+ assert len (estimated_row_indices ) <= 17643
136175
137- """
138176
139177
140178
0 commit comments