Skip to content

Commit 02e764d

Browse files
authored
Update _utils.py - fixed get_xy_extents for some datasets.
1 parent 91d863f commit 02e764d

File tree

1 file changed

+26
-9
lines changed

1 file changed

+26
-9
lines changed

src/segger/data/parquet/_utils.py

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ def get_xy_extents(
1818
) -> shapely.Polygon:
1919
"""
2020
Get the bounding box of the x and y coordinates from a Parquet file.
21+
If the min/max statistics are not available, compute them efficiently from the data.
2122
2223
Parameters
2324
----------
@@ -33,22 +34,38 @@ def get_xy_extents(
3334
shapely.Polygon
3435
A polygon representing the bounding box of the x and y coordinates.
3536
"""
36-
# Get index of columns of parquet file
37+
# Process row groups
3738
metadata = pq.read_metadata(filepath)
3839
schema_idx = dict(map(reversed, enumerate(metadata.schema.names)))
39-
4040
# Find min and max values across all row groups
4141
x_max = -1
4242
x_min = sys.maxsize
4343
y_max = -1
4444
y_min = sys.maxsize
45-
for i in range(metadata.num_row_groups):
46-
group = metadata.row_group(i)
47-
x_min = min(x_min, group.column(schema_idx[x]).statistics.min)
48-
x_max = max(x_max, group.column(schema_idx[x]).statistics.max)
49-
y_min = min(y_min, group.column(schema_idx[y]).statistics.min)
50-
y_max = max(y_max, group.column(schema_idx[y]).statistics.max)
51-
bounds = shapely.box(x_min, y_min, x_max, y_max)
45+
group = metadata.row_group(0)
46+
try:
47+
for i in range(metadata.num_row_groups):
48+
# print('*1')
49+
group = metadata.row_group(i)
50+
x_min = min(x_min, group.column(schema_idx[x]).statistics.min)
51+
x_max = max(x_max, group.column(schema_idx[x]).statistics.max)
52+
y_min = min(y_min, group.column(schema_idx[y]).statistics.min)
53+
y_max = max(y_max, group.column(schema_idx[y]).statistics.max)
54+
bounds = shapely.box(x_min, y_min, x_max, y_max)
55+
# If statistics are not available, compute them manually from the data
56+
except:
57+
import gc
58+
print("metadata lacks the statistics of the tile's bounding box, computing might take longer!")
59+
parquet_file = pd.read_parquet(filepath)
60+
x_col = parquet_file.loc[:, x]
61+
y_col = parquet_file.loc[:, y]
62+
del parquet_file
63+
gc.collect()
64+
x_min = x_col.min()
65+
y_min = y_col.min()
66+
x_max = x_col.max()
67+
y_max = y_col.max()
68+
bounds = shapely.geometry.box(x_min, y_min, x_max, y_max)
5269
return bounds
5370

5471

0 commit comments

Comments
 (0)