@@ -18,6 +18,7 @@ def get_xy_extents(
1818) -> shapely .Polygon :
1919 """
2020 Get the bounding box of the x and y coordinates from a Parquet file.
21+ If the min/max statistics are missing from the Parquet metadata, the bounds are computed from the x and y column data instead, which may take longer.
2122
2223 Parameters
2324 ----------
@@ -33,22 +34,38 @@ def get_xy_extents(
3334 shapely.Polygon
3435 A polygon representing the bounding box of the x and y coordinates.
3536 """
36- # Get index of columns of parquet file
37+ # Process row groups
3738 metadata = pq .read_metadata (filepath )
3839 schema_idx = dict (map (reversed , enumerate (metadata .schema .names )))
39-
4040 # Find min and max values across all row groups
4141 x_max = - 1
4242 x_min = sys .maxsize
4343 y_max = - 1
4444 y_min = sys .maxsize
45- for i in range (metadata .num_row_groups ):
46- group = metadata .row_group (i )
47- x_min = min (x_min , group .column (schema_idx [x ]).statistics .min )
48- x_max = max (x_max , group .column (schema_idx [x ]).statistics .max )
49- y_min = min (y_min , group .column (schema_idx [y ]).statistics .min )
50- y_max = max (y_max , group .column (schema_idx [y ]).statistics .max )
51- bounds = shapely .box (x_min , y_min , x_max , y_max )
45+ group = metadata .row_group (0 )
46+ try :
47+ for i in range (metadata .num_row_groups ):
48+ # print('*1')
49+ group = metadata .row_group (i )
50+ x_min = min (x_min , group .column (schema_idx [x ]).statistics .min )
51+ x_max = max (x_max , group .column (schema_idx [x ]).statistics .max )
52+ y_min = min (y_min , group .column (schema_idx [y ]).statistics .min )
53+ y_max = max (y_max , group .column (schema_idx [y ]).statistics .max )
54+ bounds = shapely .box (x_min , y_min , x_max , y_max )
55+ # If statistics are not available, compute them manually from the data
56+ except :
57+ import gc
58+ print ("metadata lacks the statistics of the tile's bounding box, computing might take longer!" )
59+ parquet_file = pd .read_parquet (filepath )
60+ x_col = parquet_file .loc [:, x ]
61+ y_col = parquet_file .loc [:, y ]
62+ del parquet_file
63+ gc .collect ()
64+ x_min = x_col .min ()
65+ x_min = y_col .min ()
66+ x_max = x_col .max ()
67+ x_max = y_col .max ()
68+ bounds = shapely .geometry .box (x_min , y_min , x_max , y_max )
5269 return bounds
5370
5471
0 commit comments