+import glob
 import os
+import time
 import zipfile
 from datetime import datetime
 from enum import Enum
@@ -118,12 +120,31 @@ def check_source_cache_file_exists(
             table_name (str): The name of the table of source data.
             load_type (LoadType): The destination type of the file (either SOURCE_CACHE or PIPELINE_CACHE).
         """
+        start_time = time.time()
+        print(
+            f" FileManager.check_source_cache_file_exists: Checking for {table_name}"
+        )
+
         directory = (
             self.source_cache_directory
             if load_type == LoadType.SOURCE_CACHE
             else self.pipeline_cache_directory
         )
-        return len([file for file in os.listdir(directory) if table_name in file]) > 0
+        # Use glob pattern matching for more efficient file searching
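+        # (the pattern below matches any .parquet file whose name contains table_name)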
+        pattern = os.path.join(directory, f"*{table_name}*.parquet")
+
+        glob_start = time.time()
+        files = glob.glob(pattern)
+        glob_time = time.time() - glob_start
+
+        result = len(files) > 0
+        total_time = time.time() - start_time
+
+        print(
+            f" FileManager.check_source_cache_file_exists: Found {len(files)} files in {glob_time:.2f}s (total: {total_time:.2f}s)"
+        )
+        return result
 
     def get_most_recent_cache(self, table_name: str) -> gpd.GeoDataFrame | None:
         """
@@ -134,25 +155,47 @@ def get_most_recent_cache(self, table_name: str) -> gpd.GeoDataFrame | None:
             GeoDataFrame: The dataframe loaded from the most recent cached file.
             None: If no files exist for the given table name.
         """
-        cached_files = [
-            file
-            for file in os.listdir(self.source_cache_directory)
-            if table_name in file
-        ]
+        start_time = time.time()
+        print(
+            f" FileManager.get_most_recent_cache: Loading most recent cache for {table_name}"
+        )
+
+        # Use glob pattern matching for more efficient file searching
+        pattern = os.path.join(self.source_cache_directory, f"*{table_name}*.parquet")
+
+        glob_start = time.time()
+        cached_files = glob.glob(pattern)
+        glob_time = time.time() - glob_start
 
         if not cached_files:
+            print(" FileManager.get_most_recent_cache: No cached files found")
             return None
 
-        cached_files.sort(
-            key=lambda x: os.path.getmtime(
-                os.path.join(self.source_cache_directory, x)
-            ),
-            reverse=True,
+        # Get the most recent file by modification time
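+        # (a single max() pass over mtimes avoids sorting the whole list; glob
+        # returns full paths, so os.path.getmtime can be applied directly)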
+        mtime_start = time.time()
+        most_recent_file = max(cached_files, key=os.path.getmtime)
+        mtime_time = time.time() - mtime_start
+
+        print(
+            f" FileManager.get_most_recent_cache: Found {len(cached_files)} files, most recent: {os.path.basename(most_recent_file)}"
+        )
+        print(
+            f" FileManager.get_most_recent_cache: Glob took {glob_time:.2f}s, mtime check took {mtime_time:.2f}s"
+        )
+
+        # Load the parquet file
+        load_start = time.time()
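+        # gpd.read_parquet restores the geometry column and CRS stored in the file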
+        gdf = gpd.read_parquet(most_recent_file)
+        load_time = time.time() - load_start
+
+        total_time = time.time() - start_time
+        print(
+            f" FileManager.get_most_recent_cache: Parquet load took {load_time:.2f}s (total: {total_time:.2f}s)"
         )
-        most_recent_file = cached_files[0]
-        file_path = self.get_file_path(most_recent_file, LoadType.SOURCE_CACHE)
 
-        return gpd.read_parquet(file_path)
+        return gdf
 
     def load_gdf(
         self, file_name: str, load_type: LoadType, file_type: FileType | None = None
@@ -194,16 +237,40 @@ def save_gdf(
             file_type (FileType): The type of the file (GEOJSON or PARQUET).
             load_type (LoadType): The destination type of the file (TEMP or CACHE).
         """
+        start_time = time.time()
+        print(f" FileManager.save_gdf: Starting save for {file_name}")
+
         file_path = self.get_file_path(file_name, load_type, file_type)
+        print(f" FileManager.save_gdf: Target path: {file_path}")
+
         if file_type == FileType.PARQUET:
+            print(
+                f" FileManager.save_gdf: Writing parquet file ({len(gdf)} rows, {len(gdf.columns)} columns)"
+            )
+            parquet_start = time.time()
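+            # index=False keeps the dataframe index out of the written file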
             gdf.to_parquet(file_path, index=False)
+            parquet_time = time.time() - parquet_start
+            print(f" FileManager.save_gdf: Parquet write took {parquet_time:.2f}s")
         elif file_type == FileType.GEOJSON:
+            print(" FileManager.save_gdf: Writing GeoJSON file")
+            geojson_start = time.time()
             gdf.to_file(file_path, driver="GeoJSON")
+            geojson_time = time.time() - geojson_start
+            print(f" FileManager.save_gdf: GeoJSON write took {geojson_time:.2f}s")
         elif file_type == FileType.CSV:
+            print(" FileManager.save_gdf: Writing CSV file")
+            csv_start = time.time()
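+            # to_csv serializes geometries as WKT text and, unlike the parquet
+            # branch above, writes the dataframe index as well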
             gdf.to_csv(file_path)
+            csv_time = time.time() - csv_start
+            print(f" FileManager.save_gdf: CSV write took {csv_time:.2f}s")
         else:
             raise ValueError(f"Unsupported file type: {file_type}")
 
+        total_time = time.time() - start_time
+        print(f" FileManager.save_gdf: Total save operation took {total_time:.2f}s")
+
     def save_fractional_gdf(
         self,
         gdf: gpd.GeoDataFrame,