@@ -8,10 +8,7 @@
 import dataframely as dy
 import polars as pl
 import pyarrow
-import pyarrow.dataset as pd
 import pyarrow.parquet as pq
-import pyarrow.compute as pc
-from dateutil.relativedelta import relativedelta
 
 from lamp_py.aws.s3 import download_file, upload_file
 from lamp_py.aws.kinesis import KinesisReader
@@ -24,7 +21,7 @@
 )
 
 RFC3339_DATE_REGEX = r"^20(?:([1-3][0-9]-[0-1][0-9]-[0-3][0-9]))"  # up to 2039-19-39
-RFC3339_DATETIME_REGEX = RFC3339_DATE_REGEX + r"T([0-2][0-9]:[0-5][0-9]:[0-5][0-9](?:\.\d+)?)(Z|[\+-]\d{2}:\d{2})?$"
+RFC3339_DATETIME_REGEX = RFC3339_DATE_REGEX + r"[T ]([0-2][0-9]:[0-5][0-9]:[0-5][0-9](?:\.\d+)?)(Z|[\+-]\d{2}:\d{2})?$"
 GTFS_TIME_REGEX = r"^([0-9]{2}):([0-5][0-9]):([0-5][0-9])$"  # clock can be greater than 24 hours
 
 user = dy.Struct(
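A quick spot-check of the loosened separator (a minimal sketch; the sample timestamps are made up): the `[T ]` character class accepts either the strict RFC3339 `T` or the space variant some producers emit.

```python
import re

RFC3339_DATE_REGEX = r"^20(?:([1-3][0-9]-[0-1][0-9]-[0-3][0-9]))"
RFC3339_DATETIME_REGEX = RFC3339_DATE_REGEX + r"[T ]([0-2][0-9]:[0-5][0-9]:[0-5][0-9](?:\.\d+)?)(Z|[\+-]\d{2}:\d{2})?$"

# "T" separator, space separator, and a bogus separator
for ts in ("2024-06-01T12:34:56Z", "2024-06-01 12:34:56-04:00", "2024-06-01x12:34:56Z"):
    print(ts, bool(re.match(RFC3339_DATETIME_REGEX, ts)))
# -> True, True, False
```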
@@ -67,7 +64,7 @@ class GlidesRecord(dy.Schema):
     id = dy.String()
     type = dy.String()
     time = dy.Datetime(  # in %Y-%m-%dT%H:%M:%S%:z format before serialization
-        min=datetime(2024, 1, 1), max=datetime(2039, 12, 31)  # within Python's serializable range
+        min=datetime(2024, 1, 1), max=datetime(2039, 12, 31), time_unit="ms"  # within Python's serializable range
     )
     source = dy.String()
     specversion = dy.String()
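For context, a minimal dataframely sketch of what the bounded, millisecond-resolution `time` column buys (the schema name and data here are hypothetical; `min`, `max`, and `time_unit` are taken from the diff above): rows whose `time` falls outside the declared range are filtered out rather than failing downstream.

```python
from datetime import datetime

import dataframely as dy
import polars as pl

class EventSchema(dy.Schema):  # hypothetical stand-in for GlidesRecord
    time = dy.Datetime(min=datetime(2024, 1, 1), max=datetime(2039, 12, 31), time_unit="ms")

df = pl.DataFrame({"time": [datetime(2025, 5, 1, 12), datetime(2010, 1, 1)]}).with_columns(
    pl.col("time").cast(pl.Datetime("ms"))  # match the declared time_unit
)
valid, failures = EventSchema.filter(df)
print(valid.height)  # -> 1; the out-of-range 2010 row lands in `failures`
```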
@@ -213,57 +210,36 @@ def download_remote(self) -> None:
         download_file(object_path=self.remote_path, file_name=self.local_path)
 
     @abstractmethod
-    def convert_records(self) -> pd.Dataset:
+    def convert_records(self) -> dy.DataFrame[GlidesRecord]:
         """Convert incoming records into a flattened table of records"""
 
     def append_records(self) -> None:
         """Add incoming records to a local parquet file"""
         process_logger = ProcessLogger(process_name="append_glides_records", type=self.type)
         process_logger.log_start()
 
-        new_dataset = self.convert_records()
+        new_dataset = self.convert_records().lazy()
 
         if os.path.exists(self.local_path):
-            remote_records = pd.dataset(self.local_path, schema=self.get_table_schema)
-            joined_ds = pd.dataset([new_dataset, remote_records])
+            remote_records = self.table_schema.scan_parquet(self.local_path, validation="allow")
+            joined_ds = pl.concat([new_dataset, remote_records])
         else:
             joined_ds = new_dataset
 
         process_logger.add_metadata(
-            new_records=new_dataset.count_rows(),
-            total_records=joined_ds.count_rows(),
+            new_records=new_dataset.select("time").count().collect().item(),
+            total_records=joined_ds.select("time").count().collect().item(),
         )
 
-        now = datetime.now()
-        start = datetime(2024, 1, 1)
-
         with tempfile.TemporaryDirectory() as tmp_dir:
 
             new_path = os.path.join(tmp_dir, self.base_filename)
-            row_group_count = 0
-            with pq.ParquetWriter(new_path, schema=self.get_table_schema) as writer:
-                while start < now:
-                    end = start + relativedelta(months=1)
-                    if end < now:
-                        row_group = pl.DataFrame(
-                            joined_ds.filter((pc.field("time") >= start) & (pc.field("time") < end)).to_table()
-                        )
-
-                    else:
-                        row_group = pl.DataFrame(joined_ds.filter((pc.field("time") >= start)).to_table())
-
-                    if not row_group.is_empty():
-                        unique_table = (
-                            row_group.unique(keep="first").sort(by=["time"]).to_arrow().cast(self.get_table_schema)
-                        )
-
-                        row_group_count += 1
-                        writer.write_table(unique_table)
-
-                    start = end
-
-            os.replace(new_path, self.local_path)
-            process_logger.add_metadata(row_group_count=row_group_count)
+            sorted_ds = joined_ds.unique().sort("time")
+            valid = process_logger.log_dataframely_filter_results(*self.table_schema.filter(sorted_ds))
+            if not valid.is_empty():
+                pq.write_table(valid.to_arrow().cast(self.get_table_schema), new_path)
+                os.replace(new_path, self.local_path)
+                process_logger.add_metadata(row_count=pq.read_metadata(self.local_path).num_rows)
 
         process_logger.log_complete()
 
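The rewritten merge step above replaces the month-by-month row-group loop with a single lazy pipeline. A condensed sketch of the same flow in plain polars (the `append_rows` helper is hypothetical, and the dataframely validation step is elided):

```python
import os
import tempfile

import polars as pl

def append_rows(new_rows: pl.LazyFrame, local_path: str) -> None:
    # union new rows with what is already on disk
    if os.path.exists(local_path):
        joined = pl.concat([new_rows, pl.scan_parquet(local_path)])
    else:
        joined = new_rows
    # dedupe, order by event time, then atomically swap the file into place
    merged = joined.unique().sort("time").collect()
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = os.path.join(tmp_dir, "merged.parquet")
        merged.write_parquet(tmp_path)
        os.replace(tmp_path, local_path)
```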
@@ -290,15 +266,17 @@ def __init__(self) -> None:
     def unique_key(self) -> str:
         return "changes"
 
-    def convert_records(self) -> pd.Dataset:
+    def convert_records(self) -> dy.DataFrame[GlidesRecord]:
         process_logger = ProcessLogger(process_name="convert_records", type=self.type)
         process_logger.log_start()
 
         editors_table = pyarrow.Table.from_pylist(self.records, schema=self.get_event_schema)
         editors_table = flatten_table_schema(editors_table)
         editors_table = explode_table_column(editors_table, "data.changes")
         editors_table = flatten_table_schema(editors_table)
-        editors_dataset = pd.dataset(editors_table)
+        editors_dataset = process_logger.log_dataframely_filter_results(
+            *EditorChangesTable.filter(pl.DataFrame(editors_table))
+        )
 
         process_logger.log_complete()
         return editors_dataset
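Each `convert_records` now funnels its flattened table through a dataframely `filter` call. The `*` unpacking works because `Schema.filter` returns a `(valid_rows, failure_info)` pair; a toy illustration (schema and data hypothetical, assuming dataframely's documented API):

```python
import dataframely as dy
import polars as pl

class ChangeSchema(dy.Schema):  # toy stand-in for EditorChangesTable
    id = dy.String(nullable=False)

valid, failures = ChangeSchema.filter(pl.DataFrame({"id": ["1", None]}))
print(valid.height)       # -> 1
print(failures.counts())  # per-rule failure counts for the dropped row
```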
@@ -322,12 +300,14 @@ def __init__(self) -> None:
     def unique_key(self) -> str:
         return "operator"
 
-    def convert_records(self) -> pd.Dataset:
+    def convert_records(self) -> dy.DataFrame[GlidesRecord]:
         process_logger = ProcessLogger(process_name="convert_records", type=self.type)
         process_logger.log_start()
         osi_table = pyarrow.Table.from_pylist(self.records, schema=self.get_event_schema)
         osi_table = flatten_table_schema(osi_table)
-        osi_dataset = pd.dataset(osi_table)
+        osi_dataset = process_logger.log_dataframely_filter_results(
+            *OperatorSignInsTable.filter(pl.DataFrame(osi_table))
+        )
 
         process_logger.log_complete()
         return osi_dataset
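The `flatten_table_schema` helper used above is lamp_py's own; for intuition, plain pyarrow's `Table.flatten()` performs the core struct-to-dotted-column step (the sample record is made up):

```python
import pyarrow as pa

table = pa.Table.from_pylist(
    [{"id": "1", "data": {"operator": {"badge": "123"}, "signedInAt": "2025-05-01T06:00:00Z"}}]
)
# each flatten() pass expands one level of struct columns into "parent.child" columns
print(table.flatten().flatten().column_names)
# -> ['id', 'data.operator.badge', 'data.signedInAt']
```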
@@ -348,7 +328,7 @@ def __init__(self) -> None:
     def unique_key(self) -> str:
         return "tripUpdates"
 
-    def convert_records(self) -> pd.Dataset:
+    def convert_records(self) -> dy.DataFrame[GlidesRecord]:
         def flatten_multitypes(record: Dict) -> Dict:
             """
             For each update in a record, flatten out the objects in "cars",
@@ -374,7 +354,7 @@ def flatten_multitypes(record: Dict) -> Dict:
         tu_table = flatten_table_schema(tu_table)
         tu_table = explode_table_column(tu_table, "data.tripUpdates")
         tu_table = flatten_table_schema(tu_table)
-        tu_dataset = pd.dataset(tu_table)
+        tu_dataset = process_logger.log_dataframely_filter_results(*TripUpdatesTable.filter(pl.DataFrame(tu_table)))
 
         process_logger.log_complete()
         return tu_dataset
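`explode_table_column(tu_table, "data.tripUpdates")` turns one row per event into one row per trip update. A rough polars equivalent of that reshaping (toy data; the helper's exact behavior is assumed):

```python
import polars as pl

events = pl.DataFrame(
    {"id": ["e1"], "tripUpdates": [[{"tripKey": "t1"}, {"tripKey": "t2"}]]}
)
# explode the list column to one row per update, then unnest the struct fields
per_update = events.explode("tripUpdates").unnest("tripUpdates")
print(per_update)  # two rows: (e1, t1) and (e1, t2)
```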
@@ -398,13 +378,15 @@ def __init__(self) -> None:
     def unique_key(self) -> str:
         return "tripKey"
 
-    def convert_records(self) -> pd.Dataset:
+    def convert_records(self) -> dy.DataFrame[GlidesRecord]:
         process_logger = ProcessLogger(process_name="convert_records", type=self.type)
         process_logger.log_start()
 
         tu_table = pyarrow.Table.from_pylist(self.records, schema=self.get_event_schema)
         tu_table = flatten_table_schema(tu_table)
-        tu_dataset = pd.dataset(tu_table)
+        tu_dataset = process_logger.log_dataframely_filter_results(
+            *VehicleTripAssignmentTable.filter(pl.DataFrame(tu_table))
+        )
 
         process_logger.log_complete()
         return tu_dataset
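One note on the new `row_count` metadata in `append_records`: `pq.read_metadata` reads only the parquet footer, so the count is cheap even for large files (the path below is a stand-in):

```python
import pyarrow.parquet as pq

meta = pq.read_metadata("glides.parquet")  # footer only, no row data loaded
print(meta.num_rows, meta.num_row_groups)
```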