11from typing import Callable
2- from datetime import date , datetime , timedelta
2+ from datetime import date , timedelta
33import pyarrow
44import pyarrow .parquet as pq
55import pyarrow .dataset as pd
@@ -27,7 +27,8 @@ def __init__(
2727 remote_output_location : S3Location ,
2828 processed_schema : pyarrow .schema ,
2929 tableau_project_name : str ,
30- start_date : date | int | None = None ,
30+ num_days_ago : int | None = None ,
31+ start_date : date | None = None ,
3132 end_date : date | None = None ,
3233 partition_template : str = "year={yy}/month={mm}/day={dd}/" ,
3334 parquet_preprocess : Callable [[pyarrow .Table ], pyarrow .Table ] | None = None ,
@@ -49,11 +50,12 @@ def __init__(
4950 self .partition_template = partition_template
5051
5152 if start_date is not None and end_date is not None :
52- assert isinstance ( start_date , date )
53+ assert num_days_ago is None # only set num_days_ago or start_date and end_date
5354 assert start_date <= end_date
5455
5556 self .start_date = start_date
5657 self .end_date = end_date
58+ self .num_days_ago = num_days_ago
5759
5860 self .parquet_preprocess = parquet_preprocess # level 1 | complex preprocess
5961 self .parquet_filter = parquet_filter # level 2 | by column and simple filter
@@ -67,11 +69,11 @@ def create_parquet(self, _: DatabaseManager | None) -> None:
6769 self .update_parquet (None )
6870
6971 def update_parquet (self , _ : DatabaseManager | None ) -> bool :
70- return self .create_tableau_parquet (partition_template = self .partition_template )
72+ return self .create_tableau_parquet (partition_template = self .partition_template , num_days_ago = self . num_days_ago )
7173
7274 # pylint: disable=R0914, R0912
7375 # pylint too many local variables (more than 15)
74- def create_tableau_parquet (self , partition_template : str ) -> bool :
76+ def create_tableau_parquet (self , partition_template : str , num_days_ago : int | None ) -> bool :
7577 """
7678 Join files into a single parquet file for upload to Tableau. Apply filters and conversions as necessary.
7779
@@ -84,11 +86,6 @@ def create_tableau_parquet(self, partition_template: str) -> bool:
8486 True if parquet created, False otherwise
8587 """
8688 process_logger = ProcessLogger ("filtered_hyper_create_parquet" )
87- if isinstance (self .start_date , int ):
88- end_datetime = datetime .now ()
89- process_logger .add_metadata (now = end_datetime )
90- self .end_date = end_datetime .date ()
91- self .start_date = self .end_date - timedelta (days = self .start_date )
9289
9390 if self .start_date is not None and self .end_date is not None :
9491 # Limitation: filtered hyper only handles whole days.
@@ -103,6 +100,18 @@ def create_tableau_parquet(self, partition_template: str) -> bool:
103100 end_date = self .end_date ,
104101 start_date = self .start_date ,
105102 )
103+ process_logger .add_metadata (start_date = self .start_date , end_date = self .end_date )
104+ elif isinstance (num_days_ago , int ):
105+ end_date = date .today ()
106+ start_date = end_date - timedelta (days = num_days_ago )
107+ s3_uris = file_list_from_s3_date_range (
108+ bucket_name = self .remote_input_location .bucket ,
109+ file_prefix = self .remote_input_location .prefix ,
110+ path_template = partition_template ,
111+ end_date = end_date ,
112+ start_date = start_date ,
113+ )
114+ process_logger .add_metadata (start_date = start_date , end_date = end_date )
106115 else :
107116 s3_uris = file_list_from_s3 (
108117 bucket_name = self .remote_input_location .bucket ,
@@ -115,7 +124,6 @@ def create_tableau_parquet(self, partition_template: str) -> bool:
115124 format = "parquet" ,
116125 filesystem = S3FileSystem (),
117126 )
118- process_logger .add_metadata (start_date = self .start_date , end_date = self .end_date )
119127 process_logger .log_start ()
120128 if len (ds_paths ) == 0 :
121129 process_logger .add_metadata (n_paths_zero = len (ds_paths ))
0 commit comments