import tempfile
import uuid
import warnings
+ from dataclasses import asdict, dataclass
from datetime import datetime, timezone
- from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast

import numpy as np
import pandas
@@ -55,6 +56,12 @@ class SparkOfflineStoreConfig(FeastConfigBaseModel):
    """ AWS Region if applicable for s3-based staging locations"""


+ @dataclass(frozen=True)
+ class SparkFeatureViewQueryContext(offline_utils.FeatureViewQueryContext):
+     min_date_partition: Optional[str]
+     max_date_partition: str
+
+
class SparkOfflineStore(OfflineStore):
    @staticmethod
    def pull_latest_from_table_or_query(
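The frozen SparkFeatureViewQueryContext added above extends offline_utils.FeatureViewQueryContext with two pre-formatted partition bounds. A minimal sketch of the same extend-and-rebuild pattern, using toy stand-in classes rather than Feast's real ones (BaseContext and PartitionedContext are invented here purely for illustration):

from dataclasses import asdict, dataclass
from typing import Optional

# Toy stand-ins for the real query-context classes, for illustration only.
@dataclass(frozen=True)
class BaseContext:
    name: str
    max_event_timestamp: str

@dataclass(frozen=True)
class PartitionedContext(BaseContext):
    min_date_partition: Optional[str]
    max_date_partition: str

# Rebuild an existing (immutable) context with the extra partition fields,
# the same way get_historical_features() below rebuilds each query context.
base = BaseContext(name="driver_stats", max_event_timestamp="2024-05-03 12:00:00")
ctx = PartitionedContext(**asdict(base), min_date_partition=None, max_date_partition="20240503")
print(ctx.max_date_partition)  # 20240503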
@@ -101,6 +108,7 @@ def pull_latest_from_table_or_query(
        aliases_as_string = ", ".join(aliases)

        date_partition_column = data_source.date_partition_column
+         date_partition_column_format = data_source.date_partition_column_format

        start_date_str = _format_datetime(start_date)
        end_date_str = _format_datetime(end_date)
@@ -112,7 +120,7 @@ def pull_latest_from_table_or_query(
        SELECT {fields_as_string},
        ROW_NUMBER() OVER({partition_by_join_key_string} ORDER BY {timestamp_desc_string}) AS feast_row_
        FROM {from_expression} t1
-         WHERE {timestamp_field} BETWEEN TIMESTAMP('{start_date_str}') AND TIMESTAMP('{end_date_str}'){" AND " + date_partition_column + " >= '" + start_date.strftime("%Y-%m-%d") + "' AND " + date_partition_column + " <= '" + end_date.strftime("%Y-%m-%d") + "' " if date_partition_column != "" and date_partition_column is not None else ""}
+         WHERE {timestamp_field} BETWEEN TIMESTAMP('{start_date_str}') AND TIMESTAMP('{end_date_str}'){" AND " + date_partition_column + " >= '" + start_date.strftime(date_partition_column_format) + "' AND " + date_partition_column + " <= '" + end_date.strftime(date_partition_column_format) + "' " if date_partition_column != "" and date_partition_column is not None else ""}
        ) t2
        WHERE feast_row_ = 1
        """
@@ -136,8 +144,12 @@ def get_historical_features(
        full_feature_names: bool = False,
    ) -> RetrievalJob:
        assert isinstance(config.offline_store, SparkOfflineStoreConfig)
+         date_partition_column_formats = []
        for fv in feature_views:
            assert isinstance(fv.batch_source, SparkSource)
+             date_partition_column_formats.append(
+                 fv.batch_source.date_partition_column_format
+             )

        warnings.warn(
            "The spark offline store is an experimental feature in alpha development. "
@@ -186,8 +198,27 @@ def get_historical_features(
            entity_df_event_timestamp_range,
        )

+         spark_query_context = [
+             SparkFeatureViewQueryContext(
+                 **asdict(context),
+                 min_date_partition=datetime.fromisoformat(
+                     context.min_event_timestamp
+                 ).strftime(date_format)
+                 if context.min_event_timestamp is not None
+                 else None,
+                 max_date_partition=datetime.fromisoformat(
+                     context.max_event_timestamp
+                 ).strftime(date_format),
+             )
+             for date_format, context in zip(
+                 date_partition_column_formats, query_context
+             )
+         ]
+
        query = offline_utils.build_point_in_time_query(
-             feature_view_query_contexts=query_context,
+             feature_view_query_contexts=cast(
+                 List[offline_utils.FeatureViewQueryContext], spark_query_context
+             ),
            left_table_query_string=tmp_entity_df_table_name,
            entity_df_event_timestamp_col=event_timestamp_col,
            entity_df_columns=entity_schema.keys(),
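The query contexts carry their event-timestamp bounds as ISO-8601 strings, so the comprehension above reparses them with datetime.fromisoformat and reformats them with the matching source's partition format; when there is no lower bound, min_date_partition stays None. In isolation, with illustrative values only:

from datetime import datetime

# Illustrative values: convert ISO-8601 bounds into partition strings.
max_event_timestamp = "2024-05-03 12:00:00"
min_event_timestamp = None  # no lower bound, presumably the ttl == 0 case below

fmt = "%Y%m%d"  # a per-source date_partition_column_format
max_date_partition = datetime.fromisoformat(max_event_timestamp).strftime(fmt)
min_date_partition = (
    datetime.fromisoformat(min_event_timestamp).strftime(fmt)
    if min_event_timestamp is not None
    else None
)
print(max_date_partition, min_date_partition)  # 20240503 None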
@@ -651,13 +682,13 @@ def _cast_data_frame(
FROM {{ featureview.table_subquery }}
WHERE {{ featureview.timestamp_field }} <= '{{ featureview.max_event_timestamp }}'
{% if featureview.date_partition_column != "" and featureview.date_partition_column is not none %}
- AND {{ featureview.date_partition_column }} <= '{{ featureview.max_event_timestamp[:10] }}'
+ AND {{ featureview.date_partition_column }} <= '{{ featureview.max_date_partition }}'
{% endif %}

{% if featureview.ttl == 0 %}{% else %}
AND {{ featureview.timestamp_field }} >= '{{ featureview.min_event_timestamp }}'
{% if featureview.date_partition_column != "" and featureview.date_partition_column is not none %}
- AND {{ featureview.date_partition_column }} >= '{{ featureview.min_event_timestamp[:10] }}'
+ AND {{ featureview.date_partition_column }} >= '{{ featureview.min_date_partition }}'
{% endif %}
{% endif %}
),
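With the template now reading the pre-formatted min_date_partition / max_date_partition instead of slicing max_event_timestamp[:10], partition columns that do not use "%Y-%m-%d" (for example "%Y%m%d") filter correctly. A reduced stand-in of the fragment, rendered with jinja2.Template against made-up values, shows the resulting predicates:

from types import SimpleNamespace
from jinja2 import Template

# Reduced, illustrative fragment: only the two partition predicates from the
# template above, rendered against a fake featureview object.
fragment = Template(
    """
{% if featureview.date_partition_column != "" and featureview.date_partition_column is not none %}
AND {{ featureview.date_partition_column }} <= '{{ featureview.max_date_partition }}'
AND {{ featureview.date_partition_column }} >= '{{ featureview.min_date_partition }}'
{% endif %}
"""
)

fv = SimpleNamespace(
    date_partition_column="ds",
    max_date_partition="20240503",  # already formatted with date_partition_column_format
    min_date_partition="20240501",
)
print(fragment.render(featureview=fv).strip())
# AND ds <= '20240503'
# AND ds >= '20240501'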