@@ -3,7 +3,7 @@ use crate::{
     bq_analytics::gcs_handler::upload_parquet_to_gcs,
     gap_detectors::ProcessingResult,
     utils::{
-        counters::{PARQUET_HANDLER_BUFFER_SIZE, PARQUET_STRUCT_SIZE},
+        counters::{PARQUET_HANDLER_CURRENT_BUFFER_SIZE, PARQUET_STRUCT_SIZE},
         util::naive_datetime_to_timestamp,
     },
 };
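
Note: a gauge vector like `PARQUET_HANDLER_CURRENT_BUFFER_SIZE` needs one label per dimension it is written with below (processor name and table name). A minimal sketch of such a definition, assuming the `prometheus` and `once_cell` crates; the metric name, help text, and label names are illustrative assumptions, not the repo's actual definition in `utils::counters`:

```rust
use once_cell::sync::Lazy;
use prometheus::{register_int_gauge_vec, IntGaugeVec};

// Hypothetical definition; the real one lives in utils::counters.
pub static PARQUET_HANDLER_CURRENT_BUFFER_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!(
        "parquet_handler_current_buffer_size",
        "Current parquet handler buffer size in bytes, per processor and table",
        &["processor_name", "table_name"]
    )
    .unwrap()
});
```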
@@ -23,7 +23,6 @@ use tracing::{debug, error, info};
 #[derive(Debug, Default, Clone)]
 pub struct ParquetDataGeneric<ParquetType> {
     pub data: Vec<ParquetType>,
-    pub transaction_version_to_struct_count: AHashMap<i64, i64>,
 }
 
 pub trait NamedTable {
@@ -71,6 +70,7 @@
     pub upload_interval: Duration,
     pub max_buffer_size: usize,
     pub last_upload_time: Instant,
+    pub processor_name: String,
 }
 fn create_new_writer(schema: Arc<Type>) -> Result<SerializedFileWriter<Vec<u8>>> {
     let props = WriterProperties::builder()
@@ -103,6 +103,7 @@ where
         schema: Arc<Type>,
         upload_interval: Duration,
         max_buffer_size: usize,
+        processor_name: String,
     ) -> Result<Self> {
         // had to append unique id to avoid concurrent write issues
         let writer = create_new_writer(schema.clone())?;
@@ -119,6 +120,7 @@ where
             upload_interval,
             max_buffer_size,
             last_upload_time: Instant::now(),
+            processor_name,
         })
     }
 
@@ -128,47 +130,54 @@ where
         changes: ParquetDataGeneric<ParquetType>,
     ) -> Result<()> {
         let parquet_structs = changes.data;
-        self.transaction_version_to_struct_count
-            .extend(changes.transaction_version_to_struct_count);
-
+        let processor_name = self.processor_name.clone();
         for parquet_struct in parquet_structs {
             let size_of_struct = allocative::size_of_unique(&parquet_struct);
             PARQUET_STRUCT_SIZE
-                .with_label_values(&[ParquetType::TABLE_NAME])
+                .with_label_values(&[&processor_name, ParquetType::TABLE_NAME])
                 .set(size_of_struct as i64);
             self.buffer_size_bytes += size_of_struct;
             self.buffer.push(parquet_struct);
 
             if self.buffer_size_bytes >= self.max_buffer_size {
-                info!("Max buffer size reached, uploading to GCS.");
+                debug!(
+                    table_name = ParquetType::TABLE_NAME,
+                    buffer_size = self.buffer_size_bytes,
+                    max_buffer_size = self.max_buffer_size,
+                    "Max buffer size reached, uploading to GCS."
+                );
                 if let Err(e) = self.upload_buffer(gcs_client).await {
                     error!("Failed to upload buffer: {}", e);
                     return Err(e);
                 }
                 self.last_upload_time = Instant::now();
             }
+        }
 
-            if self.last_upload_time.elapsed() >= self.upload_interval {
-                info!(
-                    "Time has elapsed more than {} since last upload.",
-                    self.upload_interval.as_secs()
-                );
-                if let Err(e) = self.upload_buffer(gcs_client).await {
-                    error!("Failed to upload buffer: {}", e);
-                    return Err(e);
-                }
-                self.last_upload_time = Instant::now();
+        if self.last_upload_time.elapsed() >= self.upload_interval {
+            info!(
+                "Time has elapsed more than {} since last upload for {}",
+                self.upload_interval.as_secs(),
+                ParquetType::TABLE_NAME
+            );
+            if let Err(e) = self.upload_buffer(gcs_client).await {
+                error!("Failed to upload buffer: {}", e);
+                return Err(e);
             }
+            self.last_upload_time = Instant::now();
         }
 
-        PARQUET_HANDLER_BUFFER_SIZE
-            .with_label_values(&[ParquetType::TABLE_NAME])
-            .set(self.buffer.len() as i64);
+        PARQUET_HANDLER_CURRENT_BUFFER_SIZE
+            .with_label_values(&[&self.processor_name, ParquetType::TABLE_NAME])
+            .set(self.buffer_size_bytes as i64);
+
         Ok(())
     }
 
     async fn upload_buffer(&mut self, gcs_client: &GCSClient) -> Result<()> {
+        // This is to cover the case when interval duration has passed but buffer is empty
         if self.buffer.is_empty() {
+            debug!("Buffer is empty, skipping upload.");
             return Ok(());
         }
         let start_version = self
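
Note: the byte-based flush above relies on `allocative::size_of_unique`, which, unlike `std::mem::size_of`, also counts heap data the value uniquely owns. A small self-contained sketch of that behavior, assuming the `allocative` crate with its derive macro enabled; `ExampleRow` is a made-up type for illustration:

```rust
use allocative::Allocative;

#[derive(Allocative)]
struct ExampleRow {
    txn_version: i64,
    payload: String, // heap allocation is included in the measurement
}

fn main() {
    let row = ExampleRow {
        txn_version: 42,
        payload: "x".repeat(1024),
    };
    // Counts the struct itself plus the 1024 heap bytes owned by `payload`,
    // which is what makes byte-based (rather than row-count) flushing meaningful.
    let bytes = allocative::size_of_unique(&row);
    assert!(bytes >= 1024);
    println!("approximate unique size: {bytes} bytes");
}
```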
@@ -183,9 +192,7 @@ where
         let end_version = last.version();
         let last_transaction_timestamp = naive_datetime_to_timestamp(last.get_timestamp());
 
-        let txn_version_to_struct_count =
-            process_struct_count_map(&self.buffer, &mut self.transaction_version_to_struct_count);
-
+        let parquet_processed_transactions = build_parquet_processed_transactions(&self.buffer);
         let struct_buffer = std::mem::take(&mut self.buffer);
 
         let mut row_group_writer = self
@@ -206,12 +213,6 @@ where
             .into_inner()
             .context("Failed to get inner buffer")?;
 
-        debug!(
-            table_name = ParquetType::TABLE_NAME,
-            start_version = start_version,
-            end_version = end_version,
-            "Max buffer size reached, uploading to GCS."
-        );
         let bucket_root = PathBuf::from(&self.bucket_root);
 
         upload_parquet_to_gcs(
@@ -220,6 +221,7 @@ where
             ParquetType::TABLE_NAME,
             &self.bucket_name,
             &bucket_root,
+            self.processor_name.clone(),
         )
         .await?;
 
@@ -229,7 +231,9 @@ where
             start_version,
             end_version,
             last_transaction_timestamp: Some(last_transaction_timestamp),
-            txn_version_to_struct_count,
+            txn_version_to_struct_count: None,
+            parquet_processed_structs: Some(parquet_processed_transactions),
+            table_name: ParquetType::TABLE_NAME.to_string(),
         };
 
         self.gap_detector_sender
@@ -243,19 +247,18 @@ where
     }
 }
 
-fn process_struct_count_map<ParquetType: NamedTable + HasVersion>(
+fn build_parquet_processed_transactions<ParquetType: NamedTable + HasVersion>(
     buffer: &[ParquetType],
-    txn_version_to_struct_count: &mut AHashMap<i64, i64>,
 ) -> AHashMap<i64, i64> {
     let mut txn_version_to_struct_count_for_gap_detector = AHashMap::new();
 
     for item in buffer.iter() {
         let version = item.version();
 
-        if let Some(count) = txn_version_to_struct_count.get(&(version)) {
-            txn_version_to_struct_count_for_gap_detector.insert(version, *count);
-            txn_version_to_struct_count.remove(&(version));
-        }
+        txn_version_to_struct_count_for_gap_detector
+            .entry(version)
+            .and_modify(|count| *count += 1)
+            .or_insert(1);
     }
     txn_version_to_struct_count_for_gap_detector
 }
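
Note: the rewritten helper derives per-version struct counts directly from the buffer in one pass via `entry().and_modify().or_insert(1)`, instead of consuming the side map that `ParquetDataGeneric` used to carry. A hedged usage sketch; `DummyRow` and the exact trait shapes are assumptions inferred from how `TABLE_NAME` and `version()` are used above:

```rust
// Stand-in type: the real NamedTable/HasVersion definitions live in this crate.
struct DummyRow(i64);

impl NamedTable for DummyRow {
    const TABLE_NAME: &'static str = "dummy_rows";
}

impl HasVersion for DummyRow {
    fn version(&self) -> i64 {
        self.0
    }
}

#[test]
fn counts_structs_per_version() {
    // Two structs at version 10, one at version 11.
    let buffer = vec![DummyRow(10), DummyRow(10), DummyRow(11)];
    let counts = build_parquet_processed_transactions(&buffer);
    // The gap detector sees exactly how many rows were written per transaction version.
    assert_eq!(counts.get(&10), Some(&2));
    assert_eq!(counts.get(&11), Some(&1));
}
```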