@@ -29,12 +29,20 @@ use itertools::Itertools;
29
29
use serde_json:: Value ;
30
30
use std:: {
31
31
collections:: { HashMap , HashSet } ,
32
+ num:: NonZeroU32 ,
32
33
sync:: Arc ,
33
34
} ;
34
35
use tracing:: error;
35
36
36
- use super :: EventFormat ;
37
- use crate :: { metadata:: SchemaVersion , storage:: StreamType , utils:: arrow:: get_field} ;
37
+ use super :: { EventFormat , LogSource } ;
38
+ use crate :: {
39
+ metadata:: SchemaVersion ,
40
+ storage:: StreamType ,
41
+ utils:: {
42
+ arrow:: get_field,
43
+ json:: { convert_array_to_object, flatten:: convert_to_array} ,
44
+ } ,
45
+ } ;
38
46
39
47
pub struct Event {
40
48
pub json : Value ,
@@ -64,34 +72,48 @@ impl EventFormat for Event {
64
72
self ,
65
73
stored_schema : & HashMap < String , Arc < Field > > ,
66
74
time_partition : Option < & String > ,
75
+ time_partition_limit : Option < NonZeroU32 > ,
76
+ custom_partition : Option < & String > ,
67
77
schema_version : SchemaVersion ,
78
+ log_source : & LogSource ,
68
79
) -> Result < ( Self :: Data , Vec < Arc < Field > > , bool ) , anyhow:: Error > {
69
- // incoming event may be a single json or a json array
70
- // but Data (type defined above) is a vector of json values
71
- // hence we need to convert the incoming event to a vector of json values
72
- let value_arr = match self . json {
73
- Value :: Array ( arr) => arr,
74
- value @ Value :: Object ( _) => vec ! [ value] ,
75
- _ => unreachable ! ( "flatten would have failed beforehand" ) ,
80
+ let flattened = if time_partition. is_some ( ) || custom_partition. is_some ( ) {
81
+ convert_array_to_object (
82
+ self . json ,
83
+ time_partition,
84
+ time_partition_limit,
85
+ custom_partition,
86
+ schema_version,
87
+ log_source,
88
+ ) ?
89
+ } else {
90
+ vec ! [ convert_to_array( convert_array_to_object(
91
+ self . json,
92
+ None ,
93
+ None ,
94
+ None ,
95
+ schema_version,
96
+ log_source,
97
+ ) ?) ?]
76
98
} ;
77
99
78
100
// collect all the keys from all the json objects in the request body
79
101
let fields =
80
- collect_keys ( value_arr . iter ( ) ) . expect ( "fields can be collected from array of objects" ) ;
102
+ collect_keys ( flattened . iter ( ) ) . expect ( "fields can be collected from array of objects" ) ;
81
103
82
104
let mut is_first = false ;
83
105
let schema = match derive_arrow_schema ( stored_schema, fields) {
84
106
Some ( schema) => schema,
85
107
_ => {
86
- let mut infer_schema = infer_json_schema_from_iterator ( value_arr . iter ( ) . map ( Ok ) )
108
+ let mut infer_schema = infer_json_schema_from_iterator ( flattened . iter ( ) . map ( Ok ) )
87
109
. map_err ( |err| {
88
110
anyhow ! ( "Could not infer schema for this event due to err {:?}" , err)
89
111
} ) ?;
90
112
let new_infer_schema = super :: update_field_type_in_schema (
91
113
Arc :: new ( infer_schema) ,
92
114
Some ( stored_schema) ,
93
115
time_partition,
94
- Some ( & value_arr ) ,
116
+ Some ( & flattened ) ,
95
117
schema_version,
96
118
) ;
97
119
infer_schema = Schema :: new ( new_infer_schema. fields ( ) . clone ( ) ) ;
@@ -110,7 +132,7 @@ impl EventFormat for Event {
110
132
}
111
133
} ;
112
134
113
- if value_arr
135
+ if flattened
114
136
. iter ( )
115
137
. any ( |value| fields_mismatch ( & schema, value, schema_version) )
116
138
{
@@ -119,7 +141,7 @@ impl EventFormat for Event {
119
141
) ) ;
120
142
}
121
143
122
- Ok ( ( value_arr , schema, is_first) )
144
+ Ok ( ( flattened , schema, is_first) )
123
145
}
124
146
125
147
// Convert the Data type (defined above) to arrow record batch
@@ -147,7 +169,9 @@ impl EventFormat for Event {
147
169
static_schema_flag : bool ,
148
170
custom_partitions : Option < & String > ,
149
171
time_partition : Option < & String > ,
172
+ time_partition_limit : Option < NonZeroU32 > ,
150
173
schema_version : SchemaVersion ,
174
+ log_source : & LogSource ,
151
175
stream_type : StreamType ,
152
176
) -> Result < super :: Event , anyhow:: Error > {
153
177
let custom_partition_values = match custom_partitions. as_ref ( ) {
@@ -167,7 +191,10 @@ impl EventFormat for Event {
167
191
storage_schema,
168
192
static_schema_flag,
169
193
time_partition,
194
+ time_partition_limit,
195
+ custom_partitions,
170
196
schema_version,
197
+ log_source,
171
198
) ?;
172
199
173
200
Ok ( super :: Event {
@@ -385,7 +412,15 @@ mod tests {
385
412
} ) ;
386
413
387
414
let ( rb, _) = Event :: new ( json)
388
- . into_recordbatch ( & HashMap :: default ( ) , false , None , SchemaVersion :: V0 )
415
+ . into_recordbatch (
416
+ & HashMap :: default ( ) ,
417
+ false ,
418
+ None ,
419
+ None ,
420
+ None ,
421
+ SchemaVersion :: V0 ,
422
+ & LogSource :: Json ,
423
+ )
389
424
. unwrap ( ) ;
390
425
391
426
assert_eq ! ( rb. num_rows( ) , 1 ) ;
@@ -413,7 +448,15 @@ mod tests {
413
448
} ) ;
414
449
415
450
let ( rb, _) = Event :: new ( json)
416
- . into_recordbatch ( & HashMap :: default ( ) , false , None , SchemaVersion :: V0 )
451
+ . into_recordbatch (
452
+ & HashMap :: default ( ) ,
453
+ false ,
454
+ None ,
455
+ None ,
456
+ None ,
457
+ SchemaVersion :: V0 ,
458
+ & LogSource :: Json ,
459
+ )
417
460
. unwrap ( ) ;
418
461
419
462
assert_eq ! ( rb. num_rows( ) , 1 ) ;
@@ -445,7 +488,15 @@ mod tests {
445
488
) ;
446
489
447
490
let ( rb, _) = Event :: new ( json)
448
- . into_recordbatch ( & schema, false , None , SchemaVersion :: V0 )
491
+ . into_recordbatch (
492
+ & schema,
493
+ false ,
494
+ None ,
495
+ None ,
496
+ None ,
497
+ SchemaVersion :: V0 ,
498
+ & LogSource :: Json ,
499
+ )
449
500
. unwrap ( ) ;
450
501
451
502
assert_eq ! ( rb. num_rows( ) , 1 ) ;
@@ -477,7 +528,15 @@ mod tests {
477
528
) ;
478
529
479
530
assert ! ( Event :: new( json)
480
- . into_recordbatch( & schema, false , None , SchemaVersion :: V0 , )
531
+ . into_recordbatch(
532
+ & schema,
533
+ false ,
534
+ None ,
535
+ None ,
536
+ None ,
537
+ SchemaVersion :: V0 ,
538
+ & LogSource :: Json
539
+ )
481
540
. is_err( ) ) ;
482
541
}
483
542
@@ -495,7 +554,15 @@ mod tests {
495
554
) ;
496
555
497
556
let ( rb, _) = Event :: new ( json)
498
- . into_recordbatch ( & schema, false , None , SchemaVersion :: V0 )
557
+ . into_recordbatch (
558
+ & schema,
559
+ false ,
560
+ None ,
561
+ None ,
562
+ None ,
563
+ SchemaVersion :: V0 ,
564
+ & LogSource :: Json ,
565
+ )
499
566
. unwrap ( ) ;
500
567
501
568
assert_eq ! ( rb. num_rows( ) , 1 ) ;
@@ -521,7 +588,15 @@ mod tests {
521
588
] ) ;
522
589
523
590
let ( rb, _) = Event :: new ( json)
524
- . into_recordbatch ( & HashMap :: default ( ) , false , None , SchemaVersion :: V0 )
591
+ . into_recordbatch (
592
+ & HashMap :: default ( ) ,
593
+ false ,
594
+ None ,
595
+ None ,
596
+ None ,
597
+ SchemaVersion :: V0 ,
598
+ & LogSource :: Json ,
599
+ )
525
600
. unwrap ( ) ;
526
601
527
602
assert_eq ! ( rb. num_rows( ) , 3 ) ;
@@ -569,7 +644,15 @@ mod tests {
569
644
] ) ;
570
645
571
646
let ( rb, _) = Event :: new ( json)
572
- . into_recordbatch ( & HashMap :: default ( ) , false , None , SchemaVersion :: V0 )
647
+ . into_recordbatch (
648
+ & HashMap :: default ( ) ,
649
+ false ,
650
+ None ,
651
+ None ,
652
+ None ,
653
+ SchemaVersion :: V0 ,
654
+ & LogSource :: Json ,
655
+ )
573
656
. unwrap ( ) ;
574
657
575
658
assert_eq ! ( rb. num_rows( ) , 3 ) ;
@@ -618,7 +701,15 @@ mod tests {
618
701
) ;
619
702
620
703
let ( rb, _) = Event :: new ( json)
621
- . into_recordbatch ( & schema, false , None , SchemaVersion :: V0 )
704
+ . into_recordbatch (
705
+ & schema,
706
+ false ,
707
+ None ,
708
+ None ,
709
+ None ,
710
+ SchemaVersion :: V0 ,
711
+ & LogSource :: Json ,
712
+ )
622
713
. unwrap ( ) ;
623
714
624
715
assert_eq ! ( rb. num_rows( ) , 3 ) ;
@@ -667,7 +758,15 @@ mod tests {
667
758
) ;
668
759
669
760
assert ! ( Event :: new( json)
670
- . into_recordbatch( & schema, false , None , SchemaVersion :: V0 , )
761
+ . into_recordbatch(
762
+ & schema,
763
+ false ,
764
+ None ,
765
+ None ,
766
+ None ,
767
+ SchemaVersion :: V0 ,
768
+ & LogSource :: Json
769
+ )
671
770
. is_err( ) ) ;
672
771
}
673
772
@@ -696,7 +795,15 @@ mod tests {
696
795
] ) ;
697
796
698
797
let ( rb, _) = Event :: new ( json)
699
- . into_recordbatch ( & HashMap :: default ( ) , false , None , SchemaVersion :: V0 )
798
+ . into_recordbatch (
799
+ & HashMap :: default ( ) ,
800
+ false ,
801
+ None ,
802
+ None ,
803
+ None ,
804
+ SchemaVersion :: V0 ,
805
+ & LogSource :: Json ,
806
+ )
700
807
. unwrap ( ) ;
701
808
assert_eq ! ( rb. num_rows( ) , 4 ) ;
702
809
assert_eq ! ( rb. num_columns( ) , 5 ) ;
@@ -768,7 +875,15 @@ mod tests {
768
875
] ) ;
769
876
770
877
let ( rb, _) = Event :: new ( json)
771
- . into_recordbatch ( & HashMap :: default ( ) , false , None , SchemaVersion :: V1 )
878
+ . into_recordbatch (
879
+ & HashMap :: default ( ) ,
880
+ false ,
881
+ None ,
882
+ None ,
883
+ None ,
884
+ SchemaVersion :: V1 ,
885
+ & LogSource :: Json ,
886
+ )
772
887
. unwrap ( ) ;
773
888
774
889
assert_eq ! ( rb. num_rows( ) , 4 ) ;
0 commit comments