@@ -15,6 +15,8 @@ class FieldStriperCompiler<TClass> {
1515
1616 private readonly ParquetSchema _schema ;
1717 private readonly DataField _df ;
18+ private readonly bool _hasRls ;
19+ private readonly bool _hasDls ;
1820
1921 // input parameters
2022 private readonly ParameterExpression _dfParam = Expression . Parameter ( typeof ( DataField ) , "df" ) ;
@@ -34,10 +36,14 @@ class FieldStriperCompiler<TClass> {
3436 // currently iterated class element
3537 private readonly ParameterExpression _classElementVar = Expression . Variable ( typeof ( TClass ) , "curr" ) ;
3638
39+ private static readonly Expression NullListOfInt = Expression . Convert ( Expression . Constant ( null ) , typeof ( List < int > ) ) ;
40+
3741 public FieldStriperCompiler ( ParquetSchema schema , DataField df ) {
3842
3943 _schema = schema ;
4044 _df = df ;
45+ _hasRls = _df . MaxRepetitionLevel > 0 ;
46+ _hasDls = _df . MaxDefinitionLevel > 0 ;
4147
4248 //
4349 _valuesListType = typeof ( List < > ) . MakeGenericType ( df . ClrType ) ;
@@ -82,38 +88,40 @@ private Expression WriteValue(ParameterExpression valueVar,
8288
8389 // only need RL and DL-1
8490 Expression . Block (
85- Expression . Call ( _dlsVar , LevelsAddMethod , Expression . Constant ( dl - 1 ) ) ,
86- Expression . Call ( _rlsVar , LevelsAddMethod , currentRlVar ) ) ,
91+ _hasDls ? Expression . Call ( _dlsVar , LevelsAddMethod , Expression . Constant ( dl - 1 ) ) : Expression . Empty ( ) ,
92+ _hasRls ? Expression . Call ( _rlsVar , LevelsAddMethod , currentRlVar ) : Expression . Empty ( ) ) ,
8793
8894 // everything, but value must be non-null
8995 Expression . Block (
9096 Expression . Call ( _valuesVar , _valuesListAddMethod , getNonNullValue ) ,
91- Expression . Call ( _dlsVar , LevelsAddMethod , Expression . Constant ( dl ) ) ,
92- Expression . Call ( _rlsVar , LevelsAddMethod , currentRlVar ) ) ) ;
97+ _hasDls ? Expression . Call ( _dlsVar , LevelsAddMethod , Expression . Constant ( dl ) ) : Expression . Empty ( ) ,
98+ _hasRls ? Expression . Call ( _rlsVar , LevelsAddMethod , currentRlVar ) : Expression . Empty ( ) ) ) ;
9399
94100 } else {
95101 // required atomics are simple - add value, RL and DL as is
96102 return Expression . Block (
97103 Expression . Call ( _valuesVar , _valuesListAddMethod , valueVar ) ,
98- Expression . Call ( _dlsVar , LevelsAddMethod , Expression . Constant ( dl ) ) ,
99- Expression . Call ( _rlsVar , LevelsAddMethod , currentRlVar ) ) ;
104+ _hasDls ? Expression . Call ( _dlsVar , LevelsAddMethod , Expression . Constant ( dl ) ) : Expression . Empty ( ) ,
105+ _hasRls ? Expression . Call ( _rlsVar , LevelsAddMethod , currentRlVar ) : Expression . Empty ( ) ) ;
100106 }
101107 }
102108
103109 // non-atomics still need RL and DL dumped
104110 return Expression . Block (
105- Expression . Call ( _dlsVar , LevelsAddMethod , Expression . Constant ( dl ) ) ,
106- Expression . Call ( _rlsVar , LevelsAddMethod , currentRlVar ) ) ;
111+ _hasDls ? Expression . Call ( _dlsVar , LevelsAddMethod , Expression . Constant ( dl ) ) : Expression . Empty ( ) ,
112+ _hasRls ? Expression . Call ( _rlsVar , LevelsAddMethod , currentRlVar ) : Expression . Empty ( ) ) ;
107113
108114 }
109115
// Builds the expression tree fragment emitted when a value is missing: no entry is added to the
// values list, only levels are recorded.
//   dl           - definition level, embedded into the tree as a constant
//   currentRlVar - expression supplying the current repetition level
// In the updated ('+') lines each level write is conditional at tree-build time: when _hasDls
// (respectively _hasRls) is false, the LevelsAddMethod call on _dlsVar (respectively _rlsVar) is
// replaced with Expression.Empty(), so no call is emitted for that list.
// The removed ('-') lines are the previous, unconditional calls.
110116 private Expression WriteMissingValue ( int dl , Expression currentRlVar ) {
111117 return Expression . Block (
112- Expression . Call ( _dlsVar , LevelsAddMethod , Expression . Constant ( dl ) ) ,
113- Expression . Call ( _rlsVar , LevelsAddMethod , currentRlVar ) ) ;
118+ _hasDls ? Expression . Call ( _dlsVar , LevelsAddMethod , Expression . Constant ( dl ) ) : Expression . Empty ( ) ,
119+ _hasRls ? Expression . Call ( _rlsVar , LevelsAddMethod , currentRlVar ) : Expression . Empty ( ) ) ;
114120 }
115121
116- private Expression WhileBody ( Expression element , bool isAtomic , int dl , ParameterExpression currentRlVar , ParameterExpression seenFieldsVar , Field field , int rlDepth , Type elementType , List < string > path ) {
122+ private Expression WhileBody ( Expression element , bool isAtomic , int dl , ParameterExpression currentRlVar ,
123+ ParameterExpression seenFieldsVar , Field field , int rlDepth , Type elementType , List < string > path ) {
124+
117125 string suffix = field . Path . ToString ( ) . Replace ( "." , "_" ) ;
118126 ParameterExpression chRepetitionLevelVar = Expression . Variable ( typeof ( int ) , $ "chRepetitionLevel_{ suffix } ") ;
119127 ParameterExpression valueVar = Expression . Variable ( elementType , $ "value_{ suffix } ") ;
@@ -127,13 +135,15 @@ private Expression WhileBody(Expression element, bool isAtomic, int dl, Paramete
127135 // L9-13
128136 Expression . IfThenElse (
129137 // if this field has already been seen
130- Expression . Call ( seenFieldsVar , typeof ( HashSet < string > ) . GetMethod ( "Contains" ) ! , Expression . Constant ( field . Path . ToString ( ) ) ) ,
138+ //Expression.Call(seenFieldsVar, typeof(HashSet<string>).GetMethod("Contains")!, Expression.Constant(field.Path.ToString())),
139+ Expression . IsTrue ( seenFieldsVar ) ,
131140
132141 // chRepetitionLevelVar = rlDepth
133142 Expression . Assign ( chRepetitionLevelVar , Expression . Constant ( rlDepth ) ) ,
134143
135144 // otherwise, mark this field as seen
136- Expression . Call ( seenFieldsVar , typeof ( HashSet < string > ) . GetMethod ( "Add" ) ! , Expression . Constant ( field . Path . ToString ( ) ) )
145+ //Expression.Call(seenFieldsVar, typeof(HashSet<string>).GetMethod("Add")!, Expression.Constant(field.Path.ToString()))
146+ Expression . Assign ( seenFieldsVar , Expression . Constant ( true ) )
137147 ) ,
138148
139149 // L14-
@@ -195,13 +205,14 @@ private Expression DissectRecord(
195205 Expression levelProperty = Expression . Property ( rootVar , levelPropertyName ) ;
196206 Type levelPropertyType = rootType . GetProperty ( levelPropertyName ) ! . PropertyType ;
197207 ParameterExpression seenFieldsVar = Expression . Variable ( typeof ( HashSet < string > ) , $ "seenFieldsVar_{ levelPropertyName } ") ;
208+ ParameterExpression seenVar = Expression . Variable ( typeof ( bool ) , $ "seen_{ levelPropertyName } ") ;
198209
199210 Expression extraBody ;
200211 if ( isRepeated ) {
201212 Type elementType = ExtractElementTypeFromEnumerableType ( levelPropertyType ) ;
202213 Expression collection = levelProperty ;
203214 ParameterExpression element = Expression . Variable ( elementType , "element" ) ;
204- Expression elementProcessor = WhileBody ( element , isAtomic , dl , currentRlVar , seenFieldsVar , field , rlDepth , elementType , path ) ;
215+ Expression elementProcessor = WhileBody ( element , isAtomic , dl , currentRlVar , seenVar , field , rlDepth , elementType , path ) ;
205216 extraBody = elementProcessor . Loop ( collection , elementType , element ) ;
206217
207218 // TODO: if levelProperty (the collection) is null, an extra iteration with a null value is needed (at which repetition and definition levels?)
@@ -212,12 +223,12 @@ private Expression DissectRecord(
212223 extraBody ) ;
213224 } else {
214225 Expression element = levelProperty ;
215- extraBody = WhileBody ( element , isAtomic , dl , currentRlVar , seenFieldsVar , field , rlDepth , levelPropertyType , path ) ;
226+ extraBody = WhileBody ( element , isAtomic , dl , currentRlVar , seenVar , field , rlDepth , levelPropertyType , path ) ;
216227 }
217228
218229 return Expression . Block (
219- new [ ] { seenFieldsVar } ,
220- Expression . Assign ( seenFieldsVar , Expression . New ( typeof ( HashSet < string > ) ) ) ,
230+ new [ ] { seenVar } ,
231+ Expression . Assign ( seenVar , Expression . Constant ( false ) ) ,
221232 extraBody ) ;
222233 }
223234
@@ -236,16 +247,16 @@ public FieldStriper<TClass> Compile() {
236247 // init 3 building blocks
237248 Expression . Block (
238249 Expression . Assign ( _valuesVar , Expression . New ( _valuesListType ) ) ,
239- Expression . Assign ( _dlsVar , Expression . New ( typeof ( List < int > ) ) ) ,
240- Expression . Assign ( _rlsVar , Expression . New ( typeof ( List < int > ) ) ) ) ,
250+ Expression . Assign ( _dlsVar , _hasDls ? Expression . New ( typeof ( List < int > ) ) : NullListOfInt ) ,
251+ Expression . Assign ( _rlsVar , _hasRls ? Expression . New ( typeof ( List < int > ) ) : NullListOfInt ) ) ,
241252
242253 iterationLoop ,
243254
244255 // result: use triple to construct ShreddedColumn and return (last element in the block)
245256 Expression . New ( ShreddedColumnConstructor ,
246257 Expression . Call ( _valuesVar , _valuesListType . GetMethod ( "ToArray" ) ! ) ,
247- _df . MaxDefinitionLevel == 0 ? Expression . Convert ( Expression . Constant ( null ) , typeof ( List < int > ) ) : _dlsVar ,
248- _df . MaxRepetitionLevel == 0 ? Expression . Convert ( Expression . Constant ( null ) , typeof ( List < int > ) ) : _rlsVar )
258+ _dlsVar ,
259+ _rlsVar )
249260 ) ;
250261
251262 Func < DataField , IEnumerable < TClass > , ShreddedColumn > lambda = Expression
0 commit comments