@@ -37,8 +37,8 @@ private[scrooge] object ParquetCollectionFormatCompatibility {
3737 * The result is projected file schema with the same optional/required fields as the
3838 * projected read schema, but with the collection type format of the file schema.
3939 *
40- * @param projectedReadSchema read schema specifying field projection
4140 * @param fileSchema file schema to be projected
41+ * @param projectedReadSchema read schema specifying field projection
4242 */
4343 def projectFileSchema (fileSchema : MessageType , projectedReadSchema : MessageType ): MessageType = {
4444 val projectedFileSchema = projectFileType(fileSchema, projectedReadSchema, FieldContext ()).asGroupType()
@@ -55,50 +55,70 @@ private[scrooge] object ParquetCollectionFormatCompatibility {
5555 * handle projection and possible nested collection types in the repeated type.
5656 */
5757 private def projectFileType (fileType : Type , projectedReadType : Type , fieldContext : FieldContext ): Type = {
58- (extractCollectionGroup(projectedReadType), extractCollectionGroup(fileType)) match {
59- case _ if projectedReadType.isPrimitive && fileType.isPrimitive =>
58+ if (projectedReadType.isPrimitive || fileType.isPrimitive) {
59+ // Base-cases to handle primitive types:
60+ if (projectedReadType.isPrimitive && fileType.isPrimitive) {
61+ // The field is a primitive in both schemas
6062 projectedReadType
61- case _ if projectedReadType.isPrimitive != fileType.isPrimitive =>
63+ } else {
64+ // The field is primitive in one schema but non-primitive in the other
6265 throw new DecodingSchemaMismatchException (
6366 s " Found schema mismatch between projected read type: \n $projectedReadType\n " +
6467 s " and file type: \n ${fileType}"
6568 )
66- case (Some (projectedReadGroup : ListGroup ), Some (fileGroup : ListGroup )) =>
67- projectFileGroup(fileGroup, projectedReadGroup, fieldContext.copy(nestedListLevel = fieldContext.nestedListLevel + 1 ), formatter= ParquetListFormatter )
68- case (Some (projectedReadGroup : MapGroup ), Some (fileGroup : MapGroup )) =>
69- projectFileGroup(fileGroup, projectedReadGroup, fieldContext, formatter= ParquetMapFormatter )
70- case _ => // Struct projection
71- val projectedReadGroupType = projectedReadType.asGroupType
72- val fileGroupType = fileType.asGroupType
73- val projectedReadFields = projectedReadGroupType.getFields.asScala.map { projectedReadField =>
74- if (! fileGroupType.containsField(projectedReadField.getName)) {
75- if (! projectedReadField.isRepetition(Repetition .OPTIONAL )) {
76- throw new DecodingSchemaMismatchException (
77- s " Found non-optional projected read field ${projectedReadField.getName}: \n $projectedReadField\n\n " +
78- s " not present in the given file group type: \n ${fileGroupType}"
79- )
80- }
81- projectedReadField
82- } else {
83- val fileFieldIndex = fileGroupType.getFieldIndex(projectedReadField.getName)
84- val fileField = fileGroupType.getFields.get(fileFieldIndex)
85- if (fileField.isRepetition(Repetition .OPTIONAL ) && projectedReadField.isRepetition(Repetition .REQUIRED )) {
86- throw new DecodingSchemaMismatchException (
87- s " Found required projected read field ${projectedReadField.getName}: \n $projectedReadField\n\n " +
88- s " on optional file field: \n ${fileField}"
89- )
69+ }
70+ } else {
71+ // Recursive cases to handle non-primitives (lists, maps, and structs):
72+ (extractCollectionGroup(projectedReadType), extractCollectionGroup(fileType)) match {
73+ case (Some (projectedReadGroup : ListGroup ), Some (fileGroup : ListGroup )) =>
74+ projectFileGroup(fileGroup, projectedReadGroup, fieldContext.copy(nestedListLevel = fieldContext.nestedListLevel + 1 ), formatter= ParquetListFormatter )
75+ case (Some (projectedReadGroup : MapGroup ), Some (fileGroup : MapGroup )) =>
76+ projectFileGroup(fileGroup, projectedReadGroup, fieldContext, formatter= ParquetMapFormatter )
77+ case _ => // Struct projection
78+ val projectedReadGroupType = projectedReadType.asGroupType
79+ val fileGroupType = fileType.asGroupType
80+ val projectedReadFields = projectedReadGroupType.getFields.asScala.map { projectedReadField =>
81+ if (! fileGroupType.containsField(projectedReadField.getName)) {
82+ // The projected read schema includes a field which is missing from the file schema.
83+ if (projectedReadField.isRepetition(Repetition .OPTIONAL )) {
84+ // The missing field is optional in the projected read schema. Since the file schema
85+ // doesn't contain this field there are no collection compatibility concerns to worry
86+ // about and we can simply use the supplied schema:
87+ projectedReadField
88+ } else {
89+ // The missing field is repeated or required, which is an error:
90+ throw new DecodingSchemaMismatchException (
91+ s " Found non-optional projected read field ${projectedReadField.getName}: \n $projectedReadField\n\n " +
92+ s " not present in the given file group type: \n ${fileGroupType}"
93+ )
94+ }
95+ } else {
96+ // The field is present in both schemas, so first check that the schemas specify compatible repetition
97+ // values for the field, then recursively process the fields:
98+ val fileFieldIndex = fileGroupType.getFieldIndex(projectedReadField.getName)
99+ val fileField = fileGroupType.getFields.get(fileFieldIndex)
100+ if (fileField.isRepetition(Repetition .OPTIONAL ) && projectedReadField.isRepetition(Repetition .REQUIRED )) {
101+ // The field is optional in the file schema but required in the projected read schema; this is an error:
102+ throw new DecodingSchemaMismatchException (
103+ s " Found required projected read field ${projectedReadField.getName}: \n $projectedReadField\n\n " +
104+ s " on optional file field: \n ${fileField}"
105+ )
106+ } else {
107+ // The field's repetitions are compatible in both schemas (e.g. optional in both schemas or required
108+ // in both), so recursively process the field:
109+ projectFileType(fileField, projectedReadField, FieldContext (projectedReadField.getName))
110+ }
90111 }
91- projectFileType(fileField, projectedReadField, FieldContext (projectedReadField.getName))
92112 }
93- }
94- projectedReadGroupType.withNewFields(projectedReadFields.asJava)
113+ projectedReadGroupType.withNewFields(projectedReadFields.asJava)
114+ }
95115 }
96116 }
97117
98118 private def projectFileGroup (fileGroup : CollectionGroup ,
99119 projectedReadGroup : CollectionGroup ,
100120 fieldContext : FieldContext ,
101- formatter : ParquetCollectionFormatter ) = {
121+ formatter : ParquetCollectionFormatter ): GroupType = {
102122 val projectedFileRepeatedType = formatter.formatCompatibleRepeatedType(
103123 fileGroup.repeatedType,
104124 projectedReadGroup.repeatedType,
@@ -118,8 +138,8 @@ private[scrooge] trait ParquetCollectionFormatter {
118138 /**
119139 * Format source repeated type in the structure of target repeated type.
120140 *
121- * @param readRepeatedType repeated type from which the formatted result get content
122141 * @param fileRepeatedType repeated type from which the formatted result gets its structure
142+ * @param readRepeatedType repeated type from which the formatted result gets its content
123143 * @param recursiveSolver solver for the inner content of the repeated type
124144 * @return formatted result
125145 */
0 commit comments