|
20 | 20 | import static java.lang.String.format;
|
21 | 21 |
|
22 | 22 | import org.apache.parquet.schema.*;
|
| 23 | +import org.apache.parquet.schema.LogicalTypeAnnotation.*; |
23 | 24 | import org.apache.parquet.schema.Type.Repetition;
|
24 | 25 | import static org.apache.parquet.schema.LogicalTypeAnnotation.TimeUnit.MICROS;
|
25 | 26 | import static org.apache.parquet.schema.LogicalTypeAnnotation.timestampType;
|
@@ -155,6 +156,93 @@ public static MessageType toParquetSchema(StructType structType) {
|
155 | 156 | return new MessageType("Default Kernel Schema", types);
|
156 | 157 | }
|
157 | 158 |
|
| 159 | + /** |
| 160 | + * Convert the given Parquet data type to a Kernel data type. |
| 161 | + * |
| 162 | + * TODO(r.chen): Test this function. |
| 163 | + * |
| 164 | + * @param type Parquet type object |
| 165 | + * @return {@link DataType} representing the Parquet type in Kernel. |
| 166 | + */ |
| 167 | + public static DataType toKernelType(Type type) { |
| 168 | + if (type.isPrimitive()) { |
| 169 | + PrimitiveType pt = type.asPrimitiveType(); |
| 170 | + |
| 171 | + if (pt.getOriginalType() == OriginalType.DECIMAL) { |
| 172 | + DecimalLogicalTypeAnnotation dlta = |
| 173 | + (DecimalLogicalTypeAnnotation) pt.getLogicalTypeAnnotation(); |
| 174 | + return new DecimalType(dlta.getPrecision(), dlta.getScale()); |
| 175 | + } else if (pt.getPrimitiveTypeName() == BOOLEAN) { |
| 176 | + return BooleanType.BOOLEAN; |
| 177 | + } else if (pt.getPrimitiveTypeName() == INT32) { |
| 178 | + if (pt.getOriginalType() == OriginalType.INT_8) { |
| 179 | + return ByteType.BYTE; |
| 180 | + } else if (pt.getOriginalType() == OriginalType.INT_16) { |
| 181 | + return ShortType.SHORT; |
| 182 | + } else if (pt.getLogicalTypeAnnotation() == LogicalTypeAnnotation.dateType()) { |
| 183 | + return DateType.DATE; |
| 184 | + } |
| 185 | + return IntegerType.INTEGER; |
| 186 | + } else if (pt.getPrimitiveTypeName() == INT64) { |
| 187 | + if (pt.getOriginalType() == OriginalType.TIMESTAMP_MICROS) { |
| 188 | + TimestampLogicalTypeAnnotation tlta = |
| 189 | + (TimestampLogicalTypeAnnotation) pt.getLogicalTypeAnnotation(); |
| 190 | + return tlta.isAdjustedToUTC() ? |
| 191 | + TimestampType.TIMESTAMP : TimestampNTZType.TIMESTAMP_NTZ; |
| 192 | + } |
| 193 | + return LongType.LONG; |
| 194 | + } else if (pt.getPrimitiveTypeName() == FLOAT) { |
| 195 | + return FloatType.FLOAT; |
| 196 | + } else if (pt.getPrimitiveTypeName() == DOUBLE) { |
| 197 | + return DoubleType.DOUBLE; |
| 198 | + } else if (pt.getPrimitiveTypeName() == BINARY) { |
| 199 | + if (pt.getLogicalTypeAnnotation() == LogicalTypeAnnotation.stringType()) { |
| 200 | + return StringType.STRING; |
| 201 | + } else { |
| 202 | + return BinaryType.BINARY; |
| 203 | + } |
| 204 | + } else { |
| 205 | + throw new UnsupportedOperationException( |
| 206 | + "Converting the given Parquet data type to Kernel is not supported: " + type); |
| 207 | + } |
| 208 | + } else { |
| 209 | + if (type.getLogicalTypeAnnotation() == LogicalTypeAnnotation.listType()) { |
| 210 | + GroupType gt = (GroupType) type; |
| 211 | + Type childType = gt.getType(0); |
| 212 | + return new ArrayType( |
| 213 | + toKernelType(childType), childType.getRepetition() == OPTIONAL); |
| 214 | + } else if (type.getLogicalTypeAnnotation() == LogicalTypeAnnotation.mapType()) { |
| 215 | + GroupType gt = (GroupType) type; |
| 216 | + Type keyType = gt.getType(0); |
| 217 | + Type valueType = gt.getType(1); |
| 218 | + return new MapType( |
| 219 | + toKernelType(keyType), |
| 220 | + toKernelType(valueType), |
| 221 | + valueType.getRepetition() == OPTIONAL |
| 222 | + ); |
| 223 | + } else { |
| 224 | + List<StructField> kernelFields = new ArrayList<>(); |
| 225 | + GroupType gt = (GroupType) type; |
| 226 | + for (Type parquetType : gt.getFields()) { |
| 227 | + FieldMetadata.Builder metadataBuilder = FieldMetadata.builder(); |
| 228 | + if (type.getId() != null) { |
| 229 | + metadataBuilder.putLong( |
| 230 | + ColumnMapping.PARQUET_FIELD_ID_KEY, |
| 231 | + (long) (type.getId().intValue()) |
| 232 | + ); |
| 233 | + } |
| 234 | + kernelFields.add(new StructField( |
| 235 | + parquetType.getName(), |
| 236 | + toKernelType(parquetType), |
| 237 | + parquetType.getRepetition() == OPTIONAL, |
| 238 | + metadataBuilder.build() |
| 239 | + )); |
| 240 | + } |
| 241 | + return new StructType(kernelFields); |
| 242 | + } |
| 243 | + } |
| 244 | + } |
| 245 | + |
158 | 246 | private static List<Type> pruneFields(
|
159 | 247 | GroupType type, StructType deltaDataType, boolean hasFieldIds) {
|
160 | 248 | // prune fields including nested pruning like in pruneSchema
|
@@ -248,8 +336,6 @@ private static Type toParquetType(
|
248 | 336 | type = toParquetMapType((MapType) dataType, name, repetition);
|
249 | 337 | } else if (dataType instanceof StructType) {
|
250 | 338 | type = toParquetStructType((StructType) dataType, name, repetition);
|
251 |
| - } else if (dataType instanceof VariantType) { |
252 |
| - type = toParquetVariantType(name, repetition); |
253 | 339 | } else {
|
254 | 340 | throw new UnsupportedOperationException(
|
255 | 341 | "Writing given type data to Parquet is not supported: " + dataType);
|
@@ -311,13 +397,6 @@ private static Type toParquetStructType(StructType structType, String name,
|
311 | 397 | return new GroupType(repetition, name, fields);
|
312 | 398 | }
|
313 | 399 |
|
314 |
| - private static Type toParquetVariantType(String name, Repetition repetition) { |
315 |
| - return Types.buildGroup(repetition) |
316 |
| - .addField(toParquetType(BinaryType.BINARY, "value", REQUIRED, Optional.empty())) |
317 |
| - .addField(toParquetType(BinaryType.BINARY, "metadata", REQUIRED, Optional.empty())) |
318 |
| - .named(name); |
319 |
| - } |
320 |
| - |
321 | 400 | /**
|
322 | 401 | * Recursively checks whether the given data type has any Parquet field ids in it.
|
323 | 402 | */
|
|
0 commit comments