Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions common/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,10 @@ under the License.
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-hadoop</artifactId>
</dependency>
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-format-structures</artifactId>
</dependency>
<dependency>
<groupId>org.apache.arrow</groupId>
<artifactId>arrow-vector</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,10 @@ ColumnDescriptor getDescriptor() {
return descriptor;
}

String getPath() {
  // Dot-separated column path (e.g. "a.b.c") built from the descriptor's path segments.
  String[] pathSegments = this.descriptor.getPath();
  return String.join(".", pathSegments);
}

/**
* Set the batch size of this reader to be 'batchSize'. Also initializes the native column reader.
*/
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.comet.parquet;

import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.execution.metric.SQLMetric;
import org.apache.spark.sql.types.StructType;

/**
 * A specialized NativeBatchReader for Iceberg that accepts ParquetMetadata as serialized bytes. This
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

accepts ParquetMetadata as a JSON string - actually it accepts byte[] parquetMetadataBytes at https://github.com/apache/datafusion-comet/pull/2680/files#diff-e57878f6cd8036999500de5719f8f4bbe28e1ed5dcb79a02ad7d7eb206f37473R44, i.e. not a String but bytes.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for catching this. The first version I did used JSON, but this is more efficient.

* allows Iceberg to pass metadata in serialized form with a two-step initialization pattern.
*/
public class IcebergCometNativeBatchReader extends NativeBatchReader {

public IcebergCometNativeBatchReader(StructType requiredSchema) {
super();
this.sparkSchema = requiredSchema;
}

/** Initialize the reader using FileInfo instead of PartitionedFile. */
public void init(
Configuration conf,
FileInfo fileInfo,
byte[] parquetMetadataBytes,
byte[] nativeFilter,
int capacity,
StructType dataSchema,
boolean isCaseSensitive,
boolean useFieldId,
boolean ignoreMissingIds,
boolean useLegacyDateTimestamp,
StructType partitionSchema,
InternalRow partitionValues,
AbstractColumnReader[] preInitializedReaders,
Map<String, SQLMetric> metrics)
throws Throwable {

// Set parent fields
this.conf = conf;
this.fileInfo = fileInfo;
this.footer = new ParquetMetadataSerializer().deserialize(parquetMetadataBytes);
this.nativeFilter = nativeFilter;
this.capacity = capacity;
this.dataSchema = dataSchema;
this.isCaseSensitive = isCaseSensitive;
this.useFieldId = useFieldId;
this.ignoreMissingIds = ignoreMissingIds;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
this.ignoreMissingIds = ignoreMissingIds;
this.ignoreMissingIds = ignoreMissingIds;
this.useLegacyDateTimestamp = useLegacyDateTimestamp;

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

this.partitionSchema = partitionSchema;
this.partitionValues = partitionValues;
this.preInitializedReaders = preInitializedReaders;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
this.preInitializedReaders = preInitializedReaders;
this.preInitializedReaders = preInitializedReaders;
this.metrics.clear();
if (metrics != null) {
this.metrics.putAll(metrics);
}

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done


// Call parent init method
super.init();
}

public StructType getSparkSchema() {
return this.sparkSchema;
}
}
Loading
Loading