Skip to content

Commit c3a552f

Browse files
authored
chore: merge comet-parquet-exec branch into main (#1318)
1 parent 517c255 commit c3a552f

File tree

1,324 files changed

+239269
-108
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

1,324 files changed

+239269
-108
lines changed

common/src/main/java/org/apache/comet/parquet/AbstractColumnReader.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import org.slf4j.LoggerFactory;
2424

2525
import org.apache.parquet.column.ColumnDescriptor;
26+
import org.apache.parquet.schema.Type;
2627
import org.apache.spark.sql.types.DataType;
2728
import org.apache.spark.sql.types.TimestampNTZType$;
2829

@@ -36,6 +37,9 @@ public abstract class AbstractColumnReader implements AutoCloseable {
3637
/** The Spark data type. */
3738
protected final DataType type;
3839

40+
/** The Parquet field type. */
41+
protected final Type fieldType;
42+
3943
/** Parquet column descriptor. */
4044
protected final ColumnDescriptor descriptor;
4145

@@ -61,13 +65,23 @@ public abstract class AbstractColumnReader implements AutoCloseable {
6165

6266
public AbstractColumnReader(
6367
DataType type,
68+
Type fieldType,
6469
ColumnDescriptor descriptor,
6570
boolean useDecimal128,
6671
boolean useLegacyDateTimestamp) {
6772
this.type = type;
73+
this.fieldType = fieldType;
6874
this.descriptor = descriptor;
6975
this.useDecimal128 = useDecimal128;
7076
this.useLegacyDateTimestamp = useLegacyDateTimestamp;
77+
}
78+
79+
public AbstractColumnReader(
80+
DataType type,
81+
ColumnDescriptor descriptor,
82+
boolean useDecimal128,
83+
boolean useLegacyDateTimestamp) {
84+
this(type, null, descriptor, useDecimal128, useLegacyDateTimestamp);
7185
TypeUtil.checkParquetType(descriptor, type);
7286
}
7387

common/src/main/java/org/apache/comet/parquet/BatchReader.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,7 @@ public void init() throws URISyntaxException, IOException {
272272
requestedSchema =
273273
CometParquetReadSupport.clipParquetSchema(
274274
requestedSchema, sparkSchema, isCaseSensitive, useFieldId, ignoreMissingIds);
275-
if (requestedSchema.getColumns().size() != sparkSchema.size()) {
275+
if (requestedSchema.getFieldCount() != sparkSchema.size()) {
276276
throw new IllegalArgumentException(
277277
String.format(
278278
"Spark schema has %d columns while " + "Parquet schema has %d columns",

common/src/main/java/org/apache/comet/parquet/Native.java

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,4 +234,56 @@ public static native void setPageV2(
234234
* @param handle the handle to the native Parquet column reader
235235
*/
236236
public static native void closeColumnReader(long handle);
237+
238+
///////////// Arrow Native Parquet Reader APIs
239+
// TODO: Add partitionValues(?), improve requiredColumns to use a projection mask that corresponds
240+
// to arrow.
241+
// Add batch size, datetimeRebaseModeSpec, metrics(how?)...
242+
243+
/**
244+
* Initialize a record batch reader for a PartitionedFile
245+
*
246+
* @param filePath path to the Parquet file
* @param fileSize total size of the file in bytes
247+
* @param start byte offset at which to start reading
248+
* @param length number of bytes to read
249+
* @return a handle to the record batch reader, used in subsequent calls.
250+
*/
251+
public static native long initRecordBatchReader(
252+
String filePath,
253+
long fileSize,
254+
long start,
255+
long length,
256+
byte[] requiredSchema,
257+
String sessionTimezone);
258+
259+
// arrow native version of read batch
260+
/**
261+
* Read the next batch of data into memory on native side
262+
*
263+
* @param handle
264+
* @return the number of rows read
265+
*/
266+
public static native int readNextRecordBatch(long handle);
267+
268+
// arrow native equivalent of currentBatch. 'columnNum' is the index of the column in the record
269+
// batch
270+
/**
271+
* Load the column corresponding to columnNum in the currently loaded record batch into JVM
272+
*
273+
* @param handle
274+
* @param columnNum
275+
* @param arrayAddr
276+
* @param schemaAddr
277+
*/
278+
public static native void currentColumnBatch(
279+
long handle, int columnNum, long arrayAddr, long schemaAddr);
280+
281+
// arrow native version to close record batch reader
282+
283+
/**
284+
* Close the record batch reader. Free the resources
285+
*
286+
* @param handle
287+
*/
288+
public static native void closeRecordBatchReader(long handle);
237289
}

0 commit comments

Comments
 (0)