Skip to content

Commit 50a75f3

Browse files
authored
docs: Add documentation for accelerating Iceberg Parquet scans with Comet (#1683)
1 parent fd09a79 commit 50a75f3

File tree

12 files changed

+208
-17
lines changed

12 files changed

+208
-17
lines changed

common/src/main/java/org/apache/arrow/c/CometSchemaImporter.java renamed to common/src/main/java/org/apache/arrow/c/AbstractCometSchemaImporter.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,12 @@
2424
import org.apache.arrow.vector.types.pojo.Field;
2525

2626
/** This is a simple wrapper around SchemaImporter to make it accessible from Java Arrow. */
27-
public class CometSchemaImporter {
27+
public abstract class AbstractCometSchemaImporter {
2828
private final BufferAllocator allocator;
2929
private final SchemaImporter importer;
3030
private final CDataDictionaryProvider provider = new CDataDictionaryProvider();
3131

32-
public CometSchemaImporter(BufferAllocator allocator) {
32+
public AbstractCometSchemaImporter(BufferAllocator allocator) {
3333
this.allocator = allocator;
3434
this.importer = new SchemaImporter(allocator);
3535
}
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.apache.comet;
21+
22+
import org.apache.arrow.c.*;
23+
import org.apache.arrow.memory.BufferAllocator;
24+
25+
/** This is a simple wrapper around SchemaImporter to make it accessible from Java Arrow. */
26+
public class CometSchemaImporter extends AbstractCometSchemaImporter {
27+
public CometSchemaImporter(BufferAllocator allocator) {
28+
super(allocator);
29+
}
30+
}

common/src/main/java/org/apache/comet/parquet/BatchReader.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@
3434
import org.slf4j.Logger;
3535
import org.slf4j.LoggerFactory;
3636

37-
import org.apache.arrow.c.CometSchemaImporter;
3837
import org.apache.arrow.memory.BufferAllocator;
3938
import org.apache.arrow.memory.RootAllocator;
4039
import org.apache.commons.lang3.tuple.Pair;
@@ -65,6 +64,7 @@
6564
import org.apache.spark.util.AccumulatorV2;
6665

6766
import org.apache.comet.CometConf;
67+
import org.apache.comet.CometSchemaImporter;
6868
import org.apache.comet.shims.ShimBatchReader;
6969
import org.apache.comet.shims.ShimFileFormat;
7070
import org.apache.comet.vector.CometVector;

common/src/main/java/org/apache/comet/parquet/ColumnReader.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@
2727

2828
import org.apache.arrow.c.ArrowArray;
2929
import org.apache.arrow.c.ArrowSchema;
30-
import org.apache.arrow.c.CometSchemaImporter;
3130
import org.apache.arrow.memory.BufferAllocator;
3231
import org.apache.arrow.memory.RootAllocator;
3332
import org.apache.arrow.vector.FieldVector;
@@ -44,6 +43,7 @@
4443
import org.apache.spark.sql.types.DataType;
4544

4645
import org.apache.comet.CometConf;
46+
import org.apache.comet.CometSchemaImporter;
4747
import org.apache.comet.vector.CometDecodedVector;
4848
import org.apache.comet.vector.CometDictionary;
4949
import org.apache.comet.vector.CometDictionaryVector;

common/src/main/java/org/apache/comet/parquet/LazyColumnReader.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,11 @@
2121

2222
import java.io.IOException;
2323

24-
import org.apache.arrow.c.CometSchemaImporter;
2524
import org.apache.parquet.column.ColumnDescriptor;
2625
import org.apache.parquet.column.page.PageReader;
2726
import org.apache.spark.sql.types.DataType;
2827

28+
import org.apache.comet.CometSchemaImporter;
2929
import org.apache.comet.vector.CometLazyVector;
3030
import org.apache.comet.vector.CometVector;
3131

common/src/main/java/org/apache/comet/parquet/NativeBatchReader.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@
3636
import org.slf4j.Logger;
3737
import org.slf4j.LoggerFactory;
3838

39-
import org.apache.arrow.c.CometSchemaImporter;
4039
import org.apache.arrow.memory.BufferAllocator;
4140
import org.apache.arrow.memory.RootAllocator;
4241
import org.apache.arrow.vector.ipc.WriteChannel;
@@ -71,6 +70,7 @@
7170
import org.apache.spark.util.AccumulatorV2;
7271

7372
import org.apache.comet.CometConf;
73+
import org.apache.comet.CometSchemaImporter;
7474
import org.apache.comet.shims.ShimBatchReader;
7575
import org.apache.comet.shims.ShimFileFormat;
7676
import org.apache.comet.vector.CometVector;

common/src/main/java/org/apache/comet/parquet/NativeColumnReader.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,13 @@
2424

2525
import org.apache.arrow.c.ArrowArray;
2626
import org.apache.arrow.c.ArrowSchema;
27-
import org.apache.arrow.c.CometSchemaImporter;
2827
import org.apache.arrow.memory.BufferAllocator;
2928
import org.apache.arrow.memory.RootAllocator;
3029
import org.apache.parquet.column.ColumnDescriptor;
3130
import org.apache.parquet.schema.Type;
3231
import org.apache.spark.sql.types.DataType;
3332

33+
import org.apache.comet.CometSchemaImporter;
3434
import org.apache.comet.vector.*;
3535

3636
// TODO: extend ColumnReader instead of AbstractColumnReader to reduce code duplication

common/src/main/java/org/apache/comet/parquet/SupportsComet.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
package org.apache.comet.parquet;
2121

22+
/** This interface is implemented in Apache Iceberg. */
2223
public interface SupportsComet {
2324
boolean isCometEnabled();
2425
}

common/src/main/java/org/apache/comet/parquet/Utils.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,16 @@
1919

2020
package org.apache.comet.parquet;
2121

22-
import org.apache.arrow.c.CometSchemaImporter;
2322
import org.apache.parquet.column.ColumnDescriptor;
2423
import org.apache.parquet.schema.LogicalTypeAnnotation;
2524
import org.apache.parquet.schema.PrimitiveType;
2625
import org.apache.spark.sql.types.*;
2726

27+
import org.apache.comet.CometSchemaImporter;
28+
2829
public class Utils {
30+
31+
/** This method is called from Apache Iceberg. */
2932
public static ColumnReader getColumnReader(
3033
DataType type,
3134
ColumnDescriptor descriptor,

docs/source/user-guide/datasources.md

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,29 +19,36 @@
1919

2020
# Supported Spark Data Sources
2121

22-
## Parquet
22+
## File Formats
23+
24+
### Parquet
2325

2426
When `spark.comet.scan.enabled` is enabled, Parquet scans will be performed natively by Comet if all data types
2527
in the schema are supported. When this option is not enabled, the scan will fall back to Spark. In this case,
2628
enabling `spark.comet.convert.parquet.enabled` will immediately convert the data into Arrow format, allowing native
2729
execution to happen after that, but the process may not be efficient.
2830

29-
## CSV
31+
### CSV
3032

3133
Comet does not provide a native CSV scan, but when `spark.comet.convert.csv.enabled` is enabled, data is immediately
3234
converted into Arrow format, allowing native execution to happen after that.
3335

34-
## JSON
36+
### JSON
3537

3638
Comet does not provide a native JSON scan, but when `spark.comet.convert.json.enabled` is enabled, data is immediately
3739
converted into Arrow format, allowing native execution to happen after that.
3840

39-
# Supported Storages
41+
## Data Catalogs
42+
43+
### Apache Iceberg
44+
45+
See the dedicated [Comet and Iceberg Guide](iceberg.md).
46+
47+
## Supported Storages
4048

41-
## Local
42-
In progress
49+
Comet supports most standard storage systems, such as the local file system and object storage.
4350

44-
## HDFS
51+
### HDFS
4552

4653
The Apache DataFusion Comet native reader seamlessly scans files from remote HDFS for [supported formats](#supported-spark-data-sources).
4754

0 commit comments

Comments
 (0)