
Commit c4c998c

feat: pushdown filter for native_iceberg_compat (#1566)
* feat: pushdown filter for native_iceberg_compat
* fix style
* add data schema
* fix filter bound
* fix in expr
* add primitive type tests
* enable native_datafusion test
1 parent e982aad commit c4c998c

10 files changed, +490 -33 lines

common/src/main/java/org/apache/comet/parquet/Native.java

Lines changed: 2 additions & 0 deletions
@@ -253,7 +253,9 @@ public static native long initRecordBatchReader(
       long fileSize,
       long start,
       long length,
+      byte[] filter,
       byte[] requiredSchema,
+      byte[] dataSchema,
       String sessionTimezone);

   // arrow native version of read batch
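
The hunk above only shows the tail of the declaration. For orientation, here is a sketch of the full contract as callers now see it; the leading filePath parameter and its type are not in the hunk and are inferred from the call site in NativeBatchReader, so treat them as assumptions. The comments reflect how the new arguments are consumed on the native side.

// Sketch only, not verbatim Comet source.
public final class Native {
  public static native long initRecordBatchReader(
      String filePath,        // inferred from the call site in NativeBatchReader (assumption)
      long fileSize,
      long start,
      long length,
      byte[] filter,          // new: serialized filter expression; null when nothing was pushed down
      byte[] requiredSchema,  // Arrow IPC-serialized projection (requested) schema
      byte[] dataSchema,      // new: Arrow IPC-serialized full file schema, used to bind the filter
      String sessionTimezone);
}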

common/src/main/java/org/apache/comet/parquet/NativeBatchReader.java

Lines changed: 24 additions & 5 deletions
@@ -108,6 +108,7 @@ public class NativeBatchReader extends RecordReader<Void, ColumnarBatch> impleme
   private final Map<String, SQLMetric> metrics;

   private StructType sparkSchema;
+  private StructType dataSchema;
   private MessageType requestedSchema;
   private CometVector[] vectors;
   private AbstractColumnReader[] columnReaders;
@@ -117,6 +118,7 @@ public class NativeBatchReader extends RecordReader<Void, ColumnarBatch> impleme
   private boolean[] missingColumns;
   private boolean isInitialized;
   private ParquetMetadata footer;
+  private byte[] nativeFilter;

   /**
    * Whether the native scan should always return decimal represented by 128 bits, regardless of its
@@ -190,8 +192,10 @@ public NativeBatchReader(AbstractColumnReader[] columnReaders) {
       Configuration conf,
       PartitionedFile inputSplit,
       ParquetMetadata footer,
+      byte[] nativeFilter,
       int capacity,
       StructType sparkSchema,
+      StructType dataSchema,
       boolean isCaseSensitive,
       boolean useFieldId,
       boolean ignoreMissingIds,
@@ -202,6 +206,7 @@ public NativeBatchReader(AbstractColumnReader[] columnReaders) {
     this.conf = conf;
     this.capacity = capacity;
     this.sparkSchema = sparkSchema;
+    this.dataSchema = dataSchema;
     this.isCaseSensitive = isCaseSensitive;
     this.useFieldId = useFieldId;
     this.ignoreMissingIds = ignoreMissingIds;
@@ -210,6 +215,7 @@ public NativeBatchReader(AbstractColumnReader[] columnReaders) {
     this.partitionValues = partitionValues;
     this.file = inputSplit;
     this.footer = footer;
+    this.nativeFilter = nativeFilter;
     this.metrics = metrics;
     this.taskContext = TaskContext$.MODULE$.get();
   }
@@ -262,10 +268,9 @@ public void init() throws URISyntaxException, IOException {
     String timeZoneId = conf.get("spark.sql.session.timeZone");
     // Native code uses "UTC" always as the timeZoneId when converting from spark to arrow schema.
     Schema arrowSchema = Utils$.MODULE$.toArrowSchema(sparkSchema, "UTC");
-    ByteArrayOutputStream out = new ByteArrayOutputStream();
-    WriteChannel writeChannel = new WriteChannel(Channels.newChannel(out));
-    MessageSerializer.serialize(writeChannel, arrowSchema);
-    byte[] serializedRequestedArrowSchema = out.toByteArray();
+    byte[] serializedRequestedArrowSchema = serializeArrowSchema(arrowSchema);
+    Schema dataArrowSchema = Utils$.MODULE$.toArrowSchema(dataSchema, "UTC");
+    byte[] serializedDataArrowSchema = serializeArrowSchema(dataArrowSchema);

     //// Create Column readers
     List<ColumnDescriptor> columns = requestedSchema.getColumns();
@@ -350,7 +355,14 @@ public void init() throws URISyntaxException, IOException {

     this.handle =
         Native.initRecordBatchReader(
-            filePath, fileSize, start, length, serializedRequestedArrowSchema, timeZoneId);
+            filePath,
+            fileSize,
+            start,
+            length,
+            nativeFilter,
+            serializedRequestedArrowSchema,
+            serializedDataArrowSchema,
+            timeZoneId);
     isInitialized = true;
   }

@@ -524,4 +536,11 @@ private int loadNextBatch() throws Throwable {
       return Option.apply(null); // None
     }
   }
+
+  private byte[] serializeArrowSchema(Schema schema) throws IOException {
+    ByteArrayOutputStream out = new ByteArrayOutputStream();
+    WriteChannel writeChannel = new WriteChannel(Channels.newChannel(out));
+    MessageSerializer.serialize(writeChannel, schema);
+    return out.toByteArray();
+  }
 }
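
init() now routes both the requested schema and the full data schema through the new serializeArrowSchema helper before crossing the JNI boundary. Below is a minimal, self-contained round-trip sketch of that Arrow IPC pattern using only stock Arrow Java APIs; the two-column example schema is invented for illustration, and the deserialize step mirrors what the native side's deserialize_schema conceptually does with the same bytes.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.channels.Channels;
import java.util.Arrays;

import org.apache.arrow.vector.ipc.ReadChannel;
import org.apache.arrow.vector.ipc.WriteChannel;
import org.apache.arrow.vector.ipc.message.MessageSerializer;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.Schema;

public class ArrowSchemaRoundTrip {
  // Same pattern as NativeBatchReader#serializeArrowSchema: write the schema as an
  // Arrow IPC message into a byte array that can be handed across the JNI boundary.
  static byte[] serialize(Schema schema) throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    MessageSerializer.serialize(new WriteChannel(Channels.newChannel(out)), schema);
    return out.toByteArray();
  }

  // Conceptually what the native side does when it reconstructs the schema from the bytes.
  static Schema deserialize(byte[] bytes) throws IOException {
    return MessageSerializer.deserializeSchema(
        new ReadChannel(Channels.newChannel(new ByteArrayInputStream(bytes))));
  }

  public static void main(String[] args) throws IOException {
    // Invented example schema, standing in for the Spark-to-Arrow converted schemas.
    Schema schema =
        new Schema(
            Arrays.asList(
                Field.nullable("id", new ArrowType.Int(64, true)),
                Field.nullable("name", new ArrowType.Utf8())));
    byte[] bytes = serialize(schema);
    System.out.println("serialized " + bytes.length + " bytes");
    System.out.println("round trip ok: " + schema.equals(deserialize(bytes)));
  }
}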

native/core/src/execution/planner.rs

Lines changed: 1 addition & 1 deletion
@@ -230,7 +230,7 @@ impl PhysicalPlanner {
     }

     /// Create a DataFusion physical expression from Spark physical expression
-    fn create_expr(
+    pub(crate) fn create_expr(
         &self,
         spark_expr: &Expr,
         input_schema: SchemaRef,
native/core/src/parquet/mod.rs

Lines changed: 23 additions & 2 deletions
@@ -45,6 +45,8 @@ use jni::{

 use self::util::jni::TypePromotionInfo;
 use crate::execution::operators::ExecutionError;
+use crate::execution::planner::PhysicalPlanner;
+use crate::execution::serde;
 use crate::execution::utils::SparkArrowConvert;
 use crate::parquet::data_type::AsBytes;
 use crate::parquet::parquet_exec::init_datasource_exec;
@@ -644,7 +646,9 @@ pub unsafe extern "system" fn Java_org_apache_comet_parquet_Native_initRecordBat
     file_size: jlong,
     start: jlong,
     length: jlong,
+    filter: jbyteArray,
     required_schema: jbyteArray,
+    data_schema: jbyteArray,
     session_timezone: jstring,
 ) -> jlong {
     try_unwrap_or_throw(&e, |mut env| unsafe {
@@ -666,6 +670,23 @@ pub unsafe extern "system" fn Java_org_apache_comet_parquet_Native_initRecordBat
         let required_schema_buffer = env.convert_byte_array(&required_schema_array)?;
         let required_schema = Arc::new(deserialize_schema(required_schema_buffer.as_bytes())?);

+        let data_schema_array = JByteArray::from_raw(data_schema);
+        let data_schema_buffer = env.convert_byte_array(&data_schema_array)?;
+        let data_schema = Arc::new(deserialize_schema(data_schema_buffer.as_bytes())?);
+
+        let planer = PhysicalPlanner::default();
+
+        let data_filters = if !filter.is_null() {
+            let filter_array = JByteArray::from_raw(filter);
+            let filter_buffer = env.convert_byte_array(&filter_array)?;
+            let filter_expr = serde::deserialize_expr(filter_buffer.as_slice())?;
+            Some(vec![
+                planer.create_expr(&filter_expr, Arc::clone(&data_schema))?
+            ])
+        } else {
+            None
+        };
+
         let file_groups =
             get_file_groups_single_file(&object_store_path, file_size as u64, start, length);

@@ -676,13 +697,13 @@ pub unsafe extern "system" fn Java_org_apache_comet_parquet_Native_initRecordBat

         let scan = init_datasource_exec(
             required_schema,
-            None,
+            Some(data_schema),
             None,
             None,
             object_store_url,
             file_groups,
             None,
-            None,
+            data_filters,
             session_timezone.as_str(),
         )?;

spark/src/main/scala/org/apache/comet/parquet/CometParquetFileFormat.scala

Lines changed: 30 additions & 21 deletions
@@ -114,36 +114,33 @@ class CometParquetFileFormat extends ParquetFileFormat with MetricsSupport with
         footerFileMetaData,
         datetimeRebaseModeInRead)

-      val pushed = if (parquetFilterPushDown) {
-        val parquetSchema = footerFileMetaData.getSchema
-        val parquetFilters = new ParquetFilters(
-          parquetSchema,
-          pushDownDate,
-          pushDownTimestamp,
-          pushDownDecimal,
-          pushDownStringPredicate,
-          pushDownInFilterThreshold,
-          isCaseSensitive,
-          datetimeRebaseSpec)
-        filters
-        // Collects all converted Parquet filter predicates. Notice that not all predicates can
-        // be converted (`ParquetFilters.createFilter` returns an `Option`). That's why a
-        // `flatMap` is used here.
-          .flatMap(parquetFilters.createFilter)
-          .reduceOption(FilterApi.and)
-      } else {
-        None
-      }
-      pushed.foreach(p => ParquetInputFormat.setFilterPredicate(sharedConf, p))
+      val parquetSchema = footerFileMetaData.getSchema
+      val parquetFilters = new ParquetFilters(
+        parquetSchema,
+        dataSchema,
+        pushDownDate,
+        pushDownTimestamp,
+        pushDownDecimal,
+        pushDownStringPredicate,
+        pushDownInFilterThreshold,
+        isCaseSensitive,
+        datetimeRebaseSpec)

       val recordBatchReader =
         if (nativeIcebergCompat) {
+          val pushed = if (parquetFilterPushDown) {
+            parquetFilters.createNativeFilters(filters)
+          } else {
+            None
+          }
           val batchReader = new NativeBatchReader(
             sharedConf,
             file,
             footer,
+            pushed.orNull,
             capacity,
             requiredSchema,
+            dataSchema,
             isCaseSensitive,
             useFieldId,
             ignoreMissingIds,
@@ -160,6 +157,18 @@ class CometParquetFileFormat extends ParquetFileFormat with MetricsSupport with
           }
           batchReader
         } else {
+          val pushed = if (parquetFilterPushDown) {
+            filters
+            // Collects all converted Parquet filter predicates. Notice that not all predicates
+            // can be converted (`ParquetFilters.createFilter` returns an `Option`). That's why
+            // a `flatMap` is used here.
+              .flatMap(parquetFilters.createFilter)
+              .reduceOption(FilterApi.and)
+          } else {
+            None
+          }
+          pushed.foreach(p => ParquetInputFormat.setFilterPredicate(sharedConf, p))
+
           val batchReader = new BatchReader(
             sharedConf,
             file,

spark/src/main/scala/org/apache/comet/parquet/CometParquetPartitionReaderFactory.scala

Lines changed: 1 addition & 0 deletions
@@ -199,6 +199,7 @@ case class CometParquetPartitionReaderFactory(
       val parquetSchema = footerFileMetaData.getSchema
       val parquetFilters = new ParquetFilters(
         parquetSchema,
+        readDataSchema,
         pushDownDate,
         pushDownTimestamp,
         pushDownDecimal,
0 commit comments

Comments
 (0)