2626import org .apache .comet .vector .NativeUtil ;
2727
2828/**
29- * An iterator that can be used to get batches of Arrow arrays from a Spark iterator of
30- * ColumnarBatch. It will consume input iterator and return Arrow arrays by addresses. This is
31- * called by native code to retrieve Arrow arrays from Spark through JNI.
29+ * Iterator for fetching batches from JVM to native code. Usually called via JNI from native
30+ * ScanExec.
31+ *
32+ * <p>Batches are owned by the JVM. Native code can safely access the batch after calling `next` but
33+ * the native code must not retain references to the batch because the next call to `hasNext` will
34+ * signal to the JVM that the batch can be closed.
3235 */
3336public class CometBatchIterator {
34- final Iterator <ColumnarBatch > input ;
35- final NativeUtil nativeUtil ;
37+ private final Iterator <ColumnarBatch > input ;
38+ private final NativeUtil nativeUtil ;
39+ private ColumnarBatch previousBatch = null ;
3640 private ColumnarBatch currentBatch = null ;
3741
3842 CometBatchIterator (Iterator <ColumnarBatch > input , NativeUtil nativeUtil ) {
@@ -41,11 +45,16 @@ public class CometBatchIterator {
4145 }
4246
4347 /**
44- * Fetch the next input batch.
48+ * Fetch the next input batch and allow the previous batch to be closed (this may not happen
49+ * immediately).
4550 *
4651 * @return Number of rows in next batch or -1 if no batches left.
4752 */
4853 public int hasNext () {
54+
55+ // release reference to previous batch
56+ previousBatch = null ;
57+
4958 if (currentBatch == null ) {
5059 if (input .hasNext ()) {
5160 currentBatch = input .next ();
@@ -59,7 +68,7 @@ public int hasNext() {
5968 }
6069
6170 /**
62- * Get the next batches of Arrow arrays.
71+ * Get the next batch of Arrow arrays.
6372 *
6473 * @param arrayAddrs The addresses of the ArrowArray structures.
6574 * @param schemaAddrs The addresses of the ArrowSchema structures.
@@ -69,8 +78,16 @@ public int next(long[] arrayAddrs, long[] schemaAddrs) {
6978 if (currentBatch == null ) {
7079 return -1 ;
7180 }
81+
82+ // export the batch using the Arrow C Data Interface
7283 int numRows = nativeUtil .exportBatch (arrayAddrs , schemaAddrs , currentBatch );
84+
85+ // keep a reference to the exported batch so that it doesn't get garbage collected
86+ // while the native code is still processing it
87+ previousBatch = currentBatch ;
88+
7389 currentBatch = null ;
90+
7491 return numRows ;
7592 }
7693}
0 commit comments