Commit d8e24a9

fix(java): Spark writer properly handles string and array data (#4335)

We need to use setSafe(), as the initial allocation might not be large enough to hold all of the string data in the InternalRow, and a realloc might be required. Also adds support for Spark `ArrayType`, and augments the existing roundtrip writer unit test to include an `array<string>` column.

---------

Signed-off-by: Andrew Duffy <[email protected]>

1 parent 642d4d8 · commit d8e24a9
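
For context on the set() vs. setSafe() distinction the commit message describes: Arrow's variable-width vectors pre-allocate a fixed-size data buffer, set() writes into it without checking capacity, and setSafe() grows the buffer first when the incoming bytes do not fit. A minimal standalone sketch of the difference (vanilla Arrow imports rather than Vortex's relocated packages; vector name and sizes are illustrative):

import java.nio.charset.StandardCharsets;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.VarCharVector;

public final class SetSafeSketch {
    public static void main(String[] args) {
        try (BufferAllocator allocator = new RootAllocator();
                VarCharVector vector = new VarCharVector("value", allocator)) {
            // Budget only 8 bytes of string data across 4 slots.
            vector.allocateNew(8, 4);

            byte[] big = "a string longer than eight bytes".getBytes(StandardCharsets.UTF_8);

            // set() assumes the data buffer is already large enough; once the
            // 8-byte budget is exhausted it fails rather than growing the buffer:
            // vector.set(0, big);

            // setSafe() checks capacity first and reallocates as needed.
            vector.setSafe(0, big);
            vector.setValueCount(1);

            System.out.println(vector.getObject(0));
        }
    }
}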

File tree

3 files changed: +67 -14 lines changed


java/testfiles/Cargo.lock

Lines changed: 43 additions & 0 deletions
Some generated files are not rendered by default.

java/vortex-spark/src/main/java/dev/vortex/spark/write/VortexDataWriter.java

Lines changed: 18 additions & 4 deletions
@@ -8,6 +8,7 @@
 import dev.vortex.relocated.org.apache.arrow.memory.RootAllocator;
 import dev.vortex.relocated.org.apache.arrow.vector.*;
 import dev.vortex.relocated.org.apache.arrow.vector.VectorSchemaRoot;
+import dev.vortex.relocated.org.apache.arrow.vector.complex.ListVector;
 import dev.vortex.relocated.org.apache.arrow.vector.ipc.ArrowStreamWriter;
 import dev.vortex.spark.SparkTypes;
 import java.io.ByteArrayOutputStream;
@@ -20,6 +21,8 @@
 import java.util.List;
 import java.util.Map;
 import org.apache.spark.sql.catalyst.InternalRow;
+import org.apache.spark.sql.catalyst.expressions.SpecializedGetters;
+import org.apache.spark.sql.catalyst.util.ArrayData;
 import org.apache.spark.sql.connector.write.DataWriter;
 import org.apache.spark.sql.connector.write.WriterCommitMessage;
 import org.apache.spark.sql.types.*;
@@ -185,7 +188,8 @@ private void writeBatch() throws IOException {
     /**
      * Populates an Arrow vector with a value from an InternalRow.
      */
-    private void populateVector(FieldVector vector, DataType dataType, InternalRow row, int fieldIndex, int rowIndex) {
+    private void populateVector(
+            FieldVector vector, DataType dataType, SpecializedGetters row, int fieldIndex, int rowIndex) {
         if (dataType instanceof BooleanType) {
             ((BitVector) vector).set(rowIndex, row.getBoolean(fieldIndex) ? 1 : 0);
         } else if (dataType instanceof ByteType) {
@@ -203,12 +207,12 @@ private void populateVector(FieldVector vector, DataType dataType, InternalRow r
         } else if (dataType instanceof StringType) {
             UTF8String str = row.getUTF8String(fieldIndex);
             if (str != null) {
-                ((VarCharVector) vector).set(rowIndex, str.getBytes());
+                ((VarCharVector) vector).setSafe(rowIndex, str.getBytes());
             }
         } else if (dataType instanceof BinaryType) {
             byte[] bytes = row.getBinary(fieldIndex);
             if (bytes != null) {
-                ((VarBinaryVector) vector).set(rowIndex, bytes);
+                ((VarBinaryVector) vector).setSafe(rowIndex, bytes);
             }
         } else if (dataType instanceof DecimalType) {
             DecimalType decType = (DecimalType) dataType;
@@ -218,9 +222,19 @@ private void populateVector(FieldVector vector, DataType dataType, InternalRow r
                     .toJavaBigDecimal();
             ((DecimalVector) vector).set(rowIndex, decimal);
             }
+        } else if (dataType instanceof ArrayType) {
+            ArrayType arrayType = (ArrayType) dataType;
+            ArrayData data = row.getArray(fieldIndex);
+            ListVector listVector = ((ListVector) vector);
+            int writtenElements = listVector.getElementEndIndex(listVector.getLastSet());
+            listVector.startNewValue(rowIndex);
+            for (int i = 0; i < data.numElements(); i++) {
+                populateVector(listVector.getDataVector(), arrayType.elementType(), data, i, writtenElements + i);
+            }
+            listVector.endValue(rowIndex, data.numElements());
         } else {
             // For unsupported types, set null
-            vector.setNull(rowIndex);
+            throw new IllegalArgumentException("Unsupported data type: " + dataType);
         }
     }
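
The new `ArrayType` branch above leans on ListVector's offset bookkeeping: startNewValue(rowIndex) opens a list at the row, elements are written into the child data vector at a running offset, and endValue(rowIndex, n) records the list length. A standalone sketch of the same pattern (vanilla Arrow imports; the names and data are illustrative):

import java.nio.charset.StandardCharsets;
import java.util.List;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.VarCharVector;
import org.apache.arrow.vector.complex.ListVector;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.FieldType;

public final class ListVectorSketch {
    public static void main(String[] args) {
        List<List<String>> rows = List.of(List.of("Alpha", "Bravo"), List.of("Charlie"));
        try (BufferAllocator allocator = new RootAllocator();
                ListVector list = ListVector.empty("elements", allocator)) {
            // Attach a utf8 child vector, then allocate initial buffers.
            list.addOrGetVector(FieldType.nullable(new ArrowType.Utf8()));
            list.allocateNew();
            VarCharVector child = (VarCharVector) list.getDataVector();

            int childIndex = 0; // running offset into the child vector
            for (int row = 0; row < rows.size(); row++) {
                list.startNewValue(row); // open the list at this row
                for (String s : rows.get(row)) {
                    child.setSafe(childIndex++, s.getBytes(StandardCharsets.UTF_8));
                }
                list.endValue(row, rows.get(row).size()); // record the list length
            }
            list.setValueCount(rows.size()); // also sizes the child via offsets

            System.out.println(list.getObject(0)); // [Alpha, Bravo]
            System.out.println(list.getObject(1)); // [Charlie]
        }
    }
}

Note the running child offset: each row's elements are appended after everything already written, which is what the writer recovers with getElementEndIndex(getLastSet()) when it resumes mid-batch.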

java/vortex-spark/src/test/java/dev/vortex/spark/VortexDataSourceWriteTest.java

Lines changed: 6 additions & 10 deletions
@@ -15,12 +15,10 @@
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 import org.apache.spark.sql.*;
-import org.apache.spark.sql.api.java.UDF1;
 import org.apache.spark.sql.types.DataTypes;
 import org.apache.spark.sql.types.StructField;
 import org.apache.spark.sql.types.StructType;
 import org.junit.jupiter.api.*;
-import org.junit.jupiter.api.Assumptions;
 import org.junit.jupiter.api.io.TempDir;
 
 /**
@@ -68,7 +66,7 @@ public void testWriteAndReadVortexFiles() throws IOException {
 
         // Verify original data
         assertEquals(numRows, originalDf.count(), "Original DataFrame should have " + numRows + " rows");
-        assertEquals(2, originalDf.columns().length, "Original DataFrame should have 2 columns");
+        assertEquals(3, originalDf.columns().length, "Original DataFrame should have 3 columns");
 
         // When: Repartition to 2 partitions and write as Vortex
         Path outputPath = tempDir.resolve("vortex_output");
@@ -284,14 +282,12 @@ public void testSpecialCharactersAndNulls() throws IOException {
      * and their string representations.
      */
     private Dataset<Row> createTestDataFrame(int numRows) {
-        // Register UDF for integer to string conversion
-        spark.udf().register("intToString", (UDF1<Integer, String>) value -> "value_" + value, DataTypes.StringType);
-
         // Create DataFrame with monotonically increasing integers
-        Dataset<Row> df = spark.range(0, numRows)
-                .selectExpr("cast(id as int) as id", "concat('value_', cast(id as string)) as value");
-
-        return df;
+        return spark.range(0, numRows)
+                .selectExpr(
+                        "cast(id as int) as id",
+                        "concat('value_', cast(id as string)) as value",
+                        "array('Alpha', 'Bravo', 'Charlie') AS elements");
     }
 
     /**
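
The updated createTestDataFrame now produces three columns (id, value, array<string> elements), so the roundtrip test exercises the new array path end to end. A rough sketch of the write/read shape the test follows; the "vortex" format short name and the output path are assumptions for this sketch, not taken from the diff:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public final class VortexRoundtripSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("vortex-roundtrip-sketch")
                .master("local[2]")
                .getOrCreate();

        // Same three-column shape as createTestDataFrame: int, string, array<string>.
        Dataset<Row> df = spark.range(0, 100)
                .selectExpr(
                        "cast(id as int) as id",
                        "concat('value_', cast(id as string)) as value",
                        "array('Alpha', 'Bravo', 'Charlie') AS elements");

        // Write two partitions, then read them back. The "vortex" short name
        // is an assumed registration for this sketch.
        String out = "/tmp/vortex_output";
        df.repartition(2).write().format("vortex").save(out);

        Dataset<Row> roundtripped = spark.read().format("vortex").load(out);
        System.out.println("rows=" + roundtripped.count()
                + " cols=" + roundtripped.columns().length); // expect rows=100 cols=3

        spark.stop();
    }
}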
