
Commit 6605bcf

feat(schema): Add read + write support for shredded variants for AVRO
- Added support to write shredded types for HoodieRecordType.AVRO
- Added functional tests for testing newly added configs
1 parent d4ba92b commit 6605bcf

File tree: 9 files changed, +1093 −12 lines

hudi-common/src/main/java/org/apache/hudi/common/config/HoodieStorageConfig.java

Lines changed: 10 additions & 0 deletions
@@ -198,6 +198,16 @@ public class HoodieStorageConfig extends HoodieConfig {
           + "When disabled, only unshredded variant data can be read. "
           + "Equivalent to Spark's spark.sql.variant.allowReadingShredded.");
 
+  public static final ConfigProperty<String> PARQUET_VARIANT_SHREDDING_PROVIDER_CLASS = ConfigProperty
+      .key("hoodie.parquet.variant.shredding.provider.class")
+      .noDefaultValue()
+      .markAdvanced()
+      .sinceVersion("1.1.0")
+      .withDocumentation("Fully-qualified class name of the VariantShreddingProvider implementation "
+          + "used to shred variant values at write time in the Avro record path. "
+          + "The provider parses variant binary data and populates typed_value columns. "
+          + "When not set, the provider is auto-detected from the classpath.");
+
   public static final ConfigProperty<Boolean> WRITE_UTC_TIMEZONE = ConfigProperty
       .key("hoodie.parquet.write.utc-timezone.enabled")
       .defaultValue(true)
hudi-hadoop-common/src/main/java/org/apache/hudi/avro/HoodieAvroWriteSupport.java

Lines changed: 393 additions & 3 deletions
Large diffs are not rendered by default.
org/apache/hudi/avro/VariantShreddingProvider.java (new file)

Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.hudi.avro;

import org.apache.hudi.common.schema.HoodieSchema;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;

/**
 * Interface for shredding variant values at write time.
 * <p>
 * Implementations parse variant binary data (value + metadata bytes) and produce
 * a shredded {@link GenericRecord} with typed_value columns populated according
 * to the shredding schema.
 * <p>
 * This interface allows the variant binary parsing logic (which may depend on
 * engine-specific libraries like Spark's variant module) to be loaded via reflection,
 * keeping the core write support free of engine-specific dependencies.
 */
public interface VariantShreddingProvider {

  /**
   * Transform an unshredded variant GenericRecord into a shredded one.
   * <p>
   * The input record is expected to have:
   * <ul>
   *   <li>{@code value}: ByteBuffer containing the variant value binary</li>
   *   <li>{@code metadata}: ByteBuffer containing the variant metadata binary</li>
   * </ul>
   * <p>
   * The output record should conform to {@code shreddedSchema} and have:
   * <ul>
   *   <li>{@code value}: ByteBuffer or null (null when typed_value captures the full value)</li>
   *   <li>{@code metadata}: ByteBuffer (always present)</li>
   *   <li>{@code typed_value}: the typed representation extracted from the variant binary,
   *       or null if the variant type does not match the typed_value schema</li>
   * </ul>
   *
   * @param unshreddedVariant GenericRecord with {value: ByteBuffer, metadata: ByteBuffer}
   * @param shreddedSchema    target Avro schema with {value: nullable ByteBuffer, metadata: ByteBuffer, typed_value: type}
   * @param variantSchema     HoodieSchema.Variant containing the shredding schema information
   * @return a GenericRecord conforming to shreddedSchema with typed_value populated where possible
   */
  GenericRecord shredVariantRecord(
      GenericRecord unshreddedVariant,
      Schema shreddedSchema,
      HoodieSchema.Variant variantSchema);
}
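
To make the contract concrete, here is a minimal, hypothetical implementation sketch in Scala (the class name PassThroughVariantShreddingProvider is invented for illustration): it copies the binary value and metadata through and leaves typed_value null, which the javadoc above permits when nothing is extracted into the typed representation. A real provider, such as the Spark 4 one wired up by the writer factory below, would instead parse the variant binary and populate typed_value according to the shredding schema.

    import org.apache.avro.Schema
    import org.apache.avro.generic.{GenericData, GenericRecord}
    import org.apache.hudi.avro.VariantShreddingProvider
    import org.apache.hudi.common.schema.HoodieSchema

    // Hypothetical pass-through provider: keeps the variant in its binary form.
    class PassThroughVariantShreddingProvider extends VariantShreddingProvider {
      override def shredVariantRecord(unshreddedVariant: GenericRecord,
                                      shreddedSchema: Schema,
                                      variantSchema: HoodieSchema.Variant): GenericRecord = {
        val shredded = new GenericData.Record(shreddedSchema)
        // Copy the variant value and metadata binaries through unchanged.
        shredded.put("value", unshreddedVariant.get("value"))
        shredded.put("metadata", unshreddedVariant.get("metadata"))
        // A null typed_value signals that nothing was extracted into the typed column.
        shredded.put("typed_value", null)
        shredded
      }
    }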

hudi-hadoop-common/src/main/java/org/apache/hudi/io/storage/hadoop/HoodieAvroFileWriterFactory.java

Lines changed: 30 additions & 1 deletion
@@ -49,6 +49,7 @@
 import java.util.Properties;
 
 import static org.apache.hudi.common.config.HoodieStorageConfig.HFILE_WRITER_TO_ALLOW_DUPLICATES;
+import static org.apache.hudi.common.config.HoodieStorageConfig.PARQUET_VARIANT_SHREDDING_PROVIDER_CLASS;
 import static org.apache.parquet.avro.HoodieAvroParquetSchemaConverter.getAvroSchemaConverter;
 
 public class HoodieAvroFileWriterFactory extends HoodieFileWriterFactory {
@@ -123,9 +124,37 @@ protected HoodieFileWriter newOrcFileWriter(
   private HoodieAvroWriteSupport getHoodieAvroWriteSupport(HoodieSchema schema,
                                                            HoodieConfig config, boolean enableBloomFilter) {
     Option<BloomFilter> filter = enableBloomFilter ? Option.of(createBloomFilter(config)) : Option.empty();
+    HoodieSchema effectiveSchema = HoodieAvroWriteSupport.generateEffectiveSchema(schema, config);
+    Properties props = config.getProps();
+    // Auto-detect variant shredding provider from classpath if not explicitly configured
+    if (!props.containsKey(PARQUET_VARIANT_SHREDDING_PROVIDER_CLASS.key())) {
+      String detected = detectShreddingProvider();
+      if (detected != null) {
+        props.setProperty(PARQUET_VARIANT_SHREDDING_PROVIDER_CLASS.key(), detected);
+      }
+    }
     return (HoodieAvroWriteSupport) ReflectionUtils.loadClass(
         config.getStringOrDefault(HoodieStorageConfig.HOODIE_AVRO_WRITE_SUPPORT_CLASS),
         new Class<?>[] {MessageType.class, HoodieSchema.class, Option.class, Properties.class},
-        getAvroSchemaConverter(storage.getConf().unwrapAs(Configuration.class)).convert(schema), schema, filter, config.getProps());
+        getAvroSchemaConverter(storage.getConf().unwrapAs(Configuration.class)).convert(effectiveSchema), schema, filter, props);
+  }
+
+  /**
+   * Auto-detect a {@link org.apache.hudi.avro.VariantShreddingProvider} implementation
+   * available on the classpath. Returns the fully-qualified class name if found, or null.
+   */
+  private static String detectShreddingProvider() {
+    String[] candidates = {
+        "org.apache.hudi.variant.Spark4VariantShreddingProvider"
+    };
+    for (String candidate : candidates) {
+      try {
+        Class.forName(candidate);
+        return candidate;
+      } catch (ClassNotFoundException e) {
+        // not on classpath, try next
+      }
+    }
+    return null;
   }
 }
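
Since the candidate list only probes org.apache.hudi.variant.Spark4VariantShreddingProvider, a deployment that ships its own provider would have to set the class explicitly. A sketch of doing that through writer options, where df, basePath, and com.example.MyVariantShreddingProvider are placeholders, and the usual propagation of hoodie.* writer options into HoodieConfig is assumed:

    // Sketch: pin a custom provider instead of relying on auto-detection.
    df.write.format("hudi")
      .option("hoodie.table.name", "variant_tbl")
      .option("hoodie.parquet.variant.write.shredding.enabled", "true")
      .option("hoodie.parquet.variant.shredding.provider.class",
        "com.example.MyVariantShreddingProvider") // hypothetical custom implementation
      .mode("append")
      .save(basePath)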

hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieHadoopFsRelationFactory.scala

Lines changed: 12 additions & 0 deletions
@@ -62,6 +62,18 @@ abstract class HoodieBaseHadoopFsRelationFactory(val sqlContext: SQLContext,
                                                  val schemaSpec: Option[StructType],
                                                  val isBootstrap: Boolean
                                                 ) extends SparkAdapterSupport with HoodieHadoopFsRelationFactory with Logging {
+  // Propagate Hudi's variant allow-reading-shredded config to Spark's SQLConf.
+  // ParquetToSparkSchemaConverter reads this from SQLConf.get(), so it must be set
+  // here, during table resolution, before query execution starts.
+  if (HoodieSparkUtils.gteqSpark4_0) {
+    val sqlConf = sqlContext.sparkSession.sessionState.conf
+    val hoodieParquetAllowReadingShreddedConfKey = "hoodie.parquet.variant.allow.reading.shredded"
+    val allowReadingShredded = options.getOrElse(
+      hoodieParquetAllowReadingShreddedConfKey,
+      sqlConf.getConfString(hoodieParquetAllowReadingShreddedConfKey, "true"))
+    sqlConf.setConfString("spark.sql.variant.allowReadingShredded", allowReadingShredded)
+  }
 
   protected lazy val sparkSession: SparkSession = sqlContext.sparkSession
   protected lazy val optParams: Map[String, String] = options
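
On a Spark 4.x session this propagation can be observed after a Hudi read is resolved. A sketch, where basePath is a placeholder and hoodie.parquet.variant.allow.reading.shredded is assumed to be left at its default of true:

    // Sketch: the Hudi-level setting should now be mirrored into Spark's SQLConf,
    // where ParquetToSparkSchemaConverter picks it up.
    val df = spark.read.format("hudi").load(basePath)
    println(spark.conf.get("spark.sql.variant.allowReadingShredded")) // expected: "true"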

hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/schema/TestVariantDataType.scala

Lines changed: 166 additions & 0 deletions
@@ -23,6 +23,10 @@ import org.apache.hudi.HoodieSparkUtils
 import org.apache.hudi.common.testutils.HoodieTestUtils
 import org.apache.hudi.common.util.StringUtils
 
+import org.apache.hadoop.fs.{FileSystem, Path => HadoopPath}
+import org.apache.parquet.hadoop.ParquetFileReader
+import org.apache.parquet.hadoop.util.HadoopInputFile
+import org.apache.parquet.schema.{GroupType, MessageType, Type}
 import org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase
 
 
@@ -171,4 +175,166 @@ class TestVariantDataType extends HoodieSparkSqlTestBase {
 
     spark.sql(s"drop table $tableName")
   }
+
+  test("Test Shredded Variant Write and Read + Validate Parquet Schema after Write") {
+    assume(HoodieSparkUtils.gteqSpark4_0, "Variant type requires Spark 4.0 or higher")
+
+    // Test 1: Shredding enabled with forced schema → parquet should have typed_value
+    withRecordType()(withTempDir { tmp =>
+      val tableName = generateTableName
+      spark.sql(
+        s"""
+           |create table $tableName (
+           |  id int,
+           |  name string,
+           |  v variant,
+           |  ts long
+           |) using hudi
+           | location '${tmp.getCanonicalPath}'
+           | tblproperties (
+           |  primaryKey = 'id',
+           |  type = 'cow',
+           |  preCombineField = 'ts'
+           | )
+       """.stripMargin)
+
+      spark.sql("set hoodie.parquet.variant.write.shredding.enabled = true")
+      spark.sql("set hoodie.parquet.variant.allow.reading.shredded = true")
+      spark.sql("set hoodie.parquet.variant.force.shredding.schema.for.test = a int, b string")
+
+      spark.sql(
+        s"""
+           |insert into $tableName values
+           | (1, 'row1', parse_json('{"a": 1, "b": "hello"}'), 1000)
+       """.stripMargin)
+      checkAnswer(s"select id, name, cast(v as string), ts from $tableName order by id")(
+        Seq(1, "row1", "{\"a\":1,\"b\":\"hello\"}", 1000)
+      )
+
+      // Verify parquet schema has shredded structure with typed_value
+      val parquetFiles = listDataParquetFiles(tmp.getCanonicalPath)
+      assert(parquetFiles.nonEmpty, "Should have at least one data parquet file")
+
+      parquetFiles.foreach { filePath =>
+        val schema = readParquetSchema(filePath)
+        val variantGroup = getFieldAsGroup(schema, "v")
+        assert(groupContainsField(variantGroup, "typed_value"),
+          s"Shredded variant should have typed_value field. Schema:\n$variantGroup")
+        val valueField = variantGroup.getType(variantGroup.getFieldIndex("value"))
+        assert(valueField.getRepetition == Type.Repetition.OPTIONAL,
+          "Shredded variant value field should be OPTIONAL")
+        val metadataField = variantGroup.getType(variantGroup.getFieldIndex("metadata"))
+        assert(metadataField.getRepetition == Type.Repetition.REQUIRED,
+          "Shredded variant metadata field should be REQUIRED")
+      }
+    })
+  }
+
+  test("Test Unshredded Variant Write and Read + Validate Parquet Schema after Write") {
+    assume(HoodieSparkUtils.gteqSpark4_0, "Variant type requires Spark 4.0 or higher")
+    // Shredding disabled: parquet should NOT have typed_value
+    withRecordType()(withTempDir { tmp =>
+      val tableName = generateTableName
+      spark.sql(
+        s"""
+           |create table $tableName (
+           |  id int,
+           |  name string,
+           |  v variant,
+           |  ts long
+           |) using hudi
+           | location '${tmp.getCanonicalPath}'
+           | tblproperties (
+           |  primaryKey = 'id',
+           |  type = 'cow',
+           |  preCombineField = 'ts'
+           | )
+       """.stripMargin)
+
+      spark.sql(s"set hoodie.parquet.variant.write.shredding.enabled = false")
+
+      spark.sql(
+        s"""
+           |insert into $tableName values
+           | (1, 'row1', parse_json('{"a": 1, "b": "hello"}'), 1000)
+       """.stripMargin)
+
+      checkAnswer(s"select id, name, cast(v as string), ts from $tableName order by id")(
+        Seq(1, "row1", "{\"a\":1,\"b\":\"hello\"}", 1000)
+      )
+
+      // Verify parquet schema does NOT have typed_value
+      val parquetFiles = listDataParquetFiles(tmp.getCanonicalPath)
+      assert(parquetFiles.nonEmpty, "Should have at least one data parquet file")
+
+      parquetFiles.foreach { filePath =>
+        val schema = readParquetSchema(filePath)
+        val variantGroup = getFieldAsGroup(schema, "v")
+        assert(!groupContainsField(variantGroup, "typed_value"),
+          s"Non-shredded variant should NOT have typed_value field. Schema:\n$variantGroup")
+        val valueField = variantGroup.getType(variantGroup.getFieldIndex("value"))
+        assert(valueField.getRepetition == Type.Repetition.REQUIRED,
+          "Non-shredded variant value field should be REQUIRED")
+      }
+
+      // Verify data can still be read back for the non-shredded case
+      checkAnswer(s"select id, name, cast(v as string), ts from $tableName order by id")(
+        Seq(1, "row1", "{\"a\":1,\"b\":\"hello\"}", 1000)
+      )
+    })
+  }
+
+  /**
+   * Lists data parquet files in the table directory, excluding Hudi metadata files.
+   */
+  private def listDataParquetFiles(tablePath: String): Seq[String] = {
+    val conf = spark.sparkContext.hadoopConfiguration
+    val fs = FileSystem.get(new HadoopPath(tablePath).toUri, conf)
+    val iter = fs.listFiles(new HadoopPath(tablePath), true)
+    val files = scala.collection.mutable.ArrayBuffer[String]()
+    while (iter.hasNext) {
+      val file = iter.next()
+      val path = file.getPath.toString
+      if (path.endsWith(".parquet") && !path.contains(".hoodie")) {
+        files += path
+      }
+    }
+    files.toSeq
+  }
+
+  /**
+   * Reads the Parquet schema (MessageType) from a parquet file.
+   */
+  private def readParquetSchema(filePath: String): MessageType = {
+    val conf = spark.sparkContext.hadoopConfiguration
+    val inputFile = HadoopInputFile.fromPath(new HadoopPath(filePath), conf)
+    val reader = ParquetFileReader.open(inputFile)
+    try {
+      reader.getFooter.getFileMetaData.getSchema
+    } finally {
+      reader.close()
+    }
+  }
+
+  /**
+   * Gets a named field from a GroupType (MessageType) and returns it as a GroupType.
+   * Uses getFieldIndex(String) + getType(int) to avoid Scala overload resolution issues.
+   */
+  private def getFieldAsGroup(parent: GroupType, fieldName: String): GroupType = {
+    val idx: Int = parent.getFieldIndex(fieldName)
+    parent.getType(idx).asGroupType()
+  }
+
+  /**
+   * Checks whether a GroupType contains a field with the given name.
+   * Uses try/catch on getFieldIndex to avoid Scala-Java collection converter dependencies.
+   */
+  private def groupContainsField(group: GroupType, fieldName: String): Boolean = {
+    try {
+      group.getFieldIndex(fieldName)
+      true
+    } catch {
+      case _: Exception => false
+    }
+  }
 }
