
Commit a8ec23d

Merge remote-tracking branch 'delta-io/master' into spark-4.0-upgrade-merge
2 parents: e14d2cf + d4ffc42

168 files changed, +7371 -1288 lines changed


.github/workflows/new_pull_request.yaml

Lines changed: 0 additions & 16 deletions
This file was deleted.

.github/workflows/new_updated_issue.yaml

Lines changed: 0 additions & 32 deletions
This file was deleted.

.github/workflows/spark_test.yaml

Lines changed: 3 additions & 2 deletions
@@ -16,6 +16,7 @@ jobs:
         with:
           PATTERNS: |
             **
+            .github/workflows/**
             !kernel/**
             !connectors/**
       - name: install java
@@ -56,9 +57,9 @@ jobs:
           pipenv run pip install cryptography==37.0.4
           pipenv run pip install twine==4.0.1
           pipenv run pip install wheel==0.33.4
-          pipenv run pip install setuptools==41.0.1
+          pipenv run pip install setuptools==41.1.0
           pipenv run pip install pydocstyle==3.0.0
-          pipenv run pip install pandas==1.0.5
+          pipenv run pip install pandas==1.1.3
           pipenv run pip install pyarrow==8.0.0
           pipenv run pip install numpy==1.20.3
         if: steps.git-diff.outputs.diff

.github/workflows/unidoc.yaml

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+name: "Unidoc generation"
+on: [push, pull_request]
+jobs:
+  build:
+    name: "Generate unidoc"
+    runs-on: ubuntu-20.04
+    strategy:
+      matrix:
+        # These Scala versions must match those in the build.sbt
+        scala: [2.13.8, 2.12.17]
+    steps:
+      - name: install java
+        uses: actions/setup-java@v3
+        with:
+          distribution: "zulu"
+          java-version: "8"
+      - uses: actions/checkout@v3
+      - name: generate unidoc
+        run: build/sbt "++ ${{ matrix.scala }}" unidoc

.github/workflows/updated_pull_request.yaml

Lines changed: 0 additions & 22 deletions
This file was deleted.

PROTOCOL.md

Lines changed: 1 addition & 1 deletion
@@ -1660,7 +1660,7 @@ The concrete format is as follows, with all numerical values written in big endi
 
 Bytes | Name | Description
 -|-|-
-0 — 1 | version | The format version of this file: `1` for the format described here.
+0 | version | The format version of this file: `1` for the format described here.
 `repeat for each DV i` | | For each DV
 `<start of i>` — `<start of i> + 3` | dataSize | Size of this DV’s data (without the checksum)
 `<start of i> + 4` — `<start of i> + 4 + dataSize - 1` | bitmapData | One 64-bit RoaringBitmap serialised as described above.
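For orientation, here is a small hedged Scala sketch of how a reader might walk the header fields named in this table. The function name is invented, checksum verification and multi-DV framing are omitted, and the details beyond what the table states are assumptions for illustration only.

```scala
import java.io.{DataInputStream, FileInputStream}

// Illustrative sketch only: reads the version byte and one DV entry as laid out
// in the table above; all numerical values are big endian, as the spec requires.
def readDeletionVectorHeader(path: String): Array[Byte] = {
  val in = new DataInputStream(new FileInputStream(path))
  try {
    val version = in.readByte()                // byte 0: format version, expected to be 1
    require(version == 1, s"Unsupported DV file format version: $version")
    val dataSize = in.readInt()                // big-endian 32-bit size of this DV's data
    val bitmapData = new Array[Byte](dataSize) // one 64-bit RoaringBitmap, serialised as described
    in.readFully(bitmapData)
    bitmapData                                 // the checksum (not shown here) would follow
  } finally in.close()
}
```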

build.sbt

Lines changed: 5 additions & 0 deletions
@@ -300,6 +300,11 @@ lazy val kernelDefaults = (project in file("kernel/kernel-defaults"))
     "commons-io" % "commons-io" % "2.8.0" % "test",
     "com.novocode" % "junit-interface" % "0.11" % "test",
     "org.slf4j" % "slf4j-log4j12" % "1.7.36" % "test",
+    // JMH dependencies allow writing micro-benchmarks for testing performance of components.
+    // JMH has framework to define benchmarks and takes care of many common functionalities
+    // such as warm runs, cold runs, defining benchmark parameter variables etc.
+    "org.openjdk.jmh" % "jmh-core" % "1.37" % "test",
+    "org.openjdk.jmh" % "jmh-generator-annprocess" % "1.37" % "test",
 
     "org.apache.spark" %% "spark-hive" % sparkVersion % "test" classifier "tests",
     "org.apache.spark" %% "spark-sql" % sparkVersion % "test" classifier "tests",

connectors/flink/src/test/java/io/delta/flink/sink/DeltaSinkStreamingExecutionITCase.java

Lines changed: 6 additions & 0 deletions
@@ -192,6 +192,9 @@ public void testDeltaSink(boolean isPartitioned, boolean triggerFailover) throws
      *
      * @param exceptionMode whether to throw an exception before or after Delta log commit.
      */
+    @Disabled(
+        "This test is flaky, for some runs it fails with unexpected numbers of files. "
+            + "Investigation of if this is a connector or test issue is ongoing")
     @ResourceLock("StreamingFailoverDeltaGlobalCommitter")
     @ParameterizedTest(name = "isPartitioned = {0}, exceptionMode = {1}")
     @CsvSource({
@@ -335,6 +338,9 @@ public void shouldResumeSink_savepointNoDrainState() throws Exception {
             .hasNoDuplicateAddFiles();
     }
 
+    @Disabled(
+        "This test is flaky, for some runs it fails with 'Seems there was a duplicated AddFile in"
+            + " Delta log. Investigation of if this is a connector or test issue is ongoing")
     @ParameterizedTest(
         name = "init parallelism level = {0}, parallelism level after resuming job = {1}")
     @CsvSource({"3, 3", "3, 6", "6, 3"})

connectors/golden-tables/src/test/scala/io/delta/golden/GoldenTables.scala

Lines changed: 1 addition & 0 deletions
@@ -378,6 +378,7 @@ class GoldenTables extends QueryTest with SharedSparkSession {
 
     val commitInfoFile = CommitInfo(
       version = Some(0L),
+      inCommitTimestamp = None,
       timestamp = new Timestamp(1540415658000L),
       userId = Some("user_0"),
       userName = Some("username_0"),

iceberg/src/main/scala/org/apache/spark/sql/delta/IcebergFileManifest.scala

Lines changed: 59 additions & 42 deletions
@@ -22,7 +22,7 @@ import org.apache.spark.sql.delta.{DeltaColumnMapping, SerializableFileStatus}
 import org.apache.spark.sql.delta.sources.DeltaSQLConf
 import org.apache.spark.sql.delta.util.{DateFormatter, TimestampFormatter}
 import org.apache.hadoop.fs.Path
-import org.apache.iceberg.{PartitionData, RowLevelOperationMode, Table, TableProperties}
+import org.apache.iceberg.{PartitionData, RowLevelOperationMode, StructLike, Table, TableProperties}
 import org.apache.iceberg.transforms.IcebergPartitionUtil
 
 import org.apache.spark.internal.Logging
@@ -48,6 +48,31 @@ class IcebergFileManifest(
 
   val basePath = table.location()
 
+  val icebergSchema = table.schema()
+
+  // we must use field id to look up the partition value; consider scenario with iceberg
+  // behavior chance since 1.4.0:
+  // 1) create table with partition schema (a[col_name]: 1[field_id]), add file1;
+  //    The partition data for file1 is (a:1:some_part_value)
+  // 2) add new partition col b and the partition schema becomes (a: 1, b: 2), add file2;
+  //    the partition data for file2 is (a:1:some_part_value, b:2:some_part_value)
+  // 3) remove partition col a, then add file3;
+  //    for iceberg < 1.4.0: the partFields is (a:1(void), b:2); the partition data for
+  //    file3 is (a:1(void):null, b:2:some_part_value);
+  //    for iceberg 1.4.0: the partFields is (b:2); When it reads file1 (a:1:some_part_value),
+  //                       it must use the field_id instead of index to look up the partition
+  //                       value, as the partField and partitionData from file1 have different
+  //                       ordering and thus same index indicates different column.
+  val physicalNameToField = table.spec().fields().asScala.collect {
+    case field if field.transform().toString != VOID_TRANSFORM =>
+      DeltaColumnMapping.getPhysicalName(partitionSchema(field.name)) -> field
+  }.toMap
+
+  val dateFormatter = DateFormatter()
+
+  val timestampFormatter =
+    TimestampFormatter(ConvertUtils.timestampPartitionPattern, java.util.TimeZone.getDefault)
+
   override def numFiles: Long = {
     if (_numFiles.isEmpty) getFileSparkResults()
     _numFiles.get
@@ -77,30 +102,6 @@
     val schemaBatchSize =
       spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_IMPORT_BATCH_SIZE_SCHEMA_INFERENCE)
 
-    val partFields = table.spec().fields().asScala
-    val icebergSchema = table.schema()
-    // we must use field id to look up the partition value; consider scenario with iceberg
-    // behavior chance since 1.4.0:
-    // 1) create table with partition schema (a[col_name]: 1[field_id]), add file1;
-    //    The partition data for file1 is (a:1:some_part_value)
-    // 2) add new partition col b and the partition schema becomes (a: 1, b: 2), add file2;
-    //    the partition data for file2 is (a:1:some_part_value, b:2:some_part_value)
-    // 3) remove partition col a, then add file3;
-    //    for iceberg < 1.4.0: the partFields is (a:1(void), b:2); the partition data for
-    //    file3 is (a:1(void):null, b:2:some_part_value);
-    //    for iceberg 1.4.0: the partFields is (b:2); When it reads file1 (a:1:some_part_value),
-    //                       it must use the field_id instead of index to look up the partition
-    //                       value, as the partField and partitionData from file1 have different
-    //                       ordering and thus same index indicates different column.
-    val physicalNameToField = partFields.collect {
-      case field if field.transform().toString != VOID_TRANSFORM =>
-        DeltaColumnMapping.getPhysicalName(partitionSchema(field.name)) -> field
-    }.toMap
-
-    val dateFormatter = DateFormatter()
-    val timestampFormatter = TimestampFormatter(ConvertUtils.timestampPartitionPattern,
-      java.util.TimeZone.getDefault)
-
     // This flag is strongly not recommended to turn on, but we still provide a flag for regression
     // purpose.
     val unsafeConvertMorTable =
@@ -131,23 +132,9 @@
           s"Please trigger an Iceberg compaction and retry the command.")
       }
       val partitionValues = if (spark.sessionState.conf.getConf(
-        DeltaSQLConf.DELTA_CONVERT_ICEBERG_USE_NATIVE_PARTITION_VALUES)) {
-
-        val icebergPartition = fileScanTask.file().partition()
-        val icebergPartitionData = icebergPartition.asInstanceOf[PartitionData]
-        val fieldIdToIdx = icebergPartitionData.getPartitionType.fields().asScala.zipWithIndex
-          .map(kv => kv._1.fieldId() -> kv._2).toMap
-        val physicalNameToPartValueMap = physicalNameToField
-          .map { case (physicalName, field) =>
-            val fieldIndex = fieldIdToIdx.get(field.fieldId())
-            val partValueAsString = fieldIndex.map {idx =>
-              val partValue = icebergPartitionData.get(idx)
-              IcebergPartitionUtil.partitionValueToString(
-                field, partValue, icebergSchema, dateFormatter, timestampFormatter)
-            }.getOrElse(null)
-            physicalName -> partValueAsString
-          }
-        Some(physicalNameToPartValueMap)
+          DeltaSQLConf.DELTA_CONVERT_ICEBERG_USE_NATIVE_PARTITION_VALUES)) {
+        Some(convertIcebergPartitionToPartitionValues(
+          fileScanTask.file().partition()))
       } else None
       (filePath, partitionValues)
     }
@@ -171,4 +158,34 @@
   }
 
   override def close(): Unit = fileSparkResults.map(_.unpersist())
+
+  def convertIcebergPartitionToPartitionValues(partition: StructLike):
+    Map[String, String] = {
+    val icebergPartitionData = partition.asInstanceOf[PartitionData]
+    val fieldIdToIdx = icebergPartitionData.getPartitionType
+      .fields()
+      .asScala
+      .zipWithIndex
+      .map(kv => kv._1.fieldId() -> kv._2)
+      .toMap
+    val physicalNameToPartValueMap = physicalNameToField
+      .map {
+        case (physicalName, field) =>
+          val fieldIndex = fieldIdToIdx.get(field.fieldId())
+          val partValueAsString = fieldIndex
+            .map { idx =>
+              val partValue = icebergPartitionData.get(idx)
+              IcebergPartitionUtil.partitionValueToString(
+                field,
+                partValue,
+                icebergSchema,
+                dateFormatter,
+                timestampFormatter
+              )
+            }
+            .getOrElse(null)
+          physicalName -> partValueAsString
+      }
+    physicalNameToPartValueMap
+  }
 }
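As a companion to the comment in the diff above, here is a small self-contained Scala sketch, with invented field ids and values, of why the lookup goes through field ids rather than positional indexes once partition columns have been dropped and re-added. Nothing here is taken from IcebergFileManifest beyond the idea it already documents.

```scala
// Hypothetical data: file1 was written under spec (a -> field id 1); the current
// spec, after dropping column a and adding b, is (b -> field id 2).
val file1PartitionData: Seq[(Int, String)] = Seq(1 -> "2021-01-01") // (fieldId, value)

// Build the same kind of fieldId -> index map the manifest builds per file.
val fieldIdToIdx: Map[Int, Int] =
  file1PartitionData.zipWithIndex.map { case ((fieldId, _), idx) => fieldId -> idx }.toMap

// Looking up the current spec's field id 2 correctly yields no value for file1,
// whereas a positional lookup at index 0 would wrongly return column a's value.
val valueForB: Option[String] = fieldIdToIdx.get(2).map(idx => file1PartitionData(idx)._2)
assert(valueForB.isEmpty)
```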
