
Commit 592661a

Author: annie-mac
Commit message: change
1 parent: 1056d8f

File tree

4 files changed (+26, -7 lines)


sdk/cosmos/azure-cosmos-spark_3-3_2-12/src/main/scala/com/azure/cosmos/spark/SparkInternalsBridge.scala

Lines changed: 9 additions & 0 deletions
@@ -15,6 +15,15 @@ import java.util.Locale
 import java.util.concurrent.atomic.{AtomicBoolean, AtomicReference}
 import scala.collection.mutable.ArrayBuffer
 
+class SparkInternalsBridge {
+  // Only used in ChangeFeedMetricsListener, which is easier for test validation
+  def getInternalCustomTaskMetricsAsSQLMetric(
+    knownCosmosMetricNames: Set[String],
+    taskMetrics: TaskMetrics): Map[String, SQLMetric] = {
+    SparkInternalsBridge.getInternalCustomTaskMetricsAsSQLMetricInternal(knownCosmosMetricNames, taskMetrics)
+  }
+}
+
 object SparkInternalsBridge extends BasicLoggingTrait {
   private val SPARK_REFLECTION_ACCESS_ALLOWED_PROPERTY = "COSMOS.SPARK_REFLECTION_ACCESS_ALLOWED"
   private val SPARK_REFLECTION_ACCESS_ALLOWED_VARIABLE = "COSMOS_SPARK_REFLECTION_ACCESS_ALLOWED"
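The new instance-level wrapper delegates to the companion object so that consumers such as ChangeFeedMetricsListener can take the bridge as an injectable dependency and substitute it in tests. A minimal sketch of that pattern follows; the consumer class, stub, and metric name are illustrative assumptions, not code from this commit:

import org.apache.spark.executor.TaskMetrics
import org.apache.spark.sql.execution.metric.SQLMetric

// Hypothetical consumer: depends on a bridge instance rather than the
// SparkInternalsBridge companion object, so tests can swap it out.
class ChangeFeedMetricsCollector(bridge: SparkInternalsBridge) {
  def cosmosMetrics(taskMetrics: TaskMetrics): Map[String, SQLMetric] =
    bridge.getInternalCustomTaskMetricsAsSQLMetric(
      Set("changeFeedItemsCnt"), // illustrative metric name
      taskMetrics)
}

// Hypothetical test stub: returns canned metrics without reflective access
// to Spark internals.
class StubSparkInternalsBridge(canned: Map[String, SQLMetric]) extends SparkInternalsBridge {
  override def getInternalCustomTaskMetricsAsSQLMetric(
    knownCosmosMetricNames: Set[String],
    taskMetrics: TaskMetrics): Map[String, SQLMetric] = canned
}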

sdk/cosmos/azure-cosmos-spark_3-4_2-12/src/main/scala/com/azure/cosmos/spark/SparkInternalsBridge.scala

Lines changed: 9 additions & 0 deletions
@@ -14,6 +14,15 @@ import java.lang.reflect.Method
 import java.util.Locale
 import java.util.concurrent.atomic.{AtomicBoolean, AtomicReference}
 
+class SparkInternalsBridge {
+  // Only used in ChangeFeedMetricsListener, which is easier for test validation
+  def getInternalCustomTaskMetricsAsSQLMetric(
+    knownCosmosMetricNames: Set[String],
+    taskMetrics: TaskMetrics): Map[String, SQLMetric] = {
+    SparkInternalsBridge.getInternalCustomTaskMetricsAsSQLMetricInternal(knownCosmosMetricNames, taskMetrics)
+  }
+}
+
 object SparkInternalsBridge extends BasicLoggingTrait {
   private val SPARK_REFLECTION_ACCESS_ALLOWED_PROPERTY = "COSMOS.SPARK_REFLECTION_ACCESS_ALLOWED"
   private val SPARK_REFLECTION_ACCESS_ALLOWED_VARIABLE = "COSMOS_SPARK_REFLECTION_ACCESS_ALLOWED"
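At a call site inside a running Spark task, the bridge would be combined with TaskContext to read the current task's metrics. A rough usage sketch under that assumption (the metric name is again illustrative):

import org.apache.spark.TaskContext
import org.apache.spark.sql.execution.metric.SQLMetric

val bridge = new SparkInternalsBridge()
val cosmosMetrics: Map[String, SQLMetric] =
  Option(TaskContext.get()) match {
    // Inside a task: resolve the known Cosmos custom metrics for this task.
    case Some(ctx) =>
      bridge.getInternalCustomTaskMetricsAsSQLMetric(
        Set("changeFeedItemsCnt"), // illustrative metric name
        ctx.taskMetrics())
    // Outside a task (e.g. on the driver) there are no task metrics.
    case None => Map.empty[String, SQLMetric]
  }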

sdk/cosmos/azure-cosmos-spark_3_2-12/docs/configuration-reference.md

Lines changed: 7 additions & 6 deletions
@@ -112,12 +112,13 @@ Used to influence the json serialization/deserialization behavior
 | `spark.cosmos.serialization.dateTimeConversionMode` | `Default` | The date/time conversion mode (`Default`, `AlwaysEpochMilliseconds`, `AlwaysEpochMillisecondsWithSystemDefaultTimezone`). With `Default` the standard Spark 3.* behavior is used (`java.sql.Date`/`java.time.LocalDate` are converted to EpochDay, `java.sql.Timestamp`/`java.time.Instant` are converted to MicrosecondsFromEpoch). With `AlwaysEpochMilliseconds` the same behavior the Cosmos DB connector for Spark 2.4 used is applied - `java.sql.Date`, `java.time.LocalDate`, `java.sql.Timestamp` and `java.time.Instant` are converted to MillisecondsFromEpoch. The behavior for `AlwaysEpochMillisecondsWithSystemDefaultTimezone` is identical with `AlwaysEpochMilliseconds` except that it will assume System default time zone / Spark session time zone (specified via `spark.sql.session.timezone`) instead of UTC when the date/time to be parsed has no explicit time zone. |
 
 #### Change feed (only for Spark-Streaming using `cosmos.oltp.changeFeed` data source, which is read-only) configuration
 | Config Property Name | Default | Description |
 |:---|:---|:---|
 | `spark.cosmos.changeFeed.startFrom` | `Beginning` | ChangeFeed Start from settings (`Now`, `Beginning` or a certain point in time (UTC) for example `2020-02-10T14:15:03`) - the default value is `Beginning`. If the write config contains a `checkpointLocation` and any checkpoints exist, the stream is always continued independent of the `spark.cosmos.changeFeed.startFrom` settings - you need to change `checkpointLocation` or delete checkpoints to restart the stream if that is the intention. |
 | `spark.cosmos.changeFeed.mode` | `Incremental/LatestVersion` | ChangeFeed mode (`Incremental/LatestVersion` or `FullFidelity/AllVersionsAndDeletes`) - NOTE: `FullFidelity/AllVersionsAndDeletes` is in experimental state right now. It requires that the subscription/account has been enabled for the private preview and there are known breaking changes that will happen for `FullFidelity/AllVersionsAndDeletes` (schema of the returned documents). It is recommended to only use `FullFidelity/AllVersionsAndDeletes` for non-production scenarios at this point. |
 | `spark.cosmos.changeFeed.itemCountPerTriggerHint` | None (process all available data in first micro-batch) | Approximate maximum number of items read from change feed for each micro-batch/trigger. If not set, all available data in the changefeed is going to be processed in the first micro-batch. This could overload the client-resources (especially memory), so choosing a value to cap the resource consumption in the Spark executors is advisable here. Usually a reasonable value would be at least in the 100-thousands or single-digit millions. |
 | `spark.cosmos.changeFeed.batchCheckpointLocation` | None | Can be used to generate checkpoints when using change feed queries in batch mode - and proceeding on the next iteration where the previous left off. |
+| `spark.cosmos.changeFeed.performance.monitoring.enabled` | `true` | A flag indicating whether change feed performance monitoring is enabled. When enabled, custom task metrics are tracked internally and used to dynamically tune the change feed micro-batch size. |
 
 #### Json conversion configuration
 | Config Property Name | Default | Description |
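These change feed settings are passed as ordinary data source options. A minimal sketch of a structured-streaming read that sets the options documented above; the account endpoint, key, database, and container values are placeholders:

// Minimal sketch: streaming read from the Cosmos DB change feed with the
// options from the table above. Account values are placeholders.
val changeFeedDf = spark.readStream
  .format("cosmos.oltp.changeFeed")
  .option("spark.cosmos.accountEndpoint", "https://<account>.documents.azure.com:443/")
  .option("spark.cosmos.accountKey", "<account-key>")
  .option("spark.cosmos.database", "<database>")
  .option("spark.cosmos.container", "<container>")
  .option("spark.cosmos.changeFeed.startFrom", "Beginning")
  .option("spark.cosmos.changeFeed.mode", "Incremental")
  .option("spark.cosmos.changeFeed.itemCountPerTriggerHint", "500000")
  .option("spark.cosmos.changeFeed.performance.monitoring.enabled", "true")
  .load()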

sdk/cosmos/azure-cosmos-spark_3_2-12/src/test/scala/com/azure/cosmos/spark/CosmosPartitionPlannerSpec.scala

Lines changed: 1 addition & 1 deletion
@@ -487,7 +487,7 @@ class CosmosPartitionPlannerSpec extends UnitSpec {
     calculate(1).endLsn.get shouldEqual 2150
   }
 
-  it should "calculateEndLsn should distribute rate based on metrics with readLimit and multiple partitions" in {
+  it should "calculateEndLsn should distribute rate based on metrics with readLimit" in {
     val clientConfig = spark.CosmosClientConfiguration(
       UUID.randomUUID().toString,
       UUID.randomUUID().toString,
