Prepare for 2.0 release (#168)

george-zubrienko · web-flow · commit 79bdc75d79bb · 2026-01-30T15:49:20.000+01:00
diff --git a/.helm/templates/crd-microsoft-synapse.yaml b/.helm/templates/crd-microsoft-synapse.yaml
@@ -13,7 +13,7 @@ spec:
     shortNames:
       - mssynapsestream
   versions:
-    - name: v1beta1
+    - name: v1
       served: true
       storage: true
       additionalPrinterColumns:
@@ -23,7 +23,7 @@ spec:
         - name: Entity
           type: string
           jsonPath: .spec.sourceSettings.name
-        - name: Refresh Interval
+        - name: Change Capture Interval
           type: string
           jsonPath: .spec.sourceSettings.changeCaptureIntervalSeconds
         - name: Sink location
@@ -78,10 +78,10 @@ spec:
                       type: string
                     baseLocation:
                       type: string
-                      description: Location root for Synapse Link data (Dataverse container)
+                      description: Location root for Synapse Link data (Dataverse container), in the format abfss://container@account.dfs.core.windows.net/
                     changeCaptureIntervalSeconds:
                       type: integer
-                      description: How long to wait before polling for next result set. Can be from 1 to 1 hour.
+                      description: How long to wait before polling for next change set. Accepted range is between 1s and 3600s
                       minimum: 1
                       maximum: 3600
                 connectionStringRef:
@@ -93,7 +93,7 @@ spec:
                       type: string
                 jobTemplateRef:
                   description: |
-                    Name of the job template to be used for the streaming job if stream is running in normal mode.
+                    Name of the job template to be used for the application in streaming mode.
                   type: object
                   properties:
                     name:
@@ -104,7 +104,7 @@ spec:
                       type: string
                 backfillJobTemplateRef:
                   description: |
-                    Name of the job template to be used for the streaming job if stream is running in the backfill mode.
+                    Name of the job template to be used for the application in backfill mode.
                   type: object
                   properties:
                     name:
@@ -121,42 +121,42 @@ spec:
                   description: Max retry delay on blob reads for the http client.
                 rowsPerGroup:
                   type: integer
-                  description: Number of rows per parquet rowgroup.
+                  description: Maximum number of rows to be grouped together for the staging process to consume.
                 groupingIntervalSeconds:
                   type: integer
-                  description: Max time to wait for rowsPerGroup to accumulate. Can be from 1 to 60 seconds.
+                  description: Max time to wait for rowsPerGroup to accumulate and then proceed to staging. Can be from 1 to 60 seconds.
                   minimum: 1
                   maximum: 60
                 sinkSettings:
                   type: object
                   properties:
                     optimizeSettings:
                       type: object
-                      description: Optimization settings for Iceberg tables.
+                      description: Configuration for target table optimize (data file aggregation into bigger files)
                       properties:
                         batchThreshold:
                           type: integer
                           default: 60
-                          description: Number of batches to accumulate before running the optimization query.
+                          description: Number of batches to accumulate before triggering OPTIMIZE
                         fileSizeThreshold:
                           type: string
                           default: 100MB
-                          description: File size to target for the optimization query.
+                          description: File size to target when running OPTIMIZE
                       default:
                         batchThreshold: 60
                         fileSizeThreshold: 100MB
                     snapshotExpirationSettings:
                       type: object
-                      description: Expiration query configuration for Iceberg tables.
+                      description: Configuration for EXPIRE SNAPSHOTS (table transaction log cutoff)
                       properties:
                         batchThreshold:
                           type: integer
                           default: 60
-                          description: Number of batches to accumulate before running the snapshot expiration query.
+                          description: Number of batches to accumulate before triggering EXPIRE SNAPSHOTS
                         retentionThreshold:
                           type: string
                           default: 6h
-                          description: File retention period.
+                          description: Maximum age of records in the transaction log to keep
                       default:
                         batchThreshold: 60
                         retentionThreshold: 6h
@@ -167,17 +167,17 @@ spec:
                         batchThreshold:
                           type: integer
                           default: 60
-                          description: Number of batches to accumulate before running the expire orphan files query.
+                          description: Number of batches to accumulate before triggering EXPIRE ORPHAN FILES (cleanup of files no longer referenced by Iceberg snapshots)
                         retentionThreshold:
                           type: string
                           default: 6h
-                          description: File retention period.
+                          description: File retention period for EXPIRE ORPHAN FILES
                       default:
                         batchThreshold: 60
                         retentionThreshold: 6h
                     analyzeSettings:
                       type: object
-                      description: Settings for running ANALYZE on target.
+                      description: Configuration for ANALYZE (full refresh of extended statistics on the target)
                       properties:
                         batchThreshold:
                           type: integer
@@ -195,9 +195,21 @@ spec:
                     targetTableName:
                       type: string
                       description: Name for the target Iceberg table.
+                    sinkCatalogSettings:
+                      type: object
+                      description: Connection settings for the Iceberg REST Catalog of the sink (target). Used by the watermarking process.
+                      properties:
+                        namespace:
+                          type: string
+                        warehouse:
+                          type: string
+                        catalogUri:
+                          type: string
                 lookBackInterval:
                   type: integer
                   description: |
+                    DEPRECATED - TO BE REMOVED IN 2.1 RELEASE. DO NOT USE THIS SETTING!
+                    
                     Number of seconds to look back when determining first set of changes to extract.
                     Can be set in interval from 1 second to 10 hours. Default is 1 hour.
                   minimum: 1
@@ -206,15 +218,19 @@ spec:
                 stagingDataSettings:
                   type: object
                   properties:
+                    dataLocation:
+                      type: string
+                      description: Optional data location override. Only use this setting for debugging and never in production environments.
                     maxRowsPerFile:
                       type: integer
+                      description: The maximum number of rows per each data file in the staging table.
                       default: 10000
-                    dataLocation:
-                      type: string
                     tableNamePrefix:
                       type: string
+                      description: Prefix for staging tables created by Arcane. Must be UNIQUE in the WAREHOUSE scope.
                     catalog:
                       type: object
+                      description: Settings for Iceberg REST Catalog used for staging tables
                       properties:
                         catalogName:
                           type: string
@@ -233,6 +249,7 @@ spec:
                     overwrite
                 fieldSelectionRule:
                   type: object
+                  description: INCLUDE will only use fields provided. You must specify mandatory fields like ARCANE_MERGE_KEY as well. EXCLUDE will exclude provided fields instead. ALL (default) will use all fields without filters.
                   properties:
                     ruleType:
                       type: string
diff --git a/build.sbt b/build.sbt
@@ -25,7 +25,7 @@ lazy val plugin = (project in file("."))
     name := "arcane-stream-microsoft-synapse-link",
     idePackagePrefix := Some("com.sneaksanddata.arcane.microsoft_synapse_link"),
 
-    libraryDependencies += "com.sneaksanddata" % "arcane-framework_3" % "1.2.4-11-g42e238e",
+    libraryDependencies += "com.sneaksanddata" % "arcane-framework_3" % "1.2.4-19-gdc3dd1f",
     libraryDependencies += "io.netty" % "netty-tcnative-boringssl-static" % "2.0.74.Final",
 
     // bugfix for upgrade header
diff --git a/integration-tests.env b/integration-tests.env
@@ -14,4 +14,7 @@ ARCANE_FRAMEWORK__STORAGE_ACCOUNT=devstoreaccount1
 ARCANE_FRAMEWORK__STORAGE_CONTAINER=cdm-e2e
 ARCANE_FRAMEWORK__STORAGE_ENDPOINT=http://localhost:10001/devstoreaccount1
 ARCANE_FRAMEWORK__S3_CATALOG_AUTH_SESSION_TIMEOUT_MILLIS=2000
-ARCANE_FRAMEWORK__MERGE_SERVICE_CONNECTION_URI=jdbc:trino://localhost:8080/iceberg/test?user=test
+ARCANE_FRAMEWORK__MERGE_SERVICE_CONNECTION_URI=jdbc:trino://localhost:8080/iceberg/test?user=test
+ARCANE_FRAMEWORK__ICEBERG_SINK_NAMESPACE=test
+ARCANE_FRAMEWORK__ICEBERG_SINK_WAREHOUSE=demo
+ARCANE_FRAMEWORK__ICEBERG_SINK_CATALOG_URI=http://localhost:20001/catalog
diff --git a/src/main/scala/main.scala b/src/main/scala/main.scala
@@ -22,7 +22,7 @@ import com.sneaksanddata.arcane.framework.services.filters.{
   FieldsFilteringService,
   FieldsFilteringService as FrameworkFieldsFilteringService
 }
-import com.sneaksanddata.arcane.framework.services.iceberg.IcebergS3CatalogWriter
+import com.sneaksanddata.arcane.framework.services.iceberg.{IcebergS3CatalogWriter, IcebergTablePropertyManager}
 import com.sneaksanddata.arcane.framework.services.metrics.{ArcaneDimensionsProvider, DataDog, DeclaredMetrics}
 import com.sneaksanddata.arcane.framework.services.storage.services.azure.AzureBlobStorageReader
 import com.sneaksanddata.arcane.framework.services.streaming.data_providers.backfill.{
@@ -101,7 +101,8 @@ object main extends ZIOAppDefault {
     ArcaneDimensionsProvider.layer,
     DataDog.UdsPublisher.layer,
     WatermarkProcessor.layer,
-    BackfillOverwriteWatermarkProcessor.layer
+    BackfillOverwriteWatermarkProcessor.layer,
+    IcebergTablePropertyManager.layer
   )
 
   @main
diff --git a/src/main/scala/models/app/MicrosoftSynapseLinkStreamContext.scala b/src/main/scala/models/app/MicrosoftSynapseLinkStreamContext.scala
@@ -30,19 +30,19 @@ trait ParallelismSettings:
 trait GraphExecutionSettings:
   val sourceDeleteDryRun: Boolean
 
-/** The context for the SQL Server Change Tracking stream.
+/** The context for the Synapse Link stream.
   *
   * @param spec
   *   The stream specification
   */
 case class MicrosoftSynapseLinkStreamContext(spec: StreamSpec)
     extends StreamContext
     with GroupingSettings
-    with IcebergCatalogSettings
+    with IcebergStagingSettings
     with JdbcMergeServiceClientSettings
     with VersionedDataGraphBuilderSettings
     with AzureConnectionSettings
-    with TargetTableSettings
+    with SinkSettings
     with ParallelismSettings
     with TablePropertiesSettings
     with FieldSelectionRuleSettings
@@ -65,8 +65,8 @@ case class MicrosoftSynapseLinkStreamContext(spec: StreamSpec)
   override val stagingLocation: Option[String] = spec.stagingDataSettings.dataLocation
 
   override val additionalProperties: Map[String, String] = sys.env.get("ARCANE_FRAMEWORK__CATALOG_NO_AUTH") match
-    case Some(_) => Map()
-    case None    => IcebergCatalogCredential.oAuth2Properties
+    case Some(_) => S3CatalogFileIO.properties
+    case None    => S3CatalogFileIO.properties ++ IcebergCatalogCredential.oAuth2Properties
 
   override val s3CatalogFileIO: S3CatalogFileIO = S3CatalogFileIO
 
@@ -169,6 +169,17 @@ case class MicrosoftSynapseLinkStreamContext(spec: StreamSpec)
   val metricsPublisherInterval: Duration = Duration.ofMillis(
     sys.env.getOrElse("ARCANE_FRAMEWORK__METRICS_PUBLISHER_INTERVAL_MILLIS", "100").toInt
   )
+  override val icebergSinkSettings: IcebergSinkSettings = new IcebergSinkSettings {
+    override val namespace: String =
+      sys.env("ARCANE_FRAMEWORK__ICEBERG_SINK_NAMESPACE") // spec.sinkSettings.sinkCatalogSettings.namespace.getOrElse()
+    override val warehouse: String =
+      sys.env("ARCANE_FRAMEWORK__ICEBERG_SINK_WAREHOUSE") // spec.sinkSettings.sinkCatalogSettings.warehouse.getOrElse()
+    override val catalogUri: String =
+      sys.env(
+        "ARCANE_FRAMEWORK__ICEBERG_SINK_CATALOG_URI"
+      ) // spec.sinkSettings.sinkCatalogSettings.catalogUri.getOrElse()
+    override val additionalProperties: Map[String, String] = IcebergCatalogCredential.oAuth2Properties
+  }
 
 given Conversion[MicrosoftSynapseLinkStreamContext, DatagramSocketConfig] with
   def apply(context: MicrosoftSynapseLinkStreamContext): DatagramSocketConfig =
@@ -180,8 +191,8 @@ given Conversion[MicrosoftSynapseLinkStreamContext, MetricsConfig] with
 
 object MicrosoftSynapseLinkStreamContext {
 
-  type Environment = StreamContext & GroupingSettings & VersionedDataGraphBuilderSettings & IcebergCatalogSettings &
-    JdbcMergeServiceClientSettings & AzureConnectionSettings & TargetTableSettings & MicrosoftSynapseLinkStreamContext &
+  type Environment = StreamContext & GroupingSettings & VersionedDataGraphBuilderSettings & IcebergStagingSettings &
+    JdbcMergeServiceClientSettings & AzureConnectionSettings & SinkSettings & MicrosoftSynapseLinkStreamContext &
     GraphExecutionSettings & TablePropertiesSettings & FieldSelectionRuleSettings & BackfillSettings &
     StagingDataSettings & SynapseSourceSettings & SourceBufferingSettings & MetricsConfig & DatagramSocketConfig &
     DatadogPublisherConfig
diff --git a/src/main/scala/models/app/contracts/StreamSpec.scala b/src/main/scala/models/app/contracts/StreamSpec.scala
@@ -36,14 +36,21 @@ case class OrphanFilesExpirationSettings(batchThreshold: Int, retentionThreshold
 
 case class AnalyzeSettings(batchThreshold: Int, includedColumns: Seq[String]) derives ReadWriter
 
+case class IcebergSinkSettings(
+    namespace: Option[String] = None,
+    warehouse: Option[String] = None,
+    catalogUri: Option[String] = None
+) derives ReadWriter
+
 /** The configuration of Iceberg sink.
   */
 case class SinkSettings(
     targetTableName: String,
     optimizeSettings: OptimizeSettingsSpec,
     snapshotExpirationSettings: SnapshotExpirationSettingsSpec,
     orphanFilesExpirationSettings: OrphanFilesExpirationSettings,
-    analyzeSettings: AnalyzeSettings
+    analyzeSettings: AnalyzeSettings,
+    sinkCatalogSettings: Option[IcebergSinkSettings] = None
 ) derives ReadWriter
 
 /** The configuration of the stream source.
diff --git a/src/test/scala/common/Common.scala b/src/test/scala/common/Common.scala
@@ -10,7 +10,7 @@ import com.sneaksanddata.arcane.framework.services.app.{GenericStreamRunnerServi
 import com.sneaksanddata.arcane.framework.services.app.base.{InterruptionToken, StreamLifetimeService}
 import com.sneaksanddata.arcane.framework.services.caching.schema_cache.MutableSchemaCache
 import com.sneaksanddata.arcane.framework.services.filters.{ColumnSummaryFieldsFilteringService, FieldsFilteringService}
-import com.sneaksanddata.arcane.framework.services.iceberg.IcebergS3CatalogWriter
+import com.sneaksanddata.arcane.framework.services.iceberg.{IcebergS3CatalogWriter, IcebergTablePropertyManager}
 import com.sneaksanddata.arcane.framework.services.merging.JdbcMergeServiceClient
 import com.sneaksanddata.arcane.framework.services.metrics.{ArcaneDimensionsProvider, DataDog, DeclaredMetrics}
 import com.sneaksanddata.arcane.framework.services.storage.services.azure.AzureBlobStorageReader
@@ -102,7 +102,8 @@ object Common:
       ArcaneDimensionsProvider.layer,
       DataDog.UdsPublisher.layer,
       WatermarkProcessor.layer,
-      BackfillOverwriteWatermarkProcessor.layer
+      BackfillOverwriteWatermarkProcessor.layer,
+      IcebergTablePropertyManager.layer
     )
 
   /** Gets the data from the *target* table. Using the connection string provided in the
diff --git a/unit-tests.env b/unit-tests.env