Commit addbd17
Author: Ilyas Gasanov (committed)
[DOP-29428] Add support for Iceberg with REST catalog and S3 warehouse
Parent: 085e567

File tree

9 files changed: +188 −42 lines

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+Added support for Iceberg with REST catalog and S3 warehouse

poetry.lock

Lines changed: 38 additions & 34 deletions
Some generated files are not rendered by default.

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -56,7 +56,7 @@ pyjwt = { version = "^2.10.1", optional = true }
 jinja2 = { version = "^3.1.6", optional = true }
 python-multipart = { version = "^0.0.20", optional = true }
 celery = { version = "^5.5.0", optional = true }
-onetl = { version = ">=0.13.5,<0.15.0", extras = ["all"], optional = true }
+onetl = { git = "https://github.com/MobileTeleSystems/onetl.git", branch = "develop", extras = ["all"], optional = true }
 pyspark = { version = "<4.0.0", optional = true }
 pyyaml = { version = "*", optional = true }
 psycopg2-binary = { version = "^2.9.10", optional = true }

syncmaster/dto/connections.py

Lines changed: 17 additions & 0 deletions
@@ -73,6 +73,23 @@ class HiveConnectionDTO(ConnectionDTO):
     type: ClassVar[str] = "hive"


+@dataclass
+class IcebergRESTCatalogS3ConnectionDTO(ConnectionDTO):
+    metastore_url: str
+    s3_warehouse_path: str
+    s3_host: str
+    s3_bucket: str
+    s3_region: str
+    s3_access_key: str
+    s3_secret_key: str
+    metastore_username: str
+    metastore_password: str
+    s3_port: int | None = None
+    s3_protocol: str = "https"
+    s3_path_style_access: bool = False
+    type: ClassVar[str] = "iceberg_rest_s3"
+
+
 @dataclass
 class HDFSConnectionDTO(ConnectionDTO):
     user: str
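For orientation, an illustrative set of values for the new connection fields; every endpoint, bucket and credential below is made up, and the plain dict simply mirrors the DTO fields rather than any project schema:

# Illustrative values only; none of these endpoints or credentials are real.
iceberg_connection_data = {
    "metastore_url": "http://rest-catalog.internal:8181",
    "metastore_username": "syncmaster",
    "metastore_password": "***",
    "s3_warehouse_path": "warehouse",
    "s3_host": "minio.internal",
    "s3_port": 9000,
    "s3_protocol": "http",
    "s3_bucket": "iceberg-data",
    "s3_region": "us-east-1",
    "s3_access_key": "minio",
    "s3_secret_key": "***",
    "s3_path_style_access": True,
}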

syncmaster/dto/transfers.py

Lines changed: 10 additions & 0 deletions
@@ -107,6 +107,16 @@ def __post_init__(self):
         self.options.setdefault("if_exists", "replace_overlapping_partitions")


+@dataclass
+class IcebergRESTCatalogS3TransferDTO(DBTransferDTO):
+    type: ClassVar[str] = "iceberg_rest_s3"
+    catalog_name: str | None = None
+
+    def __post_init__(self):
+        super().__post_init__()
+        self.options.setdefault("if_exists", "replace_overlapping_partitions")
+
+
 @dataclass
 class S3TransferDTO(FileTransferDTO):
     type: ClassVar[str] = "s3"
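The new transfer DTO reuses the same defaulting pattern as the Hive transfer above. A self-contained sketch of that pattern, with a hypothetical toy dataclass standing in for DBTransferDTO:

# Sketch of the __post_init__ defaulting behaviour; ToyTransferDTO is hypothetical.
from dataclasses import dataclass, field


@dataclass
class ToyTransferDTO:
    table_name: str
    options: dict = field(default_factory=dict)

    def __post_init__(self):
        # Only set a default when the caller did not pass "if_exists" explicitly.
        self.options.setdefault("if_exists", "replace_overlapping_partitions")


dto = ToyTransferDTO(table_name="db.orders")
assert dto.options["if_exists"] == "replace_overlapping_partitions"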

syncmaster/schemas/v1/connections/connection_base.py

Lines changed: 9 additions & 3 deletions
@@ -2,11 +2,17 @@
 # SPDX-License-Identifier: Apache-2.0
 from pydantic import BaseModel, ConfigDict, Field

-from syncmaster.schemas.v1.auth import ReadBasicAuthSchema, ReadS3AuthSchema
-from syncmaster.schemas.v1.auth.samba import ReadSambaAuthSchema
+from syncmaster.schemas.v1.auth import (
+    ReadBasicAuthSchema,
+    ReadIcebergRESTCatalogBasicAuthSchema,
+    ReadS3AuthSchema,
+    ReadSambaAuthSchema,
+)
 from syncmaster.schemas.v1.types import NameConstr

-ReadConnectionAuthDataSchema = ReadBasicAuthSchema | ReadS3AuthSchema | ReadSambaAuthSchema
+ReadConnectionAuthDataSchema = (
+    ReadBasicAuthSchema | ReadS3AuthSchema | ReadSambaAuthSchema | ReadIcebergRESTCatalogBasicAuthSchema
+)


 class CreateConnectionBaseSchema(BaseModel):
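For illustration, validating a payload against such a union with pydantic can look like the sketch below; both toy models and their fields are hypothetical and are not the project's real auth schemas:

# Minimal, self-contained union-validation sketch in the spirit of
# ReadConnectionAuthDataSchema above (toy models, made-up fields).
from pydantic import BaseModel, TypeAdapter


class ToyBasicAuth(BaseModel):
    type: str
    user: str


class ToyS3Auth(BaseModel):
    type: str
    access_key: str


ToyAuthData = ToyBasicAuth | ToyS3Auth

auth = TypeAdapter(ToyAuthData).validate_python({"type": "s3", "access_key": "..."})
print(type(auth).__name__)  # ToyS3Auth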

syncmaster/worker/controller.py

Lines changed: 13 additions & 0 deletions
@@ -17,6 +17,7 @@
     FTPSConnectionDTO,
     HDFSConnectionDTO,
     HiveConnectionDTO,
+    IcebergRESTCatalogS3ConnectionDTO,
     MSSQLConnectionDTO,
     MySQLConnectionDTO,
     OracleConnectionDTO,
@@ -33,6 +34,7 @@
     FTPTransferDTO,
     HDFSTransferDTO,
     HiveTransferDTO,
+    IcebergRESTCatalogS3TransferDTO,
     MSSQLTransferDTO,
     MySQLTransferDTO,
     OracleTransferDTO,
@@ -49,6 +51,7 @@
 from syncmaster.worker.handlers.base import Handler
 from syncmaster.worker.handlers.db.clickhouse import ClickhouseHandler
 from syncmaster.worker.handlers.db.hive import HiveHandler
+from syncmaster.worker.handlers.db.iceberg import IcebergHandler
 from syncmaster.worker.handlers.db.mssql import MSSQLHandler
 from syncmaster.worker.handlers.db.mysql import MySQLHandler
 from syncmaster.worker.handlers.db.oracle import OracleHandler
@@ -72,6 +75,12 @@
         HiveTransferDTO,
         RunDTO,
     ),
+    "iceberg_rest_s3": (
+        IcebergHandler,
+        IcebergRESTCatalogS3ConnectionDTO,
+        IcebergRESTCatalogS3TransferDTO,
+        RunDTO,
+    ),
     "oracle": (
         OracleHandler,
         OracleConnectionDTO,
@@ -269,6 +278,10 @@ def _perform_incremental_transfer(self) -> None:
     def _get_transfer_hwm_name(self) -> str:
         if self.source_handler.connection_dto.type in FILE_CONNECTION_TYPES:
             hwm_name_suffix = self.source_handler.transfer_dto.directory_path
+        elif self.source_handler.connection_dto.type == "iceberg_rest_s3":
+            hwm_name_suffix = (
+                f"{self.source_handler.transfer_dto.catalog_name}.{self.source_handler.transfer_dto.table_name}"
+            )
         else:
             hwm_name_suffix = self.source_handler.transfer_dto.table_name
         hwm_name = "_".join(
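The effect of the new branch in _get_transfer_hwm_name can be sketched standalone; the helper function and example values below are illustrative only, and file connections (which use the directory path) are omitted:

from types import SimpleNamespace


def get_hwm_name_suffix(connection_type: str, transfer) -> str:
    # Mirrors the branch added above: Iceberg sources prefix the table with the catalog.
    if connection_type == "iceberg_rest_s3":
        return f"{transfer.catalog_name}.{transfer.table_name}"
    return transfer.table_name


transfer = SimpleNamespace(catalog_name="analytics", table_name="sales.orders")
print(get_hwm_name_suffix("iceberg_rest_s3", transfer))  # analytics.sales.orders
print(get_hwm_name_suffix("postgres", transfer))         # sales.orders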

syncmaster/worker/handlers/db/base.py

Lines changed: 4 additions & 4 deletions
@@ -39,11 +39,8 @@ def read(self) -> DataFrame:
         reader_params = {}
         if self.transfer_dto.strategy.type == "incremental":
             self.transfer_dto.strategy.increment_by = self._quote_field(self.transfer_dto.strategy.increment_by)
-            hwm_name = (
-                f"{self.transfer_dto.id}_{self.connection_dto.type}_{self.transfer_dto.table_name}"  # noqa: WPS237
-            )
             reader_params["hwm"] = DBReader.AutoDetectHWM(
-                name=hwm_name,
+                name=self._get_hwm_name(),
                 expression=self.transfer_dto.strategy.increment_by,
             )
@@ -137,5 +134,8 @@ def _get_reading_options(self) -> dict:

         return options

+    def _get_hwm_name(self):
+        return f"{self.transfer_dto.id}_{self.connection_dto.type}_{self.transfer_dto.table_name}"  # noqa: WPS237
+
     def _quote_field(self, field: str) -> str:
         return f'"{field}"'
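Extracting the name into _get_hwm_name lets subclasses change the HWM naming scheme; the Iceberg handler below overrides it to prefix the table with its catalog. An illustrative comparison, with made-up ids, types and table names computed by hand rather than by the DTOs:

# Illustrative HWM names only.
transfer_id, table_name = 42, "public.orders"

base_hwm_name = f"{transfer_id}_postgres_{table_name}"
iceberg_hwm_name = f"{transfer_id}_iceberg_rest_s3_analytics.{table_name}"

print(base_hwm_name)     # 42_postgres_public.orders
print(iceberg_hwm_name)  # 42_iceberg_rest_s3_analytics.public.orders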
syncmaster/worker/handlers/db/iceberg.py (new file)

Lines changed: 95 additions & 0 deletions

# SPDX-FileCopyrightText: 2023-2024 MTS PJSC
# SPDX-License-Identifier: Apache-2.0

from __future__ import annotations

from typing import TYPE_CHECKING

from onetl.connection import Iceberg
from onetl.hooks import slot, support_hooks

from syncmaster.dto.connections import IcebergRESTCatalogS3ConnectionDTO
from syncmaster.dto.transfers import IcebergRESTCatalogS3TransferDTO
from syncmaster.worker.handlers.db.base import DBHandler

if TYPE_CHECKING:
    from pyspark.sql import SparkSession
    from pyspark.sql.dataframe import DataFrame


@support_hooks
class IcebergHandler(DBHandler):
    connection: Iceberg
    connection_dto: IcebergRESTCatalogS3ConnectionDTO
    transfer_dto: IcebergRESTCatalogS3TransferDTO
    _operators = {
        "regexp": "RLIKE",
        **DBHandler._operators,
    }

    def connect(self, spark: SparkSession):
        self.connection = Iceberg(
            spark=spark,
            catalog_name=self.transfer_dto.catalog_name,
            catalog=Iceberg.RESTCatalog(
                uri=self.connection_dto.metastore_url,
                auth=Iceberg.RESTCatalog.BasicAuth(
                    user=self.connection_dto.metastore_username,
                    password=self.connection_dto.metastore_password,
                ),
            ),
            warehouse=Iceberg.S3Warehouse(
                path=self.connection_dto.s3_warehouse_path,
                host=self.connection_dto.s3_host,
                port=self.connection_dto.s3_port,
                protocol=self.connection_dto.s3_protocol,
                bucket=self.connection_dto.s3_bucket,
                path_style_access=self.connection_dto.s3_path_style_access,
                region=self.connection_dto.s3_region,
                access_key=self.connection_dto.s3_access_key,
                secret_key=self.connection_dto.s3_secret_key,
            ),
        ).check()

    @slot
    def read(self) -> DataFrame:
        return super().read()

    @slot
    def write(self, df: DataFrame) -> None:
        return super().write(df)

    def _normalize_column_names(self, df: DataFrame) -> DataFrame:
        for column_name in df.columns:
            df = df.withColumnRenamed(column_name, column_name.lower())
        return df

    def _make_rows_filter_expression(self, filters: list[dict]) -> str | None:
        expressions = []
        for filter in filters:
            op = self._operators[filter["type"]]
            field = self._quote_field(filter["field"])
            value = filter.get("value")

            if value is None:
                expressions.append(f"{field} {op}")
                continue

            if op == "ILIKE":
                expressions.append(f"LOWER({field}) LIKE LOWER('{value}')")
            elif op == "NOT ILIKE":
                expressions.append(f"NOT LOWER({field}) LIKE LOWER('{value}')")
            else:
                expressions.append(f"{field} {op} '{value}'")

        return " AND ".join(expressions) or None

    def _get_reading_options(self) -> dict:
        return {}

    def _get_hwm_name(self):
        table = f"{self.transfer_dto.catalog_name}.{self.transfer_dto.table_name}"
        return f"{self.transfer_dto.id}_{self.connection_dto.type}_{table}"  # noqa: WPS237

    def _quote_field(self, field: str) -> str:
        return f"`{field}`"
