Commit 7d5a42c

Merge branch 'antalya-25.8' into fp_antalya_25_8_export_mt_part
2 parents 97cd90f + 410f2f3 commit 7d5a42c

26 files changed: +1455 −69 lines changed
Lines changed: 106 additions & 0 deletions
@@ -0,0 +1,106 @@
---
description: 'Hybrid unions multiple data sources behind per-layer predicates so queries behave like a single table while data is migrated or tiered.'
slug: /engines/table-engines/special/tiered-distributed
title: 'Hybrid Table Engine'
sidebar_label: 'Hybrid'
sidebar_position: 11
---

# Hybrid table engine

`Hybrid` builds on top of the [Distributed](./distributed.md) table engine. It lets you expose several data sources as one logical table and assign every source its own predicate.
The engine rewrites incoming queries so that each layer receives the original query plus its predicate. This keeps all of the Distributed optimisations (remote aggregation, `skip_unused_shards`,
global JOIN pushdown, and so on) while you duplicate or migrate data across clusters, storage types, or formats.

It keeps the same execution pipeline as `engine=Distributed` but can read from multiple underlying sources simultaneously—similar to `engine=Merge`—while still pushing logic down to each source.
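
To make the rewrite concrete, here is a minimal sketch (the `events` table, its two layers, and the watermark date are hypothetical): a query against a two-layer `Hybrid` table is forwarded to each layer with that layer's predicate appended via `AND`.

```sql
-- Query issued against the Hybrid table
SELECT count() FROM events WHERE user_id = 42;

-- What each layer effectively executes (conceptual rewrite, names are hypothetical)
-- Layer 1, predicate: event_date >= '2025-09-01'
SELECT count() FROM events WHERE user_id = 42 AND event_date >= '2025-09-01';
-- Layer 2, predicate: event_date < '2025-09-01'
SELECT count() FROM events WHERE user_id = 42 AND event_date < '2025-09-01';
```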

Typical use cases include:

- Zero-downtime migrations where "old" and "new" replicas temporarily overlap.
- Tiered storage, for example fresh data on a local cluster and historical data in S3.
- Gradual roll-outs where only a subset of rows should be served from a new backend.

By giving mutually exclusive predicates to the layers (for example, `date < watermark` and `date >= watermark`), you ensure that each row is read from exactly one source.

## Engine definition

```sql
CREATE TABLE [IF NOT EXISTS] [db.]table_name
(
    column1 type1,
    column2 type2,
    ...
)
ENGINE = Hybrid(table_function_1, predicate_1 [, table_function_2, predicate_2 ...])
```

You must pass at least two arguments – the first table function and its predicate. Additional sources are appended as `table_function, predicate` pairs. The first table function is also used for `INSERT` statements.
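
For instance, a minimal two-layer definition with mutually exclusive predicates might look like the following sketch (the cluster addresses, table names, and watermark date are hypothetical; the statement mirrors the documented `CREATE ... ENGINE = Hybrid(...) AS ...` pattern shown in the example below):

```sql
-- Hypothetical two-layer table: fresh rows from one cluster, older rows from another
CREATE TABLE events_hybrid ENGINE = Hybrid(
    remote('new-cluster:9000', currentDatabase(), 'events_local'), event_date >= '2025-09-01',
    remote('old-cluster:9000', currentDatabase(), 'events_local'), event_date <  '2025-09-01'
) AS events_local;
```

Because the two predicates cover disjoint date ranges, every row is served by exactly one layer, and `INSERT INTO events_hybrid` is routed to the first layer.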

### Arguments and behaviour

- `table_function_n` must be a valid table function (for example `remote`, `remoteSecure`, `cluster`, `clusterAllReplicas`, `s3Cluster`) or a fully qualified table name (`database.table`). The first argument must be a table function—such as `remote` or `cluster`—because it instantiates the underlying `Distributed` storage.
- `predicate_n` must be an expression that can be evaluated on the table columns. The engine adds it to the layer's query with an additional `AND`, so expressions like `event_date >= '2025-09-01'` or `id BETWEEN 10 AND 15` are typical.
- The query planner picks the same processing stage for every layer as it does for the base `Distributed` plan, so remote aggregation, ORDER BY pushdown, `skip_unused_shards`, and the legacy/analyzer execution modes behave the same way.
- `INSERT` statements are forwarded to the first table function only. If you need multi-destination writes, use explicit `INSERT` statements into the respective sources (see the sketch after this list).
- Align schemas across the layers. ClickHouse builds a common header; if the physical types differ you may need to add casts on one side or in the query, just as you would when reading from heterogeneous replicas.

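As a sketch of that multi-destination pattern (the `staging_events` source and the per-layer target tables are hypothetical), write to each backing source directly instead of going through the `Hybrid` table:

```sql
-- Writes through the Hybrid table reach the first layer only,
-- so duplicate the write explicitly when every backend must receive its share.
INSERT INTO new_cluster_events SELECT * FROM staging_events WHERE event_date >= '2025-09-01';
INSERT INTO old_cluster_events SELECT * FROM staging_events WHERE event_date <  '2025-09-01';
```
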
## Example: local cluster plus S3 historical tier

The following commands illustrate a two-layer layout. Hot data stays on a local ClickHouse cluster, while historical rows come from public S3 Parquet files.

```sql
-- Local MergeTree table that keeps current data
CREATE OR REPLACE TABLE btc_blocks_local
(
    `hash` FixedString(64),
    `version` Int64,
    `mediantime` DateTime64(9),
    `nonce` Int64,
    `bits` FixedString(8),
    `difficulty` Float64,
    `chainwork` FixedString(64),
    `size` Int64,
    `weight` Int64,
    `coinbase_param` String,
    `number` Int64,
    `transaction_count` Int64,
    `merkle_root` FixedString(64),
    `stripped_size` Int64,
    `timestamp` DateTime64(9),
    `date` Date
)
ENGINE = MergeTree
ORDER BY (timestamp)
PARTITION BY toYYYYMM(date);

-- Hybrid table that unions the local shard with historical data in S3
CREATE OR REPLACE TABLE btc_blocks ENGINE = Hybrid(
    remote('localhost:9000', currentDatabase(), 'btc_blocks_local'), date >= '2025-09-01',
    s3('s3://aws-public-blockchain/v1.0/btc/blocks/**.parquet', NOSIGN), date < '2025-09-01'
) AS btc_blocks_local;

-- Writes target the first (remote) layer
INSERT INTO btc_blocks
SELECT *
FROM s3('s3://aws-public-blockchain/v1.0/btc/blocks/**.parquet', NOSIGN)
WHERE date BETWEEN '2025-09-01' AND '2025-09-30';

-- Reads seamlessly combine both predicates
SELECT * FROM btc_blocks WHERE date = '2025-08-01'; -- data from s3
SELECT * FROM btc_blocks WHERE date = '2025-09-05'; -- data from MergeTree (TODO: still analyzes s3)
SELECT * FROM btc_blocks WHERE date IN ('2025-08-31', '2025-09-01'); -- data from both sources, single copy always

-- Run analytic queries as usual
SELECT
    date,
    count(),
    uniqExact(CAST(hash, 'Nullable(String)')) AS hashes,
    sum(CAST(number, 'Nullable(Int64)')) AS blocks_seen
FROM btc_blocks
WHERE date BETWEEN '2025-08-01' AND '2025-09-30'
GROUP BY date
ORDER BY date;
```

Because the predicates are applied inside every layer, queries that use `ORDER BY`, `GROUP BY`, `LIMIT`, `JOIN`, and `EXPLAIN` behave as if you were reading from a single `Distributed` table. When sources expose different physical types (for example `FixedString(64)` versus `String` in Parquet), add explicit casts during ingestion or in the query, as shown above.
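
For instance (a sketch only; the plan output depends on the ClickHouse version and settings), `EXPLAIN` can be run against the Hybrid table exactly as against a `Distributed` one to inspect how a query is planned across the layers:

```sql
-- Inspect the plan for a query over the Hybrid table
EXPLAIN
SELECT count()
FROM btc_blocks
WHERE date >= '2025-08-15';
```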

src/Databases/DataLake/GlueCatalog.cpp

Lines changed: 158 additions & 15 deletions
@@ -317,11 +317,31 @@ bool GlueCatalog::tryGetTableMetadata(
     {
         result.setDataLakeSpecificProperties(DataLakeSpecificProperties{.iceberg_metadata_file_location = table_params.at("metadata_location")});
     }
+    else if (table_outcome.GetStorageDescriptor().LocationHasBeenSet())
+    {
+        const auto & location = table_outcome.GetStorageDescriptor().GetLocation();
+
+        std::string location_with_slash = location;
+        if (!location_with_slash.ends_with('/'))
+            location_with_slash += '/';
+
+        // Resolve the actual metadata file path based on table location
+        std::string resolved_metadata_path = resolveMetadataPathFromTableLocation(location_with_slash, result);
+        if (resolved_metadata_path.empty())
+        {
+            result.setTableIsNotReadable(fmt::format("Could not determine metadata_location of table `{}`. ",
+                database_name + "." + table_name));
+        }
+        else
+        {
+            result.setDataLakeSpecificProperties(DataLakeSpecificProperties{.iceberg_metadata_file_location = resolved_metadata_path});
+        }
+    }
     else
     {
-        result.setTableIsNotReadable(fmt::format("Cannot read table `{}` because it has no metadata_location. " \
-            "It means that it's unreadable with Glue catalog in ClickHouse, readable tables must have 'metadata_location' in table parameters",
-            database_name + "." + table_name));
+        result.setTableIsNotReadable(fmt::format("Cannot read table `{}` because it has no metadata_location. " \
+            "It means that it's unreadable with Glue catalog in ClickHouse, readable tables must have 'metadata_location' in table parameters",
+            database_name + "." + table_name));
     }
 };

@@ -415,37 +435,41 @@ bool GlueCatalog::empty() const
 bool GlueCatalog::classifyTimestampTZ(const String & column_name, const TableMetadata & table_metadata) const
 {
     String metadata_path;
+    String metadata_uri;
     if (auto table_specific_properties = table_metadata.getDataLakeSpecificProperties();
         table_specific_properties.has_value())
     {
         metadata_path = table_specific_properties->iceberg_metadata_file_location;
+        metadata_uri = metadata_path;
         if (metadata_path.starts_with("s3:/"))
             metadata_path = metadata_path.substr(5);

-        // Delete bucket
+        // Delete bucket from path
         std::size_t pos = metadata_path.find('/');
         if (pos != std::string::npos)
             metadata_path = metadata_path.substr(pos + 1);
     }
     else
-        throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Metadata specific properties should be defined");
+        throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Failed to read table metadata, reason why table is unreadable: {}", table_metadata.getReasonWhyTableIsUnreadable());

-    if (!metadata_objects.get(metadata_path))
+    if (!metadata_objects.get(metadata_uri))
     {
         DB::ASTStorage * storage = table_engine_definition->as<DB::ASTStorage>();
         DB::ASTs args = storage->engine->arguments->children;

-        auto table_endpoint = settings.storage_endpoint;
+        String storage_endpoint = !settings.storage_endpoint.empty() ? settings.storage_endpoint : metadata_uri;
+
         if (args.empty())
-            args.emplace_back(std::make_shared<DB::ASTLiteral>(table_endpoint));
+            args.emplace_back(std::make_shared<DB::ASTLiteral>(storage_endpoint));
         else
-            args[0] = std::make_shared<DB::ASTLiteral>(table_endpoint);
+            args[0] = std::make_shared<DB::ASTLiteral>(storage_endpoint);

-        if (args.size() == 1 && table_metadata.hasStorageCredentials())
+        if (args.size() == 1)
         {
-            auto storage_credentials = table_metadata.getStorageCredentials();
-            if (storage_credentials)
-                storage_credentials->addCredentialsToEngineArgs(args);
+            if (table_metadata.hasStorageCredentials())
+                table_metadata.getStorageCredentials()->addCredentialsToEngineArgs(args);
+            else if (!credentials.IsExpiredOrEmpty())
+                DataLake::S3Credentials(credentials.GetAWSAccessKeyId(), credentials.GetAWSSecretKey(), credentials.GetSessionToken()).addCredentialsToEngineArgs(args);
         }

         auto storage_settings = std::make_shared<DB::DataLakeStorageSettings>();
@@ -464,9 +488,9 @@ bool GlueCatalog::classifyTimestampTZ(const String & column_name, const TableMet
         Poco::JSON::Parser parser;
         Poco::Dynamic::Var result = parser.parse(metadata_file);
         auto metadata_object = result.extract<Poco::JSON::Object::Ptr>();
-        metadata_objects.set(metadata_path, std::make_shared<Poco::JSON::Object::Ptr>(metadata_object));
+        metadata_objects.set(metadata_uri, std::make_shared<Poco::JSON::Object::Ptr>(metadata_object));
     }
-    auto metadata_object = *metadata_objects.get(metadata_path);
+    auto metadata_object = *metadata_objects.get(metadata_uri);
     auto current_schema_id = metadata_object->getValue<Int64>("current-schema-id");
     auto schemas = metadata_object->getArray(DB::Iceberg::f_schemas);
     for (size_t i = 0; i < schemas->size(); ++i)
@@ -487,6 +511,125 @@ bool GlueCatalog::classifyTimestampTZ(const String & column_name, const TableMet
     return false;
 }

+/// This function tries to resolve the metadata file path by the following means:
+/// 1. Tries to read version-hint.text to get the latest version.
+/// 2. Lists all *.metadata.json files in the metadata directory and takes the most recent one.
+String GlueCatalog::resolveMetadataPathFromTableLocation(const String & table_location, const TableMetadata & table_metadata) const
+{
+    // Construct path to version-hint.text
+    String version_hint_path = table_location + "metadata/version-hint.text";
+
+    DB::ASTStorage * storage = table_engine_definition->as<DB::ASTStorage>();
+    DB::ASTs args = storage->engine->arguments->children;
+
+    String storage_endpoint = !settings.storage_endpoint.empty() ? settings.storage_endpoint : table_location;
+    if (args.empty())
+        args.emplace_back(std::make_shared<DB::ASTLiteral>(storage_endpoint));
+    else
+        args[0] = std::make_shared<DB::ASTLiteral>(storage_endpoint);
+
+    if (args.size() == 1 && table_metadata.hasStorageCredentials())
+    {
+        auto storage_credentials = table_metadata.getStorageCredentials();
+        if (storage_credentials)
+            storage_credentials->addCredentialsToEngineArgs(args);
+    }
+
+    auto storage_settings = std::make_shared<DB::DataLakeStorageSettings>();
+    storage_settings->loadFromSettingsChanges(settings.allChanged());
+    auto configuration = std::make_shared<DB::StorageS3IcebergConfiguration>(storage_settings);
+    configuration->initialize(args, getContext(), false);
+
+    auto object_storage = configuration->createObjectStorage(getContext(), true);
+    const auto & read_settings = getContext()->getReadSettings();
+
+    try
+    {
+        // Try to read version-hint.text to get the latest version
+        String version_hint_object_path = version_hint_path;
+        if (version_hint_object_path.starts_with("s3://"))
+        {
+            version_hint_object_path = version_hint_object_path.substr(5);
+            // Remove bucket from path
+            std::size_t pos = version_hint_object_path.find('/');
+            if (pos != std::string::npos)
+                version_hint_object_path = version_hint_object_path.substr(pos + 1);
+        }
+
+        DB::StoredObject version_hint_stored_object(version_hint_object_path);
+        auto version_hint_buf = object_storage->readObject(version_hint_stored_object, read_settings);
+        String version_str;
+        readString(version_str, *version_hint_buf);
+
+        boost::algorithm::trim(version_str);
+
+        LOG_TRACE(log, "Read version {} from version-hint.text for table location '{}'", version_str, table_location);
+
+        return table_location + "metadata/v" + version_str + "-metadata.json";
+    }
+    catch (...)
+    {
+        LOG_TRACE(log, "Could not read version-hint.text from '{}', trying to find latest metadata file", version_hint_path);
+
+        try
+        {
+            String bucket_with_prefix;
+            String metadata_dir = table_location + "metadata/";
+            String metadata_dir_path = metadata_dir;
+
+            if (metadata_dir_path.starts_with("s3://"))
+            {
+                metadata_dir_path = metadata_dir_path.substr(5);
+                // Remove bucket from path
+                std::size_t pos = metadata_dir_path.find('/');
+                if (pos != std::string::npos)
+                {
+                    metadata_dir_path = metadata_dir_path.substr(pos + 1);
+                    bucket_with_prefix = table_location.substr(0, pos + 6);
+                }
+            }
+            else
+                return "";
+
+            // List all files in metadata directory
+            DB::RelativePathsWithMetadata files;
+            object_storage->listObjects(metadata_dir_path, files, 0);
+
+            // Filter for .metadata.json files and find the most recent one
+            String latest_metadata_file;
+            std::optional<DB::ObjectMetadata> latest_metadata;
+
+            for (const auto & file : files)
+            {
+                if (file->getPath().ends_with(".metadata.json"))
+                {
+                    // Get file metadata to check last modified time
+                    if (!latest_metadata.has_value() ||
+                        (file->metadata->last_modified > latest_metadata->last_modified))
+                    {
+                        latest_metadata_file = file->getPath();
+                        latest_metadata = file->metadata;
+                    }
+                }
+            }
+
+            if (!latest_metadata_file.empty())
+            {
+                LOG_TRACE(log, "Found latest metadata file: {}", latest_metadata_file);
+                return bucket_with_prefix + latest_metadata_file;
+            }
+
+            LOG_TRACE(log, "No <...>.metadata.json files found,");
+            return "";
+        }
+        catch (...)
+        {
+            LOG_TRACE(log, "Failed to list metadata directory");
+            return "";
+        }
+    }
+}
+
 void GlueCatalog::createNamespaceIfNotExists(const String & namespace_name) const
 {
     Aws::Glue::Model::CreateDatabaseRequest create_request;

src/Databases/DataLake/GlueCatalog.h

Lines changed: 2 additions & 0 deletions
@@ -81,6 +81,8 @@ class GlueCatalog final : public ICatalog, private DB::WithContext
     /// This method allows to clarify the actual type of the timestamp column.
     bool classifyTimestampTZ(const String & column_name, const TableMetadata & table_metadata) const;

+    String resolveMetadataPathFromTableLocation(const String & table_location, const TableMetadata & table_metadata) const;
+
     mutable DB::CacheBase<String, Poco::JSON::Object::Ptr> metadata_objects;
 };

src/Interpreters/ClusterProxy/SelectStreamFactory.cpp

Lines changed: 29 additions & 2 deletions
@@ -67,7 +67,8 @@ ASTPtr rewriteSelectQuery(
     const ASTPtr & query,
     const std::string & remote_database,
     const std::string & remote_table,
-    ASTPtr table_function_ptr)
+    ASTPtr table_function_ptr,
+    ASTPtr additional_filter)
 {
     auto modified_query_ast = query->clone();

@@ -80,8 +81,33 @@ ASTPtr rewriteSelectQuery(

     if (!context->getSettingsRef()[Setting::allow_experimental_analyzer])
     {
+        // Apply additional filter if provided
+        if (additional_filter)
+        {
+            if (select_query.where())
+            {
+                /// WHERE <old> AND <filter>
+                select_query.setExpression(
+                    ASTSelectQuery::Expression::WHERE,
+                    makeASTFunction("and", select_query.where(), additional_filter->clone()));
+            }
+            else
+            {
+                /// No WHERE – simply set it
+                select_query.setExpression(
+                    ASTSelectQuery::Expression::WHERE, additional_filter->clone());
+            }
+        }
+
         if (table_function_ptr)
-            select_query.addTableFunction(table_function_ptr);
+        {
+            select_query.addTableFunction(table_function_ptr->clone());
+
+            // Reset semantic table information for all column identifiers to prevent
+            // RestoreQualifiedNamesVisitor from adding wrong table names
+            ResetSemanticTableVisitor::Data data;
+            ResetSemanticTableVisitor(data).visit(modified_query_ast);
+        }
         else
             select_query.replaceDatabaseAndTable(remote_database, remote_table);

@@ -93,6 +119,7 @@ ASTPtr rewriteSelectQuery(
             data.distributed_table = DatabaseAndTableWithAlias(*getTableExpression(query->as<ASTSelectQuery &>(), 0));
             data.remote_table.database = remote_database;
             data.remote_table.table = remote_table;
+
             RestoreQualifiedNamesVisitor(data).visit(modified_query_ast);
         }
     }

0 commit comments
