Merge pull request #1071 from Altinity/mf-25.8-tiered-distributed

Enmk · web-flow · commit cf4d13742ddf · 2025-10-15T14:12:09.000+02:00
engine=Hybrid
diff --git a/docs/en/engines/table-engines/special/tiered-distributed.md b/docs/en/engines/table-engines/special/tiered-distributed.md
@@ -0,0 +1,106 @@
+---
+description: 'Hybrid unions multiple data sources behind per-layer predicates so queries behave like a single table while data is migrated or tiered.'
+slug: /engines/table-engines/special/tiered-distributed
+title: 'Hybrid Table Engine'
+sidebar_label: 'Hybrid'
+sidebar_position: 11
+---
+
+# Hybrid table engine
+
+`Hybrid` builds on top of the [Distributed](./distributed.md) table engine. It lets you expose several data sources as one logical table and assign every source its own predicate.
+The engine rewrites incoming queries so that each layer receives the original query plus its predicate. This keeps all of the Distributed optimisations (remote aggregation, `skip_unused_shards`,
+global JOIN pushdown, and so on) while you duplicate or migrate data across clusters, storage types, or formats.
+
+It keeps the same execution pipeline as `engine=Distributed` but can read from multiple underlying sources simultaneously—similar to `engine=Merge`—while still pushing logic down to each source.
+
+Typical use cases include:
+
+- Zero-downtime migrations where "old" and "new" replicas temporarily overlap.
+- Tiered storage, for example fresh data on a local cluster and historical data in S3.
+- Gradual roll-outs where only a subset of rows should be served from a new backend.
+
+By giving mutually exclusive predicates to the layers (for example, `date < watermark` and `date >= watermark`), you ensure that each row is read from exactly one source.
+
+## Engine definition
+
+```sql
+CREATE TABLE [IF NOT EXISTS] [db.]table_name
+(
+    column1 type1,
+    column2 type2,
+    ...
+)
+ENGINE = Hybrid(table_function_1, predicate_1 [, table_function_2, predicate_2 ...])
+```
+
+You must pass at least two arguments – the first table function and its predicate. Additional sources are appended as `table_function, predicate` pairs. The first table function is also used for `INSERT` statements.
+
+### Arguments and behaviour
+
+- `table_function_n` must be a valid table function (for example `remote`, `remoteSecure`, `cluster`, `clusterAllReplicas`, `s3Cluster`) or a fully qualified table name (`database.table`). The first argument must be a table function—such as `remote` or `cluster`—because it instantiates the underlying `Distributed` storage.
+- `predicate_n` must be an expression that can be evaluated on the table columns. The engine adds it to the layer's query with an additional `AND`, so expressions like `event_date >= '2025-09-01'` or `id BETWEEN 10 AND 15` are typical.
+- The query planner picks the same processing stage for every layer as it does for the base `Distributed` plan, so remote aggregation, ORDER BY pushdown, `skip_unused_shards`, and the legacy/analyzer execution modes behave the same way.
+- `INSERT` statements are forwarded to the first table function only. If you need multi-destination writes, use explicit `INSERT` statements into the respective sources.
+- Align schemas across the layers. ClickHouse builds a common header; if the physical types differ you may need to add casts on one side or in the query, just as you would when reading from heterogeneous replicas.
+
+## Example: local cluster plus S3 historical tier
+
+The following commands illustrate a two-layer layout. Hot data stays on a local ClickHouse cluster, while historical rows come from public S3 Parquet files.
+
+```sql
+-- Local MergeTree table that keeps current data
+CREATE OR REPLACE TABLE btc_blocks_local
+(
+    `hash` FixedString(64),
+    `version` Int64,
+    `mediantime` DateTime64(9),
+    `nonce` Int64,
+    `bits` FixedString(8),
+    `difficulty` Float64,
+    `chainwork` FixedString(64),
+    `size` Int64,
+    `weight` Int64,
+    `coinbase_param` String,
+    `number` Int64,
+    `transaction_count` Int64,
+    `merkle_root` FixedString(64),
+    `stripped_size` Int64,
+    `timestamp` DateTime64(9),
+    `date` Date
+)
+ENGINE = MergeTree
+ORDER BY (timestamp)
+PARTITION BY toYYYYMM(date);
+
+-- Hybrid table that unions the local shard with historical data in S3
+CREATE OR REPLACE TABLE btc_blocks ENGINE = Hybrid(
+    remote('localhost:9000', currentDatabase(), 'btc_blocks_local'), date >= '2025-09-01',
+    s3('s3://aws-public-blockchain/v1.0/btc/blocks/**.parquet', NOSIGN), date < '2025-09-01'
+) AS btc_blocks_local;
+
+-- Writes target the first (remote) layer
+INSERT INTO btc_blocks
+SELECT *
+FROM s3('s3://aws-public-blockchain/v1.0/btc/blocks/**.parquet', NOSIGN)
+WHERE date BETWEEN '2025-09-01' AND '2025-09-30';
+
+-- Reads seamlessly combine both predicates
+SELECT * FROM btc_blocks WHERE date = '2025-08-01'; -- data from s3
+SELECT * FROM btc_blocks WHERE date = '2025-09-05'; -- data from MergeTree (TODO: still analyzes s3)
+SELECT * FROM btc_blocks WHERE date IN ('2025-08-31','2025-09-01') -- data from both sources, single copy always
+
+
+-- Run analytic queries as usual
+SELECT
+    date,
+    count(),
+    uniqExact(CAST(hash, 'Nullable(String)')) AS hashes,
+    sum(CAST(number, 'Nullable(Int64)')) AS blocks_seen
+FROM btc_blocks
+WHERE date BETWEEN '2025-08-01' AND '2025-09-30'
+GROUP BY date
+ORDER BY date;
+```
+
+Because the predicates are applied inside every layer, queries such as `ORDER BY`, `GROUP BY`, `LIMIT`, `JOIN`, and `EXPLAIN` behave as if you were reading from a single `Distributed` table. When sources expose different physical types (for example `FixedString(64)` versus `String` in Parquet), add explicit casts during ingestion or in the query, as shown above.
diff --git a/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp b/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp
@@ -67,7 +67,8 @@ ASTPtr rewriteSelectQuery(
     const ASTPtr & query,
     const std::string & remote_database,
     const std::string & remote_table,
-    ASTPtr table_function_ptr)
+    ASTPtr table_function_ptr,
+    ASTPtr additional_filter)
 {
     auto modified_query_ast = query->clone();
 
@@ -80,8 +81,33 @@ ASTPtr rewriteSelectQuery(
 
     if (!context->getSettingsRef()[Setting::allow_experimental_analyzer])
     {
+        // Apply additional filter if provided
+        if (additional_filter)
+        {
+            if (select_query.where())
+            {
+                /// WHERE <old> AND <filter>
+                select_query.setExpression(
+                    ASTSelectQuery::Expression::WHERE,
+                    makeASTFunction("and", select_query.where(), additional_filter->clone()));
+            }
+            else
+            {
+                /// No WHERE – simply set it
+                select_query.setExpression(
+                    ASTSelectQuery::Expression::WHERE, additional_filter->clone());
+            }
+        }
+
         if (table_function_ptr)
-            select_query.addTableFunction(table_function_ptr);
+        {
+            select_query.addTableFunction(table_function_ptr->clone());
+
+            // Reset semantic table information for all column identifiers to prevent
+            // RestoreQualifiedNamesVisitor from adding wrong table names
+            ResetSemanticTableVisitor::Data data;
+            ResetSemanticTableVisitor(data).visit(modified_query_ast);
+        }
         else
             select_query.replaceDatabaseAndTable(remote_database, remote_table);
 
@@ -93,6 +119,7 @@ ASTPtr rewriteSelectQuery(
             data.distributed_table = DatabaseAndTableWithAlias(*getTableExpression(query->as<ASTSelectQuery &>(), 0));
             data.remote_table.database = remote_database;
             data.remote_table.table = remote_table;
+
             RestoreQualifiedNamesVisitor(data).visit(modified_query_ast);
         }
     }
diff --git a/src/Interpreters/ClusterProxy/SelectStreamFactory.h b/src/Interpreters/ClusterProxy/SelectStreamFactory.h
@@ -41,7 +41,8 @@ ASTPtr rewriteSelectQuery(
     const ASTPtr & query,
     const std::string & remote_database,
     const std::string & remote_table,
-    ASTPtr table_function_ptr = nullptr);
+    ASTPtr table_function_ptr = nullptr,
+    ASTPtr additional_filter = nullptr);
 
 using ColumnsDescriptionByShardNum = std::unordered_map<UInt32, ColumnsDescription>;
 using AdditionalShardFilterGenerator = std::function<ASTPtr(uint64_t)>;
diff --git a/src/Interpreters/ClusterProxy/executeQuery.cpp b/src/Interpreters/ClusterProxy/executeQuery.cpp
@@ -15,6 +15,7 @@
 #include <Parsers/ASTInsertQuery.h>
 #include <Planner/Utils.h>
 #include <Processors/QueryPlan/ParallelReplicasLocalPlan.h>
+#include <Processors/QueryPlan/DistributedCreateLocalPlan.h>
 #include <Processors/QueryPlan/QueryPlan.h>
 #include <Processors/QueryPlan/ReadFromLocalReplica.h>
 #include <Processors/QueryPlan/ReadFromPreparedSource.h>
@@ -333,7 +334,8 @@ void executeQuery(
     const std::string & sharding_key_column_name,
     const DistributedSettings & distributed_settings,
     AdditionalShardFilterGenerator shard_filter_generator,
-    bool is_remote_function)
+    bool is_remote_function,
+    std::span<const SelectQueryInfo> additional_query_infos)
 {
     const Settings & settings = context->getSettingsRef();
 
@@ -361,6 +363,7 @@ void executeQuery(
     new_context->increaseDistributedDepth();
 
     const size_t shards = cluster->getShardCount();
+    const bool has_additional_query_infos = !additional_query_infos.empty();
 
     if (context->getSettingsRef()[Setting::allow_experimental_analyzer])
     {
@@ -470,6 +473,29 @@ void executeQuery(
         plans.emplace_back(std::move(plan));
     }
 
+    if (has_additional_query_infos)
+    {
+        if (!header)
+            throw Exception(ErrorCodes::LOGICAL_ERROR, "Header is not initialized for local hybrid plan creation");
+
+        const Block & header_block = *header;
+        for (const auto & additional_query_info : additional_query_infos)
+        {
+            auto additional_plan = createLocalPlan(
+                additional_query_info.query,
+                header_block,
+                context,
+                processed_stage,
+                0,  /// shard_num is not applicable for local hybrid plans
+                1,  /// shard_count is not applicable for local hybrid plans
+                false,
+                false,
+                "");
+
+            plans.emplace_back(std::move(additional_plan));
+        }
+    }
+
     if (plans.empty())
         return;
 
@@ -485,6 +511,8 @@ void executeQuery(
         input_headers.emplace_back(plan->getCurrentHeader());
 
     auto union_step = std::make_unique<UnionStep>(std::move(input_headers));
+    if (has_additional_query_infos)
+        union_step->setStepDescription("Hybrid");
     query_plan.unitePlans(std::move(union_step), std::move(plans));
 }
 
diff --git a/src/Interpreters/ClusterProxy/executeQuery.h b/src/Interpreters/ClusterProxy/executeQuery.h
@@ -5,6 +5,7 @@
 #include <Parsers/IAST_fwd.h>
 
 #include <optional>
+#include <span>
 
 namespace DB
 {
@@ -88,7 +89,8 @@ void executeQuery(
     const std::string & sharding_key_column_name,
     const DistributedSettings & distributed_settings,
     AdditionalShardFilterGenerator shard_filter_generator,
-    bool is_remote_function);
+    bool is_remote_function,
+    std::span<const SelectQueryInfo> additional_query_infos = {});
 
 std::optional<QueryPipeline> executeInsertSelectWithParallelReplicas(
     const ASTInsertQuery & query_ast,
diff --git a/src/Interpreters/TranslateQualifiedNamesVisitor.cpp b/src/Interpreters/TranslateQualifiedNamesVisitor.cpp
@@ -399,4 +399,15 @@ void RestoreQualifiedNamesMatcher::visit(ASTIdentifier & identifier, ASTPtr &, D
     }
 }
 
+void ResetSemanticTableMatcher::visit(ASTPtr & ast, Data & data)
+{
+    if (auto * t = ast->as<ASTIdentifier>())
+        visit(*t, ast, data);
+}
+
+void ResetSemanticTableMatcher::visit(ASTIdentifier & identifier, ASTPtr &, Data &)
+{
+    identifier.resetSemanticTable();
+}
+
 }
diff --git a/src/Interpreters/TranslateQualifiedNamesVisitor.h b/src/Interpreters/TranslateQualifiedNamesVisitor.h
@@ -80,4 +80,33 @@ struct RestoreQualifiedNamesMatcher
 
 using RestoreQualifiedNamesVisitor = InDepthNodeVisitor<RestoreQualifiedNamesMatcher, true>;
 
+
+/// Reset semantic->table for all column identifiers in the AST.
+///
+/// PROBLEM DESCRIPTION:
+/// When an AST is passed through multiple query rewrites (e.g., in Hybrid -> remote),
+/// the semantic->table information attached to ASTIdentifier nodes can become stale and
+/// cause incorrect column qualification. This happens because:
+///
+/// 1. During initial parsing, semantic->table is populated with the original table name
+/// 2. When the query is rewritten (e.g., FROM clause changed from table to remote() function inside Hybrid),
+///    the AST structure is modified but semantic->table information is preserved
+/// 3. Subsequent visitors like RestoreQualifiedNamesVisitor called in remote() function over the same AST
+///    may use this stale semantic->table information to incorrectly qualify column names with the original table name
+///
+/// SOLUTION:
+/// This visitor clears semantic->table for all column identifiers, ensuring that subsequent
+/// visitors work with clean semantic information and don't apply stale table qualifications.
+struct ResetSemanticTableMatcher
+{
+    // No data needed for this visitor
+    struct Data {};
+
+    static bool needChildVisit(const ASTPtr &, const ASTPtr &) { return true; }
+    static void visit(ASTPtr & ast, Data & data);
+    static void visit(ASTIdentifier & identifier, ASTPtr &, Data & data);
+};
+
+using ResetSemanticTableVisitor = InDepthNodeVisitor<ResetSemanticTableMatcher, true>;
+
 }
diff --git a/src/Parsers/ASTIdentifier.cpp b/src/Parsers/ASTIdentifier.cpp
@@ -167,6 +167,17 @@ void ASTIdentifier::restoreTable()
     }
 }
 
+void ASTIdentifier::resetSemanticTable()
+{
+    // Only reset semantic table for column identifiers (not table identifiers)
+    if (semantic && !semantic->special)
+    {
+        semantic->table.clear();
+        semantic->can_be_alias = true;
+        semantic->membership = std::nullopt;
+    }
+}
+
 std::shared_ptr<ASTTableIdentifier> ASTIdentifier::createTable() const
 {
     if (name_parts.size() == 1) return std::make_shared<ASTTableIdentifier>(name_parts[0]);
diff --git a/src/Parsers/ASTIdentifier.h b/src/Parsers/ASTIdentifier.h
@@ -52,6 +52,7 @@ class ASTIdentifier : public ASTWithAlias
     void updateTreeHashImpl(SipHash & hash_state, bool ignore_alias) const override;
 
     void restoreTable();  // TODO(ilezhankin): get rid of this
+    void resetSemanticTable();  // Reset semantic to empty string (see ResetSemanticTableVisitor)
     std::shared_ptr<ASTTableIdentifier> createTable() const;  // returns |nullptr| if identifier is not table.
 
     String full_name;
diff --git a/src/Planner/PlannerActionsVisitor.cpp b/src/Planner/PlannerActionsVisitor.cpp
@@ -348,7 +348,7 @@ class ActionNodeNameHelper
             }
             default:
             {
-                throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid action query tree node {}", node->formatASTForErrorMessage());
+                throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid action query tree node {} (node_type: {})", node->formatASTForErrorMessage(), static_cast<int>(node_type));
             }
         }
 
diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp
diff --git a/src/Storages/StorageDistributed.h b/src/Storages/StorageDistributed.h
diff --git a/src/Storages/registerStorages.cpp b/src/Storages/registerStorages.cpp
diff --git a/tests/queries/0_stateless/03642_tiered_distributed.reference b/tests/queries/0_stateless/03642_tiered_distributed.reference
diff --git a/tests/queries/0_stateless/03642_tiered_distributed.sql b/tests/queries/0_stateless/03642_tiered_distributed.sql

Original file line number	Diff line number	Diff line change
`@@ -399,4 +399,15 @@ void RestoreQualifiedNamesMatcher::visit(ASTIdentifier & identifier, ASTPtr &, D`
`399`	`399`	`}`
`400`	`400`	`}`
`401`	`401`
	`402`	`+void ResetSemanticTableMatcher::visit(ASTPtr & ast, Data & data)`
	`403`	`+{`
	`404`	`+ if (auto * t = ast->as<ASTIdentifier>())`
	`405`	`+ visit(*t, ast, data);`
	`406`	`+}`
	`407`	`+`
	`408`	`+void ResetSemanticTableMatcher::visit(ASTIdentifier & identifier, ASTPtr &, Data &)`
	`409`	`+{`
	`410`	`+ identifier.resetSemanticTable();`
	`411`	`+}`
	`412`	`+`
`402`	`413`	`}`
Original file line number	Diff line number	Diff line change
`@@ -167,6 +167,17 @@ void ASTIdentifier::restoreTable()`
`167`	`167`	`}`
`168`	`168`	`}`
`169`	`169`
	`170`	`+void ASTIdentifier::resetSemanticTable()`
	`171`	`+{`
	`172`	`+ // Only reset semantic table for column identifiers (not table identifiers)`
	`173`	`+ if (semantic && !semantic->special)`
	`174`	`+ {`
	`175`	`+ semantic->table.clear();`
	`176`	`+ semantic->can_be_alias = true;`
	`177`	`+ semantic->membership = std::nullopt;`
	`178`	`+ }`
	`179`	`+}`
	`180`	`+`
`170`	`181`	`std::shared_ptr<ASTTableIdentifier> ASTIdentifier::createTable() const`
`171`	`182`	`{`
`172`	`183`	`if (name_parts.size() == 1) return std::make_shared<ASTTableIdentifier>(name_parts[0]);`
Original file line number	Diff line number	Diff line change
`@@ -348,7 +348,7 @@ class ActionNodeNameHelper`
`348`	`348`	`}`
`349`	`349`	`default:`
`350`	`350`	`{`
`351`		`- throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid action query tree node {}", node->formatASTForErrorMessage());`
	`351`	`+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid action query tree node {} (node_type: {})", node->formatASTForErrorMessage(), static_cast<int>(node_type));`
`352`	`352`	`}`
`353`	`353`	`}`
`354`	`354`