Skip to content

Commit ae4c2cb

Browse files
serxazvonand
authored andcommitted
Merge pull request ClickHouse#90243 from AVMusorin/avoid-possible-null-pointer
Fix crash in StorageDistributed when parsing malformed shard directory names
1 parent e7db2db commit ae4c2cb

File tree

2 files changed

+73
-2
lines changed

2 files changed

+73
-2
lines changed

src/Storages/StorageDistributed.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1690,9 +1690,11 @@ Cluster::Addresses StorageDistributed::parseAddresses(const std::string & name)
16901690
continue;
16911691
}
16921692

1693-
if (address.replica_index > replicas)
1693+
if (address.replica_index == 0 || address.replica_index > replicas)
16941694
{
1695-
LOG_ERROR(log, "No shard with replica_index={} ({})", address.replica_index, name);
1695+
LOG_ERROR(log, "Invalid replica_index={} for directory '{}' (cluster has {} replicas for shard {}). "
1696+
"Expected directory format: 'shardN_replicaM' or 'shardN_all_replicas'",
1697+
address.replica_index, dirname, replicas, address.shard_index);
16961698
continue;
16971699
}
16981700

tests/integration/test_distributed_format/test.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,3 +213,72 @@ def test_remove_replica(started_cluster):
213213
"/etc/clickhouse-server/config.d/another_remote_servers.xml",
214214
]
215215
)
216+
217+
def test_invalid_shard_directory_format(started_cluster):
218+
"""
219+
Test that ClickHouse doesn't crash when it encounters
220+
a malformed directory name like 'shard1_all_replicas_bkp'
221+
during distributed table initialization.
222+
"""
223+
node.query("drop table if exists test.dist_invalid sync")
224+
node.query("drop table if exists test.local_invalid sync")
225+
node.query(
226+
"create table test.local_invalid (x UInt64, s String) engine = MergeTree order by x"
227+
)
228+
node.query(
229+
"create table test.dist_invalid (x UInt64, s String) "
230+
"engine = Distributed('test_cluster_internal_replication', test, local_invalid)"
231+
)
232+
233+
node.query(
234+
"insert into test.dist_invalid values (1, 'a'), (2, 'bb')",
235+
settings={"use_compact_format_in_distributed_parts_names": "1"},
236+
)
237+
238+
data_path = node.query(
239+
"SELECT arrayElement(data_paths, 1) FROM system.tables "
240+
"WHERE database='test' AND name='dist_invalid'"
241+
).strip()
242+
243+
# Create a malformed directory that would cause the bug
244+
malformed_dir = f"{data_path}/shard1_all_replicas_bkp"
245+
node.exec_in_container(["mkdir", "-p", malformed_dir])
246+
247+
# Create a dummy file so the directory isn't considered empty
248+
node.exec_in_container(["touch", f"{malformed_dir}/dummy.txt"])
249+
250+
invalid_formats = [
251+
"shard1_all_replicas_backup",
252+
"shard1_all_replicas_old",
253+
"shard2_all_replicas_tmp",
254+
]
255+
for invalid_dir in invalid_formats:
256+
invalid_path = f"{data_path}/{invalid_dir}"
257+
node.exec_in_container(["mkdir", "-p", invalid_path])
258+
# just dummy file to have something in the directory
259+
node.exec_in_container(["touch", f"{invalid_path}/dummy.txt"])
260+
261+
# Reproduce server restart with detach and attach
262+
node.query("detach table test.dist_invalid")
263+
node.query("attach table test.dist_invalid")
264+
265+
node.query("SYSTEM FLUSH LOGS system.text_log")
266+
267+
error_logs = node.query(
268+
"""
269+
SELECT count()
270+
FROM system.text_log
271+
WHERE level = 'Error'
272+
AND message LIKE '%Invalid replica_index%'
273+
AND message LIKE '%shard1_all_replicas%'
274+
"""
275+
).strip()
276+
277+
# We should have at least one error log for each malformed directory
278+
# But we don't strictly require this in case logging is disabled
279+
# The important thing is that the server didn't crash
280+
print(f"Found {error_logs} error log entries for invalid directories")
281+
282+
# Clean up
283+
node.query("drop table test.dist_invalid sync")
284+
node.query("drop table test.local_invalid sync")

0 commit comments

Comments
 (0)