Skip to content

Commit 4487431

Browse files
committed
looking good before ordering
1 parent b571f5a commit 4487431

File tree

7 files changed

+291
-26
lines changed

7 files changed

+291
-26
lines changed

src/Core/Settings.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6876,6 +6876,9 @@ Overwrite file if it already exists when exporting a merge tree part
68766876
)", 0) \
68776877
DECLARE(Bool, export_merge_tree_partition_force_export, false, R"(
68786878
Ignore existing partition export and overwrite the zookeeper entry
6879+
)", 0) \
6880+
DECLARE(UInt64, export_merge_tree_partition_max_retries, 3, R"(
6881+
Maximum number of retries for exporting a merge tree part in an export partition task
68796882
)", 0) \
68806883
\
68816884
/* ####################################################### */ \

src/Storages/ExportReplicatedMergeTreePartitionManifest.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ struct ExportReplicatedMergeTreePartitionManifest
1919
size_t number_of_parts;
2020
std::vector<String> parts;
2121
time_t create_time;
22+
size_t max_retries;
2223

2324
std::string toJsonString() const
2425
{
@@ -36,6 +37,7 @@ struct ExportReplicatedMergeTreePartitionManifest
3637
json.set("parts", parts_array);
3738

3839
json.set("create_time", create_time);
40+
json.set("max_retries", max_retries);
3941
std::ostringstream oss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM
4042
oss.exceptions(std::ios::failbit);
4143
Poco::JSON::Stringifier::stringify(json, oss);
@@ -55,7 +57,7 @@ struct ExportReplicatedMergeTreePartitionManifest
5557
manifest.destination_table = json->getValue<String>("destination_table");
5658
manifest.source_replica = json->getValue<String>("source_replica");
5759
manifest.number_of_parts = json->getValue<size_t>("number_of_parts");
58-
60+
manifest.max_retries = json->getValue<size_t>("max_retries");
5961
auto parts_array = json->getArray("parts");
6062
for (size_t i = 0; i < parts_array->size(); ++i)
6163
manifest.parts.push_back(parts_array->getElement<String>(static_cast<unsigned int>(i)));

src/Storages/MergeTree/ExportPartitionTaskScheduler.cpp

Lines changed: 27 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ void ExportPartitionTaskScheduler::run()
2222

2323
for (const auto & [key, entry] : storage.export_merge_tree_partition_task_entries)
2424
{
25-
2625
const auto & manifest = entry.manifest;
2726
const auto & database = storage.getContext()->resolveDatabase(manifest.destination_database);
2827
const auto & table = manifest.destination_table;
@@ -112,11 +111,13 @@ void ExportPartitionTaskScheduler::run()
112111
{
113112
tryLogCurrentException(__PRETTY_FUNCTION__);
114113
zk->tryRemove(fs::path(storage.zookeeper_path) / "exports" / key / "locks" / zk_part_name);
115-
/// todo arthur re-schedule this so we can try later
116114
/// we should not increment retry_count because the node might just be full
117115
}
118116
}
119117
}
118+
119+
/// maybe we failed to schedule or failed to export, need to retry eventually
120+
storage.export_merge_tree_partition_select_task->scheduleAfter(1000 * 5);
120121
}
121122

122123
void ExportPartitionTaskScheduler::handlePartExportCompletion(
@@ -137,7 +138,7 @@ void ExportPartitionTaskScheduler::handlePartExportCompletion(
137138
}
138139
else
139140
{
140-
handlePartExportFailure(processing_parts_path, processed_part_path, part_name, export_path, zk, result.exception);
141+
handlePartExportFailure(processing_parts_path, part_name, export_path, zk, result.exception, manifest.max_retries);
141142
}
142143
}
143144

@@ -182,7 +183,6 @@ void ExportPartitionTaskScheduler::handlePartExportSuccess(
182183
// Remove children before parent (order matters for multi operations)
183184
// Maybe a ls + multi rm..
184185
requests.emplace_back(zkutil::makeRemoveRequest(processing_parts_path / part_name / "retry_count", -1));
185-
requests.emplace_back(zkutil::makeRemoveRequest(processing_parts_path / part_name / "max_retry", -1));
186186
requests.emplace_back(zkutil::makeRemoveRequest(processing_parts_path / part_name / "status", -1));
187187
requests.emplace_back(zkutil::makeRemoveRequest(processing_parts_path / part_name, -1));
188188
}
@@ -223,15 +223,30 @@ void ExportPartitionTaskScheduler::handlePartExportSuccess(
223223

224224
void ExportPartitionTaskScheduler::handlePartExportFailure(
225225
const std::filesystem::path & processing_parts_path,
226-
const std::filesystem::path & processed_part_path,
227226
const std::string & part_name,
228227
const std::filesystem::path & export_path,
229228
const zkutil::ZooKeeperPtr & zk,
230-
const String & exception
229+
const String & exception,
230+
size_t max_retries
231231
)
232232
{
233233
tryLogCurrentException(__PRETTY_FUNCTION__);
234234

235+
Coordination::Stat locked_by_stat;
236+
std::string locked_by;
237+
238+
if (!zk->tryGet(export_path / "locks" / part_name, locked_by, &locked_by_stat))
239+
{
240+
LOG_INFO(storage.log, "ExportPartition: Part {} is not locked by any replica, will not increment error counts", part_name);
241+
return;
242+
}
243+
244+
if (locked_by != storage.replica_name)
245+
{
246+
LOG_INFO(storage.log, "ExportPartition: Part {} is locked by another replica, will not increment error counts", part_name);
247+
return;
248+
}
249+
235250
Coordination::Requests ops;
236251

237252
const auto processing_part_path = processing_parts_path / part_name;
@@ -240,21 +255,17 @@ void ExportPartitionTaskScheduler::handlePartExportFailure(
240255
{
241256
std::size_t retry_count = std::stoull(retry_count_string.c_str()) + 1;
242257

243-
244258
ops.emplace_back(zkutil::makeSetRequest(processing_part_path / "retry_count", std::to_string(retry_count), -1));
245-
ops.emplace_back(zkutil::makeRemoveRequest(export_path / "locks" / part_name, -1));
259+
ops.emplace_back(zkutil::makeRemoveRequest(export_path / "locks" / part_name, locked_by_stat.version));
246260

247-
if (retry_count >= 3)
261+
if (retry_count >= max_retries)
248262
{
249-
/// remove from processing and create in processed with status FAILED
250-
251-
ops.emplace_back(zkutil::makeRemoveRequest(processing_parts_path / part_name, -1));
252-
ops.emplace_back(zkutil::makeCreateRequest(processed_part_path, "", zkutil::CreateMode::Persistent));
253-
ops.emplace_back(zkutil::makeCreateRequest(processed_part_path / "status", "FAILED", zkutil::CreateMode::Persistent));
254-
ops.emplace_back(zkutil::makeCreateRequest(processed_part_path / "finished_by", storage.replica_name, zkutil::CreateMode::Persistent));
263+
/// just set status in processing_part_path and finished_by
264+
ops.emplace_back(zkutil::makeSetRequest(processing_part_path / "status", "FAILED", -1));
265+
ops.emplace_back(zkutil::makeCreateRequest(processing_part_path / "finished_by", storage.replica_name, zkutil::CreateMode::Persistent));
255266
ops.emplace_back(zkutil::makeSetRequest(export_path / "status", "FAILED", -1));
256267

257-
LOG_INFO(storage.log, "ExportPartition: Marked part export {} as failed", part_name);
268+
LOG_INFO(storage.log, "ExportPartition: Retry count limit exceeded for part {}, will try to fail the entire task", part_name);
258269
}
259270

260271
std::size_t num_exceptions = 0;

src/Storages/MergeTree/ExportPartitionTaskScheduler.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,11 +41,11 @@ class ExportPartitionTaskScheduler
4141

4242
void handlePartExportFailure(
4343
const std::filesystem::path & processing_parts_path,
44-
const std::filesystem::path & processed_part_path,
4544
const std::string & part_name,
4645
const std::filesystem::path & export_path,
4746
const zkutil::ZooKeeperPtr & zk,
48-
const String & exception);
47+
const String & exception,
48+
size_t max_retries);
4949
};
5050

5151
}

src/Storages/StorageReplicatedMergeTree.cpp

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,7 @@ namespace Setting
193193
extern const SettingsBool update_sequential_consistency;
194194
extern const SettingsBool allow_experimental_export_merge_tree_part;
195195
extern const SettingsBool export_merge_tree_partition_force_export;
196+
extern const SettingsUInt64 export_merge_tree_partition_max_retries;
196197
}
197198

198199
namespace MergeTreeSetting
@@ -8124,6 +8125,7 @@ void StorageReplicatedMergeTree::exportPartitionToTable(const PartitionCommand &
81248125
manifest.number_of_parts = part_names.size();
81258126
manifest.parts = part_names;
81268127
manifest.create_time = time(nullptr);
8128+
manifest.max_retries = query_context->getSettingsRef()[Setting::export_merge_tree_partition_max_retries];
81278129

81288130
ops.emplace_back(zkutil::makeCreateRequest(
81298131
fs::path(partition_exports_path) / "metadata.json",
@@ -8147,11 +8149,6 @@ void StorageReplicatedMergeTree::exportPartitionToTable(const PartitionCommand &
81478149
"",
81488150
zkutil::CreateMode::Persistent));
81498151

8150-
ops.emplace_back(zkutil::makeCreateRequest(
8151-
fs::path(partition_exports_path) / "processing" / part / "max_retry",
8152-
"3",
8153-
zkutil::CreateMode::Persistent));
8154-
81558152
ops.emplace_back(zkutil::makeCreateRequest(
81568153
fs::path(partition_exports_path) / "processing" / part / "status",
81578154
"PENDING",
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
<clickhouse>
2+
<profiles>
3+
<default>
4+
<s3_retry_attempts>3</s3_retry_attempts>
5+
</default>
6+
</profiles>
7+
</clickhouse>
8+

0 commit comments

Comments
 (0)