Skip to content

Commit 56f4994

Browse files
zuston and Junfan Zhang authored
[#2241] feat(server): Introduce option to mark server unhealthy once any storage corrupted (#2245)
### What changes were proposed in this pull request?

Introduce an option to mark the server unhealthy once any storage is corrupted.

### Why are the changes needed?

For: #2241

This feature reduces the impact of corrupted local directories.

### Does this PR introduce _any_ user-facing change?

Yes. `rss.server.health.markUnhealthyOnceStorageCorruption` is introduced. Its default value is `false`, so the feature is disabled by default.

### How was this patch tested?

Existing unit tests.

Co-authored-by: Junfan Zhang <zhangjunfan@qiyi.com>
1 parent 7ca0719 commit 56f4994

File tree

2 files changed

+20
-0
lines changed

2 files changed

+20
-0
lines changed

server/src/main/java/org/apache/uniffle/server/LocalStorageChecker.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ public class LocalStorageChecker extends Checker {
6262
private boolean isHealthy = true;
6363
private ExecutorService workers;
6464
private ReconfigurableConfManager.Reconfigurable<Long> diskCheckerExecutionTimeoutMs;
65+
private boolean markUnhealthyOnceDirCorrupted = false;
6566

6667
public LocalStorageChecker(ShuffleServerConf conf, List<LocalStorage> storages) {
6768
super(conf);
@@ -88,6 +89,9 @@ public LocalStorageChecker(ShuffleServerConf conf, List<LocalStorage> storages)
8889
this.diskCheckerExecutionTimeoutMs =
8990
conf.getReconfigurableConf(ShuffleServerConf.HEALTH_CHECKER_LOCAL_STORAGE_EXECUTE_TIMEOUT);
9091
this.workers = Executors.newFixedThreadPool(basePaths.size());
92+
93+
this.markUnhealthyOnceDirCorrupted =
94+
conf.get(ShuffleServerConf.SERVER_UNHEALTHY_ONCE_STORAGE_CORRUPTION);
9195
}
9296

9397
@Override
@@ -179,6 +183,15 @@ public boolean checkIsHealthy() {
179183
return false;
180184
}
181185

186+
if (markUnhealthyOnceDirCorrupted && corruptedDirs.get() > 0) {
187+
if (isHealthy) {
188+
LOG.info(
189+
"shuffle server become unhealthy because {} corrupted dirs exist", corruptedDirs.get());
190+
}
191+
isHealthy = false;
192+
return false;
193+
}
194+
182195
double availablePercentage = num.get() * 100.0 / storageInfos.size();
183196
if (Double.compare(availablePercentage, minStorageHealthyPercentage) >= 0) {
184197
if (!isHealthy) {

server/src/main/java/org/apache/uniffle/server/ShuffleServerConf.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -267,6 +267,13 @@ public class ShuffleServerConf extends RssBaseConf {
267267
.defaultValue(2 * 1024L * 1024L)
268268
.withDescription("The index file size hint");
269269

270+
public static final ConfigOption<Boolean> SERVER_UNHEALTHY_ONCE_STORAGE_CORRUPTION =
271+
ConfigOptions.key("rss.server.health.markUnhealthyOnceStorageCorruption")
272+
.booleanType()
273+
.defaultValue(false)
274+
.withDescription(
275+
"Mark server unhealthy once any storage corrupted. Default value is false");
276+
270277
public static final ConfigOption<Double> HEALTH_STORAGE_MAX_USAGE_PERCENTAGE =
271278
ConfigOptions.key("rss.server.health.max.storage.usage.percentage")
272279
.doubleType()

0 commit comments

Comments
 (0)