Skip to content

Commit 17d1d2c

Browse files
committed
fix: make NOSPACE error code more accurate
Current implementation has a critical error when the following conditions are met: - clients are creating files in goal ec(D, P). - more than one minute has passed. - all the chunkservers restart. In such case, it is very likely that master replies to some of the chunk write (creations) requests SAUNAFS_ERROR_NOSPACE. This happens when not being able to create the chunk due to not having at least D chunkservers available and having at least one available. The error code SAUNAFS_ERROR_NOSPACE is not retryable by the client, which is correct, but makes the write operations to completely fail. The incorrect behavior is the one from the master, which must reply SAUNAFS_ERROR_NOCHUNKSERVERS in such cases and only leave the NOSPACE error code for the case when at least D chunkservers are available. Signed-off-by: Dave <dave@leil.io>
1 parent f9333fe commit 17d1d2c

File tree

3 files changed

+29
-7
lines changed

3 files changed

+29
-7
lines changed

src/master/chunks.cc

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1379,15 +1379,18 @@ uint8_t chunk_create(bool quotaExceeded, uint8_t goal, uint8_t *operation, uint6
13791379
if (quotaExceeded) { return SAUNAFS_ERROR_QUOTA; }
13801380

13811381
// Next check availability of chunkservers for the given goal
1382-
auto serversWithChunkTypes = matocsserv_getservers_for_new_chunk(goal, minServerVersion);
1382+
uint16_t minServerCount = 0;
1383+
auto serversWithChunkTypes =
1384+
matocsserv_getservers_for_new_chunk(goal, minServerCount, minServerVersion);
13831385
if (serversWithChunkTypes.empty()) {
13841386
uint16_t usableChunkservers, totalChunkservers;
13851387
double minUsage, maxUsage;
13861388
matocsserv_usagedifference(&minUsage, &maxUsage, &usableChunkservers, &totalChunkservers);
13871389

1388-
if (usableChunkservers > 0 && eventloop_time() > starttime + kStartupGracePeriodSeconds) {
1389-
// if there are chunkservers and it's at least one minute after start then it means that
1390-
// there is no space left
1390+
if (usableChunkservers >= minServerCount &&
1391+
eventloop_time() > starttime + kStartupGracePeriodSeconds) {
1392+
// if there are enough chunkservers and it's at least one minute after start then it
1393+
// means that there is no space left
13911394
return SAUNAFS_ERROR_NOSPACE;
13921395
}
13931396

src/master/matocsserv.cc

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -352,11 +352,13 @@ std::vector<ServerWithUsage> matocsserv_getservers_sorted() {
352352
}
353353

354354
std::vector<std::pair<matocsserventry *, ChunkPartType>> matocsserv_getservers_for_new_chunk(
355-
uint8_t goal_id, uint32_t min_server_version) {
355+
uint8_t goal_id, uint16_t &min_server_count, uint32_t min_server_version) {
356356
static std::array<ChunkCreationHistory, GoalId::kMax + 1> history;
357357
GetServersForNewChunk getter;
358358
const Goal &goal(gFSOperations->getGoalDefinition(goal_id));
359359

360+
min_server_count = std::numeric_limits<uint16_t>::max();
361+
360362
for (const auto &eptr : matocsservList) {
361363
if (eptr->mode != ChunkserverConnectionMode::KILL && eptr->totalspace > 0 &&
362364
eptr->usedspace <= eptr->totalspace &&
@@ -386,6 +388,9 @@ std::vector<std::pair<matocsserventry *, ChunkPartType>> matocsserv_getservers_f
386388
// we don't use any chunkserver twice if not necessary.
387389
for (const auto &slice : goal) {
388390
std::vector<std::pair<matocsserventry *, ChunkPartType>> slice_ret;
391+
min_server_count =
392+
std::min(min_server_count,
393+
static_cast<uint16_t>(slice_traits::getNumberOfDataParts(slice.getType())));
389394

390395
// add parts index permutation here
391396
std::array<int, Goal::Slice::kMaxPartsCount> shuffle;

src/master/matocsserv.h

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -89,8 +89,22 @@ std::vector<ServerWithUsage> matocsserv_getservers_sorted();
8989
uint32_t matocsserv_get_version(matocsserventry* eptr);
9090
void matocsserv_usagedifference(double *minusage, double *maxusage, uint16_t *usablescount,
9191
uint16_t *totalscount);
92-
std::vector<std::pair<matocsserventry*, ChunkPartType>> matocsserv_getservers_for_new_chunk(
93-
uint8_t goalId, uint32_t min_server_version = 0);
92+
93+
/*! \brief Get chunkservers for a new chunk.
94+
*
95+
* This function returns a list of chunkservers that can be used for a new chunk creation. The
96+
* returned servers are randomly shuffled and sorted by disk usage, so the least used and least
97+
* loaded servers are returned first.
98+
*
99+
* \param goalId - the id of the goal for which the chunk is being created.
100+
* \param min_server_count[out] - the minimum number of servers that should be returned for each
101+
* slice in the goal. This is needed to determine if there are
102+
* enough servers to create a chunk with the given goal.
103+
* \param min_server_version - return only chunkservers with higher (or equal) version.
104+
* \return List of chunkservers for new chunk creation with their types.
105+
*/
106+
std::vector<std::pair<matocsserventry *, ChunkPartType>> matocsserv_getservers_for_new_chunk(
107+
uint8_t goalId, uint16_t &min_server_count, uint32_t min_server_version = 0);
94108
void matocsserv_getspace(uint64_t* totalspace, uint64_t* availspace);
95109
const char* matocsserv_getstrip(matocsserventry* eptr);
96110
uint32_t matocsserv_get_servip(matocsserventry *eptr);

0 commit comments

Comments
 (0)