Commit 196e86f
fix: fix CS kill/crash when writing data
Recent tests show chunks unavailable when performing the following test:

- Start writing small files in ec(6,2) in the background.
- Kill two chunkservers.
- Wait for the writes of the files to finish.
- Bring the two chunkservers back.
- Wait for the data to be replicated.
- Stop some other two chunkservers.
- Validate that the data is available.

In the last step there are six chunkservers available and no chunk parts missing, so no chunks should be unavailable. The error observed was a CRC error in the killed-and-restarted chunkservers.

The issue found is the following:

- Some chunk gets its data parts successfully written to the drive.
- The client gets to know this (chunk write finished OK) and sends the WRITE_END packet to the CSs.
- The CS gets killed after receiving the WRITE_END but before running job_close (hddClose), which is the function responsible for syncing the metadata parts to the drive. The data parts of those chunks are therefore fine, but the CRC of the blocks is incorrect.
- The client unlocks the chunk on the master side (WRITE_END packet) without noticing any issue and without retrying the write (since it finished everything it had to write).
- There is no version increase in the other chunk parts, so after the CS is restarted its chunk parts are registered as good ones, despite the previously mentioned CRC error (which no component knows about).
- After stopping two other CSs and trying to read the data, the issue emerges.

The solution so far is to move the endChunkLock call to after job_close is processed, and to increase the priority of the close operations. This way we make sure that the master receives notice about the write end only after all the operations related to that chunk part are completed.

This solution does not cover the case where the USE_CHUNKSERVER_SIDE_CHUNK_LOCK option is disabled.

A test was added to check the previously mentioned scenario.

Signed-off-by: Dave <dave@leil.io>
1 parent 238a9c8 commit 196e86f
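
For illustration, a minimal sketch of the new cleanup ordering described above, assuming simplified names: only job_close, WRITE_END and endChunkLock appear in the real code; scheduleClose and releaseMasterChunkLock are invented for this sketch (the actual implementation is in WriteHighLevelOp::cleanup() below).

#include <cstdint>
#include <functional>

// Sketch of the fixed ordering: release the master-side chunk lock from the
// close callback, i.e. only after the close job has synced the CRC metadata
// to disk. Helper names are invented for this sketch.
void cleanupAfterWrite(uint64_t chunkId, bool isChunkOpen, bool isChunkLocked,
                       const std::function<void(uint64_t, std::function<void(uint8_t)>)> &scheduleClose,
                       const std::function<void(uint64_t, uint8_t)> &releaseMasterChunkLock) {
	constexpr uint8_t kStatusOk = 0;  // stand-in for SAUNAFS_STATUS_OK
	if (isChunkOpen) {
		if (isChunkLocked) {
			// Before the fix the lock ended here, so a kill between WRITE_END and
			// the close job left stale CRCs that no component knew about.
			scheduleClose(chunkId, [releaseMasterChunkLock, chunkId](uint8_t closeStatus) {
				releaseMasterChunkLock(chunkId, closeStatus);  // lock ends after the sync
			});
		} else {
			scheduleClose(chunkId, [](uint8_t) {});  // no lock to release
		}
	} else if (isChunkLocked) {
		releaseMasterChunkLock(chunkId, kStatusOk);  // nothing was opened
	}
}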

File tree: 8 files changed, +176 −18 lines changed

src/chunkserver/bgjobs.cc

Lines changed: 8 additions & 4 deletions
@@ -146,10 +146,14 @@ uint32_t JobPool::addJob(ChunkOperation operation, JobCallback callback, void *e
 	job->state = JobPool::State::Enabled;
 	job->listenerId = listenerId;
 	listenerInfo.jobHash[jobId] = std::move(job);
-	// Use higher priority (0) for Open and GetBlocks operations
-	jobsQueue->put(
-	    jobId, operation, reinterpret_cast<uint8_t *>(listenerInfo.jobHash[jobId].get()), 1,
-	    (operation == ChunkOperation::Open || operation == ChunkOperation::GetBlocks) ? 0 : 1);
+	// Use higher priority (0) for Open, Close and GetBlocks operations
+	uint8_t priority =
+	    (operation == ChunkOperation::Open || operation == ChunkOperation::GetBlocks ||
+	     operation == ChunkOperation::Close)
+	        ? 0
+	        : 1;
+	jobsQueue->put(jobId, operation, reinterpret_cast<uint8_t *>(listenerInfo.jobHash[jobId].get()),
+	               1, priority);
 	return jobId;
 }
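
For context, a minimal sketch of the two-level behavior the priority argument of jobsQueue->put() selects between: priority-0 jobs (Open, Close, GetBlocks) overtake queued priority-1 work. This is an illustration only; the real JobsQueue is thread-safe, keyed by job id, and has a different API.

#include <cstdint>
#include <deque>
#include <optional>
#include <utility>

// Toy two-level queue: take() always drains priority 0 before priority 1,
// which is why promoting Close to priority 0 lets it run ahead of writes.
template <typename Job>
class TwoLevelJobQueue {
public:
	void put(Job job, uint8_t priority) {
		(priority == 0 ? high_ : low_).push_back(std::move(job));
	}
	std::optional<Job> take() {
		std::deque<Job> &queue = high_.empty() ? low_ : high_;
		if (queue.empty()) { return std::nullopt; }
		Job job = std::move(queue.front());
		queue.pop_front();
		return job;
	}
private:
	std::deque<Job> high_;  // priority 0: Open, Close, GetBlocks
	std::deque<Job> low_;   // priority 1: everything else
};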

src/chunkserver/chunk_high_level_ops.cc

Lines changed: 22 additions & 4 deletions
@@ -549,16 +549,34 @@ void WriteHighLevelOp::cleanup() {
 	}

 	if (isChunkOpen_) {
-		job_close(*workerJobPool(), kEmptyCallback, chunkId_, chunkType_);
+		if (isChunkLocked_) {
+			// We need to wait for the metadata sync before releasing the lock, so we
+			// use a callback to release the lock afterwards
+			job_close(*workerJobPool(), jobCloseWriteCallback(chunkId_, chunkType_, SAUNAFS_STATUS_OK),
+			          chunkId_, chunkType_);
+		} else {
+			job_close(*workerJobPool(), kEmptyCallback, chunkId_, chunkType_);
+		}
 		isChunkOpen_ = false;
-	}
-
-	if (isChunkLocked_) {
+	} else if (isChunkLocked_) {
 		masterconn_get_job_pool()->endChunkLock(chunkId_, chunkType_, SAUNAFS_STATUS_OK);
 	}
+
 	isChunkLocked_ = false;
 	partiallyCompletedWrites_.clear();
 	chunkId_ = 0;
 	chunkVersion_ = 0;
 	chunkType_ = slice_traits::standard::ChunkPartType();
 }
+
+std::function<void(uint8_t status, void *packet)> jobCloseWriteCallback(uint64_t chunkId,
+                                                                        ChunkPartType chunkType,
+                                                                        uint8_t untoldStatus) {
+	return [chunkId, chunkType, untoldStatus](uint8_t status, void * /*entry*/) {
+		if (untoldStatus == SAUNAFS_STATUS_OK && status != SAUNAFS_STATUS_OK) {
+			masterconn_get_job_pool()->endChunkLock(chunkId, chunkType, status);
+		} else {
+			masterconn_get_job_pool()->endChunkLock(chunkId, chunkType, untoldStatus);
+		}
+	};
+}
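
To illustrate the status precedence the callback above implements, here is a self-contained sketch; kOk and the numeric error codes are stand-ins, not real SaunaFS status constants.

#include <cstdint>
#include <cstdio>
#include <functional>

// Mirror of the precedence in jobCloseWriteCallback: a failure from the close
// job overrides a previously-OK write status; an earlier write error wins
// otherwise, so the master never sees a clean write end after a failed sync.
constexpr uint8_t kOk = 0;  // stand-in for SAUNAFS_STATUS_OK

std::function<void(uint8_t)> makeCloseCallback(uint8_t untoldStatus,
                                               std::function<void(uint8_t)> endChunkLock) {
	return [untoldStatus, endChunkLock](uint8_t closeStatus) {
		if (untoldStatus == kOk && closeStatus != kOk) {
			endChunkLock(closeStatus);   // close failed: report the close error
		} else {
			endChunkLock(untoldStatus);  // otherwise report the write's own status
		}
	};
}

int main() {
	auto report = [](uint8_t s) { std::printf("endChunkLock(status=%u)\n", s); };
	makeCloseCallback(kOk, report)(5);  // metadata sync failed -> master sees 5
	makeCloseCallback(7, report)(kOk);  // write itself had failed -> master sees 7
}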

src/chunkserver/chunk_high_level_ops.h

Lines changed: 13 additions & 0 deletions
@@ -318,3 +318,16 @@ class WriteHighLevelOp : public HighLevelOp {
 	/// List of write data buffers waiting to be written to the chunk.
 	std::list<std::shared_ptr<InputBuffer>> writeDataBuffers_;
 };
+
+/// @brief Creates a callback function for closing a write operation.
+/// This is only used for write operations, in which the chunk is locked. The callback is called
+/// after the close operation is completed, to release the chunk lock. The close job syncs the
+/// metadata part of the chunk, so it is important for the master to know whether that operation
+/// also succeeded, and not only the write data jobs.
+/// @param chunkId ID of the chunk.
+/// @param chunkType Type of the chunk.
+/// @param untoldStatus Error status not yet reported to the client, to be sent to the master.
+/// @return A function to be used as a callback for closing a write operation.
+std::function<void(uint8_t status, void *packet)> jobCloseWriteCallback(uint64_t chunkId,
+                                                                        ChunkPartType chunkType,
+                                                                        uint8_t untoldStatus);

src/chunkserver/masterconn.cc

Lines changed: 3 additions & 6 deletions
@@ -38,6 +38,7 @@
 #include "chunkserver/bgjobs.h"
 #include "chunkserver/hddspacemgr.h"
 #include "chunkserver/master_connection.h"
+#include "chunkserver/network_main_thread.h"
 #include "common/event_loop.h"
 #include "common/massert.h"
 #include "common/network_address.h"
@@ -77,8 +78,6 @@ constexpr uint32_t kDefaultReplicationNumberOfWorkers = 5;
 constexpr uint32_t kMinReplicationNumberOfWorkers = 1;
 static uint32_t gReplicationNumberOfWorkers = kDefaultReplicationNumberOfWorkers;

-static std::atomic<bool> gDoTerminate = false;
-
 static void* gReconnectHook;

 // Stats
@@ -145,8 +144,6 @@ JobPool* masterconn_get_job_pool() {
 	return gJobPool.get();
 }

-void masterconn_wantexit(void) { gDoTerminate.store(true); }
-
 int masterconn_canexit(void) {
 	if (gJobPool->getJobCount() == 0 && gReplicationJobPool->getJobCount() == 0 &&
 	    gMasterConnSingleton->isOutputQueueEmpty()) {
@@ -187,7 +184,7 @@ void masterconn_desc(std::vector<pollfd> &pdesc) {
 		}
 	}

-	eptr->providePollDescriptors(pdesc, gDoTerminate.load());
+	eptr->providePollDescriptors(pdesc, doTerminate());
 }

 void masterconn_send_status() {
@@ -323,7 +320,7 @@ int masterconn_init(void) {
 	gReconnectHook =
 	    eventloop_timeregister(TIMEMODE_RUN_LATE, reconnectionDelay,
 	                           rnd_ranged<uint32_t>(reconnectionDelay), masterconn_reconnect);
-	eventloop_wantexitregister(masterconn_wantexit);
+
 	eventloop_canexitregister(masterconn_canexit);
 	eventloop_destructregister(masterconn_term);
 	eventloop_pollregister(masterconn_desc, masterconn_serve);
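
The net effect of these hunks is that the chunkserver keeps a single termination flag, owned by the network main thread, instead of a second private one in masterconn. A sketch of the pattern, where only doTerminate() matches the real code and everything else is illustrative:

#include <atomic>

namespace sketch {

// One flag, one owner: the network main thread flips it during want-exit,
// and every poll loop (masterconn_desc included) reads it via doTerminate().
std::atomic<bool> gDoTerminate{false};

bool doTerminate() { return gDoTerminate.load(); }

void wantExit() { gDoTerminate.store(true); }  // called once at shutdown start

}  // namespace sketch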

src/chunkserver/network_main_thread.cc

Lines changed: 16 additions & 0 deletions
@@ -64,6 +64,12 @@ static uint32_t gNrOfNetworkWorkers;
 static uint32_t gNrOfHddWorkersPerNetworkWorker;
 static uint32_t gBgjobsCountPerNetworkWorker;

+static std::atomic<bool> gDoTerminate = false;
+
+bool doTerminate() {
+	return gDoTerminate.load();
+}
+
 void chunkReplicatorReload() {
 	unsigned rep_total = cfg_get_minmaxvalue<unsigned>("REPLICATION_TOTAL_TIMEOUT_MS",
 	                                                   ChunkReplicator::kDefaultTotalTimeout_ms,
@@ -177,6 +183,10 @@ void mainNetworkThreadReload(void) {

 void mainNetworkThreadDesc(std::vector<pollfd> &pdesc) {
 	TRACETHIS();
+	if (doTerminate()) {
+		return;
+	}
+
 	pdesc.push_back({lsock, POLLIN, 0});
 	lsockpdescpos = pdesc.size() - 1;
 }
@@ -196,6 +206,8 @@ void mainNetworkThreadWantExit(void) {
 	for (auto& threadObject : networkThreadObjects) {
 		threadObject.askForTermination();
 	}
+
+	gDoTerminate.store(true);
 }

 int mainNetworkThreadCanExit(void) {
@@ -222,6 +234,10 @@ void mainNetworkThreadTerm(void) {

 void mainNetworkThreadServe(const std::vector<pollfd> &pdesc) {
 	TRACETHIS();
+	if (doTerminate()) {
+		return;
+	}
+
 	int newSocketFD;

 	if (lsockpdescpos >= 0 && (pdesc[lsockpdescpos].revents & POLLIN)) {

src/chunkserver/network_main_thread.h

Lines changed: 2 additions & 0 deletions
@@ -28,3 +28,5 @@ int mainNetworkThreadInitThreads(void);

 uint32_t mainNetworkThreadGetListenIp();
 uint16_t mainNetworkThreadGetListenPort();
+
+bool doTerminate();
src/chunkserver/network_worker_thread.cc

Lines changed: 12 additions & 4 deletions
@@ -85,20 +85,28 @@ void NetworkWorkerThread::operator()() {
 	static std::atomic_uint16_t threadCounter(0);
 	std::string threadName = "netWorker_" + std::to_string(threadCounter++);
 	pthread_setname_np(pthread_self(), threadName.c_str());
+	bool lastDoTerminateValue = false;

 	while (!canTerminate_.load()) {
+		if (doTerminate.load() && !lastDoTerminateValue) {
+			// We've just switched to terminating mode, start wrapping up.
+			lastDoTerminateValue = true;
+			std::lock_guard lock(csservheadLock);
+			for (auto &entry : csservEntries) { entry.closeJobs(); }
+		}
+
 		preparePollFds(doTerminate.load());
 		int fdWithEvents = poll(pdesc.data(), pdesc.size(), gPollTimeout);

 		if (fdWithEvents < 0) {
 			if (errno == EAGAIN) {
-				safs::log_warn("{}: poll returned EAGAIN", __func__);
+				safs::log_warn("{} loop: poll returned EAGAIN", threadName);
 				usleep(100000);
 				continue;
 			}

 			if (errno != EINTR) {
-				safs::log_warn("{}: poll error: {}", __func__, strerr(errno));
+				safs::log_warn("{} loop: poll error: {}", threadName, strerr(errno));
 				break;
 			}
 		} else {
@@ -116,7 +124,8 @@
 bool NetworkWorkerThread::updateAndCheckTerminationStatus() {
 	std::lock_guard lock(csservheadLock);
 	bool canTerminate =
-	    doTerminate.load() && (csservEntries.empty() ||
+	    doTerminate.load() && ((csservEntries.empty() &&
+	                            (bgJobPool_.get() == nullptr || bgJobPool_->getJobCount() == 0)) ||
 	                           terminationTimer_.elapsed_ms() > kNWForcefulTerminationTimeout_ms);
 	canTerminate_.store(canTerminate);
 	return canTerminate;
@@ -334,7 +343,6 @@ void NetworkWorkerThread::askForTermination() {
 	doTerminate = true;
 	std::unique_lock lock(csservheadLock);
 	terminationTimer_.reset();
-	for (auto &entry : csservEntries) { entry.closeJobs(); }
 }

 void NetworkWorkerThread::addConnection(int newSocketFD) {
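
A self-contained sketch of the edge detection added to the worker loop above: the wrap-up (closing per-connection jobs) now runs from the loop itself, exactly once, on the first iteration that observes the terminate flag, instead of from askForTermination(). Names other than doTerminate are made up.

#include <atomic>

std::atomic<bool> doTerminate{false};  // set by askForTermination()

// wrapUp runs once on the rising edge of doTerminate; iterate is one pass of
// the poll/serve loop. The loop keeps spinning so background jobs can drain
// before updateAndCheckTerminationStatus() finally sets canTerminate.
template <typename WrapUp, typename Iterate>
void workerLoop(std::atomic<bool> &canTerminate, WrapUp wrapUp, Iterate iterate) {
	bool lastDoTerminateValue = false;
	while (!canTerminate.load()) {
		if (doTerminate.load() && !lastDoTerminateValue) {
			lastDoTerminateValue = true;  // react to the edge only once
			wrapUp();                     // e.g. entry.closeJobs() on every connection
		}
		iterate();  // preparePollFds(), poll(), serve — as in the real loop
	}
}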
Lines changed: 100 additions & 0 deletions
@@ -0,0 +1,100 @@
+timeout_set 25 minutes
+
+CHUNKSERVERS=8 \
+	USE_RAMDISK=YES \
+	MOUNT_EXTRA_CONFIG="sfscachemode=NEVER,sfswriteworkers=30,sfsioretries=13" \
+	CHUNKSERVER_EXTRA_CONFIG="NR_OF_NETWORK_WORKERS = 1|NR_OF_HDD_WORKERS_PER_NETWORK_WORKER = 1" \
+	MASTER_EXTRA_CONFIG="CHUNKS_WRITE_REP_LIMIT = 10|CHUNKS_READ_REP_LIMIT = 10|`
+		`CHUNKS_LOOP_MIN_TIME = 1|CHUNKS_LOOP_MAX_CPU = 90|CHUNKS_LOOP_PERIOD = 50|`
+		`OPERATIONS_DELAY_INIT = 0" \
+	MASTER_CUSTOM_GOALS="8 ec62: \$ec(6,2)" \
+	setup_local_empty_saunafs info
+
+cd ${info[mount0]}
+
+number_of_files=1000
+
+for i in $(seq 1 ${number_of_files}); do
+	dd if=/dev/random of=${TEMP_DIR}/file_$i bs=64K count=6 conv=fsync &> /dev/null
+done
+
+mkdir dir
+saunafs setgoal ec62 dir
+saunafs settrashtime 0 dir
+
+for test_loop in {1..10}; do
+	for i in $(seq 1 ${number_of_files}); do
+		dd if="${TEMP_DIR}/file_${i}" of="dir/file_${i}" bs=384K count=1 status=none &> /dev/null &
+	done
+
+	# Write some files with all CSs available, then kill some CSs and write more files, to
+	# increase the probability of having chunks with parts being written at the time when the
+	# chunkservers are killed.
+	sleep 0.3
+
+	saunafs_chunkserver_daemon 0 kill
+	saunafs_chunkserver_daemon 1 kill
+	echo "Chunkservers killed, now starting them again"
+
+	wait
+	echo "All files written"
+
+	# Wait enough time for the chunks to have increased their version because of the lost copies,
+	# which will prevent the chunks from having copies with wrong data.
+	sleep 10
+	echo "Starting chunkservers"
+
+	saunafs_chunkserver_daemon 0 start
+	saunafs_chunkserver_daemon 1 start
+	saunafs_wait_for_all_ready_chunkservers
+
+	# Wait for the chunkservers to fix the chunks, which may take some time because of the number
+	# of missing parts and the fact that we have only 1 worker thread per chunkserver.
+	while true; do
+		current_chunk_ok=0
+		for i in $(seq 1 ${number_of_files}); do
+			if saunafs fileinfo dir/file_$i | tail -n1 | grep -E \
+				'^[[:space:]]*copy 8:' > /dev/null; then
+				current_chunk_ok=$((current_chunk_ok + 1))
+			fi
+		done
+
+		echo "Checked all files, ${current_chunk_ok} copies are OK"
+		if (( current_chunk_ok == number_of_files )); then
+			break
+		fi
+
+		sleep 5
+	done
+
+	saunafs_chunkserver_daemon 6 stop
+	saunafs_chunkserver_daemon 7 stop
+
+	# Test if we can read files with only 6 copies, especially when reading from chunkservers 0
+	# and 1, which were killed and restarted.
+	for i in $(seq 1 ${number_of_files}); do
+		assert_success dd if="dir/file_$i" of=/dev/null bs=384K count=1 status=none
+		cmp ${TEMP_DIR}/file_$i dir/file_$i || echo "File $i is different after reading back"
+	done
+
+	saunafs_chunkserver_daemon 6 start
+	saunafs_chunkserver_daemon 7 start
+	saunafs_wait_for_all_ready_chunkservers
+
+	rm dir/file_*
+
+	# Ensure all CSs report zero chunks before starting the next loop, to reuse the space
+	while true; do
+		cs_with_chunks=$(saunafs-admin list-chunkservers localhost "${info[matocl]}" | \
+			awk '/chunks:/ && $2!=0 {print $0}' | wc -l)
+
+		echo "${cs_with_chunks} CSs report non-zero chunk counts"
+		if (( cs_with_chunks == 0 )); then
+			break
+		fi
+
+		sleep 1
+	done
+
+	echo "Test loop ${test_loop} completed"
+done
