feat: Add USE_CHUNKSERVER_SIDE_CHUNK_LOCK option

dmga44 · dmga44 · commit 2651c752c5a0 · 2026-02-13T11:39:57.000+01:00
The previously implemented chunkserver-side chunk lock behavior can now
be enabled or disabled via the USE_CHUNKSERVER_SIDE_CHUNK_LOCK option.
The option is reloadable: its value is evaluated at the moment each
lock-capable packet type is sent to the chunkservers.

The testing framework was modified to enable this option in all tests,
while the master itself keeps it disabled by default.

Signed-off-by: Dave <dave@leil.io>
diff --git a/doc/sfsmaster.cfg.5.adoc b/doc/sfsmaster.cfg.5.adoc
@@ -290,6 +290,10 @@ for TLS connections (there is no default value).
 Path to the trusted CA certificate which is used to authenticate
 the TLS connection (there is no default value).
 
+*USE_CHUNKSERVER_SIDE_CHUNK_LOCK (EXPERIMENTAL)*:: When set to 1, enables sending
+chunk part lock messages to the chunkservers. This can be useful to track down which
+chunk parts are currently being written. Reloadable (default: 0).
+
 == NOTES
 
 Chunks in master are tested in loop. Speed (or frequency) is regulated by two
diff --git a/src/admin/dump_config_command.cc b/src/admin/dump_config_command.cc
@@ -128,6 +128,7 @@ const static std::unordered_map<std::string, std::string> defaultOptionsMaster =
     {"SNAPSHOT_INITIAL_BATCH_SIZE_LIMIT", "10000"},
     {"FILE_TEST_LOOP_MIN_TIME", "3600"},
     {"PRIORITIZE_DATA_PARTS", "1"},
+    {"USE_CHUNKSERVER_SIDE_CHUNK_LOCK", "0"},
     {"CREATE_EMPTY_FOLDERS_WHEN_SPACE_DEPLETED", "1"},
 };
 
diff --git a/src/data/sfsmaster.cfg.in b/src/data/sfsmaster.cfg.in
@@ -355,3 +355,9 @@
 ## Example: /etc/saunafs/ssl/ca.crt
 ## (There is no default)
 # TLS_CA_CERT_FILE =
+
+## (EXPERIMENTAL) When set to 1, enables sending chunk part lock messages to
+## the chunkservers. This can be useful to track down which chunk parts are
+## currently being written. Reloadable.
+## (Default: 0)
+# USE_CHUNKSERVER_SIDE_CHUNK_LOCK = 0
diff --git a/src/master/chunks.cc b/src/master/chunks.cc
@@ -109,6 +109,7 @@ static uint64_t gEndangeredChunksMaxCapacity;
 static uint64_t gDisconnectedCounter = 0;
 inline LinearAssignmentCache gLinearAssignmentCache;
 inline bool gUseLinearAssignmentOptimizer;
+static bool gUseChunkserverSideChunkLock;
 bool gAvoidSameIpChunkservers = false;
 
 struct ChunkPart {
@@ -1180,10 +1181,11 @@ int chunk_get_partstomodify(uint64_t chunkid, int &recover, int &remove) {
 
 // Chunk operations
 
-/// @brief Performs the chunk creation operation, which consists of creating a new chunk with 
+/// @brief Performs the chunk creation operation, which consists of creating a new chunk with
 /// version 1, associating it with the given goal and sending create chunk messages to the provided
 /// chunkservers. The parts in the chunk are marked as being written (it is expected that client
-/// starts writing) if the corresponding chunkserver supports locking and the create chunk message was sent with locking.
+/// starts writing) if the corresponding chunkserver supports locking and the create chunk message
+/// was sent with locking.
 /// @param createdChunk A reference to a pointer where the created chunk will be stored.
 /// @param goal The goal that will be associated with the created chunk.
 /// @param serversWithChunkTypes The list of chunkservers to create the chunk on.
@@ -1201,7 +1203,8 @@ void chunk_create_operation(
 		                                        server_with_type.second));
 		bool sentChunkLock = false;
 		matocsserv_send_createchunk(server_with_type.first, createdChunk->chunkid,
-		                            server_with_type.second, createdChunk->version, sentChunkLock);
+		                            server_with_type.second, createdChunk->version,
+		                            gUseChunkserverSideChunkLock, sentChunkLock);
 
 		if (sentChunkLock) { createdChunk->parts.back().mark_being_written(); }
 		// If the chunk lock was not sent, it means that the chunkserver does not support locking,
@@ -1225,14 +1228,14 @@ void chunk_increase_version_operation(Chunk *chunk, bool needsLocking) {
 			part.version = chunk->version + 1;
 			// If part is already being written then we don't need to ask the chunkserver to lock
 			// it again, and we can just increase the version.
-			bool partNeedsLocking = !part.is_being_written() && needsLocking;
+			bool partNeedsLocking =
+			    !part.is_being_written() && needsLocking && gUseChunkserverSideChunkLock;
 			bool sentChunkLock = false;
 			matocsserv_send_setchunkversion(part.server(), chunk->chunkid, chunk->version + 1,
-			                                chunk->version, part.type, partNeedsLocking, sentChunkLock);
+			                                chunk->version, part.type, partNeedsLocking,
+			                                sentChunkLock);
 
-			if (partNeedsLocking && sentChunkLock) {
-				part.mark_being_written();
-			}
+			if (partNeedsLocking && sentChunkLock) { part.mark_being_written(); }
 		}
 	}
 
@@ -1248,18 +1251,20 @@ void chunk_increase_version_operation(Chunk *chunk, bool needsLocking) {
 void chunk_lock_operation(Chunk *chunk) {
 	bool mustWaitForReply = false;
 	assert(chunk->isWritable());
-	for (auto &part : chunk->parts) {
-		if (part.is_valid()) {
-			if (part.is_busy()) { continue; }
-			// No busy parts from now on
-
-			bool sentChunkLock = false;
-			matocsserv_send_chunklock(part.server(), chunk->chunkid, part.type,
-			                          !part.is_being_written(), sentChunkLock);
-			if (sentChunkLock) {
-				part.mark_being_written();
-				mustWaitForReply = true;
-				part.mark_busy();
+	if (gUseChunkserverSideChunkLock) {
+		for (auto &part : chunk->parts) {
+			if (part.is_valid()) {
+				if (part.is_busy()) { continue; }
+				// No busy parts from now on
+
+				bool sentChunkLock = false;
+				matocsserv_send_chunklock(part.server(), chunk->chunkid, part.type,
+				                          !part.is_being_written(), sentChunkLock);
+				if (sentChunkLock) {
+					part.mark_being_written();
+					mustWaitForReply = true;
+					part.mark_busy();
+				}
 			}
 		}
 	}
@@ -1298,7 +1303,8 @@ void chunk_duplicate_operation(Chunk *originalChunk, uint8_t goal, Chunk *&newCh
 			bool sentChunkLock = false;
 			matocsserv_send_duplicatechunk(oldPart.server(), newChunk->chunkid, newChunk->version,
 			                               oldPart.type, originalChunk->chunkid,
-			                               originalChunk->version, sentChunkLock);
+			                               originalChunk->version, gUseChunkserverSideChunkLock,
+			                               sentChunkLock);
 
 			if (sentChunkLock) { newChunk->parts.back().mark_being_written(); }
 		}
@@ -3173,6 +3179,7 @@ void chunk_reload(void) {
 	gAvoidSameIpChunkservers = cfg_getuint32("AVOID_SAME_IP_CHUNKSERVERS", 0);
 	gRedundancyLevel = cfg_getuint32("REDUNDANCY_LEVEL", 0);
 	gUseLinearAssignmentOptimizer = cfg_getuint32("USE_LINEAR_ASSIGNMENT_OPTIMIZER", 1);
+	gUseChunkserverSideChunkLock = cfg_getuint32("USE_CHUNKSERVER_SIDE_CHUNK_LOCK", 0);
 
 	uint32_t disableChunksDel = cfg_getuint32("DISABLE_CHUNKS_DEL", 0);
 	if (disableChunksDel) {
@@ -3268,6 +3275,7 @@ int chunk_strinit(void) {
 	gAvoidSameIpChunkservers = cfg_getuint32("AVOID_SAME_IP_CHUNKSERVERS", 0);
 	gRedundancyLevel = cfg_getuint32("REDUNDANCY_LEVEL", 0);
 	gUseLinearAssignmentOptimizer = cfg_getuint32("USE_LINEAR_ASSIGNMENT_OPTIMIZER", 1);
+	gUseChunkserverSideChunkLock = cfg_getuint32("USE_CHUNKSERVER_SIDE_CHUNK_LOCK", 0);
 
 	if (disableChunksDel) {
 		MaxDelHardLimit = MaxDelSoftLimit = 0;
diff --git a/src/master/matocsserv.cc b/src/master/matocsserv.cc
@@ -604,12 +604,12 @@ void matocsserv_got_chunk_checksum(matocsserventry *eptr, const uint8_t *data, u
 }
 
 int matocsserv_send_createchunk(matocsserventry *eptr, uint64_t chunkId, ChunkPartType chunkType,
-                                uint32_t chunkVersion, bool &sentChunkLock) {
+                                uint32_t chunkVersion, bool needsLock, bool &sentChunkLock) {
 	sentChunkLock = false;
 	if (eptr->mode != ChunkserverConnectionMode::KILL) {
 		eptr->outputPackets.push_back(OutputPacket());
 		sassert(eptr->version >= kFirstECVersion);
-		if (eptr->version >= kFirstVersionWithChunkserverSideChunkLock) {
+		if (eptr->version >= kFirstVersionWithChunkserverSideChunkLock && needsLock) {
 			// For newer chunkservers, create and lock part
 			matocs::createAndLockChunk::serialize(eptr->outputPackets.back().packet, chunkId,
 			                                      chunkType, chunkVersion);
@@ -730,11 +730,11 @@ void matocsserv_got_replicatechunk_status(matocsserventry *eptr, const std::vect
 }
 
 int matocsserv_send_chunklock(matocsserventry *eptr, uint64_t chunkId, ChunkPartType chunkType,
-                              bool needLock, bool &sentChunkLock) {
+                              bool needsLock, bool &sentChunkLock) {
 	sentChunkLock = false;
 	if (eptr->mode != ChunkserverConnectionMode::KILL) {
 		sassert(eptr->version >= kFirstECVersion);
-		if (eptr->version >= kFirstVersionWithChunkserverSideChunkLock && needLock) {
+		if (eptr->version >= kFirstVersionWithChunkserverSideChunkLock && needsLock) {
 			eptr->outputPackets.emplace_back();
 			matocs::chunkLock::serialize(eptr->outputPackets.back().packet, chunkId, chunkType);
 			sentChunkLock = true;
@@ -794,12 +794,12 @@ int matocsserv_send_chunkunlock(matocsserventry *eptr, uint64_t chunkId, ChunkPa
 }
 
 int matocsserv_send_setchunkversion(matocsserventry *eptr, uint64_t chunkId, uint32_t newVersion,
-		uint32_t chunkVersion, ChunkPartType chunkType, bool needChunkLock, bool &sentChunkLock) {
+		uint32_t chunkVersion, ChunkPartType chunkType, bool needsLock, bool &sentChunkLock) {
 	sentChunkLock = false;
 	if (eptr->mode != ChunkserverConnectionMode::KILL) {
 		eptr->outputPackets.emplace_back();
 		sassert(eptr->version >= kFirstECVersion);
-		if (eptr->version >= kFirstVersionWithChunkserverSideChunkLock && needChunkLock) {
+		if (eptr->version >= kFirstVersionWithChunkserverSideChunkLock && needsLock) {
 			// For newer chunkservers, set version with chunk lock
 			matocs::setVersionAndLock::serialize(eptr->outputPackets.back().packet, chunkId,
 			                                     chunkType, chunkVersion, newVersion);
@@ -835,13 +835,14 @@ void matocsserv_got_setchunkversion_status(matocsserventry *eptr,
 
 int matocsserv_send_duplicatechunk(matocsserventry *eptr, uint64_t newChunkId,
                                    uint32_t newChunkVersion, ChunkPartType chunkType,
-                                   uint64_t chunkId, uint32_t chunkVersion, bool &sentChunkLock) {
+                                   uint64_t chunkId, uint32_t chunkVersion, bool needsLock,
+                                   bool &sentChunkLock) {
 	sentChunkLock = false;
 	if (eptr->mode == ChunkserverConnectionMode::KILL) { return 0; }
 
 	OutputPacket outPacket;
 	sassert(eptr->version >= kFirstECVersion);
-	if (eptr->version >= kFirstVersionWithChunkserverSideChunkLock) {
+	if (eptr->version >= kFirstVersionWithChunkserverSideChunkLock && needsLock) {
 		// For newer chunkservers, duplicate with chunk lock
 		matocs::duplicateAndLockChunk::serialize(outPacket.packet, newChunkId, newChunkVersion,
 		                                         chunkType, chunkId, chunkVersion);
diff --git a/src/master/matocsserv.h b/src/master/matocsserv.h
@@ -107,16 +107,17 @@ int matocsserv_send_sau_replicatechunk(matocsserventry* eptr,
 int matocsserv_send_deletechunk(matocsserventry* eptr,
 		uint64_t chunkId, uint32_t chunkVersion, ChunkPartType chunkType);
 int matocsserv_send_createchunk(matocsserventry *eptr, uint64_t chunkid, ChunkPartType chunkType,
-                                uint32_t version, bool &sentChunkLock);
+                                uint32_t version, bool needsLock, bool &sentChunkLock);
 int matocsserv_send_chunklock(matocsserventry *eptr, uint64_t chunkId, ChunkPartType chunkType,
-                              bool needLock, bool &sentChunkLock);
+                              bool needsLock, bool &sentChunkLock);
 int matocsserv_send_chunkunlock(matocsserventry *eptr, uint64_t chunkId, ChunkPartType chunkType);
 int matocsserv_send_setchunkversion(matocsserventry *eptr, uint64_t chunkId, uint32_t newVersion,
-                                    uint32_t chunkVersion, ChunkPartType chunkType,
-                                    bool needsLocking, bool &sentChunkLock);
+                                    uint32_t chunkVersion, ChunkPartType chunkType, bool needsLock,
+                                    bool &sentChunkLock);
 int matocsserv_send_duplicatechunk(matocsserventry *eptr, uint64_t newChunkId,
                                    uint32_t newChunkVersion, ChunkPartType chunkType,
-                                   uint64_t chunkId, uint32_t chunkVersion, bool &sentChunkLock);
+                                   uint64_t chunkId, uint32_t chunkVersion, bool needsLock,
+                                   bool &sentChunkLock);
 void matocsserv_send_truncatechunk(matocsserventry* eptr,
 		uint64_t chunkid, ChunkPartType chunkType, uint32_t length,
 		uint32_t version, uint32_t oldversion);
diff --git a/tests/test_suites/ShortSystemTests/test_concurrent_random_writes_on_chunk.sh b/tests/test_suites/ShortSystemTests/test_concurrent_random_writes_on_chunk.sh
@@ -1,4 +1,4 @@
-timeout_set 30 seconds
+timeout_set 45 seconds
 
 CHUNKSERVERS=8 \
 	USE_RAMDISK=YES \
@@ -12,22 +12,51 @@ cd "${info[mount0]}"
 mkdir dir
 saunafs setgoal ec62 dir
 
-times_to_repeat=512
+times_to_repeat=1024
 FILE_SIZE=$(( times_to_repeat * 4 * 1024 )) file-generate ${TEMP_DIR}/original_file
 
-# Write 4KB at a time, 1KB in each of 4 mounts, and repeat this 512 times, so that the file is
+# Write 4KB at a time, 1KB in each of 4 mounts, and repeat this 1024 times, so that the file is
 # written in random order and with many concurrent writes.
 
+master_reloading_loop_file=${TEMP_DIR}/master_reloading_loop_file
+# Background worker meant to be started with `&`.
+# Creates ${master_reloading_loop_file} as a "keep running" marker and, while
+# that marker exists, repeatedly:
+#   1. sleeps 0.15 s,
+#   2. reads the last USE_CHUNKSERVER_SIDE_CHUNK_LOCK value from the master
+#      config,
+#   3. rewrites the option with the opposite value (0 <-> 1),
+#   4. reloads the master so the new value takes effect.
+# The loop ends once the marker file has been removed.
+switch_use_chunkserver_side_chunk_lock_thread() {
+	local current next
+	touch "${master_reloading_loop_file}"
+	# Only one loop encloses the exit point, so a plain loop condition is enough
+	# (the original used `break 2` here).
+	while [ -e "${master_reloading_loop_file}" ]; do
+		sleep 0.15
+		# The last matching line holds the current value; field 3 of
+		# "NAME = VALUE" is the value itself.
+		current=$(grep USE_CHUNKSERVER_SIDE_CHUNK_LOCK "${info[master0_master_cfg]}" | tail -n 1 | awk '{print $3}')
+		next=$(( 1 - current ))
+		echo "Switching USE_CHUNKSERVER_SIDE_CHUNK_LOCK to ${next}"
+		sed -i "s/USE_CHUNKSERVER_SIDE_CHUNK_LOCK = ./USE_CHUNKSERVER_SIDE_CHUNK_LOCK = ${next}/g" "${info[master0_master_cfg]}"
+		saunafs_master_daemon reload
+	done
+	# Report under this function's own name; the previous message named an
+	# unrelated "chunkservers_restarting_loop".
+	echo "switch_use_chunkserver_side_chunk_lock_thread stopped"
+}
+
+# Stops the background option toggler: removes the marker file its loop polls
+# and then waits for the worker to exit, so no config rewrite or master reload
+# is still in flight when the caller continues.
+stop_switch_use_chunkserver_side_chunk_lock_thread() {
+	rm -f "${master_reloading_loop_file}"
+	# The PID is recorded by the script right after the worker is launched;
+	# skip the join if the worker was never started.
+	if [ -n "${switch_use_chunkserver_side_chunk_lock_thread_pid:-}" ]; then
+		# The worker's exit status was never inspected before; keep it from
+		# affecting the test result.
+		wait "${switch_use_chunkserver_side_chunk_lock_thread_pid}" || true
+	fi
+}
+
+switch_use_chunkserver_side_chunk_lock_thread &
+switch_use_chunkserver_side_chunk_lock_thread_pid=$!
+
 for i in $(seq 0 $((times_to_repeat - 1))); do
 	shuffled_seq=($(shuf -e $(seq 0 3)))
+	pids=()
 	for mount in $(seq 0 3); do
 		dd if="${TEMP_DIR}/original_file" of="${info[mount${mount}]}/dir/file" bs=1K \
 			skip=$(( i * 4 + ${shuffled_seq[$mount]} )) \
 			seek=$(( i * 4 + ${shuffled_seq[$mount]} )) \
 			count=1 conv=notrunc 2>/dev/null &
+		pids+=("$!")
 	done
-	wait
+	if [ ${#pids[@]} -gt 0 ]; then
+		wait "${pids[@]}"
+	fi
 	echo "Done writing $i-th block of 4KB"
 done
 
+stop_switch_use_chunkserver_side_chunk_lock_thread
+
 MESSAGE="Validating file after concurrent random writes" expect_success file-validate dir/file
diff --git a/tests/tools/saunafs.sh b/tests/tools/saunafs.sh
@@ -372,6 +372,7 @@ create_sfsmaster_master_cfg_() {
 	echo "MATOTS_LISTEN_PORT = ${saunafs_info_[matots]}"
 	echo "METADATA_CHECKSUM_INTERVAL = 1"
 	echo "ADMIN_PASSWORD = ${saunafs_info_[admin_password]}"
+	echo "USE_CHUNKSERVER_SIDE_CHUNK_LOCK = 1"
 	create_magic_debug_log_entry_ "master_${masterserver_id}"
 	echo "${MASTER_EXTRA_CONFIG-}" | tr '|' '\n'
 	echo "${!this_module_cfg_variable-}" | tr '|' '\n'