Skip to content

Commit 902bf98

Browse files
authored
Cherry-pick NVIDIA#4174 and NVIDIA#4186 from 2.7 to main (NVIDIA#4193)
## Summary

- Cherry-pick of NVIDIA#4174: reduce lock scope in `Cacheable._get_item` — `produce_item` now runs outside the lock so concurrent receivers aren't blocked.
- Cherry-pick of NVIDIA#4186: avoid a self-message deadlock when the swarm trainer submits a learn result to itself — local submission bypasses `broadcast_and_wait`; adds unit-test coverage.
1 parent ecf608c commit 902bf98

File tree

3 files changed

+130
-23
lines changed

3 files changed

+130
-23
lines changed

nvflare/app_common/ccwf/swarm_client_ctl.py

Lines changed: 22 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -845,23 +845,31 @@ def do_learn_task(self, name: str, task_data: Shareable, fl_ctx: FLContext, abor
845845
time.sleep(self.request_to_submit_result_interval)
846846

847847
# send the result to the aggr
848-
self.log_info(fl_ctx, f"sending training result to aggregation client {aggr}")
848+
if aggr == self.me:
849+
# Avoid synchronous self-message path through CoreCell._send_direct_message.
850+
self.log_info(fl_ctx, "submitting training result locally (aggregation client is self)")
851+
engine = fl_ctx.get_engine()
852+
local_fl_ctx = fl_ctx.clone()
853+
local_fl_ctx.set_peer_context(engine.new_context())
854+
reply = self._process_learn_result(result, local_fl_ctx, abort_signal)
855+
else:
856+
self.log_info(fl_ctx, f"sending training result to aggregation client {aggr}")
849857

850-
task = Task(
851-
name=self.report_learn_result_task_name,
852-
data=result,
853-
timeout=int(self.learn_task_ack_timeout),
854-
secure=self.is_task_secure(fl_ctx),
855-
)
858+
task = Task(
859+
name=self.report_learn_result_task_name,
860+
data=result,
861+
timeout=int(self.learn_task_ack_timeout),
862+
secure=self.is_task_secure(fl_ctx),
863+
)
856864

857-
resp = self.broadcast_and_wait(
858-
task=task,
859-
targets=[aggr],
860-
min_responses=1,
861-
fl_ctx=fl_ctx,
862-
)
865+
resp = self.broadcast_and_wait(
866+
task=task,
867+
targets=[aggr],
868+
min_responses=1,
869+
fl_ctx=fl_ctx,
870+
)
863871

864-
reply = resp.get(aggr)
872+
reply = resp.get(aggr)
865873
if not reply:
866874
self.log_error(fl_ctx, f"failed to receive reply from aggregation client: {aggr}")
867875
self.update_status(action="receive_learn_result_reply", error=ReturnCode.EXECUTION_EXCEPTION)

nvflare/fuel/f3/streaming/cacheable.py

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -90,20 +90,35 @@ def clear_cache(self):
9090
def _get_item(self, index: int, requester: str) -> bytes:
9191
with self.lock:
9292
if not self.cache:
93-
# the cache has been cleared
93+
cache_available = False
9494
data = None
9595
else:
96+
cache_available = True
9697
data, _ = self.cache[index]
9798

98-
if data is None:
99-
data = self.produce_item(index)
100-
if self.cache:
101-
self.cache[index] = (data, 0)
102-
self.logger.debug(f"created and cached item {index} for {requester}: {len(data)} bytes")
103-
else:
104-
self.logger.debug(f"got item {index} from cache for {requester}")
99+
if not cache_available:
100+
return self.produce_item(index)
101+
102+
if data is not None:
103+
self.logger.debug(f"got item {index} from cache for {requester}")
105104
return data
106105

106+
# Produce outside the lock so concurrent receivers aren't blocked.
107+
# If two receivers produce the same item simultaneously, the first
108+
# to re-acquire the lock stores its result; the second uses it.
109+
data = self.produce_item(index)
110+
111+
with self.lock:
112+
if self.cache:
113+
existing, count = self.cache[index]
114+
if existing is None:
115+
self.cache[index] = (data, count)
116+
self.logger.debug(f"created and cached item {index} for {requester}: {len(data)} bytes")
117+
else:
118+
data = existing
119+
self.logger.debug(f"got item {index} from cache for {requester} (produced concurrently)")
120+
return data
121+
107122
def _adjust_cache(self, start: int, count: int):
108123
with self.lock:
109124
if not self.cache:

tests/unit_test/app_common/ccwf/test_swarm_self_message_deadlock.py

Lines changed: 85 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,17 @@
3434
import threading
3535
import time
3636
import unittest
37-
37+
from types import SimpleNamespace
38+
from unittest import mock
39+
40+
from nvflare.apis.fl_constant import ReturnCode as FLReturnCode
41+
from nvflare.apis.fl_context import FLContextManager
42+
from nvflare.apis.shareable import Shareable, make_reply
43+
from nvflare.apis.signal import Signal
44+
from nvflare.app_common.abstract.learnable import Learnable
45+
from nvflare.app_common.app_constant import AppConstants
46+
from nvflare.app_common.ccwf.common import Constant
47+
from nvflare.app_common.ccwf.swarm_client_ctl import SwarmClientController
3848
from nvflare.fuel.f3.cellnet.core_cell import CoreCell, Message, MessageHeaderKey, TargetMessage
3949
from nvflare.fuel.f3.cellnet.defs import ReturnCode
4050
from nvflare.fuel.utils.network_utils import get_open_ports
@@ -415,5 +425,79 @@ def blocking_handler(message: Message):
415425
self.assertTrue(deadlock_detected.is_set(), "Deadlock should be detected - tensor wait timed out")
416426

417427

428+
class TestSwarmResultSubmissionFix(unittest.TestCase):
429+
def test_local_submit_when_aggregator_is_self(self):
430+
class _DummyGatherer:
431+
def __init__(self, **kwargs):
432+
self.for_round = kwargs.get("for_round", 0)
433+
434+
class _DummyEngine:
435+
def __init__(self):
436+
self.submit_req_calls = 0
437+
438+
def send_aux_request(self, **kwargs):
439+
self.submit_req_calls += 1
440+
return {"site-1": make_reply(FLReturnCode.OK)}
441+
442+
def new_context(self):
443+
return FLContextManager(engine=self, identity_name="site-1", job_id="job").new_context()
444+
445+
engine = _DummyEngine()
446+
fl_ctx = FLContextManager(engine=engine, identity_name="site-1", job_id="job").new_context()
447+
abort_signal = Signal()
448+
449+
task_data = Shareable()
450+
task_data.set_header(AppConstants.CURRENT_ROUND, 1)
451+
task_data.set_header(Constant.AGGREGATOR, "site-1")
452+
453+
learn_result = make_reply(FLReturnCode.OK)
454+
455+
ctl = object.__new__(SwarmClientController)
456+
ctl.me = "site-1"
457+
ctl.is_trainer = True
458+
ctl.gatherer = None
459+
ctl.gatherer_waiter = threading.Event()
460+
ctl.metric_comparator = object()
461+
ctl.trainers = ["site-1"]
462+
ctl.learn_task_timeout = 10
463+
ctl.min_responses_required = 1
464+
ctl.wait_time_after_min_resps_received = 0
465+
ctl.aggregator = object()
466+
ctl.max_concurrent_submissions = 1
467+
ctl.request_to_submit_result_max_wait = 10
468+
ctl.request_to_submit_result_msg_timeout = 1
469+
ctl.request_to_submit_result_interval = 0
470+
ctl.request_to_submit_learn_result_task_name = "request_submit"
471+
ctl.report_learn_result_task_name = "report_result"
472+
ctl.learn_task_ack_timeout = 5
473+
ctl.shareable_generator = SimpleNamespace(shareable_to_learnable=lambda _task_data, _ctx: Learnable())
474+
ctl.get_config_prop = lambda key, default=None: ["site-1"] if key == Constant.CLIENTS else default
475+
ctl.execute_learn_task = lambda _task_data, _ctx, _abort_signal: learn_result
476+
ctl.is_task_secure = lambda _ctx: False
477+
ctl.update_status = lambda **kwargs: None
478+
ctl.fire_event = lambda *_args, **_kwargs: None
479+
ctl.log_info = lambda *_args, **_kwargs: None
480+
ctl.log_debug = lambda *_args, **_kwargs: None
481+
ctl.log_warning = lambda *_args, **_kwargs: None
482+
ctl.log_error = lambda *_args, **_kwargs: None
483+
ctl.broadcast_and_wait = mock.Mock(
484+
side_effect=AssertionError("broadcast_and_wait must not be called for local result submission")
485+
)
486+
ctl._process_learn_result = mock.Mock(return_value=make_reply(FLReturnCode.OK))
487+
488+
with mock.patch("nvflare.app_common.ccwf.swarm_client_ctl.Gatherer", _DummyGatherer):
489+
ctl.do_learn_task("train", task_data, fl_ctx, abort_signal)
490+
491+
ctl.broadcast_and_wait.assert_not_called()
492+
ctl._process_learn_result.assert_called_once()
493+
self.assertEqual(engine.submit_req_calls, 1, "submission permission request should still be sent once")
494+
495+
called_result, called_fl_ctx, called_abort_signal = ctl._process_learn_result.call_args[0]
496+
self.assertIs(called_result, learn_result)
497+
self.assertIs(called_abort_signal, abort_signal)
498+
self.assertIsNot(called_fl_ctx, fl_ctx)
499+
self.assertEqual(called_fl_ctx.get_peer_context().get_identity_name(), "site-1")
500+
501+
418502
if __name__ == "__main__":
419503
unittest.main()

0 commit comments

Comments (0)