 import sortedcontainers
 from tlz import (
     compose,
-    concat,
     first,
     groupby,
     merge,
@@ -5392,7 +5391,7 @@ async def scatter(
         return keys

     async def gather(self, comm=None, keys=None, serializers=None):
-        """Collect data in from workers"""
+        """Collect data from workers to the scheduler"""
         parent: SchedulerState = cast(SchedulerState, self)
         ws: WorkerState
         keys = list(keys)
@@ -5598,31 +5597,108 @@ async def proxy(self, comm=None, msg=None, worker=None, serializers=None):
         )
         return d[worker]

-    async def _delete_worker_data(self, worker_address, keys):
+    async def _gather_on_worker(
+        self, worker_address: str, who_has: "dict[Hashable, list[str]]"
+    ) -> set:
+        """Peer-to-peer copy of keys from multiple workers to a single worker
+
+        Parameters
+        ----------
+        worker_address: str
+            Recipient worker address to copy keys to
+        who_has: dict[Hashable, list[str]]
+            {key: [sender address, sender address, ...], key: ...}
+
+        Returns
+        -------
+        returns:
+            set of keys that failed to be copied
+        """
+        try:
+            result = await retry_operation(
+                self.rpc(addr=worker_address).gather, who_has=who_has
+            )
+        except OSError as e:
+            # This can happen e.g. if the worker is going through controlled shutdown;
+            # it doesn't necessarily mean that it went unexpectedly missing
+            logger.warning(
+                f"Communication with worker {worker_address} failed during "
+                f"replication: {e.__class__.__name__}: {e}"
+            )
+            return set(who_has)
+
+        parent: SchedulerState = cast(SchedulerState, self)
+        ws: WorkerState = parent._workers_dv.get(worker_address)
+
+        if ws is None:
+            logger.warning(f"Worker {worker_address} lost during replication")
+            return set(who_has)
+        elif result["status"] == "OK":
+            keys_failed = set()
+            keys_ok = who_has.keys()
+        elif result["status"] == "partial-fail":
+            keys_failed = set(result["keys"])
+            keys_ok = who_has.keys() - keys_failed
+            logger.warning(
+                f"Worker {worker_address} failed to acquire keys: {result['keys']}"
+            )
+        else:  # pragma: nocover
+            raise ValueError(f"Unexpected message from {worker_address}: {result}")
+
+        for key in keys_ok:
+            ts: TaskState = parent._tasks.get(key)
+            if ts is None or ts._state != "memory":
+                logger.warning(f"Key lost during replication: {key}")
+                continue
+            if ts not in ws._has_what:
+                ws._nbytes += ts.get_nbytes()
+                ws._has_what[ts] = None
+                ts._who_has.add(ws)
+
+        return keys_failed
+
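For context, a usage sketch of the new helper; the worker addresses and keys
below are hypothetical, not taken from this diff:

    # Copy "x" and "y" onto one recipient, pulling each key from a worker
    # that already holds it in memory.
    failed = await scheduler._gather_on_worker(
        "tcp://127.0.0.1:40001",
        who_has={
            "x": ["tcp://127.0.0.1:40002"],
            "y": ["tcp://127.0.0.1:40003", "tcp://127.0.0.1:40004"],
        },
    )
    # On "partial-fail", scheduler state is updated only for the keys that
    # actually arrived; `failed` holds the subset that did not.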
+    async def _delete_worker_data(self, worker_address: str, keys: "list[str]") -> None:
         """Delete data from a worker and update the corresponding worker/task states

         Parameters
         ----------
         worker_address: str
             Worker address to delete keys from
-        keys: List[str]
+        keys: list[str]
             List of keys to delete on the specified worker
         """
         parent: SchedulerState = cast(SchedulerState, self)

-        await retry_operation(
-            self.rpc(addr=worker_address).free_keys,
-            keys=list(keys),
-            reason="rebalance/replicate",
-        )
+        try:
+            await retry_operation(
+                self.rpc(addr=worker_address).free_keys,
+                keys=list(keys),
+                reason="rebalance/replicate",
+            )
+        except OSError as e:
+            # This can happen e.g. if the worker is going through controlled shutdown;
+            # it doesn't necessarily mean that it went unexpectedly missing
+            logger.warning(
+                f"Communication with worker {worker_address} failed during "
+                f"replication: {e.__class__.__name__}: {e}"
+            )
+            return
+
+        ws: WorkerState = parent._workers_dv.get(worker_address)
+        if ws is None:
+            return
+
+        for key in keys:
+            ts: TaskState = parent._tasks.get(key)
+            if ts is not None and ts in ws._has_what:
+                assert ts._state == "memory"
+                del ws._has_what[ts]
+                ts._who_has.remove(ws)
+                ws._nbytes -= ts.get_nbytes()
+                if not ts._who_has:
+                    # Last copy deleted
+                    self.transitions({key: "released"})

-        ws: WorkerState = parent._workers_dv[worker_address]
-        ts: TaskState
-        tasks: set = {parent._tasks[key] for key in keys}
-        for ts in tasks:
-            del ws._has_what[ts]
-            ts._who_has.remove(ws)
-            ws._nbytes -= ts.get_nbytes()
         self.log_event(ws._address, {"action": "remove-worker-data", "keys": keys})

     async def rebalance(
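A minimal sketch of the new bookkeeping, assuming a hypothetical cluster where
"x" has its only in-memory copy on the target worker:

    # Freeing the last replica leaves ts._who_has empty, so the task is
    # transitioned to "released" instead of lingering in "memory" with no
    # replicas, which validate_task_state() would reject.
    await scheduler._delete_worker_data("tcp://127.0.0.1:40002", ["x"])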
@@ -5717,14 +5793,18 @@ async def rebalance(
             if k not in parent._tasks or not parent._tasks[k].who_has
         ]
         if missing_data:
-            return {"status": "missing-data", "keys": missing_data}
+            return {"status": "partial-fail", "keys": missing_data}

         msgs = self._rebalance_find_msgs(keys, workers)
         if not msgs:
             return {"status": "OK"}

         async with self._lock:
-            return await self._rebalance_move_data(msgs)
+            result = await self._rebalance_move_data(msgs)
+            if result["status"] == "partial-fail" and keys is None:
+                # Only return failed keys if the client explicitly asked for them
+                result = {"status": "OK"}
+            return result

     def _rebalance_find_msgs(
         self: SchedulerState,
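The client-facing effect of the branch above, sketched under the assumption
that rebalance() is invoked with explicit keys:

    result = await scheduler.rebalance(keys=["x", "y"])
    if result["status"] == "partial-fail":
        # Reachable only when keys were requested explicitly; with keys=None
        # a partial failure is deliberately reported as {"status": "OK"}.
        print("keys that could not be moved:", result["keys"])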
@@ -5881,7 +5961,7 @@ def _rebalance_find_msgs(
                 # move on to the next task of the same sender.
                 continue

-            # Schedule task for transfer from sender to receiver
+            # Schedule task for transfer from sender to recipient
             msgs.append((snd_ws, rec_ws, ts))

             # *_bytes_max/min are all negative for heap sorting
@@ -5902,7 +5982,7 @@ def _rebalance_find_msgs(
             else:
                 heapq.heappop(senders)

-            # If receiver still has bytes to gain, push it back into the receivers
+            # If recipient still has bytes to gain, push it back into the recipients
             # heap; it may or may not come back on top again.
             if rec_bytes_min < 0:
                 # See definition of recipients above
@@ -5927,29 +6007,46 @@ async def _rebalance_move_data(
         self, msgs: "list[tuple[WorkerState, WorkerState, TaskState]]"
     ) -> dict:
         """Perform the actual transfer of data across the network in rebalance().
-        Takes in input the output of _rebalance_find_msgs().
+        Takes as input the output of _rebalance_find_msgs(), which is a list of tuples:
+
+        - sender worker
+        - recipient worker
+        - task to be transferred

         FIXME this method is not robust when the cluster is not idle.
         """
-        ts: TaskState
         snd_ws: WorkerState
         rec_ws: WorkerState
+        ts: TaskState

         to_recipients = defaultdict(lambda: defaultdict(list))
+        for snd_ws, rec_ws, ts in msgs:
+            to_recipients[rec_ws.address][ts._key].append(snd_ws.address)
+        failed_keys_by_recipient = dict(
+            zip(
+                to_recipients,
+                await asyncio.gather(
+                    *(
+                        # Note: this never raises exceptions
+                        self._gather_on_worker(w, who_has)
+                        for w, who_has in to_recipients.items()
+                    )
+                ),
+            )
+        )
+
         to_senders = defaultdict(list)
-        for sender, recipient, ts in msgs:
-            to_recipients[recipient.address][ts._key].append(sender.address)
-            to_senders[sender.address].append(ts._key)
+        for snd_ws, rec_ws, ts in msgs:
+            if ts._key not in failed_keys_by_recipient[rec_ws.address]:
+                to_senders[snd_ws.address].append(ts._key)

-        result = await asyncio.gather(
-            *(
-                retry_operation(self.rpc(addr=r).gather, who_has=v)
-                for r, v in to_recipients.items()
-            )
+        # Note: this never raises exceptions
+        await asyncio.gather(
+            *(self._delete_worker_data(r, v) for r, v in to_senders.items())
         )
+
         for r, v in to_recipients.items():
             self.log_event(r, {"action": "rebalance", "who_has": v})
-
         self.log_event(
             "all",
             {
@@ -5960,31 +6057,11 @@ async def _rebalance_move_data(
             },
         )

-        if any(r["status"] != "OK" for r in result):
-            return {
-                "status": "missing-data",
-                "keys": list(
-                    concat(
-                        r["keys"].keys()
-                        for r in result
-                        if r["status"] == "missing-data"
-                    )
-                ),
-            }
-
-        for snd_ws, rec_ws, ts in msgs:
-            assert ts._state == "memory"
-            ts._who_has.add(rec_ws)
-            rec_ws._has_what[ts] = None
-            rec_ws.nbytes += ts.get_nbytes()
-            self.log.append(
-                ("rebalance", ts._key, time(), snd_ws.address, rec_ws.address)
-            )
-
-        await asyncio.gather(
-            *(self._delete_worker_data(r, v) for r, v in to_senders.items())
-        )
-        return {"status": "OK"}
+        missing_keys = {k for r in failed_keys_by_recipient.values() for k in r}
+        if missing_keys:
+            return {"status": "partial-fail", "keys": list(missing_keys)}
+        else:
+            return {"status": "OK"}

     async def replicate(
         self,
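The dict(zip(...)) pairing above relies on asyncio.gather() returning results
in argument order and on dicts iterating in insertion order; a self-contained
toy of the same pattern, with made-up addresses and a stand-in helper:

    import asyncio

    async def fake_gather_on_worker(addr, who_has):
        # Stand-in for Scheduler._gather_on_worker: returns the failed keys
        return {"y"} if addr == "tcp://b" else set()

    async def main():
        to_recipients = {
            "tcp://a": {"x": ["tcp://c"]},
            "tcp://b": {"y": ["tcp://c"]},
        }
        failed_keys_by_recipient = dict(
            zip(
                to_recipients,
                await asyncio.gather(
                    *(
                        fake_gather_on_worker(w, wh)
                        for w, wh in to_recipients.items()
                    )
                ),
            )
        )
        assert failed_keys_by_recipient == {"tcp://a": set(), "tcp://b": {"y"}}

    asyncio.run(main())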
@@ -6035,7 +6112,7 @@ async def replicate(
         tasks = {parent._tasks[k] for k in keys}
         missing_data = [ts._key for ts in tasks if not ts._who_has]
         if missing_data:
-            return {"status": "missing-data", "keys": missing_data}
+            return {"status": "partial-fail", "keys": missing_data}

         # Delete extraneous data
         if delete:
@@ -6048,6 +6125,7 @@ async def replicate(
             ):
                 del_worker_tasks[ws].add(ts)

+            # Note: this never raises exceptions
             await asyncio.gather(
                 *[
                     self._delete_worker_data(ws._address, [t.key for t in tasks])
@@ -6077,19 +6155,15 @@ async def replicate(
                     wws._address for wws in ts._who_has
                 ]

-            results = await asyncio.gather(
+            await asyncio.gather(
                 *(
-                    retry_operation(self.rpc(addr=w).gather, who_has=who_has)
+                    # Note: this never raises exceptions
+                    self._gather_on_worker(w, who_has)
                     for w, who_has in gathers.items()
                 )
             )
-            for w, v in zip(gathers, results):
-                if v["status"] == "OK":
-                    self.add_keys(worker=w, keys=list(gathers[w]))
-                else:
-                    logger.warning("Communication failed during replication: %s", v)
-
-                self.log_event(w, {"action": "replicate-add", "keys": gathers[w]})
+            for r, v in gathers.items():
+                self.log_event(r, {"action": "replicate-add", "who_has": v})

         self.log_event(
             "all",
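For reference, the shape of the gathers mapping that feeds the fan-out above,
with hypothetical addresses; one concurrent _gather_on_worker() call is made
per recipient, and failures are logged inside the helper instead of aborting
the whole replicate():

    gathers = {
        "tcp://127.0.0.1:40001": {"x": ["tcp://127.0.0.1:40002"]},
        "tcp://127.0.0.1:40003": {"x": ["tcp://127.0.0.1:40002"]},
    }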
@@ -7655,7 +7729,7 @@ def validate_task_state(ts: TaskState):
         assert dts._state != "forgotten"

     assert (ts._processing_on is not None) == (ts._state == "processing")
-    assert (not not ts._who_has) == (ts._state == "memory"), (ts, ts._who_has)
+    assert bool(ts._who_has) == (ts._state == "memory"), (ts, ts._who_has, ts._state)

     if ts._state == "processing":
         assert all([dts._who_has for dts in ts._dependencies]), (